From ff052eec8057748bf21e247432e6490c3d2375f7 Mon Sep 17 00:00:00 2001 From: Jacob Pratt Date: Sat, 18 Feb 2023 00:13:50 +0000 Subject: [PATCH 1/5] Use restricted Damerau-Levenshtein algorithm --- compiler/rustc_span/src/lev_distance.rs | 94 ++++++++++++++++----- compiler/rustc_span/src/lib.rs | 1 + tests/ui/check-cfg/invalid-cfg-value.stderr | 4 +- 3 files changed, 75 insertions(+), 24 deletions(-) diff --git a/compiler/rustc_span/src/lev_distance.rs b/compiler/rustc_span/src/lev_distance.rs index 61e4b98a8d2..87ab1adc30d 100644 --- a/compiler/rustc_span/src/lev_distance.rs +++ b/compiler/rustc_span/src/lev_distance.rs @@ -1,49 +1,97 @@ -//! Levenshtein distances. +//! Damerau-Levenshtein distances. //! -//! The [Levenshtein distance] is a metric for measuring the difference between two strings. +//! The [Damerau-Levenshtein distance] is a metric for measuring the difference between two strings. +//! This implementation is a restricted version of the algorithm, as it does not permit modifying +//! characters that have already been transposed. //! -//! [Levenshtein distance]: https://en.wikipedia.org/wiki/Levenshtein_distance +//! [Damerau-Levenshtein distance]: https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance use crate::symbol::Symbol; -use std::cmp; +use std::{cmp, mem}; #[cfg(test)] mod tests; -/// Finds the Levenshtein distance between two strings. +/// Finds the restricted Damerau-Levenshtein distance between two strings. Characters that have +/// already been transposed may not be modified. /// /// Returns None if the distance exceeds the limit. pub fn lev_distance(a: &str, b: &str, limit: usize) -> Option<usize> { - let n = a.chars().count(); - let m = b.chars().count(); - let min_dist = if n < m { m - n } else { n - m }; + let mut a = &a.chars().collect::<Vec<_>>()[..]; + let mut b = &b.chars().collect::<Vec<_>>()[..]; + // Ensure that `b` is the shorter string, minimizing memory use. 
+ if a.len() < b.len() { + mem::swap(&mut a, &mut b); + } + + let min_dist = a.len() - b.len(); + // If we know the limit will be exceeded, we can return early. if min_dist > limit { return None; } - if n == 0 || m == 0 { - return (min_dist <= limit).then_some(min_dist); + + // Strip common prefix. + while let Some(((b_char, b_rest), (a_char, a_rest))) = b.split_first().zip(a.split_first()) + && a_char == b_char + { + a = a_rest; + b = b_rest; + } + // Strip common suffix. + while let Some(((b_char, b_rest), (a_char, a_rest))) = b.split_last().zip(a.split_last()) + && a_char == b_char + { + a = a_rest; + b = b_rest; } - let mut dcol: Vec<_> = (0..=m).collect(); + // If either string is empty, the distance is the length of the other. + // We know that `b` is the shorter string, so we don't need to check `a`. + if b.len() == 0 { + return Some(min_dist); + } - for (i, sc) in a.chars().enumerate() { - let mut current = i; - dcol[0] = current + 1; + let mut prev_prev = vec![usize::MAX; b.len() + 1]; + let mut prev = (0..=b.len()).collect::<Vec<_>>(); + let mut current = vec![0; b.len() + 1]; - for (j, tc) in b.chars().enumerate() { - let next = dcol[j + 1]; - if sc == tc { - dcol[j + 1] = current; - } else { - dcol[j + 1] = cmp::min(current, next); - dcol[j + 1] = cmp::min(dcol[j + 1], dcol[j]) + 1; + // row by row + for i in 1..=a.len() { + current[0] = i; + let a_idx = i - 1; + + // column by column + for j in 1..=b.len() { + let b_idx = j - 1; + + // There is no cost to substitute a character with itself. + let substitution_cost = if a[a_idx] == b[b_idx] { 0 } else { 1 }; + + current[j] = cmp::min( + // deletion + prev[j] + 1, + cmp::min( + // insertion + current[j - 1] + 1, + // substitution + prev[j - 1] + substitution_cost, + ), + ); + + if (i > 1) && (j > 1) && (a[a_idx] == b[b_idx - 1]) && (a[a_idx - 1] == b[b_idx]) { + // transposition + current[j] = cmp::min(current[j], prev_prev[j - 2] + 1); } - current = next; } + + // Rotate the buffers, reusing the memory. 
+ [prev_prev, prev, current] = [prev, current, prev_prev]; } - (dcol[m] <= limit).then_some(dcol[m]) + // `prev` because we already rotated the buffers. + let distance = prev[b.len()]; + (distance <= limit).then_some(distance) } /// Provides a word similarity score between two words that accounts for substrings being more diff --git a/compiler/rustc_span/src/lib.rs b/compiler/rustc_span/src/lib.rs index e095cf3fda2..4335db3823f 100644 --- a/compiler/rustc_span/src/lib.rs +++ b/compiler/rustc_span/src/lib.rs @@ -19,6 +19,7 @@ #![feature(negative_impls)] #![feature(min_specialization)] #![feature(rustc_attrs)] +#![feature(let_chains)] #![deny(rustc::untranslatable_diagnostic)] #![deny(rustc::diagnostic_outside_of_impl)] diff --git a/tests/ui/check-cfg/invalid-cfg-value.stderr b/tests/ui/check-cfg/invalid-cfg-value.stderr index 60abcb18824..83383ea61a4 100644 --- a/tests/ui/check-cfg/invalid-cfg-value.stderr +++ b/tests/ui/check-cfg/invalid-cfg-value.stderr @@ -2,7 +2,9 @@ warning: unexpected `cfg` condition value --> $DIR/invalid-cfg-value.rs:7:7 | LL | #[cfg(feature = "sedre")] - | ^^^^^^^^^^^^^^^^^ + | ^^^^^^^^^^------- + | | + | help: did you mean: `"serde"` | = note: expected values for `feature` are: full, serde = note: `#[warn(unexpected_cfgs)]` on by default From c2f2a3cff2392b8c9898cdfff347770d8e8c05b2 Mon Sep 17 00:00:00 2001 From: Jacob Pratt Date: Sat, 18 Feb 2023 00:34:41 +0000 Subject: [PATCH 2/5] Add test for `println!` typo --- tests/ui/did_you_mean/println-typo.rs | 6 ++++++ tests/ui/did_you_mean/println-typo.stderr | 11 +++++++++++ 2 files changed, 17 insertions(+) create mode 100644 tests/ui/did_you_mean/println-typo.rs create mode 100644 tests/ui/did_you_mean/println-typo.stderr diff --git a/tests/ui/did_you_mean/println-typo.rs b/tests/ui/did_you_mean/println-typo.rs new file mode 100644 index 00000000000..685b5e1f284 --- /dev/null +++ b/tests/ui/did_you_mean/println-typo.rs @@ -0,0 +1,6 @@ +// https://internals.rust-lang.org/t/18227 + +fn main() 
{ + prinltn!(); //~ ERROR cannot find macro `prinltn` in this scope + //^ a macro with a similar name exists: `println` +} diff --git a/tests/ui/did_you_mean/println-typo.stderr b/tests/ui/did_you_mean/println-typo.stderr new file mode 100644 index 00000000000..43b7b1894e2 --- /dev/null +++ b/tests/ui/did_you_mean/println-typo.stderr @@ -0,0 +1,11 @@ +error: cannot find macro `prinltn` in this scope + --> $DIR/println-typo.rs:4:5 + | +LL | prinltn!(); + | ^^^^^^^ help: a macro with a similar name exists: `println` + --> $SRC_DIR/std/src/macros.rs:LL:COL + | + = note: similarly named macro `println` defined here + +error: aborting due to previous error + From 378c4ab9ab1f75c4375425077c46cceb556fbc86 Mon Sep 17 00:00:00 2001 From: Jacob Pratt Date: Sun, 19 Feb 2023 04:03:56 +0000 Subject: [PATCH 3/5] Make public API, docs algorithm-agnostic --- compiler/rustc_ast_lowering/src/item.rs | 2 +- .../rustc_hir_analysis/src/astconv/errors.rs | 2 +- .../rustc_hir_analysis/src/astconv/mod.rs | 2 +- compiler/rustc_hir_typeck/src/expr.rs | 2 +- compiler/rustc_hir_typeck/src/method/probe.rs | 15 +++-- .../rustc_hir_typeck/src/method/suggest.rs | 4 +- compiler/rustc_hir_typeck/src/pat.rs | 2 +- compiler/rustc_interface/src/util.rs | 2 +- compiler/rustc_lint/src/context.rs | 2 +- compiler/rustc_parse/src/parser/item.rs | 5 +- compiler/rustc_resolve/src/diagnostics.rs | 2 +- compiler/rustc_resolve/src/imports.rs | 2 +- .../rustc_resolve/src/late/diagnostics.rs | 6 +- .../src/{lev_distance.rs => edit_distance.rs} | 64 ++++++++++--------- .../{lev_distance => edit_distance}/tests.rs | 40 ++++++------ compiler/rustc_span/src/lib.rs | 2 +- 16 files changed, 81 insertions(+), 73 deletions(-) rename compiler/rustc_span/src/{lev_distance.rs => edit_distance.rs} (72%) rename compiler/rustc_span/src/{lev_distance => edit_distance}/tests.rs (55%) diff --git a/compiler/rustc_ast_lowering/src/item.rs b/compiler/rustc_ast_lowering/src/item.rs index 6bafbfbc14c..4a0e005b8b9 100644 --- 
a/compiler/rustc_ast_lowering/src/item.rs +++ b/compiler/rustc_ast_lowering/src/item.rs @@ -13,7 +13,7 @@ use rustc_hir::def_id::{LocalDefId, CRATE_DEF_ID}; use rustc_hir::PredicateOrigin; use rustc_index::vec::{Idx, IndexVec}; use rustc_middle::ty::{DefIdTree, ResolverAstLowering, TyCtxt}; -use rustc_span::lev_distance::find_best_match_for_name; +use rustc_span::edit_distance::find_best_match_for_name; use rustc_span::source_map::DesugaringKind; use rustc_span::symbol::{kw, sym, Ident}; use rustc_span::{Span, Symbol}; diff --git a/compiler/rustc_hir_analysis/src/astconv/errors.rs b/compiler/rustc_hir_analysis/src/astconv/errors.rs index 232ef2079d6..006d37cc57c 100644 --- a/compiler/rustc_hir_analysis/src/astconv/errors.rs +++ b/compiler/rustc_hir_analysis/src/astconv/errors.rs @@ -6,7 +6,7 @@ use rustc_hir as hir; use rustc_hir::def_id::DefId; use rustc_middle::ty; use rustc_session::parse::feature_err; -use rustc_span::lev_distance::find_best_match_for_name; +use rustc_span::edit_distance::find_best_match_for_name; use rustc_span::symbol::{sym, Ident}; use rustc_span::{Span, Symbol, DUMMY_SP}; diff --git a/compiler/rustc_hir_analysis/src/astconv/mod.rs b/compiler/rustc_hir_analysis/src/astconv/mod.rs index abc33e84139..44e6797f3f4 100644 --- a/compiler/rustc_hir_analysis/src/astconv/mod.rs +++ b/compiler/rustc_hir_analysis/src/astconv/mod.rs @@ -34,8 +34,8 @@ use rustc_middle::ty::DynKind; use rustc_middle::ty::GenericParamDefKind; use rustc_middle::ty::{self, Const, DefIdTree, IsSuggestable, Ty, TyCtxt, TypeVisitable}; use rustc_session::lint::builtin::{AMBIGUOUS_ASSOCIATED_ITEMS, BARE_TRAIT_OBJECTS}; +use rustc_span::edit_distance::find_best_match_for_name; use rustc_span::edition::Edition; -use rustc_span::lev_distance::find_best_match_for_name; use rustc_span::symbol::{kw, Ident, Symbol}; use rustc_span::{sym, Span, DUMMY_SP}; use rustc_target::spec::abi; diff --git a/compiler/rustc_hir_typeck/src/expr.rs b/compiler/rustc_hir_typeck/src/expr.rs index 
2f79071f6dc..08cbfffdd17 100644 --- a/compiler/rustc_hir_typeck/src/expr.rs +++ b/compiler/rustc_hir_typeck/src/expr.rs @@ -45,8 +45,8 @@ use rustc_middle::ty::subst::SubstsRef; use rustc_middle::ty::{self, AdtKind, Ty, TypeVisitable}; use rustc_session::errors::ExprParenthesesNeeded; use rustc_session::parse::feature_err; +use rustc_span::edit_distance::find_best_match_for_name; use rustc_span::hygiene::DesugaringKind; -use rustc_span::lev_distance::find_best_match_for_name; use rustc_span::source_map::{Span, Spanned}; use rustc_span::symbol::{kw, sym, Ident, Symbol}; use rustc_target::spec::abi::Abi::RustIntrinsic; diff --git a/compiler/rustc_hir_typeck/src/method/probe.rs b/compiler/rustc_hir_typeck/src/method/probe.rs index 0b9226802cf..edeb2a847f9 100644 --- a/compiler/rustc_hir_typeck/src/method/probe.rs +++ b/compiler/rustc_hir_typeck/src/method/probe.rs @@ -25,8 +25,8 @@ use rustc_middle::ty::{InternalSubsts, SubstsRef}; use rustc_session::lint; use rustc_span::def_id::DefId; use rustc_span::def_id::LocalDefId; -use rustc_span::lev_distance::{ - find_best_match_for_name_with_substrings, lev_distance_with_substrings, +use rustc_span::edit_distance::{ + edit_distance_with_substrings, find_best_match_for_name_with_substrings, }; use rustc_span::symbol::sym; use rustc_span::{symbol::Ident, Span, Symbol, DUMMY_SP}; @@ -70,7 +70,7 @@ struct ProbeContext<'a, 'tcx> { impl_dups: FxHashSet, /// When probing for names, include names that are close to the - /// requested name (by Levenshtein distance) + /// requested name (by edit distance) allow_similar_names: bool, /// Some(candidate) if there is a private candidate @@ -1794,7 +1794,7 @@ impl<'a, 'tcx> ProbeContext<'a, 'tcx> { /// Similarly to `probe_for_return_type`, this method attempts to find the best matching /// candidate method where the method name may have been misspelled. Similarly to other - /// Levenshtein based suggestions, we provide at most one such suggestion. 
+ /// edit distance based suggestions, we provide at most one such suggestion. fn probe_for_similar_candidate(&mut self) -> Result, MethodError<'tcx>> { debug!("probing for method names similar to {:?}", self.method_name); @@ -2052,8 +2052,11 @@ impl<'a, 'tcx> ProbeContext<'a, 'tcx> { if self.matches_by_doc_alias(x.def_id) { return true; } - match lev_distance_with_substrings(name.as_str(), x.name.as_str(), max_dist) - { + match edit_distance_with_substrings( + name.as_str(), + x.name.as_str(), + max_dist, + ) { Some(d) => d > 0, None => false, } diff --git a/compiler/rustc_hir_typeck/src/method/suggest.rs b/compiler/rustc_hir_typeck/src/method/suggest.rs index 6a7b1f6646a..4f3dbe03c05 100644 --- a/compiler/rustc_hir_typeck/src/method/suggest.rs +++ b/compiler/rustc_hir_typeck/src/method/suggest.rs @@ -31,7 +31,7 @@ use rustc_middle::ty::{self, DefIdTree, GenericArgKind, Ty, TyCtxt, TypeVisitabl use rustc_middle::ty::{IsSuggestable, ToPolyTraitRef}; use rustc_span::symbol::{kw, sym, Ident}; use rustc_span::Symbol; -use rustc_span::{lev_distance, source_map, ExpnKind, FileName, MacroKind, Span}; +use rustc_span::{edit_distance, source_map, ExpnKind, FileName, MacroKind, Span}; use rustc_trait_selection::traits::error_reporting::on_unimplemented::OnUnimplementedNote; use rustc_trait_selection::traits::error_reporting::on_unimplemented::TypeErrCtxtExt as _; use rustc_trait_selection::traits::query::evaluate_obligation::InferCtxtExt as _; @@ -1014,7 +1014,7 @@ impl<'a, 'tcx> FnCtxt<'a, 'tcx> { // that had unsatisfied trait bounds if unsatisfied_predicates.is_empty() && rcvr_ty.is_enum() { let adt_def = rcvr_ty.ty_adt_def().expect("enum is not an ADT"); - if let Some(suggestion) = lev_distance::find_best_match_for_name( + if let Some(suggestion) = edit_distance::find_best_match_for_name( &adt_def.variants().iter().map(|s| s.name).collect::>(), item_name.name, None, diff --git a/compiler/rustc_hir_typeck/src/pat.rs b/compiler/rustc_hir_typeck/src/pat.rs index 
3881efe87db..ab6e76ef8aa 100644 --- a/compiler/rustc_hir_typeck/src/pat.rs +++ b/compiler/rustc_hir_typeck/src/pat.rs @@ -14,8 +14,8 @@ use rustc_infer::infer::type_variable::{TypeVariableOrigin, TypeVariableOriginKi use rustc_middle::middle::stability::EvalResult; use rustc_middle::ty::{self, Adt, BindingMode, Ty, TypeVisitable}; use rustc_session::lint::builtin::NON_EXHAUSTIVE_OMITTED_PATTERNS; +use rustc_span::edit_distance::find_best_match_for_name; use rustc_span::hygiene::DesugaringKind; -use rustc_span::lev_distance::find_best_match_for_name; use rustc_span::source_map::{Span, Spanned}; use rustc_span::symbol::{kw, sym, Ident}; use rustc_span::{BytePos, DUMMY_SP}; diff --git a/compiler/rustc_interface/src/util.rs b/compiler/rustc_interface/src/util.rs index e4b4d5375e6..475d3601b52 100644 --- a/compiler/rustc_interface/src/util.rs +++ b/compiler/rustc_interface/src/util.rs @@ -14,8 +14,8 @@ use rustc_session::filesearch::sysroot_candidates; use rustc_session::lint::{self, BuiltinLintDiagnostics, LintBuffer}; use rustc_session::parse::CrateConfig; use rustc_session::{early_error, filesearch, output, Session}; +use rustc_span::edit_distance::find_best_match_for_name; use rustc_span::edition::Edition; -use rustc_span::lev_distance::find_best_match_for_name; use rustc_span::source_map::FileLoader; use rustc_span::symbol::{sym, Symbol}; use session::CompilerIO; diff --git a/compiler/rustc_lint/src/context.rs b/compiler/rustc_lint/src/context.rs index 9a9e2de7b5c..aace4974cc9 100644 --- a/compiler/rustc_lint/src/context.rs +++ b/compiler/rustc_lint/src/context.rs @@ -39,7 +39,7 @@ use rustc_middle::ty::{self, print::Printer, subst::GenericArg, RegisteredTools, use rustc_session::lint::{BuiltinLintDiagnostics, LintExpectationId}; use rustc_session::lint::{FutureIncompatibleInfo, Level, Lint, LintBuffer, LintId}; use rustc_session::Session; -use rustc_span::lev_distance::find_best_match_for_name; +use rustc_span::edit_distance::find_best_match_for_name; use 
rustc_span::symbol::{sym, Ident, Symbol}; use rustc_span::{BytePos, Span}; use rustc_target::abi; diff --git a/compiler/rustc_parse/src/parser/item.rs b/compiler/rustc_parse/src/parser/item.rs index fd46a1292a8..3e3399d05c9 100644 --- a/compiler/rustc_parse/src/parser/item.rs +++ b/compiler/rustc_parse/src/parser/item.rs @@ -19,8 +19,8 @@ use rustc_errors::{ struct_span_err, Applicability, DiagnosticBuilder, ErrorGuaranteed, IntoDiagnostic, PResult, StashKey, }; +use rustc_span::edit_distance::edit_distance; use rustc_span::edition::Edition; -use rustc_span::lev_distance::lev_distance; use rustc_span::source_map::{self, Span}; use rustc_span::symbol::{kw, sym, Ident, Symbol}; use rustc_span::DUMMY_SP; @@ -459,7 +459,8 @@ impl<'a> Parser<'a> { // Maybe the user misspelled `macro_rules` (issue #91227) if self.token.is_ident() && path.segments.len() == 1 - && lev_distance("macro_rules", &path.segments[0].ident.to_string(), 3).is_some() + && edit_distance("macro_rules", &path.segments[0].ident.to_string(), 3) + .is_some() { err.span_suggestion( path.span, diff --git a/compiler/rustc_resolve/src/diagnostics.rs b/compiler/rustc_resolve/src/diagnostics.rs index 934d60589d4..cd26dbd6190 100644 --- a/compiler/rustc_resolve/src/diagnostics.rs +++ b/compiler/rustc_resolve/src/diagnostics.rs @@ -21,9 +21,9 @@ use rustc_session::lint::builtin::ABSOLUTE_PATHS_NOT_STARTING_WITH_CRATE; use rustc_session::lint::builtin::MACRO_EXPANDED_MACRO_EXPORTS_ACCESSED_BY_ABSOLUTE_PATHS; use rustc_session::lint::BuiltinLintDiagnostics; use rustc_session::Session; +use rustc_span::edit_distance::find_best_match_for_name; use rustc_span::edition::Edition; use rustc_span::hygiene::MacroKind; -use rustc_span::lev_distance::find_best_match_for_name; use rustc_span::source_map::SourceMap; use rustc_span::symbol::{kw, sym, Ident, Symbol}; use rustc_span::{BytePos, Span, SyntaxContext}; diff --git a/compiler/rustc_resolve/src/imports.rs b/compiler/rustc_resolve/src/imports.rs index 
da3e5095e53..48188b4ba35 100644 --- a/compiler/rustc_resolve/src/imports.rs +++ b/compiler/rustc_resolve/src/imports.rs @@ -21,8 +21,8 @@ use rustc_middle::span_bug; use rustc_middle::ty; use rustc_session::lint::builtin::{PUB_USE_OF_PRIVATE_EXTERN_CRATE, UNUSED_IMPORTS}; use rustc_session::lint::BuiltinLintDiagnostics; +use rustc_span::edit_distance::find_best_match_for_name; use rustc_span::hygiene::LocalExpnId; -use rustc_span::lev_distance::find_best_match_for_name; use rustc_span::symbol::{kw, Ident, Symbol}; use rustc_span::Span; diff --git a/compiler/rustc_resolve/src/late/diagnostics.rs b/compiler/rustc_resolve/src/late/diagnostics.rs index 5205d055cf9..174a543fe46 100644 --- a/compiler/rustc_resolve/src/late/diagnostics.rs +++ b/compiler/rustc_resolve/src/late/diagnostics.rs @@ -25,9 +25,9 @@ use rustc_middle::ty::DefIdTree; use rustc_session::lint; use rustc_session::parse::feature_err; use rustc_session::Session; +use rustc_span::edit_distance::find_best_match_for_name; use rustc_span::edition::Edition; use rustc_span::hygiene::MacroKind; -use rustc_span::lev_distance::find_best_match_for_name; use rustc_span::symbol::{kw, sym, Ident, Symbol}; use rustc_span::{BytePos, Span}; @@ -542,7 +542,7 @@ impl<'a: 'ast, 'ast, 'tcx> LateResolutionVisitor<'a, '_, 'ast, 'tcx> { } } - // Try Levenshtein algorithm. + // Try finding a suitable replacement. 
let typo_sugg = self.lookup_typo_candidate(path, source.namespace(), is_expected).to_opt_suggestion(); if path.len() == 1 && self.self_type_is_available() { @@ -770,7 +770,7 @@ impl<'a: 'ast, 'ast, 'tcx> LateResolutionVisitor<'a, '_, 'ast, 'tcx> { _ => {} } - // If the trait has a single item (which wasn't matched by Levenshtein), suggest it + // If the trait has a single item (which wasn't matched by the algorithm), suggest it let suggestion = self.get_single_associated_item(&path, &source, is_expected); if !self.r.add_typo_suggestion(err, suggestion, ident_span) { fallback = !self.let_binding_suggestion(err, ident_span); diff --git a/compiler/rustc_span/src/lev_distance.rs b/compiler/rustc_span/src/edit_distance.rs similarity index 72% rename from compiler/rustc_span/src/lev_distance.rs rename to compiler/rustc_span/src/edit_distance.rs index 87ab1adc30d..89f0386e3e9 100644 --- a/compiler/rustc_span/src/lev_distance.rs +++ b/compiler/rustc_span/src/edit_distance.rs @@ -1,10 +1,13 @@ -//! Damerau-Levenshtein distances. +//! Edit distances. //! -//! The [Damerau-Levenshtein distance] is a metric for measuring the difference between two strings. -//! This implementation is a restricted version of the algorithm, as it does not permit modifying -//! characters that have already been transposed. +//! The [edit distance] is a metric for measuring the difference between two strings. //! -//! [Damerau-Levenshtein distance]: https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance +//! [edit distance]: https://en.wikipedia.org/wiki/Edit_distance + +// The current implementation is the restricted Damerau-Levenshtein algorithm. It is restricted +// because it does not permit modifying characters that have already been transposed. The specific +// algorithm should not matter to the caller of the methods, which is why it is not noted in the +// documentation. 
use crate::symbol::Symbol; use std::{cmp, mem}; @@ -12,11 +15,12 @@ use std::{cmp, mem}; #[cfg(test)] mod tests; -/// Finds the restricted Damerau-Levenshtein distance between two strings. Characters that have -/// already been transposed may not be modified. +/// Finds the [edit distance] between two strings. /// -/// Returns None if the distance exceeds the limit. -pub fn lev_distance(a: &str, b: &str, limit: usize) -> Option<usize> { +/// Returns `None` if the distance exceeds the limit. +/// +/// [edit distance]: https://en.wikipedia.org/wiki/Edit_distance +pub fn edit_distance(a: &str, b: &str, limit: usize) -> Option<usize> { let mut a = &a.chars().collect::<Vec<_>>()[..]; let mut b = &b.chars().collect::<Vec<_>>()[..]; @@ -95,18 +99,18 @@ pub fn lev_distance(a: &str, b: &str, limit: usize) -> Option<usize> { } /// Provides a word similarity score between two words that accounts for substrings being more -/// meaningful than a typical Levenshtein distance. The lower the score, the closer the match. -/// 0 is an identical match. +/// meaningful than a typical edit distance. The lower the score, the closer the match. 0 is an +/// identical match. /// -/// Uses the Levenshtein distance between the two strings and removes the cost of the length -/// difference. If this is 0 then it is either a substring match or a full word match, in the -/// substring match case we detect this and return `1`. To prevent finding meaningless substrings, -/// eg. "in" in "shrink", we only perform this subtraction of length difference if one of the words -/// is not greater than twice the length of the other. For cases where the words are close in size -/// but not an exact substring then the cost of the length difference is discounted by half. +/// Uses the edit distance between the two strings and removes the cost of the length difference. +/// If this is 0 then it is either a substring match or a full word match, in the substring match +/// case we detect this and return `1`. 
To prevent finding meaningless substrings, eg. "in" in +/// "shrink", we only perform this subtraction of length difference if one of the words is not +/// greater than twice the length of the other. For cases where the words are close in size but not +/// an exact substring then the cost of the length difference is discounted by half. /// /// Returns `None` if the distance exceeds the limit. -pub fn lev_distance_with_substrings(a: &str, b: &str, limit: usize) -> Option { +pub fn edit_distance_with_substrings(a: &str, b: &str, limit: usize) -> Option { let n = a.chars().count(); let m = b.chars().count(); @@ -114,10 +118,10 @@ pub fn lev_distance_with_substrings(a: &str, b: &str, limit: usize) -> Option Option return Some(*c), Some(d) => { diff --git a/compiler/rustc_span/src/lev_distance/tests.rs b/compiler/rustc_span/src/edit_distance/tests.rs similarity index 55% rename from compiler/rustc_span/src/lev_distance/tests.rs rename to compiler/rustc_span/src/edit_distance/tests.rs index ed03b22c61f..4b2373eb926 100644 --- a/compiler/rustc_span/src/lev_distance/tests.rs +++ b/compiler/rustc_span/src/edit_distance/tests.rs @@ -1,40 +1,40 @@ use super::*; #[test] -fn test_lev_distance() { +fn test_edit_distance() { // Test bytelength agnosticity for c in (0..char::MAX as u32).filter_map(char::from_u32).map(|i| i.to_string()) { - assert_eq!(lev_distance(&c[..], &c[..], usize::MAX), Some(0)); + assert_eq!(edit_distance(&c[..], &c[..], usize::MAX), Some(0)); } let a = "\nMäry häd ä little lämb\n\nLittle lämb\n"; let b = "\nMary häd ä little lämb\n\nLittle lämb\n"; let c = "Mary häd ä little lämb\n\nLittle lämb\n"; - assert_eq!(lev_distance(a, b, usize::MAX), Some(1)); - assert_eq!(lev_distance(b, a, usize::MAX), Some(1)); - assert_eq!(lev_distance(a, c, usize::MAX), Some(2)); - assert_eq!(lev_distance(c, a, usize::MAX), Some(2)); - assert_eq!(lev_distance(b, c, usize::MAX), Some(1)); - assert_eq!(lev_distance(c, b, usize::MAX), Some(1)); + assert_eq!(edit_distance(a, b, 
usize::MAX), Some(1)); + assert_eq!(edit_distance(b, a, usize::MAX), Some(1)); + assert_eq!(edit_distance(a, c, usize::MAX), Some(2)); + assert_eq!(edit_distance(c, a, usize::MAX), Some(2)); + assert_eq!(edit_distance(b, c, usize::MAX), Some(1)); + assert_eq!(edit_distance(c, b, usize::MAX), Some(1)); } #[test] -fn test_lev_distance_limit() { - assert_eq!(lev_distance("abc", "abcd", 1), Some(1)); - assert_eq!(lev_distance("abc", "abcd", 0), None); - assert_eq!(lev_distance("abc", "xyz", 3), Some(3)); - assert_eq!(lev_distance("abc", "xyz", 2), None); +fn test_edit_distance_limit() { + assert_eq!(edit_distance("abc", "abcd", 1), Some(1)); + assert_eq!(edit_distance("abc", "abcd", 0), None); + assert_eq!(edit_distance("abc", "xyz", 3), Some(3)); + assert_eq!(edit_distance("abc", "xyz", 2), None); } #[test] fn test_method_name_similarity_score() { - assert_eq!(lev_distance_with_substrings("empty", "is_empty", 1), Some(1)); - assert_eq!(lev_distance_with_substrings("shrunk", "rchunks", 2), None); - assert_eq!(lev_distance_with_substrings("abc", "abcd", 1), Some(1)); - assert_eq!(lev_distance_with_substrings("a", "abcd", 1), None); - assert_eq!(lev_distance_with_substrings("edf", "eq", 1), None); - assert_eq!(lev_distance_with_substrings("abc", "xyz", 3), Some(3)); - assert_eq!(lev_distance_with_substrings("abcdef", "abcdef", 2), Some(0)); + assert_eq!(edit_distance_with_substrings("empty", "is_empty", 1), Some(1)); + assert_eq!(edit_distance_with_substrings("shrunk", "rchunks", 2), None); + assert_eq!(edit_distance_with_substrings("abc", "abcd", 1), Some(1)); + assert_eq!(edit_distance_with_substrings("a", "abcd", 1), None); + assert_eq!(edit_distance_with_substrings("edf", "eq", 1), None); + assert_eq!(edit_distance_with_substrings("abc", "xyz", 3), Some(3)); + assert_eq!(edit_distance_with_substrings("abcdef", "abcdef", 2), Some(0)); } #[test] diff --git a/compiler/rustc_span/src/lib.rs b/compiler/rustc_span/src/lib.rs index 4335db3823f..e112100aa5f 100644 --- 
a/compiler/rustc_span/src/lib.rs +++ b/compiler/rustc_span/src/lib.rs @@ -47,7 +47,7 @@ pub use hygiene::{ExpnData, ExpnHash, ExpnId, LocalExpnId, SyntaxContext}; use rustc_data_structures::stable_hasher::HashingControls; pub mod def_id; use def_id::{CrateNum, DefId, DefPathHash, LocalDefId, LOCAL_CRATE}; -pub mod lev_distance; +pub mod edit_distance; mod span_encoding; pub use span_encoding::{Span, DUMMY_SP}; From 20282c1b20bb1ce196a3addc9f681cf6cf81feb9 Mon Sep 17 00:00:00 2001 From: Jacob Pratt Date: Sun, 19 Feb 2023 04:17:58 +0000 Subject: [PATCH 4/5] Reduce limit on `macro_rules!` diagnostic --- compiler/rustc_parse/src/parser/item.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compiler/rustc_parse/src/parser/item.rs b/compiler/rustc_parse/src/parser/item.rs index 3e3399d05c9..f164bb330f3 100644 --- a/compiler/rustc_parse/src/parser/item.rs +++ b/compiler/rustc_parse/src/parser/item.rs @@ -459,7 +459,7 @@ impl<'a> Parser<'a> { // Maybe the user misspelled `macro_rules` (issue #91227) if self.token.is_ident() && path.segments.len() == 1 - && edit_distance("macro_rules", &path.segments[0].ident.to_string(), 3) + && edit_distance("macro_rules", &path.segments[0].ident.to_string(), 2) .is_some() { err.span_suggestion( From ab4c0dd137f064b9a141d1341c546170b1c7272b Mon Sep 17 00:00:00 2001 From: Jacob Pratt Date: Sun, 19 Feb 2023 22:59:22 +0000 Subject: [PATCH 5/5] Add test for precise algorithm used --- compiler/rustc_span/src/edit_distance/tests.rs | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/compiler/rustc_span/src/edit_distance/tests.rs b/compiler/rustc_span/src/edit_distance/tests.rs index 4b2373eb926..c9c7a1f1bf2 100644 --- a/compiler/rustc_span/src/edit_distance/tests.rs +++ b/compiler/rustc_span/src/edit_distance/tests.rs @@ -68,3 +68,13 @@ fn test_find_best_match_for_name() { ); }) } + +#[test] +fn test_precise_algorithm() { + // Not Levenshtein distance. 
+ assert_ne!(edit_distance("ab", "ba", usize::MAX), Some(2)); + // Not unrestricted Damerau-Levenshtein distance. + assert_ne!(edit_distance("abde", "bcaed", usize::MAX), Some(3)); + // The current implementation is a restricted Damerau-Levenshtein distance. + assert_eq!(edit_distance("abde", "bcaed", usize::MAX), Some(4)); +}