Rollup merge of #108200 - jhpratt:restricted-damerau-levenshtein-distance, r=tmiasko

Use restricted Damerau-Levenshtein distance for diagnostics

This replaces the existing Levenshtein algorithm with the Damerau-Levenshtein algorithm. This means that "ab" to "ba" is one change (a transposition) instead of two (a deletion and insertion). More specifically, this is a _restricted_ implementation, in that "ca" to "abc" cannot be performed as "ca" → "ac" → "abc", as there is an insertion in the middle of a transposition. I believe that errors like that are sufficiently rare that it's not worth taking into account.

This was first brought up [on IRLO](https://internals.rust-lang.org/t/18227) when it was noticed that the diagnostic for `prinltn!` (transposed L and T) was `print!` and not `println!`. Only a single existing UI test was effected, with the result being an objective improvement.

~~I have left the method name and various other references to the Levenshtein algorithm untouched, as the exact manner in which the edit distance is calculated should not be relevant to the caller.~~

r? ``@estebank``

``@rustbot`` label +A-diagnostics +C-enhancement
This commit is contained in:
Matthias Krüger 2023-02-20 14:32:55 +01:00 committed by GitHub
commit 226ce31edd
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
21 changed files with 357 additions and 271 deletions

View File

@ -13,7 +13,7 @@ use rustc_hir::def_id::{LocalDefId, CRATE_DEF_ID};
use rustc_hir::PredicateOrigin;
use rustc_index::vec::{Idx, IndexVec};
use rustc_middle::ty::{DefIdTree, ResolverAstLowering, TyCtxt};
use rustc_span::lev_distance::find_best_match_for_name;
use rustc_span::edit_distance::find_best_match_for_name;
use rustc_span::source_map::DesugaringKind;
use rustc_span::symbol::{kw, sym, Ident};
use rustc_span::{Span, Symbol};

View File

@ -7,7 +7,7 @@ use rustc_hir::def_id::DefId;
use rustc_infer::traits::FulfillmentError;
use rustc_middle::ty::{self, Ty};
use rustc_session::parse::feature_err;
use rustc_span::lev_distance::find_best_match_for_name;
use rustc_span::edit_distance::find_best_match_for_name;
use rustc_span::symbol::{sym, Ident};
use rustc_span::{Span, Symbol, DUMMY_SP};

View File

@ -37,8 +37,8 @@ use rustc_middle::ty::DynKind;
use rustc_middle::ty::GenericParamDefKind;
use rustc_middle::ty::{self, Const, DefIdTree, IsSuggestable, Ty, TyCtxt, TypeVisitable};
use rustc_session::lint::builtin::{AMBIGUOUS_ASSOCIATED_ITEMS, BARE_TRAIT_OBJECTS};
use rustc_span::edit_distance::find_best_match_for_name;
use rustc_span::edition::Edition;
use rustc_span::lev_distance::find_best_match_for_name;
use rustc_span::symbol::{kw, Ident, Symbol};
use rustc_span::{sym, Span, DUMMY_SP};
use rustc_target::spec::abi;

View File

@ -45,8 +45,8 @@ use rustc_middle::ty::subst::SubstsRef;
use rustc_middle::ty::{self, AdtKind, Ty, TypeVisitable};
use rustc_session::errors::ExprParenthesesNeeded;
use rustc_session::parse::feature_err;
use rustc_span::edit_distance::find_best_match_for_name;
use rustc_span::hygiene::DesugaringKind;
use rustc_span::lev_distance::find_best_match_for_name;
use rustc_span::source_map::{Span, Spanned};
use rustc_span::symbol::{kw, sym, Ident, Symbol};
use rustc_target::spec::abi::Abi::RustIntrinsic;

View File

@ -24,8 +24,8 @@ use rustc_middle::ty::{InternalSubsts, SubstsRef};
use rustc_session::lint;
use rustc_span::def_id::DefId;
use rustc_span::def_id::LocalDefId;
use rustc_span::lev_distance::{
find_best_match_for_name_with_substrings, lev_distance_with_substrings,
use rustc_span::edit_distance::{
edit_distance_with_substrings, find_best_match_for_name_with_substrings,
};
use rustc_span::symbol::sym;
use rustc_span::{symbol::Ident, Span, Symbol, DUMMY_SP};
@ -69,7 +69,7 @@ struct ProbeContext<'a, 'tcx> {
impl_dups: FxHashSet<DefId>,
/// When probing for names, include names that are close to the
/// requested name (by Levenshtein distance)
/// requested name (by edit distance)
allow_similar_names: bool,
/// Some(candidate) if there is a private candidate
@ -1793,7 +1793,7 @@ impl<'a, 'tcx> ProbeContext<'a, 'tcx> {
/// Similarly to `probe_for_return_type`, this method attempts to find the best matching
/// candidate method where the method name may have been misspelled. Similarly to other
/// Levenshtein based suggestions, we provide at most one such suggestion.
/// edit distance based suggestions, we provide at most one such suggestion.
fn probe_for_similar_candidate(&mut self) -> Result<Option<ty::AssocItem>, MethodError<'tcx>> {
debug!("probing for method names similar to {:?}", self.method_name);
@ -2024,8 +2024,11 @@ impl<'a, 'tcx> ProbeContext<'a, 'tcx> {
if self.matches_by_doc_alias(x.def_id) {
return true;
}
match lev_distance_with_substrings(name.as_str(), x.name.as_str(), max_dist)
{
match edit_distance_with_substrings(
name.as_str(),
x.name.as_str(),
max_dist,
) {
Some(d) => d > 0,
None => false,
}

View File

@ -31,7 +31,7 @@ use rustc_middle::ty::{self, DefIdTree, GenericArgKind, Ty, TyCtxt, TypeVisitabl
use rustc_middle::ty::{IsSuggestable, ToPolyTraitRef};
use rustc_span::symbol::{kw, sym, Ident};
use rustc_span::Symbol;
use rustc_span::{lev_distance, source_map, ExpnKind, FileName, MacroKind, Span};
use rustc_span::{edit_distance, source_map, ExpnKind, FileName, MacroKind, Span};
use rustc_trait_selection::traits::error_reporting::on_unimplemented::OnUnimplementedNote;
use rustc_trait_selection::traits::error_reporting::on_unimplemented::TypeErrCtxtExt as _;
use rustc_trait_selection::traits::query::evaluate_obligation::InferCtxtExt as _;
@ -1014,7 +1014,7 @@ impl<'a, 'tcx> FnCtxt<'a, 'tcx> {
// that had unsatisfied trait bounds
if unsatisfied_predicates.is_empty() && rcvr_ty.is_enum() {
let adt_def = rcvr_ty.ty_adt_def().expect("enum is not an ADT");
if let Some(suggestion) = lev_distance::find_best_match_for_name(
if let Some(suggestion) = edit_distance::find_best_match_for_name(
&adt_def.variants().iter().map(|s| s.name).collect::<Vec<_>>(),
item_name.name,
None,

View File

@ -14,8 +14,8 @@ use rustc_infer::infer::type_variable::{TypeVariableOrigin, TypeVariableOriginKi
use rustc_middle::middle::stability::EvalResult;
use rustc_middle::ty::{self, Adt, BindingMode, Ty, TypeVisitable};
use rustc_session::lint::builtin::NON_EXHAUSTIVE_OMITTED_PATTERNS;
use rustc_span::edit_distance::find_best_match_for_name;
use rustc_span::hygiene::DesugaringKind;
use rustc_span::lev_distance::find_best_match_for_name;
use rustc_span::source_map::{Span, Spanned};
use rustc_span::symbol::{kw, sym, Ident};
use rustc_span::{BytePos, DUMMY_SP};

View File

@ -14,8 +14,8 @@ use rustc_session::filesearch::sysroot_candidates;
use rustc_session::lint::{self, BuiltinLintDiagnostics, LintBuffer};
use rustc_session::parse::CrateConfig;
use rustc_session::{early_error, filesearch, output, Session};
use rustc_span::edit_distance::find_best_match_for_name;
use rustc_span::edition::Edition;
use rustc_span::lev_distance::find_best_match_for_name;
use rustc_span::source_map::FileLoader;
use rustc_span::symbol::{sym, Symbol};
use session::CompilerIO;

View File

@ -39,7 +39,7 @@ use rustc_middle::ty::{self, print::Printer, subst::GenericArg, RegisteredTools,
use rustc_session::lint::{BuiltinLintDiagnostics, LintExpectationId};
use rustc_session::lint::{FutureIncompatibleInfo, Level, Lint, LintBuffer, LintId};
use rustc_session::Session;
use rustc_span::lev_distance::find_best_match_for_name;
use rustc_span::edit_distance::find_best_match_for_name;
use rustc_span::symbol::{sym, Ident, Symbol};
use rustc_span::{BytePos, Span};
use rustc_target::abi;

View File

@ -19,8 +19,8 @@ use rustc_errors::{
struct_span_err, Applicability, DiagnosticBuilder, ErrorGuaranteed, IntoDiagnostic, PResult,
StashKey,
};
use rustc_span::edit_distance::edit_distance;
use rustc_span::edition::Edition;
use rustc_span::lev_distance::lev_distance;
use rustc_span::source_map::{self, Span};
use rustc_span::symbol::{kw, sym, Ident, Symbol};
use rustc_span::DUMMY_SP;
@ -459,7 +459,8 @@ impl<'a> Parser<'a> {
// Maybe the user misspelled `macro_rules` (issue #91227)
if self.token.is_ident()
&& path.segments.len() == 1
&& lev_distance("macro_rules", &path.segments[0].ident.to_string(), 3).is_some()
&& edit_distance("macro_rules", &path.segments[0].ident.to_string(), 2)
.is_some()
{
err.span_suggestion(
path.span,

View File

@ -21,9 +21,9 @@ use rustc_session::lint::builtin::ABSOLUTE_PATHS_NOT_STARTING_WITH_CRATE;
use rustc_session::lint::builtin::MACRO_EXPANDED_MACRO_EXPORTS_ACCESSED_BY_ABSOLUTE_PATHS;
use rustc_session::lint::BuiltinLintDiagnostics;
use rustc_session::Session;
use rustc_span::edit_distance::find_best_match_for_name;
use rustc_span::edition::Edition;
use rustc_span::hygiene::MacroKind;
use rustc_span::lev_distance::find_best_match_for_name;
use rustc_span::source_map::SourceMap;
use rustc_span::symbol::{kw, sym, Ident, Symbol};
use rustc_span::{BytePos, Span, SyntaxContext};

View File

@ -21,8 +21,8 @@ use rustc_middle::span_bug;
use rustc_middle::ty;
use rustc_session::lint::builtin::{PUB_USE_OF_PRIVATE_EXTERN_CRATE, UNUSED_IMPORTS};
use rustc_session::lint::BuiltinLintDiagnostics;
use rustc_span::edit_distance::find_best_match_for_name;
use rustc_span::hygiene::LocalExpnId;
use rustc_span::lev_distance::find_best_match_for_name;
use rustc_span::symbol::{kw, Ident, Symbol};
use rustc_span::Span;

View File

@ -25,9 +25,9 @@ use rustc_middle::ty::DefIdTree;
use rustc_session::lint;
use rustc_session::parse::feature_err;
use rustc_session::Session;
use rustc_span::edit_distance::find_best_match_for_name;
use rustc_span::edition::Edition;
use rustc_span::hygiene::MacroKind;
use rustc_span::lev_distance::find_best_match_for_name;
use rustc_span::symbol::{kw, sym, Ident, Symbol};
use rustc_span::{BytePos, Span};
@ -542,7 +542,7 @@ impl<'a: 'ast, 'ast, 'tcx> LateResolutionVisitor<'a, '_, 'ast, 'tcx> {
}
}
// Try Levenshtein algorithm.
// Try finding a suitable replacement.
let typo_sugg =
self.lookup_typo_candidate(path, source.namespace(), is_expected).to_opt_suggestion();
if path.len() == 1 && self.self_type_is_available() {
@ -770,7 +770,7 @@ impl<'a: 'ast, 'ast, 'tcx> LateResolutionVisitor<'a, '_, 'ast, 'tcx> {
_ => {}
}
// If the trait has a single item (which wasn't matched by Levenshtein), suggest it
// If the trait has a single item (which wasn't matched by the algorithm), suggest it
let suggestion = self.get_single_associated_item(&path, &source, is_expected);
if !self.r.add_typo_suggestion(err, suggestion, ident_span) {
fallback = !self.let_binding_suggestion(err, ident_span);

View File

@ -0,0 +1,229 @@
//! Edit distances.
//!
//! The [edit distance] is a metric for measuring the difference between two strings.
//!
//! [edit distance]: https://en.wikipedia.org/wiki/Edit_distance
// The current implementation is the restricted Damerau-Levenshtein algorithm. It is restricted
// because it does not permit modifying characters that have already been transposed. The specific
// algorithm should not matter to the caller of the methods, which is why it is not noted in the
// documentation.
use crate::symbol::Symbol;
use std::{cmp, mem};
#[cfg(test)]
mod tests;
/// Finds the [edit distance] between two strings.
///
/// Returns `None` if the distance exceeds the limit.
///
/// [edit distance]: https://en.wikipedia.org/wiki/Edit_distance
pub fn edit_distance(a: &str, b: &str, limit: usize) -> Option<usize> {
let mut a = &a.chars().collect::<Vec<_>>()[..];
let mut b = &b.chars().collect::<Vec<_>>()[..];
// Ensure that `b` is the shorter string, minimizing memory use.
if a.len() < b.len() {
mem::swap(&mut a, &mut b);
}
let min_dist = a.len() - b.len();
// If we know the limit will be exceeded, we can return early.
if min_dist > limit {
return None;
}
// Strip common prefix.
while let Some(((b_char, b_rest), (a_char, a_rest))) = b.split_first().zip(a.split_first())
&& a_char == b_char
{
a = a_rest;
b = b_rest;
}
// Strip common suffix.
while let Some(((b_char, b_rest), (a_char, a_rest))) = b.split_last().zip(a.split_last())
&& a_char == b_char
{
a = a_rest;
b = b_rest;
}
// If either string is empty, the distance is the length of the other.
// We know that `b` is the shorter string, so we don't need to check `a`.
if b.len() == 0 {
return Some(min_dist);
}
let mut prev_prev = vec![usize::MAX; b.len() + 1];
let mut prev = (0..=b.len()).collect::<Vec<_>>();
let mut current = vec![0; b.len() + 1];
// row by row
for i in 1..=a.len() {
current[0] = i;
let a_idx = i - 1;
// column by column
for j in 1..=b.len() {
let b_idx = j - 1;
// There is no cost to substitute a character with itself.
let substitution_cost = if a[a_idx] == b[b_idx] { 0 } else { 1 };
current[j] = cmp::min(
// deletion
prev[j] + 1,
cmp::min(
// insertion
current[j - 1] + 1,
// substitution
prev[j - 1] + substitution_cost,
),
);
if (i > 1) && (j > 1) && (a[a_idx] == b[b_idx - 1]) && (a[a_idx - 1] == b[b_idx]) {
// transposition
current[j] = cmp::min(current[j], prev_prev[j - 2] + 1);
}
}
// Rotate the buffers, reusing the memory.
[prev_prev, prev, current] = [prev, current, prev_prev];
}
// `prev` because we already rotated the buffers.
let distance = prev[b.len()];
(distance <= limit).then_some(distance)
}
/// Provides a word similarity score between two words that accounts for substrings being more
/// meaningful than a typical edit distance. The lower the score, the closer the match. 0 is an
/// identical match.
///
/// Uses the edit distance between the two strings and removes the cost of the length difference.
/// If this is 0 then it is either a substring match or a full word match, in the substring match
/// case we detect this and return `1`. To prevent finding meaningless substrings, eg. "in" in
/// "shrink", we only perform this subtraction of length difference if one of the words is not
/// greater than twice the length of the other. For cases where the words are close in size but not
/// an exact substring then the cost of the length difference is discounted by half.
///
/// Returns `None` if the distance exceeds the limit.
pub fn edit_distance_with_substrings(a: &str, b: &str, limit: usize) -> Option<usize> {
let n = a.chars().count();
let m = b.chars().count();
// Check one isn't less than half the length of the other. If this is true then there is a
// big difference in length.
let big_len_diff = (n * 2) < m || (m * 2) < n;
let len_diff = if n < m { m - n } else { n - m };
let distance = edit_distance(a, b, limit + len_diff)?;
// This is the crux, subtracting length difference means exact substring matches will now be 0
let score = distance - len_diff;
// If the score is 0 but the words have different lengths then it's a substring match not a full
// word match
let score = if score == 0 && len_diff > 0 && !big_len_diff {
1 // Exact substring match, but not a total word match so return non-zero
} else if !big_len_diff {
// Not a big difference in length, discount cost of length difference
score + (len_diff + 1) / 2
} else {
// A big difference in length, add back the difference in length to the score
score + len_diff
};
(score <= limit).then_some(score)
}
/// Finds the best match for given word in the given iterator where substrings are meaningful.
///
/// A version of [`find_best_match_for_name`] that uses [`edit_distance_with_substrings`] as the
/// score for word similarity. This takes an optional distance limit which defaults to one-third of
/// the given word.
///
/// We use case insensitive comparison to improve accuracy on an edge case with a lower(upper)case
/// letters mismatch.
pub fn find_best_match_for_name_with_substrings(
candidates: &[Symbol],
lookup: Symbol,
dist: Option<usize>,
) -> Option<Symbol> {
find_best_match_for_name_impl(true, candidates, lookup, dist)
}
/// Finds the best match for a given word in the given iterator.
///
/// As a loose rule to avoid the obviously incorrect suggestions, it takes
/// an optional limit for the maximum allowable edit distance, which defaults
/// to one-third of the given word.
///
/// We use case insensitive comparison to improve accuracy on an edge case with a lower(upper)case
/// letters mismatch.
pub fn find_best_match_for_name(
candidates: &[Symbol],
lookup: Symbol,
dist: Option<usize>,
) -> Option<Symbol> {
find_best_match_for_name_impl(false, candidates, lookup, dist)
}
#[cold]
fn find_best_match_for_name_impl(
use_substring_score: bool,
candidates: &[Symbol],
lookup: Symbol,
dist: Option<usize>,
) -> Option<Symbol> {
let lookup = lookup.as_str();
let lookup_uppercase = lookup.to_uppercase();
// Priority of matches:
// 1. Exact case insensitive match
// 2. Edit distance match
// 3. Sorted word match
if let Some(c) = candidates.iter().find(|c| c.as_str().to_uppercase() == lookup_uppercase) {
return Some(*c);
}
let mut dist = dist.unwrap_or_else(|| cmp::max(lookup.len(), 3) / 3);
let mut best = None;
for c in candidates {
match if use_substring_score {
edit_distance_with_substrings(lookup, c.as_str(), dist)
} else {
edit_distance(lookup, c.as_str(), dist)
} {
Some(0) => return Some(*c),
Some(d) => {
dist = d - 1;
best = Some(*c);
}
None => {}
}
}
if best.is_some() {
return best;
}
find_match_by_sorted_words(candidates, lookup)
}
fn find_match_by_sorted_words(iter_names: &[Symbol], lookup: &str) -> Option<Symbol> {
iter_names.iter().fold(None, |result, candidate| {
if sort_by_words(candidate.as_str()) == sort_by_words(lookup) {
Some(*candidate)
} else {
result
}
})
}
fn sort_by_words(name: &str) -> String {
let mut split_words: Vec<&str> = name.split('_').collect();
// We are sorting primitive &strs and can use unstable sort here.
split_words.sort_unstable();
split_words.join("_")
}

View File

@ -0,0 +1,80 @@
use super::*;
#[test]
fn test_edit_distance() {
// Test bytelength agnosticity
for c in (0..char::MAX as u32).filter_map(char::from_u32).map(|i| i.to_string()) {
assert_eq!(edit_distance(&c[..], &c[..], usize::MAX), Some(0));
}
let a = "\nMäry häd ä little lämb\n\nLittle lämb\n";
let b = "\nMary häd ä little lämb\n\nLittle lämb\n";
let c = "Mary häd ä little lämb\n\nLittle lämb\n";
assert_eq!(edit_distance(a, b, usize::MAX), Some(1));
assert_eq!(edit_distance(b, a, usize::MAX), Some(1));
assert_eq!(edit_distance(a, c, usize::MAX), Some(2));
assert_eq!(edit_distance(c, a, usize::MAX), Some(2));
assert_eq!(edit_distance(b, c, usize::MAX), Some(1));
assert_eq!(edit_distance(c, b, usize::MAX), Some(1));
}
#[test]
fn test_edit_distance_limit() {
assert_eq!(edit_distance("abc", "abcd", 1), Some(1));
assert_eq!(edit_distance("abc", "abcd", 0), None);
assert_eq!(edit_distance("abc", "xyz", 3), Some(3));
assert_eq!(edit_distance("abc", "xyz", 2), None);
}
#[test]
fn test_method_name_similarity_score() {
assert_eq!(edit_distance_with_substrings("empty", "is_empty", 1), Some(1));
assert_eq!(edit_distance_with_substrings("shrunk", "rchunks", 2), None);
assert_eq!(edit_distance_with_substrings("abc", "abcd", 1), Some(1));
assert_eq!(edit_distance_with_substrings("a", "abcd", 1), None);
assert_eq!(edit_distance_with_substrings("edf", "eq", 1), None);
assert_eq!(edit_distance_with_substrings("abc", "xyz", 3), Some(3));
assert_eq!(edit_distance_with_substrings("abcdef", "abcdef", 2), Some(0));
}
#[test]
fn test_find_best_match_for_name() {
use crate::create_default_session_globals_then;
create_default_session_globals_then(|| {
let input = vec![Symbol::intern("aaab"), Symbol::intern("aaabc")];
assert_eq!(
find_best_match_for_name(&input, Symbol::intern("aaaa"), None),
Some(Symbol::intern("aaab"))
);
assert_eq!(find_best_match_for_name(&input, Symbol::intern("1111111111"), None), None);
let input = vec![Symbol::intern("AAAA")];
assert_eq!(
find_best_match_for_name(&input, Symbol::intern("aaaa"), None),
Some(Symbol::intern("AAAA"))
);
let input = vec![Symbol::intern("AAAA")];
assert_eq!(
find_best_match_for_name(&input, Symbol::intern("aaaa"), Some(4)),
Some(Symbol::intern("AAAA"))
);
let input = vec![Symbol::intern("a_longer_variable_name")];
assert_eq!(
find_best_match_for_name(&input, Symbol::intern("a_variable_longer_name"), None),
Some(Symbol::intern("a_longer_variable_name"))
);
})
}
#[test]
fn test_precise_algorithm() {
// Not Levenshtein distance.
assert_ne!(edit_distance("ab", "ba", usize::MAX), Some(2));
// Not unrestricted Damerau-Levenshtein distance.
assert_ne!(edit_distance("abde", "bcaed", usize::MAX), Some(3));
// The current implementation is a restricted Damerau-Levenshtein distance.
assert_eq!(edit_distance("abde", "bcaed", usize::MAX), Some(4));
}

View File

@ -1,177 +0,0 @@
//! Levenshtein distances.
//!
//! The [Levenshtein distance] is a metric for measuring the difference between two strings.
//!
//! [Levenshtein distance]: https://en.wikipedia.org/wiki/Levenshtein_distance
use crate::symbol::Symbol;
use std::cmp;
#[cfg(test)]
mod tests;
/// Finds the Levenshtein distance between two strings.
///
/// Returns None if the distance exceeds the limit.
pub fn lev_distance(a: &str, b: &str, limit: usize) -> Option<usize> {
let n = a.chars().count();
let m = b.chars().count();
let min_dist = if n < m { m - n } else { n - m };
if min_dist > limit {
return None;
}
if n == 0 || m == 0 {
return (min_dist <= limit).then_some(min_dist);
}
let mut dcol: Vec<_> = (0..=m).collect();
for (i, sc) in a.chars().enumerate() {
let mut current = i;
dcol[0] = current + 1;
for (j, tc) in b.chars().enumerate() {
let next = dcol[j + 1];
if sc == tc {
dcol[j + 1] = current;
} else {
dcol[j + 1] = cmp::min(current, next);
dcol[j + 1] = cmp::min(dcol[j + 1], dcol[j]) + 1;
}
current = next;
}
}
(dcol[m] <= limit).then_some(dcol[m])
}
/// Provides a word similarity score between two words that accounts for substrings being more
/// meaningful than a typical Levenshtein distance. The lower the score, the closer the match.
/// 0 is an identical match.
///
/// Uses the Levenshtein distance between the two strings and removes the cost of the length
/// difference. If this is 0 then it is either a substring match or a full word match, in the
/// substring match case we detect this and return `1`. To prevent finding meaningless substrings,
/// eg. "in" in "shrink", we only perform this subtraction of length difference if one of the words
/// is not greater than twice the length of the other. For cases where the words are close in size
/// but not an exact substring then the cost of the length difference is discounted by half.
///
/// Returns `None` if the distance exceeds the limit.
pub fn lev_distance_with_substrings(a: &str, b: &str, limit: usize) -> Option<usize> {
let n = a.chars().count();
let m = b.chars().count();
// Check one isn't less than half the length of the other. If this is true then there is a
// big difference in length.
let big_len_diff = (n * 2) < m || (m * 2) < n;
let len_diff = if n < m { m - n } else { n - m };
let lev = lev_distance(a, b, limit + len_diff)?;
// This is the crux, subtracting length difference means exact substring matches will now be 0
let score = lev - len_diff;
// If the score is 0 but the words have different lengths then it's a substring match not a full
// word match
let score = if score == 0 && len_diff > 0 && !big_len_diff {
1 // Exact substring match, but not a total word match so return non-zero
} else if !big_len_diff {
// Not a big difference in length, discount cost of length difference
score + (len_diff + 1) / 2
} else {
// A big difference in length, add back the difference in length to the score
score + len_diff
};
(score <= limit).then_some(score)
}
/// Finds the best match for given word in the given iterator where substrings are meaningful.
///
/// A version of [`find_best_match_for_name`] that uses [`lev_distance_with_substrings`] as the score
/// for word similarity. This takes an optional distance limit which defaults to one-third of the
/// given word.
///
/// Besides the modified Levenshtein, we use case insensitive comparison to improve accuracy
/// on an edge case with a lower(upper)case letters mismatch.
pub fn find_best_match_for_name_with_substrings(
candidates: &[Symbol],
lookup: Symbol,
dist: Option<usize>,
) -> Option<Symbol> {
find_best_match_for_name_impl(true, candidates, lookup, dist)
}
/// Finds the best match for a given word in the given iterator.
///
/// As a loose rule to avoid the obviously incorrect suggestions, it takes
/// an optional limit for the maximum allowable edit distance, which defaults
/// to one-third of the given word.
///
/// Besides Levenshtein, we use case insensitive comparison to improve accuracy
/// on an edge case with a lower(upper)case letters mismatch.
pub fn find_best_match_for_name(
candidates: &[Symbol],
lookup: Symbol,
dist: Option<usize>,
) -> Option<Symbol> {
find_best_match_for_name_impl(false, candidates, lookup, dist)
}
#[cold]
fn find_best_match_for_name_impl(
use_substring_score: bool,
candidates: &[Symbol],
lookup: Symbol,
dist: Option<usize>,
) -> Option<Symbol> {
let lookup = lookup.as_str();
let lookup_uppercase = lookup.to_uppercase();
// Priority of matches:
// 1. Exact case insensitive match
// 2. Levenshtein distance match
// 3. Sorted word match
if let Some(c) = candidates.iter().find(|c| c.as_str().to_uppercase() == lookup_uppercase) {
return Some(*c);
}
let mut dist = dist.unwrap_or_else(|| cmp::max(lookup.len(), 3) / 3);
let mut best = None;
for c in candidates {
match if use_substring_score {
lev_distance_with_substrings(lookup, c.as_str(), dist)
} else {
lev_distance(lookup, c.as_str(), dist)
} {
Some(0) => return Some(*c),
Some(d) => {
dist = d - 1;
best = Some(*c);
}
None => {}
}
}
if best.is_some() {
return best;
}
find_match_by_sorted_words(candidates, lookup)
}
fn find_match_by_sorted_words(iter_names: &[Symbol], lookup: &str) -> Option<Symbol> {
iter_names.iter().fold(None, |result, candidate| {
if sort_by_words(candidate.as_str()) == sort_by_words(lookup) {
Some(*candidate)
} else {
result
}
})
}
fn sort_by_words(name: &str) -> String {
let mut split_words: Vec<&str> = name.split('_').collect();
// We are sorting primitive &strs and can use unstable sort here.
split_words.sort_unstable();
split_words.join("_")
}

View File

@ -1,70 +0,0 @@
use super::*;
#[test]
fn test_lev_distance() {
// Test bytelength agnosticity
for c in (0..char::MAX as u32).filter_map(char::from_u32).map(|i| i.to_string()) {
assert_eq!(lev_distance(&c[..], &c[..], usize::MAX), Some(0));
}
let a = "\nMäry häd ä little lämb\n\nLittle lämb\n";
let b = "\nMary häd ä little lämb\n\nLittle lämb\n";
let c = "Mary häd ä little lämb\n\nLittle lämb\n";
assert_eq!(lev_distance(a, b, usize::MAX), Some(1));
assert_eq!(lev_distance(b, a, usize::MAX), Some(1));
assert_eq!(lev_distance(a, c, usize::MAX), Some(2));
assert_eq!(lev_distance(c, a, usize::MAX), Some(2));
assert_eq!(lev_distance(b, c, usize::MAX), Some(1));
assert_eq!(lev_distance(c, b, usize::MAX), Some(1));
}
#[test]
fn test_lev_distance_limit() {
assert_eq!(lev_distance("abc", "abcd", 1), Some(1));
assert_eq!(lev_distance("abc", "abcd", 0), None);
assert_eq!(lev_distance("abc", "xyz", 3), Some(3));
assert_eq!(lev_distance("abc", "xyz", 2), None);
}
#[test]
fn test_method_name_similarity_score() {
assert_eq!(lev_distance_with_substrings("empty", "is_empty", 1), Some(1));
assert_eq!(lev_distance_with_substrings("shrunk", "rchunks", 2), None);
assert_eq!(lev_distance_with_substrings("abc", "abcd", 1), Some(1));
assert_eq!(lev_distance_with_substrings("a", "abcd", 1), None);
assert_eq!(lev_distance_with_substrings("edf", "eq", 1), None);
assert_eq!(lev_distance_with_substrings("abc", "xyz", 3), Some(3));
assert_eq!(lev_distance_with_substrings("abcdef", "abcdef", 2), Some(0));
}
#[test]
fn test_find_best_match_for_name() {
use crate::create_default_session_globals_then;
create_default_session_globals_then(|| {
let input = vec![Symbol::intern("aaab"), Symbol::intern("aaabc")];
assert_eq!(
find_best_match_for_name(&input, Symbol::intern("aaaa"), None),
Some(Symbol::intern("aaab"))
);
assert_eq!(find_best_match_for_name(&input, Symbol::intern("1111111111"), None), None);
let input = vec![Symbol::intern("AAAA")];
assert_eq!(
find_best_match_for_name(&input, Symbol::intern("aaaa"), None),
Some(Symbol::intern("AAAA"))
);
let input = vec![Symbol::intern("AAAA")];
assert_eq!(
find_best_match_for_name(&input, Symbol::intern("aaaa"), Some(4)),
Some(Symbol::intern("AAAA"))
);
let input = vec![Symbol::intern("a_longer_variable_name")];
assert_eq!(
find_best_match_for_name(&input, Symbol::intern("a_variable_longer_name"), None),
Some(Symbol::intern("a_longer_variable_name"))
);
})
}

View File

@ -19,6 +19,7 @@
#![feature(negative_impls)]
#![feature(min_specialization)]
#![feature(rustc_attrs)]
#![feature(let_chains)]
#![deny(rustc::untranslatable_diagnostic)]
#![deny(rustc::diagnostic_outside_of_impl)]
@ -46,7 +47,7 @@ pub use hygiene::{ExpnData, ExpnHash, ExpnId, LocalExpnId, SyntaxContext};
use rustc_data_structures::stable_hasher::HashingControls;
pub mod def_id;
use def_id::{CrateNum, DefId, DefPathHash, LocalDefId, LOCAL_CRATE};
pub mod lev_distance;
pub mod edit_distance;
mod span_encoding;
pub use span_encoding::{Span, DUMMY_SP};

View File

@ -2,7 +2,9 @@ warning: unexpected `cfg` condition value
--> $DIR/invalid-cfg-value.rs:7:7
|
LL | #[cfg(feature = "sedre")]
| ^^^^^^^^^^^^^^^^^
| ^^^^^^^^^^-------
| |
| help: did you mean: `"serde"`
|
= note: expected values for `feature` are: full, serde
= note: `#[warn(unexpected_cfgs)]` on by default

View File

@ -0,0 +1,6 @@
// https://internals.rust-lang.org/t/18227
fn main() {
prinltn!(); //~ ERROR cannot find macro `prinltn` in this scope
//^ a macro with a similar name exists: `println`
}

View File

@ -0,0 +1,11 @@
error: cannot find macro `prinltn` in this scope
--> $DIR/println-typo.rs:4:5
|
LL | prinltn!();
| ^^^^^^^ help: a macro with a similar name exists: `println`
--> $SRC_DIR/std/src/macros.rs:LL:COL
|
= note: similarly named macro `println` defined here
error: aborting due to previous error