spellcheck: support transpositions aka Damerau-Levenshtein (PR other/69968)
gcc/fortran/ChangeLog: PR other/69968 * misc.c (gfc_closest_fuzzy_match): Update for renaming of levenshtein_distance to get_edit_distance. gcc/ChangeLog: PR other/69968 * spellcheck-tree.c (levenshtein_distance): Rename to... (get_edit_distance): ...this, and update for underlying renaming. * spellcheck-tree.h (levenshtein_distance): Rename to... (get_edit_distance): ...this. * spellcheck.c (levenshtein_distance): Rename to... (get_edit_distance): ...this. Convert from Levenshtein distance to Damerau-Levenshtein distance by supporting transpositions of adjacent characters. Rename "v1" to "v_next" and "v0" to "v_one_ago". (selftest::levenshtein_distance_unit_test_oneway): Rename to... (selftest::test_edit_distance_unit_test_oneway): ...this, and update for underlying renaming. (selftest::levenshtein_distance_unit_test): Rename to... (selftest::test_get_edit_distance_unit): ...this, and update for underlying renaming. (selftest::test_find_closest_string): Add example from PR 69968 where transposition helps (selftest::test_metric_conditions): Update for renaming. (selftest::test_metric_conditions): Likewise. (selftest::spellcheck_c_tests): Likewise. * spellcheck.h (levenshtein_distance): Rename both overloads to... (get_edit_distance): ...this. (best_match::consider): Update for renaming. gcc/testsuite/ChangeLog: PR other/69968 * gcc.dg/spellcheck-transposition.c: New test. From-SVN: r261521
This commit is contained in:
parent
e3329a782f
commit
b80a188bee
@ -1,3 +1,30 @@
|
||||
2018-06-12 David Malcolm <dmalcolm@redhat.com>
|
||||
|
||||
PR other/69968
|
||||
* spellcheck-tree.c (levenshtein_distance): Rename to...
|
||||
(get_edit_distance): ...this, and update for underlying renaming.
|
||||
* spellcheck-tree.h (levenshtein_distance): Rename to...
|
||||
(get_edit_distance): ...this.
|
||||
* spellcheck.c (levenshtein_distance): Rename to...
|
||||
(get_edit_distance): ...this. Convert from Levenshtein distance
|
||||
to Damerau-Levenshtein distance by supporting transpositions of
|
||||
adjacent characters. Rename "v1" to "v_next" and "v0" to
|
||||
"v_one_ago".
|
||||
(selftest::levenshtein_distance_unit_test_oneway): Rename to...
|
||||
(selftest::test_edit_distance_unit_test_oneway): ...this, and
|
||||
update for underlying renaming.
|
||||
(selftest::levenshtein_distance_unit_test): Rename to...
|
||||
(selftest::test_get_edit_distance_unit): ...this, and update for
|
||||
underlying renaming.
|
||||
(selftest::test_find_closest_string): Add example from PR 69968
|
||||
where transposition helps
|
||||
(selftest::test_metric_conditions): Update for renaming.
|
||||
(selftest::test_metric_conditions): Likewise.
|
||||
(selftest::spellcheck_c_tests): Likewise.
|
||||
* spellcheck.h (levenshtein_distance): Rename both overloads to...
|
||||
(get_edit_distance): ...this.
|
||||
(best_match::consider): Update for renaming.
|
||||
|
||||
2018-06-12 Martin Sebor <msebor@redhat.com>
|
||||
|
||||
PR tree-optimization/85259
|
||||
|
@ -1,3 +1,9 @@
|
||||
2018-06-12 David Malcolm <dmalcolm@redhat.com>
|
||||
|
||||
PR other/69968
|
||||
* misc.c (gfc_closest_fuzzy_match): Update for renaming of
|
||||
levenshtein_distance to get_edit_distance.
|
||||
|
||||
2018-06-12 Steven G. Kargl <kargl@gcc.gnu.org>
|
||||
|
||||
PR fortran/44491
|
||||
|
@ -286,7 +286,7 @@ get_c_kind(const char *c_kind_name, CInteropKind_t kinds_table[])
|
||||
|
||||
|
||||
/* For a given name TYPO, determine the best candidate from CANDIDATES
|
||||
perusing Levenshtein distance. Frees CANDIDATES before returning. */
|
||||
using get_edit_distance. Frees CANDIDATES before returning. */
|
||||
|
||||
const char *
|
||||
gfc_closest_fuzzy_match (const char *typo, char **candidates)
|
||||
@ -299,7 +299,7 @@ gfc_closest_fuzzy_match (const char *typo, char **candidates)
|
||||
|
||||
while (cand && *cand)
|
||||
{
|
||||
edit_distance_t dist = levenshtein_distance (typo, tl, *cand,
|
||||
edit_distance_t dist = get_edit_distance (typo, tl, *cand,
|
||||
strlen (*cand));
|
||||
if (dist < best_distance)
|
||||
{
|
||||
|
@ -27,18 +27,18 @@ along with GCC; see the file COPYING3. If not see
|
||||
#include "selftest.h"
|
||||
#include "stringpool.h"
|
||||
|
||||
/* Calculate Levenshtein distance between two identifiers. */
|
||||
/* Calculate edit distance between two identifiers. */
|
||||
|
||||
edit_distance_t
|
||||
levenshtein_distance (tree ident_s, tree ident_t)
|
||||
get_edit_distance (tree ident_s, tree ident_t)
|
||||
{
|
||||
gcc_assert (TREE_CODE (ident_s) == IDENTIFIER_NODE);
|
||||
gcc_assert (TREE_CODE (ident_t) == IDENTIFIER_NODE);
|
||||
|
||||
return levenshtein_distance (IDENTIFIER_POINTER (ident_s),
|
||||
IDENTIFIER_LENGTH (ident_s),
|
||||
IDENTIFIER_POINTER (ident_t),
|
||||
IDENTIFIER_LENGTH (ident_t));
|
||||
return get_edit_distance (IDENTIFIER_POINTER (ident_s),
|
||||
IDENTIFIER_LENGTH (ident_s),
|
||||
IDENTIFIER_POINTER (ident_t),
|
||||
IDENTIFIER_LENGTH (ident_t));
|
||||
}
|
||||
|
||||
/* Given TARGET, an identifier, and CANDIDATES, a vec of identifiers,
|
||||
|
@ -25,7 +25,7 @@ along with GCC; see the file COPYING3. If not see
|
||||
/* spellcheck-tree.c */
|
||||
|
||||
extern edit_distance_t
|
||||
levenshtein_distance (tree ident_s, tree ident_t);
|
||||
get_edit_distance (tree ident_s, tree ident_t);
|
||||
|
||||
extern tree
|
||||
find_closest_identifier (tree target, const auto_vec<tree> *candidates);
|
||||
|
143
gcc/spellcheck.c
143
gcc/spellcheck.c
@ -25,15 +25,18 @@ along with GCC; see the file COPYING3. If not see
|
||||
#include "spellcheck.h"
|
||||
#include "selftest.h"
|
||||
|
||||
/* The Levenshtein distance is an "edit-distance": the minimal
|
||||
number of one-character insertions, removals or substitutions
|
||||
that are needed to change one string into another.
|
||||
/* Get the edit distance between the two strings: the minimal
|
||||
number of edits that are needed to change one string into another,
|
||||
where edits can be one-character insertions, removals, or substitutions,
|
||||
or transpositions of two adjacent characters (counting as one "edit").
|
||||
|
||||
This implementation uses the Wagner-Fischer algorithm. */
|
||||
This implementation uses the Wagner-Fischer algorithm for the
|
||||
Damerau-Levenshtein distance; specifically, the "optimal string alignment
|
||||
distance" or "restricted edit distance" variant. */
|
||||
|
||||
edit_distance_t
|
||||
levenshtein_distance (const char *s, int len_s,
|
||||
const char *t, int len_t)
|
||||
get_edit_distance (const char *s, int len_s,
|
||||
const char *t, int len_t)
|
||||
{
|
||||
const bool debug = false;
|
||||
|
||||
@ -49,76 +52,86 @@ levenshtein_distance (const char *s, int len_s,
|
||||
return len_s;
|
||||
|
||||
/* We effectively build a matrix where each (i, j) contains the
|
||||
Levenshtein distance between the prefix strings s[0:j]
|
||||
and t[0:i].
|
||||
distance between the prefix strings s[0:j] and t[0:i].
|
||||
Rather than actually build an (len_t + 1) * (len_s + 1) matrix,
|
||||
we simply keep track of the last row, v0 and a new row, v1,
|
||||
which avoids an (len_t + 1) * (len_s + 1) allocation and memory accesses
|
||||
in favor of two (len_s + 1) allocations. These could potentially be
|
||||
we simply keep track of the last two rows, v_one_ago and v_two_ago,
|
||||
and a new row, v_next, which avoids an (len_t + 1) * (len_s + 1)
|
||||
allocation and memory accesses in favor of three (len_s + 1)
|
||||
allocations. These could potentially be
|
||||
statically-allocated if we impose a maximum length on the
|
||||
strings of interest. */
|
||||
edit_distance_t *v0 = new edit_distance_t[len_s + 1];
|
||||
edit_distance_t *v1 = new edit_distance_t[len_s + 1];
|
||||
edit_distance_t *v_two_ago = new edit_distance_t[len_s + 1];
|
||||
edit_distance_t *v_one_ago = new edit_distance_t[len_s + 1];
|
||||
edit_distance_t *v_next = new edit_distance_t[len_s + 1];
|
||||
|
||||
/* The first row is for the case of an empty target string, which
|
||||
we can reach by deleting every character in the source string. */
|
||||
for (int i = 0; i < len_s + 1; i++)
|
||||
v0[i] = i;
|
||||
v_one_ago[i] = i;
|
||||
|
||||
/* Build successive rows. */
|
||||
for (int i = 0; i < len_t; i++)
|
||||
{
|
||||
if (debug)
|
||||
{
|
||||
printf ("i:%i v0 = ", i);
|
||||
printf ("i:%i v_one_ago = ", i);
|
||||
for (int j = 0; j < len_s + 1; j++)
|
||||
printf ("%i ", v0[j]);
|
||||
printf ("%i ", v_one_ago[j]);
|
||||
printf ("\n");
|
||||
}
|
||||
|
||||
/* The initial column is for the case of an empty source string; we
|
||||
can reach prefixes of the target string of length i
|
||||
by inserting i characters. */
|
||||
v1[0] = i + 1;
|
||||
v_next[0] = i + 1;
|
||||
|
||||
/* Build the rest of the row by considering neighbors to
|
||||
the north, west and northwest. */
|
||||
for (int j = 0; j < len_s; j++)
|
||||
{
|
||||
edit_distance_t cost = (s[j] == t[i] ? 0 : 1);
|
||||
edit_distance_t deletion = v1[j] + 1;
|
||||
edit_distance_t insertion = v0[j + 1] + 1;
|
||||
edit_distance_t substitution = v0[j] + cost;
|
||||
edit_distance_t deletion = v_next[j] + 1;
|
||||
edit_distance_t insertion = v_one_ago[j + 1] + 1;
|
||||
edit_distance_t substitution = v_one_ago[j] + cost;
|
||||
edit_distance_t cheapest = MIN (deletion, insertion);
|
||||
cheapest = MIN (cheapest, substitution);
|
||||
v1[j + 1] = cheapest;
|
||||
if (i > 0 && j > 0 && s[j] == t[i - 1] && s[j - 1] == t[i])
|
||||
{
|
||||
edit_distance_t transposition = v_two_ago[j - 1] + 1;
|
||||
cheapest = MIN (cheapest, transposition);
|
||||
}
|
||||
v_next[j + 1] = cheapest;
|
||||
}
|
||||
|
||||
/* Prepare to move on to next row. */
|
||||
for (int j = 0; j < len_s + 1; j++)
|
||||
v0[j] = v1[j];
|
||||
{
|
||||
v_two_ago[j] = v_one_ago[j];
|
||||
v_one_ago[j] = v_next[j];
|
||||
}
|
||||
}
|
||||
|
||||
if (debug)
|
||||
{
|
||||
printf ("final v1 = ");
|
||||
printf ("final v_next = ");
|
||||
for (int j = 0; j < len_s + 1; j++)
|
||||
printf ("%i ", v1[j]);
|
||||
printf ("%i ", v_next[j]);
|
||||
printf ("\n");
|
||||
}
|
||||
|
||||
edit_distance_t result = v1[len_s];
|
||||
delete[] v0;
|
||||
delete[] v1;
|
||||
edit_distance_t result = v_next[len_s];
|
||||
delete[] v_two_ago;
|
||||
delete[] v_one_ago;
|
||||
delete[] v_next;
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Calculate Levenshtein distance between two nil-terminated strings. */
|
||||
/* Get the edit distance between two nil-terminated strings. */
|
||||
|
||||
edit_distance_t
|
||||
levenshtein_distance (const char *s, const char *t)
|
||||
get_edit_distance (const char *s, const char *t)
|
||||
{
|
||||
return levenshtein_distance (s, strlen (s), t, strlen (t));
|
||||
return get_edit_distance (s, strlen (s), t, strlen (t));
|
||||
}
|
||||
|
||||
/* Given TARGET, a non-NULL string, and CANDIDATES, a non-NULL ptr to
|
||||
@ -155,29 +168,28 @@ namespace selftest {
|
||||
|
||||
/* Selftests. */
|
||||
|
||||
/* Verify that the levenshtein_distance (A, B) equals the expected
|
||||
value. */
|
||||
/* Verify that get_edit_distance (A, B) equals the expected value. */
|
||||
|
||||
static void
|
||||
levenshtein_distance_unit_test_oneway (const char *a, const char *b,
|
||||
edit_distance_t expected)
|
||||
test_edit_distance_unit_test_oneway (const char *a, const char *b,
|
||||
edit_distance_t expected)
|
||||
{
|
||||
edit_distance_t actual = levenshtein_distance (a, b);
|
||||
edit_distance_t actual = get_edit_distance (a, b);
|
||||
ASSERT_EQ (actual, expected);
|
||||
}
|
||||
|
||||
/* Verify that both
|
||||
levenshtein_distance (A, B)
|
||||
get_edit_distance (A, B)
|
||||
and
|
||||
levenshtein_distance (B, A)
|
||||
get_edit_distance (B, A)
|
||||
equal the expected value, to ensure that the function is symmetric. */
|
||||
|
||||
static void
|
||||
levenshtein_distance_unit_test (const char *a, const char *b,
|
||||
edit_distance_t expected)
|
||||
test_get_edit_distance_unit (const char *a, const char *b,
|
||||
edit_distance_t expected)
|
||||
{
|
||||
levenshtein_distance_unit_test_oneway (a, b, expected);
|
||||
levenshtein_distance_unit_test_oneway (b, a, expected);
|
||||
test_edit_distance_unit_test_oneway (a, b, expected);
|
||||
test_edit_distance_unit_test_oneway (b, a, expected);
|
||||
}
|
||||
|
||||
/* Verify that find_closest_string is sane. */
|
||||
@ -215,6 +227,16 @@ test_find_closest_string ()
|
||||
it as a suggestion will be nonsensical. Verify that we don't offer such
|
||||
suggestions. */
|
||||
ASSERT_EQ (NULL, find_closest_string ("banana", &candidates));
|
||||
|
||||
/* Example from PR 69968 where transposition helps. */
|
||||
candidates.truncate (0);
|
||||
candidates.safe_push("coordx");
|
||||
candidates.safe_push("coordy");
|
||||
candidates.safe_push("coordz");
|
||||
candidates.safe_push("coordx1");
|
||||
candidates.safe_push("coordy1");
|
||||
candidates.safe_push("coordz1");
|
||||
ASSERT_STREQ ("coordz1", find_closest_string ("coorzd1", &candidates));
|
||||
}
|
||||
|
||||
/* Test data for test_metric_conditions. */
|
||||
@ -227,7 +249,7 @@ static const char * const test_data[] = {
|
||||
"1234567890123456789012345678901234567890123456789012345678901234567890"
|
||||
};
|
||||
|
||||
/* Verify that levenshtein_distance appears to be a sane distance function,
|
||||
/* Verify that get_edit_distance appears to be a sane distance function,
|
||||
i.e. the conditions for being a metric. This is done directly for a
|
||||
small set of examples, using test_data above. This is O(N^3) in the size
|
||||
of the array, due to the test for the triangle inequality, so we keep the
|
||||
@ -243,7 +265,7 @@ test_metric_conditions ()
|
||||
for (int j = 0; j < num_test_cases; j++)
|
||||
{
|
||||
edit_distance_t dist_ij
|
||||
= levenshtein_distance (test_data[i], test_data[j]);
|
||||
= get_edit_distance (test_data[i], test_data[j]);
|
||||
|
||||
/* Identity of indiscernibles: d(i, j) > 0 iff i == j. */
|
||||
if (i == j)
|
||||
@ -253,43 +275,54 @@ test_metric_conditions ()
|
||||
|
||||
/* Symmetry: d(i, j) == d(j, i). */
|
||||
edit_distance_t dist_ji
|
||||
= levenshtein_distance (test_data[j], test_data[i]);
|
||||
= get_edit_distance (test_data[j], test_data[i]);
|
||||
ASSERT_EQ (dist_ij, dist_ji);
|
||||
|
||||
/* Triangle inequality. */
|
||||
for (int k = 0; k < num_test_cases; k++)
|
||||
{
|
||||
edit_distance_t dist_ik
|
||||
= levenshtein_distance (test_data[i], test_data[k]);
|
||||
= get_edit_distance (test_data[i], test_data[k]);
|
||||
edit_distance_t dist_jk
|
||||
= levenshtein_distance (test_data[j], test_data[k]);
|
||||
= get_edit_distance (test_data[j], test_data[k]);
|
||||
ASSERT_TRUE (dist_ik <= dist_ij + dist_jk);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Verify levenshtein_distance for a variety of pairs of pre-canned
|
||||
/* Verify get_edit_distance for a variety of pairs of pre-canned
|
||||
inputs, comparing against known-good values. */
|
||||
|
||||
void
|
||||
spellcheck_c_tests ()
|
||||
{
|
||||
levenshtein_distance_unit_test ("", "nonempty", strlen ("nonempty"));
|
||||
levenshtein_distance_unit_test ("saturday", "sunday", 3);
|
||||
levenshtein_distance_unit_test ("foo", "m_foo", 2);
|
||||
levenshtein_distance_unit_test ("hello_world", "HelloWorld", 3);
|
||||
levenshtein_distance_unit_test
|
||||
test_get_edit_distance_unit ("", "nonempty", strlen ("nonempty"));
|
||||
test_get_edit_distance_unit ("saturday", "sunday", 3);
|
||||
test_get_edit_distance_unit ("foo", "m_foo", 2);
|
||||
test_get_edit_distance_unit ("hello_world", "HelloWorld", 3);
|
||||
test_get_edit_distance_unit
|
||||
("the quick brown fox jumps over the lazy dog", "dog", 40);
|
||||
levenshtein_distance_unit_test
|
||||
test_get_edit_distance_unit
|
||||
("the quick brown fox jumps over the lazy dog",
|
||||
"the quick brown dog jumps over the lazy fox",
|
||||
4);
|
||||
levenshtein_distance_unit_test
|
||||
test_get_edit_distance_unit
|
||||
("Lorem ipsum dolor sit amet, consectetur adipiscing elit,",
|
||||
"All your base are belong to us",
|
||||
44);
|
||||
levenshtein_distance_unit_test ("foo", "FOO", 3);
|
||||
test_get_edit_distance_unit ("foo", "FOO", 3);
|
||||
test_get_edit_distance_unit ("fee", "deed", 2);
|
||||
test_get_edit_distance_unit ("coorzd1", "coordx1", 2);
|
||||
|
||||
/* Examples where transposition helps. */
|
||||
test_get_edit_distance_unit ("ab", "ba", 1);
|
||||
test_get_edit_distance_unit ("ba", "abc", 2);
|
||||
test_get_edit_distance_unit ("coorzd1", "coordz1", 1);
|
||||
test_get_edit_distance_unit ("abcdefghijklmnopqrstuvwxyz",
|
||||
"bacdefghijklmnopqrstuvwxzy", 2);
|
||||
test_get_edit_distance_unit ("saturday", "sundya", 4);
|
||||
test_get_edit_distance_unit ("signed", "singed", 1);
|
||||
|
||||
test_find_closest_string ();
|
||||
test_metric_conditions ();
|
||||
|
@ -25,11 +25,11 @@ const edit_distance_t MAX_EDIT_DISTANCE = UINT_MAX;
|
||||
|
||||
/* spellcheck.c */
|
||||
extern edit_distance_t
|
||||
levenshtein_distance (const char *s, int len_s,
|
||||
const char *t, int len_t);
|
||||
get_edit_distance (const char *s, int len_s,
|
||||
const char *t, int len_t);
|
||||
|
||||
extern edit_distance_t
|
||||
levenshtein_distance (const char *s, const char *t);
|
||||
get_edit_distance (const char *s, const char *t);
|
||||
|
||||
extern const char *
|
||||
find_closest_string (const char *target,
|
||||
@ -73,7 +73,7 @@ struct edit_distance_traits<const char *>
|
||||
|
||||
This type accumulates the best possible match against GOAL_TYPE for
|
||||
a sequence of elements of CANDIDATE_TYPE, whilst minimizing the
|
||||
number of calls to levenshtein_distance and to
|
||||
number of calls to get_edit_distance and to
|
||||
edit_distance_traits<T>::get_length. */
|
||||
|
||||
template <typename GOAL_TYPE, typename CANDIDATE_TYPE>
|
||||
@ -126,9 +126,9 @@ class best_match
|
||||
/* Otherwise, compute the distance and see if the candidate
|
||||
has beaten the previous best value. */
|
||||
edit_distance_t dist
|
||||
= levenshtein_distance (m_goal, m_goal_len,
|
||||
candidate_traits::get_string (candidate),
|
||||
candidate_len);
|
||||
= get_edit_distance (m_goal, m_goal_len,
|
||||
candidate_traits::get_string (candidate),
|
||||
candidate_len);
|
||||
if (dist < m_best_distance)
|
||||
{
|
||||
m_best_distance = dist;
|
||||
|
@ -1,3 +1,8 @@
|
||||
2018-06-12 David Malcolm <dmalcolm@redhat.com>
|
||||
|
||||
PR other/69968
|
||||
* gcc.dg/spellcheck-transposition.c: New test.
|
||||
|
||||
2018-06-12 Steven G. Kargl <kargl@gcc.gnu.org>
|
||||
|
||||
PR fortran/44491
|
||||
|
20
gcc/testsuite/gcc.dg/spellcheck-transposition.c
Normal file
20
gcc/testsuite/gcc.dg/spellcheck-transposition.c
Normal file
@ -0,0 +1,20 @@
|
||||
/* PR other/69968. */
|
||||
|
||||
struct {
|
||||
int coordx, coordy, coordz;
|
||||
int coordx1, coordy1, coordz1;
|
||||
} c;
|
||||
|
||||
/* Consider the misspelling "coorzd1".
|
||||
|
||||
With Levenshtein distance, the misspelling has an edit distance of 2
|
||||
to all 6 of the fields (e.g. via a deletion and a substitution for the
|
||||
first three, and via deletion and insertion for the second three).
|
||||
|
||||
With Damerau-Levenshtein, the misspelling has an edit distance of 1
|
||||
via transposition to "coordz1", and 2 to the other fields. */
|
||||
|
||||
void foo (void)
|
||||
{
|
||||
c.coorzd1 = c.coordy; /* { dg-error "has no member named 'coorzd1'; did you mean 'coordz1'" } */
|
||||
}
|
Loading…
Reference in New Issue
Block a user