Implement Levenshtein distance; use in C FE for misspelled field names
This is the combination of: [PATCH 1/2] Implement Levenshtein distance [PATCH 2/2] C FE: suggest corrections for misspelled field names plus some nit fixes in spellcheck.c. gcc/ChangeLog: * Makefile.in (OBJS): Add spellcheck.o. * spellcheck.c: New file. * spellcheck.h: New file. gcc/c/ChangeLog: * c-typeck.c: Include spellcheck.h. (lookup_field_fuzzy_find_candidates): New function. (lookup_field_fuzzy): New function. (build_component_ref): If the field was not found, try using lookup_field_fuzzy and potentially offer a suggestion. gcc/testsuite/ChangeLog: * gcc.dg/plugin/levenshtein-test-1.c: New file. * gcc.dg/plugin/levenshtein_plugin.c: New file. * gcc.dg/plugin/plugin.exp (plugin_test_list): Add levenshtein_plugin.c. * gcc.dg/spellcheck-fields.c: New file. From-SVN: r230284
This commit is contained in:
parent
6d82151f8c
commit
277fe61691
|
@ -1,3 +1,9 @@
|
|||
2015-11-13 David Malcolm <dmalcolm@redhat.com>
|
||||
|
||||
* Makefile.in (OBJS): Add spellcheck.o.
|
||||
* spellcheck.c: New file.
|
||||
* spellcheck.h: New file.
|
||||
|
||||
2015-11-13 James Bowman <james.bowman@ftdichip.com>
|
||||
|
||||
* config/ft32/ft32.md (*sne): New insn pattern.
|
||||
|
|
|
@ -1403,6 +1403,7 @@ OBJS = \
|
|||
shrink-wrap.o \
|
||||
simplify-rtx.o \
|
||||
sparseset.o \
|
||||
spellcheck.o \
|
||||
sreal.o \
|
||||
stack-ptr-mod.o \
|
||||
statistics.o \
|
||||
|
|
|
@ -1,3 +1,11 @@
|
|||
2015-11-13 David Malcolm <dmalcolm@redhat.com>
|
||||
|
||||
* c-typeck.c: Include spellcheck.h.
|
||||
(lookup_field_fuzzy_find_candidates): New function.
|
||||
(lookup_field_fuzzy): New function.
|
||||
(build_component_ref): If the field was not found, try using
|
||||
lookup_field_fuzzy and potentially offer a suggestion.
|
||||
|
||||
2015-11-12 James Norris <jnorris@codesourcery.com>
|
||||
Joseph Myers <joseph@codesourcery.com>
|
||||
|
||||
|
|
|
@ -47,6 +47,7 @@ along with GCC; see the file COPYING3. If not see
|
|||
#include "c-family/c-ubsan.h"
|
||||
#include "cilk.h"
|
||||
#include "gomp-constants.h"
|
||||
#include "spellcheck.h"
|
||||
|
||||
/* Possible cases of implicit bad conversions. Used to select
|
||||
diagnostic messages in convert_for_assignment. */
|
||||
|
@ -2242,6 +2243,72 @@ lookup_field (tree type, tree component)
|
|||
return tree_cons (NULL_TREE, field, NULL_TREE);
|
||||
}
|
||||
|
||||
/* Recursively append candidate IDENTIFIER_NODEs to CANDIDATES. */
|
||||
|
||||
static void
|
||||
lookup_field_fuzzy_find_candidates (tree type, tree component,
|
||||
vec<tree> *candidates)
|
||||
{
|
||||
tree field;
|
||||
for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
|
||||
{
|
||||
if (DECL_NAME (field) == NULL_TREE
|
||||
&& (TREE_CODE (TREE_TYPE (field)) == RECORD_TYPE
|
||||
|| TREE_CODE (TREE_TYPE (field)) == UNION_TYPE))
|
||||
{
|
||||
lookup_field_fuzzy_find_candidates (TREE_TYPE (field),
|
||||
component,
|
||||
candidates);
|
||||
}
|
||||
|
||||
if (DECL_NAME (field))
|
||||
candidates->safe_push (DECL_NAME (field));
|
||||
}
|
||||
}
|
||||
|
||||
/* Like "lookup_field", but find the closest matching IDENTIFIER_NODE,
|
||||
rather than returning a TREE_LIST for an exact match. */
|
||||
|
||||
static tree
|
||||
lookup_field_fuzzy (tree type, tree component)
|
||||
{
|
||||
gcc_assert (TREE_CODE (component) == IDENTIFIER_NODE);
|
||||
|
||||
/* First, gather a list of candidates. */
|
||||
auto_vec <tree> candidates;
|
||||
|
||||
lookup_field_fuzzy_find_candidates (type, component,
|
||||
&candidates);
|
||||
|
||||
/* Now determine which is closest. */
|
||||
int i;
|
||||
tree identifier;
|
||||
tree best_identifier = NULL;
|
||||
edit_distance_t best_distance = MAX_EDIT_DISTANCE;
|
||||
FOR_EACH_VEC_ELT (candidates, i, identifier)
|
||||
{
|
||||
gcc_assert (TREE_CODE (identifier) == IDENTIFIER_NODE);
|
||||
edit_distance_t dist = levenshtein_distance (component, identifier);
|
||||
if (dist < best_distance)
|
||||
{
|
||||
best_distance = dist;
|
||||
best_identifier = identifier;
|
||||
}
|
||||
}
|
||||
|
||||
/* If more than half of the letters were misspelled, the suggestion is
|
||||
likely to be meaningless. */
|
||||
if (best_identifier)
|
||||
{
|
||||
unsigned int cutoff = MAX (IDENTIFIER_LENGTH (component),
|
||||
IDENTIFIER_LENGTH (best_identifier)) / 2;
|
||||
if (best_distance > cutoff)
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return best_identifier;
|
||||
}
|
||||
|
||||
/* Make an expression to refer to the COMPONENT field of structure or
|
||||
union value DATUM. COMPONENT is an IDENTIFIER_NODE. LOC is the
|
||||
location of the COMPONENT_REF. */
|
||||
|
@ -2277,7 +2344,12 @@ build_component_ref (location_t loc, tree datum, tree component)
|
|||
|
||||
if (!field)
|
||||
{
|
||||
error_at (loc, "%qT has no member named %qE", type, component);
|
||||
tree guessed_id = lookup_field_fuzzy (type, component);
|
||||
if (guessed_id)
|
||||
error_at (loc, "%qT has no member named %qE; did you mean %qE?",
|
||||
type, component, guessed_id);
|
||||
else
|
||||
error_at (loc, "%qT has no member named %qE", type, component);
|
||||
return error_mark_node;
|
||||
}
|
||||
|
||||
|
|
|
@ -0,0 +1,136 @@
|
|||
/* Find near-matches for strings and identifiers.
|
||||
Copyright (C) 2015 Free Software Foundation, Inc.
|
||||
|
||||
This file is part of GCC.
|
||||
|
||||
GCC is free software; you can redistribute it and/or modify it under
|
||||
the terms of the GNU General Public License as published by the Free
|
||||
Software Foundation; either version 3, or (at your option) any later
|
||||
version.
|
||||
|
||||
GCC is distributed in the hope that it will be useful, but WITHOUT ANY
|
||||
WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with GCC; see the file COPYING3. If not see
|
||||
<http://www.gnu.org/licenses/>. */
|
||||
|
||||
#include "config.h"
|
||||
#include "system.h"
|
||||
#include "coretypes.h"
|
||||
#include "tm.h"
|
||||
#include "tree.h"
|
||||
#include "spellcheck.h"
|
||||
|
||||
/* The Levenshtein distance is an "edit-distance": the minimal
|
||||
number of one-character insertions, removals or substitutions
|
||||
that are needed to change one string into another.
|
||||
|
||||
This implementation uses the Wagner-Fischer algorithm. */
|
||||
|
||||
static edit_distance_t
|
||||
levenshtein_distance (const char *s, int len_s,
|
||||
const char *t, int len_t)
|
||||
{
|
||||
const bool debug = false;
|
||||
|
||||
if (debug)
|
||||
{
|
||||
printf ("s: \"%s\" (len_s=%i)\n", s, len_s);
|
||||
printf ("t: \"%s\" (len_t=%i)\n", t, len_t);
|
||||
}
|
||||
|
||||
if (len_s == 0)
|
||||
return len_t;
|
||||
if (len_t == 0)
|
||||
return len_s;
|
||||
|
||||
/* We effectively build a matrix where each (i, j) contains the
|
||||
Levenshtein distance between the prefix strings s[0:j]
|
||||
and t[0:i].
|
||||
Rather than actually build an (len_t + 1) * (len_s + 1) matrix,
|
||||
we simply keep track of the last row, v0 and a new row, v1,
|
||||
which avoids an (len_t + 1) * (len_s + 1) allocation and memory accesses
|
||||
in favor of two (len_s + 1) allocations. These could potentially be
|
||||
statically-allocated if we impose a maximum length on the
|
||||
strings of interest. */
|
||||
edit_distance_t *v0 = new edit_distance_t[len_s + 1];
|
||||
edit_distance_t *v1 = new edit_distance_t[len_s + 1];
|
||||
|
||||
/* The first row is for the case of an empty target string, which
|
||||
we can reach by deleting every character in the source string. */
|
||||
for (int i = 0; i < len_s + 1; i++)
|
||||
v0[i] = i;
|
||||
|
||||
/* Build successive rows. */
|
||||
for (int i = 0; i < len_t; i++)
|
||||
{
|
||||
if (debug)
|
||||
{
|
||||
printf ("i:%i v0 = ", i);
|
||||
for (int j = 0; j < len_s + 1; j++)
|
||||
printf ("%i ", v0[j]);
|
||||
printf ("\n");
|
||||
}
|
||||
|
||||
/* The initial column is for the case of an empty source string; we
|
||||
can reach prefixes of the target string of length i
|
||||
by inserting i characters. */
|
||||
v1[0] = i + 1;
|
||||
|
||||
/* Build the rest of the row by considering neighbours to
|
||||
the north, west and northwest. */
|
||||
for (int j = 0; j < len_s; j++)
|
||||
{
|
||||
edit_distance_t cost = (s[j] == t[i] ? 0 : 1);
|
||||
edit_distance_t deletion = v1[j] + 1;
|
||||
edit_distance_t insertion = v0[j + 1] + 1;
|
||||
edit_distance_t substitution = v0[j] + cost;
|
||||
edit_distance_t cheapest = MIN (deletion, insertion);
|
||||
cheapest = MIN (cheapest, substitution);
|
||||
v1[j + 1] = cheapest;
|
||||
}
|
||||
|
||||
/* Prepare to move on to next row. */
|
||||
for (int j = 0; j < len_s + 1; j++)
|
||||
v0[j] = v1[j];
|
||||
}
|
||||
|
||||
if (debug)
|
||||
{
|
||||
printf ("final v1 = ");
|
||||
for (int j = 0; j < len_s + 1; j++)
|
||||
printf ("%i ", v1[j]);
|
||||
printf ("\n");
|
||||
}
|
||||
|
||||
edit_distance_t result = v1[len_s];
|
||||
delete[] v0;
|
||||
delete[] v1;
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Calculate Levenshtein distance between two nil-terminated strings.
|
||||
This exists purely for the unit tests. */
|
||||
|
||||
edit_distance_t
|
||||
levenshtein_distance (const char *s, const char *t)
|
||||
{
|
||||
return levenshtein_distance (s, strlen (s), t, strlen (t));
|
||||
}
|
||||
|
||||
/* Calculate Levenshtein distance between two identifiers. */
|
||||
|
||||
edit_distance_t
|
||||
levenshtein_distance (tree ident_s, tree ident_t)
|
||||
{
|
||||
gcc_assert (TREE_CODE (ident_s) == IDENTIFIER_NODE);
|
||||
gcc_assert (TREE_CODE (ident_t) == IDENTIFIER_NODE);
|
||||
|
||||
return levenshtein_distance (IDENTIFIER_POINTER (ident_s),
|
||||
IDENTIFIER_LENGTH (ident_s),
|
||||
IDENTIFIER_POINTER (ident_t),
|
||||
IDENTIFIER_LENGTH (ident_t));
|
||||
}
|
|
@ -0,0 +1,32 @@
|
|||
/* Find near-matches for strings and identifiers.
|
||||
Copyright (C) 2015 Free Software Foundation, Inc.
|
||||
|
||||
This file is part of GCC.
|
||||
|
||||
GCC is free software; you can redistribute it and/or modify it under
|
||||
the terms of the GNU General Public License as published by the Free
|
||||
Software Foundation; either version 3, or (at your option) any later
|
||||
version.
|
||||
|
||||
GCC is distributed in the hope that it will be useful, but WITHOUT ANY
|
||||
WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with GCC; see the file COPYING3. If not see
|
||||
<http://www.gnu.org/licenses/>. */
|
||||
|
||||
#ifndef GCC_SPELLCHECK_H
|
||||
#define GCC_SPELLCHECK_H
|
||||
|
||||
typedef unsigned int edit_distance_t;
|
||||
const edit_distance_t MAX_EDIT_DISTANCE = UINT_MAX;
|
||||
|
||||
extern edit_distance_t
|
||||
levenshtein_distance (const char *s, const char *t);
|
||||
|
||||
extern edit_distance_t
|
||||
levenshtein_distance (tree ident_s, tree ident_t);
|
||||
|
||||
#endif /* GCC_SPELLCHECK_H */
|
|
@ -1,3 +1,11 @@
|
|||
2015-11-13 David Malcolm <dmalcolm@redhat.com>
|
||||
|
||||
* gcc.dg/plugin/levenshtein-test-1.c: New file.
|
||||
* gcc.dg/plugin/levenshtein_plugin.c: New file.
|
||||
* gcc.dg/plugin/plugin.exp (plugin_test_list): Add
|
||||
levenshtein_plugin.c.
|
||||
* gcc.dg/spellcheck-fields.c: New file.
|
||||
|
||||
2015-11-12 Steven G. Kargl <kargl@gcc.gnu.org>
|
||||
|
||||
PR fortran/68318
|
||||
|
|
|
@ -0,0 +1,9 @@
|
|||
/* Placeholder C source file for unit-testing gcc/spellcheck.c. */
|
||||
/* { dg-do compile } */
|
||||
/* { dg-options "-O" } */
|
||||
|
||||
int
|
||||
main (int argc, char **argv)
|
||||
{
|
||||
return 0;
|
||||
}
|
|
@ -0,0 +1,64 @@
|
|||
/* Plugin for unittesting gcc/spellcheck.h. */
|
||||
|
||||
#include "config.h"
|
||||
#include "gcc-plugin.h"
|
||||
#include "system.h"
|
||||
#include "coretypes.h"
|
||||
#include "spellcheck.h"
|
||||
#include "diagnostic.h"
|
||||
|
||||
int plugin_is_GPL_compatible;
|
||||
|
||||
static void
|
||||
levenshtein_distance_unit_test_oneway (const char *a, const char *b,
|
||||
edit_distance_t expected)
|
||||
{
|
||||
edit_distance_t actual = levenshtein_distance (a, b);
|
||||
if (actual != expected)
|
||||
error ("levenshtein_distance (\"%s\", \"%s\") : expected: %i got %i",
|
||||
a, b, expected, actual);
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
levenshtein_distance_unit_test (const char *a, const char *b,
|
||||
edit_distance_t expected)
|
||||
{
|
||||
/* Run every test both ways to ensure it's symmetric. */
|
||||
levenshtein_distance_unit_test_oneway (a, b, expected);
|
||||
levenshtein_distance_unit_test_oneway (b, a, expected);
|
||||
}
|
||||
|
||||
/* Callback handler for the PLUGIN_FINISH event; run
|
||||
levenshtein_distance unit tests here. */
|
||||
|
||||
static void
|
||||
on_finish (void */*gcc_data*/, void */*user_data*/)
|
||||
{
|
||||
levenshtein_distance_unit_test ("", "nonempty", strlen ("nonempty"));
|
||||
levenshtein_distance_unit_test ("saturday", "sunday", 3);
|
||||
levenshtein_distance_unit_test ("foo", "m_foo", 2);
|
||||
levenshtein_distance_unit_test ("hello_world", "HelloWorld", 3);
|
||||
levenshtein_distance_unit_test
|
||||
("the quick brown fox jumps over the lazy dog", "dog", 40);
|
||||
levenshtein_distance_unit_test
|
||||
("the quick brown fox jumps over the lazy dog",
|
||||
"the quick brown dog jumps over the lazy fox",
|
||||
4);
|
||||
levenshtein_distance_unit_test
|
||||
("Lorem ipsum dolor sit amet, consectetur adipiscing elit,",
|
||||
"All your base are belong to us",
|
||||
44);
|
||||
}
|
||||
|
||||
int
|
||||
plugin_init (struct plugin_name_args *plugin_info,
|
||||
struct plugin_gcc_version */*version*/)
|
||||
{
|
||||
register_callback (plugin_info->base_name,
|
||||
PLUGIN_FINISH,
|
||||
on_finish,
|
||||
NULL); /* void *user_data */
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -66,6 +66,7 @@ set plugin_test_list [list \
|
|||
{ diagnostic_plugin_test_show_locus.c \
|
||||
diagnostic-test-show-locus-bw.c \
|
||||
diagnostic-test-show-locus-color.c } \
|
||||
{ levenshtein_plugin.c levenshtein-test-1.c } \
|
||||
]
|
||||
|
||||
foreach plugin_test $plugin_test_list {
|
||||
|
|
|
@ -0,0 +1,63 @@
|
|||
/* { dg-do compile } */
|
||||
|
||||
struct foo
|
||||
{
|
||||
int foo;
|
||||
int bar;
|
||||
int baz;
|
||||
};
|
||||
|
||||
int test (struct foo *ptr)
|
||||
{
|
||||
return ptr->m_bar; /* { dg-error "'struct foo' has no member named 'm_bar'; did you mean 'bar'?" } */
|
||||
}
|
||||
|
||||
int test2 (void)
|
||||
{
|
||||
struct foo instance = {0, 0, 0};
|
||||
return instance.m_bar; /* { dg-error "'struct foo' has no member named 'm_bar'; did you mean 'bar'?" } */
|
||||
}
|
||||
|
||||
struct s {
|
||||
struct j { int aa; } kk;
|
||||
int ab;
|
||||
};
|
||||
|
||||
void test3 (struct s x)
|
||||
{
|
||||
x.ac; /* { dg-error "'struct s' has no member named 'ac'; did you mean 'ab'?" } */
|
||||
}
|
||||
|
||||
int test4 (struct foo *ptr)
|
||||
{
|
||||
return sizeof (ptr->foa); /* { dg-error "'struct foo' has no member named 'foa'; did you mean 'foo'?" } */
|
||||
}
|
||||
|
||||
/* Verify that we don't offer nonsensical suggestions. */
|
||||
|
||||
int test5 (struct foo *ptr)
|
||||
{
|
||||
return ptr->this_is_unlike_any_of_the_fields; /* { dg-bogus "did you mean" } */
|
||||
/* { dg-error "has no member named" "" { target *-*-* } 40 } */
|
||||
}
|
||||
|
||||
union u
|
||||
{
|
||||
int color;
|
||||
int shape;
|
||||
};
|
||||
|
||||
int test6 (union u *ptr)
|
||||
{
|
||||
return ptr->colour; /* { dg-error "'union u' has no member named 'colour'; did you mean 'color'?" } */
|
||||
}
|
||||
|
||||
struct has_anon
|
||||
{
|
||||
struct { int color; } s;
|
||||
};
|
||||
|
||||
int test7 (struct has_anon *ptr)
|
||||
{
|
||||
return ptr->s.colour; /* { dg-error "'struct <anonymous>' has no member named 'colour'; did you mean 'color'?" } */
|
||||
}
|
Loading…
Reference in New Issue