From 6a3f203c3cc8e0f0757f7ed038b3cb34063936ba Mon Sep 17 00:00:00 2001 From: David Malcolm Date: Wed, 15 Jun 2016 03:29:39 +0000 Subject: [PATCH] spellcheck.h: add best_match template; implement early-reject gcc/c/ChangeLog: * c-typeck.c: Include spellcheck-tree.h rather than spellcheck.h. gcc/cp/ChangeLog: * search.c: Include spellcheck-tree.h rather than spellcheck.h. gcc/ChangeLog: * spellcheck-tree.c: Include spellcheck-tree.h rather than spellcheck.h. (find_closest_identifier): Reimplement in terms of best_match. * spellcheck-tree.h: New file. * spellcheck.c (struct edit_distance_traits): New struct. (find_closest_string): Reimplement in terms of best_match. * spellcheck.h (levenshtein_distance): Move prototype of tree-based overload to spellcheck-tree.h. (find_closest_identifier): Likewise. (struct edit_distance_traits): New template. (class best_match): New class. From-SVN: r237471 --- gcc/ChangeLog | 17 +++++++ gcc/c/ChangeLog | 4 ++ gcc/c/c-typeck.c | 2 +- gcc/cp/ChangeLog | 4 ++ gcc/cp/search.c | 2 +- gcc/spellcheck-tree.c | 24 ++------- gcc/spellcheck-tree.h | 51 ++++++++++++++++++++ gcc/spellcheck.c | 42 ++++++++-------- gcc/spellcheck.h | 110 ++++++++++++++++++++++++++++++++++++++++-- 9 files changed, 208 insertions(+), 48 deletions(-) create mode 100644 gcc/spellcheck-tree.h diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 9bf0db86a77..907bb06bee9 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,20 @@ +2016-06-14 David Malcolm + + * spellcheck-tree.c: Include spellcheck-tree.h rather than + spellcheck.h. + (find_closest_identifier): Reimplement in terms of + best_match. + * spellcheck-tree.h: New file. + * spellcheck.c (struct edit_distance_traits): New + struct. + (find_closest_string): Reimplement in terms of + best_match. + * spellcheck.h (levenshtein_distance): Move prototype of tree-based + overload to spellcheck-tree.h. + (find_closest_identifier): Likewise. + (struct edit_distance_traits): New template. + (class best_match): New class. + 2016-06-14 David Malcolm * selftest-run-tests.c (selftest::run_tests): Call diff --git a/gcc/c/ChangeLog b/gcc/c/ChangeLog index 85fe5e3562f..e5d84bb308e 100644 --- a/gcc/c/ChangeLog +++ b/gcc/c/ChangeLog @@ -1,3 +1,7 @@ +2016-06-14 David Malcolm + + * c-typeck.c: Include spellcheck-tree.h rather than spellcheck.h. + 2016-06-14 David Malcolm * c-typeck.c (build_component_ref): Simplify fixit code by diff --git a/gcc/c/c-typeck.c b/gcc/c/c-typeck.c index f9875087158..f03c07bb7f1 100644 --- a/gcc/c/c-typeck.c +++ b/gcc/c/c-typeck.c @@ -47,7 +47,7 @@ along with GCC; see the file COPYING3. If not see #include "c-family/c-ubsan.h" #include "cilk.h" #include "gomp-constants.h" -#include "spellcheck.h" +#include "spellcheck-tree.h" #include "gcc-rich-location.h" /* Possible cases of implicit bad conversions. Used to select diff --git a/gcc/cp/ChangeLog b/gcc/cp/ChangeLog index 4e4a662faf8..775b61482d4 100644 --- a/gcc/cp/ChangeLog +++ b/gcc/cp/ChangeLog @@ -1,3 +1,7 @@ +2016-06-14 David Malcolm + + * search.c: Include spellcheck-tree.h rather than spellcheck.h. + 2016-06-14 David Malcolm * typeck.c: Include "gcc-rich-location.h". diff --git a/gcc/cp/search.c b/gcc/cp/search.c index f47833f0686..990c3fe748d 100644 --- a/gcc/cp/search.c +++ b/gcc/cp/search.c @@ -27,7 +27,7 @@ along with GCC; see the file COPYING3. If not see #include "cp-tree.h" #include "intl.h" #include "toplev.h" -#include "spellcheck.h" +#include "spellcheck-tree.h" static int is_subobject_of_p (tree, tree); static tree dfs_lookup_base (tree, void *); diff --git a/gcc/spellcheck-tree.c b/gcc/spellcheck-tree.c index 2d73b7740ba..63fb1a878ac 100644 --- a/gcc/spellcheck-tree.c +++ b/gcc/spellcheck-tree.c @@ -22,7 +22,7 @@ along with GCC; see the file COPYING3. If not see #include "coretypes.h" #include "tm.h" #include "tree.h" -#include "spellcheck.h" +#include "spellcheck-tree.h" #include "selftest.h" #include "stringpool.h" @@ -53,32 +53,16 @@ find_closest_identifier (tree target, const auto_vec *candidates) { gcc_assert (TREE_CODE (target) == IDENTIFIER_NODE); + best_match bm (target); int i; tree identifier; - tree best_identifier = NULL_TREE; - edit_distance_t best_distance = MAX_EDIT_DISTANCE; FOR_EACH_VEC_ELT (*candidates, i, identifier) { gcc_assert (TREE_CODE (identifier) == IDENTIFIER_NODE); - edit_distance_t dist = levenshtein_distance (target, identifier); - if (dist < best_distance) - { - best_distance = dist; - best_identifier = identifier; - } + bm.consider (identifier); } - /* If more than half of the letters were misspelled, the suggestion is - likely to be meaningless. */ - if (best_identifier) - { - unsigned int cutoff = MAX (IDENTIFIER_LENGTH (target), - IDENTIFIER_LENGTH (best_identifier)) / 2; - if (best_distance > cutoff) - return NULL_TREE; - } - - return best_identifier; + return bm.get_best_meaningful_candidate (); } #if CHECKING_P diff --git a/gcc/spellcheck-tree.h b/gcc/spellcheck-tree.h new file mode 100644 index 00000000000..0d5e253562f --- /dev/null +++ b/gcc/spellcheck-tree.h @@ -0,0 +1,51 @@ +/* Find near-matches for identifiers. + Copyright (C) 2015-2016 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 3, or (at your option) any later +version. + +GCC is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +#ifndef GCC_SPELLCHECK_TREE_H +#define GCC_SPELLCHECK_TREE_H + +#include "spellcheck.h" + +/* spellcheck-tree.c */ + +extern edit_distance_t +levenshtein_distance (tree ident_s, tree ident_t); + +extern tree +find_closest_identifier (tree target, const auto_vec *candidates); + +/* Specialization of edit_distance_traits for identifiers. */ + +template <> +struct edit_distance_traits +{ + static size_t get_length (tree id) + { + gcc_assert (TREE_CODE (id) == IDENTIFIER_NODE); + return IDENTIFIER_LENGTH (id); + } + + static const char *get_string (tree id) + { + gcc_assert (TREE_CODE (id) == IDENTIFIER_NODE); + return IDENTIFIER_POINTER (id); + } +}; + +#endif /* GCC_SPELLCHECK_TREE_H */ diff --git a/gcc/spellcheck.c b/gcc/spellcheck.c index e03f484d621..2648f3a0bbb 100644 --- a/gcc/spellcheck.c +++ b/gcc/spellcheck.c @@ -121,6 +121,24 @@ levenshtein_distance (const char *s, const char *t) return levenshtein_distance (s, strlen (s), t, strlen (t)); } +/* Specialization of edit_distance_traits for C-style strings. */ + +template <> +struct edit_distance_traits +{ + static size_t get_length (const char *str) + { + gcc_assert (str); + return strlen (str); + } + + static const char *get_string (const char *str) + { + gcc_assert (str); + return str; + } +}; + /* Given TARGET, a non-NULL string, and CANDIDATES, a non-NULL ptr to an autovec of non-NULL strings, determine which element within CANDIDATES has the lowest edit distance to TARGET. If there are @@ -139,32 +157,14 @@ find_closest_string (const char *target, int i; const char *candidate; - const char *best_candidate = NULL; - edit_distance_t best_distance = MAX_EDIT_DISTANCE; - size_t len_target = strlen (target); + best_match bm (target); FOR_EACH_VEC_ELT (*candidates, i, candidate) { gcc_assert (candidate); - edit_distance_t dist - = levenshtein_distance (target, len_target, - candidate, strlen (candidate)); - if (dist < best_distance) - { - best_distance = dist; - best_candidate = candidate; - } + bm.consider (candidate); } - /* If more than half of the letters were misspelled, the suggestion is - likely to be meaningless. */ - if (best_candidate) - { - unsigned int cutoff = MAX (len_target, strlen (best_candidate)) / 2; - if (best_distance > cutoff) - return NULL; - } - - return best_candidate; + return bm.get_best_meaningful_candidate (); } #if CHECKING_P diff --git a/gcc/spellcheck.h b/gcc/spellcheck.h index 040c33ef571..7379399f1fd 100644 --- a/gcc/spellcheck.h +++ b/gcc/spellcheck.h @@ -35,12 +35,112 @@ extern const char * find_closest_string (const char *target, const auto_vec *candidates); -/* spellcheck-tree.c */ +/* A traits class for describing a string-like type usable by + class best_match. + Specializations should provide the implementations of the following: -extern edit_distance_t -levenshtein_distance (tree ident_s, tree ident_t); + static size_t get_length (TYPE); + static const char *get_string (TYPE); -extern tree -find_closest_identifier (tree target, const auto_vec *candidates); + get_string should return a non-NULL ptr, which does not need to be + 0-terminated. */ + +template +struct edit_distance_traits {}; + +/* A type for use when determining the best match against a string, + expressed as a template so that we can match against various + string-like types (const char *, frontend identifiers, and preprocessor + macros). + + This type accumulates the best possible match against GOAL_TYPE for + a sequence of elements of CANDIDATE_TYPE, whilst minimizing the + number of calls to levenshtein_distance and to + edit_distance_traits::get_length. */ + +template +class best_match +{ + public: + typedef GOAL_TYPE goal_t; + typedef CANDIDATE_TYPE candidate_t; + typedef edit_distance_traits goal_traits; + typedef edit_distance_traits candidate_traits; + + /* Constructor. */ + + best_match (goal_t goal) + : m_goal (goal_traits::get_string (goal)), + m_goal_len (goal_traits::get_length (goal)), + m_best_candidate (NULL), + m_best_distance (MAX_EDIT_DISTANCE) + {} + + /* Compare the edit distance between CANDIDATE and m_goal, + and if it's the best so far, record it. */ + + void consider (candidate_t candidate) + { + size_t candidate_len = candidate_traits::get_length (candidate); + + /* Calculate a lower bound on the candidate's distance to the goal, + based on the difference in lengths; it will require at least + this many insertions/deletions. */ + edit_distance_t min_candidate_distance + = abs ((ssize_t)candidate_len - (ssize_t)m_goal_len); + + /* If the candidate's length is sufficiently different to that + of the goal string, then the number of insertions/deletions + may be >= the best distance so far. If so, we can reject + the candidate immediately without needing to compute + the exact distance, since it won't be an improvement. */ + if (min_candidate_distance >= m_best_distance) + return; + + /* If the candidate will be unable to beat the criterion in + get_best_meaningful_candidate, reject it without computing + the exact distance. */ + unsigned int cutoff = MAX (m_goal_len, candidate_len) / 2; + if (min_candidate_distance > cutoff) + return; + + /* Otherwise, compute the distance and see if the candidate + has beaten the previous best value. */ + edit_distance_t dist + = levenshtein_distance (m_goal, m_goal_len, + candidate_traits::get_string (candidate), + candidate_len); + if (dist < m_best_distance) + { + m_best_distance = dist; + m_best_candidate = candidate; + m_best_candidate_len = candidate_len; + } + } + + /* Get the best candidate so far, but applying a filter to ensure + that we return NULL if none of the candidates are close to the goal, + to avoid offering nonsensical suggestions to the user. */ + + candidate_t get_best_meaningful_candidate () const + { + /* If more than half of the letters were misspelled, the suggestion is + likely to be meaningless. */ + if (m_best_candidate) + { + unsigned int cutoff = MAX (m_goal_len, m_best_candidate_len) / 2; + if (m_best_distance > cutoff) + return NULL; + } + return m_best_candidate; + } + + private: + const char *m_goal; + size_t m_goal_len; + candidate_t m_best_candidate; + edit_distance_t m_best_distance; + size_t m_best_candidate_len; +}; #endif /* GCC_SPELLCHECK_H */