2015-11-13 02:59:03 +01:00
|
|
|
/* Find near-matches for strings.
|
2016-01-04 15:30:50 +01:00
|
|
|
Copyright (C) 2015-2016 Free Software Foundation, Inc.
|
2015-11-13 02:54:41 +01:00
|
|
|
|
|
|
|
This file is part of GCC.
|
|
|
|
|
|
|
|
GCC is free software; you can redistribute it and/or modify it under
|
|
|
|
the terms of the GNU General Public License as published by the Free
|
|
|
|
Software Foundation; either version 3, or (at your option) any later
|
|
|
|
version.
|
|
|
|
|
|
|
|
GCC is distributed in the hope that it will be useful, but WITHOUT ANY
|
|
|
|
WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
|
|
|
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
|
|
|
for more details.
|
|
|
|
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
|
|
along with GCC; see the file COPYING3. If not see
|
|
|
|
<http://www.gnu.org/licenses/>. */
|
|
|
|
|
|
|
|
#include "config.h"
|
|
|
|
#include "system.h"
|
|
|
|
#include "coretypes.h"
|
|
|
|
#include "tm.h"
|
|
|
|
#include "tree.h"
|
|
|
|
#include "spellcheck.h"
|
|
|
|
|
|
|
|
/* The Levenshtein distance is an "edit-distance": the minimal
|
|
|
|
number of one-character insertions, removals or substitutions
|
|
|
|
that are needed to change one string into another.
|
|
|
|
|
|
|
|
This implementation uses the Wagner-Fischer algorithm. */
|
|
|
|
|
2015-11-13 02:59:03 +01:00
|
|
|
edit_distance_t
|
2015-11-13 02:54:41 +01:00
|
|
|
levenshtein_distance (const char *s, int len_s,
|
|
|
|
const char *t, int len_t)
|
|
|
|
{
|
|
|
|
const bool debug = false;
|
|
|
|
|
|
|
|
if (debug)
|
|
|
|
{
|
|
|
|
printf ("s: \"%s\" (len_s=%i)\n", s, len_s);
|
|
|
|
printf ("t: \"%s\" (len_t=%i)\n", t, len_t);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (len_s == 0)
|
|
|
|
return len_t;
|
|
|
|
if (len_t == 0)
|
|
|
|
return len_s;
|
|
|
|
|
|
|
|
/* We effectively build a matrix where each (i, j) contains the
|
|
|
|
Levenshtein distance between the prefix strings s[0:j]
|
|
|
|
and t[0:i].
|
|
|
|
Rather than actually build an (len_t + 1) * (len_s + 1) matrix,
|
|
|
|
we simply keep track of the last row, v0 and a new row, v1,
|
|
|
|
which avoids an (len_t + 1) * (len_s + 1) allocation and memory accesses
|
|
|
|
in favor of two (len_s + 1) allocations. These could potentially be
|
|
|
|
statically-allocated if we impose a maximum length on the
|
|
|
|
strings of interest. */
|
|
|
|
edit_distance_t *v0 = new edit_distance_t[len_s + 1];
|
|
|
|
edit_distance_t *v1 = new edit_distance_t[len_s + 1];
|
|
|
|
|
|
|
|
/* The first row is for the case of an empty target string, which
|
|
|
|
we can reach by deleting every character in the source string. */
|
|
|
|
for (int i = 0; i < len_s + 1; i++)
|
|
|
|
v0[i] = i;
|
|
|
|
|
|
|
|
/* Build successive rows. */
|
|
|
|
for (int i = 0; i < len_t; i++)
|
|
|
|
{
|
|
|
|
if (debug)
|
|
|
|
{
|
|
|
|
printf ("i:%i v0 = ", i);
|
|
|
|
for (int j = 0; j < len_s + 1; j++)
|
|
|
|
printf ("%i ", v0[j]);
|
|
|
|
printf ("\n");
|
|
|
|
}
|
|
|
|
|
|
|
|
/* The initial column is for the case of an empty source string; we
|
|
|
|
can reach prefixes of the target string of length i
|
|
|
|
by inserting i characters. */
|
|
|
|
v1[0] = i + 1;
|
|
|
|
|
|
|
|
/* Build the rest of the row by considering neighbours to
|
|
|
|
the north, west and northwest. */
|
|
|
|
for (int j = 0; j < len_s; j++)
|
|
|
|
{
|
|
|
|
edit_distance_t cost = (s[j] == t[i] ? 0 : 1);
|
|
|
|
edit_distance_t deletion = v1[j] + 1;
|
|
|
|
edit_distance_t insertion = v0[j + 1] + 1;
|
|
|
|
edit_distance_t substitution = v0[j] + cost;
|
|
|
|
edit_distance_t cheapest = MIN (deletion, insertion);
|
|
|
|
cheapest = MIN (cheapest, substitution);
|
|
|
|
v1[j + 1] = cheapest;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Prepare to move on to next row. */
|
|
|
|
for (int j = 0; j < len_s + 1; j++)
|
|
|
|
v0[j] = v1[j];
|
|
|
|
}
|
|
|
|
|
|
|
|
if (debug)
|
|
|
|
{
|
|
|
|
printf ("final v1 = ");
|
|
|
|
for (int j = 0; j < len_s + 1; j++)
|
|
|
|
printf ("%i ", v1[j]);
|
|
|
|
printf ("\n");
|
|
|
|
}
|
|
|
|
|
|
|
|
edit_distance_t result = v1[len_s];
|
|
|
|
delete[] v0;
|
|
|
|
delete[] v1;
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
2015-11-13 02:59:03 +01:00
|
|
|
/* Calculate Levenshtein distance between two nil-terminated strings. */
|
2015-11-13 02:54:41 +01:00
|
|
|
|
|
|
|
edit_distance_t
|
|
|
|
levenshtein_distance (const char *s, const char *t)
|
|
|
|
{
|
|
|
|
return levenshtein_distance (s, strlen (s), t, strlen (t));
|
|
|
|
}
|