3cf2f2377e
The words.sh script in its current form extracts C comments from files, which
it then transforms into a list of words.
To use the script on the documentation (as I did for commit 6b92c0d353
"[gdb/doc] Fix typos"), I needed to disable the "extract C comments" part.
Add an option -c that enables extracting C comments, and is off by default.
gdb/ChangeLog:
2019-11-25 Tom de Vries <tdevries@suse.de>
* contrib/words.sh: Add -c option.
Change-Id: Ifa34d435b3c41b3ff845dc07ae4b0d9f02d92a2d
145 lines
3.1 KiB
Bash
Executable File
145 lines
3.1 KiB
Bash
Executable File
#!/bin/sh

# Copyright (C) 2019 Free Software Foundation, Inc.
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

# This script intends to facilitate spell checking of source/doc files.
# It:
# - transforms the files into a list of lowercase words
# - prefixes each word with the frequency
# - filters out words within a frequency range
# - sorts the words, longest first
#
# If '-c' is passed as option, it operates on the C comments only, rather than
# on the entire file.
#
# For:
# ...
# $ files=$(find gdb -type f -name "*.c" -o -name "*.h")
# $ ./gdb/contrib/words.sh -c $files
# ...
# it generates a list of ~15000 words prefixed with frequency.
#
# This could be used to generate a dictionary that is kept as part of the
# sources, against which new code can be checked, generating a warning or
# error. The hope is that misspellings would trigger this frequently, and rare
# words rarely, otherwise the burden of updating the dictionary would be too
# much.
#
# And for:
# ...
# $ files=$(find gdb -type f -name "*.c" -o -name "*.h")
# $ ./gdb/contrib/words.sh -c -f 1 $files
# ...
# it generates a list of ~5000 words with frequency 1.
#
# This can be used to scan for misspellings manually.
#

# Print MESSAGE ($1) followed by a one-line usage summary on stderr.
usage_error () {
    printf '%s: %s\n' "${0##*/}" "$1" >&2
    printf 'Usage: %s [-c] [-f|--freq N] [--min N] [--max N] [FILE...]\n' \
        "${0##*/}" >&2
}

# parse_options ARG...
#
# Parse the leading options of ARG... and set these globals:
#   c        "true" if -c was given, "false" otherwise
#   minfreq  lower frequency bound (0 means no lower bound)
#   maxfreq  upper frequency bound (0 means no upper bound)
#   nopts    number of leading arguments consumed as options
#
# Parsing stops at the first argument that is not a known option.
# Returns 0 on success.  Returns 2, after printing a diagnostic, when an
# option is missing its value or the value is not a non-negative integer.
parse_options () {
    minfreq=
    maxfreq=
    c=false
    nopts=0

    while [ $# -gt 0 ]; do
        case "$1" in
            -c)
                c=true
                shift
                nopts=$((nopts + 1))
                ;;
            --freq|-f|--min|--max)
                # Without this check 'shift 2' fails when the option is the
                # last argument: bash then never consumes it and loops
                # forever, other shells abort with an unhelpful message.
                if [ $# -lt 2 ]; then
                    usage_error "option '$1' requires an argument"
                    return 2
                fi
                # The bounds are used as numbers by the frequency filter, so
                # reject anything that is not a plain decimal number.
                case "$2" in
                    ''|*[!0-9]*)
                        usage_error \
                            "option '$1' expects a non-negative integer, got '$2'"
                        return 2
                        ;;
                esac
                case "$1" in
                    --min)
                        minfreq=$2
                        # Only a lower bound so far: leave the top open.
                        if [ "$maxfreq" = "" ]; then
                            maxfreq=0
                        fi
                        ;;
                    --max)
                        maxfreq=$2
                        # Only an upper bound so far: leave the bottom open.
                        if [ "$minfreq" = "" ]; then
                            minfreq=0
                        fi
                        ;;
                    *)
                        # -f / --freq: select exactly this frequency.
                        minfreq=$2
                        maxfreq=$2
                        ;;
                esac
                shift 2
                nopts=$((nopts + 2))
                ;;
            *)
                break
                ;;
        esac
    done

    # No frequency option at all: do not filter.
    if [ "$minfreq" = "" ] && [ "$maxfreq" = "" ]; then
        minfreq=0
        maxfreq=0
    fi
}

parse_options "$@" || exit 2
# Drop the parsed options so that "$@" holds only the file names.
shift "$nopts"

# Write the awk program used for '-c' (C comment extraction) to a temporary
# file; the file is removed again when the script exits.
awkfile=$(mktemp) || {
    echo "${0##*/}: cannot create temporary file" >&2
    exit 1
}
trap 'rm -f "$awkfile"' EXIT

# The here-document delimiter is quoted, so the awk text is written verbatim
# without any shell expansion.
cat > "$awkfile" <<'EOF'
# Print the text found inside /* ... */ comments.
#
# Each input line is scanned left to right, so several comments on one line
# are all collected, and a line that closes one comment and opens another
# leaves the scanner in the correct state for the following lines.
# One output line is printed for every input line that lies in, or contains,
# a comment.  String literals and "//" comments are not recognized.
BEGIN {
    in_comment = 0
}

{
    rest = $0
    text = ""
    # A line that starts inside a comment is printed even if it is empty.
    touched = in_comment

    while (length(rest) > 0) {
        if (in_comment) {
            pos = index(rest, "*/")
            if (pos == 0) {
                # The comment continues on the next line.
                text = text rest
                rest = ""
            } else {
                text = text substr(rest, 1, pos - 1)
                rest = substr(rest, pos + 2)
                in_comment = 0
            }
        } else {
            pos = index(rest, "/*")
            if (pos == 0) {
                # Only code is left on this line.
                rest = ""
            } else {
                # Keep the words of separate comments apart.
                if (text != "") {
                    text = text " "
                }
                rest = substr(rest, pos + 2)
                in_comment = 1
                touched = 1
            }
        }
    }

    if (touched) {
        print text
    }
}
EOF

# Stabilize sort.
export LC_ALL=C

# Read text on stdin and write one lowercase word per line.
# Punctuation, whitespace, brackets, quotes and digit runs are all treated as
# word separators.  Separators that follow each other leave empty lines
# behind; these are removed so that no empty "word" gets counted.
# NOTE: relies on sed accepting "\n" in the replacement text and "\t" inside a
# bracket expression, which GNU sed does.
split_words () {
    sed \
        -e 's/[!"?;:%^$~#{}`&=@,. \t\/_()|<>\+\*-]/\n/g' \
        -e 's/\[/\n/g' \
        -e 's/\]/\n/g' \
        -e "s/'/\n/g" \
        -e 's/[0-9][0-9]*/\n/g' \
        -e 's/[ \t]*//g' \
        | sed -e '/^$/d' \
        | tr '[:upper:]' '[:lower:]'
}

# filter_by_frequency MIN MAX
# Read "COUNT WORD" lines on stdin and keep those with MIN <= COUNT <= MAX.
# A bound of 0 disables that side of the range.  The bounds are handed to awk
# as variables, so they are never interpreted as awk program text.
filter_by_frequency () {
    awk -v min="$1" -v max="$2" '
        {
            if ((min + 0 == 0 || min + 0 <= $1 + 0) \
                && (max + 0 == 0 || $1 + 0 <= max + 0)) {
                print $0
            }
        }'
}

# Sort the lines on stdin by length, longest first.  Each line is prefixed
# with its length, sorted numerically on that prefix, and the prefix is cut
# off again.
sort_longest_first () {
    awk '{ print length($0) " " $0; }' \
        | sort -n -r \
        | cut -d ' ' -f 2-
}

# With -c only the C comments of the files are used, otherwise the whole
# contents.
if $c; then
    awk \
        -f "$awkfile" \
        -- "$@"
else
    cat "$@"
fi \
    | split_words \
    | sort \
    | uniq -c \
    | filter_by_frequency "$minfreq" "$maxfreq" \
    | sort_longest_first