gcc/libstdc++-v3/include/bits/regex_scanner.tcc
Jonathan Wakely 260a5334ee libstdc++: Improve std::regex_error::what() strings
This replaces the vague "regex_error" for std::regex_error::what() with
a string that corresponds to the error_type enum passed to the
constructor. This allows us to remove many of the strings passed to
__throw_regex_error, because the default string is at least as good.
When a string argument to __throw_regex_error is kept it should add some
context-specific detail absent from the default string.

Also remove full stops (periods) from the end of those strings, to make
it easier to include them in logs and other output. I've left them
starting with an upper-case letter, which is consistent with strerror
output for (at least) Glibc, Solaris and BSD. I'm ambivalent whether
that's the right choice.

This also adds the missing noreturn attribute to __throw_regex_error.

libstdc++-v3/ChangeLog:

	* include/bits/regex_compiler.tcc: Adjust all calls to
	__throw_regex_error.
	* include/bits/regex_error.h (__throw_regex_error): Add noreturn
	attribute.
	* include/bits/regex_scanner.tcc: Likewise.
	* src/c++11/regex.cc (desc): New helper function.
	(regex_error::regex_error(error_type)): Use desc to get a string
	corresponding to the error code.
2022-01-05 13:47:00 +00:00

585 lines
14 KiB
C++

// class template regex -*- C++ -*-
// Copyright (C) 2013-2022 Free Software Foundation, Inc.
//
// This file is part of the GNU ISO C++ Library. This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// Under Section 7 of GPL version 3, you are granted additional
// permissions described in the GCC Runtime Library Exception, version
// 3.1, as published by the Free Software Foundation.
// You should have received a copy of the GNU General Public License and
// a copy of the GCC Runtime Library Exception along with this program;
// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
// <http://www.gnu.org/licenses/>.
/**
* @file bits/regex_scanner.tcc
* This is an internal header file, included by other library headers.
* Do not attempt to use it directly. @headername{regex}
*/
// FIXME make comments doxygen format.
// N3376 specified 6 regex styles: ECMAScript, basic, extended, grep, egrep
// and awk
// 1) grep is basic except '\n' is treated as '|'
// 2) egrep is extended except '\n' is treated as '|'
// 3) awk is extended except that it has special escaping rules and no
// back-references.
//
// References:
//
// ECMAScript: ECMA-262 15.10
//
// basic, extended:
// http://pubs.opengroup.org/onlinepubs/009695399/basedefs/xbd_chap09.html
//
// awk: http://pubs.opengroup.org/onlinepubs/000095399/utilities/awk.html
namespace std _GLIBCXX_VISIBILITY(default)
{
_GLIBCXX_BEGIN_NAMESPACE_VERSION
namespace __detail
{
// Build a scanner over the character range [__begin, __end).
//
// __flags is forwarded to _ScannerBase, __loc supplies the ctype facet
// used for character classification, and the escape handler is chosen
// once here: the ECMAScript handler when _M_is_ecma() is true, the POSIX
// handler otherwise.
template<typename _CharT>
  _Scanner<_CharT>::
  _Scanner(const _CharT* __begin, const _CharT* __end,
           _FlagT __flags, std::locale __loc)
  : _ScannerBase(__flags),
    _M_current(__begin),
    _M_end(__end),
    _M_ctype(std::use_facet<_CtypeT>(__loc)),
    _M_eat_escape(!_M_is_ecma()
                  ? &_Scanner::_M_eat_escape_posix
                  : &_Scanner::_M_eat_escape_ecma)
  {
    // Scan the first token straight away so that _M_token is set as soon
    // as construction completes.
    _M_advance();
  }
// Scan the next token from the input.
//
// At the end of the input the token becomes _S_token_eof; otherwise the
// work is delegated to the scanning routine that matches the current
// scanner state.
template<typename _CharT>
  void
  _Scanner<_CharT>::
  _M_advance()
  {
    if (_M_current == _M_end)
      {
        _M_token = _S_token_eof;
        return;
      }

    switch (_M_state)
      {
      case _S_state_normal:
        _M_scan_normal();
        break;
      case _S_state_in_bracket:
        _M_scan_in_bracket();
        break;
      case _S_state_in_brace:
        _M_scan_in_brace();
        break;
      default:
        __glibcxx_assert(!"unexpected state while processing regex");
      }
  }
// Differences between styles:
// 1) "\(", "\)", "\{" in basic: these are operators, not escapes.
// 2) "(?:", "(?=", "(?!" in ECMAScript.
// Scan a single token in the normal state, i.e. outside of any bracket
// expression or interval.
//
// The function assumes that input remains (_M_advance checks this before
// dispatching here).  It consumes at least one character, stores the
// token kind in _M_token and, for tokens that carry text, the text in
// _M_value.  On '[' or '{' it switches _M_state to _S_state_in_bracket
// or _S_state_in_brace respectively.
//
// Calls __throw_regex_error with error_escape for a backslash at the end
// of the pattern, with error_paren for a malformed "(?" group, and with
// _S_null for a NUL character when the grammar is not ECMAScript.  The
// escape handler invoked through _M_eat_escape may report errors too.
template<typename _CharT>
  void
  _Scanner<_CharT>::
  _M_scan_normal()
  {
    auto __c = *_M_current++;

    // Characters outside the special set are plain literals.  Note that
    // strchr also matches the terminating NUL of _M_spec_char, which is
    // how a NUL character reaches its dedicated branch further down.
    if (__builtin_strchr(_M_spec_char, _M_ctype.narrow(__c, ' ')) == nullptr)
      {
        _M_token = _S_token_ord_char;
        _M_value.assign(1, __c);
        return;
      }

    if (__c == '\\')
      {
        if (_M_current == _M_end)
          __throw_regex_error(
            regex_constants::error_escape,
            "Invalid escape at end of regular expression");

        // In the basic grammars "\(", "\)" and "\{" are operators rather
        // than escapes: drop the backslash and let the code below handle
        // the following character.  Every other escape is passed to the
        // grammar-specific escape handler.
        if (!_M_is_basic()
            || (*_M_current != '('
                && *_M_current != ')'
                && *_M_current != '{'))
          {
            (this->*_M_eat_escape)();
            return;
          }
        __c = *_M_current++;
      }

    if (__c == '(')
      {
        // Check for the end of the input before peeking at the next
        // character: a pattern that ends in '(' must not read past _M_end.
        if (_M_is_ecma() && _M_current != _M_end && *_M_current == '?')
          {
            if (++_M_current == _M_end)
              __throw_regex_error(regex_constants::error_paren);

            if (*_M_current == ':')
              {
                ++_M_current;
                _M_token = _S_token_subexpr_no_group_begin;
              }
            else if (*_M_current == '=')
              {
                // "(?=" is reported as a lookahead with value 'p'.
                ++_M_current;
                _M_token = _S_token_subexpr_lookahead_begin;
                _M_value.assign(1, 'p');
              }
            else if (*_M_current == '!')
              {
                // "(?!" is reported as a lookahead with value 'n'.
                ++_M_current;
                _M_token = _S_token_subexpr_lookahead_begin;
                _M_value.assign(1, 'n');
              }
            else
              __throw_regex_error(regex_constants::error_paren,
                                  "Invalid '(?...)' zero-width assertion "
                                  "in regular expression");
          }
        else if (_M_flags & regex_constants::nosubs)
          _M_token = _S_token_subexpr_no_group_begin;
        else
          _M_token = _S_token_subexpr_begin;
      }
    else if (__c == ')')
      _M_token = _S_token_subexpr_end;
    else if (__c == '[')
      {
        _M_state = _S_state_in_bracket;
        _M_at_bracket_start = true;
        if (_M_current != _M_end && *_M_current == '^')
          {
            _M_token = _S_token_bracket_neg_begin;
            ++_M_current;
          }
        else
          _M_token = _S_token_bracket_begin;
      }
    else if (__c == '{')
      {
        _M_state = _S_state_in_brace;
        _M_token = _S_token_interval_begin;
      }
    else if (__builtin_expect(__c == _CharT(0), false))
      {
        // A NUL character is an ordinary character in ECMAScript only.
        if (!_M_is_ecma())
          __throw_regex_error(regex_constants::_S_null);
        _M_token = _S_token_ord_char;
        _M_value.assign(1, __c);
      }
    else if (__c != ']' && __c != '}')
      {
        // Any remaining special character is looked up in the token
        // table, whose last entry has '\0' as its key.
        auto __it = _M_token_tbl;
        auto __narrowc = _M_ctype.narrow(__c, '\0');
        for (; __it->first != '\0'; ++__it)
          if (__it->first == __narrowc)
            {
              _M_token = __it->second;
              return;
            }
        __glibcxx_assert(!"unexpected special character in regex");
      }
    else
      {
        // A stray ']' or '}' in the normal state is a literal.
        _M_token = _S_token_ord_char;
        _M_value.assign(1, __c);
      }
  }
// Differences between styles:
// 1) different semantics of "[]" and "[^]".
// 2) Escaping in bracket expr.
// Scan a single token while inside a bracket expression.
//
// Stores the token kind in _M_token (and its text in _M_value where the
// token has one), returns to the normal state on the closing ']', and
// always clears _M_at_bracket_start before returning normally.
// Calls __throw_regex_error with error_brack when the input ends inside
// the bracket expression; _M_eat_class and the escape handler may report
// further errors.
template<typename _CharT>
  void
  _Scanner<_CharT>::
  _M_scan_in_bracket()
  {
    if (_M_current == _M_end)
      __throw_regex_error(regex_constants::error_brack);

    const auto __ch = *_M_current++;

    if (__ch == '-')
      _M_token = _S_token_bracket_dash;
    else if (__ch == '[')
      {
        if (_M_current == _M_end)
          __throw_regex_error(regex_constants::error_brack,
                              "Incomplete '[[' character class in "
                              "regular expression");

        // "[.", "[:" and "[=" open a collating symbol, a character class
        // name and an equivalence class respectively; any other character
        // makes the '[' a literal.
        const auto __delim = *_M_current;
        const bool __opens_class
          = __delim == '.' || __delim == ':' || __delim == '=';
        if (!__opens_class)
          {
            _M_token = _S_token_ord_char;
            _M_value.assign(1, __ch);
          }
        else
          {
            if (__delim == '.')
              _M_token = _S_token_collsymbol;
            else if (__delim == ':')
              _M_token = _S_token_char_class_name;
            else
              _M_token = _S_token_equiv_class_name;
            ++_M_current;
            _M_eat_class(__delim);
          }
      }
    // In POSIX, a ']' directly after "[" or "[^" is taken literally, so
    // "[]]" and "[^]]" are valid regexes.  See the testcases
    // `.../empty_range.cc`.
    else if (__ch == ']' && (_M_is_ecma() || !_M_at_bracket_start))
      {
        _M_state = _S_state_normal;
        _M_token = _S_token_bracket_end;
      }
    // Only ECMAScript and awk permit escapes inside a bracket expression.
    else if (__ch == '\\' && (_M_is_ecma() || _M_is_awk()))
      (this->*_M_eat_escape)();
    else
      {
        _M_token = _S_token_ord_char;
        _M_value.assign(1, __ch);
      }

    _M_at_bracket_start = false;
  }
// Differences between styles:
// 1) "\}" in basic style.
// Scan a single token while inside an interval ("{m,n}").
//
// Produces _S_token_dup_count for a run of digits (the digits are stored
// in _M_value), _S_token_comma for ',', and _S_token_interval_end for
// the closing delimiter, which also switches back to the normal state.
// Calls __throw_regex_error with error_brace when the input ends inside
// the interval and with error_badbrace for any other character.
template<typename _CharT>
  void
  _Scanner<_CharT>::
  _M_scan_in_brace()
  {
    if (_M_current == _M_end)
      __throw_regex_error(regex_constants::error_brace);

    const auto __ch = *_M_current++;

    if (_M_ctype.is(_CtypeT::digit, __ch))
      {
        _M_token = _S_token_dup_count;
        _M_value.assign(1, __ch);
        while (_M_current != _M_end
               && _M_ctype.is(_CtypeT::digit, *_M_current))
          _M_value += *_M_current++;
        return;
      }

    if (__ch == ',')
      {
        _M_token = _S_token_comma;
        return;
      }

    // Whatever is left has to close the interval: "\}" in the basic
    // grammars, a plain '}' in all the others.
    bool __closes_interval;
    if (_M_is_basic())
      {
        __closes_interval
          = __ch == '\\' && _M_current != _M_end && *_M_current == '}';
        if (__closes_interval)
          ++_M_current;  // skip the '}' that follows the backslash
      }
    else
      __closes_interval = __ch == '}';

    if (!__closes_interval)
      __throw_regex_error(regex_constants::error_badbrace);

    _M_state = _S_state_normal;
    _M_token = _S_token_interval_end;
  }
// Scan an escape sequence using the ECMAScript rules.
//
// On entry _M_current refers to the character after the backslash.  The
// function sets _M_token and _M_value according to the escape:
//   - characters found by _M_find_escape become an ordinary character
//     holding the mapped value; 'b' is treated this way only inside a
//     bracket expression,
//   - "\b" and "\B" outside brackets give _S_token_word_bound with value
//     'p' and 'n' respectively,
//   - "\d", "\D", "\s", "\S", "\w", "\W" give _S_token_quoted_class,
//   - "\cX" gives an ordinary character holding X,
//   - "\xNN" and "\uNNNN" give _S_token_hex_num holding the hex digits,
//   - a run of decimal digits gives _S_token_backref,
//   - anything else is an ordinary character holding itself.
//
// Calls __throw_regex_error with error_escape when the input ends right
// after the backslash or in the middle of a "\c", "\x" or "\u" escape,
// or when a "\x"/"\u" escape contains a non-hexadecimal digit.
template<typename _CharT>
  void
  _Scanner<_CharT>::
  _M_eat_escape_ecma()
  {
    if (_M_current == _M_end)
      __throw_regex_error(regex_constants::error_escape);

    auto __c = *_M_current++;
    auto __pos = _M_find_escape(_M_ctype.narrow(__c, '\0'));

    if (__pos != nullptr && (__c != 'b' || _M_state == _S_state_in_bracket))
      {
        _M_token = _S_token_ord_char;
        _M_value.assign(1, *__pos);
      }
    else if (__c == 'b')
      {
        _M_token = _S_token_word_bound;
        _M_value.assign(1, 'p');
      }
    else if (__c == 'B')
      {
        _M_token = _S_token_word_bound;
        _M_value.assign(1, 'n');
      }
    // N3376 28.13
    else if (__c == 'd'
             || __c == 'D'
             || __c == 's'
             || __c == 'S'
             || __c == 'w'
             || __c == 'W')
      {
        _M_token = _S_token_quoted_class;
        _M_value.assign(1, __c);
      }
    else if (__c == 'c')
      {
        // The message starts with a capital letter like every other
        // message passed to __throw_regex_error in this file.
        if (_M_current == _M_end)
          __throw_regex_error(regex_constants::error_escape,
                              "Invalid '\\cX' control character in "
                              "regular expression");
        _M_token = _S_token_ord_char;
        _M_value.assign(1, *_M_current++);
      }
    else if (__c == 'x' || __c == 'u')
      {
        // "\x" takes exactly two hexadecimal digits, "\u" exactly four.
        _M_value.clear();
        const int __n = __c == 'x' ? 2 : 4;
        for (int __i = 0; __i < __n; __i++)
          {
            if (_M_current == _M_end
                || !_M_ctype.is(_CtypeT::xdigit, *_M_current))
              __throw_regex_error(regex_constants::error_escape,
                                  __n == 2
                                  ? "Invalid '\\xNN' control character in "
                                    "regular expression"
                                  : "Invalid '\\uNNNN' control character in "
                                    "regular expression");
            _M_value += *_M_current++;
          }
        _M_token = _S_token_hex_num;
      }
    // ECMAScript recognizes multi-digit back-references.
    else if (_M_ctype.is(_CtypeT::digit, __c))
      {
        _M_value.assign(1, __c);
        while (_M_current != _M_end
               && _M_ctype.is(_CtypeT::digit, *_M_current))
          _M_value += *_M_current++;
        _M_token = _S_token_backref;
      }
    else
      {
        _M_token = _S_token_ord_char;
        _M_value.assign(1, __c);
      }
  }
// Differences between styles:
// 1) Extended doesn't support backref, but basic does.
// Scan an escape sequence using the POSIX rules.
//
// On entry _M_current refers to the character after the backslash.  An
// escaped special character becomes an ordinary character.  For the awk
// grammar every other escape is handed to _M_eat_escape_awk.  In the
// basic grammars a non-zero digit gives _S_token_backref.  Any other
// escape is an error_escape when __STRICT_ANSI__ is defined and an
// ordinary character otherwise.
// Calls __throw_regex_error with error_escape when the input ends right
// after the backslash.
template<typename _CharT>
  void
  _Scanner<_CharT>::
  _M_eat_escape_posix()
  {
    if (_M_current == _M_end)
      __throw_regex_error(regex_constants::error_escape);

    const auto __ch = *_M_current;

    // strchr also matches the terminating NUL of _M_spec_char, so a hit
    // on the terminator does not count as a special character.
    const auto __hit
      = __builtin_strchr(_M_spec_char, _M_ctype.narrow(__ch, '\0'));
    const bool __is_special = __hit != nullptr && *__hit != '\0';

    // We MUST check for awk before handling backrefs: awk has none.
    // _M_eat_escape_awk consumes the escaped character itself, hence the
    // early return that skips the increment at the bottom.
    if (!__is_special && _M_is_awk())
      {
        _M_eat_escape_awk();
        return;
      }

    if (__is_special)
      {
        _M_token = _S_token_ord_char;
        _M_value.assign(1, __ch);
      }
    else if (_M_is_basic()
             && _M_ctype.is(_CtypeT::digit, __ch)
             && __ch != '0')
      {
        _M_token = _S_token_backref;
        _M_value.assign(1, __ch);
      }
    else
      {
#ifdef __STRICT_ANSI__
        // POSIX says it is undefined to escape ordinary characters
        __throw_regex_error(regex_constants::error_escape);
#else
        _M_token = _S_token_ord_char;
        _M_value.assign(1, __ch);
#endif
      }

    ++_M_current;
  }
// Scan an escape sequence using the awk rules.
//
// The function reads the escaped character without checking for the end
// of the input; _M_eat_escape_posix performs that check before calling
// it.  A character found by _M_find_escape becomes an ordinary character
// holding the mapped value.  One to three octal digits ("\ddd") give
// _S_token_oct_num with the digits in _M_value.  Anything else makes
// the function call __throw_regex_error with error_escape.
template<typename _CharT>
  void
  _Scanner<_CharT>::
  _M_eat_escape_awk()
  {
    const auto __ch = *_M_current++;

    const auto __mapped = _M_find_escape(_M_ctype.narrow(__ch, '\0'));
    if (__mapped != nullptr)
      {
        _M_token = _S_token_ord_char;
        _M_value.assign(1, *__mapped);
        return;
      }

    // An octal digit is a digit other than '8' and '9'.
    auto __is_octal = [this](_CharT __d) -> bool
      {
        return _M_ctype.is(_CtypeT::digit, __d)
          && __d != '8'
          && __d != '9';
      };

    if (!__is_octal(__ch))
      __throw_regex_error(regex_constants::error_escape);

    // \ddd for oct representation: take up to two more octal digits.
    _M_value.assign(1, __ch);
    int __extra = 0;
    while (__extra < 2
           && _M_current != _M_end
           && __is_octal(*_M_current))
      {
        _M_value += *_M_current++;
        ++__extra;
      }
    _M_token = _S_token_oct_num;
  }
// Eats a character class or throws an exception.
// __ch is one of ':', '.' or '='. On return, _M_current points to the
// character after the closing ']'.
// Read the name of a bracketed class up to the closing "__ch]" pair.
//
// Everything before the next occurrence of __ch is collected in
// _M_value; the delimiter and the ']' that must follow it are consumed.
// If either of them is missing the function calls __throw_regex_error
// with error_ctype when __ch is ':' and with error_collate otherwise.
template<typename _CharT>
  void
  _Scanner<_CharT>::
  _M_eat_class(char __ch)
  {
    _M_value.clear();
    while (_M_current != _M_end && *_M_current != __ch)
      _M_value += *_M_current++;

    // Consume the delimiter first and then the ']' right behind it.
    bool __terminated = false;
    if (_M_current != _M_end && *_M_current++ == __ch)
      __terminated = _M_current != _M_end && *_M_current++ == ']';

    if (!__terminated)
      {
        if (__ch == ':')
          __throw_regex_error(regex_constants::error_ctype);
        else
          __throw_regex_error(regex_constants::error_collate);
      }
  }
#ifdef _GLIBCXX_DEBUG
// Debug helper: write a one-line description of the current token to
// ostr and return ostr.  Tokens that carry text also print _M_value.
// A token value without a case below triggers _GLIBCXX_DEBUG_ASSERT and
// writes nothing.
template<typename _CharT>
  std::ostream&
  _Scanner<_CharT>::
  _M_print(std::ostream& ostr)
  {
    switch (_M_token)
      {
      case _S_token_anychar:
        return ostr << "any-character\n";
      case _S_token_backref:
        return ostr << "backref\n";
      case _S_token_bracket_begin:
        return ostr << "bracket-begin\n";
      case _S_token_bracket_neg_begin:
        return ostr << "bracket-neg-begin\n";
      case _S_token_bracket_end:
        return ostr << "bracket-end\n";
      case _S_token_char_class_name:
        return ostr << "char-class-name \"" << _M_value << "\"\n";
      case _S_token_closure0:
        return ostr << "closure0\n";
      case _S_token_closure1:
        return ostr << "closure1\n";
      case _S_token_collsymbol:
        return ostr << "collsymbol \"" << _M_value << "\"\n";
      case _S_token_comma:
        return ostr << "comma\n";
      case _S_token_dup_count:
        return ostr << "dup count: " << _M_value << "\n";
      case _S_token_eof:
        return ostr << "EOF\n";
      case _S_token_equiv_class_name:
        return ostr << "equiv-class-name \"" << _M_value << "\"\n";
      case _S_token_interval_begin:
        return ostr << "interval begin\n";
      case _S_token_interval_end:
        return ostr << "interval end\n";
      case _S_token_line_begin:
        return ostr << "line begin\n";
      case _S_token_line_end:
        return ostr << "line end\n";
      case _S_token_opt:
        return ostr << "opt\n";
      case _S_token_or:
        return ostr << "or\n";
      case _S_token_ord_char:
        return ostr << "ordinary character: \"" << _M_value << "\"\n";
      case _S_token_subexpr_begin:
        return ostr << "subexpr begin\n";
      case _S_token_subexpr_no_group_begin:
        return ostr << "no grouping subexpr begin\n";
      case _S_token_subexpr_lookahead_begin:
        return ostr << "lookahead subexpr begin\n";
      case _S_token_subexpr_end:
        return ostr << "subexpr end\n";
      case _S_token_unknown:
        return ostr << "-- unknown token --\n";
      case _S_token_oct_num:
        return ostr << "oct number " << _M_value << "\n";
      case _S_token_hex_num:
        return ostr << "hex number " << _M_value << "\n";
      case _S_token_quoted_class:
        return ostr << "quoted class " << "\\" << _M_value << "\n";
      default:
        _GLIBCXX_DEBUG_ASSERT(false);
      }
    return ostr;
  }
#endif
} // namespace __detail
_GLIBCXX_END_NAMESPACE_VERSION
} // namespace