re PR libstdc++/77356 (regex error for an ECMAScript syntax string)
PR libstdc++/77356 * include/bits/regex_compiler.tcc(_M_insert_bracket_matcher, _M_expression_term): Modify to support dash literal. * include/bits/regex_scanner.h: Add dash as a token type to make it different from the mandated dash literal by escaping. * include/bits/regex_scanner.tcc(_M_scan_in_bracket): Emit dash token in bracket expression parsing. * testsuite/28_regex/regression.cc: Add new testcases. From-SVN: r239794
This commit is contained in:
parent
d8921e81e9
commit
4aebb4e4a6
@ -1,3 +1,14 @@
|
||||
2016-08-27 Tim Shen <timshen@google.com>
|
||||
|
||||
PR libstdc++/77356
|
||||
* include/bits/regex_compiler.tcc(_M_insert_bracket_matcher,
|
||||
_M_expression_term): Modify to support dash literal.
|
||||
* include/bits/regex_scanner.h: Add dash as a token type to make
|
||||
it different from the mandated dash literal by escaping.
|
||||
* include/bits/regex_scanner.tcc(_M_scan_in_bracket): Emit dash
|
||||
token in bracket expression parsing.
|
||||
* testsuite/28_regex/regression.cc: Add new testcases.
|
||||
|
||||
2016-08-26 Jonathan Wakely <jwakely@redhat.com>
|
||||
|
||||
PR libstdc++/51960
|
||||
|
@ -426,13 +426,21 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
|
||||
pair<bool, _CharT> __last_char; // Optional<_CharT>
|
||||
__last_char.first = false;
|
||||
if (!(_M_flags & regex_constants::ECMAScript))
|
||||
if (_M_try_char())
|
||||
{
|
||||
__matcher._M_add_char(_M_value[0]);
|
||||
__last_char.first = true;
|
||||
__last_char.second = _M_value[0];
|
||||
}
|
||||
{
|
||||
if (_M_try_char())
|
||||
{
|
||||
__last_char.first = true;
|
||||
__last_char.second = _M_value[0];
|
||||
}
|
||||
else if (_M_match_token(_ScannerT::_S_token_bracket_dash))
|
||||
{
|
||||
__last_char.first = true;
|
||||
__last_char.second = '-';
|
||||
}
|
||||
}
|
||||
while (_M_expression_term(__last_char, __matcher));
|
||||
if (__last_char.first)
|
||||
__matcher._M_add_char(__last_char.second);
|
||||
__matcher._M_ready();
|
||||
_M_stack.push(_StateSeqT(
|
||||
*_M_nfa,
|
||||
@ -449,19 +457,43 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
|
||||
if (_M_match_token(_ScannerT::_S_token_bracket_end))
|
||||
return false;
|
||||
|
||||
const auto __push_char = [&](_CharT __ch)
|
||||
{
|
||||
if (__last_char.first)
|
||||
__matcher._M_add_char(__last_char.second);
|
||||
else
|
||||
__last_char.first = true;
|
||||
__last_char.second = __ch;
|
||||
};
|
||||
const auto __flush = [&]
|
||||
{
|
||||
if (__last_char.first)
|
||||
{
|
||||
__matcher._M_add_char(__last_char.second);
|
||||
__last_char.first = false;
|
||||
}
|
||||
};
|
||||
|
||||
if (_M_match_token(_ScannerT::_S_token_collsymbol))
|
||||
{
|
||||
auto __symbol = __matcher._M_add_collate_element(_M_value);
|
||||
if (__symbol.size() == 1)
|
||||
{
|
||||
__last_char.first = true;
|
||||
__last_char.second = __symbol[0];
|
||||
}
|
||||
__push_char(__symbol[0]);
|
||||
else
|
||||
__flush();
|
||||
}
|
||||
else if (_M_match_token(_ScannerT::_S_token_equiv_class_name))
|
||||
__matcher._M_add_equivalence_class(_M_value);
|
||||
{
|
||||
__flush();
|
||||
__matcher._M_add_equivalence_class(_M_value);
|
||||
}
|
||||
else if (_M_match_token(_ScannerT::_S_token_char_class_name))
|
||||
__matcher._M_add_character_class(_M_value, false);
|
||||
{
|
||||
__flush();
|
||||
__matcher._M_add_character_class(_M_value, false);
|
||||
}
|
||||
else if (_M_try_char())
|
||||
__push_char(_M_value[0]);
|
||||
// POSIX doesn't allow '-' as a start-range char (say [a-z--0]),
|
||||
// except when the '-' is the first or last character in the bracket
|
||||
// expression ([--0]). ECMAScript treats all '-' after a range as a
|
||||
@ -472,55 +504,55 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
|
||||
// Clang (3.5) always uses ECMAScript style even in its POSIX syntax.
|
||||
//
|
||||
// It turns out that no one reads BNFs ;)
|
||||
else if (_M_try_char())
|
||||
else if (_M_match_token(_ScannerT::_S_token_bracket_dash))
|
||||
{
|
||||
if (!__last_char.first)
|
||||
{
|
||||
__matcher._M_add_char(_M_value[0]);
|
||||
if (_M_value[0] == '-'
|
||||
&& !(_M_flags & regex_constants::ECMAScript))
|
||||
if (!(_M_flags & regex_constants::ECMAScript))
|
||||
{
|
||||
if (_M_match_token(_ScannerT::_S_token_bracket_end))
|
||||
return false;
|
||||
{
|
||||
__push_char('-');
|
||||
return false;
|
||||
}
|
||||
__throw_regex_error(
|
||||
regex_constants::error_range,
|
||||
"Unexpected dash in bracket expression. For POSIX syntax, "
|
||||
"a dash is not treated literally only when it is at "
|
||||
"beginning or end.");
|
||||
}
|
||||
__last_char.first = true;
|
||||
__last_char.second = _M_value[0];
|
||||
__push_char('-');
|
||||
}
|
||||
else
|
||||
{
|
||||
if (_M_value[0] == '-')
|
||||
if (_M_try_char())
|
||||
{
|
||||
if (_M_try_char())
|
||||
{
|
||||
__matcher._M_make_range(__last_char.second , _M_value[0]);
|
||||
__last_char.first = false;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (_M_scanner._M_get_token()
|
||||
!= _ScannerT::_S_token_bracket_end)
|
||||
__throw_regex_error(
|
||||
regex_constants::error_range,
|
||||
"Unexpected end of bracket expression.");
|
||||
__matcher._M_add_char(_M_value[0]);
|
||||
}
|
||||
__matcher._M_make_range(__last_char.second, _M_value[0]);
|
||||
__last_char.first = false;
|
||||
}
|
||||
else if (_M_match_token(_ScannerT::_S_token_bracket_dash))
|
||||
{
|
||||
__matcher._M_make_range(__last_char.second, '-');
|
||||
__last_char.first = false;
|
||||
}
|
||||
else
|
||||
{
|
||||
__matcher._M_add_char(_M_value[0]);
|
||||
__last_char.second = _M_value[0];
|
||||
if (_M_scanner._M_get_token()
|
||||
!= _ScannerT::_S_token_bracket_end)
|
||||
__throw_regex_error(
|
||||
regex_constants::error_range,
|
||||
"Character is expected after a dash.");
|
||||
__push_char('-');
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (_M_match_token(_ScannerT::_S_token_quoted_class))
|
||||
__matcher._M_add_character_class(_M_value,
|
||||
_M_ctype.is(_CtypeT::upper,
|
||||
_M_value[0]));
|
||||
{
|
||||
__flush();
|
||||
__matcher._M_add_character_class(_M_value,
|
||||
_M_ctype.is(_CtypeT::upper,
|
||||
_M_value[0]));
|
||||
}
|
||||
else
|
||||
__throw_regex_error(regex_constants::error_brack,
|
||||
"Unexpected character in bracket expression.");
|
||||
|
@ -43,7 +43,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
|
||||
{
|
||||
public:
|
||||
/// Token types returned from the scanner.
|
||||
enum _TokenT
|
||||
enum _TokenT : unsigned
|
||||
{
|
||||
_S_token_anychar,
|
||||
_S_token_ord_char,
|
||||
@ -73,7 +73,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
|
||||
_S_token_comma,
|
||||
_S_token_dup_count,
|
||||
_S_token_eof,
|
||||
_S_token_unknown
|
||||
_S_token_bracket_dash,
|
||||
_S_token_unknown = -1u
|
||||
};
|
||||
|
||||
protected:
|
||||
|
@ -210,7 +210,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
|
||||
|
||||
auto __c = *_M_current++;
|
||||
|
||||
if (__c == '[')
|
||||
if (__c == '-')
|
||||
_M_token = _S_token_bracket_dash;
|
||||
else if (__c == '[')
|
||||
{
|
||||
if (_M_current == _M_end)
|
||||
__throw_regex_error(regex_constants::error_brack,
|
||||
|
@ -61,12 +61,35 @@ test03()
|
||||
VERIFY(!regex_search_debug("a", regex(R"(\b$)"), regex_constants::match_not_eow));
|
||||
}
|
||||
|
||||
// PR libstdc++/77356
|
||||
void
|
||||
test04()
|
||||
{
|
||||
bool test __attribute__((unused)) = true;
|
||||
|
||||
static const char* kNumericAnchor ="(\\$|usd)(usd|\\$|to|and|up to|[0-9,\\.\\-\\sk])+";
|
||||
const std::regex re(kNumericAnchor);
|
||||
(void)re;
|
||||
}
|
||||
|
||||
void
|
||||
test05()
|
||||
{
|
||||
bool test __attribute__((unused)) = true;
|
||||
|
||||
VERIFY(regex_match_debug("!", std::regex("[![:alnum:]]")));
|
||||
VERIFY(regex_match_debug("-", std::regex("[a-]", regex_constants::basic)));
|
||||
VERIFY(regex_match_debug("-", std::regex("[a-]")));
|
||||
}
|
||||
|
||||
int
|
||||
main()
|
||||
{
|
||||
test01();
|
||||
test02();
|
||||
test03();
|
||||
test04();
|
||||
test05();
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user