json: Nicer recovery from lexical errors

When the lexer chokes on an input character, it consumes the
character, emits a JSON error token, and enters its start state.  This
can lead to suboptimal error recovery.  For instance, input

    0123 ,

produces the tokens

    JSON_ERROR    01
    JSON_INTEGER  23
    JSON_COMMA    ,

Make the lexer skip characters after a lexical error until a
structural character ('[', ']', '{', '}', ':', ','), an ASCII control
character, or '\xFE', or '\xFF'.

Note that we must not skip ASCII control characters, '\xFE', '\xFF',
because those are documented to force the JSON parser into known-good
state, by docs/interop/qmp-spec.txt.

The lexer now produces

    JSON_ERROR    01
    JSON_COMMA    ,

Update qmp-test for the nicer error recovery: QMP now reports just one
error for input %p instead of two.  Also drop the newline after %p; it
was needed to tease out the second error.

Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-Id: <20180831075841.13363-5-armbru@redhat.com>
[Conflict with commit ebb4d82d888 resolved]
This commit is contained in:
Markus Armbruster 2018-08-31 09:58:39 +02:00
parent c0ee3afa7f
commit 0f07a5d5f1
2 changed files with 30 additions and 18 deletions

View File

@ -101,6 +101,7 @@
enum json_lexer_state {
IN_ERROR = 0, /* must really be 0, see json_lexer[] */
IN_RECOVERY,
IN_DQ_STRING_ESCAPE,
IN_DQ_STRING,
IN_SQ_STRING_ESCAPE,
@ -130,6 +131,28 @@ QEMU_BUILD_BUG_ON(IN_START_INTERP != IN_START + 1);
static const uint8_t json_lexer[][256] = {
/* Relies on default initialization to IN_ERROR! */
/* error recovery */
[IN_RECOVERY] = {
/*
* Skip characters until a structural character, an ASCII
* control character other than '\t', or impossible UTF-8
* bytes '\xFE', '\xFF'. Structural characters and line
* endings are promising resynchronization points. Clients
* may use the others to force the JSON parser into known-good
* state; see docs/interop/qmp-spec.txt.
*/
[0 ... 0x1F] = IN_START | LOOKAHEAD,
[0x20 ... 0xFD] = IN_RECOVERY,
[0xFE ... 0xFF] = IN_START | LOOKAHEAD,
['\t'] = IN_RECOVERY,
['['] = IN_START | LOOKAHEAD,
[']'] = IN_START | LOOKAHEAD,
['{'] = IN_START | LOOKAHEAD,
['}'] = IN_START | LOOKAHEAD,
[':'] = IN_START | LOOKAHEAD,
[','] = IN_START | LOOKAHEAD,
},
/* double quote string */
[IN_DQ_STRING_ESCAPE] = {
[0x20 ... 0xFD] = IN_DQ_STRING,
@ -301,26 +324,18 @@ static void json_lexer_feed_char(JSONLexer *lexer, char ch, bool flush)
/* fall through */
case JSON_SKIP:
g_string_truncate(lexer->token, 0);
/* fall through */
case IN_START:
new_state = lexer->start_state;
break;
case IN_ERROR:
/* XXX: To avoid having previous bad input leaving the parser in an
* unresponsive state where we consume unpredictable amounts of
* subsequent "good" input, percolate this error state up to the
* parser by emitting a JSON_ERROR token, then reset lexer state.
*
* Also note that this handling is required for reliable channel
* negotiation between QMP and the guest agent, since chr(0xFF)
* is placed at the beginning of certain events to ensure proper
* delivery when the channel is in an unknown state. chr(0xFF) is
* never a valid ASCII/UTF-8 sequence, so this should reliably
* induce an error/flush state.
*/
json_message_process_token(lexer, lexer->token, JSON_ERROR,
lexer->x, lexer->y);
new_state = IN_RECOVERY;
/* fall through */
case IN_RECOVERY:
g_string_truncate(lexer->token, 0);
lexer->state = lexer->start_state;
return;
break;
default:
break;
}

View File

@ -76,10 +76,7 @@ static void test_malformed(QTestState *qts)
assert_recovered(qts);
/* lexical error: interpolation */
qtest_qmp_send_raw(qts, "%%p\n");
/* two errors, one for "%", one for "p" */
resp = qtest_qmp_receive(qts);
qmp_assert_error_class(resp, "GenericError");
qtest_qmp_send_raw(qts, "%%p");
resp = qtest_qmp_receive(qts);
qmp_assert_error_class(resp, "GenericError");
assert_recovered(qts);