2009-11-11 10:39:14 -06:00
|
|
|
/*
|
|
|
|
* JSON lexer
|
|
|
|
*
|
|
|
|
* Copyright IBM, Corp. 2009
|
|
|
|
*
|
|
|
|
* Authors:
|
|
|
|
* Anthony Liguori <aliguori@us.ibm.com>
|
|
|
|
*
|
|
|
|
* This work is licensed under the terms of the GNU LGPL, version 2.1 or later.
|
|
|
|
* See the COPYING.LIB file in the top-level directory.
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
|
2016-01-29 17:50:01 +00:00
|
|
|
#include "qemu/osdep.h"
|
2018-08-23 18:40:20 +02:00
|
|
|
#include "json-parser-int.h"
|
2009-11-11 10:39:14 -06:00
|
|
|
|
2011-06-01 12:14:52 -05:00
|
|
|
#define MAX_TOKEN_SIZE (64ULL << 20)
|
|
|
|
|
2009-11-11 10:39:14 -06:00
|
|
|
/*
|
2018-08-23 18:39:46 +02:00
|
|
|
* From RFC 8259 "The JavaScript Object Notation (JSON) Data
|
|
|
|
* Interchange Format", with [comments in brackets]:
|
2016-06-09 20:48:06 -06:00
|
|
|
*
|
2018-08-23 18:39:46 +02:00
|
|
|
* The set of tokens includes six structural characters, strings,
|
|
|
|
* numbers, and three literal names.
|
2016-06-09 20:48:06 -06:00
|
|
|
*
|
2018-08-23 18:39:46 +02:00
|
|
|
* These are the six structural characters:
|
2016-06-09 20:48:06 -06:00
|
|
|
*
|
2018-08-23 18:39:46 +02:00
|
|
|
* begin-array = ws %x5B ws ; [ left square bracket
|
|
|
|
* begin-object = ws %x7B ws ; { left curly bracket
|
|
|
|
* end-array = ws %x5D ws ; ] right square bracket
|
|
|
|
* end-object = ws %x7D ws ; } right curly bracket
|
|
|
|
* name-separator = ws %x3A ws ; : colon
|
|
|
|
* value-separator = ws %x2C ws ; , comma
|
2016-06-09 20:48:06 -06:00
|
|
|
*
|
2018-08-23 18:39:46 +02:00
|
|
|
* Insignificant whitespace is allowed before or after any of the six
|
|
|
|
* structural characters.
|
|
|
|
* [This lexer accepts it before or after any token, which is actually
|
|
|
|
* the same, as the grammar always has structural characters between
|
|
|
|
* other tokens.]
|
2016-06-09 20:48:06 -06:00
|
|
|
*
|
2018-08-23 18:39:46 +02:00
|
|
|
* ws = *(
|
|
|
|
* %x20 / ; Space
|
|
|
|
* %x09 / ; Horizontal tab
|
|
|
|
* %x0A / ; Line feed or New line
|
|
|
|
* %x0D ) ; Carriage return
|
2009-11-11 10:39:14 -06:00
|
|
|
*
|
2018-08-23 18:39:46 +02:00
|
|
|
* [...] three literal names:
|
|
|
|
* false null true
|
|
|
|
* [This lexer accepts [a-z]+, and leaves rejecting unknown literal
|
|
|
|
* names to the parser.]
|
|
|
|
*
|
|
|
|
* [Numbers:]
|
|
|
|
*
|
|
|
|
* number = [ minus ] int [ frac ] [ exp ]
|
|
|
|
* decimal-point = %x2E ; .
|
|
|
|
* digit1-9 = %x31-39 ; 1-9
|
|
|
|
* e = %x65 / %x45 ; e E
|
|
|
|
* exp = e [ minus / plus ] 1*DIGIT
|
|
|
|
* frac = decimal-point 1*DIGIT
|
|
|
|
* int = zero / ( digit1-9 *DIGIT )
|
|
|
|
* minus = %x2D ; -
|
|
|
|
* plus = %x2B ; +
|
|
|
|
* zero = %x30 ; 0
|
|
|
|
*
|
|
|
|
* [Strings:]
|
|
|
|
* string = quotation-mark *char quotation-mark
|
|
|
|
*
|
|
|
|
* char = unescaped /
|
|
|
|
* escape (
|
|
|
|
* %x22 / ; " quotation mark U+0022
|
|
|
|
* %x5C / ; \ reverse solidus U+005C
|
|
|
|
* %x2F / ; / solidus U+002F
|
|
|
|
* %x62 / ; b backspace U+0008
|
|
|
|
* %x66 / ; f form feed U+000C
|
|
|
|
* %x6E / ; n line feed U+000A
|
|
|
|
* %x72 / ; r carriage return U+000D
|
|
|
|
* %x74 / ; t tab U+0009
|
|
|
|
* %x75 4HEXDIG ) ; uXXXX U+XXXX
|
|
|
|
* escape = %x5C ; \
|
|
|
|
* quotation-mark = %x22 ; "
|
|
|
|
* unescaped = %x20-21 / %x23-5B / %x5D-10FFFF
|
2018-08-23 18:39:53 +02:00
|
|
|
* [This lexer accepts any non-control character after escape, and
|
|
|
|
* leaves rejecting invalid ones to the parser.]
|
2018-08-23 18:39:46 +02:00
|
|
|
*
|
|
|
|
*
|
|
|
|
* Extensions over RFC 8259:
|
|
|
|
* - Extra escape sequence in strings:
|
|
|
|
* 0x27 (apostrophe) is recognized after escape, too
|
|
|
|
* - Single-quoted strings:
|
|
|
|
* Like double-quoted strings, except they're delimited by %x27
|
|
|
|
* (apostrophe) instead of %x22 (quotation mark), and can't contain
|
|
|
|
* unescaped apostrophe, but can contain unescaped quotation mark.
|
2018-08-23 18:40:05 +02:00
|
|
|
* - Interpolation, if enabled:
|
2018-08-23 18:40:07 +02:00
|
|
|
* The lexer accepts %[A-Za-z0-9]*, and leaves rejecting invalid
|
|
|
|
* ones to the parser.
|
2018-08-23 18:39:46 +02:00
|
|
|
*
|
|
|
|
* Note:
|
2018-08-23 18:39:52 +02:00
|
|
|
* - Input must be encoded in modified UTF-8.
|
2018-08-23 18:39:46 +02:00
|
|
|
* - Decoding and validating is left to the parser.
|
2009-11-11 10:39:14 -06:00
|
|
|
*/
|
|
|
|
|
|
|
|
/*
 * States of the lexer's state machine.
 *
 * Each state selects one row of the transition table json_lexer[][].
 * Numbering starts at 1 because value 0 is JSON_ERROR, and the
 * compile-time checks following this enum require every state to be
 * numerically below JSON_MIN.
 */
enum json_lexer_state {
    /* skipping input after a lexical error */
    IN_RECOVERY = 1,

    /* inside a double-quoted string; *_ESCAPE is "just saw a backslash" */
    IN_DQ_STRING_ESCAPE,
    IN_DQ_STRING,

    /* inside a single-quoted string (extension over RFC 8259) */
    IN_SQ_STRING_ESCAPE,
    IN_SQ_STRING,

    /* numbers: leading zero, exponent part, fraction part, integer part */
    IN_ZERO,
    IN_EXP_DIGITS,
    IN_EXP_SIGN,
    IN_EXP_E,
    IN_MANTISSA,
    IN_MANTISSA_DIGITS,
    IN_DIGITS,
    IN_SIGN,

    /* literal names, lexed as [a-z]+ */
    IN_KEYWORD,

    /* interpolation, lexed as %[A-Za-z0-9]* */
    IN_INTERP,

    /* start states, without and with interpolation enabled */
    IN_START,
    IN_START_INTERP,            /* must be IN_START + 1 */
};
|
|
|
|
|
2018-08-31 09:58:40 +02:00
|
|
|
QEMU_BUILD_BUG_ON(JSON_ERROR != 0);
|
|
|
|
QEMU_BUILD_BUG_ON(IN_RECOVERY != JSON_ERROR + 1);
|
2018-08-23 18:40:05 +02:00
|
|
|
QEMU_BUILD_BUG_ON((int)JSON_MIN <= (int)IN_START_INTERP);
|
json: Make lexer's "character consumed" logic less confusing
The lexer uses macro TERMINAL_NEEDED_LOOKAHEAD() to decide whether a
state transition consumes the input character. It returns true when
the state transition is defined with the TERMINAL() macro. To detect
that, it checks whether input '\0' would have resulted in the same
state transition, and the new state is not IN_ERROR.
Why does that even work? For all states, the new state on input '\0'
is either IN_ERROR or defined with TERMINAL(). If the state
transition equals the one we'd get for input '\0', it goes to IN_ERROR
or to the argument of TERMINAL(). We never use TERMINAL(IN_ERROR),
because it makes no sense. Thus, if it doesn't go to IN_ERROR, it
must be defined with TERMINAL().
Since this isn't quite confusing enough, we negate the result to get
@char_consumed, and ignore it when @flush is true.
Instead of deriving the lookahead bit from the state transition, make
it explicit. This is easier to understand, and a bit more flexible,
too.
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-Id: <20180831075841.13363-4-armbru@redhat.com>
2018-08-31 09:58:38 +02:00
|
|
|
QEMU_BUILD_BUG_ON(JSON_MAX >= 0x80);
|
2018-08-23 18:40:05 +02:00
|
|
|
QEMU_BUILD_BUG_ON(IN_START_INTERP != IN_START + 1);
|
2015-11-25 22:23:25 +01:00
|
|
|
|
json: Make lexer's "character consumed" logic less confusing
The lexer uses macro TERMINAL_NEEDED_LOOKAHEAD() to decide whether a
state transition consumes the input character. It returns true when
the state transition is defined with the TERMINAL() macro. To detect
that, it checks whether input '\0' would have resulted in the same
state transition, and the new state is not IN_ERROR.
Why does that even work? For all states, the new state on input '\0'
is either IN_ERROR or defined with TERMINAL(). If the state
transition equals the one we'd get for input '\0', it goes to IN_ERROR
or to the argument of TERMINAL(). We never use TERMINAL(IN_ERROR),
because it makes no sense. Thus, if it doesn't go to IN_ERROR, it
must be defined with TERMINAL().
Since this isn't quite confusing enough, we negate the result to get
@char_consumed, and ignore it when @flush is true.
Instead of deriving the lookahead bit from the state transition, make
it explicit. This is easier to understand, and a bit more flexible,
too.
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-Id: <20180831075841.13363-4-armbru@redhat.com>
2018-08-31 09:58:38 +02:00
|
|
|
/*
 * Flag bit stored in a transition table entry next to the new state.
 * When set, the transition does not consume the input character; the
 * character was only looked at.
 */
#define LOOKAHEAD 0x80

/*
 * Row initializer: on every possible input byte, go to @state without
 * consuming the byte.  Designators listed after it in the same row
 * override it for individual bytes.
 */
#define TERMINAL(state) [0x00 ... 0xFF] = ((state) | LOOKAHEAD)
|
2010-05-24 09:39:52 +02:00
|
|
|
|
2009-11-11 10:39:14 -06:00
|
|
|
static const uint8_t json_lexer[][256] = {
|
2015-11-25 22:23:25 +01:00
|
|
|
/* Relies on default initialization to IN_ERROR! */
|
|
|
|
|
json: Nicer recovery from lexical errors
When the lexer chokes on an input character, it consumes the
character, emits a JSON error token, and enters its start state. This
can lead to suboptimal error recovery. For instance, input
0123 ,
produces the tokens
JSON_ERROR 01
JSON_INTEGER 23
JSON_COMMA ,
Make the lexer skip characters after a lexical error until a
structural character ('[', ']', '{', '}', ':', ','), an ASCII control
character, or '\xFE', or '\xFF'.
Note that we must not skip ASCII control characters, '\xFE', '\xFF',
because those are documented to force the JSON parser into known-good
state, by docs/interop/qmp-spec.txt.
The lexer now produces
JSON_ERROR 01
JSON_COMMA ,
Update qmp-test for the nicer error recovery: QMP now reports just one
error for input %p instead of two. Also drop the newline after %p; it
was needed to tease out the second error.
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-Id: <20180831075841.13363-5-armbru@redhat.com>
[Conflict with commit ebb4d82d888 resolved]
2018-08-31 09:58:39 +02:00
|
|
|
/* error recovery */
|
|
|
|
[IN_RECOVERY] = {
|
|
|
|
/*
|
|
|
|
* Skip characters until a structural character, an ASCII
|
|
|
|
* control character other than '\t', or impossible UTF-8
|
|
|
|
* bytes '\xFE', '\xFF'. Structural characters and line
|
|
|
|
* endings are promising resynchronization points. Clients
|
|
|
|
* may use the others to force the JSON parser into known-good
|
|
|
|
* state; see docs/interop/qmp-spec.txt.
|
|
|
|
*/
|
|
|
|
[0 ... 0x1F] = IN_START | LOOKAHEAD,
|
|
|
|
[0x20 ... 0xFD] = IN_RECOVERY,
|
|
|
|
[0xFE ... 0xFF] = IN_START | LOOKAHEAD,
|
|
|
|
['\t'] = IN_RECOVERY,
|
|
|
|
['['] = IN_START | LOOKAHEAD,
|
|
|
|
[']'] = IN_START | LOOKAHEAD,
|
|
|
|
['{'] = IN_START | LOOKAHEAD,
|
|
|
|
['}'] = IN_START | LOOKAHEAD,
|
|
|
|
[':'] = IN_START | LOOKAHEAD,
|
|
|
|
[','] = IN_START | LOOKAHEAD,
|
|
|
|
},
|
|
|
|
|
2009-11-11 10:39:14 -06:00
|
|
|
/* double quote string */
|
|
|
|
[IN_DQ_STRING_ESCAPE] = {
|
2018-08-23 18:39:53 +02:00
|
|
|
[0x20 ... 0xFD] = IN_DQ_STRING,
|
2009-11-11 10:39:14 -06:00
|
|
|
},
|
|
|
|
[IN_DQ_STRING] = {
|
json: Leave rejecting invalid UTF-8 to parser
Both the lexer and the parser (attempt to) validate UTF-8 in JSON
strings.
The lexer rejects bytes that can't occur in valid UTF-8: \xC0..\xC1,
\xF5..\xFF. This rejects some, but not all invalid UTF-8. It also
rejects ASCII control characters \x00..\x1F, in accordance with RFC
8259 (see recent commit "json: Reject unescaped control characters").
When the lexer rejects, it ends the token right after the first bad
byte. Good when the bad byte is a newline. Not so good when it's
something like an overlong sequence in the middle of a string. For
instance, input
{"abc\xC0\xAFijk": 1}\n
produces the tokens
JSON_LCURLY {
JSON_ERROR "abc\xC0
JSON_ERROR \xAF
JSON_KEYWORD ijk
JSON_ERROR ": 1}\n
The parser then reports four errors
Invalid JSON syntax
Invalid JSON syntax
JSON parse error, invalid keyword 'ijk'
Invalid JSON syntax
before it recovers at the newline.
The commit before previous made the parser reject invalid UTF-8
sequences. Since then, anything the lexer rejects, the parser would
reject as well. Thus, the lexer's rejecting is unnecessary for
correctness, and harmful for error reporting.
However, we want to keep rejecting ASCII control characters in the
lexer, because that produces the behavior we want for unclosed
strings.
We also need to keep rejecting \xFF in the lexer, because we
documented that as a way to reset the JSON parser
(docs/interop/qmp-spec.txt section 2.6 QGA Synchronization), which
means we can't change how we recover from this error now. I wish we
hadn't done that.
I think we should treat \xFE the same as \xFF.
Change the lexer to accept \xC0..\xC1 and \xF5..\xFD. It now rejects
only \x00..\x1F and \xFE..\xFF. Error reporting for invalid UTF-8 in
strings is much improved, except for \xFE and \xFF. For the example
above, the lexer now produces
JSON_LCURLY {
JSON_STRING "abc\xC0\xAFijk"
JSON_COLON :
JSON_INTEGER 1
JSON_RCURLY
and the parser reports just
JSON parse error, invalid UTF-8 sequence in string
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-Id: <20180823164025.12553-25-armbru@redhat.com>
2018-08-23 18:39:51 +02:00
|
|
|
[0x20 ... 0xFD] = IN_DQ_STRING,
|
2009-11-11 10:39:14 -06:00
|
|
|
['\\'] = IN_DQ_STRING_ESCAPE,
|
2010-05-24 09:39:53 +02:00
|
|
|
['"'] = JSON_STRING,
|
2009-11-11 10:39:14 -06:00
|
|
|
},
|
|
|
|
|
|
|
|
/* single quote string */
|
|
|
|
[IN_SQ_STRING_ESCAPE] = {
|
2018-08-23 18:39:53 +02:00
|
|
|
[0x20 ... 0xFD] = IN_SQ_STRING,
|
2009-11-11 10:39:14 -06:00
|
|
|
},
|
|
|
|
[IN_SQ_STRING] = {
|
json: Leave rejecting invalid UTF-8 to parser
Both the lexer and the parser (attempt to) validate UTF-8 in JSON
strings.
The lexer rejects bytes that can't occur in valid UTF-8: \xC0..\xC1,
\xF5..\xFF. This rejects some, but not all invalid UTF-8. It also
rejects ASCII control characters \x00..\x1F, in accordance with RFC
8259 (see recent commit "json: Reject unescaped control characters").
When the lexer rejects, it ends the token right after the first bad
byte. Good when the bad byte is a newline. Not so good when it's
something like an overlong sequence in the middle of a string. For
instance, input
{"abc\xC0\xAFijk": 1}\n
produces the tokens
JSON_LCURLY {
JSON_ERROR "abc\xC0
JSON_ERROR \xAF
JSON_KEYWORD ijk
JSON_ERROR ": 1}\n
The parser then reports four errors
Invalid JSON syntax
Invalid JSON syntax
JSON parse error, invalid keyword 'ijk'
Invalid JSON syntax
before it recovers at the newline.
The commit before previous made the parser reject invalid UTF-8
sequences. Since then, anything the lexer rejects, the parser would
reject as well. Thus, the lexer's rejecting is unnecessary for
correctness, and harmful for error reporting.
However, we want to keep rejecting ASCII control characters in the
lexer, because that produces the behavior we want for unclosed
strings.
We also need to keep rejecting \xFF in the lexer, because we
documented that as a way to reset the JSON parser
(docs/interop/qmp-spec.txt section 2.6 QGA Synchronization), which
means we can't change how we recover from this error now. I wish we
hadn't done that.
I think we should treat \xFE the same as \xFF.
Change the lexer to accept \xC0..\xC1 and \xF5..\xFD. It now rejects
only \x00..\x1F and \xFE..\xFF. Error reporting for invalid UTF-8 in
strings is much improved, except for \xFE and \xFF. For the example
above, the lexer now produces
JSON_LCURLY {
JSON_STRING "abc\xC0\xAFijk"
JSON_COLON :
JSON_INTEGER 1
JSON_RCURLY
and the parser reports just
JSON parse error, invalid UTF-8 sequence in string
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-Id: <20180823164025.12553-25-armbru@redhat.com>
2018-08-23 18:39:51 +02:00
|
|
|
[0x20 ... 0xFD] = IN_SQ_STRING,
|
2009-11-11 10:39:14 -06:00
|
|
|
['\\'] = IN_SQ_STRING_ESCAPE,
|
2010-05-24 09:39:53 +02:00
|
|
|
['\''] = JSON_STRING,
|
2009-11-11 10:39:14 -06:00
|
|
|
},
|
|
|
|
|
|
|
|
/* Zero */
|
|
|
|
[IN_ZERO] = {
|
|
|
|
TERMINAL(JSON_INTEGER),
|
2018-08-31 09:58:40 +02:00
|
|
|
['0' ... '9'] = JSON_ERROR,
|
2009-11-11 10:39:14 -06:00
|
|
|
['.'] = IN_MANTISSA,
|
|
|
|
},
|
|
|
|
|
|
|
|
/* Float */
|
2018-08-23 18:40:09 +02:00
|
|
|
[IN_EXP_DIGITS] = {
|
2009-11-11 10:39:14 -06:00
|
|
|
TERMINAL(JSON_FLOAT),
|
2018-08-23 18:40:09 +02:00
|
|
|
['0' ... '9'] = IN_EXP_DIGITS,
|
2009-11-11 10:39:14 -06:00
|
|
|
},
|
|
|
|
|
2018-08-23 18:40:09 +02:00
|
|
|
[IN_EXP_SIGN] = {
|
|
|
|
['0' ... '9'] = IN_EXP_DIGITS,
|
2009-11-11 10:39:14 -06:00
|
|
|
},
|
|
|
|
|
|
|
|
[IN_EXP_E] = {
|
2018-08-23 18:40:09 +02:00
|
|
|
['-'] = IN_EXP_SIGN,
|
|
|
|
['+'] = IN_EXP_SIGN,
|
|
|
|
['0' ... '9'] = IN_EXP_DIGITS,
|
2009-11-11 10:39:14 -06:00
|
|
|
},
|
|
|
|
|
|
|
|
[IN_MANTISSA_DIGITS] = {
|
|
|
|
TERMINAL(JSON_FLOAT),
|
|
|
|
['0' ... '9'] = IN_MANTISSA_DIGITS,
|
|
|
|
['e'] = IN_EXP_E,
|
|
|
|
['E'] = IN_EXP_E,
|
|
|
|
},
|
|
|
|
|
|
|
|
[IN_MANTISSA] = {
|
|
|
|
['0' ... '9'] = IN_MANTISSA_DIGITS,
|
|
|
|
},
|
|
|
|
|
|
|
|
/* Number */
|
2018-08-23 18:40:09 +02:00
|
|
|
[IN_DIGITS] = {
|
2009-11-11 10:39:14 -06:00
|
|
|
TERMINAL(JSON_INTEGER),
|
2018-08-23 18:40:09 +02:00
|
|
|
['0' ... '9'] = IN_DIGITS,
|
2009-11-11 10:39:14 -06:00
|
|
|
['e'] = IN_EXP_E,
|
|
|
|
['E'] = IN_EXP_E,
|
|
|
|
['.'] = IN_MANTISSA,
|
|
|
|
},
|
|
|
|
|
2018-08-23 18:40:09 +02:00
|
|
|
[IN_SIGN] = {
|
2009-11-11 10:39:14 -06:00
|
|
|
['0'] = IN_ZERO,
|
2018-08-23 18:40:09 +02:00
|
|
|
['1' ... '9'] = IN_DIGITS,
|
2009-11-11 10:39:14 -06:00
|
|
|
},
|
|
|
|
|
|
|
|
/* keywords */
|
|
|
|
[IN_KEYWORD] = {
|
|
|
|
TERMINAL(JSON_KEYWORD),
|
|
|
|
['a' ... 'z'] = IN_KEYWORD,
|
|
|
|
},
|
|
|
|
|
2018-08-23 18:40:04 +02:00
|
|
|
/* interpolation */
|
|
|
|
[IN_INTERP] = {
|
2018-08-23 18:40:07 +02:00
|
|
|
TERMINAL(JSON_INTERP),
|
|
|
|
['A' ... 'Z'] = IN_INTERP,
|
|
|
|
['a' ... 'z'] = IN_INTERP,
|
|
|
|
['0' ... '9'] = IN_INTERP,
|
2009-11-11 10:39:14 -06:00
|
|
|
},
|
|
|
|
|
2018-08-23 18:40:05 +02:00
|
|
|
/*
|
|
|
|
* Two start states:
|
|
|
|
* - IN_START recognizes JSON tokens with our string extensions
|
|
|
|
* - IN_START_INTERP additionally recognizes interpolation.
|
|
|
|
*/
|
|
|
|
[IN_START ... IN_START_INTERP] = {
|
2009-11-11 10:39:14 -06:00
|
|
|
['"'] = IN_DQ_STRING,
|
|
|
|
['\''] = IN_SQ_STRING,
|
|
|
|
['0'] = IN_ZERO,
|
2018-08-23 18:40:09 +02:00
|
|
|
['1' ... '9'] = IN_DIGITS,
|
|
|
|
['-'] = IN_SIGN,
|
2015-11-25 22:23:26 +01:00
|
|
|
['{'] = JSON_LCURLY,
|
|
|
|
['}'] = JSON_RCURLY,
|
|
|
|
['['] = JSON_LSQUARE,
|
|
|
|
[']'] = JSON_RSQUARE,
|
|
|
|
[','] = JSON_COMMA,
|
|
|
|
[':'] = JSON_COLON,
|
2009-11-11 10:39:14 -06:00
|
|
|
['a' ... 'z'] = IN_KEYWORD,
|
2018-08-31 09:58:41 +02:00
|
|
|
[' '] = IN_START,
|
|
|
|
['\t'] = IN_START,
|
|
|
|
['\r'] = IN_START,
|
|
|
|
['\n'] = IN_START,
|
2009-11-11 10:39:14 -06:00
|
|
|
},
|
2018-08-23 18:40:05 +02:00
|
|
|
[IN_START_INTERP]['%'] = IN_INTERP,
|
2009-11-11 10:39:14 -06:00
|
|
|
};
|
|
|
|
|
json: Make lexer's "character consumed" logic less confusing
The lexer uses macro TERMINAL_NEEDED_LOOKAHEAD() to decide whether a
state transition consumes the input character. It returns true when
the state transition is defined with the TERMINAL() macro. To detect
that, it checks whether input '\0' would have resulted in the same
state transition, and the new state is not IN_ERROR.
Why does that even work? For all states, the new state on input '\0'
is either IN_ERROR or defined with TERMINAL(). If the state
transition equals the one we'd get for input '\0', it goes to IN_ERROR
or to the argument of TERMINAL(). We never use TERMINAL(IN_ERROR),
because it makes no sense. Thus, if it doesn't go to IN_ERROR, it
must be defined with TERMINAL().
Since this isn't quite confusing enough, we negate the result to get
@char_consumed, and ignore it when @flush is true.
Instead of deriving the lookahead bit from the state transition, make
it explicit. This is easier to understand, and a bit more flexible,
too.
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-Id: <20180831075841.13363-4-armbru@redhat.com>
2018-08-31 09:58:38 +02:00
|
|
|
/*
 * Look up the transition for input byte @ch in @lexer's current state.
 *
 * @flush: true when called for "end of input"; no character is ever
 *         reported as consumed then.
 * @char_consumed: set to true when the transition consumes @ch, i.e.
 *         when not flushing and the table entry lacks LOOKAHEAD.
 *
 * Returns the table entry with the LOOKAHEAD flag stripped: the new
 * lexer state or the type of the token to emit.
 */
static inline uint8_t next_state(JSONLexer *lexer, char ch, bool flush,
                                 bool *char_consumed)
{
    const uint8_t *row;
    uint8_t entry;
    bool lookahead;

    assert(lexer->state < ARRAY_SIZE(json_lexer));
    row = json_lexer[lexer->state];

    /* index with the unsigned byte value, char may be signed */
    entry = row[(uint8_t)ch];
    lookahead = (entry & LOOKAHEAD) != 0;

    *char_consumed = !(flush || lookahead);
    return entry & ~LOOKAHEAD;
}
|
|
|
|
|
2018-08-23 18:40:05 +02:00
|
|
|
/*
 * Initialize @lexer.
 *
 * @enable_interpolation selects the start state: IN_START_INTERP when
 * true, IN_START when false.  The lexer begins in that start state with
 * a freshly allocated token buffer and both position counters at zero.
 */
void json_lexer_init(JSONLexer *lexer, bool enable_interpolation)
{
    int initial_state;

    if (enable_interpolation) {
        initial_state = IN_START_INTERP;
    } else {
        initial_state = IN_START;
    }

    lexer->state = initial_state;
    lexer->start_state = initial_state;
    lexer->token = g_string_sized_new(3);
    lexer->y = 0;
    lexer->x = 0;
}
|
|
|
|
|
2018-08-23 18:39:58 +02:00
|
|
|
static void json_lexer_feed_char(JSONLexer *lexer, char ch, bool flush)
|
2009-11-11 10:39:14 -06:00
|
|
|
{
|
json: Clean up how lexer consumes "end of input"
When the lexer isn't in its start state at the end of input, it's
working on a token. To flush it out, it needs to transit to its start
state on "end of input" lookahead.
There are two ways to the start state, depending on the current state:
* If the lexer is in a TERMINAL(JSON_FOO) state, it can emit a
JSON_FOO token.
* Else, it can go to IN_ERROR state, and emit a JSON_ERROR token.
There are complications, however:
* The transition to IN_ERROR state consumes the input character and
adds it to the JSON_ERROR token. The latter is inappropriate for
the "end of input" character, so we suppress that. See also recent
commit a2ec6be72b8 "json: Fix lexer to include the bad character in
JSON_ERROR token".
* The transition to a TERMINAL(JSON_FOO) state doesn't consume the
input character. In that case, the lexer normally loops until it is
consumed. We have to suppress that for the "end of input" input
character. If we didn't, the lexer would consume it by entering
IN_ERROR state, emitting a bogus JSON_ERROR token. We fixed that in
commit bd3924a33a6.
However, simply breaking the loop this way assumes that the lexer
needs exactly one state transition to reach its start state. That
assumption is correct now, but it's unclean, and I'll soon break it.
Clean up: instead of breaking the loop after one iteration, break it
after it reached the start state.
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-Id: <20180831075841.13363-3-armbru@redhat.com>
2018-08-31 09:58:37 +02:00
|
|
|
int new_state;
|
|
|
|
bool char_consumed = false;
|
2010-05-24 09:39:52 +02:00
|
|
|
|
2009-11-11 10:39:14 -06:00
|
|
|
lexer->x++;
|
|
|
|
if (ch == '\n') {
|
|
|
|
lexer->x = 0;
|
|
|
|
lexer->y++;
|
|
|
|
}
|
|
|
|
|
json: Clean up how lexer consumes "end of input"
When the lexer isn't in its start state at the end of input, it's
working on a token. To flush it out, it needs to transit to its start
state on "end of input" lookahead.
There are two ways to the start state, depending on the current state:
* If the lexer is in a TERMINAL(JSON_FOO) state, it can emit a
JSON_FOO token.
* Else, it can go to IN_ERROR state, and emit a JSON_ERROR token.
There are complications, however:
* The transition to IN_ERROR state consumes the input character and
adds it to the JSON_ERROR token. The latter is inappropriate for
the "end of input" character, so we suppress that. See also recent
commit a2ec6be72b8 "json: Fix lexer to include the bad character in
JSON_ERROR token".
* The transition to a TERMINAL(JSON_FOO) state doesn't consume the
input character. In that case, the lexer normally loops until it is
consumed. We have to suppress that for the "end of input" input
character. If we didn't, the lexer would consume it by entering
IN_ERROR state, emitting a bogus JSON_ERROR token. We fixed that in
commit bd3924a33a6.
However, simply breaking the loop this way assumes that the lexer
needs exactly one state transition to reach its start state. That
assumption is correct now, but it's unclean, and I'll soon break it.
Clean up: instead of breaking the loop after one iteration, break it
after it reached the start state.
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-Id: <20180831075841.13363-3-armbru@redhat.com>
2018-08-31 09:58:37 +02:00
|
|
|
while (flush ? lexer->state != lexer->start_state : !char_consumed) {
|
json: Make lexer's "character consumed" logic less confusing
The lexer uses macro TERMINAL_NEEDED_LOOKAHEAD() to decide whether a
state transition consumes the input character. It returns true when
the state transition is defined with the TERMINAL() macro. To detect
that, it checks whether input '\0' would have resulted in the same
state transition, and the new state is not IN_ERROR.
Why does that even work? For all states, the new state on input '\0'
is either IN_ERROR or defined with TERMINAL(). If the state
transition equals the one we'd get for input '\0', it goes to IN_ERROR
or to the argument of TERMINAL(). We never use TERMINAL(IN_ERROR),
because it makes no sense. Thus, if it doesn't go to IN_ERROR, it
must be defined with TERMINAL().
Since this isn't quite confusing enough, we negate the result to get
@char_consumed, and ignore it when @flush is true.
Instead of deriving the lookahead bit from the state transition, make
it explicit. This is easier to understand, and a bit more flexible,
too.
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-Id: <20180831075841.13363-4-armbru@redhat.com>
2018-08-31 09:58:38 +02:00
|
|
|
new_state = next_state(lexer, ch, flush, &char_consumed);
|
json: Clean up how lexer consumes "end of input"
When the lexer isn't in its start state at the end of input, it's
working on a token. To flush it out, it needs to transit to its start
state on "end of input" lookahead.
There are two ways to the start state, depending on the current state:
* If the lexer is in a TERMINAL(JSON_FOO) state, it can emit a
JSON_FOO token.
* Else, it can go to IN_ERROR state, and emit a JSON_ERROR token.
There are complications, however:
* The transition to IN_ERROR state consumes the input character and
adds it to the JSON_ERROR token. The latter is inappropriate for
the "end of input" character, so we suppress that. See also recent
commit a2ec6be72b8 "json: Fix lexer to include the bad character in
JSON_ERROR token".
* The transition to a TERMINAL(JSON_FOO) state doesn't consume the
input character. In that case, the lexer normally loops until it is
consumed. We have to suppress that for the "end of input" input
character. If we didn't, the lexer would consume it by entering
IN_ERROR state, emitting a bogus JSON_ERROR token. We fixed that in
commit bd3924a33a6.
However, simply breaking the loop this way assumes that the lexer
needs exactly one state transition to reach its start state. That
assumption is correct now, but it's unclean, and I'll soon break it.
Clean up: instead of breaking the loop after one iteration, break it
after it reached the start state.
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-Id: <20180831075841.13363-3-armbru@redhat.com>
2018-08-31 09:58:37 +02:00
|
|
|
if (char_consumed) {
|
json: Make lexer's "character consumed" logic less confusing
The lexer uses macro TERMINAL_NEEDED_LOOKAHEAD() to decide whether a
state transition consumes the input character. It returns true when
the state transition is defined with the TERMINAL() macro. To detect
that, it checks whether input '\0' would have resulted in the same
state transition, and the new state is not IN_ERROR.
Why does that even work? For all states, the new state on input '\0'
is either IN_ERROR or defined with TERMINAL(). If the state
transition equals the one we'd get for input '\0', it goes to IN_ERROR
or to the argument of TERMINAL(). We never use TERMINAL(IN_ERROR),
because it makes no sense. Thus, if it doesn't go to IN_ERROR, it
must be defined with TERMINAL().
Since this isn't quite confusing enough, we negate the result to get
@char_consumed, and ignore it when @flush is true.
Instead of deriving the lookahead bit from the state transition, make
it explicit. This is easier to understand, and a bit more flexible,
too.
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-Id: <20180831075841.13363-4-armbru@redhat.com>
2018-08-31 09:58:38 +02:00
|
|
|
assert(!flush);
|
2015-11-25 22:23:29 +01:00
|
|
|
g_string_append_c(lexer->token, ch);
|
2010-05-24 09:39:52 +02:00
|
|
|
}
|
2009-11-11 10:39:14 -06:00
|
|
|
|
2010-05-24 09:39:52 +02:00
|
|
|
switch (new_state) {
|
2015-11-25 22:23:26 +01:00
|
|
|
case JSON_LCURLY:
|
|
|
|
case JSON_RCURLY:
|
|
|
|
case JSON_LSQUARE:
|
|
|
|
case JSON_RSQUARE:
|
|
|
|
case JSON_COLON:
|
|
|
|
case JSON_COMMA:
|
2018-08-23 18:40:04 +02:00
|
|
|
case JSON_INTERP:
|
2010-05-24 09:39:52 +02:00
|
|
|
case JSON_INTEGER:
|
|
|
|
case JSON_FLOAT:
|
|
|
|
case JSON_KEYWORD:
|
|
|
|
case JSON_STRING:
|
2018-08-23 18:40:00 +02:00
|
|
|
json_message_process_token(lexer, lexer->token, new_state,
|
|
|
|
lexer->x, lexer->y);
|
2012-01-09 18:29:51 +01:00
|
|
|
/* fall through */
|
json: Nicer recovery from lexical errors
When the lexer chokes on an input character, it consumes the
character, emits a JSON error token, and enters its start state. This
can lead to suboptimal error recovery. For instance, input
0123 ,
produces the tokens
JSON_ERROR 01
JSON_INTEGER 23
JSON_COMMA ,
Make the lexer skip characters after a lexical error until a
structural character ('[', ']', '{', '}', ':', ','), an ASCII control
character, or '\xFE', or '\xFF'.
Note that we must not skip ASCII control characters, '\xFE', '\xFF',
because those are documented to force the JSON parser into known-good
state, by docs/interop/qmp-spec.txt.
The lexer now produces
JSON_ERROR 01
JSON_COMMA ,
Update qmp-test for the nicer error recovery: QMP now reports just one
error for input %p instead of two. Also drop the newline after %p; it
was needed to tease out the second error.
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-Id: <20180831075841.13363-5-armbru@redhat.com>
[Conflict with commit ebb4d82d888 resolved]
2018-08-31 09:58:39 +02:00
|
|
|
case IN_START:
|
2018-08-31 09:58:41 +02:00
|
|
|
g_string_truncate(lexer->token, 0);
|
2018-08-23 18:40:05 +02:00
|
|
|
new_state = lexer->start_state;
|
2010-05-24 09:39:52 +02:00
|
|
|
break;
|
2018-08-31 09:58:40 +02:00
|
|
|
case JSON_ERROR:
|
2018-08-23 18:40:00 +02:00
|
|
|
json_message_process_token(lexer, lexer->token, JSON_ERROR,
|
|
|
|
lexer->x, lexer->y);
|
json: Nicer recovery from lexical errors
When the lexer chokes on an input character, it consumes the
character, emits a JSON error token, and enters its start state. This
can lead to suboptimal error recovery. For instance, input
0123 ,
produces the tokens
JSON_ERROR 01
JSON_INTEGER 23
JSON_COMMA ,
Make the lexer skip characters after a lexical error until a
structural character ('[', ']', '{', '}', ':', ','), an ASCII control
character, or '\xFE', or '\xFF'.
Note that we must not skip ASCII control characters, '\xFE', '\xFF',
because those are documented to force the JSON parser into known-good
state, by docs/interop/qmp-spec.txt.
The lexer now produces
JSON_ERROR 01
JSON_COMMA ,
Update qmp-test for the nicer error recovery: QMP now reports just one
error for input %p instead of two. Also drop the newline after %p; it
was needed to tease out the second error.
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-Id: <20180831075841.13363-5-armbru@redhat.com>
[Conflict with commit ebb4d82d888 resolved]
2018-08-31 09:58:39 +02:00
|
|
|
new_state = IN_RECOVERY;
|
|
|
|
/* fall through */
|
|
|
|
case IN_RECOVERY:
|
2015-11-25 22:23:29 +01:00
|
|
|
g_string_truncate(lexer->token, 0);
|
json: Nicer recovery from lexical errors
When the lexer chokes on an input character, it consumes the
character, emits a JSON error token, and enters its start state. This
can lead to suboptimal error recovery. For instance, input
0123 ,
produces the tokens
JSON_ERROR 01
JSON_INTEGER 23
JSON_COMMA ,
Make the lexer skip characters after a lexical error until a
structural character ('[', ']', '{', '}', ':', ','), an ASCII control
character, or '\xFE', or '\xFF'.
Note that we must not skip ASCII control characters, '\xFE', '\xFF',
because those are documented to force the JSON parser into known-good
state, by docs/interop/qmp-spec.txt.
The lexer now produces
JSON_ERROR 01
JSON_COMMA ,
Update qmp-test for the nicer error recovery: QMP now reports just one
error for input %p instead of two. Also drop the newline after %p; it
was needed to tease out the second error.
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-Id: <20180831075841.13363-5-armbru@redhat.com>
[Conflict with commit ebb4d82d888 resolved]
2018-08-31 09:58:39 +02:00
|
|
|
break;
|
2010-05-24 09:39:52 +02:00
|
|
|
default:
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
lexer->state = new_state;
|
json: Clean up how lexer consumes "end of input"
When the lexer isn't in its start state at the end of input, it's
working on a token. To flush it out, it needs to transit to its start
state on "end of input" lookahead.
There are two ways to the start state, depending on the current state:
* If the lexer is in a TERMINAL(JSON_FOO) state, it can emit a
JSON_FOO token.
* Else, it can go to IN_ERROR state, and emit a JSON_ERROR token.
There are complications, however:
* The transition to IN_ERROR state consumes the input character and
adds it to the JSON_ERROR token. The latter is inappropriate for
the "end of input" character, so we suppress that. See also recent
commit a2ec6be72b8 "json: Fix lexer to include the bad character in
JSON_ERROR token".
* The transition to a TERMINAL(JSON_FOO) state doesn't consume the
input character. In that case, the lexer normally loops until it is
consumed. We have to suppress that for the "end of input" input
character. If we didn't, the lexer would consume it by entering
IN_ERROR state, emitting a bogus JSON_ERROR token. We fixed that in
commit bd3924a33a6.
However, simply breaking the loop this way assumes that the lexer
needs exactly one state transition to reach its start state. That
assumption is correct now, but it's unclean, and I'll soon break it.
Clean up: instead of breaking the loop after one iteration, break it
after it reached the start state.
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-Id: <20180831075841.13363-3-armbru@redhat.com>
2018-08-31 09:58:37 +02:00
|
|
|
}
|
2011-06-01 12:14:52 -05:00
|
|
|
|
|
|
|
/* Do not let a single token grow to an arbitrarily large size,
|
|
|
|
* this is a security consideration.
|
|
|
|
*/
|
2015-11-25 22:23:29 +01:00
|
|
|
if (lexer->token->len > MAX_TOKEN_SIZE) {
|
2018-08-23 18:40:00 +02:00
|
|
|
json_message_process_token(lexer, lexer->token, lexer->state,
|
|
|
|
lexer->x, lexer->y);
|
2015-11-25 22:23:29 +01:00
|
|
|
g_string_truncate(lexer->token, 0);
|
2018-08-23 18:40:05 +02:00
|
|
|
lexer->state = lexer->start_state;
|
2011-06-01 12:14:52 -05:00
|
|
|
}
|
2009-11-11 10:39:14 -06:00
|
|
|
}
|
|
|
|
|
2018-08-23 18:39:58 +02:00
|
|
|
/*
 * Feed @size characters starting at @buffer to @lexer.
 *
 * The characters are handed to json_lexer_feed_char() one by one, in
 * order, with the flush flag clear.  A @size of zero feeds nothing and
 * never dereferences @buffer.
 */
void json_lexer_feed(JSONLexer *lexer, const char *buffer, size_t size)
{
    const char *cursor = buffer;
    size_t remaining;

    for (remaining = size; remaining > 0; remaining--) {
        json_lexer_feed_char(lexer, *cursor++, false);
    }
}
|
|
|
|
|
2018-08-23 18:39:58 +02:00
|
|
|
/*
 * Tell @lexer that the input has ended.
 *
 * When the lexer isn't in its start state at the end of input, it's
 * working on a token.  Feeding it the "end of input" character with
 * the flush flag set makes it transit back to its start state, either
 * by emitting the token it was working on or by emitting a JSON_ERROR
 * token.  Once it is back in its start state, a JSON_END_OF_INPUT
 * token is passed to json_message_process_token().
 */
void json_lexer_flush(JSONLexer *lexer)
{
    /* '\0' stands for "end of input" here; it is not added to the token */
    json_lexer_feed_char(lexer, '\0', true);

    /* Flushing must always leave the lexer in its start state */
    assert(lexer->state == lexer->start_state);

    json_message_process_token(lexer, lexer->token, JSON_END_OF_INPUT,
                               lexer->x, lexer->y);
}
|
|
|
|
|
|
|
|
/*
 * Release the resources held by @lexer.
 *
 * This frees the token buffer together with its character data.
 * @lexer itself is not freed here.
 */
void json_lexer_destroy(JSONLexer *lexer)
{
    GString *token_buf = lexer->token;

    g_string_free(token_buf, true);
}
|