1a372823aSPaolo Bonzini /* 2a372823aSPaolo Bonzini * JSON lexer 3a372823aSPaolo Bonzini * 4a372823aSPaolo Bonzini * Copyright IBM, Corp. 2009 5a372823aSPaolo Bonzini * 6a372823aSPaolo Bonzini * Authors: 7a372823aSPaolo Bonzini * Anthony Liguori <aliguori@us.ibm.com> 8a372823aSPaolo Bonzini * 9a372823aSPaolo Bonzini * This work is licensed under the terms of the GNU LGPL, version 2.1 or later. 10a372823aSPaolo Bonzini * See the COPYING.LIB file in the top-level directory. 11a372823aSPaolo Bonzini * 12a372823aSPaolo Bonzini */ 13a372823aSPaolo Bonzini 14f2ad72b3SPeter Maydell #include "qemu/osdep.h" 15a372823aSPaolo Bonzini #include "qemu-common.h" 16a372823aSPaolo Bonzini #include "qapi/qmp/json-lexer.h" 17a372823aSPaolo Bonzini 18a372823aSPaolo Bonzini #define MAX_TOKEN_SIZE (64ULL << 20) 19a372823aSPaolo Bonzini 20a372823aSPaolo Bonzini /* 21eddc0a7fSMarkus Armbruster * From RFC 8259 "The JavaScript Object Notation (JSON) Data 22eddc0a7fSMarkus Armbruster * Interchange Format", with [comments in brackets]: 23ff5394adSEric Blake * 24eddc0a7fSMarkus Armbruster * The set of tokens includes six structural characters, strings, 25eddc0a7fSMarkus Armbruster * numbers, and three literal names. 26ff5394adSEric Blake * 27eddc0a7fSMarkus Armbruster * These are the six structural characters: 28ff5394adSEric Blake * 29eddc0a7fSMarkus Armbruster * begin-array = ws %x5B ws ; [ left square bracket 30eddc0a7fSMarkus Armbruster * begin-object = ws %x7B ws ; { left curly bracket 31eddc0a7fSMarkus Armbruster * end-array = ws %x5D ws ; ] right square bracket 32eddc0a7fSMarkus Armbruster * end-object = ws %x7D ws ; } right curly bracket 33eddc0a7fSMarkus Armbruster * name-separator = ws %x3A ws ; : colon 34eddc0a7fSMarkus Armbruster * value-separator = ws %x2C ws ; , comma 35ff5394adSEric Blake * 36eddc0a7fSMarkus Armbruster * Insignificant whitespace is allowed before or after any of the six 37eddc0a7fSMarkus Armbruster * structural characters. 38eddc0a7fSMarkus Armbruster * [This lexer accepts it before or after any token, which is actually 39eddc0a7fSMarkus Armbruster * the same, as the grammar always has structural characters between 40eddc0a7fSMarkus Armbruster * other tokens.] 41ff5394adSEric Blake * 42eddc0a7fSMarkus Armbruster * ws = *( 43eddc0a7fSMarkus Armbruster * %x20 / ; Space 44eddc0a7fSMarkus Armbruster * %x09 / ; Horizontal tab 45eddc0a7fSMarkus Armbruster * %x0A / ; Line feed or New line 46eddc0a7fSMarkus Armbruster * %x0D ) ; Carriage return 47a372823aSPaolo Bonzini * 48eddc0a7fSMarkus Armbruster * [...] three literal names: 49eddc0a7fSMarkus Armbruster * false null true 50eddc0a7fSMarkus Armbruster * [This lexer accepts [a-z]+, and leaves rejecting unknown literal 51eddc0a7fSMarkus Armbruster * names to the parser.] 52eddc0a7fSMarkus Armbruster * 53eddc0a7fSMarkus Armbruster * [Numbers:] 54eddc0a7fSMarkus Armbruster * 55eddc0a7fSMarkus Armbruster * number = [ minus ] int [ frac ] [ exp ] 56eddc0a7fSMarkus Armbruster * decimal-point = %x2E ; . 57eddc0a7fSMarkus Armbruster * digit1-9 = %x31-39 ; 1-9 58eddc0a7fSMarkus Armbruster * e = %x65 / %x45 ; e E 59eddc0a7fSMarkus Armbruster * exp = e [ minus / plus ] 1*DIGIT 60eddc0a7fSMarkus Armbruster * frac = decimal-point 1*DIGIT 61eddc0a7fSMarkus Armbruster * int = zero / ( digit1-9 *DIGIT ) 62eddc0a7fSMarkus Armbruster * minus = %x2D ; - 63eddc0a7fSMarkus Armbruster * plus = %x2B ; + 64eddc0a7fSMarkus Armbruster * zero = %x30 ; 0 65eddc0a7fSMarkus Armbruster * 66eddc0a7fSMarkus Armbruster * [Strings:] 67eddc0a7fSMarkus Armbruster * string = quotation-mark *char quotation-mark 68eddc0a7fSMarkus Armbruster * 69eddc0a7fSMarkus Armbruster * char = unescaped / 70eddc0a7fSMarkus Armbruster * escape ( 71eddc0a7fSMarkus Armbruster * %x22 / ; " quotation mark U+0022 72eddc0a7fSMarkus Armbruster * %x5C / ; \ reverse solidus U+005C 73eddc0a7fSMarkus Armbruster * %x2F / ; / solidus U+002F 74eddc0a7fSMarkus Armbruster * %x62 / ; b backspace U+0008 75eddc0a7fSMarkus Armbruster * %x66 / ; f form feed U+000C 76eddc0a7fSMarkus Armbruster * %x6E / ; n line feed U+000A 77eddc0a7fSMarkus Armbruster * %x72 / ; r carriage return U+000D 78eddc0a7fSMarkus Armbruster * %x74 / ; t tab U+0009 79eddc0a7fSMarkus Armbruster * %x75 4HEXDIG ) ; uXXXX U+XXXX 80eddc0a7fSMarkus Armbruster * escape = %x5C ; \ 81eddc0a7fSMarkus Armbruster * quotation-mark = %x22 ; " 82eddc0a7fSMarkus Armbruster * unescaped = %x20-21 / %x23-5B / %x5D-10FFFF 83eddc0a7fSMarkus Armbruster * 84eddc0a7fSMarkus Armbruster * 85eddc0a7fSMarkus Armbruster * Extensions over RFC 8259: 86eddc0a7fSMarkus Armbruster * - Extra escape sequence in strings: 87eddc0a7fSMarkus Armbruster * 0x27 (apostrophe) is recognized after escape, too 88eddc0a7fSMarkus Armbruster * - Single-quoted strings: 89eddc0a7fSMarkus Armbruster * Like double-quoted strings, except they're delimited by %x27 90eddc0a7fSMarkus Armbruster * (apostrophe) instead of %x22 (quotation mark), and can't contain 91eddc0a7fSMarkus Armbruster * unescaped apostrophe, but can contain unescaped quotation mark. 92eddc0a7fSMarkus Armbruster * - Interpolation: 93eddc0a7fSMarkus Armbruster * interpolation = %((l|ll|I64)[du]|[ipsf]) 94eddc0a7fSMarkus Armbruster * 95eddc0a7fSMarkus Armbruster * Note: 96eddc0a7fSMarkus Armbruster * - Input must be encoded in UTF-8. 97eddc0a7fSMarkus Armbruster * - Decoding and validating is left to the parser. 98a372823aSPaolo Bonzini */ 99a372823aSPaolo Bonzini 100a372823aSPaolo Bonzini enum json_lexer_state { 101b8d3b1daSMarkus Armbruster IN_ERROR = 0, /* must really be 0, see json_lexer[] */ 102a372823aSPaolo Bonzini IN_DQ_UCODE3, 103a372823aSPaolo Bonzini IN_DQ_UCODE2, 104a372823aSPaolo Bonzini IN_DQ_UCODE1, 105a372823aSPaolo Bonzini IN_DQ_UCODE0, 106a372823aSPaolo Bonzini IN_DQ_STRING_ESCAPE, 107a372823aSPaolo Bonzini IN_DQ_STRING, 108a372823aSPaolo Bonzini IN_SQ_UCODE3, 109a372823aSPaolo Bonzini IN_SQ_UCODE2, 110a372823aSPaolo Bonzini IN_SQ_UCODE1, 111a372823aSPaolo Bonzini IN_SQ_UCODE0, 112a372823aSPaolo Bonzini IN_SQ_STRING_ESCAPE, 113a372823aSPaolo Bonzini IN_SQ_STRING, 114a372823aSPaolo Bonzini IN_ZERO, 115a372823aSPaolo Bonzini IN_DIGITS, 116a372823aSPaolo Bonzini IN_DIGIT, 117a372823aSPaolo Bonzini IN_EXP_E, 118a372823aSPaolo Bonzini IN_MANTISSA, 119a372823aSPaolo Bonzini IN_MANTISSA_DIGITS, 120a372823aSPaolo Bonzini IN_NONZERO_NUMBER, 121a372823aSPaolo Bonzini IN_NEG_NONZERO_NUMBER, 122a372823aSPaolo Bonzini IN_KEYWORD, 123a372823aSPaolo Bonzini IN_ESCAPE, 124a372823aSPaolo Bonzini IN_ESCAPE_L, 125a372823aSPaolo Bonzini IN_ESCAPE_LL, 126a372823aSPaolo Bonzini IN_ESCAPE_I, 127a372823aSPaolo Bonzini IN_ESCAPE_I6, 128a372823aSPaolo Bonzini IN_ESCAPE_I64, 129a372823aSPaolo Bonzini IN_WHITESPACE, 130a372823aSPaolo Bonzini IN_START, 131a372823aSPaolo Bonzini }; 132a372823aSPaolo Bonzini 133b8d3b1daSMarkus Armbruster QEMU_BUILD_BUG_ON((int)JSON_MIN <= (int)IN_START); 134b8d3b1daSMarkus Armbruster 135a372823aSPaolo Bonzini #define TERMINAL(state) [0 ... 0x7F] = (state) 136a372823aSPaolo Bonzini 137a372823aSPaolo Bonzini /* Return whether TERMINAL is a terminal state and the transition to it 138a372823aSPaolo Bonzini from OLD_STATE required lookahead. This happens whenever the table 139a372823aSPaolo Bonzini below uses the TERMINAL macro. */ 140a372823aSPaolo Bonzini #define TERMINAL_NEEDED_LOOKAHEAD(old_state, terminal) \ 141a2ec6be7SMarkus Armbruster (terminal != IN_ERROR && json_lexer[(old_state)][0] == (terminal)) 142a372823aSPaolo Bonzini 143a372823aSPaolo Bonzini static const uint8_t json_lexer[][256] = { 144b8d3b1daSMarkus Armbruster /* Relies on default initialization to IN_ERROR! */ 145b8d3b1daSMarkus Armbruster 146a372823aSPaolo Bonzini /* double quote string */ 147a372823aSPaolo Bonzini [IN_DQ_UCODE3] = { 148a372823aSPaolo Bonzini ['0' ... '9'] = IN_DQ_STRING, 149a372823aSPaolo Bonzini ['a' ... 'f'] = IN_DQ_STRING, 150a372823aSPaolo Bonzini ['A' ... 'F'] = IN_DQ_STRING, 151a372823aSPaolo Bonzini }, 152a372823aSPaolo Bonzini [IN_DQ_UCODE2] = { 153a372823aSPaolo Bonzini ['0' ... '9'] = IN_DQ_UCODE3, 154a372823aSPaolo Bonzini ['a' ... 'f'] = IN_DQ_UCODE3, 155a372823aSPaolo Bonzini ['A' ... 'F'] = IN_DQ_UCODE3, 156a372823aSPaolo Bonzini }, 157a372823aSPaolo Bonzini [IN_DQ_UCODE1] = { 158a372823aSPaolo Bonzini ['0' ... '9'] = IN_DQ_UCODE2, 159a372823aSPaolo Bonzini ['a' ... 'f'] = IN_DQ_UCODE2, 160a372823aSPaolo Bonzini ['A' ... 'F'] = IN_DQ_UCODE2, 161a372823aSPaolo Bonzini }, 162a372823aSPaolo Bonzini [IN_DQ_UCODE0] = { 163a372823aSPaolo Bonzini ['0' ... '9'] = IN_DQ_UCODE1, 164a372823aSPaolo Bonzini ['a' ... 'f'] = IN_DQ_UCODE1, 165a372823aSPaolo Bonzini ['A' ... 'F'] = IN_DQ_UCODE1, 166a372823aSPaolo Bonzini }, 167a372823aSPaolo Bonzini [IN_DQ_STRING_ESCAPE] = { 168a372823aSPaolo Bonzini ['b'] = IN_DQ_STRING, 169a372823aSPaolo Bonzini ['f'] = IN_DQ_STRING, 170a372823aSPaolo Bonzini ['n'] = IN_DQ_STRING, 171a372823aSPaolo Bonzini ['r'] = IN_DQ_STRING, 172a372823aSPaolo Bonzini ['t'] = IN_DQ_STRING, 173a372823aSPaolo Bonzini ['/'] = IN_DQ_STRING, 174a372823aSPaolo Bonzini ['\\'] = IN_DQ_STRING, 175a372823aSPaolo Bonzini ['\''] = IN_DQ_STRING, 176a372823aSPaolo Bonzini ['\"'] = IN_DQ_STRING, 177a372823aSPaolo Bonzini ['u'] = IN_DQ_UCODE0, 178a372823aSPaolo Bonzini }, 179a372823aSPaolo Bonzini [IN_DQ_STRING] = { 180*de930f45SMarkus Armbruster [0x20 ... 0xFD] = IN_DQ_STRING, 181a372823aSPaolo Bonzini ['\\'] = IN_DQ_STRING_ESCAPE, 182a372823aSPaolo Bonzini ['"'] = JSON_STRING, 183a372823aSPaolo Bonzini }, 184a372823aSPaolo Bonzini 185a372823aSPaolo Bonzini /* single quote string */ 186a372823aSPaolo Bonzini [IN_SQ_UCODE3] = { 187a372823aSPaolo Bonzini ['0' ... '9'] = IN_SQ_STRING, 188a372823aSPaolo Bonzini ['a' ... 'f'] = IN_SQ_STRING, 189a372823aSPaolo Bonzini ['A' ... 'F'] = IN_SQ_STRING, 190a372823aSPaolo Bonzini }, 191a372823aSPaolo Bonzini [IN_SQ_UCODE2] = { 192a372823aSPaolo Bonzini ['0' ... '9'] = IN_SQ_UCODE3, 193a372823aSPaolo Bonzini ['a' ... 'f'] = IN_SQ_UCODE3, 194a372823aSPaolo Bonzini ['A' ... 'F'] = IN_SQ_UCODE3, 195a372823aSPaolo Bonzini }, 196a372823aSPaolo Bonzini [IN_SQ_UCODE1] = { 197a372823aSPaolo Bonzini ['0' ... '9'] = IN_SQ_UCODE2, 198a372823aSPaolo Bonzini ['a' ... 'f'] = IN_SQ_UCODE2, 199a372823aSPaolo Bonzini ['A' ... 'F'] = IN_SQ_UCODE2, 200a372823aSPaolo Bonzini }, 201a372823aSPaolo Bonzini [IN_SQ_UCODE0] = { 202a372823aSPaolo Bonzini ['0' ... '9'] = IN_SQ_UCODE1, 203a372823aSPaolo Bonzini ['a' ... 'f'] = IN_SQ_UCODE1, 204a372823aSPaolo Bonzini ['A' ... 'F'] = IN_SQ_UCODE1, 205a372823aSPaolo Bonzini }, 206a372823aSPaolo Bonzini [IN_SQ_STRING_ESCAPE] = { 207a372823aSPaolo Bonzini ['b'] = IN_SQ_STRING, 208a372823aSPaolo Bonzini ['f'] = IN_SQ_STRING, 209a372823aSPaolo Bonzini ['n'] = IN_SQ_STRING, 210a372823aSPaolo Bonzini ['r'] = IN_SQ_STRING, 211a372823aSPaolo Bonzini ['t'] = IN_SQ_STRING, 212d5932334SPaolo Bonzini ['/'] = IN_SQ_STRING, 213d5932334SPaolo Bonzini ['\\'] = IN_SQ_STRING, 214a372823aSPaolo Bonzini ['\''] = IN_SQ_STRING, 215a372823aSPaolo Bonzini ['\"'] = IN_SQ_STRING, 216a372823aSPaolo Bonzini ['u'] = IN_SQ_UCODE0, 217a372823aSPaolo Bonzini }, 218a372823aSPaolo Bonzini [IN_SQ_STRING] = { 219*de930f45SMarkus Armbruster [0x20 ... 0xFD] = IN_SQ_STRING, 220a372823aSPaolo Bonzini ['\\'] = IN_SQ_STRING_ESCAPE, 221a372823aSPaolo Bonzini ['\''] = JSON_STRING, 222a372823aSPaolo Bonzini }, 223a372823aSPaolo Bonzini 224a372823aSPaolo Bonzini /* Zero */ 225a372823aSPaolo Bonzini [IN_ZERO] = { 226a372823aSPaolo Bonzini TERMINAL(JSON_INTEGER), 227a372823aSPaolo Bonzini ['0' ... '9'] = IN_ERROR, 228a372823aSPaolo Bonzini ['.'] = IN_MANTISSA, 229a372823aSPaolo Bonzini }, 230a372823aSPaolo Bonzini 231a372823aSPaolo Bonzini /* Float */ 232a372823aSPaolo Bonzini [IN_DIGITS] = { 233a372823aSPaolo Bonzini TERMINAL(JSON_FLOAT), 234a372823aSPaolo Bonzini ['0' ... '9'] = IN_DIGITS, 235a372823aSPaolo Bonzini }, 236a372823aSPaolo Bonzini 237a372823aSPaolo Bonzini [IN_DIGIT] = { 238a372823aSPaolo Bonzini ['0' ... '9'] = IN_DIGITS, 239a372823aSPaolo Bonzini }, 240a372823aSPaolo Bonzini 241a372823aSPaolo Bonzini [IN_EXP_E] = { 242a372823aSPaolo Bonzini ['-'] = IN_DIGIT, 243a372823aSPaolo Bonzini ['+'] = IN_DIGIT, 244a372823aSPaolo Bonzini ['0' ... '9'] = IN_DIGITS, 245a372823aSPaolo Bonzini }, 246a372823aSPaolo Bonzini 247a372823aSPaolo Bonzini [IN_MANTISSA_DIGITS] = { 248a372823aSPaolo Bonzini TERMINAL(JSON_FLOAT), 249a372823aSPaolo Bonzini ['0' ... '9'] = IN_MANTISSA_DIGITS, 250a372823aSPaolo Bonzini ['e'] = IN_EXP_E, 251a372823aSPaolo Bonzini ['E'] = IN_EXP_E, 252a372823aSPaolo Bonzini }, 253a372823aSPaolo Bonzini 254a372823aSPaolo Bonzini [IN_MANTISSA] = { 255a372823aSPaolo Bonzini ['0' ... '9'] = IN_MANTISSA_DIGITS, 256a372823aSPaolo Bonzini }, 257a372823aSPaolo Bonzini 258a372823aSPaolo Bonzini /* Number */ 259a372823aSPaolo Bonzini [IN_NONZERO_NUMBER] = { 260a372823aSPaolo Bonzini TERMINAL(JSON_INTEGER), 261a372823aSPaolo Bonzini ['0' ... '9'] = IN_NONZERO_NUMBER, 262a372823aSPaolo Bonzini ['e'] = IN_EXP_E, 263a372823aSPaolo Bonzini ['E'] = IN_EXP_E, 264a372823aSPaolo Bonzini ['.'] = IN_MANTISSA, 265a372823aSPaolo Bonzini }, 266a372823aSPaolo Bonzini 267a372823aSPaolo Bonzini [IN_NEG_NONZERO_NUMBER] = { 268a372823aSPaolo Bonzini ['0'] = IN_ZERO, 269a372823aSPaolo Bonzini ['1' ... '9'] = IN_NONZERO_NUMBER, 270a372823aSPaolo Bonzini }, 271a372823aSPaolo Bonzini 272a372823aSPaolo Bonzini /* keywords */ 273a372823aSPaolo Bonzini [IN_KEYWORD] = { 274a372823aSPaolo Bonzini TERMINAL(JSON_KEYWORD), 275a372823aSPaolo Bonzini ['a' ... 'z'] = IN_KEYWORD, 276a372823aSPaolo Bonzini }, 277a372823aSPaolo Bonzini 278a372823aSPaolo Bonzini /* whitespace */ 279a372823aSPaolo Bonzini [IN_WHITESPACE] = { 280a372823aSPaolo Bonzini TERMINAL(JSON_SKIP), 281a372823aSPaolo Bonzini [' '] = IN_WHITESPACE, 282a372823aSPaolo Bonzini ['\t'] = IN_WHITESPACE, 283a372823aSPaolo Bonzini ['\r'] = IN_WHITESPACE, 284a372823aSPaolo Bonzini ['\n'] = IN_WHITESPACE, 285a372823aSPaolo Bonzini }, 286a372823aSPaolo Bonzini 287a372823aSPaolo Bonzini /* escape */ 288a372823aSPaolo Bonzini [IN_ESCAPE_LL] = { 289a372823aSPaolo Bonzini ['d'] = JSON_ESCAPE, 2902bc7cfeaSMarc-André Lureau ['u'] = JSON_ESCAPE, 291a372823aSPaolo Bonzini }, 292a372823aSPaolo Bonzini 293a372823aSPaolo Bonzini [IN_ESCAPE_L] = { 294a372823aSPaolo Bonzini ['d'] = JSON_ESCAPE, 295a372823aSPaolo Bonzini ['l'] = IN_ESCAPE_LL, 2962bc7cfeaSMarc-André Lureau ['u'] = JSON_ESCAPE, 297a372823aSPaolo Bonzini }, 298a372823aSPaolo Bonzini 299a372823aSPaolo Bonzini [IN_ESCAPE_I64] = { 300a372823aSPaolo Bonzini ['d'] = JSON_ESCAPE, 3012bc7cfeaSMarc-André Lureau ['u'] = JSON_ESCAPE, 302a372823aSPaolo Bonzini }, 303a372823aSPaolo Bonzini 304a372823aSPaolo Bonzini [IN_ESCAPE_I6] = { 305a372823aSPaolo Bonzini ['4'] = IN_ESCAPE_I64, 306a372823aSPaolo Bonzini }, 307a372823aSPaolo Bonzini 308a372823aSPaolo Bonzini [IN_ESCAPE_I] = { 309a372823aSPaolo Bonzini ['6'] = IN_ESCAPE_I6, 310a372823aSPaolo Bonzini }, 311a372823aSPaolo Bonzini 312a372823aSPaolo Bonzini [IN_ESCAPE] = { 313a372823aSPaolo Bonzini ['d'] = JSON_ESCAPE, 314a372823aSPaolo Bonzini ['i'] = JSON_ESCAPE, 315a372823aSPaolo Bonzini ['p'] = JSON_ESCAPE, 316a372823aSPaolo Bonzini ['s'] = JSON_ESCAPE, 3172bc7cfeaSMarc-André Lureau ['u'] = JSON_ESCAPE, 318a372823aSPaolo Bonzini ['f'] = JSON_ESCAPE, 319a372823aSPaolo Bonzini ['l'] = IN_ESCAPE_L, 320a372823aSPaolo Bonzini ['I'] = IN_ESCAPE_I, 321a372823aSPaolo Bonzini }, 322a372823aSPaolo Bonzini 323a372823aSPaolo Bonzini /* top level rule */ 324a372823aSPaolo Bonzini [IN_START] = { 325a372823aSPaolo Bonzini ['"'] = IN_DQ_STRING, 326a372823aSPaolo Bonzini ['\''] = IN_SQ_STRING, 327a372823aSPaolo Bonzini ['0'] = IN_ZERO, 328a372823aSPaolo Bonzini ['1' ... '9'] = IN_NONZERO_NUMBER, 329a372823aSPaolo Bonzini ['-'] = IN_NEG_NONZERO_NUMBER, 330c5461660SMarkus Armbruster ['{'] = JSON_LCURLY, 331c5461660SMarkus Armbruster ['}'] = JSON_RCURLY, 332c5461660SMarkus Armbruster ['['] = JSON_LSQUARE, 333c5461660SMarkus Armbruster [']'] = JSON_RSQUARE, 334c5461660SMarkus Armbruster [','] = JSON_COMMA, 335c5461660SMarkus Armbruster [':'] = JSON_COLON, 336a372823aSPaolo Bonzini ['a' ... 'z'] = IN_KEYWORD, 337a372823aSPaolo Bonzini ['%'] = IN_ESCAPE, 338a372823aSPaolo Bonzini [' '] = IN_WHITESPACE, 339a372823aSPaolo Bonzini ['\t'] = IN_WHITESPACE, 340a372823aSPaolo Bonzini ['\r'] = IN_WHITESPACE, 341a372823aSPaolo Bonzini ['\n'] = IN_WHITESPACE, 342a372823aSPaolo Bonzini }, 343a372823aSPaolo Bonzini }; 344a372823aSPaolo Bonzini 345a372823aSPaolo Bonzini void json_lexer_init(JSONLexer *lexer, JSONLexerEmitter func) 346a372823aSPaolo Bonzini { 347a372823aSPaolo Bonzini lexer->emit = func; 348a372823aSPaolo Bonzini lexer->state = IN_START; 349d2ca7c0bSPaolo Bonzini lexer->token = g_string_sized_new(3); 350a372823aSPaolo Bonzini lexer->x = lexer->y = 0; 351a372823aSPaolo Bonzini } 352a372823aSPaolo Bonzini 353a372823aSPaolo Bonzini static int json_lexer_feed_char(JSONLexer *lexer, char ch, bool flush) 354a372823aSPaolo Bonzini { 355a372823aSPaolo Bonzini int char_consumed, new_state; 356a372823aSPaolo Bonzini 357a372823aSPaolo Bonzini lexer->x++; 358a372823aSPaolo Bonzini if (ch == '\n') { 359a372823aSPaolo Bonzini lexer->x = 0; 360a372823aSPaolo Bonzini lexer->y++; 361a372823aSPaolo Bonzini } 362a372823aSPaolo Bonzini 363a372823aSPaolo Bonzini do { 364b8d3b1daSMarkus Armbruster assert(lexer->state <= ARRAY_SIZE(json_lexer)); 365a372823aSPaolo Bonzini new_state = json_lexer[lexer->state][(uint8_t)ch]; 366a372823aSPaolo Bonzini char_consumed = !TERMINAL_NEEDED_LOOKAHEAD(lexer->state, new_state); 367a2ec6be7SMarkus Armbruster if (char_consumed && !flush) { 368d2ca7c0bSPaolo Bonzini g_string_append_c(lexer->token, ch); 369a372823aSPaolo Bonzini } 370a372823aSPaolo Bonzini 371a372823aSPaolo Bonzini switch (new_state) { 372c5461660SMarkus Armbruster case JSON_LCURLY: 373c5461660SMarkus Armbruster case JSON_RCURLY: 374c5461660SMarkus Armbruster case JSON_LSQUARE: 375c5461660SMarkus Armbruster case JSON_RSQUARE: 376c5461660SMarkus Armbruster case JSON_COLON: 377c5461660SMarkus Armbruster case JSON_COMMA: 378a372823aSPaolo Bonzini case JSON_ESCAPE: 379a372823aSPaolo Bonzini case JSON_INTEGER: 380a372823aSPaolo Bonzini case JSON_FLOAT: 381a372823aSPaolo Bonzini case JSON_KEYWORD: 382a372823aSPaolo Bonzini case JSON_STRING: 383a372823aSPaolo Bonzini lexer->emit(lexer, lexer->token, new_state, lexer->x, lexer->y); 384a372823aSPaolo Bonzini /* fall through */ 385a372823aSPaolo Bonzini case JSON_SKIP: 386d2ca7c0bSPaolo Bonzini g_string_truncate(lexer->token, 0); 387a372823aSPaolo Bonzini new_state = IN_START; 388a372823aSPaolo Bonzini break; 389a372823aSPaolo Bonzini case IN_ERROR: 390a372823aSPaolo Bonzini /* XXX: To avoid having previous bad input leaving the parser in an 391a372823aSPaolo Bonzini * unresponsive state where we consume unpredictable amounts of 392a372823aSPaolo Bonzini * subsequent "good" input, percolate this error state up to the 393a372823aSPaolo Bonzini * tokenizer/parser by forcing a NULL object to be emitted, then 394a372823aSPaolo Bonzini * reset state. 395a372823aSPaolo Bonzini * 396a372823aSPaolo Bonzini * Also note that this handling is required for reliable channel 397a372823aSPaolo Bonzini * negotiation between QMP and the guest agent, since chr(0xFF) 398a372823aSPaolo Bonzini * is placed at the beginning of certain events to ensure proper 399a372823aSPaolo Bonzini * delivery when the channel is in an unknown state. chr(0xFF) is 400a372823aSPaolo Bonzini * never a valid ASCII/UTF-8 sequence, so this should reliably 401a372823aSPaolo Bonzini * induce an error/flush state. 402a372823aSPaolo Bonzini */ 403a372823aSPaolo Bonzini lexer->emit(lexer, lexer->token, JSON_ERROR, lexer->x, lexer->y); 404d2ca7c0bSPaolo Bonzini g_string_truncate(lexer->token, 0); 405a372823aSPaolo Bonzini new_state = IN_START; 406a372823aSPaolo Bonzini lexer->state = new_state; 407a372823aSPaolo Bonzini return 0; 408a372823aSPaolo Bonzini default: 409a372823aSPaolo Bonzini break; 410a372823aSPaolo Bonzini } 411a372823aSPaolo Bonzini lexer->state = new_state; 412a372823aSPaolo Bonzini } while (!char_consumed && !flush); 413a372823aSPaolo Bonzini 414a372823aSPaolo Bonzini /* Do not let a single token grow to an arbitrarily large size, 415a372823aSPaolo Bonzini * this is a security consideration. 416a372823aSPaolo Bonzini */ 417d2ca7c0bSPaolo Bonzini if (lexer->token->len > MAX_TOKEN_SIZE) { 418a372823aSPaolo Bonzini lexer->emit(lexer, lexer->token, lexer->state, lexer->x, lexer->y); 419d2ca7c0bSPaolo Bonzini g_string_truncate(lexer->token, 0); 420a372823aSPaolo Bonzini lexer->state = IN_START; 421a372823aSPaolo Bonzini } 422a372823aSPaolo Bonzini 423a372823aSPaolo Bonzini return 0; 424a372823aSPaolo Bonzini } 425a372823aSPaolo Bonzini 426a372823aSPaolo Bonzini int json_lexer_feed(JSONLexer *lexer, const char *buffer, size_t size) 427a372823aSPaolo Bonzini { 428a372823aSPaolo Bonzini size_t i; 429a372823aSPaolo Bonzini 430a372823aSPaolo Bonzini for (i = 0; i < size; i++) { 431a372823aSPaolo Bonzini int err; 432a372823aSPaolo Bonzini 433a372823aSPaolo Bonzini err = json_lexer_feed_char(lexer, buffer[i], false); 434a372823aSPaolo Bonzini if (err < 0) { 435a372823aSPaolo Bonzini return err; 436a372823aSPaolo Bonzini } 437a372823aSPaolo Bonzini } 438a372823aSPaolo Bonzini 439a372823aSPaolo Bonzini return 0; 440a372823aSPaolo Bonzini } 441a372823aSPaolo Bonzini 442a372823aSPaolo Bonzini int json_lexer_flush(JSONLexer *lexer) 443a372823aSPaolo Bonzini { 444a372823aSPaolo Bonzini return lexer->state == IN_START ? 0 : json_lexer_feed_char(lexer, 0, true); 445a372823aSPaolo Bonzini } 446a372823aSPaolo Bonzini 447a372823aSPaolo Bonzini void json_lexer_destroy(JSONLexer *lexer) 448a372823aSPaolo Bonzini { 449d2ca7c0bSPaolo Bonzini g_string_free(lexer->token, true); 450a372823aSPaolo Bonzini } 451