1 /* 2 * JSON lexer 3 * 4 * Copyright IBM, Corp. 2009 5 * 6 * Authors: 7 * Anthony Liguori <aliguori@us.ibm.com> 8 * 9 * This work is licensed under the terms of the GNU LGPL, version 2.1 or later. 10 * See the COPYING.LIB file in the top-level directory. 11 * 12 */ 13 14 #include "qemu/osdep.h" 15 #include "json-parser-int.h" 16 17 #define MAX_TOKEN_SIZE (64ULL << 20) 18 19 /* 20 * From RFC 8259 "The JavaScript Object Notation (JSON) Data 21 * Interchange Format", with [comments in brackets]: 22 * 23 * The set of tokens includes six structural characters, strings, 24 * numbers, and three literal names. 25 * 26 * These are the six structural characters: 27 * 28 * begin-array = ws %x5B ws ; [ left square bracket 29 * begin-object = ws %x7B ws ; { left curly bracket 30 * end-array = ws %x5D ws ; ] right square bracket 31 * end-object = ws %x7D ws ; } right curly bracket 32 * name-separator = ws %x3A ws ; : colon 33 * value-separator = ws %x2C ws ; , comma 34 * 35 * Insignificant whitespace is allowed before or after any of the six 36 * structural characters. 37 * [This lexer accepts it before or after any token, which is actually 38 * the same, as the grammar always has structural characters between 39 * other tokens.] 40 * 41 * ws = *( 42 * %x20 / ; Space 43 * %x09 / ; Horizontal tab 44 * %x0A / ; Line feed or New line 45 * %x0D ) ; Carriage return 46 * 47 * [...] three literal names: 48 * false null true 49 * [This lexer accepts [a-z]+, and leaves rejecting unknown literal 50 * names to the parser.] 51 * 52 * [Numbers:] 53 * 54 * number = [ minus ] int [ frac ] [ exp ] 55 * decimal-point = %x2E ; . 56 * digit1-9 = %x31-39 ; 1-9 57 * e = %x65 / %x45 ; e E 58 * exp = e [ minus / plus ] 1*DIGIT 59 * frac = decimal-point 1*DIGIT 60 * int = zero / ( digit1-9 *DIGIT ) 61 * minus = %x2D ; - 62 * plus = %x2B ; + 63 * zero = %x30 ; 0 64 * 65 * [Strings:] 66 * string = quotation-mark *char quotation-mark 67 * 68 * char = unescaped / 69 * escape ( 70 * %x22 / ; " quotation mark U+0022 71 * %x5C / ; \ reverse solidus U+005C 72 * %x2F / ; / solidus U+002F 73 * %x62 / ; b backspace U+0008 74 * %x66 / ; f form feed U+000C 75 * %x6E / ; n line feed U+000A 76 * %x72 / ; r carriage return U+000D 77 * %x74 / ; t tab U+0009 78 * %x75 4HEXDIG ) ; uXXXX U+XXXX 79 * escape = %x5C ; \ 80 * quotation-mark = %x22 ; " 81 * unescaped = %x20-21 / %x23-5B / %x5D-10FFFF 82 * [This lexer accepts any non-control character after escape, and 83 * leaves rejecting invalid ones to the parser.] 84 * 85 * 86 * Extensions over RFC 8259: 87 * - Extra escape sequence in strings: 88 * 0x27 (apostrophe) is recognized after escape, too 89 * - Single-quoted strings: 90 * Like double-quoted strings, except they're delimited by %x27 91 * (apostrophe) instead of %x22 (quotation mark), and can't contain 92 * unescaped apostrophe, but can contain unescaped quotation mark. 93 * - Interpolation, if enabled: 94 * The lexer accepts %[A-Za-z0-9]*, and leaves rejecting invalid 95 * ones to the parser. 96 * 97 * Note: 98 * - Input must be encoded in modified UTF-8. 99 * - Decoding and validating is left to the parser. 100 */ 101 102 enum json_lexer_state { 103 IN_ERROR = 0, /* must really be 0, see json_lexer[] */ 104 IN_DQ_STRING_ESCAPE, 105 IN_DQ_STRING, 106 IN_SQ_STRING_ESCAPE, 107 IN_SQ_STRING, 108 IN_ZERO, 109 IN_EXP_DIGITS, 110 IN_EXP_SIGN, 111 IN_EXP_E, 112 IN_MANTISSA, 113 IN_MANTISSA_DIGITS, 114 IN_DIGITS, 115 IN_SIGN, 116 IN_KEYWORD, 117 IN_INTERP, 118 IN_WHITESPACE, 119 IN_START, 120 IN_START_INTERP, /* must be IN_START + 1 */ 121 }; 122 123 QEMU_BUILD_BUG_ON((int)JSON_MIN <= (int)IN_START_INTERP); 124 QEMU_BUILD_BUG_ON(IN_START_INTERP != IN_START + 1); 125 126 #define TERMINAL(state) [0 ... 0x7F] = (state) 127 128 /* Return whether TERMINAL is a terminal state and the transition to it 129 from OLD_STATE required lookahead. This happens whenever the table 130 below uses the TERMINAL macro. */ 131 #define TERMINAL_NEEDED_LOOKAHEAD(old_state, terminal) \ 132 (terminal != IN_ERROR && json_lexer[(old_state)][0] == (terminal)) 133 134 static const uint8_t json_lexer[][256] = { 135 /* Relies on default initialization to IN_ERROR! */ 136 137 /* double quote string */ 138 [IN_DQ_STRING_ESCAPE] = { 139 [0x20 ... 0xFD] = IN_DQ_STRING, 140 }, 141 [IN_DQ_STRING] = { 142 [0x20 ... 0xFD] = IN_DQ_STRING, 143 ['\\'] = IN_DQ_STRING_ESCAPE, 144 ['"'] = JSON_STRING, 145 }, 146 147 /* single quote string */ 148 [IN_SQ_STRING_ESCAPE] = { 149 [0x20 ... 0xFD] = IN_SQ_STRING, 150 }, 151 [IN_SQ_STRING] = { 152 [0x20 ... 0xFD] = IN_SQ_STRING, 153 ['\\'] = IN_SQ_STRING_ESCAPE, 154 ['\''] = JSON_STRING, 155 }, 156 157 /* Zero */ 158 [IN_ZERO] = { 159 TERMINAL(JSON_INTEGER), 160 ['0' ... '9'] = IN_ERROR, 161 ['.'] = IN_MANTISSA, 162 }, 163 164 /* Float */ 165 [IN_EXP_DIGITS] = { 166 TERMINAL(JSON_FLOAT), 167 ['0' ... '9'] = IN_EXP_DIGITS, 168 }, 169 170 [IN_EXP_SIGN] = { 171 ['0' ... '9'] = IN_EXP_DIGITS, 172 }, 173 174 [IN_EXP_E] = { 175 ['-'] = IN_EXP_SIGN, 176 ['+'] = IN_EXP_SIGN, 177 ['0' ... '9'] = IN_EXP_DIGITS, 178 }, 179 180 [IN_MANTISSA_DIGITS] = { 181 TERMINAL(JSON_FLOAT), 182 ['0' ... '9'] = IN_MANTISSA_DIGITS, 183 ['e'] = IN_EXP_E, 184 ['E'] = IN_EXP_E, 185 }, 186 187 [IN_MANTISSA] = { 188 ['0' ... '9'] = IN_MANTISSA_DIGITS, 189 }, 190 191 /* Number */ 192 [IN_DIGITS] = { 193 TERMINAL(JSON_INTEGER), 194 ['0' ... '9'] = IN_DIGITS, 195 ['e'] = IN_EXP_E, 196 ['E'] = IN_EXP_E, 197 ['.'] = IN_MANTISSA, 198 }, 199 200 [IN_SIGN] = { 201 ['0'] = IN_ZERO, 202 ['1' ... '9'] = IN_DIGITS, 203 }, 204 205 /* keywords */ 206 [IN_KEYWORD] = { 207 TERMINAL(JSON_KEYWORD), 208 ['a' ... 'z'] = IN_KEYWORD, 209 }, 210 211 /* whitespace */ 212 [IN_WHITESPACE] = { 213 TERMINAL(JSON_SKIP), 214 [' '] = IN_WHITESPACE, 215 ['\t'] = IN_WHITESPACE, 216 ['\r'] = IN_WHITESPACE, 217 ['\n'] = IN_WHITESPACE, 218 }, 219 220 /* interpolation */ 221 [IN_INTERP] = { 222 TERMINAL(JSON_INTERP), 223 ['A' ... 'Z'] = IN_INTERP, 224 ['a' ... 'z'] = IN_INTERP, 225 ['0' ... '9'] = IN_INTERP, 226 }, 227 228 /* 229 * Two start states: 230 * - IN_START recognizes JSON tokens with our string extensions 231 * - IN_START_INTERP additionally recognizes interpolation. 232 */ 233 [IN_START ... IN_START_INTERP] = { 234 ['"'] = IN_DQ_STRING, 235 ['\''] = IN_SQ_STRING, 236 ['0'] = IN_ZERO, 237 ['1' ... '9'] = IN_DIGITS, 238 ['-'] = IN_SIGN, 239 ['{'] = JSON_LCURLY, 240 ['}'] = JSON_RCURLY, 241 ['['] = JSON_LSQUARE, 242 [']'] = JSON_RSQUARE, 243 [','] = JSON_COMMA, 244 [':'] = JSON_COLON, 245 ['a' ... 'z'] = IN_KEYWORD, 246 [' '] = IN_WHITESPACE, 247 ['\t'] = IN_WHITESPACE, 248 ['\r'] = IN_WHITESPACE, 249 ['\n'] = IN_WHITESPACE, 250 }, 251 [IN_START_INTERP]['%'] = IN_INTERP, 252 }; 253 254 void json_lexer_init(JSONLexer *lexer, bool enable_interpolation) 255 { 256 lexer->start_state = lexer->state = enable_interpolation 257 ? IN_START_INTERP : IN_START; 258 lexer->token = g_string_sized_new(3); 259 lexer->x = lexer->y = 0; 260 } 261 262 static void json_lexer_feed_char(JSONLexer *lexer, char ch, bool flush) 263 { 264 int char_consumed, new_state; 265 266 lexer->x++; 267 if (ch == '\n') { 268 lexer->x = 0; 269 lexer->y++; 270 } 271 272 do { 273 assert(lexer->state <= ARRAY_SIZE(json_lexer)); 274 new_state = json_lexer[lexer->state][(uint8_t)ch]; 275 char_consumed = !TERMINAL_NEEDED_LOOKAHEAD(lexer->state, new_state); 276 if (char_consumed && !flush) { 277 g_string_append_c(lexer->token, ch); 278 } 279 280 switch (new_state) { 281 case JSON_LCURLY: 282 case JSON_RCURLY: 283 case JSON_LSQUARE: 284 case JSON_RSQUARE: 285 case JSON_COLON: 286 case JSON_COMMA: 287 case JSON_INTERP: 288 case JSON_INTEGER: 289 case JSON_FLOAT: 290 case JSON_KEYWORD: 291 case JSON_STRING: 292 json_message_process_token(lexer, lexer->token, new_state, 293 lexer->x, lexer->y); 294 /* fall through */ 295 case JSON_SKIP: 296 g_string_truncate(lexer->token, 0); 297 new_state = lexer->start_state; 298 break; 299 case IN_ERROR: 300 /* XXX: To avoid having previous bad input leaving the parser in an 301 * unresponsive state where we consume unpredictable amounts of 302 * subsequent "good" input, percolate this error state up to the 303 * parser by emitting a JSON_ERROR token, then reset lexer state. 304 * 305 * Also note that this handling is required for reliable channel 306 * negotiation between QMP and the guest agent, since chr(0xFF) 307 * is placed at the beginning of certain events to ensure proper 308 * delivery when the channel is in an unknown state. chr(0xFF) is 309 * never a valid ASCII/UTF-8 sequence, so this should reliably 310 * induce an error/flush state. 311 */ 312 json_message_process_token(lexer, lexer->token, JSON_ERROR, 313 lexer->x, lexer->y); 314 g_string_truncate(lexer->token, 0); 315 lexer->state = lexer->start_state; 316 return; 317 default: 318 break; 319 } 320 lexer->state = new_state; 321 } while (!char_consumed && !flush); 322 323 /* Do not let a single token grow to an arbitrarily large size, 324 * this is a security consideration. 325 */ 326 if (lexer->token->len > MAX_TOKEN_SIZE) { 327 json_message_process_token(lexer, lexer->token, lexer->state, 328 lexer->x, lexer->y); 329 g_string_truncate(lexer->token, 0); 330 lexer->state = lexer->start_state; 331 } 332 } 333 334 void json_lexer_feed(JSONLexer *lexer, const char *buffer, size_t size) 335 { 336 size_t i; 337 338 for (i = 0; i < size; i++) { 339 json_lexer_feed_char(lexer, buffer[i], false); 340 } 341 } 342 343 void json_lexer_flush(JSONLexer *lexer) 344 { 345 if (lexer->state != lexer->start_state) { 346 json_lexer_feed_char(lexer, 0, true); 347 } 348 json_message_process_token(lexer, lexer->token, JSON_END_OF_INPUT, 349 lexer->x, lexer->y); 350 } 351 352 void json_lexer_destroy(JSONLexer *lexer) 353 { 354 g_string_free(lexer->token, true); 355 } 356