1 /* 2 * JSON lexer 3 * 4 * Copyright IBM, Corp. 2009 5 * 6 * Authors: 7 * Anthony Liguori <aliguori@us.ibm.com> 8 * 9 * This work is licensed under the terms of the GNU LGPL, version 2.1 or later. 10 * See the COPYING.LIB file in the top-level directory. 11 * 12 */ 13 14 #include "qemu/osdep.h" 15 #include "qemu-common.h" 16 #include "qapi/qmp/json-lexer.h" 17 18 #define MAX_TOKEN_SIZE (64ULL << 20) 19 20 /* 21 * Required by JSON (RFC 7159): 22 * 23 * \"([^\\\"]|\\[\"'\\/bfnrt]|\\u[0-9a-fA-F]{4})*\" 24 * -?(0|[1-9][0-9]*)(.[0-9]+)?([eE][-+]?[0-9]+)? 25 * [{}\[\],:] 26 * [a-z]+ # covers null, true, false 27 * 28 * Extension of '' strings: 29 * 30 * '([^\\']|\\[\"'\\/bfnrt]|\\u[0-9a-fA-F]{4})*' 31 * 32 * Extension for vararg handling in JSON construction: 33 * 34 * %((l|ll|I64)?d|[ipsf]) 35 * 36 */ 37 38 enum json_lexer_state { 39 IN_ERROR = 0, /* must really be 0, see json_lexer[] */ 40 IN_DQ_UCODE3, 41 IN_DQ_UCODE2, 42 IN_DQ_UCODE1, 43 IN_DQ_UCODE0, 44 IN_DQ_STRING_ESCAPE, 45 IN_DQ_STRING, 46 IN_SQ_UCODE3, 47 IN_SQ_UCODE2, 48 IN_SQ_UCODE1, 49 IN_SQ_UCODE0, 50 IN_SQ_STRING_ESCAPE, 51 IN_SQ_STRING, 52 IN_ZERO, 53 IN_DIGITS, 54 IN_DIGIT, 55 IN_EXP_E, 56 IN_MANTISSA, 57 IN_MANTISSA_DIGITS, 58 IN_NONZERO_NUMBER, 59 IN_NEG_NONZERO_NUMBER, 60 IN_KEYWORD, 61 IN_ESCAPE, 62 IN_ESCAPE_L, 63 IN_ESCAPE_LL, 64 IN_ESCAPE_I, 65 IN_ESCAPE_I6, 66 IN_ESCAPE_I64, 67 IN_WHITESPACE, 68 IN_START, 69 }; 70 71 QEMU_BUILD_BUG_ON((int)JSON_MIN <= (int)IN_START); 72 73 #define TERMINAL(state) [0 ... 0x7F] = (state) 74 75 /* Return whether TERMINAL is a terminal state and the transition to it 76 from OLD_STATE required lookahead. This happens whenever the table 77 below uses the TERMINAL macro. */ 78 #define TERMINAL_NEEDED_LOOKAHEAD(old_state, terminal) \ 79 (json_lexer[(old_state)][0] == (terminal)) 80 81 static const uint8_t json_lexer[][256] = { 82 /* Relies on default initialization to IN_ERROR! */ 83 84 /* double quote string */ 85 [IN_DQ_UCODE3] = { 86 ['0' ... '9'] = IN_DQ_STRING, 87 ['a' ... 'f'] = IN_DQ_STRING, 88 ['A' ... 'F'] = IN_DQ_STRING, 89 }, 90 [IN_DQ_UCODE2] = { 91 ['0' ... '9'] = IN_DQ_UCODE3, 92 ['a' ... 'f'] = IN_DQ_UCODE3, 93 ['A' ... 'F'] = IN_DQ_UCODE3, 94 }, 95 [IN_DQ_UCODE1] = { 96 ['0' ... '9'] = IN_DQ_UCODE2, 97 ['a' ... 'f'] = IN_DQ_UCODE2, 98 ['A' ... 'F'] = IN_DQ_UCODE2, 99 }, 100 [IN_DQ_UCODE0] = { 101 ['0' ... '9'] = IN_DQ_UCODE1, 102 ['a' ... 'f'] = IN_DQ_UCODE1, 103 ['A' ... 'F'] = IN_DQ_UCODE1, 104 }, 105 [IN_DQ_STRING_ESCAPE] = { 106 ['b'] = IN_DQ_STRING, 107 ['f'] = IN_DQ_STRING, 108 ['n'] = IN_DQ_STRING, 109 ['r'] = IN_DQ_STRING, 110 ['t'] = IN_DQ_STRING, 111 ['/'] = IN_DQ_STRING, 112 ['\\'] = IN_DQ_STRING, 113 ['\''] = IN_DQ_STRING, 114 ['\"'] = IN_DQ_STRING, 115 ['u'] = IN_DQ_UCODE0, 116 }, 117 [IN_DQ_STRING] = { 118 [1 ... 0xBF] = IN_DQ_STRING, 119 [0xC2 ... 0xF4] = IN_DQ_STRING, 120 ['\\'] = IN_DQ_STRING_ESCAPE, 121 ['"'] = JSON_STRING, 122 }, 123 124 /* single quote string */ 125 [IN_SQ_UCODE3] = { 126 ['0' ... '9'] = IN_SQ_STRING, 127 ['a' ... 'f'] = IN_SQ_STRING, 128 ['A' ... 'F'] = IN_SQ_STRING, 129 }, 130 [IN_SQ_UCODE2] = { 131 ['0' ... '9'] = IN_SQ_UCODE3, 132 ['a' ... 'f'] = IN_SQ_UCODE3, 133 ['A' ... 'F'] = IN_SQ_UCODE3, 134 }, 135 [IN_SQ_UCODE1] = { 136 ['0' ... '9'] = IN_SQ_UCODE2, 137 ['a' ... 'f'] = IN_SQ_UCODE2, 138 ['A' ... 'F'] = IN_SQ_UCODE2, 139 }, 140 [IN_SQ_UCODE0] = { 141 ['0' ... '9'] = IN_SQ_UCODE1, 142 ['a' ... 'f'] = IN_SQ_UCODE1, 143 ['A' ... 'F'] = IN_SQ_UCODE1, 144 }, 145 [IN_SQ_STRING_ESCAPE] = { 146 ['b'] = IN_SQ_STRING, 147 ['f'] = IN_SQ_STRING, 148 ['n'] = IN_SQ_STRING, 149 ['r'] = IN_SQ_STRING, 150 ['t'] = IN_SQ_STRING, 151 ['/'] = IN_SQ_STRING, 152 ['\\'] = IN_SQ_STRING, 153 ['\''] = IN_SQ_STRING, 154 ['\"'] = IN_SQ_STRING, 155 ['u'] = IN_SQ_UCODE0, 156 }, 157 [IN_SQ_STRING] = { 158 [1 ... 0xBF] = IN_SQ_STRING, 159 [0xC2 ... 0xF4] = IN_SQ_STRING, 160 ['\\'] = IN_SQ_STRING_ESCAPE, 161 ['\''] = JSON_STRING, 162 }, 163 164 /* Zero */ 165 [IN_ZERO] = { 166 TERMINAL(JSON_INTEGER), 167 ['0' ... '9'] = IN_ERROR, 168 ['.'] = IN_MANTISSA, 169 }, 170 171 /* Float */ 172 [IN_DIGITS] = { 173 TERMINAL(JSON_FLOAT), 174 ['0' ... '9'] = IN_DIGITS, 175 }, 176 177 [IN_DIGIT] = { 178 ['0' ... '9'] = IN_DIGITS, 179 }, 180 181 [IN_EXP_E] = { 182 ['-'] = IN_DIGIT, 183 ['+'] = IN_DIGIT, 184 ['0' ... '9'] = IN_DIGITS, 185 }, 186 187 [IN_MANTISSA_DIGITS] = { 188 TERMINAL(JSON_FLOAT), 189 ['0' ... '9'] = IN_MANTISSA_DIGITS, 190 ['e'] = IN_EXP_E, 191 ['E'] = IN_EXP_E, 192 }, 193 194 [IN_MANTISSA] = { 195 ['0' ... '9'] = IN_MANTISSA_DIGITS, 196 }, 197 198 /* Number */ 199 [IN_NONZERO_NUMBER] = { 200 TERMINAL(JSON_INTEGER), 201 ['0' ... '9'] = IN_NONZERO_NUMBER, 202 ['e'] = IN_EXP_E, 203 ['E'] = IN_EXP_E, 204 ['.'] = IN_MANTISSA, 205 }, 206 207 [IN_NEG_NONZERO_NUMBER] = { 208 ['0'] = IN_ZERO, 209 ['1' ... '9'] = IN_NONZERO_NUMBER, 210 }, 211 212 /* keywords */ 213 [IN_KEYWORD] = { 214 TERMINAL(JSON_KEYWORD), 215 ['a' ... 'z'] = IN_KEYWORD, 216 }, 217 218 /* whitespace */ 219 [IN_WHITESPACE] = { 220 TERMINAL(JSON_SKIP), 221 [' '] = IN_WHITESPACE, 222 ['\t'] = IN_WHITESPACE, 223 ['\r'] = IN_WHITESPACE, 224 ['\n'] = IN_WHITESPACE, 225 }, 226 227 /* escape */ 228 [IN_ESCAPE_LL] = { 229 ['d'] = JSON_ESCAPE, 230 ['u'] = JSON_ESCAPE, 231 }, 232 233 [IN_ESCAPE_L] = { 234 ['d'] = JSON_ESCAPE, 235 ['l'] = IN_ESCAPE_LL, 236 ['u'] = JSON_ESCAPE, 237 }, 238 239 [IN_ESCAPE_I64] = { 240 ['d'] = JSON_ESCAPE, 241 ['u'] = JSON_ESCAPE, 242 }, 243 244 [IN_ESCAPE_I6] = { 245 ['4'] = IN_ESCAPE_I64, 246 }, 247 248 [IN_ESCAPE_I] = { 249 ['6'] = IN_ESCAPE_I6, 250 }, 251 252 [IN_ESCAPE] = { 253 ['d'] = JSON_ESCAPE, 254 ['i'] = JSON_ESCAPE, 255 ['p'] = JSON_ESCAPE, 256 ['s'] = JSON_ESCAPE, 257 ['u'] = JSON_ESCAPE, 258 ['f'] = JSON_ESCAPE, 259 ['l'] = IN_ESCAPE_L, 260 ['I'] = IN_ESCAPE_I, 261 }, 262 263 /* top level rule */ 264 [IN_START] = { 265 ['"'] = IN_DQ_STRING, 266 ['\''] = IN_SQ_STRING, 267 ['0'] = IN_ZERO, 268 ['1' ... '9'] = IN_NONZERO_NUMBER, 269 ['-'] = IN_NEG_NONZERO_NUMBER, 270 ['{'] = JSON_LCURLY, 271 ['}'] = JSON_RCURLY, 272 ['['] = JSON_LSQUARE, 273 [']'] = JSON_RSQUARE, 274 [','] = JSON_COMMA, 275 [':'] = JSON_COLON, 276 ['a' ... 'z'] = IN_KEYWORD, 277 ['%'] = IN_ESCAPE, 278 [' '] = IN_WHITESPACE, 279 ['\t'] = IN_WHITESPACE, 280 ['\r'] = IN_WHITESPACE, 281 ['\n'] = IN_WHITESPACE, 282 }, 283 }; 284 285 void json_lexer_init(JSONLexer *lexer, JSONLexerEmitter func) 286 { 287 lexer->emit = func; 288 lexer->state = IN_START; 289 lexer->token = g_string_sized_new(3); 290 lexer->x = lexer->y = 0; 291 } 292 293 static int json_lexer_feed_char(JSONLexer *lexer, char ch, bool flush) 294 { 295 int char_consumed, new_state; 296 297 lexer->x++; 298 if (ch == '\n') { 299 lexer->x = 0; 300 lexer->y++; 301 } 302 303 do { 304 assert(lexer->state <= ARRAY_SIZE(json_lexer)); 305 new_state = json_lexer[lexer->state][(uint8_t)ch]; 306 char_consumed = !TERMINAL_NEEDED_LOOKAHEAD(lexer->state, new_state); 307 if (char_consumed) { 308 g_string_append_c(lexer->token, ch); 309 } 310 311 switch (new_state) { 312 case JSON_LCURLY: 313 case JSON_RCURLY: 314 case JSON_LSQUARE: 315 case JSON_RSQUARE: 316 case JSON_COLON: 317 case JSON_COMMA: 318 case JSON_ESCAPE: 319 case JSON_INTEGER: 320 case JSON_FLOAT: 321 case JSON_KEYWORD: 322 case JSON_STRING: 323 lexer->emit(lexer, lexer->token, new_state, lexer->x, lexer->y); 324 /* fall through */ 325 case JSON_SKIP: 326 g_string_truncate(lexer->token, 0); 327 new_state = IN_START; 328 break; 329 case IN_ERROR: 330 /* XXX: To avoid having previous bad input leaving the parser in an 331 * unresponsive state where we consume unpredictable amounts of 332 * subsequent "good" input, percolate this error state up to the 333 * tokenizer/parser by forcing a NULL object to be emitted, then 334 * reset state. 335 * 336 * Also note that this handling is required for reliable channel 337 * negotiation between QMP and the guest agent, since chr(0xFF) 338 * is placed at the beginning of certain events to ensure proper 339 * delivery when the channel is in an unknown state. chr(0xFF) is 340 * never a valid ASCII/UTF-8 sequence, so this should reliably 341 * induce an error/flush state. 342 */ 343 lexer->emit(lexer, lexer->token, JSON_ERROR, lexer->x, lexer->y); 344 g_string_truncate(lexer->token, 0); 345 new_state = IN_START; 346 lexer->state = new_state; 347 return 0; 348 default: 349 break; 350 } 351 lexer->state = new_state; 352 } while (!char_consumed && !flush); 353 354 /* Do not let a single token grow to an arbitrarily large size, 355 * this is a security consideration. 356 */ 357 if (lexer->token->len > MAX_TOKEN_SIZE) { 358 lexer->emit(lexer, lexer->token, lexer->state, lexer->x, lexer->y); 359 g_string_truncate(lexer->token, 0); 360 lexer->state = IN_START; 361 } 362 363 return 0; 364 } 365 366 int json_lexer_feed(JSONLexer *lexer, const char *buffer, size_t size) 367 { 368 size_t i; 369 370 for (i = 0; i < size; i++) { 371 int err; 372 373 err = json_lexer_feed_char(lexer, buffer[i], false); 374 if (err < 0) { 375 return err; 376 } 377 } 378 379 return 0; 380 } 381 382 int json_lexer_flush(JSONLexer *lexer) 383 { 384 return lexer->state == IN_START ? 0 : json_lexer_feed_char(lexer, 0, true); 385 } 386 387 void json_lexer_destroy(JSONLexer *lexer) 388 { 389 g_string_free(lexer->token, true); 390 } 391