1 /* 2 * JSON lexer 3 * 4 * Copyright IBM, Corp. 2009 5 * 6 * Authors: 7 * Anthony Liguori <aliguori@us.ibm.com> 8 * 9 * This work is licensed under the terms of the GNU LGPL, version 2.1 or later. 10 * See the COPYING.LIB file in the top-level directory. 11 * 12 */ 13 14 #include "qemu/osdep.h" 15 #include "qemu-common.h" 16 #include "qapi/qmp/json-lexer.h" 17 18 #define MAX_TOKEN_SIZE (64ULL << 20) 19 20 /* 21 * Required by JSON (RFC 7159): 22 * 23 * \"([^\\\"]|\\[\"'\\/bfnrt]|\\u[0-9a-fA-F]{4})*\" 24 * -?(0|[1-9][0-9]*)(.[0-9]+)?([eE][-+]?[0-9]+)? 25 * [{}\[\],:] 26 * [a-z]+ # covers null, true, false 27 * 28 * Extension of '' strings: 29 * 30 * '([^\\']|\\[\"'\\/bfnrt]|\\u[0-9a-fA-F]{4})*' 31 * 32 * Extension for vararg handling in JSON construction: 33 * 34 * %((l|ll|I64)?d|[ipsf]) 35 * 36 */ 37 38 enum json_lexer_state { 39 IN_ERROR = 0, /* must really be 0, see json_lexer[] */ 40 IN_DQ_UCODE3, 41 IN_DQ_UCODE2, 42 IN_DQ_UCODE1, 43 IN_DQ_UCODE0, 44 IN_DQ_STRING_ESCAPE, 45 IN_DQ_STRING, 46 IN_SQ_UCODE3, 47 IN_SQ_UCODE2, 48 IN_SQ_UCODE1, 49 IN_SQ_UCODE0, 50 IN_SQ_STRING_ESCAPE, 51 IN_SQ_STRING, 52 IN_ZERO, 53 IN_DIGITS, 54 IN_DIGIT, 55 IN_EXP_E, 56 IN_MANTISSA, 57 IN_MANTISSA_DIGITS, 58 IN_NONZERO_NUMBER, 59 IN_NEG_NONZERO_NUMBER, 60 IN_KEYWORD, 61 IN_ESCAPE, 62 IN_ESCAPE_L, 63 IN_ESCAPE_LL, 64 IN_ESCAPE_I, 65 IN_ESCAPE_I6, 66 IN_ESCAPE_I64, 67 IN_WHITESPACE, 68 IN_START, 69 }; 70 71 QEMU_BUILD_BUG_ON((int)JSON_MIN <= (int)IN_START); 72 73 #define TERMINAL(state) [0 ... 0x7F] = (state) 74 75 /* Return whether TERMINAL is a terminal state and the transition to it 76 from OLD_STATE required lookahead. This happens whenever the table 77 below uses the TERMINAL macro. */ 78 #define TERMINAL_NEEDED_LOOKAHEAD(old_state, terminal) \ 79 (json_lexer[(old_state)][0] == (terminal)) 80 81 static const uint8_t json_lexer[][256] = { 82 /* Relies on default initialization to IN_ERROR! */ 83 84 /* double quote string */ 85 [IN_DQ_UCODE3] = { 86 ['0' ... '9'] = IN_DQ_STRING, 87 ['a' ... 'f'] = IN_DQ_STRING, 88 ['A' ... 'F'] = IN_DQ_STRING, 89 }, 90 [IN_DQ_UCODE2] = { 91 ['0' ... '9'] = IN_DQ_UCODE3, 92 ['a' ... 'f'] = IN_DQ_UCODE3, 93 ['A' ... 'F'] = IN_DQ_UCODE3, 94 }, 95 [IN_DQ_UCODE1] = { 96 ['0' ... '9'] = IN_DQ_UCODE2, 97 ['a' ... 'f'] = IN_DQ_UCODE2, 98 ['A' ... 'F'] = IN_DQ_UCODE2, 99 }, 100 [IN_DQ_UCODE0] = { 101 ['0' ... '9'] = IN_DQ_UCODE1, 102 ['a' ... 'f'] = IN_DQ_UCODE1, 103 ['A' ... 'F'] = IN_DQ_UCODE1, 104 }, 105 [IN_DQ_STRING_ESCAPE] = { 106 ['b'] = IN_DQ_STRING, 107 ['f'] = IN_DQ_STRING, 108 ['n'] = IN_DQ_STRING, 109 ['r'] = IN_DQ_STRING, 110 ['t'] = IN_DQ_STRING, 111 ['/'] = IN_DQ_STRING, 112 ['\\'] = IN_DQ_STRING, 113 ['\''] = IN_DQ_STRING, 114 ['\"'] = IN_DQ_STRING, 115 ['u'] = IN_DQ_UCODE0, 116 }, 117 [IN_DQ_STRING] = { 118 [1 ... 0xBF] = IN_DQ_STRING, 119 [0xC2 ... 0xF4] = IN_DQ_STRING, 120 ['\\'] = IN_DQ_STRING_ESCAPE, 121 ['"'] = JSON_STRING, 122 }, 123 124 /* single quote string */ 125 [IN_SQ_UCODE3] = { 126 ['0' ... '9'] = IN_SQ_STRING, 127 ['a' ... 'f'] = IN_SQ_STRING, 128 ['A' ... 'F'] = IN_SQ_STRING, 129 }, 130 [IN_SQ_UCODE2] = { 131 ['0' ... '9'] = IN_SQ_UCODE3, 132 ['a' ... 'f'] = IN_SQ_UCODE3, 133 ['A' ... 'F'] = IN_SQ_UCODE3, 134 }, 135 [IN_SQ_UCODE1] = { 136 ['0' ... '9'] = IN_SQ_UCODE2, 137 ['a' ... 'f'] = IN_SQ_UCODE2, 138 ['A' ... 'F'] = IN_SQ_UCODE2, 139 }, 140 [IN_SQ_UCODE0] = { 141 ['0' ... '9'] = IN_SQ_UCODE1, 142 ['a' ... 'f'] = IN_SQ_UCODE1, 143 ['A' ... 'F'] = IN_SQ_UCODE1, 144 }, 145 [IN_SQ_STRING_ESCAPE] = { 146 ['b'] = IN_SQ_STRING, 147 ['f'] = IN_SQ_STRING, 148 ['n'] = IN_SQ_STRING, 149 ['r'] = IN_SQ_STRING, 150 ['t'] = IN_SQ_STRING, 151 ['/'] = IN_SQ_STRING, 152 ['\\'] = IN_SQ_STRING, 153 ['\''] = IN_SQ_STRING, 154 ['\"'] = IN_SQ_STRING, 155 ['u'] = IN_SQ_UCODE0, 156 }, 157 [IN_SQ_STRING] = { 158 [1 ... 0xBF] = IN_SQ_STRING, 159 [0xC2 ... 0xF4] = IN_SQ_STRING, 160 ['\\'] = IN_SQ_STRING_ESCAPE, 161 ['\''] = JSON_STRING, 162 }, 163 164 /* Zero */ 165 [IN_ZERO] = { 166 TERMINAL(JSON_INTEGER), 167 ['0' ... '9'] = IN_ERROR, 168 ['.'] = IN_MANTISSA, 169 }, 170 171 /* Float */ 172 [IN_DIGITS] = { 173 TERMINAL(JSON_FLOAT), 174 ['0' ... '9'] = IN_DIGITS, 175 }, 176 177 [IN_DIGIT] = { 178 ['0' ... '9'] = IN_DIGITS, 179 }, 180 181 [IN_EXP_E] = { 182 ['-'] = IN_DIGIT, 183 ['+'] = IN_DIGIT, 184 ['0' ... '9'] = IN_DIGITS, 185 }, 186 187 [IN_MANTISSA_DIGITS] = { 188 TERMINAL(JSON_FLOAT), 189 ['0' ... '9'] = IN_MANTISSA_DIGITS, 190 ['e'] = IN_EXP_E, 191 ['E'] = IN_EXP_E, 192 }, 193 194 [IN_MANTISSA] = { 195 ['0' ... '9'] = IN_MANTISSA_DIGITS, 196 }, 197 198 /* Number */ 199 [IN_NONZERO_NUMBER] = { 200 TERMINAL(JSON_INTEGER), 201 ['0' ... '9'] = IN_NONZERO_NUMBER, 202 ['e'] = IN_EXP_E, 203 ['E'] = IN_EXP_E, 204 ['.'] = IN_MANTISSA, 205 }, 206 207 [IN_NEG_NONZERO_NUMBER] = { 208 ['0'] = IN_ZERO, 209 ['1' ... '9'] = IN_NONZERO_NUMBER, 210 }, 211 212 /* keywords */ 213 [IN_KEYWORD] = { 214 TERMINAL(JSON_KEYWORD), 215 ['a' ... 'z'] = IN_KEYWORD, 216 }, 217 218 /* whitespace */ 219 [IN_WHITESPACE] = { 220 TERMINAL(JSON_SKIP), 221 [' '] = IN_WHITESPACE, 222 ['\t'] = IN_WHITESPACE, 223 ['\r'] = IN_WHITESPACE, 224 ['\n'] = IN_WHITESPACE, 225 }, 226 227 /* escape */ 228 [IN_ESCAPE_LL] = { 229 ['d'] = JSON_ESCAPE, 230 }, 231 232 [IN_ESCAPE_L] = { 233 ['d'] = JSON_ESCAPE, 234 ['l'] = IN_ESCAPE_LL, 235 }, 236 237 [IN_ESCAPE_I64] = { 238 ['d'] = JSON_ESCAPE, 239 }, 240 241 [IN_ESCAPE_I6] = { 242 ['4'] = IN_ESCAPE_I64, 243 }, 244 245 [IN_ESCAPE_I] = { 246 ['6'] = IN_ESCAPE_I6, 247 }, 248 249 [IN_ESCAPE] = { 250 ['d'] = JSON_ESCAPE, 251 ['i'] = JSON_ESCAPE, 252 ['p'] = JSON_ESCAPE, 253 ['s'] = JSON_ESCAPE, 254 ['f'] = JSON_ESCAPE, 255 ['l'] = IN_ESCAPE_L, 256 ['I'] = IN_ESCAPE_I, 257 }, 258 259 /* top level rule */ 260 [IN_START] = { 261 ['"'] = IN_DQ_STRING, 262 ['\''] = IN_SQ_STRING, 263 ['0'] = IN_ZERO, 264 ['1' ... '9'] = IN_NONZERO_NUMBER, 265 ['-'] = IN_NEG_NONZERO_NUMBER, 266 ['{'] = JSON_LCURLY, 267 ['}'] = JSON_RCURLY, 268 ['['] = JSON_LSQUARE, 269 [']'] = JSON_RSQUARE, 270 [','] = JSON_COMMA, 271 [':'] = JSON_COLON, 272 ['a' ... 'z'] = IN_KEYWORD, 273 ['%'] = IN_ESCAPE, 274 [' '] = IN_WHITESPACE, 275 ['\t'] = IN_WHITESPACE, 276 ['\r'] = IN_WHITESPACE, 277 ['\n'] = IN_WHITESPACE, 278 }, 279 }; 280 281 void json_lexer_init(JSONLexer *lexer, JSONLexerEmitter func) 282 { 283 lexer->emit = func; 284 lexer->state = IN_START; 285 lexer->token = g_string_sized_new(3); 286 lexer->x = lexer->y = 0; 287 } 288 289 static int json_lexer_feed_char(JSONLexer *lexer, char ch, bool flush) 290 { 291 int char_consumed, new_state; 292 293 lexer->x++; 294 if (ch == '\n') { 295 lexer->x = 0; 296 lexer->y++; 297 } 298 299 do { 300 assert(lexer->state <= ARRAY_SIZE(json_lexer)); 301 new_state = json_lexer[lexer->state][(uint8_t)ch]; 302 char_consumed = !TERMINAL_NEEDED_LOOKAHEAD(lexer->state, new_state); 303 if (char_consumed) { 304 g_string_append_c(lexer->token, ch); 305 } 306 307 switch (new_state) { 308 case JSON_LCURLY: 309 case JSON_RCURLY: 310 case JSON_LSQUARE: 311 case JSON_RSQUARE: 312 case JSON_COLON: 313 case JSON_COMMA: 314 case JSON_ESCAPE: 315 case JSON_INTEGER: 316 case JSON_FLOAT: 317 case JSON_KEYWORD: 318 case JSON_STRING: 319 lexer->emit(lexer, lexer->token, new_state, lexer->x, lexer->y); 320 /* fall through */ 321 case JSON_SKIP: 322 g_string_truncate(lexer->token, 0); 323 new_state = IN_START; 324 break; 325 case IN_ERROR: 326 /* XXX: To avoid having previous bad input leaving the parser in an 327 * unresponsive state where we consume unpredictable amounts of 328 * subsequent "good" input, percolate this error state up to the 329 * tokenizer/parser by forcing a NULL object to be emitted, then 330 * reset state. 331 * 332 * Also note that this handling is required for reliable channel 333 * negotiation between QMP and the guest agent, since chr(0xFF) 334 * is placed at the beginning of certain events to ensure proper 335 * delivery when the channel is in an unknown state. chr(0xFF) is 336 * never a valid ASCII/UTF-8 sequence, so this should reliably 337 * induce an error/flush state. 338 */ 339 lexer->emit(lexer, lexer->token, JSON_ERROR, lexer->x, lexer->y); 340 g_string_truncate(lexer->token, 0); 341 new_state = IN_START; 342 lexer->state = new_state; 343 return 0; 344 default: 345 break; 346 } 347 lexer->state = new_state; 348 } while (!char_consumed && !flush); 349 350 /* Do not let a single token grow to an arbitrarily large size, 351 * this is a security consideration. 352 */ 353 if (lexer->token->len > MAX_TOKEN_SIZE) { 354 lexer->emit(lexer, lexer->token, lexer->state, lexer->x, lexer->y); 355 g_string_truncate(lexer->token, 0); 356 lexer->state = IN_START; 357 } 358 359 return 0; 360 } 361 362 int json_lexer_feed(JSONLexer *lexer, const char *buffer, size_t size) 363 { 364 size_t i; 365 366 for (i = 0; i < size; i++) { 367 int err; 368 369 err = json_lexer_feed_char(lexer, buffer[i], false); 370 if (err < 0) { 371 return err; 372 } 373 } 374 375 return 0; 376 } 377 378 int json_lexer_flush(JSONLexer *lexer) 379 { 380 return lexer->state == IN_START ? 0 : json_lexer_feed_char(lexer, 0, true); 381 } 382 383 void json_lexer_destroy(JSONLexer *lexer) 384 { 385 g_string_free(lexer->token, true); 386 } 387