1 /* 2 * JSON lexer 3 * 4 * Copyright IBM, Corp. 2009 5 * 6 * Authors: 7 * Anthony Liguori <aliguori@us.ibm.com> 8 * 9 * This work is licensed under the terms of the GNU LGPL, version 2.1 or later. 10 * See the COPYING.LIB file in the top-level directory. 11 * 12 */ 13 14 #include "qapi/qmp/qstring.h" 15 #include "qapi/qmp/qlist.h" 16 #include "qapi/qmp/qdict.h" 17 #include "qapi/qmp/qint.h" 18 #include "qemu-common.h" 19 #include "qapi/qmp/json-lexer.h" 20 21 #define MAX_TOKEN_SIZE (64ULL << 20) 22 23 /* 24 * \"([^\\\"]|(\\\"\\'\\\\\\/\\b\\f\\n\\r\\t\\u[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F]))*\" 25 * '([^\\']|(\\\"\\'\\\\\\/\\b\\f\\n\\r\\t\\u[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F]))*' 26 * 0|([1-9][0-9]*(.[0-9]+)?([eE]([-+])?[0-9]+)) 27 * [{}\[\],:] 28 * [a-z]+ 29 * 30 */ 31 32 enum json_lexer_state { 33 IN_ERROR = 0, /* must really be 0, see json_lexer[] */ 34 IN_DQ_UCODE3, 35 IN_DQ_UCODE2, 36 IN_DQ_UCODE1, 37 IN_DQ_UCODE0, 38 IN_DQ_STRING_ESCAPE, 39 IN_DQ_STRING, 40 IN_SQ_UCODE3, 41 IN_SQ_UCODE2, 42 IN_SQ_UCODE1, 43 IN_SQ_UCODE0, 44 IN_SQ_STRING_ESCAPE, 45 IN_SQ_STRING, 46 IN_ZERO, 47 IN_DIGITS, 48 IN_DIGIT, 49 IN_EXP_E, 50 IN_MANTISSA, 51 IN_MANTISSA_DIGITS, 52 IN_NONZERO_NUMBER, 53 IN_NEG_NONZERO_NUMBER, 54 IN_KEYWORD, 55 IN_ESCAPE, 56 IN_ESCAPE_L, 57 IN_ESCAPE_LL, 58 IN_ESCAPE_I, 59 IN_ESCAPE_I6, 60 IN_ESCAPE_I64, 61 IN_WHITESPACE, 62 IN_START, 63 }; 64 65 QEMU_BUILD_BUG_ON((int)JSON_MIN <= (int)IN_START); 66 67 #define TERMINAL(state) [0 ... 0x7F] = (state) 68 69 /* Return whether TERMINAL is a terminal state and the transition to it 70 from OLD_STATE required lookahead. This happens whenever the table 71 below uses the TERMINAL macro. */ 72 #define TERMINAL_NEEDED_LOOKAHEAD(old_state, terminal) \ 73 (json_lexer[(old_state)][0] == (terminal)) 74 75 static const uint8_t json_lexer[][256] = { 76 /* Relies on default initialization to IN_ERROR! */ 77 78 /* double quote string */ 79 [IN_DQ_UCODE3] = { 80 ['0' ... '9'] = IN_DQ_STRING, 81 ['a' ... 'f'] = IN_DQ_STRING, 82 ['A' ... 'F'] = IN_DQ_STRING, 83 }, 84 [IN_DQ_UCODE2] = { 85 ['0' ... '9'] = IN_DQ_UCODE3, 86 ['a' ... 'f'] = IN_DQ_UCODE3, 87 ['A' ... 'F'] = IN_DQ_UCODE3, 88 }, 89 [IN_DQ_UCODE1] = { 90 ['0' ... '9'] = IN_DQ_UCODE2, 91 ['a' ... 'f'] = IN_DQ_UCODE2, 92 ['A' ... 'F'] = IN_DQ_UCODE2, 93 }, 94 [IN_DQ_UCODE0] = { 95 ['0' ... '9'] = IN_DQ_UCODE1, 96 ['a' ... 'f'] = IN_DQ_UCODE1, 97 ['A' ... 'F'] = IN_DQ_UCODE1, 98 }, 99 [IN_DQ_STRING_ESCAPE] = { 100 ['b'] = IN_DQ_STRING, 101 ['f'] = IN_DQ_STRING, 102 ['n'] = IN_DQ_STRING, 103 ['r'] = IN_DQ_STRING, 104 ['t'] = IN_DQ_STRING, 105 ['/'] = IN_DQ_STRING, 106 ['\\'] = IN_DQ_STRING, 107 ['\''] = IN_DQ_STRING, 108 ['\"'] = IN_DQ_STRING, 109 ['u'] = IN_DQ_UCODE0, 110 }, 111 [IN_DQ_STRING] = { 112 [1 ... 0xBF] = IN_DQ_STRING, 113 [0xC2 ... 0xF4] = IN_DQ_STRING, 114 ['\\'] = IN_DQ_STRING_ESCAPE, 115 ['"'] = JSON_STRING, 116 }, 117 118 /* single quote string */ 119 [IN_SQ_UCODE3] = { 120 ['0' ... '9'] = IN_SQ_STRING, 121 ['a' ... 'f'] = IN_SQ_STRING, 122 ['A' ... 'F'] = IN_SQ_STRING, 123 }, 124 [IN_SQ_UCODE2] = { 125 ['0' ... '9'] = IN_SQ_UCODE3, 126 ['a' ... 'f'] = IN_SQ_UCODE3, 127 ['A' ... 'F'] = IN_SQ_UCODE3, 128 }, 129 [IN_SQ_UCODE1] = { 130 ['0' ... '9'] = IN_SQ_UCODE2, 131 ['a' ... 'f'] = IN_SQ_UCODE2, 132 ['A' ... 'F'] = IN_SQ_UCODE2, 133 }, 134 [IN_SQ_UCODE0] = { 135 ['0' ... '9'] = IN_SQ_UCODE1, 136 ['a' ... 'f'] = IN_SQ_UCODE1, 137 ['A' ... 'F'] = IN_SQ_UCODE1, 138 }, 139 [IN_SQ_STRING_ESCAPE] = { 140 ['b'] = IN_SQ_STRING, 141 ['f'] = IN_SQ_STRING, 142 ['n'] = IN_SQ_STRING, 143 ['r'] = IN_SQ_STRING, 144 ['t'] = IN_SQ_STRING, 145 ['/'] = IN_SQ_STRING, 146 ['\\'] = IN_SQ_STRING, 147 ['\''] = IN_SQ_STRING, 148 ['\"'] = IN_SQ_STRING, 149 ['u'] = IN_SQ_UCODE0, 150 }, 151 [IN_SQ_STRING] = { 152 [1 ... 0xBF] = IN_SQ_STRING, 153 [0xC2 ... 0xF4] = IN_SQ_STRING, 154 ['\\'] = IN_SQ_STRING_ESCAPE, 155 ['\''] = JSON_STRING, 156 }, 157 158 /* Zero */ 159 [IN_ZERO] = { 160 TERMINAL(JSON_INTEGER), 161 ['0' ... '9'] = IN_ERROR, 162 ['.'] = IN_MANTISSA, 163 }, 164 165 /* Float */ 166 [IN_DIGITS] = { 167 TERMINAL(JSON_FLOAT), 168 ['0' ... '9'] = IN_DIGITS, 169 }, 170 171 [IN_DIGIT] = { 172 ['0' ... '9'] = IN_DIGITS, 173 }, 174 175 [IN_EXP_E] = { 176 ['-'] = IN_DIGIT, 177 ['+'] = IN_DIGIT, 178 ['0' ... '9'] = IN_DIGITS, 179 }, 180 181 [IN_MANTISSA_DIGITS] = { 182 TERMINAL(JSON_FLOAT), 183 ['0' ... '9'] = IN_MANTISSA_DIGITS, 184 ['e'] = IN_EXP_E, 185 ['E'] = IN_EXP_E, 186 }, 187 188 [IN_MANTISSA] = { 189 ['0' ... '9'] = IN_MANTISSA_DIGITS, 190 }, 191 192 /* Number */ 193 [IN_NONZERO_NUMBER] = { 194 TERMINAL(JSON_INTEGER), 195 ['0' ... '9'] = IN_NONZERO_NUMBER, 196 ['e'] = IN_EXP_E, 197 ['E'] = IN_EXP_E, 198 ['.'] = IN_MANTISSA, 199 }, 200 201 [IN_NEG_NONZERO_NUMBER] = { 202 ['0'] = IN_ZERO, 203 ['1' ... '9'] = IN_NONZERO_NUMBER, 204 }, 205 206 /* keywords */ 207 [IN_KEYWORD] = { 208 TERMINAL(JSON_KEYWORD), 209 ['a' ... 'z'] = IN_KEYWORD, 210 }, 211 212 /* whitespace */ 213 [IN_WHITESPACE] = { 214 TERMINAL(JSON_SKIP), 215 [' '] = IN_WHITESPACE, 216 ['\t'] = IN_WHITESPACE, 217 ['\r'] = IN_WHITESPACE, 218 ['\n'] = IN_WHITESPACE, 219 }, 220 221 /* escape */ 222 [IN_ESCAPE_LL] = { 223 ['d'] = JSON_ESCAPE, 224 }, 225 226 [IN_ESCAPE_L] = { 227 ['d'] = JSON_ESCAPE, 228 ['l'] = IN_ESCAPE_LL, 229 }, 230 231 [IN_ESCAPE_I64] = { 232 ['d'] = JSON_ESCAPE, 233 }, 234 235 [IN_ESCAPE_I6] = { 236 ['4'] = IN_ESCAPE_I64, 237 }, 238 239 [IN_ESCAPE_I] = { 240 ['6'] = IN_ESCAPE_I6, 241 }, 242 243 [IN_ESCAPE] = { 244 ['d'] = JSON_ESCAPE, 245 ['i'] = JSON_ESCAPE, 246 ['p'] = JSON_ESCAPE, 247 ['s'] = JSON_ESCAPE, 248 ['f'] = JSON_ESCAPE, 249 ['l'] = IN_ESCAPE_L, 250 ['I'] = IN_ESCAPE_I, 251 }, 252 253 /* top level rule */ 254 [IN_START] = { 255 ['"'] = IN_DQ_STRING, 256 ['\''] = IN_SQ_STRING, 257 ['0'] = IN_ZERO, 258 ['1' ... '9'] = IN_NONZERO_NUMBER, 259 ['-'] = IN_NEG_NONZERO_NUMBER, 260 ['{'] = JSON_LCURLY, 261 ['}'] = JSON_RCURLY, 262 ['['] = JSON_LSQUARE, 263 [']'] = JSON_RSQUARE, 264 [','] = JSON_COMMA, 265 [':'] = JSON_COLON, 266 ['a' ... 'z'] = IN_KEYWORD, 267 ['%'] = IN_ESCAPE, 268 [' '] = IN_WHITESPACE, 269 ['\t'] = IN_WHITESPACE, 270 ['\r'] = IN_WHITESPACE, 271 ['\n'] = IN_WHITESPACE, 272 }, 273 }; 274 275 void json_lexer_init(JSONLexer *lexer, JSONLexerEmitter func) 276 { 277 lexer->emit = func; 278 lexer->state = IN_START; 279 lexer->token = qstring_new(); 280 lexer->x = lexer->y = 0; 281 } 282 283 static int json_lexer_feed_char(JSONLexer *lexer, char ch, bool flush) 284 { 285 int char_consumed, new_state; 286 287 lexer->x++; 288 if (ch == '\n') { 289 lexer->x = 0; 290 lexer->y++; 291 } 292 293 do { 294 assert(lexer->state <= ARRAY_SIZE(json_lexer)); 295 new_state = json_lexer[lexer->state][(uint8_t)ch]; 296 char_consumed = !TERMINAL_NEEDED_LOOKAHEAD(lexer->state, new_state); 297 if (char_consumed) { 298 qstring_append_chr(lexer->token, ch); 299 } 300 301 switch (new_state) { 302 case JSON_LCURLY: 303 case JSON_RCURLY: 304 case JSON_LSQUARE: 305 case JSON_RSQUARE: 306 case JSON_COLON: 307 case JSON_COMMA: 308 case JSON_ESCAPE: 309 case JSON_INTEGER: 310 case JSON_FLOAT: 311 case JSON_KEYWORD: 312 case JSON_STRING: 313 lexer->emit(lexer, lexer->token, new_state, lexer->x, lexer->y); 314 /* fall through */ 315 case JSON_SKIP: 316 QDECREF(lexer->token); 317 lexer->token = qstring_new(); 318 new_state = IN_START; 319 break; 320 case IN_ERROR: 321 /* XXX: To avoid having previous bad input leaving the parser in an 322 * unresponsive state where we consume unpredictable amounts of 323 * subsequent "good" input, percolate this error state up to the 324 * tokenizer/parser by forcing a NULL object to be emitted, then 325 * reset state. 326 * 327 * Also note that this handling is required for reliable channel 328 * negotiation between QMP and the guest agent, since chr(0xFF) 329 * is placed at the beginning of certain events to ensure proper 330 * delivery when the channel is in an unknown state. chr(0xFF) is 331 * never a valid ASCII/UTF-8 sequence, so this should reliably 332 * induce an error/flush state. 333 */ 334 lexer->emit(lexer, lexer->token, JSON_ERROR, lexer->x, lexer->y); 335 QDECREF(lexer->token); 336 lexer->token = qstring_new(); 337 new_state = IN_START; 338 lexer->state = new_state; 339 return 0; 340 default: 341 break; 342 } 343 lexer->state = new_state; 344 } while (!char_consumed && !flush); 345 346 /* Do not let a single token grow to an arbitrarily large size, 347 * this is a security consideration. 348 */ 349 if (lexer->token->length > MAX_TOKEN_SIZE) { 350 lexer->emit(lexer, lexer->token, lexer->state, lexer->x, lexer->y); 351 QDECREF(lexer->token); 352 lexer->token = qstring_new(); 353 lexer->state = IN_START; 354 } 355 356 return 0; 357 } 358 359 int json_lexer_feed(JSONLexer *lexer, const char *buffer, size_t size) 360 { 361 size_t i; 362 363 for (i = 0; i < size; i++) { 364 int err; 365 366 err = json_lexer_feed_char(lexer, buffer[i], false); 367 if (err < 0) { 368 return err; 369 } 370 } 371 372 return 0; 373 } 374 375 int json_lexer_flush(JSONLexer *lexer) 376 { 377 return lexer->state == IN_START ? 0 : json_lexer_feed_char(lexer, 0, true); 378 } 379 380 void json_lexer_destroy(JSONLexer *lexer) 381 { 382 QDECREF(lexer->token); 383 } 384