/*
 * JSON lexer
 *
 * Copyright IBM, Corp. 2009
 *
 * Authors:
 *  Anthony Liguori   <aliguori@us.ibm.com>
 *
 * This work is licensed under the terms of the GNU LGPL, version 2.1 or later.
 * See the COPYING.LIB file in the top-level directory.
 *
 */

#include "qemu-common.h"
#include "qapi/qmp/json-lexer.h"
#include <stdint.h>

#define MAX_TOKEN_SIZE (64ULL << 20)

/*
 * \"([^\\\"]|(\\\"\\'\\\\\\/\\b\\f\\n\\r\\t\\u[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F]))*\"
 * '([^\\']|(\\\"\\'\\\\\\/\\b\\f\\n\\r\\t\\u[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F]))*'
 * 0|([1-9][0-9]*(.[0-9]+)?([eE]([-+])?[0-9]+)?)
 * [{}\[\],:]
 * [a-z]+
 *
 */

enum json_lexer_state {
    IN_ERROR = 0,               /* must really be 0, see json_lexer[] */
    IN_DQ_UCODE3,
    IN_DQ_UCODE2,
    IN_DQ_UCODE1,
    IN_DQ_UCODE0,
    IN_DQ_STRING_ESCAPE,
    IN_DQ_STRING,
    IN_SQ_UCODE3,
    IN_SQ_UCODE2,
    IN_SQ_UCODE1,
    IN_SQ_UCODE0,
    IN_SQ_STRING_ESCAPE,
    IN_SQ_STRING,
    IN_ZERO,
    IN_DIGITS,
    IN_DIGIT,
    IN_EXP_E,
    IN_MANTISSA,
    IN_MANTISSA_DIGITS,
    IN_NONZERO_NUMBER,
    IN_NEG_NONZERO_NUMBER,
    IN_KEYWORD,
    IN_ESCAPE,
    IN_ESCAPE_L,
    IN_ESCAPE_LL,
    IN_ESCAPE_I,
    IN_ESCAPE_I6,
    IN_ESCAPE_I64,
    IN_WHITESPACE,
    IN_START,
};

QEMU_BUILD_BUG_ON((int)JSON_MIN <= (int)IN_START);

#define TERMINAL(state) [0 ... 0x7F] = (state)

/* Return whether TERMINAL is a terminal state and the transition to it
   from OLD_STATE required lookahead.  This happens whenever the table
   below uses the TERMINAL macro.  */
#define TERMINAL_NEEDED_LOOKAHEAD(old_state, terminal) \
            (json_lexer[(old_state)][0] == (terminal))
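
/*
 * Illustration of the lookahead rule: while lexing "123,", the lexer sits
 * in IN_NONZERO_NUMBER after "123".  The ',' maps to JSON_INTEGER only
 * because that row begins with TERMINAL(JSON_INTEGER), so
 * TERMINAL_NEEDED_LOOKAHEAD() is true and json_lexer_feed_char() emits the
 * integer token without appending ','; the comma is then re-run from
 * IN_START, where it produces JSON_COMMA.
 */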

static const uint8_t json_lexer[][256] = {
    /* Relies on default initialization to IN_ERROR! */

    /* double quote string */
    [IN_DQ_UCODE3] = {
        ['0' ... '9'] = IN_DQ_STRING,
        ['a' ... 'f'] = IN_DQ_STRING,
        ['A' ... 'F'] = IN_DQ_STRING,
    },
    [IN_DQ_UCODE2] = {
        ['0' ... '9'] = IN_DQ_UCODE3,
        ['a' ... 'f'] = IN_DQ_UCODE3,
        ['A' ... 'F'] = IN_DQ_UCODE3,
    },
    [IN_DQ_UCODE1] = {
        ['0' ... '9'] = IN_DQ_UCODE2,
        ['a' ... 'f'] = IN_DQ_UCODE2,
        ['A' ... 'F'] = IN_DQ_UCODE2,
    },
    [IN_DQ_UCODE0] = {
        ['0' ... '9'] = IN_DQ_UCODE1,
        ['a' ... 'f'] = IN_DQ_UCODE1,
        ['A' ... 'F'] = IN_DQ_UCODE1,
    },
    [IN_DQ_STRING_ESCAPE] = {
        ['b'] = IN_DQ_STRING,
        ['f'] = IN_DQ_STRING,
        ['n'] = IN_DQ_STRING,
        ['r'] = IN_DQ_STRING,
        ['t'] = IN_DQ_STRING,
        ['/'] = IN_DQ_STRING,
        ['\\'] = IN_DQ_STRING,
        ['\''] = IN_DQ_STRING,
        ['\"'] = IN_DQ_STRING,
        ['u'] = IN_DQ_UCODE0,
    },
    [IN_DQ_STRING] = {
        [1 ... 0xBF] = IN_DQ_STRING,
        [0xC2 ... 0xF4] = IN_DQ_STRING,
        ['\\'] = IN_DQ_STRING_ESCAPE,
        ['"'] = JSON_STRING,
    },

    /* single quote string */
    [IN_SQ_UCODE3] = {
        ['0' ... '9'] = IN_SQ_STRING,
        ['a' ... 'f'] = IN_SQ_STRING,
        ['A' ... 'F'] = IN_SQ_STRING,
    },
    [IN_SQ_UCODE2] = {
        ['0' ... '9'] = IN_SQ_UCODE3,
        ['a' ... 'f'] = IN_SQ_UCODE3,
        ['A' ... 'F'] = IN_SQ_UCODE3,
    },
    [IN_SQ_UCODE1] = {
        ['0' ... '9'] = IN_SQ_UCODE2,
        ['a' ... 'f'] = IN_SQ_UCODE2,
        ['A' ... 'F'] = IN_SQ_UCODE2,
    },
    [IN_SQ_UCODE0] = {
        ['0' ... '9'] = IN_SQ_UCODE1,
        ['a' ... 'f'] = IN_SQ_UCODE1,
        ['A' ... 'F'] = IN_SQ_UCODE1,
    },
    [IN_SQ_STRING_ESCAPE] = {
        ['b'] = IN_SQ_STRING,
        ['f'] = IN_SQ_STRING,
        ['n'] = IN_SQ_STRING,
        ['r'] = IN_SQ_STRING,
        ['t'] = IN_SQ_STRING,
        ['/'] = IN_SQ_STRING,
        ['\\'] = IN_SQ_STRING,
        ['\''] = IN_SQ_STRING,
        ['\"'] = IN_SQ_STRING,
        ['u'] = IN_SQ_UCODE0,
    },
    [IN_SQ_STRING] = {
        [1 ... 0xBF] = IN_SQ_STRING,
        [0xC2 ... 0xF4] = IN_SQ_STRING,
        ['\\'] = IN_SQ_STRING_ESCAPE,
        ['\''] = JSON_STRING,
    },

    /* Zero */
    [IN_ZERO] = {
        TERMINAL(JSON_INTEGER),
        ['0' ... '9'] = IN_ERROR,
        ['.'] = IN_MANTISSA,
    },

    /* Float */
    [IN_DIGITS] = {
        TERMINAL(JSON_FLOAT),
        ['0' ... '9'] = IN_DIGITS,
    },

    [IN_DIGIT] = {
        ['0' ... '9'] = IN_DIGITS,
    },

    [IN_EXP_E] = {
        ['-'] = IN_DIGIT,
        ['+'] = IN_DIGIT,
        ['0' ... '9'] = IN_DIGITS,
    },

    [IN_MANTISSA_DIGITS] = {
        TERMINAL(JSON_FLOAT),
        ['0' ... '9'] = IN_MANTISSA_DIGITS,
        ['e'] = IN_EXP_E,
        ['E'] = IN_EXP_E,
    },

    [IN_MANTISSA] = {
        ['0' ... '9'] = IN_MANTISSA_DIGITS,
    },

    /* Number */
    [IN_NONZERO_NUMBER] = {
        TERMINAL(JSON_INTEGER),
        ['0' ... '9'] = IN_NONZERO_NUMBER,
        ['e'] = IN_EXP_E,
        ['E'] = IN_EXP_E,
        ['.'] = IN_MANTISSA,
    },

    [IN_NEG_NONZERO_NUMBER] = {
        ['0'] = IN_ZERO,
        ['1' ... '9'] = IN_NONZERO_NUMBER,
    },

    /* keywords */
    [IN_KEYWORD] = {
        TERMINAL(JSON_KEYWORD),
        ['a' ... 'z'] = IN_KEYWORD,
    },

    /* whitespace */
    [IN_WHITESPACE] = {
        TERMINAL(JSON_SKIP),
        [' '] = IN_WHITESPACE,
        ['\t'] = IN_WHITESPACE,
        ['\r'] = IN_WHITESPACE,
        ['\n'] = IN_WHITESPACE,
    },

    /* escape */
    [IN_ESCAPE_LL] = {
        ['d'] = JSON_ESCAPE,
    },

    [IN_ESCAPE_L] = {
        ['d'] = JSON_ESCAPE,
        ['l'] = IN_ESCAPE_LL,
    },

    [IN_ESCAPE_I64] = {
        ['d'] = JSON_ESCAPE,
    },

    [IN_ESCAPE_I6] = {
        ['4'] = IN_ESCAPE_I64,
    },

    [IN_ESCAPE_I] = {
        ['6'] = IN_ESCAPE_I6,
    },

    [IN_ESCAPE] = {
        ['d'] = JSON_ESCAPE,
        ['i'] = JSON_ESCAPE,
        ['p'] = JSON_ESCAPE,
        ['s'] = JSON_ESCAPE,
        ['f'] = JSON_ESCAPE,
        ['l'] = IN_ESCAPE_L,
        ['I'] = IN_ESCAPE_I,
    },

    /* top level rule */
    [IN_START] = {
        ['"'] = IN_DQ_STRING,
        ['\''] = IN_SQ_STRING,
        ['0'] = IN_ZERO,
        ['1' ... '9'] = IN_NONZERO_NUMBER,
        ['-'] = IN_NEG_NONZERO_NUMBER,
        ['{'] = JSON_LCURLY,
        ['}'] = JSON_RCURLY,
        ['['] = JSON_LSQUARE,
        [']'] = JSON_RSQUARE,
        [','] = JSON_COMMA,
        [':'] = JSON_COLON,
        ['a' ... 'z'] = IN_KEYWORD,
        ['%'] = IN_ESCAPE,
        [' '] = IN_WHITESPACE,
        ['\t'] = IN_WHITESPACE,
        ['\r'] = IN_WHITESPACE,
        ['\n'] = IN_WHITESPACE,
    },
};
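
/*
 * Note: the IN_ESCAPE* rows above accept the interpolation specifiers
 * %d, %i, %p, %s, %f, %ld, %lld and %I64d as JSON_ESCAPE tokens.  These
 * are not part of JSON itself; they exist so the parser can substitute
 * values when JSON is built from a format string rather than from wire
 * input.
 */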

void json_lexer_init(JSONLexer *lexer, JSONLexerEmitter func)
{
    lexer->emit = func;
    lexer->state = IN_START;
    lexer->token = g_string_sized_new(3);
    lexer->x = lexer->y = 0;
}

static int json_lexer_feed_char(JSONLexer *lexer, char ch, bool flush)
{
    int char_consumed, new_state;

    lexer->x++;
    if (ch == '\n') {
        lexer->x = 0;
        lexer->y++;
    }

    do {
        assert(lexer->state <= ARRAY_SIZE(json_lexer));
        new_state = json_lexer[lexer->state][(uint8_t)ch];
        char_consumed = !TERMINAL_NEEDED_LOOKAHEAD(lexer->state, new_state);
        if (char_consumed) {
            g_string_append_c(lexer->token, ch);
        }

        switch (new_state) {
        case JSON_LCURLY:
        case JSON_RCURLY:
        case JSON_LSQUARE:
        case JSON_RSQUARE:
        case JSON_COLON:
        case JSON_COMMA:
        case JSON_ESCAPE:
        case JSON_INTEGER:
        case JSON_FLOAT:
        case JSON_KEYWORD:
        case JSON_STRING:
            lexer->emit(lexer, lexer->token, new_state, lexer->x, lexer->y);
            /* fall through */
        case JSON_SKIP:
            g_string_truncate(lexer->token, 0);
            new_state = IN_START;
            break;
        case IN_ERROR:
            /* XXX: To avoid having previous bad input leaving the parser in an
             * unresponsive state where we consume unpredictable amounts of
             * subsequent "good" input, percolate this error state up to the
             * tokenizer/parser by forcing a NULL object to be emitted, then
             * reset state.
             *
             * Also note that this handling is required for reliable channel
             * negotiation between QMP and the guest agent, since chr(0xFF)
             * is placed at the beginning of certain events to ensure proper
             * delivery when the channel is in an unknown state. chr(0xFF) is
             * never a valid ASCII/UTF-8 sequence, so this should reliably
             * induce an error/flush state.
             */
            lexer->emit(lexer, lexer->token, JSON_ERROR, lexer->x, lexer->y);
            g_string_truncate(lexer->token, 0);
            new_state = IN_START;
            lexer->state = new_state;
            return 0;
        default:
            break;
        }
        lexer->state = new_state;
    } while (!char_consumed && !flush);

    /* Do not let a single token grow to an arbitrarily large size,
     * this is a security consideration.
     */
    if (lexer->token->len > MAX_TOKEN_SIZE) {
        lexer->emit(lexer, lexer->token, lexer->state, lexer->x, lexer->y);
        g_string_truncate(lexer->token, 0);
        lexer->state = IN_START;
    }

    return 0;
}

int json_lexer_feed(JSONLexer *lexer, const char *buffer, size_t size)
{
    size_t i;

    for (i = 0; i < size; i++) {
        int err;

        err = json_lexer_feed_char(lexer, buffer[i], false);
        if (err < 0) {
            return err;
        }
    }

    return 0;
}

int json_lexer_flush(JSONLexer *lexer)
{
    return lexer->state == IN_START ? 0 : json_lexer_feed_char(lexer, 0, true);
}

void json_lexer_destroy(JSONLexer *lexer)
{
    g_string_free(lexer->token, true);
}
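
/*
 * Usage sketch (illustrative only, not compiled as part of this file):
 * a caller supplies a JSONLexerEmitter callback whose parameters mirror
 * the lexer->emit() invocation above (lexer, token text, token type from
 * json-lexer.h, and the x/y input position), then feeds raw bytes in
 * arbitrary-sized chunks.  The names example_emit, buf and len below are
 * hypothetical.
 *
 *     static void example_emit(JSONLexer *lexer, GString *token,
 *                              JSONTokenType type, int x, int y)
 *     {
 *         // inspect token->str and type, e.g. hand them to a streamer
 *     }
 *
 *     JSONLexer lexer;
 *
 *     json_lexer_init(&lexer, example_emit);
 *     json_lexer_feed(&lexer, buf, len);   // may be called repeatedly
 *     json_lexer_flush(&lexer);            // force out any pending token
 *     json_lexer_destroy(&lexer);
 */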