1 /* 2 * Copyright (c) 2004-2005 Sergey Lyubka <valenok@gmail.com> 3 * All rights reserved 4 * 5 * "THE BEER-WARE LICENSE" (Revision 42): 6 * Sergey Lyubka wrote this file. As long as you retain this notice you 7 * can do whatever you want with this stuff. If we meet some day, and you think 8 * this stuff is worth it, you can buy me a beer in return. 9 */ 10 11 /* 12 * Downloaded Sat Nov 5 17:43:06 CET 2011 at 13 * http://slre.sourceforge.net/1.0/slre.c 14 */ 15 16 #ifdef SLRE_TEST 17 #include <stdio.h> 18 #include <assert.h> 19 #include <ctype.h> 20 #include <stdlib.h> 21 #include <string.h> 22 #else 23 #include <common.h> 24 #include <linux/ctype.h> 25 #endif /* SLRE_TEST */ 26 27 #include <errno.h> 28 29 #include <slre.h> 30 31 enum {END, BRANCH, ANY, EXACT, ANYOF, ANYBUT, OPEN, CLOSE, BOL, EOL, 32 STAR, PLUS, STARQ, PLUSQ, QUEST, SPACE, NONSPACE, DIGIT}; 33 34 #ifdef SLRE_TEST 35 static struct { 36 const char *name; 37 int narg; 38 const char *flags; 39 } opcodes[] = { 40 {"END", 0, ""}, /* End of code block or program */ 41 {"BRANCH", 2, "oo"}, /* Alternative operator, "|" */ 42 {"ANY", 0, ""}, /* Match any character, "." */ 43 {"EXACT", 2, "d"}, /* Match exact string */ 44 {"ANYOF", 2, "D"}, /* Match any from set, "[]" */ 45 {"ANYBUT", 2, "D"}, /* Match any but from set, "[^]"*/ 46 {"OPEN ", 1, "i"}, /* Capture start, "(" */ 47 {"CLOSE", 1, "i"}, /* Capture end, ")" */ 48 {"BOL", 0, ""}, /* Beginning of string, "^" */ 49 {"EOL", 0, ""}, /* End of string, "$" */ 50 {"STAR", 1, "o"}, /* Match zero or more times "*" */ 51 {"PLUS", 1, "o"}, /* Match one or more times, "+" */ 52 {"STARQ", 1, "o"}, /* Non-greedy STAR, "*?" */ 53 {"PLUSQ", 1, "o"}, /* Non-greedy PLUS, "+?" */ 54 {"QUEST", 1, "o"}, /* Match zero or one time, "?" */ 55 {"SPACE", 0, ""}, /* Match whitespace, "\s" */ 56 {"NONSPACE", 0, ""}, /* Match non-space, "\S" */ 57 {"DIGIT", 0, ""} /* Match digit, "\d" */ 58 }; 59 #endif /* SLRE_TEST */ 60 61 /* 62 * Commands and operands are all unsigned char (1 byte long). All code offsets 63 * are relative to current address, and positive (always point forward). Data 64 * offsets are absolute. Commands with operands: 65 * 66 * BRANCH offset1 offset2 67 * Try to match the code block that follows the BRANCH instruction 68 * (code block ends with END). If no match, try to match code block that 69 * starts at offset1. If either of these match, jump to offset2. 70 * 71 * EXACT data_offset data_length 72 * Try to match exact string. String is recorded in data section from 73 * data_offset, and has length data_length. 74 * 75 * OPEN capture_number 76 * CLOSE capture_number 77 * If the user have passed 'struct cap' array for captures, OPEN 78 * records the beginning of the matched substring (cap->ptr), CLOSE 79 * sets the length (cap->len) for respective capture_number. 80 * 81 * STAR code_offset 82 * PLUS code_offset 83 * QUEST code_offset 84 * *, +, ?, respectively. Try to gobble as much as possible from the 85 * matched buffer, until code block that follows these instructions 86 * matches. When the longest possible string is matched, 87 * jump to code_offset 88 * 89 * STARQ, PLUSQ are non-greedy versions of STAR and PLUS. 90 */ 91 92 static const char *meta_chars = "|.^$*+?()[\\"; 93 94 #ifdef SLRE_TEST 95 96 static void 97 print_character_set(FILE *fp, const unsigned char *p, int len) 98 { 99 int i; 100 101 for (i = 0; i < len; i++) { 102 if (i > 0) 103 (void) fputc(',', fp); 104 if (p[i] == 0) { 105 i++; 106 if (p[i] == 0) 107 (void) fprintf(fp, "\\x%02x", p[i]); 108 else 109 (void) fprintf(fp, "%s", opcodes[p[i]].name); 110 } else if (isprint(p[i])) { 111 (void) fputc(p[i], fp); 112 } else { 113 (void) fprintf(fp, "\\x%02x", p[i]); 114 } 115 } 116 } 117 118 void 119 slre_dump(const struct slre *r, FILE *fp) 120 { 121 int i, j, ch, op, pc; 122 123 for (pc = 0; pc < r->code_size; pc++) { 124 125 op = r->code[pc]; 126 (void) fprintf(fp, "%3d %s ", pc, opcodes[op].name); 127 128 for (i = 0; opcodes[op].flags[i] != '\0'; i++) 129 switch (opcodes[op].flags[i]) { 130 case 'i': 131 (void) fprintf(fp, "%d ", r->code[pc + 1]); 132 pc++; 133 break; 134 case 'o': 135 (void) fprintf(fp, "%d ", 136 pc + r->code[pc + 1] - i); 137 pc++; 138 break; 139 case 'D': 140 print_character_set(fp, r->data + 141 r->code[pc + 1], r->code[pc + 2]); 142 pc += 2; 143 break; 144 case 'd': 145 (void) fputc('"', fp); 146 for (j = 0; j < r->code[pc + 2]; j++) { 147 ch = r->data[r->code[pc + 1] + j]; 148 if (isprint(ch)) { 149 (void) fputc(ch, fp); 150 } else { 151 (void) fprintf(fp, 152 "\\x%02x", ch); 153 } 154 } 155 (void) fputc('"', fp); 156 pc += 2; 157 break; 158 } 159 160 (void) fputc('\n', fp); 161 } 162 } 163 #endif /* SLRE_TEST */ 164 165 static void 166 set_jump_offset(struct slre *r, int pc, int offset) 167 { 168 assert(offset < r->code_size); 169 170 if (r->code_size - offset > 0xff) 171 r->err_str = "Jump offset is too big"; 172 else 173 r->code[pc] = (unsigned char) (r->code_size - offset); 174 } 175 176 static void 177 emit(struct slre *r, int code) 178 { 179 if (r->code_size >= (int) (sizeof(r->code) / sizeof(r->code[0]))) 180 r->err_str = "RE is too long (code overflow)"; 181 else 182 r->code[r->code_size++] = (unsigned char) code; 183 } 184 185 static void 186 store_char_in_data(struct slre *r, int ch) 187 { 188 if (r->data_size >= (int) sizeof(r->data)) 189 r->err_str = "RE is too long (data overflow)"; 190 else 191 r->data[r->data_size++] = ch; 192 } 193 194 static void 195 exact(struct slre *r, const char **re) 196 { 197 int old_data_size = r->data_size; 198 199 while (**re != '\0' && (strchr(meta_chars, **re)) == NULL) 200 store_char_in_data(r, *(*re)++); 201 202 emit(r, EXACT); 203 emit(r, old_data_size); 204 emit(r, r->data_size - old_data_size); 205 } 206 207 static int 208 get_escape_char(const char **re) 209 { 210 int res; 211 212 switch (*(*re)++) { 213 case 'n': 214 res = '\n'; 215 break; 216 case 'r': 217 res = '\r'; 218 break; 219 case 't': 220 res = '\t'; 221 break; 222 case '0': 223 res = 0; 224 break; 225 case 'S': 226 res = NONSPACE << 8; 227 break; 228 case 's': 229 res = SPACE << 8; 230 break; 231 case 'd': 232 res = DIGIT << 8; 233 break; 234 default: 235 res = (*re)[-1]; 236 break; 237 } 238 239 return res; 240 } 241 242 static void 243 anyof(struct slre *r, const char **re) 244 { 245 int esc, old_data_size = r->data_size, op = ANYOF; 246 247 if (**re == '^') { 248 op = ANYBUT; 249 (*re)++; 250 } 251 252 while (**re != '\0') 253 254 switch (*(*re)++) { 255 case ']': 256 emit(r, op); 257 emit(r, old_data_size); 258 emit(r, r->data_size - old_data_size); 259 return; 260 /* NOTREACHED */ 261 break; 262 case '\\': 263 esc = get_escape_char(re); 264 if ((esc & 0xff) == 0) { 265 store_char_in_data(r, 0); 266 store_char_in_data(r, esc >> 8); 267 } else { 268 store_char_in_data(r, esc); 269 } 270 break; 271 default: 272 store_char_in_data(r, (*re)[-1]); 273 break; 274 } 275 276 r->err_str = "No closing ']' bracket"; 277 } 278 279 static void 280 relocate(struct slre *r, int begin, int shift) 281 { 282 emit(r, END); 283 memmove(r->code + begin + shift, r->code + begin, r->code_size - begin); 284 r->code_size += shift; 285 } 286 287 static void 288 quantifier(struct slre *r, int prev, int op) 289 { 290 if (r->code[prev] == EXACT && r->code[prev + 2] > 1) { 291 r->code[prev + 2]--; 292 emit(r, EXACT); 293 emit(r, r->code[prev + 1] + r->code[prev + 2]); 294 emit(r, 1); 295 prev = r->code_size - 3; 296 } 297 relocate(r, prev, 2); 298 r->code[prev] = op; 299 set_jump_offset(r, prev + 1, prev); 300 } 301 302 static void 303 exact_one_char(struct slre *r, int ch) 304 { 305 emit(r, EXACT); 306 emit(r, r->data_size); 307 emit(r, 1); 308 store_char_in_data(r, ch); 309 } 310 311 static void 312 fixup_branch(struct slre *r, int fixup) 313 { 314 if (fixup > 0) { 315 emit(r, END); 316 set_jump_offset(r, fixup, fixup - 2); 317 } 318 } 319 320 static void 321 compile(struct slre *r, const char **re) 322 { 323 int op, esc, branch_start, last_op, fixup, cap_no, level; 324 325 fixup = 0; 326 level = r->num_caps; 327 branch_start = last_op = r->code_size; 328 329 for (;;) 330 switch (*(*re)++) { 331 case '\0': 332 (*re)--; 333 return; 334 /* NOTREACHED */ 335 break; 336 case '^': 337 emit(r, BOL); 338 break; 339 case '$': 340 emit(r, EOL); 341 break; 342 case '.': 343 last_op = r->code_size; 344 emit(r, ANY); 345 break; 346 case '[': 347 last_op = r->code_size; 348 anyof(r, re); 349 break; 350 case '\\': 351 last_op = r->code_size; 352 esc = get_escape_char(re); 353 if (esc & 0xff00) 354 emit(r, esc >> 8); 355 else 356 exact_one_char(r, esc); 357 break; 358 case '(': 359 last_op = r->code_size; 360 cap_no = ++r->num_caps; 361 emit(r, OPEN); 362 emit(r, cap_no); 363 364 compile(r, re); 365 if (*(*re)++ != ')') { 366 r->err_str = "No closing bracket"; 367 return; 368 } 369 370 emit(r, CLOSE); 371 emit(r, cap_no); 372 break; 373 case ')': 374 (*re)--; 375 fixup_branch(r, fixup); 376 if (level == 0) { 377 r->err_str = "Unbalanced brackets"; 378 return; 379 } 380 return; 381 /* NOTREACHED */ 382 break; 383 case '+': 384 case '*': 385 op = (*re)[-1] == '*' ? STAR : PLUS; 386 if (**re == '?') { 387 (*re)++; 388 op = op == STAR ? STARQ : PLUSQ; 389 } 390 quantifier(r, last_op, op); 391 break; 392 case '?': 393 quantifier(r, last_op, QUEST); 394 break; 395 case '|': 396 fixup_branch(r, fixup); 397 relocate(r, branch_start, 3); 398 r->code[branch_start] = BRANCH; 399 set_jump_offset(r, branch_start + 1, branch_start); 400 fixup = branch_start + 2; 401 r->code[fixup] = 0xff; 402 break; 403 default: 404 (*re)--; 405 last_op = r->code_size; 406 exact(r, re); 407 break; 408 } 409 } 410 411 int 412 slre_compile(struct slre *r, const char *re) 413 { 414 r->err_str = NULL; 415 r->code_size = r->data_size = r->num_caps = r->anchored = 0; 416 417 if (*re == '^') 418 r->anchored++; 419 420 emit(r, OPEN); /* This will capture what matches full RE */ 421 emit(r, 0); 422 423 while (*re != '\0') 424 compile(r, &re); 425 426 if (r->code[2] == BRANCH) 427 fixup_branch(r, 4); 428 429 emit(r, CLOSE); 430 emit(r, 0); 431 emit(r, END); 432 433 return (r->err_str == NULL ? 1 : 0); 434 } 435 436 static int match(const struct slre *, int, 437 const char *, int, int *, struct cap *); 438 439 static void 440 loop_greedy(const struct slre *r, int pc, const char *s, int len, int *ofs) 441 { 442 int saved_offset, matched_offset; 443 444 matched_offset = *ofs; 445 446 while (match(r, pc + 2, s, len, ofs, NULL)) { 447 saved_offset = *ofs; 448 if (match(r, pc + r->code[pc + 1], s, len, ofs, NULL)) 449 matched_offset = saved_offset; 450 *ofs = saved_offset; 451 } 452 453 *ofs = matched_offset; 454 } 455 456 static void 457 loop_non_greedy(const struct slre *r, int pc, const char *s, int len, int *ofs) 458 { 459 int saved_offset = *ofs; 460 461 while (match(r, pc + 2, s, len, ofs, NULL)) { 462 saved_offset = *ofs; 463 if (match(r, pc + r->code[pc + 1], s, len, ofs, NULL)) 464 break; 465 } 466 467 *ofs = saved_offset; 468 } 469 470 static int 471 is_any_of(const unsigned char *p, int len, const char *s, int *ofs) 472 { 473 int i, ch; 474 475 ch = s[*ofs]; 476 477 for (i = 0; i < len; i++) 478 if (p[i] == ch) { 479 (*ofs)++; 480 return 1; 481 } 482 483 return 0; 484 } 485 486 static int 487 is_any_but(const unsigned char *p, int len, const char *s, int *ofs) 488 { 489 int i, ch; 490 491 ch = s[*ofs]; 492 493 for (i = 0; i < len; i++) { 494 if (p[i] == ch) 495 return 0; 496 } 497 498 (*ofs)++; 499 return 1; 500 } 501 502 static int 503 match(const struct slre *r, int pc, const char *s, int len, 504 int *ofs, struct cap *caps) 505 { 506 int n, saved_offset, res = 1; 507 508 while (res && r->code[pc] != END) { 509 510 assert(pc < r->code_size); 511 assert(pc < (int) (sizeof(r->code) / sizeof(r->code[0]))); 512 513 switch (r->code[pc]) { 514 case BRANCH: 515 saved_offset = *ofs; 516 res = match(r, pc + 3, s, len, ofs, caps); 517 if (res == 0) { 518 *ofs = saved_offset; 519 res = match(r, pc + r->code[pc + 1], 520 s, len, ofs, caps); 521 } 522 pc += r->code[pc + 2]; 523 break; 524 case EXACT: 525 res = 0; 526 n = r->code[pc + 2]; /* String length */ 527 if (n <= len - *ofs && !memcmp(s + *ofs, r->data + 528 r->code[pc + 1], n)) { 529 (*ofs) += n; 530 res = 1; 531 } 532 pc += 3; 533 break; 534 case QUEST: 535 res = 1; 536 saved_offset = *ofs; 537 if (!match(r, pc + 2, s, len, ofs, caps)) 538 *ofs = saved_offset; 539 pc += r->code[pc + 1]; 540 break; 541 case STAR: 542 res = 1; 543 loop_greedy(r, pc, s, len, ofs); 544 pc += r->code[pc + 1]; 545 break; 546 case STARQ: 547 res = 1; 548 loop_non_greedy(r, pc, s, len, ofs); 549 pc += r->code[pc + 1]; 550 break; 551 case PLUS: 552 res = match(r, pc + 2, s, len, ofs, caps); 553 if (res == 0) 554 break; 555 556 loop_greedy(r, pc, s, len, ofs); 557 pc += r->code[pc + 1]; 558 break; 559 case PLUSQ: 560 res = match(r, pc + 2, s, len, ofs, caps); 561 if (res == 0) 562 break; 563 564 loop_non_greedy(r, pc, s, len, ofs); 565 pc += r->code[pc + 1]; 566 break; 567 case SPACE: 568 res = 0; 569 if (*ofs < len && isspace(((unsigned char *)s)[*ofs])) { 570 (*ofs)++; 571 res = 1; 572 } 573 pc++; 574 break; 575 case NONSPACE: 576 res = 0; 577 if (*ofs < len && 578 !isspace(((unsigned char *)s)[*ofs])) { 579 (*ofs)++; 580 res = 1; 581 } 582 pc++; 583 break; 584 case DIGIT: 585 res = 0; 586 if (*ofs < len && isdigit(((unsigned char *)s)[*ofs])) { 587 (*ofs)++; 588 res = 1; 589 } 590 pc++; 591 break; 592 case ANY: 593 res = 0; 594 if (*ofs < len) { 595 (*ofs)++; 596 res = 1; 597 } 598 pc++; 599 break; 600 case ANYOF: 601 res = 0; 602 if (*ofs < len) 603 res = is_any_of(r->data + r->code[pc + 1], 604 r->code[pc + 2], s, ofs); 605 pc += 3; 606 break; 607 case ANYBUT: 608 res = 0; 609 if (*ofs < len) 610 res = is_any_but(r->data + r->code[pc + 1], 611 r->code[pc + 2], s, ofs); 612 pc += 3; 613 break; 614 case BOL: 615 res = *ofs == 0 ? 1 : 0; 616 pc++; 617 break; 618 case EOL: 619 res = *ofs == len ? 1 : 0; 620 pc++; 621 break; 622 case OPEN: 623 if (caps != NULL) 624 caps[r->code[pc + 1]].ptr = s + *ofs; 625 pc += 2; 626 break; 627 case CLOSE: 628 if (caps != NULL) 629 caps[r->code[pc + 1]].len = (s + *ofs) - 630 caps[r->code[pc + 1]].ptr; 631 pc += 2; 632 break; 633 case END: 634 pc++; 635 break; 636 default: 637 printf("unknown cmd (%d) at %d\n", r->code[pc], pc); 638 assert(0); 639 break; 640 } 641 } 642 643 return res; 644 } 645 646 int 647 slre_match(const struct slre *r, const char *buf, int len, 648 struct cap *caps) 649 { 650 int i, ofs = 0, res = 0; 651 652 if (r->anchored) { 653 res = match(r, 0, buf, len, &ofs, caps); 654 } else { 655 for (i = 0; i < len && res == 0; i++) { 656 ofs = i; 657 res = match(r, 0, buf, len, &ofs, caps); 658 } 659 } 660 661 return res; 662 } 663 664 #ifdef SLRE_TEST 665 #define N_CAPS 5 666 667 int main(int argc, char *argv[]) 668 { 669 struct slre slre; 670 struct cap caps[N_CAPS]; 671 unsigned char data[1 * 1024 * 1024]; 672 FILE *fp; 673 int i, res, len; 674 675 if (argc < 2) { 676 fprintf(stderr, "Usage: %s 'slre' <file>\n", argv[0]); 677 return 1; 678 } 679 680 fp = fopen(argv[2], "rb"); 681 if (fp == NULL) { 682 fprintf(stderr, "Error: cannot open %s:%s\n", 683 argv[2], strerror(errno)); 684 return 1; 685 } 686 687 if (!slre_compile(&slre, argv[1])) { 688 fprintf(stderr, "Error compiling slre: %s\n", slre.err_str); 689 return 1; 690 } 691 692 slre_dump(&slre, stderr); 693 694 while (fgets(data, sizeof(data), fp) != NULL) { 695 len = strlen(data); 696 697 if ((len > 0) && (data[len-1] == '\n')) { 698 data[len-1] = '\0'; 699 --len; 700 } 701 702 printf("Data = \"%s\"\n", data); 703 704 (void) memset(caps, 0, sizeof(caps)); 705 706 res = slre_match(&slre, data, len, caps); 707 printf("Result [%d]: %d\n", i, res); 708 709 for (i = 0; i < N_CAPS; i++) { 710 if (caps[i].len > 0) { 711 printf("Substring %d: len=%d [%.*s]\n", i, 712 caps[i].len, 713 caps[i].len, caps[i].ptr); 714 } 715 } 716 printf("----------------------------------------------------\n"); 717 } 718 (void) fclose(fp); 719 720 return 0; 721 } 722 #endif /* SLRE_TEST */ 723