xref: /openbmc/u-boot/lib/slre.c (revision b94a8271cc42086f3f75941c15bba265f409601d)
1  /*
2   * Copyright (c) 2004-2005 Sergey Lyubka <valenok@gmail.com>
3   * All rights reserved
4   *
5   * "THE BEER-WARE LICENSE" (Revision 42):
6   * Sergey Lyubka wrote this file.  As long as you retain this notice you
7   * can do whatever you want with this stuff. If we meet some day, and you think
8   * this stuff is worth it, you can buy me a beer in return.
9   */
10  
11  /*
12   * Downloaded Sat Nov  5 17:43:06 CET 2011 at
13   * http://slre.sourceforge.net/1.0/slre.c
14   */
15  
16  #ifdef SLRE_TEST
17  #include <stdio.h>
18  #include <assert.h>
19  #include <ctype.h>
20  #include <stdlib.h>
21  #include <string.h>
22  #else
23  #include <common.h>
24  #include <linux/ctype.h>
25  #endif /* SLRE_TEST */
26  
27  #include <errno.h>
28  
29  #include <slre.h>
30  
31  enum {END, BRANCH, ANY, EXACT, ANYOF, ANYBUT, OPEN, CLOSE, BOL, EOL,
32  	STAR, PLUS, STARQ, PLUSQ, QUEST, SPACE, NONSPACE, DIGIT};
33  
34  #ifdef SLRE_TEST
35  static struct {
36  	const char	*name;
37  	int		narg;
38  	const char	*flags;
39  } opcodes[] = {
40  	{"END",		0, ""},		/* End of code block or program	*/
41  	{"BRANCH",	2, "oo"},	/* Alternative operator, "|"	*/
42  	{"ANY",		0, ""},		/* Match any character, "."	*/
43  	{"EXACT",	2, "d"},	/* Match exact string		*/
44  	{"ANYOF",	2, "D"},	/* Match any from set, "[]"	*/
45  	{"ANYBUT",	2, "D"},	/* Match any but from set, "[^]"*/
46  	{"OPEN ",	1, "i"},	/* Capture start, "("		*/
47  	{"CLOSE",	1, "i"},	/* Capture end, ")"		*/
48  	{"BOL",		0, ""},		/* Beginning of string, "^"	*/
49  	{"EOL",		0, ""},		/* End of string, "$"		*/
50  	{"STAR",	1, "o"},	/* Match zero or more times "*"	*/
51  	{"PLUS",	1, "o"},	/* Match one or more times, "+"	*/
52  	{"STARQ",	1, "o"},	/* Non-greedy STAR,  "*?"	*/
53  	{"PLUSQ",	1, "o"},	/* Non-greedy PLUS, "+?"	*/
54  	{"QUEST",	1, "o"},	/* Match zero or one time, "?"	*/
55  	{"SPACE",	0, ""},		/* Match whitespace, "\s"	*/
56  	{"NONSPACE",	0, ""},		/* Match non-space, "\S"	*/
57  	{"DIGIT",	0, ""}		/* Match digit, "\d"		*/
58  };
59  #endif /* SLRE_TEST */
60  
61  /*
62   * Commands and operands are all unsigned char (1 byte long). All code offsets
63   * are relative to current address, and positive (always point forward). Data
64   * offsets are absolute. Commands with operands:
65   *
66   * BRANCH offset1 offset2
67   *	Try to match the code block that follows the BRANCH instruction
68   *	(code block ends with END). If no match, try to match code block that
69   *	starts at offset1. If either of these match, jump to offset2.
70   *
71   * EXACT data_offset data_length
72   *	Try to match exact string. String is recorded in data section from
73   *	data_offset, and has length data_length.
74   *
75   * OPEN capture_number
76   * CLOSE capture_number
77   *	If the user have passed 'struct cap' array for captures, OPEN
78   *	records the beginning of the matched substring (cap->ptr), CLOSE
79   *	sets the length (cap->len) for respective capture_number.
80   *
81   * STAR code_offset
82   * PLUS code_offset
83   * QUEST code_offset
84   *	*, +, ?, respectively. Try to gobble as much as possible from the
85   *	matched buffer, until code block that follows these instructions
86   *	matches. When the longest possible string is matched,
87   *	jump to code_offset
88   *
89   * STARQ, PLUSQ are non-greedy versions of STAR and PLUS.
90   */
91  
92  static const char *meta_chars = "|.^$*+?()[\\";
93  
94  #ifdef SLRE_TEST
95  
96  static void
97  print_character_set(FILE *fp, const unsigned char *p, int len)
98  {
99  	int	i;
100  
101  	for (i = 0; i < len; i++) {
102  		if (i > 0)
103  			(void) fputc(',', fp);
104  		if (p[i] == 0) {
105  			i++;
106  			if (p[i] == 0)
107  				(void) fprintf(fp, "\\x%02x", p[i]);
108  			else
109  				(void) fprintf(fp, "%s", opcodes[p[i]].name);
110  		} else if (isprint(p[i])) {
111  			(void) fputc(p[i], fp);
112  		} else {
113  			(void) fprintf(fp, "\\x%02x", p[i]);
114  		}
115  	}
116  }
117  
118  void
119  slre_dump(const struct slre *r, FILE *fp)
120  {
121  	int	i, j, ch, op, pc;
122  
123  	for (pc = 0; pc < r->code_size; pc++) {
124  
125  		op = r->code[pc];
126  		(void) fprintf(fp, "%3d %s ", pc, opcodes[op].name);
127  
128  		for (i = 0; opcodes[op].flags[i] != '\0'; i++)
129  			switch (opcodes[op].flags[i]) {
130  			case 'i':
131  				(void) fprintf(fp, "%d ", r->code[pc + 1]);
132  				pc++;
133  				break;
134  			case 'o':
135  				(void) fprintf(fp, "%d ",
136  				    pc + r->code[pc + 1] - i);
137  				pc++;
138  				break;
139  			case 'D':
140  				print_character_set(fp, r->data +
141  				    r->code[pc + 1], r->code[pc + 2]);
142  				pc += 2;
143  				break;
144  			case 'd':
145  				(void) fputc('"', fp);
146  				for (j = 0; j < r->code[pc + 2]; j++) {
147  					ch = r->data[r->code[pc + 1] + j];
148  					if (isprint(ch)) {
149  						(void) fputc(ch, fp);
150  					} else {
151  						(void) fprintf(fp,
152  							"\\x%02x", ch);
153  					}
154  				}
155  				(void) fputc('"', fp);
156  				pc += 2;
157  				break;
158  			}
159  
160  		(void) fputc('\n', fp);
161  	}
162  }
163  #endif /* SLRE_TEST */
164  
165  static void
166  set_jump_offset(struct slre *r, int pc, int offset)
167  {
168  	assert(offset < r->code_size);
169  
170  	if (r->code_size - offset > 0xff)
171  		r->err_str = "Jump offset is too big";
172  	else
173  		r->code[pc] = (unsigned char) (r->code_size - offset);
174  }
175  
176  static void
177  emit(struct slre *r, int code)
178  {
179  	if (r->code_size >= (int) (sizeof(r->code) / sizeof(r->code[0])))
180  		r->err_str = "RE is too long (code overflow)";
181  	else
182  		r->code[r->code_size++] = (unsigned char) code;
183  }
184  
185  static void
186  store_char_in_data(struct slre *r, int ch)
187  {
188  	if (r->data_size >= (int) sizeof(r->data))
189  		r->err_str = "RE is too long (data overflow)";
190  	else
191  		r->data[r->data_size++] = ch;
192  }
193  
194  static void
195  exact(struct slre *r, const char **re)
196  {
197  	int	old_data_size = r->data_size;
198  
199  	while (**re != '\0' && (strchr(meta_chars, **re)) == NULL)
200  		store_char_in_data(r, *(*re)++);
201  
202  	emit(r, EXACT);
203  	emit(r, old_data_size);
204  	emit(r, r->data_size - old_data_size);
205  }
206  
207  static int
208  get_escape_char(const char **re)
209  {
210  	int	res;
211  
212  	switch (*(*re)++) {
213  	case 'n':
214  		res = '\n';
215  		break;
216  	case 'r':
217  		res = '\r';
218  		break;
219  	case 't':
220  		res = '\t';
221  		break;
222  	case '0':
223  		res = 0;
224  		break;
225  	case 'S':
226  		res = NONSPACE << 8;
227  		break;
228  	case 's':
229  		res = SPACE << 8;
230  		break;
231  	case 'd':
232  		res = DIGIT << 8;
233  		break;
234  	default:
235  		res = (*re)[-1];
236  		break;
237  	}
238  
239  	return res;
240  }
241  
242  static void
243  anyof(struct slre *r, const char **re)
244  {
245  	int	esc, old_data_size = r->data_size, op = ANYOF;
246  
247  	if (**re == '^') {
248  		op = ANYBUT;
249  		(*re)++;
250  	}
251  
252  	while (**re != '\0')
253  
254  		switch (*(*re)++) {
255  		case ']':
256  			emit(r, op);
257  			emit(r, old_data_size);
258  			emit(r, r->data_size - old_data_size);
259  			return;
260  			/* NOTREACHED */
261  			break;
262  		case '\\':
263  			esc = get_escape_char(re);
264  			if ((esc & 0xff) == 0) {
265  				store_char_in_data(r, 0);
266  				store_char_in_data(r, esc >> 8);
267  			} else {
268  				store_char_in_data(r, esc);
269  			}
270  			break;
271  		default:
272  			store_char_in_data(r, (*re)[-1]);
273  			break;
274  		}
275  
276  	r->err_str = "No closing ']' bracket";
277  }
278  
279  static void
280  relocate(struct slre *r, int begin, int shift)
281  {
282  	emit(r, END);
283  	memmove(r->code + begin + shift, r->code + begin, r->code_size - begin);
284  	r->code_size += shift;
285  }
286  
287  static void
288  quantifier(struct slre *r, int prev, int op)
289  {
290  	if (r->code[prev] == EXACT && r->code[prev + 2] > 1) {
291  		r->code[prev + 2]--;
292  		emit(r, EXACT);
293  		emit(r, r->code[prev + 1] + r->code[prev + 2]);
294  		emit(r, 1);
295  		prev = r->code_size - 3;
296  	}
297  	relocate(r, prev, 2);
298  	r->code[prev] = op;
299  	set_jump_offset(r, prev + 1, prev);
300  }
301  
302  static void
303  exact_one_char(struct slre *r, int ch)
304  {
305  	emit(r, EXACT);
306  	emit(r, r->data_size);
307  	emit(r, 1);
308  	store_char_in_data(r, ch);
309  }
310  
311  static void
312  fixup_branch(struct slre *r, int fixup)
313  {
314  	if (fixup > 0) {
315  		emit(r, END);
316  		set_jump_offset(r, fixup, fixup - 2);
317  	}
318  }
319  
320  static void
321  compile(struct slre *r, const char **re)
322  {
323  	int	op, esc, branch_start, last_op, fixup, cap_no, level;
324  
325  	fixup = 0;
326  	level = r->num_caps;
327  	branch_start = last_op = r->code_size;
328  
329  	for (;;)
330  		switch (*(*re)++) {
331  		case '\0':
332  			(*re)--;
333  			return;
334  			/* NOTREACHED */
335  			break;
336  		case '^':
337  			emit(r, BOL);
338  			break;
339  		case '$':
340  			emit(r, EOL);
341  			break;
342  		case '.':
343  			last_op = r->code_size;
344  			emit(r, ANY);
345  			break;
346  		case '[':
347  			last_op = r->code_size;
348  			anyof(r, re);
349  			break;
350  		case '\\':
351  			last_op = r->code_size;
352  			esc = get_escape_char(re);
353  			if (esc & 0xff00)
354  				emit(r, esc >> 8);
355  			else
356  				exact_one_char(r, esc);
357  			break;
358  		case '(':
359  			last_op = r->code_size;
360  			cap_no = ++r->num_caps;
361  			emit(r, OPEN);
362  			emit(r, cap_no);
363  
364  			compile(r, re);
365  			if (*(*re)++ != ')') {
366  				r->err_str = "No closing bracket";
367  				return;
368  			}
369  
370  			emit(r, CLOSE);
371  			emit(r, cap_no);
372  			break;
373  		case ')':
374  			(*re)--;
375  			fixup_branch(r, fixup);
376  			if (level == 0) {
377  				r->err_str = "Unbalanced brackets";
378  				return;
379  			}
380  			return;
381  			/* NOTREACHED */
382  			break;
383  		case '+':
384  		case '*':
385  			op = (*re)[-1] == '*' ? STAR : PLUS;
386  			if (**re == '?') {
387  				(*re)++;
388  				op = op == STAR ? STARQ : PLUSQ;
389  			}
390  			quantifier(r, last_op, op);
391  			break;
392  		case '?':
393  			quantifier(r, last_op, QUEST);
394  			break;
395  		case '|':
396  			fixup_branch(r, fixup);
397  			relocate(r, branch_start, 3);
398  			r->code[branch_start] = BRANCH;
399  			set_jump_offset(r, branch_start + 1, branch_start);
400  			fixup = branch_start + 2;
401  			r->code[fixup] = 0xff;
402  			break;
403  		default:
404  			(*re)--;
405  			last_op = r->code_size;
406  			exact(r, re);
407  			break;
408  		}
409  }
410  
411  int
412  slre_compile(struct slre *r, const char *re)
413  {
414  	r->err_str = NULL;
415  	r->code_size = r->data_size = r->num_caps = r->anchored = 0;
416  
417  	if (*re == '^')
418  		r->anchored++;
419  
420  	emit(r, OPEN);	/* This will capture what matches full RE */
421  	emit(r, 0);
422  
423  	while (*re != '\0')
424  		compile(r, &re);
425  
426  	if (r->code[2] == BRANCH)
427  		fixup_branch(r, 4);
428  
429  	emit(r, CLOSE);
430  	emit(r, 0);
431  	emit(r, END);
432  
433  	return (r->err_str == NULL ? 1 : 0);
434  }
435  
436  static int match(const struct slre *, int,
437  		const char *, int, int *, struct cap *);
438  
439  static void
440  loop_greedy(const struct slre *r, int pc, const char *s, int len, int *ofs)
441  {
442  	int	saved_offset, matched_offset;
443  
444  	matched_offset = *ofs;
445  
446  	while (match(r, pc + 2, s, len, ofs, NULL)) {
447  		saved_offset = *ofs;
448  		if (match(r, pc + r->code[pc + 1], s, len, ofs, NULL))
449  			matched_offset = saved_offset;
450  		*ofs = saved_offset;
451  	}
452  
453  	*ofs = matched_offset;
454  }
455  
456  static void
457  loop_non_greedy(const struct slre *r, int pc, const char *s, int len, int *ofs)
458  {
459  	int	saved_offset = *ofs;
460  
461  	while (match(r, pc + 2, s, len, ofs, NULL)) {
462  		saved_offset = *ofs;
463  		if (match(r, pc + r->code[pc + 1], s, len, ofs, NULL))
464  			break;
465  	}
466  
467  	*ofs = saved_offset;
468  }
469  
470  static int
471  is_any_of(const unsigned char *p, int len, const char *s, int *ofs)
472  {
473  	int	i, ch;
474  
475  	ch = s[*ofs];
476  
477  	for (i = 0; i < len; i++)
478  		if (p[i] == ch) {
479  			(*ofs)++;
480  			return 1;
481  		}
482  
483  	return 0;
484  }
485  
486  static int
487  is_any_but(const unsigned char *p, int len, const char *s, int *ofs)
488  {
489  	int	i, ch;
490  
491  	ch = s[*ofs];
492  
493  	for (i = 0; i < len; i++) {
494  		if (p[i] == ch)
495  			return 0;
496  	}
497  
498  	(*ofs)++;
499  	return 1;
500  }
501  
502  static int
503  match(const struct slre *r, int pc, const char *s, int len,
504  		int *ofs, struct cap *caps)
505  {
506  	int	n, saved_offset, res = 1;
507  
508  	while (res && r->code[pc] != END) {
509  
510  		assert(pc < r->code_size);
511  		assert(pc < (int) (sizeof(r->code) / sizeof(r->code[0])));
512  
513  		switch (r->code[pc]) {
514  		case BRANCH:
515  			saved_offset = *ofs;
516  			res = match(r, pc + 3, s, len, ofs, caps);
517  			if (res == 0) {
518  				*ofs = saved_offset;
519  				res = match(r, pc + r->code[pc + 1],
520  				    s, len, ofs, caps);
521  			}
522  			pc += r->code[pc + 2];
523  			break;
524  		case EXACT:
525  			res = 0;
526  			n = r->code[pc + 2];	/* String length */
527  			if (n <= len - *ofs && !memcmp(s + *ofs, r->data +
528  			    r->code[pc + 1], n)) {
529  				(*ofs) += n;
530  				res = 1;
531  			}
532  			pc += 3;
533  			break;
534  		case QUEST:
535  			res = 1;
536  			saved_offset = *ofs;
537  			if (!match(r, pc + 2, s, len, ofs, caps))
538  				*ofs = saved_offset;
539  			pc += r->code[pc + 1];
540  			break;
541  		case STAR:
542  			res = 1;
543  			loop_greedy(r, pc, s, len, ofs);
544  			pc += r->code[pc + 1];
545  			break;
546  		case STARQ:
547  			res = 1;
548  			loop_non_greedy(r, pc, s, len, ofs);
549  			pc += r->code[pc + 1];
550  			break;
551  		case PLUS:
552  			res = match(r, pc + 2, s, len, ofs, caps);
553  			if (res == 0)
554  				break;
555  
556  			loop_greedy(r, pc, s, len, ofs);
557  			pc += r->code[pc + 1];
558  			break;
559  		case PLUSQ:
560  			res = match(r, pc + 2, s, len, ofs, caps);
561  			if (res == 0)
562  				break;
563  
564  			loop_non_greedy(r, pc, s, len, ofs);
565  			pc += r->code[pc + 1];
566  			break;
567  		case SPACE:
568  			res = 0;
569  			if (*ofs < len && isspace(((unsigned char *)s)[*ofs])) {
570  				(*ofs)++;
571  				res = 1;
572  			}
573  			pc++;
574  			break;
575  		case NONSPACE:
576  			res = 0;
577  			if (*ofs < len &&
578  					!isspace(((unsigned char *)s)[*ofs])) {
579  				(*ofs)++;
580  				res = 1;
581  			}
582  			pc++;
583  			break;
584  		case DIGIT:
585  			res = 0;
586  			if (*ofs < len && isdigit(((unsigned char *)s)[*ofs])) {
587  				(*ofs)++;
588  				res = 1;
589  			}
590  			pc++;
591  			break;
592  		case ANY:
593  			res = 0;
594  			if (*ofs < len) {
595  				(*ofs)++;
596  				res = 1;
597  			}
598  			pc++;
599  			break;
600  		case ANYOF:
601  			res = 0;
602  			if (*ofs < len)
603  				res = is_any_of(r->data + r->code[pc + 1],
604  					r->code[pc + 2], s, ofs);
605  			pc += 3;
606  			break;
607  		case ANYBUT:
608  			res = 0;
609  			if (*ofs < len)
610  				res = is_any_but(r->data + r->code[pc + 1],
611  					r->code[pc + 2], s, ofs);
612  			pc += 3;
613  			break;
614  		case BOL:
615  			res = *ofs == 0 ? 1 : 0;
616  			pc++;
617  			break;
618  		case EOL:
619  			res = *ofs == len ? 1 : 0;
620  			pc++;
621  			break;
622  		case OPEN:
623  			if (caps != NULL)
624  				caps[r->code[pc + 1]].ptr = s + *ofs;
625  			pc += 2;
626  			break;
627  		case CLOSE:
628  			if (caps != NULL)
629  				caps[r->code[pc + 1]].len = (s + *ofs) -
630  				    caps[r->code[pc + 1]].ptr;
631  			pc += 2;
632  			break;
633  		case END:
634  			pc++;
635  			break;
636  		default:
637  			printf("unknown cmd (%d) at %d\n", r->code[pc], pc);
638  			assert(0);
639  			break;
640  		}
641  	}
642  
643  	return res;
644  }
645  
646  int
647  slre_match(const struct slre *r, const char *buf, int len,
648  		struct cap *caps)
649  {
650  	int	i, ofs = 0, res = 0;
651  
652  	if (r->anchored) {
653  		res = match(r, 0, buf, len, &ofs, caps);
654  	} else {
655  		for (i = 0; i < len && res == 0; i++) {
656  			ofs = i;
657  			res = match(r, 0, buf, len, &ofs, caps);
658  		}
659  	}
660  
661  	return res;
662  }
663  
664  #ifdef SLRE_TEST
665  #define N_CAPS	5
666  
667  int main(int argc, char *argv[])
668  {
669  	struct slre	slre;
670  	struct cap	caps[N_CAPS];
671  	unsigned char	data[1 * 1024 * 1024];
672  	FILE		*fp;
673  	int		i, res, len;
674  
675  	if (argc < 2) {
676  		fprintf(stderr, "Usage: %s 'slre' <file>\n", argv[0]);
677  		return 1;
678  	}
679  
680  	fp = fopen(argv[2], "rb");
681  	if (fp == NULL) {
682  		fprintf(stderr, "Error: cannot open %s:%s\n",
683  			argv[2], strerror(errno));
684  		return 1;
685  	}
686  
687  	if (!slre_compile(&slre, argv[1])) {
688  		fprintf(stderr, "Error compiling slre: %s\n", slre.err_str);
689  		return 1;
690  	}
691  
692  	slre_dump(&slre, stderr);
693  
694  	while (fgets(data, sizeof(data), fp) != NULL) {
695  		len = strlen(data);
696  
697  		if ((len > 0) && (data[len-1] == '\n')) {
698  			data[len-1] = '\0';
699  			--len;
700  		}
701  
702  		printf("Data = \"%s\"\n", data);
703  
704  		(void) memset(caps, 0, sizeof(caps));
705  
706  		res = slre_match(&slre, data, len, caps);
707  		printf("Result [%d]: %d\n", i, res);
708  
709  		for (i = 0; i < N_CAPS; i++) {
710  			if (caps[i].len > 0) {
711  				printf("Substring %d: len=%d  [%.*s]\n", i,
712  					caps[i].len,
713  					caps[i].len, caps[i].ptr);
714  			}
715  		}
716  		printf("----------------------------------------------------\n");
717  	}
718  	(void) fclose(fp);
719  
720  	return 0;
721  }
722  #endif /* SLRE_TEST */
723