xref: /openbmc/linux/lib/ts_bm.c (revision c900529f3d9161bfde5cca0754f83b4d3c3e0220)
12874c5fdSThomas Gleixner // SPDX-License-Identifier: GPL-2.0-or-later
28082e4edSPablo Neira Ayuso /*
38082e4edSPablo Neira Ayuso  * lib/ts_bm.c		Boyer-Moore text search implementation
48082e4edSPablo Neira Ayuso  *
58082e4edSPablo Neira Ayuso  * Authors:	Pablo Neira Ayuso <pablo@eurodev.net>
68082e4edSPablo Neira Ayuso  *
78082e4edSPablo Neira Ayuso  * ==========================================================================
88082e4edSPablo Neira Ayuso  *
98082e4edSPablo Neira Ayuso  *   Implements Boyer-Moore string matching algorithm:
108082e4edSPablo Neira Ayuso  *
118082e4edSPablo Neira Ayuso  *   [1] A Fast String Searching Algorithm, R.S. Boyer and Moore.
128082e4edSPablo Neira Ayuso  *       Communications of the Association for Computing Machinery,
138082e4edSPablo Neira Ayuso  *       20(10), 1977, pp. 762-772.
14d89775fcSAlexander A. Klimov  *       https://www.cs.utexas.edu/users/moore/publications/fstrpos.pdf
158082e4edSPablo Neira Ayuso  *
168082e4edSPablo Neira Ayuso  *   [2] Handbook of Exact String Matching Algorithms, Thierry Lecroq, 2004
178082e4edSPablo Neira Ayuso  *       http://www-igm.univ-mlv.fr/~lecroq/string/string.pdf
188082e4edSPablo Neira Ayuso  *
198082e4edSPablo Neira Ayuso  *   Note: Since Boyer-Moore (BM) performs searches for matchings from right
208082e4edSPablo Neira Ayuso  *   to left, it's still possible that a matching could be spread over
218082e4edSPablo Neira Ayuso  *   multiple blocks, in that case this algorithm won't find any coincidence.
228082e4edSPablo Neira Ayuso  *
238082e4edSPablo Neira Ayuso  *   If you're willing to ensure that such thing won't ever happen, use the
248082e4edSPablo Neira Ayuso  *   Knuth-Pratt-Morris (KMP) implementation instead. In conclusion, choose
258082e4edSPablo Neira Ayuso  *   the proper string search algorithm depending on your setting.
268082e4edSPablo Neira Ayuso  *
278082e4edSPablo Neira Ayuso  *   Say you're using the textsearch infrastructure for filtering, NIDS or
288082e4edSPablo Neira Ayuso  *   any similar security focused purpose, then go KMP. Otherwise, if you
298082e4edSPablo Neira Ayuso  *   really care about performance, say you're classifying packets to apply
308082e4edSPablo Neira Ayuso  *   Quality of Service (QoS) policies, and you don't mind about possible
318082e4edSPablo Neira Ayuso  *   matchings spread over multiple fragments, then go BM.
328082e4edSPablo Neira Ayuso  */
338082e4edSPablo Neira Ayuso 
348082e4edSPablo Neira Ayuso #include <linux/kernel.h>
358082e4edSPablo Neira Ayuso #include <linux/module.h>
368082e4edSPablo Neira Ayuso #include <linux/types.h>
378082e4edSPablo Neira Ayuso #include <linux/string.h>
383b76d081SJoonwoo Park #include <linux/ctype.h>
398082e4edSPablo Neira Ayuso #include <linux/textsearch.h>
408082e4edSPablo Neira Ayuso 
418082e4edSPablo Neira Ayuso /* Alphabet size, use ASCII */
428082e4edSPablo Neira Ayuso #define ASIZE 256
438082e4edSPablo Neira Ayuso 
448082e4edSPablo Neira Ayuso #if 0
458082e4edSPablo Neira Ayuso #define DEBUGP printk
468082e4edSPablo Neira Ayuso #else
478082e4edSPablo Neira Ayuso #define DEBUGP(args, format...)
488082e4edSPablo Neira Ayuso #endif
498082e4edSPablo Neira Ayuso 
508082e4edSPablo Neira Ayuso struct ts_bm
518082e4edSPablo Neira Ayuso {
528082e4edSPablo Neira Ayuso 	u8 *		pattern;
538082e4edSPablo Neira Ayuso 	unsigned int	patlen;
548082e4edSPablo Neira Ayuso 	unsigned int 	bad_shift[ASIZE];
55c6e2ac3bSGustavo A. R. Silva 	unsigned int	good_shift[];
568082e4edSPablo Neira Ayuso };
578082e4edSPablo Neira Ayuso 
matchpat(const u8 * pattern,unsigned int patlen,const u8 * text,bool icase)58*86e9c9aaSJeremy Sowden static unsigned int matchpat(const u8 *pattern, unsigned int patlen,
59*86e9c9aaSJeremy Sowden 			     const u8 *text, bool icase)
60*86e9c9aaSJeremy Sowden {
61*86e9c9aaSJeremy Sowden 	unsigned int i;
62*86e9c9aaSJeremy Sowden 
63*86e9c9aaSJeremy Sowden 	for (i = 0; i < patlen; i++) {
64*86e9c9aaSJeremy Sowden 		u8 t = *(text-i);
65*86e9c9aaSJeremy Sowden 
66*86e9c9aaSJeremy Sowden 		if (icase)
67*86e9c9aaSJeremy Sowden 			t = toupper(t);
68*86e9c9aaSJeremy Sowden 
69*86e9c9aaSJeremy Sowden 		if (t != *(pattern-i))
70*86e9c9aaSJeremy Sowden 			break;
71*86e9c9aaSJeremy Sowden 	}
72*86e9c9aaSJeremy Sowden 
73*86e9c9aaSJeremy Sowden 	return i;
74*86e9c9aaSJeremy Sowden }
75*86e9c9aaSJeremy Sowden 
bm_find(struct ts_config * conf,struct ts_state * state)768082e4edSPablo Neira Ayuso static unsigned int bm_find(struct ts_config *conf, struct ts_state *state)
778082e4edSPablo Neira Ayuso {
788082e4edSPablo Neira Ayuso 	struct ts_bm *bm = ts_config_priv(conf);
798082e4edSPablo Neira Ayuso 	unsigned int i, text_len, consumed = state->offset;
808082e4edSPablo Neira Ayuso 	const u8 *text;
816f67fbf8SJeremy Sowden 	int bs;
823b76d081SJoonwoo Park 	const u8 icase = conf->flags & TS_IGNORECASE;
838082e4edSPablo Neira Ayuso 
848082e4edSPablo Neira Ayuso 	for (;;) {
856f67fbf8SJeremy Sowden 		int shift = bm->patlen - 1;
866f67fbf8SJeremy Sowden 
878082e4edSPablo Neira Ayuso 		text_len = conf->get_next_block(consumed, &text, conf, state);
888082e4edSPablo Neira Ayuso 
898082e4edSPablo Neira Ayuso 		if (unlikely(text_len == 0))
908082e4edSPablo Neira Ayuso 			break;
918082e4edSPablo Neira Ayuso 
928082e4edSPablo Neira Ayuso 		while (shift < text_len) {
938082e4edSPablo Neira Ayuso 			DEBUGP("Searching in position %d (%c)\n",
948082e4edSPablo Neira Ayuso 			       shift, text[shift]);
958082e4edSPablo Neira Ayuso 
96*86e9c9aaSJeremy Sowden 			i = matchpat(&bm->pattern[bm->patlen-1], bm->patlen,
97*86e9c9aaSJeremy Sowden 				     &text[shift], icase);
98*86e9c9aaSJeremy Sowden 			if (i == bm->patlen) {
998082e4edSPablo Neira Ayuso 				/* London calling... */
1008082e4edSPablo Neira Ayuso 				DEBUGP("found!\n");
1014a70ce5fSColin Ian King 				return consumed + (shift-(bm->patlen-1));
102*86e9c9aaSJeremy Sowden 			}
1038082e4edSPablo Neira Ayuso 
104*86e9c9aaSJeremy Sowden 			bs = bm->bad_shift[text[shift-i]];
1058082e4edSPablo Neira Ayuso 
1068082e4edSPablo Neira Ayuso 			/* Now jumping to... */
1078082e4edSPablo Neira Ayuso 			shift = max_t(int, shift-i+bs, shift+bm->good_shift[i]);
1088082e4edSPablo Neira Ayuso 		}
1098082e4edSPablo Neira Ayuso 		consumed += text_len;
1108082e4edSPablo Neira Ayuso 	}
1118082e4edSPablo Neira Ayuso 
1128082e4edSPablo Neira Ayuso 	return UINT_MAX;
1138082e4edSPablo Neira Ayuso }
1148082e4edSPablo Neira Ayuso 
subpattern(u8 * pattern,int i,int j,int g)1153f330317SPablo Neira Ayuso static int subpattern(u8 *pattern, int i, int j, int g)
1163f330317SPablo Neira Ayuso {
1173f330317SPablo Neira Ayuso 	int x = i+g-1, y = j+g-1, ret = 0;
1183f330317SPablo Neira Ayuso 
1193f330317SPablo Neira Ayuso 	while(pattern[x--] == pattern[y--]) {
1203f330317SPablo Neira Ayuso 		if (y < 0) {
1213f330317SPablo Neira Ayuso 			ret = 1;
1223f330317SPablo Neira Ayuso 			break;
1233f330317SPablo Neira Ayuso 		}
1243f330317SPablo Neira Ayuso 		if (--g == 0) {
1253f330317SPablo Neira Ayuso 			ret = pattern[i-1] != pattern[j-1];
1263f330317SPablo Neira Ayuso 			break;
1273f330317SPablo Neira Ayuso 		}
1283f330317SPablo Neira Ayuso 	}
1293f330317SPablo Neira Ayuso 
1303f330317SPablo Neira Ayuso 	return ret;
1313f330317SPablo Neira Ayuso }
1323f330317SPablo Neira Ayuso 
compute_prefix_tbl(struct ts_bm * bm,int flags)1333b76d081SJoonwoo Park static void compute_prefix_tbl(struct ts_bm *bm, int flags)
1348082e4edSPablo Neira Ayuso {
1353f330317SPablo Neira Ayuso 	int i, j, g;
1368082e4edSPablo Neira Ayuso 
1378082e4edSPablo Neira Ayuso 	for (i = 0; i < ASIZE; i++)
1383ffaa8c7SMichael Rash 		bm->bad_shift[i] = bm->patlen;
1393b76d081SJoonwoo Park 	for (i = 0; i < bm->patlen - 1; i++) {
1403ffaa8c7SMichael Rash 		bm->bad_shift[bm->pattern[i]] = bm->patlen - 1 - i;
1413b76d081SJoonwoo Park 		if (flags & TS_IGNORECASE)
1423b76d081SJoonwoo Park 			bm->bad_shift[tolower(bm->pattern[i])]
1433b76d081SJoonwoo Park 			    = bm->patlen - 1 - i;
1443b76d081SJoonwoo Park 	}
1458082e4edSPablo Neira Ayuso 
1468082e4edSPablo Neira Ayuso 	/* Compute the good shift array, used to match reocurrences
1478082e4edSPablo Neira Ayuso 	 * of a subpattern */
1488082e4edSPablo Neira Ayuso 	bm->good_shift[0] = 1;
1498082e4edSPablo Neira Ayuso 	for (i = 1; i < bm->patlen; i++)
1508082e4edSPablo Neira Ayuso 		bm->good_shift[i] = bm->patlen;
1513f330317SPablo Neira Ayuso         for (i = bm->patlen-1, g = 1; i > 0; g++, i--) {
1523f330317SPablo Neira Ayuso 		for (j = i-1; j >= 1-g ; j--)
1533f330317SPablo Neira Ayuso 			if (subpattern(bm->pattern, i, j, g)) {
1543f330317SPablo Neira Ayuso 				bm->good_shift[g] = bm->patlen-j-g;
1553f330317SPablo Neira Ayuso 				break;
1563f330317SPablo Neira Ayuso 			}
1578082e4edSPablo Neira Ayuso 	}
1588082e4edSPablo Neira Ayuso }
1598082e4edSPablo Neira Ayuso 
bm_init(const void * pattern,unsigned int len,gfp_t gfp_mask,int flags)1608082e4edSPablo Neira Ayuso static struct ts_config *bm_init(const void *pattern, unsigned int len,
1613b76d081SJoonwoo Park 				 gfp_t gfp_mask, int flags)
1628082e4edSPablo Neira Ayuso {
1638082e4edSPablo Neira Ayuso 	struct ts_config *conf;
1648082e4edSPablo Neira Ayuso 	struct ts_bm *bm;
1653b76d081SJoonwoo Park 	int i;
1668082e4edSPablo Neira Ayuso 	unsigned int prefix_tbl_len = len * sizeof(unsigned int);
1678082e4edSPablo Neira Ayuso 	size_t priv_size = sizeof(*bm) + len + prefix_tbl_len;
1688082e4edSPablo Neira Ayuso 
1698082e4edSPablo Neira Ayuso 	conf = alloc_ts_config(priv_size, gfp_mask);
1708082e4edSPablo Neira Ayuso 	if (IS_ERR(conf))
1718082e4edSPablo Neira Ayuso 		return conf;
1728082e4edSPablo Neira Ayuso 
1733b76d081SJoonwoo Park 	conf->flags = flags;
1748082e4edSPablo Neira Ayuso 	bm = ts_config_priv(conf);
1758082e4edSPablo Neira Ayuso 	bm->patlen = len;
1768082e4edSPablo Neira Ayuso 	bm->pattern = (u8 *) bm->good_shift + prefix_tbl_len;
1773b76d081SJoonwoo Park 	if (flags & TS_IGNORECASE)
1783b76d081SJoonwoo Park 		for (i = 0; i < len; i++)
1793b76d081SJoonwoo Park 			bm->pattern[i] = toupper(((u8 *)pattern)[i]);
1803b76d081SJoonwoo Park 	else
1818082e4edSPablo Neira Ayuso 		memcpy(bm->pattern, pattern, len);
1823b76d081SJoonwoo Park 	compute_prefix_tbl(bm, flags);
1838082e4edSPablo Neira Ayuso 
1848082e4edSPablo Neira Ayuso 	return conf;
1858082e4edSPablo Neira Ayuso }
1868082e4edSPablo Neira Ayuso 
bm_get_pattern(struct ts_config * conf)1878082e4edSPablo Neira Ayuso static void *bm_get_pattern(struct ts_config *conf)
1888082e4edSPablo Neira Ayuso {
1898082e4edSPablo Neira Ayuso 	struct ts_bm *bm = ts_config_priv(conf);
1908082e4edSPablo Neira Ayuso 	return bm->pattern;
1918082e4edSPablo Neira Ayuso }
1928082e4edSPablo Neira Ayuso 
bm_get_pattern_len(struct ts_config * conf)1938082e4edSPablo Neira Ayuso static unsigned int bm_get_pattern_len(struct ts_config *conf)
1948082e4edSPablo Neira Ayuso {
1958082e4edSPablo Neira Ayuso 	struct ts_bm *bm = ts_config_priv(conf);
1968082e4edSPablo Neira Ayuso 	return bm->patlen;
1978082e4edSPablo Neira Ayuso }
1988082e4edSPablo Neira Ayuso 
1998082e4edSPablo Neira Ayuso static struct ts_ops bm_ops = {
2008082e4edSPablo Neira Ayuso 	.name		  = "bm",
2018082e4edSPablo Neira Ayuso 	.find		  = bm_find,
2028082e4edSPablo Neira Ayuso 	.init		  = bm_init,
2038082e4edSPablo Neira Ayuso 	.get_pattern	  = bm_get_pattern,
2048082e4edSPablo Neira Ayuso 	.get_pattern_len  = bm_get_pattern_len,
2058082e4edSPablo Neira Ayuso 	.owner		  = THIS_MODULE,
2068082e4edSPablo Neira Ayuso 	.list		  = LIST_HEAD_INIT(bm_ops.list)
2078082e4edSPablo Neira Ayuso };
2088082e4edSPablo Neira Ayuso 
init_bm(void)2098082e4edSPablo Neira Ayuso static int __init init_bm(void)
2108082e4edSPablo Neira Ayuso {
2118082e4edSPablo Neira Ayuso 	return textsearch_register(&bm_ops);
2128082e4edSPablo Neira Ayuso }
2138082e4edSPablo Neira Ayuso 
exit_bm(void)2148082e4edSPablo Neira Ayuso static void __exit exit_bm(void)
2158082e4edSPablo Neira Ayuso {
2168082e4edSPablo Neira Ayuso 	textsearch_unregister(&bm_ops);
2178082e4edSPablo Neira Ayuso }
2188082e4edSPablo Neira Ayuso 
2198082e4edSPablo Neira Ayuso MODULE_LICENSE("GPL");
2208082e4edSPablo Neira Ayuso 
2218082e4edSPablo Neira Ayuso module_init(init_bm);
2228082e4edSPablo Neira Ayuso module_exit(exit_bm);
223