12874c5fdSThomas Gleixner // SPDX-License-Identifier: GPL-2.0-or-later
26408f79cSThomas Graf /*
36408f79cSThomas Graf * lib/ts_fsm.c A naive finite state machine text search approach
46408f79cSThomas Graf *
56408f79cSThomas Graf * Authors: Thomas Graf <tgraf@suug.ch>
66408f79cSThomas Graf *
76408f79cSThomas Graf * ==========================================================================
86408f79cSThomas Graf *
96408f79cSThomas Graf * A finite state machine consists of n states (struct ts_fsm_token)
107433a8d6SRandy Dunlap * representing the pattern as a finite automaton. The data is read
111497b274SAndreas Mohr * sequentially on an octet basis. Every state token specifies the number
126408f79cSThomas Graf * of recurrences and the type of value accepted which can be either a
136408f79cSThomas Graf * specific character or ctype based set of characters. The available
146408f79cSThomas Graf * type of recurrences include 1, (0|1), [0 n], and [1 n].
156408f79cSThomas Graf *
161497b274SAndreas Mohr * The algorithm differs between strict/non-strict mode specifying
171497b274SAndreas Mohr * whether the pattern has to start at the first octet. Strict mode
186408f79cSThomas Graf * is enabled by default and can be disabled by inserting
196408f79cSThomas Graf * TS_FSM_HEAD_IGNORE as the first token in the chain.
206408f79cSThomas Graf *
216408f79cSThomas Graf * The runtime performance of the algorithm should be around O(n),
226408f79cSThomas Graf * however while in strict mode the average runtime can be better.
236408f79cSThomas Graf */
246408f79cSThomas Graf
256408f79cSThomas Graf #include <linux/module.h>
266408f79cSThomas Graf #include <linux/types.h>
276408f79cSThomas Graf #include <linux/string.h>
286408f79cSThomas Graf #include <linux/ctype.h>
296408f79cSThomas Graf #include <linux/textsearch.h>
306408f79cSThomas Graf #include <linux/textsearch_fsm.h>
316408f79cSThomas Graf
326408f79cSThomas Graf struct ts_fsm
336408f79cSThomas Graf {
346408f79cSThomas Graf unsigned int ntokens;
35842ae1f5SGustavo A. R. Silva struct ts_fsm_token tokens[];
366408f79cSThomas Graf };
376408f79cSThomas Graf
386408f79cSThomas Graf /* other values derived from ctype.h */
396408f79cSThomas Graf #define _A 0x100 /* ascii */
406408f79cSThomas Graf #define _W 0x200 /* wildcard */
416408f79cSThomas Graf
426408f79cSThomas Graf /* Map to _ctype flags and some magic numbers */
431497b274SAndreas Mohr static const u16 token_map[TS_FSM_TYPE_MAX+1] = {
446408f79cSThomas Graf [TS_FSM_SPECIFIC] = 0,
456408f79cSThomas Graf [TS_FSM_WILDCARD] = _W,
466408f79cSThomas Graf [TS_FSM_CNTRL] = _C,
476408f79cSThomas Graf [TS_FSM_LOWER] = _L,
486408f79cSThomas Graf [TS_FSM_UPPER] = _U,
496408f79cSThomas Graf [TS_FSM_PUNCT] = _P,
506408f79cSThomas Graf [TS_FSM_SPACE] = _S,
516408f79cSThomas Graf [TS_FSM_DIGIT] = _D,
526408f79cSThomas Graf [TS_FSM_XDIGIT] = _D | _X,
536408f79cSThomas Graf [TS_FSM_ALPHA] = _U | _L,
546408f79cSThomas Graf [TS_FSM_ALNUM] = _U | _L | _D,
556408f79cSThomas Graf [TS_FSM_PRINT] = _P | _U | _L | _D | _SP,
566408f79cSThomas Graf [TS_FSM_GRAPH] = _P | _U | _L | _D,
576408f79cSThomas Graf [TS_FSM_ASCII] = _A,
586408f79cSThomas Graf };
596408f79cSThomas Graf
601497b274SAndreas Mohr static const u16 token_lookup_tbl[256] = {
616408f79cSThomas Graf _W|_A|_C, _W|_A|_C, _W|_A|_C, _W|_A|_C, /* 0- 3 */
626408f79cSThomas Graf _W|_A|_C, _W|_A|_C, _W|_A|_C, _W|_A|_C, /* 4- 7 */
636408f79cSThomas Graf _W|_A|_C, _W|_A|_C|_S, _W|_A|_C|_S, _W|_A|_C|_S, /* 8- 11 */
646408f79cSThomas Graf _W|_A|_C|_S, _W|_A|_C|_S, _W|_A|_C, _W|_A|_C, /* 12- 15 */
656408f79cSThomas Graf _W|_A|_C, _W|_A|_C, _W|_A|_C, _W|_A|_C, /* 16- 19 */
666408f79cSThomas Graf _W|_A|_C, _W|_A|_C, _W|_A|_C, _W|_A|_C, /* 20- 23 */
676408f79cSThomas Graf _W|_A|_C, _W|_A|_C, _W|_A|_C, _W|_A|_C, /* 24- 27 */
686408f79cSThomas Graf _W|_A|_C, _W|_A|_C, _W|_A|_C, _W|_A|_C, /* 28- 31 */
696408f79cSThomas Graf _W|_A|_S|_SP, _W|_A|_P, _W|_A|_P, _W|_A|_P, /* 32- 35 */
706408f79cSThomas Graf _W|_A|_P, _W|_A|_P, _W|_A|_P, _W|_A|_P, /* 36- 39 */
716408f79cSThomas Graf _W|_A|_P, _W|_A|_P, _W|_A|_P, _W|_A|_P, /* 40- 43 */
726408f79cSThomas Graf _W|_A|_P, _W|_A|_P, _W|_A|_P, _W|_A|_P, /* 44- 47 */
736408f79cSThomas Graf _W|_A|_D, _W|_A|_D, _W|_A|_D, _W|_A|_D, /* 48- 51 */
746408f79cSThomas Graf _W|_A|_D, _W|_A|_D, _W|_A|_D, _W|_A|_D, /* 52- 55 */
756408f79cSThomas Graf _W|_A|_D, _W|_A|_D, _W|_A|_P, _W|_A|_P, /* 56- 59 */
766408f79cSThomas Graf _W|_A|_P, _W|_A|_P, _W|_A|_P, _W|_A|_P, /* 60- 63 */
776408f79cSThomas Graf _W|_A|_P, _W|_A|_U|_X, _W|_A|_U|_X, _W|_A|_U|_X, /* 64- 67 */
786408f79cSThomas Graf _W|_A|_U|_X, _W|_A|_U|_X, _W|_A|_U|_X, _W|_A|_U, /* 68- 71 */
796408f79cSThomas Graf _W|_A|_U, _W|_A|_U, _W|_A|_U, _W|_A|_U, /* 72- 75 */
806408f79cSThomas Graf _W|_A|_U, _W|_A|_U, _W|_A|_U, _W|_A|_U, /* 76- 79 */
816408f79cSThomas Graf _W|_A|_U, _W|_A|_U, _W|_A|_U, _W|_A|_U, /* 80- 83 */
826408f79cSThomas Graf _W|_A|_U, _W|_A|_U, _W|_A|_U, _W|_A|_U, /* 84- 87 */
836408f79cSThomas Graf _W|_A|_U, _W|_A|_U, _W|_A|_U, _W|_A|_P, /* 88- 91 */
846408f79cSThomas Graf _W|_A|_P, _W|_A|_P, _W|_A|_P, _W|_A|_P, /* 92- 95 */
856408f79cSThomas Graf _W|_A|_P, _W|_A|_L|_X, _W|_A|_L|_X, _W|_A|_L|_X, /* 96- 99 */
866408f79cSThomas Graf _W|_A|_L|_X, _W|_A|_L|_X, _W|_A|_L|_X, _W|_A|_L, /* 100-103 */
876408f79cSThomas Graf _W|_A|_L, _W|_A|_L, _W|_A|_L, _W|_A|_L, /* 104-107 */
886408f79cSThomas Graf _W|_A|_L, _W|_A|_L, _W|_A|_L, _W|_A|_L, /* 108-111 */
896408f79cSThomas Graf _W|_A|_L, _W|_A|_L, _W|_A|_L, _W|_A|_L, /* 112-115 */
906408f79cSThomas Graf _W|_A|_L, _W|_A|_L, _W|_A|_L, _W|_A|_L, /* 116-119 */
916408f79cSThomas Graf _W|_A|_L, _W|_A|_L, _W|_A|_L, _W|_A|_P, /* 120-123 */
926408f79cSThomas Graf _W|_A|_P, _W|_A|_P, _W|_A|_P, _W|_A|_C, /* 124-127 */
936408f79cSThomas Graf _W, _W, _W, _W, /* 128-131 */
946408f79cSThomas Graf _W, _W, _W, _W, /* 132-135 */
956408f79cSThomas Graf _W, _W, _W, _W, /* 136-139 */
966408f79cSThomas Graf _W, _W, _W, _W, /* 140-143 */
976408f79cSThomas Graf _W, _W, _W, _W, /* 144-147 */
986408f79cSThomas Graf _W, _W, _W, _W, /* 148-151 */
996408f79cSThomas Graf _W, _W, _W, _W, /* 152-155 */
1006408f79cSThomas Graf _W, _W, _W, _W, /* 156-159 */
1016408f79cSThomas Graf _W|_S|_SP, _W|_P, _W|_P, _W|_P, /* 160-163 */
1026408f79cSThomas Graf _W|_P, _W|_P, _W|_P, _W|_P, /* 164-167 */
1036408f79cSThomas Graf _W|_P, _W|_P, _W|_P, _W|_P, /* 168-171 */
1046408f79cSThomas Graf _W|_P, _W|_P, _W|_P, _W|_P, /* 172-175 */
1056408f79cSThomas Graf _W|_P, _W|_P, _W|_P, _W|_P, /* 176-179 */
1066408f79cSThomas Graf _W|_P, _W|_P, _W|_P, _W|_P, /* 180-183 */
1076408f79cSThomas Graf _W|_P, _W|_P, _W|_P, _W|_P, /* 184-187 */
1086408f79cSThomas Graf _W|_P, _W|_P, _W|_P, _W|_P, /* 188-191 */
1096408f79cSThomas Graf _W|_U, _W|_U, _W|_U, _W|_U, /* 192-195 */
1106408f79cSThomas Graf _W|_U, _W|_U, _W|_U, _W|_U, /* 196-199 */
1116408f79cSThomas Graf _W|_U, _W|_U, _W|_U, _W|_U, /* 200-203 */
1126408f79cSThomas Graf _W|_U, _W|_U, _W|_U, _W|_U, /* 204-207 */
1136408f79cSThomas Graf _W|_U, _W|_U, _W|_U, _W|_U, /* 208-211 */
1146408f79cSThomas Graf _W|_U, _W|_U, _W|_U, _W|_P, /* 212-215 */
1156408f79cSThomas Graf _W|_U, _W|_U, _W|_U, _W|_U, /* 216-219 */
1166408f79cSThomas Graf _W|_U, _W|_U, _W|_U, _W|_L, /* 220-223 */
1176408f79cSThomas Graf _W|_L, _W|_L, _W|_L, _W|_L, /* 224-227 */
1186408f79cSThomas Graf _W|_L, _W|_L, _W|_L, _W|_L, /* 228-231 */
1196408f79cSThomas Graf _W|_L, _W|_L, _W|_L, _W|_L, /* 232-235 */
1206408f79cSThomas Graf _W|_L, _W|_L, _W|_L, _W|_L, /* 236-239 */
1216408f79cSThomas Graf _W|_L, _W|_L, _W|_L, _W|_L, /* 240-243 */
1226408f79cSThomas Graf _W|_L, _W|_L, _W|_L, _W|_P, /* 244-247 */
1236408f79cSThomas Graf _W|_L, _W|_L, _W|_L, _W|_L, /* 248-251 */
1246408f79cSThomas Graf _W|_L, _W|_L, _W|_L, _W|_L}; /* 252-255 */
1256408f79cSThomas Graf
match_token(struct ts_fsm_token * t,u8 d)1266408f79cSThomas Graf static inline int match_token(struct ts_fsm_token *t, u8 d)
1276408f79cSThomas Graf {
1286408f79cSThomas Graf if (t->type)
1296408f79cSThomas Graf return (token_lookup_tbl[d] & t->type) != 0;
1306408f79cSThomas Graf else
1316408f79cSThomas Graf return t->value == d;
1326408f79cSThomas Graf }
1336408f79cSThomas Graf
fsm_find(struct ts_config * conf,struct ts_state * state)1346408f79cSThomas Graf static unsigned int fsm_find(struct ts_config *conf, struct ts_state *state)
1356408f79cSThomas Graf {
1366408f79cSThomas Graf struct ts_fsm *fsm = ts_config_priv(conf);
1376408f79cSThomas Graf struct ts_fsm_token *cur = NULL, *next;
1386408f79cSThomas Graf unsigned int match_start, block_idx = 0, tok_idx;
1396408f79cSThomas Graf unsigned block_len = 0, strict, consumed = state->offset;
1406408f79cSThomas Graf const u8 *data;
1416408f79cSThomas Graf
1426408f79cSThomas Graf #define GET_NEXT_BLOCK() \
1436408f79cSThomas Graf ({ consumed += block_idx; \
1446408f79cSThomas Graf block_idx = 0; \
1456408f79cSThomas Graf block_len = conf->get_next_block(consumed, &data, conf, state); })
1466408f79cSThomas Graf
1476408f79cSThomas Graf #define TOKEN_MISMATCH() \
1486408f79cSThomas Graf do { \
1496408f79cSThomas Graf if (strict) \
1506408f79cSThomas Graf goto no_match; \
1516408f79cSThomas Graf block_idx++; \
1526408f79cSThomas Graf goto startover; \
1536408f79cSThomas Graf } while(0)
1546408f79cSThomas Graf
1556408f79cSThomas Graf #define end_of_data() unlikely(block_idx >= block_len && !GET_NEXT_BLOCK())
1566408f79cSThomas Graf
1576408f79cSThomas Graf if (end_of_data())
1586408f79cSThomas Graf goto no_match;
1596408f79cSThomas Graf
1606408f79cSThomas Graf strict = fsm->tokens[0].recur != TS_FSM_HEAD_IGNORE;
1616408f79cSThomas Graf
1626408f79cSThomas Graf startover:
1636408f79cSThomas Graf match_start = consumed + block_idx;
1646408f79cSThomas Graf
1656408f79cSThomas Graf for (tok_idx = 0; tok_idx < fsm->ntokens; tok_idx++) {
1666408f79cSThomas Graf cur = &fsm->tokens[tok_idx];
1676408f79cSThomas Graf
1686408f79cSThomas Graf if (likely(tok_idx < (fsm->ntokens - 1)))
1696408f79cSThomas Graf next = &fsm->tokens[tok_idx + 1];
1706408f79cSThomas Graf else
1716408f79cSThomas Graf next = NULL;
1726408f79cSThomas Graf
1736408f79cSThomas Graf switch (cur->recur) {
1746408f79cSThomas Graf case TS_FSM_SINGLE:
1756408f79cSThomas Graf if (end_of_data())
1766408f79cSThomas Graf goto no_match;
1776408f79cSThomas Graf
1786408f79cSThomas Graf if (!match_token(cur, data[block_idx]))
1796408f79cSThomas Graf TOKEN_MISMATCH();
1806408f79cSThomas Graf break;
1816408f79cSThomas Graf
1826408f79cSThomas Graf case TS_FSM_PERHAPS:
1836408f79cSThomas Graf if (end_of_data() ||
1846408f79cSThomas Graf !match_token(cur, data[block_idx]))
1856408f79cSThomas Graf continue;
1866408f79cSThomas Graf break;
1876408f79cSThomas Graf
1886408f79cSThomas Graf case TS_FSM_MULTI:
1896408f79cSThomas Graf if (end_of_data())
1906408f79cSThomas Graf goto no_match;
1916408f79cSThomas Graf
1926408f79cSThomas Graf if (!match_token(cur, data[block_idx]))
1936408f79cSThomas Graf TOKEN_MISMATCH();
1946408f79cSThomas Graf
1956408f79cSThomas Graf block_idx++;
196*4c1ca831SNick Desaulniers fallthrough;
1976408f79cSThomas Graf
1986408f79cSThomas Graf case TS_FSM_ANY:
1996408f79cSThomas Graf if (next == NULL)
2006408f79cSThomas Graf goto found_match;
2016408f79cSThomas Graf
2026408f79cSThomas Graf if (end_of_data())
2036408f79cSThomas Graf continue;
2046408f79cSThomas Graf
2056408f79cSThomas Graf while (!match_token(next, data[block_idx])) {
2066408f79cSThomas Graf if (!match_token(cur, data[block_idx]))
2076408f79cSThomas Graf TOKEN_MISMATCH();
2086408f79cSThomas Graf block_idx++;
2096408f79cSThomas Graf if (end_of_data())
2106408f79cSThomas Graf goto no_match;
2116408f79cSThomas Graf }
2126408f79cSThomas Graf continue;
2136408f79cSThomas Graf
2146408f79cSThomas Graf /*
2156408f79cSThomas Graf * Optimization: Prefer small local loop over jumping
2166408f79cSThomas Graf * back and forth until garbage at head is munched.
2176408f79cSThomas Graf */
2186408f79cSThomas Graf case TS_FSM_HEAD_IGNORE:
2196408f79cSThomas Graf if (end_of_data())
2206408f79cSThomas Graf continue;
2216408f79cSThomas Graf
2226408f79cSThomas Graf while (!match_token(next, data[block_idx])) {
2236408f79cSThomas Graf /*
2246408f79cSThomas Graf * Special case, don't start over upon
2256408f79cSThomas Graf * a mismatch, give the user the
2266408f79cSThomas Graf * chance to specify the type of data
2276408f79cSThomas Graf * allowed to be ignored.
2286408f79cSThomas Graf */
2296408f79cSThomas Graf if (!match_token(cur, data[block_idx]))
2306408f79cSThomas Graf goto no_match;
2316408f79cSThomas Graf
2326408f79cSThomas Graf block_idx++;
2336408f79cSThomas Graf if (end_of_data())
2346408f79cSThomas Graf goto no_match;
2356408f79cSThomas Graf }
2366408f79cSThomas Graf
2376408f79cSThomas Graf match_start = consumed + block_idx;
2386408f79cSThomas Graf continue;
2396408f79cSThomas Graf }
2406408f79cSThomas Graf
2416408f79cSThomas Graf block_idx++;
2426408f79cSThomas Graf }
2436408f79cSThomas Graf
2446408f79cSThomas Graf if (end_of_data())
2456408f79cSThomas Graf goto found_match;
2466408f79cSThomas Graf
2476408f79cSThomas Graf no_match:
2486408f79cSThomas Graf return UINT_MAX;
2496408f79cSThomas Graf
2506408f79cSThomas Graf found_match:
2516408f79cSThomas Graf state->offset = consumed + block_idx;
2526408f79cSThomas Graf return match_start;
2536408f79cSThomas Graf }
2546408f79cSThomas Graf
fsm_init(const void * pattern,unsigned int len,gfp_t gfp_mask,int flags)2556408f79cSThomas Graf static struct ts_config *fsm_init(const void *pattern, unsigned int len,
25643138833SJoonwoo Park gfp_t gfp_mask, int flags)
2576408f79cSThomas Graf {
2586408f79cSThomas Graf int i, err = -EINVAL;
2596408f79cSThomas Graf struct ts_config *conf;
2606408f79cSThomas Graf struct ts_fsm *fsm;
2616408f79cSThomas Graf struct ts_fsm_token *tokens = (struct ts_fsm_token *) pattern;
2626408f79cSThomas Graf unsigned int ntokens = len / sizeof(*tokens);
2636408f79cSThomas Graf size_t priv_size = sizeof(*fsm) + len;
2646408f79cSThomas Graf
2656408f79cSThomas Graf if (len % sizeof(struct ts_fsm_token) || ntokens < 1)
2666408f79cSThomas Graf goto errout;
2676408f79cSThomas Graf
26843138833SJoonwoo Park if (flags & TS_IGNORECASE)
26943138833SJoonwoo Park goto errout;
27043138833SJoonwoo Park
2716408f79cSThomas Graf for (i = 0; i < ntokens; i++) {
2726408f79cSThomas Graf struct ts_fsm_token *t = &tokens[i];
2736408f79cSThomas Graf
2746408f79cSThomas Graf if (t->type > TS_FSM_TYPE_MAX || t->recur > TS_FSM_RECUR_MAX)
2756408f79cSThomas Graf goto errout;
2766408f79cSThomas Graf
2776408f79cSThomas Graf if (t->recur == TS_FSM_HEAD_IGNORE &&
2786408f79cSThomas Graf (i != 0 || i == (ntokens - 1)))
2796408f79cSThomas Graf goto errout;
2806408f79cSThomas Graf }
2816408f79cSThomas Graf
2826408f79cSThomas Graf conf = alloc_ts_config(priv_size, gfp_mask);
2836408f79cSThomas Graf if (IS_ERR(conf))
2846408f79cSThomas Graf return conf;
2856408f79cSThomas Graf
28643138833SJoonwoo Park conf->flags = flags;
2876408f79cSThomas Graf fsm = ts_config_priv(conf);
2886408f79cSThomas Graf fsm->ntokens = ntokens;
2896408f79cSThomas Graf memcpy(fsm->tokens, pattern, len);
2906408f79cSThomas Graf
2916408f79cSThomas Graf for (i = 0; i < fsm->ntokens; i++) {
2926408f79cSThomas Graf struct ts_fsm_token *t = &fsm->tokens[i];
2936408f79cSThomas Graf t->type = token_map[t->type];
2946408f79cSThomas Graf }
2956408f79cSThomas Graf
2966408f79cSThomas Graf return conf;
2976408f79cSThomas Graf
2986408f79cSThomas Graf errout:
2996408f79cSThomas Graf return ERR_PTR(err);
3006408f79cSThomas Graf }
3016408f79cSThomas Graf
fsm_get_pattern(struct ts_config * conf)3026408f79cSThomas Graf static void *fsm_get_pattern(struct ts_config *conf)
3036408f79cSThomas Graf {
3046408f79cSThomas Graf struct ts_fsm *fsm = ts_config_priv(conf);
3056408f79cSThomas Graf return fsm->tokens;
3066408f79cSThomas Graf }
3076408f79cSThomas Graf
fsm_get_pattern_len(struct ts_config * conf)3086408f79cSThomas Graf static unsigned int fsm_get_pattern_len(struct ts_config *conf)
3096408f79cSThomas Graf {
3106408f79cSThomas Graf struct ts_fsm *fsm = ts_config_priv(conf);
3116408f79cSThomas Graf return fsm->ntokens * sizeof(struct ts_fsm_token);
3126408f79cSThomas Graf }
3136408f79cSThomas Graf
3146408f79cSThomas Graf static struct ts_ops fsm_ops = {
3156408f79cSThomas Graf .name = "fsm",
3166408f79cSThomas Graf .find = fsm_find,
3176408f79cSThomas Graf .init = fsm_init,
3186408f79cSThomas Graf .get_pattern = fsm_get_pattern,
3196408f79cSThomas Graf .get_pattern_len = fsm_get_pattern_len,
3206408f79cSThomas Graf .owner = THIS_MODULE,
3216408f79cSThomas Graf .list = LIST_HEAD_INIT(fsm_ops.list)
3226408f79cSThomas Graf };
3236408f79cSThomas Graf
init_fsm(void)3246408f79cSThomas Graf static int __init init_fsm(void)
3256408f79cSThomas Graf {
3266408f79cSThomas Graf return textsearch_register(&fsm_ops);
3276408f79cSThomas Graf }
3286408f79cSThomas Graf
exit_fsm(void)3296408f79cSThomas Graf static void __exit exit_fsm(void)
3306408f79cSThomas Graf {
3316408f79cSThomas Graf textsearch_unregister(&fsm_ops);
3326408f79cSThomas Graf }
3336408f79cSThomas Graf
3346408f79cSThomas Graf MODULE_LICENSE("GPL");
3356408f79cSThomas Graf
3366408f79cSThomas Graf module_init(init_fsm);
3376408f79cSThomas Graf module_exit(exit_fsm);
338