1 // SPDX-License-Identifier: GPL-2.0-only
2 
3 /* PIPAPO: PIle PAcket POlicies: AVX2 packet lookup routines
4  *
5  * Copyright (c) 2019-2020 Red Hat GmbH
6  *
7  * Author: Stefano Brivio <sbrivio@redhat.com>
8  */
9 
10 #include <linux/kernel.h>
11 #include <linux/init.h>
12 #include <linux/module.h>
13 #include <linux/netlink.h>
14 #include <linux/netfilter.h>
15 #include <linux/netfilter/nf_tables.h>
16 #include <net/netfilter/nf_tables_core.h>
17 #include <uapi/linux/netfilter/nf_tables.h>
18 #include <linux/bitmap.h>
19 #include <linux/bitops.h>
20 
21 #include <linux/compiler.h>
22 #include <asm/fpu/api.h>
23 
24 #include "nft_set_pipapo_avx2.h"
25 #include "nft_set_pipapo.h"
26 
/* Number of unsigned longs in one 256-bit chunk, that is, in one YMM register:
 * this is the unit handled by each iteration of the lookup routines.
 *
 * NOTE(review): XSAVE_YMM_SIZE is used as a bit count here (and as divisor of
 * bit indices in the lookup routines), which presumes it evaluates to 256 --
 * confirm against its definition.
 */
#define NFT_PIPAPO_LONGS_PER_M256	(XSAVE_YMM_SIZE / BITS_PER_LONG)

/* Load from memory into YMM register with non-temporal hint ("stream load"),
 * that is, don't fetch lines from memory into the cache. This avoids pushing
 * precious packet data out of the cache hierarchy, and is appropriate when:
 *
 * - loading buckets from lookup tables, as they are not going to be used
 *   again before packets are entirely classified
 *
 * - loading the result bitmap from the previous field, as it's never used
 *   again
 *
 * @reg is pasted into the instruction text, so it has to be a literal YMM
 * register number. The register is not declared as output or clobber of the
 * asm statement: only the memory operand, @loc, is visible to the compiler.
 *
 * The 256-bit form of vmovntdqa needs a 32-byte aligned memory operand.
 */
#define NFT_PIPAPO_AVX2_LOAD(reg, loc)					\
	asm volatile("vmovntdqa %0, %%ymm" #reg : : "m" (loc))

/* Stream a single lookup table bucket into YMM register given lookup table,
 * group index, value of packet bits, bucket size.
 *
 * The element loaded from is lt[(group * NFT_PIPAPO_BUCKETS(bits) + v) * bsize]
 * so @bsize is expressed in elements of @lt, and callers select the 256-bit
 * chunk inside the bucket by advancing @lt itself.
 */
#define NFT_PIPAPO_AVX2_BUCKET_LOAD4(reg, lt, group, v, bsize)		\
	NFT_PIPAPO_AVX2_LOAD(reg,					\
			     lt[((group) * NFT_PIPAPO_BUCKETS(4) +	\
				 (v)) * (bsize)])
#define NFT_PIPAPO_AVX2_BUCKET_LOAD8(reg, lt, group, v, bsize)		\
	NFT_PIPAPO_AVX2_LOAD(reg,					\
			     lt[((group) * NFT_PIPAPO_BUCKETS(8) +	\
				 (v)) * (bsize)])

/* Bitwise AND: the staple operation of this algorithm
 *
 * ymm@dst = ymm@a & ymm@b, all three given as literal register numbers. No
 * operands are passed to the asm statement, hence the single '%' in front of
 * register names.
 */
#define NFT_PIPAPO_AVX2_AND(dst, a, b)					\
	asm volatile("vpand %ymm" #a ", %ymm" #b ", %ymm" #dst)

/* Jump to label if @reg is zero
 *
 * vptest of a register against itself sets ZF only if all its bits are clear,
 * and je branches on ZF. @label must be a label of the calling function.
 */
#define NFT_PIPAPO_AVX2_NOMATCH_GOTO(reg, label)			\
	asm_volatile_goto("vptest %%ymm" #reg ", %%ymm" #reg ";"	\
			  "je %l[" #label "]" : : : : label)

/* Store 256 bits from YMM register into memory. Contrary to bucket load
 * operation, we don't bypass the cache here, as stored matching results
 * are always used shortly after.
 *
 * vmovdqa needs a 32-byte aligned memory operand for a 256-bit store.
 */
#define NFT_PIPAPO_AVX2_STORE(loc, reg)					\
	asm volatile("vmovdqa %%ymm" #reg ", %0" : "=m" (loc))

/* Zero out a complete YMM register, @reg (XOR of the register with itself) */
#define NFT_PIPAPO_AVX2_ZERO(reg)					\
	asm volatile("vpxor %ymm" #reg ", %ymm" #reg ", %ymm" #reg)

/* Current working bitmap index, toggled between field matches */
static DEFINE_PER_CPU(bool, nft_pipapo_avx2_scratch_index);
76 
/**
 * nft_pipapo_avx2_prepare() - Set up register state for the lookup routines
 *
 * This zeroes out ymm15. The lookup routines store ymm15 into a chunk of the
 * map whenever that chunk needs to be cleared, and none of the routines using
 * ymm0 to ymm14 as working registers writes to it, so it has to be cleared
 * once, here, before they run.
 */
static void nft_pipapo_avx2_prepare(void)
{
	NFT_PIPAPO_AVX2_ZERO(15);
}
87 
88 /**
89  * nft_pipapo_avx2_fill() - Fill a bitmap region with ones
90  * @data:	Base memory area
91  * @start:	First bit to set
92  * @len:	Count of bits to fill
93  *
94  * This is nothing else than a version of bitmap_set(), as used e.g. by
95  * pipapo_refill(), tailored for the microarchitectures using it and better
96  * suited for the specific usage: it's very likely that we'll set a small number
97  * of bits, not crossing a word boundary, and correct branch prediction is
98  * critical here.
99  *
100  * This function doesn't actually use any AVX2 instruction.
101  */
102 static void nft_pipapo_avx2_fill(unsigned long *data, int start, int len)
103 {
104 	int offset = start % BITS_PER_LONG;
105 	unsigned long mask;
106 
107 	data += start / BITS_PER_LONG;
108 
109 	if (likely(len == 1)) {
110 		*data |= BIT(offset);
111 		return;
112 	}
113 
114 	if (likely(len < BITS_PER_LONG || offset)) {
115 		if (likely(len + offset <= BITS_PER_LONG)) {
116 			*data |= GENMASK(len - 1 + offset, offset);
117 			return;
118 		}
119 
120 		*data |= ~0UL << offset;
121 		len -= BITS_PER_LONG - offset;
122 		data++;
123 
124 		if (len <= BITS_PER_LONG) {
125 			mask = ~0UL >> (BITS_PER_LONG - len);
126 			*data |= mask;
127 			return;
128 		}
129 	}
130 
131 	memset(data, 0xff, len / BITS_PER_BYTE);
132 	data += len / BITS_PER_LONG;
133 
134 	len %= BITS_PER_LONG;
135 	if (len)
136 		*data |= ~0UL >> (BITS_PER_LONG - len);
137 }
138 
139 /**
140  * nft_pipapo_avx2_refill() - Scan bitmap, select mapping table item, set bits
141  * @offset:	Start from given bitmap (equivalent to bucket) offset, in longs
142  * @map:	Bitmap to be scanned for set bits
143  * @dst:	Destination bitmap
144  * @mt:		Mapping table containing bit set specifiers
145  * @last:	Return index of first set bit, if this is the last field
146  *
147  * This is an alternative implementation of pipapo_refill() suitable for usage
148  * with AVX2 lookup routines: we know there are four words to be scanned, at
149  * a given offset inside the map, for each matching iteration.
150  *
151  * This function doesn't actually use any AVX2 instruction.
152  *
153  * Return: first set bit index if @last, index of first filled word otherwise.
154  */
155 static int nft_pipapo_avx2_refill(int offset, unsigned long *map,
156 				  unsigned long *dst,
157 				  union nft_pipapo_map_bucket *mt, bool last)
158 {
159 	int ret = -1;
160 
161 #define NFT_PIPAPO_AVX2_REFILL_ONE_WORD(x)				\
162 	do {								\
163 		while (map[(x)]) {					\
164 			int r = __builtin_ctzl(map[(x)]);		\
165 			int i = (offset + (x)) * BITS_PER_LONG + r;	\
166 									\
167 			if (last)					\
168 				return i;				\
169 									\
170 			nft_pipapo_avx2_fill(dst, mt[i].to, mt[i].n);	\
171 									\
172 			if (ret == -1)					\
173 				ret = mt[i].to;				\
174 									\
175 			map[(x)] &= ~(1UL << r);			\
176 		}							\
177 	} while (0)
178 
179 	NFT_PIPAPO_AVX2_REFILL_ONE_WORD(0);
180 	NFT_PIPAPO_AVX2_REFILL_ONE_WORD(1);
181 	NFT_PIPAPO_AVX2_REFILL_ONE_WORD(2);
182 	NFT_PIPAPO_AVX2_REFILL_ONE_WORD(3);
183 #undef NFT_PIPAPO_AVX2_REFILL_ONE_WORD
184 
185 	return ret;
186 }
187 
188 /**
189  * nft_pipapo_avx2_lookup_4b_2() - AVX2-based lookup for 2 four-bit groups
190  * @map:	Previous match result, used as initial bitmap
191  * @fill:	Destination bitmap to be filled with current match result
192  * @f:		Field, containing lookup and mapping tables
193  * @offset:	Ignore buckets before the given index, no bits are filled there
194  * @pkt:	Packet data, pointer to input nftables register
195  * @first:	If this is the first field, don't source previous result
196  * @last:	Last field: stop at the first match and return bit index
197  *
198  * Load buckets from lookup table corresponding to the values of each 4-bit
199  * group of packet bytes, and perform a bitwise intersection between them. If
200  * this is the first field in the set, simply AND the buckets together
201  * (equivalent to using an all-ones starting bitmap), use the provided starting
202  * bitmap otherwise. Then call nft_pipapo_avx2_refill() to generate the next
203  * working bitmap, @fill.
204  *
205  * This is used for 8-bit fields (i.e. protocol numbers).
206  *
207  * Out-of-order (and superscalar) execution is vital here, so it's critical to
208  * avoid false data dependencies. CPU and compiler could (mostly) take care of
209  * this on their own, but the operation ordering is explicitly given here with
210  * a likely execution order in mind, to highlight possible stalls. That's why
211  * a number of logically distinct operations (i.e. loading buckets, intersecting
212  * buckets) are interleaved.
213  *
214  * Return: -1 on no match, rule index of match if @last, otherwise first long
215  * word index to be checked next (i.e. first filled word).
216  */
217 static int nft_pipapo_avx2_lookup_4b_2(unsigned long *map, unsigned long *fill,
218 				       struct nft_pipapo_field *f, int offset,
219 				       const u8 *pkt, bool first, bool last)
220 {
221 	int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
222 	u8 pg[2] = { pkt[0] >> 4, pkt[0] & 0xf };
223 	unsigned long *lt = f->lt, bsize = f->bsize;
224 
225 	lt += offset * NFT_PIPAPO_LONGS_PER_M256;
226 	for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
227 		int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
228 
229 		if (first) {
230 			NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize);
231 			NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 1, pg[1], bsize);
232 			NFT_PIPAPO_AVX2_AND(4, 0, 1);
233 		} else {
234 			NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize);
235 			NFT_PIPAPO_AVX2_LOAD(2, map[i_ul]);
236 			NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 1, pg[1], bsize);
237 			NFT_PIPAPO_AVX2_NOMATCH_GOTO(2, nothing);
238 			NFT_PIPAPO_AVX2_AND(3, 0, 1);
239 			NFT_PIPAPO_AVX2_AND(4, 2, 3);
240 		}
241 
242 		NFT_PIPAPO_AVX2_NOMATCH_GOTO(4, nomatch);
243 		NFT_PIPAPO_AVX2_STORE(map[i_ul], 4);
244 
245 		b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
246 		if (last)
247 			return b;
248 
249 		if (unlikely(ret == -1))
250 			ret = b / XSAVE_YMM_SIZE;
251 
252 		continue;
253 nomatch:
254 		NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
255 nothing:
256 		;
257 	}
258 
259 	return ret;
260 }
261 
262 /**
263  * nft_pipapo_avx2_lookup_4b_4() - AVX2-based lookup for 4 four-bit groups
264  * @map:	Previous match result, used as initial bitmap
265  * @fill:	Destination bitmap to be filled with current match result
266  * @f:		Field, containing lookup and mapping tables
267  * @offset:	Ignore buckets before the given index, no bits are filled there
268  * @pkt:	Packet data, pointer to input nftables register
269  * @first:	If this is the first field, don't source previous result
270  * @last:	Last field: stop at the first match and return bit index
271  *
272  * See nft_pipapo_avx2_lookup_4b_2().
273  *
274  * This is used for 16-bit fields (i.e. ports).
275  *
276  * Return: -1 on no match, rule index of match if @last, otherwise first long
277  * word index to be checked next (i.e. first filled word).
278  */
279 static int nft_pipapo_avx2_lookup_4b_4(unsigned long *map, unsigned long *fill,
280 				       struct nft_pipapo_field *f, int offset,
281 				       const u8 *pkt, bool first, bool last)
282 {
283 	int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
284 	u8 pg[4] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf };
285 	unsigned long *lt = f->lt, bsize = f->bsize;
286 
287 	lt += offset * NFT_PIPAPO_LONGS_PER_M256;
288 	for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
289 		int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
290 
291 		if (first) {
292 			NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize);
293 			NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 1, pg[1], bsize);
294 			NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 2, pg[2], bsize);
295 			NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 3, pg[3], bsize);
296 			NFT_PIPAPO_AVX2_AND(4, 0, 1);
297 			NFT_PIPAPO_AVX2_AND(5, 2, 3);
298 			NFT_PIPAPO_AVX2_AND(7, 4, 5);
299 		} else {
300 			NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize);
301 
302 			NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]);
303 
304 			NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 1, pg[1], bsize);
305 			NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 2, pg[2], bsize);
306 			NFT_PIPAPO_AVX2_BUCKET_LOAD4(4, lt, 3, pg[3], bsize);
307 			NFT_PIPAPO_AVX2_AND(5, 0, 1);
308 
309 			NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing);
310 
311 			NFT_PIPAPO_AVX2_AND(6, 2, 3);
312 			NFT_PIPAPO_AVX2_AND(7, 4, 5);
313 			/* Stall */
314 			NFT_PIPAPO_AVX2_AND(7, 6, 7);
315 		}
316 
317 		/* Stall */
318 		NFT_PIPAPO_AVX2_NOMATCH_GOTO(7, nomatch);
319 		NFT_PIPAPO_AVX2_STORE(map[i_ul], 7);
320 
321 		b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
322 		if (last)
323 			return b;
324 
325 		if (unlikely(ret == -1))
326 			ret = b / XSAVE_YMM_SIZE;
327 
328 		continue;
329 nomatch:
330 		NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
331 nothing:
332 		;
333 	}
334 
335 	return ret;
336 }
337 
338 /**
339  * nft_pipapo_avx2_lookup_4b_8() - AVX2-based lookup for 8 four-bit groups
340  * @map:	Previous match result, used as initial bitmap
341  * @fill:	Destination bitmap to be filled with current match result
342  * @f:		Field, containing lookup and mapping tables
343  * @offset:	Ignore buckets before the given index, no bits are filled there
344  * @pkt:	Packet data, pointer to input nftables register
345  * @first:	If this is the first field, don't source previous result
346  * @last:	Last field: stop at the first match and return bit index
347  *
348  * See nft_pipapo_avx2_lookup_4b_2().
349  *
350  * This is used for 32-bit fields (i.e. IPv4 addresses).
351  *
352  * Return: -1 on no match, rule index of match if @last, otherwise first long
353  * word index to be checked next (i.e. first filled word).
354  */
355 static int nft_pipapo_avx2_lookup_4b_8(unsigned long *map, unsigned long *fill,
356 				       struct nft_pipapo_field *f, int offset,
357 				       const u8 *pkt, bool first, bool last)
358 {
359 	u8 pg[8] = {  pkt[0] >> 4,  pkt[0] & 0xf,  pkt[1] >> 4,  pkt[1] & 0xf,
360 		      pkt[2] >> 4,  pkt[2] & 0xf,  pkt[3] >> 4,  pkt[3] & 0xf,
361 		   };
362 	int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
363 	unsigned long *lt = f->lt, bsize = f->bsize;
364 
365 	lt += offset * NFT_PIPAPO_LONGS_PER_M256;
366 	for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
367 		int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
368 
369 		if (first) {
370 			NFT_PIPAPO_AVX2_BUCKET_LOAD4(0,  lt, 0, pg[0], bsize);
371 			NFT_PIPAPO_AVX2_BUCKET_LOAD4(1,  lt, 1, pg[1], bsize);
372 			NFT_PIPAPO_AVX2_BUCKET_LOAD4(2,  lt, 2, pg[2], bsize);
373 			NFT_PIPAPO_AVX2_BUCKET_LOAD4(3,  lt, 3, pg[3], bsize);
374 			NFT_PIPAPO_AVX2_BUCKET_LOAD4(4,  lt, 4, pg[4], bsize);
375 			NFT_PIPAPO_AVX2_AND(5,   0,  1);
376 			NFT_PIPAPO_AVX2_BUCKET_LOAD4(6,  lt, 5, pg[5], bsize);
377 			NFT_PIPAPO_AVX2_BUCKET_LOAD4(7,  lt, 6, pg[6], bsize);
378 			NFT_PIPAPO_AVX2_AND(8,   2,  3);
379 			NFT_PIPAPO_AVX2_AND(9,   4,  5);
380 			NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 7, pg[7], bsize);
381 			NFT_PIPAPO_AVX2_AND(11,  6,  7);
382 			NFT_PIPAPO_AVX2_AND(12,  8,  9);
383 			NFT_PIPAPO_AVX2_AND(13, 10, 11);
384 
385 			/* Stall */
386 			NFT_PIPAPO_AVX2_AND(1,  12, 13);
387 		} else {
388 			NFT_PIPAPO_AVX2_BUCKET_LOAD4(0,  lt, 0, pg[0], bsize);
389 			NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]);
390 			NFT_PIPAPO_AVX2_BUCKET_LOAD4(2,  lt, 1, pg[1], bsize);
391 			NFT_PIPAPO_AVX2_BUCKET_LOAD4(3,  lt, 2, pg[2], bsize);
392 			NFT_PIPAPO_AVX2_BUCKET_LOAD4(4,  lt, 3, pg[3], bsize);
393 
394 			NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing);
395 
396 			NFT_PIPAPO_AVX2_AND(5,   0,  1);
397 			NFT_PIPAPO_AVX2_BUCKET_LOAD4(6,  lt, 4, pg[4], bsize);
398 			NFT_PIPAPO_AVX2_BUCKET_LOAD4(7,  lt, 5, pg[5], bsize);
399 			NFT_PIPAPO_AVX2_AND(8,   2,  3);
400 			NFT_PIPAPO_AVX2_BUCKET_LOAD4(9,  lt, 6, pg[6], bsize);
401 			NFT_PIPAPO_AVX2_AND(10,  4,  5);
402 			NFT_PIPAPO_AVX2_BUCKET_LOAD4(11, lt, 7, pg[7], bsize);
403 			NFT_PIPAPO_AVX2_AND(12,  6,  7);
404 			NFT_PIPAPO_AVX2_AND(13,  8,  9);
405 			NFT_PIPAPO_AVX2_AND(14, 10, 11);
406 
407 			/* Stall */
408 			NFT_PIPAPO_AVX2_AND(1,  12, 13);
409 			NFT_PIPAPO_AVX2_AND(1,   1, 14);
410 		}
411 
412 		NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nomatch);
413 		NFT_PIPAPO_AVX2_STORE(map[i_ul], 1);
414 
415 		b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
416 		if (last)
417 			return b;
418 
419 		if (unlikely(ret == -1))
420 			ret = b / XSAVE_YMM_SIZE;
421 
422 		continue;
423 
424 nomatch:
425 		NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
426 nothing:
427 		;
428 	}
429 
430 	return ret;
431 }
432 
433 /**
434  * nft_pipapo_avx2_lookup_4b_12() - AVX2-based lookup for 12 four-bit groups
435  * @map:	Previous match result, used as initial bitmap
436  * @fill:	Destination bitmap to be filled with current match result
437  * @f:		Field, containing lookup and mapping tables
438  * @offset:	Ignore buckets before the given index, no bits are filled there
439  * @pkt:	Packet data, pointer to input nftables register
440  * @first:	If this is the first field, don't source previous result
441  * @last:	Last field: stop at the first match and return bit index
442  *
443  * See nft_pipapo_avx2_lookup_4b_2().
444  *
445  * This is used for 48-bit fields (i.e. MAC addresses/EUI-48).
446  *
447  * Return: -1 on no match, rule index of match if @last, otherwise first long
448  * word index to be checked next (i.e. first filled word).
449  */
450 static int nft_pipapo_avx2_lookup_4b_12(unsigned long *map, unsigned long *fill,
451 				        struct nft_pipapo_field *f, int offset,
452 				        const u8 *pkt, bool first, bool last)
453 {
454 	u8 pg[12] = {  pkt[0] >> 4,  pkt[0] & 0xf,  pkt[1] >> 4,  pkt[1] & 0xf,
455 		       pkt[2] >> 4,  pkt[2] & 0xf,  pkt[3] >> 4,  pkt[3] & 0xf,
456 		       pkt[4] >> 4,  pkt[4] & 0xf,  pkt[5] >> 4,  pkt[5] & 0xf,
457 		    };
458 	int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
459 	unsigned long *lt = f->lt, bsize = f->bsize;
460 
461 	lt += offset * NFT_PIPAPO_LONGS_PER_M256;
462 	for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
463 		int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
464 
465 		if (!first)
466 			NFT_PIPAPO_AVX2_LOAD(0, map[i_ul]);
467 
468 		NFT_PIPAPO_AVX2_BUCKET_LOAD4(1,  lt,  0,  pg[0], bsize);
469 		NFT_PIPAPO_AVX2_BUCKET_LOAD4(2,  lt,  1,  pg[1], bsize);
470 		NFT_PIPAPO_AVX2_BUCKET_LOAD4(3,  lt,  2,  pg[2], bsize);
471 
472 		if (!first) {
473 			NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nothing);
474 			NFT_PIPAPO_AVX2_AND(1, 1, 0);
475 		}
476 
477 		NFT_PIPAPO_AVX2_BUCKET_LOAD4(4,  lt,  3,  pg[3], bsize);
478 		NFT_PIPAPO_AVX2_BUCKET_LOAD4(5,  lt,  4,  pg[4], bsize);
479 		NFT_PIPAPO_AVX2_AND(6,   2,  3);
480 		NFT_PIPAPO_AVX2_BUCKET_LOAD4(7,  lt,  5,  pg[5], bsize);
481 		NFT_PIPAPO_AVX2_BUCKET_LOAD4(8,  lt,  6,  pg[6], bsize);
482 		NFT_PIPAPO_AVX2_AND(9,   1,  4);
483 		NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt,  7,  pg[7], bsize);
484 		NFT_PIPAPO_AVX2_AND(11,  5,  6);
485 		NFT_PIPAPO_AVX2_BUCKET_LOAD4(12, lt,  8,  pg[8], bsize);
486 		NFT_PIPAPO_AVX2_AND(13,  7,  8);
487 		NFT_PIPAPO_AVX2_BUCKET_LOAD4(14, lt,  9,  pg[9], bsize);
488 
489 		NFT_PIPAPO_AVX2_AND(0,   9, 10);
490 		NFT_PIPAPO_AVX2_BUCKET_LOAD4(1,  lt, 10,  pg[10], bsize);
491 		NFT_PIPAPO_AVX2_AND(2,  11, 12);
492 		NFT_PIPAPO_AVX2_BUCKET_LOAD4(3,  lt, 11,  pg[11], bsize);
493 		NFT_PIPAPO_AVX2_AND(4,  13, 14);
494 		NFT_PIPAPO_AVX2_AND(5,   0,  1);
495 
496 		NFT_PIPAPO_AVX2_AND(6,   2,  3);
497 
498 		/* Stalls */
499 		NFT_PIPAPO_AVX2_AND(7,   4,  5);
500 		NFT_PIPAPO_AVX2_AND(8,   6,  7);
501 
502 		NFT_PIPAPO_AVX2_NOMATCH_GOTO(8, nomatch);
503 		NFT_PIPAPO_AVX2_STORE(map[i_ul], 8);
504 
505 		b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
506 		if (last)
507 			return b;
508 
509 		if (unlikely(ret == -1))
510 			ret = b / XSAVE_YMM_SIZE;
511 
512 		continue;
513 nomatch:
514 		NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
515 nothing:
516 		;
517 	}
518 
519 	return ret;
520 }
521 
522 /**
523  * nft_pipapo_avx2_lookup_4b_32() - AVX2-based lookup for 32 four-bit groups
524  * @map:	Previous match result, used as initial bitmap
525  * @fill:	Destination bitmap to be filled with current match result
526  * @f:		Field, containing lookup and mapping tables
527  * @offset:	Ignore buckets before the given index, no bits are filled there
528  * @pkt:	Packet data, pointer to input nftables register
529  * @first:	If this is the first field, don't source previous result
530  * @last:	Last field: stop at the first match and return bit index
531  *
532  * See nft_pipapo_avx2_lookup_4b_2().
533  *
534  * This is used for 128-bit fields (i.e. IPv6 addresses).
535  *
536  * Return: -1 on no match, rule index of match if @last, otherwise first long
537  * word index to be checked next (i.e. first filled word).
538  */
539 static int nft_pipapo_avx2_lookup_4b_32(unsigned long *map, unsigned long *fill,
540 					struct nft_pipapo_field *f, int offset,
541 					const u8 *pkt, bool first, bool last)
542 {
543 	u8 pg[32] = {  pkt[0] >> 4,  pkt[0] & 0xf,  pkt[1] >> 4,  pkt[1] & 0xf,
544 		       pkt[2] >> 4,  pkt[2] & 0xf,  pkt[3] >> 4,  pkt[3] & 0xf,
545 		       pkt[4] >> 4,  pkt[4] & 0xf,  pkt[5] >> 4,  pkt[5] & 0xf,
546 		       pkt[6] >> 4,  pkt[6] & 0xf,  pkt[7] >> 4,  pkt[7] & 0xf,
547 		       pkt[8] >> 4,  pkt[8] & 0xf,  pkt[9] >> 4,  pkt[9] & 0xf,
548 		      pkt[10] >> 4, pkt[10] & 0xf, pkt[11] >> 4, pkt[11] & 0xf,
549 		      pkt[12] >> 4, pkt[12] & 0xf, pkt[13] >> 4, pkt[13] & 0xf,
550 		      pkt[14] >> 4, pkt[14] & 0xf, pkt[15] >> 4, pkt[15] & 0xf,
551 		    };
552 	int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
553 	unsigned long *lt = f->lt, bsize = f->bsize;
554 
555 	lt += offset * NFT_PIPAPO_LONGS_PER_M256;
556 	for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
557 		int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
558 
559 		if (!first)
560 			NFT_PIPAPO_AVX2_LOAD(0, map[i_ul]);
561 
562 		NFT_PIPAPO_AVX2_BUCKET_LOAD4(1,  lt,  0,  pg[0], bsize);
563 		NFT_PIPAPO_AVX2_BUCKET_LOAD4(2,  lt,  1,  pg[1], bsize);
564 		NFT_PIPAPO_AVX2_BUCKET_LOAD4(3,  lt,  2,  pg[2], bsize);
565 		NFT_PIPAPO_AVX2_BUCKET_LOAD4(4,  lt,  3,  pg[3], bsize);
566 		if (!first) {
567 			NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nothing);
568 			NFT_PIPAPO_AVX2_AND(1, 1, 0);
569 		}
570 
571 		NFT_PIPAPO_AVX2_AND(5,   2,  3);
572 		NFT_PIPAPO_AVX2_BUCKET_LOAD4(6,  lt,  4,  pg[4], bsize);
573 		NFT_PIPAPO_AVX2_BUCKET_LOAD4(7,  lt,  5,  pg[5], bsize);
574 		NFT_PIPAPO_AVX2_AND(8,   1,  4);
575 		NFT_PIPAPO_AVX2_BUCKET_LOAD4(9,  lt,  6,  pg[6], bsize);
576 		NFT_PIPAPO_AVX2_AND(10,  5,  6);
577 		NFT_PIPAPO_AVX2_BUCKET_LOAD4(11, lt,  7,  pg[7], bsize);
578 		NFT_PIPAPO_AVX2_AND(12,  7,  8);
579 		NFT_PIPAPO_AVX2_BUCKET_LOAD4(13, lt,  8,  pg[8], bsize);
580 		NFT_PIPAPO_AVX2_AND(14,  9, 10);
581 
582 		NFT_PIPAPO_AVX2_BUCKET_LOAD4(0,  lt,  9,  pg[9], bsize);
583 		NFT_PIPAPO_AVX2_AND(1,  11, 12);
584 		NFT_PIPAPO_AVX2_BUCKET_LOAD4(2,  lt, 10, pg[10], bsize);
585 		NFT_PIPAPO_AVX2_BUCKET_LOAD4(3,  lt, 11, pg[11], bsize);
586 		NFT_PIPAPO_AVX2_AND(4,  13, 14);
587 		NFT_PIPAPO_AVX2_BUCKET_LOAD4(5,  lt, 12, pg[12], bsize);
588 		NFT_PIPAPO_AVX2_BUCKET_LOAD4(6,  lt, 13, pg[13], bsize);
589 		NFT_PIPAPO_AVX2_AND(7,   0,  1);
590 		NFT_PIPAPO_AVX2_BUCKET_LOAD4(8,  lt, 14, pg[14], bsize);
591 		NFT_PIPAPO_AVX2_AND(9,   2,  3);
592 		NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 15, pg[15], bsize);
593 		NFT_PIPAPO_AVX2_AND(11,  4,  5);
594 		NFT_PIPAPO_AVX2_BUCKET_LOAD4(12, lt, 16, pg[16], bsize);
595 		NFT_PIPAPO_AVX2_AND(13,  6,  7);
596 		NFT_PIPAPO_AVX2_BUCKET_LOAD4(14, lt, 17, pg[17], bsize);
597 
598 		NFT_PIPAPO_AVX2_AND(0,   8,  9);
599 		NFT_PIPAPO_AVX2_BUCKET_LOAD4(1,  lt, 18, pg[18], bsize);
600 		NFT_PIPAPO_AVX2_AND(2,  10, 11);
601 		NFT_PIPAPO_AVX2_BUCKET_LOAD4(3,  lt, 19, pg[19], bsize);
602 		NFT_PIPAPO_AVX2_AND(4,  12, 13);
603 		NFT_PIPAPO_AVX2_BUCKET_LOAD4(5,  lt, 20, pg[20], bsize);
604 		NFT_PIPAPO_AVX2_AND(6,  14,  0);
605 		NFT_PIPAPO_AVX2_AND(7,   1,  2);
606 		NFT_PIPAPO_AVX2_BUCKET_LOAD4(8,  lt, 21, pg[21], bsize);
607 		NFT_PIPAPO_AVX2_AND(9,   3,  4);
608 		NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 22, pg[22], bsize);
609 		NFT_PIPAPO_AVX2_AND(11,  5,  6);
610 		NFT_PIPAPO_AVX2_BUCKET_LOAD4(12, lt, 23, pg[23], bsize);
611 		NFT_PIPAPO_AVX2_AND(13,  7,  8);
612 
613 		NFT_PIPAPO_AVX2_BUCKET_LOAD4(14, lt, 24, pg[24], bsize);
614 		NFT_PIPAPO_AVX2_BUCKET_LOAD4(0,  lt, 25, pg[25], bsize);
615 		NFT_PIPAPO_AVX2_AND(1,   9, 10);
616 		NFT_PIPAPO_AVX2_AND(2,  11, 12);
617 		NFT_PIPAPO_AVX2_BUCKET_LOAD4(3,  lt, 26, pg[26], bsize);
618 		NFT_PIPAPO_AVX2_AND(4,  13, 14);
619 		NFT_PIPAPO_AVX2_BUCKET_LOAD4(5,  lt, 27, pg[27], bsize);
620 		NFT_PIPAPO_AVX2_AND(6,   0,  1);
621 		NFT_PIPAPO_AVX2_BUCKET_LOAD4(7,  lt, 28, pg[28], bsize);
622 		NFT_PIPAPO_AVX2_BUCKET_LOAD4(8,  lt, 29, pg[29], bsize);
623 		NFT_PIPAPO_AVX2_AND(9,   2,  3);
624 		NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 30, pg[30], bsize);
625 		NFT_PIPAPO_AVX2_AND(11,  4,  5);
626 		NFT_PIPAPO_AVX2_BUCKET_LOAD4(12, lt, 31, pg[31], bsize);
627 
628 		NFT_PIPAPO_AVX2_AND(0,   6,  7);
629 		NFT_PIPAPO_AVX2_AND(1,   8,  9);
630 		NFT_PIPAPO_AVX2_AND(2,  10, 11);
631 		NFT_PIPAPO_AVX2_AND(3,  12,  0);
632 
633 		/* Stalls */
634 		NFT_PIPAPO_AVX2_AND(4,   1,  2);
635 		NFT_PIPAPO_AVX2_AND(5,   3,  4);
636 
637 		NFT_PIPAPO_AVX2_NOMATCH_GOTO(5, nomatch);
638 		NFT_PIPAPO_AVX2_STORE(map[i_ul], 5);
639 
640 		b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
641 		if (last)
642 			return b;
643 
644 		if (unlikely(ret == -1))
645 			ret = b / XSAVE_YMM_SIZE;
646 
647 		continue;
648 nomatch:
649 		NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
650 nothing:
651 		;
652 	}
653 
654 	return ret;
655 }
656 
657 /**
658  * nft_pipapo_avx2_lookup_8b_1() - AVX2-based lookup for one eight-bit group
659  * @map:	Previous match result, used as initial bitmap
660  * @fill:	Destination bitmap to be filled with current match result
661  * @f:		Field, containing lookup and mapping tables
662  * @offset:	Ignore buckets before the given index, no bits are filled there
663  * @pkt:	Packet data, pointer to input nftables register
664  * @first:	If this is the first field, don't source previous result
665  * @last:	Last field: stop at the first match and return bit index
666  *
667  * See nft_pipapo_avx2_lookup_4b_2().
668  *
669  * This is used for 8-bit fields (i.e. protocol numbers).
670  *
671  * Return: -1 on no match, rule index of match if @last, otherwise first long
672  * word index to be checked next (i.e. first filled word).
673  */
674 static int nft_pipapo_avx2_lookup_8b_1(unsigned long *map, unsigned long *fill,
675 				       struct nft_pipapo_field *f, int offset,
676 				       const u8 *pkt, bool first, bool last)
677 {
678 	int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
679 	unsigned long *lt = f->lt, bsize = f->bsize;
680 
681 	lt += offset * NFT_PIPAPO_LONGS_PER_M256;
682 	for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
683 		int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
684 
685 		if (first) {
686 			NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 0, pkt[0], bsize);
687 		} else {
688 			NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize);
689 			NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]);
690 			NFT_PIPAPO_AVX2_AND(2, 0, 1);
691 			NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing);
692 		}
693 
694 		NFT_PIPAPO_AVX2_NOMATCH_GOTO(2, nomatch);
695 		NFT_PIPAPO_AVX2_STORE(map[i_ul], 2);
696 
697 		b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
698 		if (last)
699 			return b;
700 
701 		if (unlikely(ret == -1))
702 			ret = b / XSAVE_YMM_SIZE;
703 
704 		continue;
705 nomatch:
706 		NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
707 nothing:
708 		;
709 	}
710 
711 	return ret;
712 }
713 
714 /**
715  * nft_pipapo_avx2_lookup_8b_2() - AVX2-based lookup for 2 eight-bit groups
716  * @map:	Previous match result, used as initial bitmap
717  * @fill:	Destination bitmap to be filled with current match result
718  * @f:		Field, containing lookup and mapping tables
719  * @offset:	Ignore buckets before the given index, no bits are filled there
720  * @pkt:	Packet data, pointer to input nftables register
721  * @first:	If this is the first field, don't source previous result
722  * @last:	Last field: stop at the first match and return bit index
723  *
724  * See nft_pipapo_avx2_lookup_4b_2().
725  *
726  * This is used for 16-bit fields (i.e. ports).
727  *
728  * Return: -1 on no match, rule index of match if @last, otherwise first long
729  * word index to be checked next (i.e. first filled word).
730  */
731 static int nft_pipapo_avx2_lookup_8b_2(unsigned long *map, unsigned long *fill,
732 				       struct nft_pipapo_field *f, int offset,
733 				       const u8 *pkt, bool first, bool last)
734 {
735 	int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
736 	unsigned long *lt = f->lt, bsize = f->bsize;
737 
738 	lt += offset * NFT_PIPAPO_LONGS_PER_M256;
739 	for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
740 		int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
741 
742 		if (first) {
743 			NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize);
744 			NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 1, pkt[1], bsize);
745 			NFT_PIPAPO_AVX2_AND(4, 0, 1);
746 		} else {
747 			NFT_PIPAPO_AVX2_LOAD(0, map[i_ul]);
748 			NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 0, pkt[0], bsize);
749 			NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 1, pkt[1], bsize);
750 
751 			/* Stall */
752 			NFT_PIPAPO_AVX2_AND(3, 0, 1);
753 			NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nothing);
754 			NFT_PIPAPO_AVX2_AND(4, 3, 2);
755 		}
756 
757 		/* Stall */
758 		NFT_PIPAPO_AVX2_NOMATCH_GOTO(4, nomatch);
759 		NFT_PIPAPO_AVX2_STORE(map[i_ul], 4);
760 
761 		b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
762 		if (last)
763 			return b;
764 
765 		if (unlikely(ret == -1))
766 			ret = b / XSAVE_YMM_SIZE;
767 
768 		continue;
769 nomatch:
770 		NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
771 nothing:
772 		;
773 	}
774 
775 	return ret;
776 }
777 
778 /**
779  * nft_pipapo_avx2_lookup_8b_4() - AVX2-based lookup for 4 eight-bit groups
780  * @map:	Previous match result, used as initial bitmap
781  * @fill:	Destination bitmap to be filled with current match result
782  * @f:		Field, containing lookup and mapping tables
783  * @offset:	Ignore buckets before the given index, no bits are filled there
784  * @pkt:	Packet data, pointer to input nftables register
785  * @first:	If this is the first field, don't source previous result
786  * @last:	Last field: stop at the first match and return bit index
787  *
788  * See nft_pipapo_avx2_lookup_4b_2().
789  *
790  * This is used for 32-bit fields (i.e. IPv4 addresses).
791  *
792  * Return: -1 on no match, rule index of match if @last, otherwise first long
793  * word index to be checked next (i.e. first filled word).
794  */
static int nft_pipapo_avx2_lookup_8b_4(unsigned long *map, unsigned long *fill,
				       struct nft_pipapo_field *f, int offset,
				       const u8 *pkt, bool first, bool last)
{
	/* m256_size: number of 256-bit chunks in one bucket of f->bsize longs */
	int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
	unsigned long *lt = f->lt, bsize = f->bsize;

	/* Bucket loads below are indexed relative to lt: advancing it here
	 * skips the first @offset chunks of every bucket.
	 */
	lt += offset * NFT_PIPAPO_LONGS_PER_M256;
	for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
		/* Index in @map of the first long of the current chunk */
		int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;

		if (first) {
			/* No previous result to source: the match for this
			 * chunk is the AND of the four buckets selected by
			 * pkt[0] to pkt[3], one per group, left in ymm0.
			 */
			NFT_PIPAPO_AVX2_BUCKET_LOAD8(0,  lt, 0, pkt[0], bsize);
			NFT_PIPAPO_AVX2_BUCKET_LOAD8(1,  lt, 1, pkt[1], bsize);
			NFT_PIPAPO_AVX2_BUCKET_LOAD8(2,  lt, 2, pkt[2], bsize);
			NFT_PIPAPO_AVX2_BUCKET_LOAD8(3,  lt, 3, pkt[3], bsize);

			/* Stall */
			NFT_PIPAPO_AVX2_AND(4, 0, 1);
			NFT_PIPAPO_AVX2_AND(5, 2, 3);
			NFT_PIPAPO_AVX2_AND(0, 4, 5);
		} else {
			/* ymm1 is the previous field's result for this chunk:
			 * it's ANDed together with the four buckets, with the
			 * final result again in ymm0.
			 *
			 * NOTE(review): NFT_PIPAPO_AVX2_NOMATCH_GOTO() is
			 * defined outside this view; presumably it jumps to
			 * the label if the given register is all zeroes, so
			 * that chunks with no previous match are skipped
			 * without being written back -- confirm.
			 */
			NFT_PIPAPO_AVX2_BUCKET_LOAD8(0,  lt, 0, pkt[0], bsize);
			NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]);
			NFT_PIPAPO_AVX2_BUCKET_LOAD8(2,  lt, 1, pkt[1], bsize);
			NFT_PIPAPO_AVX2_BUCKET_LOAD8(3,  lt, 2, pkt[2], bsize);
			NFT_PIPAPO_AVX2_BUCKET_LOAD8(4,  lt, 3, pkt[3], bsize);

			NFT_PIPAPO_AVX2_AND(5, 0, 1);
			NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing);
			NFT_PIPAPO_AVX2_AND(6, 2, 3);

			/* Stall */
			NFT_PIPAPO_AVX2_AND(7, 4, 5);
			NFT_PIPAPO_AVX2_AND(0, 6, 7);
		}

		/* Write the result for this chunk back to @map, then let
		 * nft_pipapo_avx2_refill() (defined outside this view) process
		 * the bits set in it.
		 */
		NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nomatch);
		NFT_PIPAPO_AVX2_STORE(map[i_ul], 0);

		b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
		if (last)
			return b;

		/* Only the first value is kept: the caller passes it as
		 * @offset to the lookup function for the next field.
		 */
		if (unlikely(ret == -1))
			ret = b / XSAVE_YMM_SIZE;

		continue;

nomatch:
		/* NOTE(review): ymm15 is presumably the all-zeroes register
		 * set up by nft_pipapo_avx2_prepare(), making this a store of
		 * an empty result for the chunk -- confirm.
		 */
		NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
nothing:
		;
	}

	return ret;
}
852 
853 /**
854  * nft_pipapo_avx2_lookup_8b_6() - AVX2-based lookup for 6 eight-bit groups
855  * @map:	Previous match result, used as initial bitmap
856  * @fill:	Destination bitmap to be filled with current match result
857  * @f:		Field, containing lookup and mapping tables
858  * @offset:	Ignore buckets before the given index, no bits are filled there
859  * @pkt:	Packet data, pointer to input nftables register
860  * @first:	If this is the first field, don't source previous result
861  * @last:	Last field: stop at the first match and return bit index
862  *
863  * See nft_pipapo_avx2_lookup_4b_2().
864  *
865  * This is used for 48-bit fields (i.e. MAC addresses/EUI-48).
866  *
867  * Return: -1 on no match, rule index of match if @last, otherwise first long
868  * word index to be checked next (i.e. first filled word).
869  */
870 static int nft_pipapo_avx2_lookup_8b_6(unsigned long *map, unsigned long *fill,
871 				       struct nft_pipapo_field *f, int offset,
872 				       const u8 *pkt, bool first, bool last)
873 {
874 	int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
875 	unsigned long *lt = f->lt, bsize = f->bsize;
876 
877 	lt += offset * NFT_PIPAPO_LONGS_PER_M256;
878 	for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
879 		int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
880 
881 		if (first) {
882 			NFT_PIPAPO_AVX2_BUCKET_LOAD8(0,  lt, 0, pkt[0], bsize);
883 			NFT_PIPAPO_AVX2_BUCKET_LOAD8(1,  lt, 1, pkt[1], bsize);
884 			NFT_PIPAPO_AVX2_BUCKET_LOAD8(2,  lt, 2, pkt[2], bsize);
885 			NFT_PIPAPO_AVX2_BUCKET_LOAD8(3,  lt, 3, pkt[3], bsize);
886 			NFT_PIPAPO_AVX2_BUCKET_LOAD8(4,  lt, 4, pkt[4], bsize);
887 
888 			NFT_PIPAPO_AVX2_AND(5, 0, 1);
889 			NFT_PIPAPO_AVX2_BUCKET_LOAD8(6,  lt, 5, pkt[5], bsize);
890 			NFT_PIPAPO_AVX2_AND(7, 2, 3);
891 
892 			/* Stall */
893 			NFT_PIPAPO_AVX2_AND(0, 4, 5);
894 			NFT_PIPAPO_AVX2_AND(1, 6, 7);
895 			NFT_PIPAPO_AVX2_AND(4, 0, 1);
896 		} else {
897 			NFT_PIPAPO_AVX2_BUCKET_LOAD8(0,  lt, 0, pkt[0], bsize);
898 			NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]);
899 			NFT_PIPAPO_AVX2_BUCKET_LOAD8(2,  lt, 1, pkt[1], bsize);
900 			NFT_PIPAPO_AVX2_BUCKET_LOAD8(3,  lt, 2, pkt[2], bsize);
901 			NFT_PIPAPO_AVX2_BUCKET_LOAD8(4,  lt, 3, pkt[3], bsize);
902 
903 			NFT_PIPAPO_AVX2_AND(5, 0, 1);
904 			NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing);
905 
906 			NFT_PIPAPO_AVX2_AND(6, 2, 3);
907 			NFT_PIPAPO_AVX2_BUCKET_LOAD8(7,  lt, 4, pkt[4], bsize);
908 			NFT_PIPAPO_AVX2_AND(0, 4, 5);
909 			NFT_PIPAPO_AVX2_BUCKET_LOAD8(1,  lt, 5, pkt[5], bsize);
910 			NFT_PIPAPO_AVX2_AND(2, 6, 7);
911 
912 			/* Stall */
913 			NFT_PIPAPO_AVX2_AND(3, 0, 1);
914 			NFT_PIPAPO_AVX2_AND(4, 2, 3);
915 		}
916 
917 		NFT_PIPAPO_AVX2_NOMATCH_GOTO(4, nomatch);
918 		NFT_PIPAPO_AVX2_STORE(map[i_ul], 4);
919 
920 		b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
921 		if (last)
922 			return b;
923 
924 		if (unlikely(ret == -1))
925 			ret = b / XSAVE_YMM_SIZE;
926 
927 		continue;
928 
929 nomatch:
930 		NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
931 nothing:
932 		;
933 	}
934 
935 	return ret;
936 }
937 
938 /**
939  * nft_pipapo_avx2_lookup_8b_16() - AVX2-based lookup for 16 eight-bit groups
940  * @map:	Previous match result, used as initial bitmap
941  * @fill:	Destination bitmap to be filled with current match result
942  * @f:		Field, containing lookup and mapping tables
943  * @offset:	Ignore buckets before the given index, no bits are filled there
944  * @pkt:	Packet data, pointer to input nftables register
945  * @first:	If this is the first field, don't source previous result
946  * @last:	Last field: stop at the first match and return bit index
947  *
948  * See nft_pipapo_avx2_lookup_4b_2().
949  *
950  * This is used for 128-bit fields (i.e. IPv6 addresses).
951  *
952  * Return: -1 on no match, rule index of match if @last, otherwise first long
953  * word index to be checked next (i.e. first filled word).
954  */
955 static int nft_pipapo_avx2_lookup_8b_16(unsigned long *map, unsigned long *fill,
956 					struct nft_pipapo_field *f, int offset,
957 					const u8 *pkt, bool first, bool last)
958 {
959 	int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
960 	unsigned long *lt = f->lt, bsize = f->bsize;
961 
962 	lt += offset * NFT_PIPAPO_LONGS_PER_M256;
963 	for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
964 		int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
965 
966 		if (!first)
967 			NFT_PIPAPO_AVX2_LOAD(0, map[i_ul]);
968 
969 		NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt,  0,  pkt[0], bsize);
970 		NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt,  1,  pkt[1], bsize);
971 		NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt,  2,  pkt[2], bsize);
972 		if (!first) {
973 			NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nothing);
974 			NFT_PIPAPO_AVX2_AND(1, 1, 0);
975 		}
976 		NFT_PIPAPO_AVX2_BUCKET_LOAD8(4, lt,  3,  pkt[3], bsize);
977 
978 		NFT_PIPAPO_AVX2_BUCKET_LOAD8(5, lt,  4,  pkt[4], bsize);
979 		NFT_PIPAPO_AVX2_AND(6, 1, 2);
980 		NFT_PIPAPO_AVX2_BUCKET_LOAD8(7, lt,  5,  pkt[5], bsize);
981 		NFT_PIPAPO_AVX2_AND(0, 3, 4);
982 		NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt,  6,  pkt[6], bsize);
983 
984 		NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt,  7,  pkt[7], bsize);
985 		NFT_PIPAPO_AVX2_AND(3, 5, 6);
986 		NFT_PIPAPO_AVX2_AND(4, 0, 1);
987 		NFT_PIPAPO_AVX2_BUCKET_LOAD8(5, lt,  8,  pkt[8], bsize);
988 
989 		NFT_PIPAPO_AVX2_AND(6, 2, 3);
990 		NFT_PIPAPO_AVX2_BUCKET_LOAD8(7, lt,  9,  pkt[9], bsize);
991 		NFT_PIPAPO_AVX2_AND(0, 4, 5);
992 		NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 10, pkt[10], bsize);
993 		NFT_PIPAPO_AVX2_AND(2, 6, 7);
994 		NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 11, pkt[11], bsize);
995 		NFT_PIPAPO_AVX2_AND(4, 0, 1);
996 		NFT_PIPAPO_AVX2_BUCKET_LOAD8(5, lt, 12, pkt[12], bsize);
997 		NFT_PIPAPO_AVX2_AND(6, 2, 3);
998 		NFT_PIPAPO_AVX2_BUCKET_LOAD8(7, lt, 13, pkt[13], bsize);
999 		NFT_PIPAPO_AVX2_AND(0, 4, 5);
1000 		NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 14, pkt[14], bsize);
1001 		NFT_PIPAPO_AVX2_AND(2, 6, 7);
1002 		NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 15, pkt[15], bsize);
1003 		NFT_PIPAPO_AVX2_AND(4, 0, 1);
1004 
1005 		/* Stall */
1006 		NFT_PIPAPO_AVX2_AND(5, 2, 3);
1007 		NFT_PIPAPO_AVX2_AND(6, 4, 5);
1008 
1009 		NFT_PIPAPO_AVX2_NOMATCH_GOTO(6, nomatch);
1010 		NFT_PIPAPO_AVX2_STORE(map[i_ul], 6);
1011 
1012 		b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
1013 		if (last)
1014 			return b;
1015 
1016 		if (unlikely(ret == -1))
1017 			ret = b / XSAVE_YMM_SIZE;
1018 
1019 		continue;
1020 
1021 nomatch:
1022 		NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
1023 nothing:
1024 		;
1025 	}
1026 
1027 	return ret;
1028 }
1029 
1030 /**
1031  * nft_pipapo_avx2_lookup_slow() - Fallback function for uncommon field sizes
1032  * @map:	Previous match result, used as initial bitmap
1033  * @fill:	Destination bitmap to be filled with current match result
1034  * @f:		Field, containing lookup and mapping tables
1035  * @offset:	Ignore buckets before the given index, no bits are filled there
1036  * @pkt:	Packet data, pointer to input nftables register
1037  * @first:	If this is the first field, don't source previous result
1038  * @last:	Last field: stop at the first match and return bit index
1039  *
1040  * This function should never be called, but is provided for the case the field
1041  * size doesn't match any of the known data types. Matching rate is
1042  * substantially lower than AVX2 routines.
1043  *
1044  * Return: -1 on no match, rule index of match if @last, otherwise first long
1045  * word index to be checked next (i.e. first filled word).
1046  */
1047 static int nft_pipapo_avx2_lookup_slow(unsigned long *map, unsigned long *fill,
1048 					struct nft_pipapo_field *f, int offset,
1049 					const u8 *pkt, bool first, bool last)
1050 {
1051 	unsigned long bsize = f->bsize;
1052 	int i, ret = -1, b;
1053 
1054 	if (first)
1055 		memset(map, 0xff, bsize * sizeof(*map));
1056 
1057 	for (i = offset; i < bsize; i++) {
1058 		if (f->bb == 8)
1059 			pipapo_and_field_buckets_8bit(f, map, pkt);
1060 		else
1061 			pipapo_and_field_buckets_4bit(f, map, pkt);
1062 		NFT_PIPAPO_GROUP_BITS_ARE_8_OR_4;
1063 
1064 		b = pipapo_refill(map, bsize, f->rules, fill, f->mt, last);
1065 
1066 		if (last)
1067 			return b;
1068 
1069 		if (ret == -1)
1070 			ret = b / XSAVE_YMM_SIZE;
1071 	}
1072 
1073 	return ret;
1074 }
1075 
1076 /**
1077  * nft_pipapo_avx2_estimate() - Set size, space and lookup complexity
1078  * @desc:	Set description, element count and field description used
1079  * @features:	Flags: NFT_SET_INTERVAL needs to be there
1080  * @est:	Storage for estimation data
1081  *
1082  * Return: true if set is compatible and AVX2 available, false otherwise.
1083  */
1084 bool nft_pipapo_avx2_estimate(const struct nft_set_desc *desc, u32 features,
1085 			      struct nft_set_estimate *est)
1086 {
1087 	if (!(features & NFT_SET_INTERVAL) ||
1088 	    desc->field_count < NFT_PIPAPO_MIN_FIELDS)
1089 		return false;
1090 
1091 	if (!boot_cpu_has(X86_FEATURE_AVX2) || !boot_cpu_has(X86_FEATURE_AVX))
1092 		return false;
1093 
1094 	est->size = pipapo_estimate_size(desc);
1095 	if (!est->size)
1096 		return false;
1097 
1098 	est->lookup = NFT_SET_CLASS_O_LOG_N;
1099 
1100 	est->space = NFT_SET_CLASS_O_N;
1101 
1102 	return true;
1103 }
1104 
1105 /**
1106  * nft_pipapo_avx2_lookup() - Lookup function for AVX2 implementation
1107  * @net:	Network namespace
1108  * @set:	nftables API set representation
1109  * @key:	nftables API element representation containing key data
1110  * @ext:	nftables API extension pointer, filled with matching reference
1111  *
1112  * For more details, see DOC: Theory of Operation in nft_set_pipapo.c.
1113  *
1114  * This implementation exploits the repetitive characteristic of the algorithm
1115  * to provide a fast, vectorised version using the AVX2 SIMD instruction set.
1116  *
1117  * Return: true on match, false otherwise.
1118  */
1119 bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set,
1120 			    const u32 *key, const struct nft_set_ext **ext)
1121 {
1122 	struct nft_pipapo *priv = nft_set_priv(set);
1123 	unsigned long *res, *fill, *scratch;
1124 	u8 genmask = nft_genmask_cur(net);
1125 	const u8 *rp = (const u8 *)key;
1126 	struct nft_pipapo_match *m;
1127 	struct nft_pipapo_field *f;
1128 	bool map_index;
1129 	int i, ret = 0;
1130 
1131 	if (unlikely(!irq_fpu_usable()))
1132 		return nft_pipapo_lookup(net, set, key, ext);
1133 
1134 	m = rcu_dereference(priv->match);
1135 
1136 	/* This also protects access to all data related to scratch maps.
1137 	 *
1138 	 * Note that we don't need a valid MXCSR state for any of the
1139 	 * operations we use here, so pass 0 as mask and spare a LDMXCSR
1140 	 * instruction.
1141 	 */
1142 	kernel_fpu_begin_mask(0);
1143 
1144 	scratch = *raw_cpu_ptr(m->scratch_aligned);
1145 	if (unlikely(!scratch)) {
1146 		kernel_fpu_end();
1147 		return false;
1148 	}
1149 	map_index = raw_cpu_read(nft_pipapo_avx2_scratch_index);
1150 
1151 	res  = scratch + (map_index ? m->bsize_max : 0);
1152 	fill = scratch + (map_index ? 0 : m->bsize_max);
1153 
1154 	/* Starting map doesn't need to be set for this implementation */
1155 
1156 	nft_pipapo_avx2_prepare();
1157 
1158 next_match:
1159 	nft_pipapo_for_each_field(f, i, m) {
1160 		bool last = i == m->field_count - 1, first = !i;
1161 
1162 #define NFT_SET_PIPAPO_AVX2_LOOKUP(b, n)				\
1163 		(ret = nft_pipapo_avx2_lookup_##b##b_##n(res, fill, f,	\
1164 							 ret, rp,	\
1165 							 first, last))
1166 
1167 		if (likely(f->bb == 8)) {
1168 			if (f->groups == 1) {
1169 				NFT_SET_PIPAPO_AVX2_LOOKUP(8, 1);
1170 			} else if (f->groups == 2) {
1171 				NFT_SET_PIPAPO_AVX2_LOOKUP(8, 2);
1172 			} else if (f->groups == 4) {
1173 				NFT_SET_PIPAPO_AVX2_LOOKUP(8, 4);
1174 			} else if (f->groups == 6) {
1175 				NFT_SET_PIPAPO_AVX2_LOOKUP(8, 6);
1176 			} else if (f->groups == 16) {
1177 				NFT_SET_PIPAPO_AVX2_LOOKUP(8, 16);
1178 			} else {
1179 				ret = nft_pipapo_avx2_lookup_slow(res, fill, f,
1180 								  ret, rp,
1181 								  first, last);
1182 			}
1183 		} else {
1184 			if (f->groups == 2) {
1185 				NFT_SET_PIPAPO_AVX2_LOOKUP(4, 2);
1186 			} else if (f->groups == 4) {
1187 				NFT_SET_PIPAPO_AVX2_LOOKUP(4, 4);
1188 			} else if (f->groups == 8) {
1189 				NFT_SET_PIPAPO_AVX2_LOOKUP(4, 8);
1190 			} else if (f->groups == 12) {
1191 				NFT_SET_PIPAPO_AVX2_LOOKUP(4, 12);
1192 			} else if (f->groups == 32) {
1193 				NFT_SET_PIPAPO_AVX2_LOOKUP(4, 32);
1194 			} else {
1195 				ret = nft_pipapo_avx2_lookup_slow(res, fill, f,
1196 								  ret, rp,
1197 								  first, last);
1198 			}
1199 		}
1200 		NFT_PIPAPO_GROUP_BITS_ARE_8_OR_4;
1201 
1202 #undef NFT_SET_PIPAPO_AVX2_LOOKUP
1203 
1204 		if (ret < 0)
1205 			goto out;
1206 
1207 		if (last) {
1208 			*ext = &f->mt[ret].e->ext;
1209 			if (unlikely(nft_set_elem_expired(*ext) ||
1210 				     !nft_set_elem_active(*ext, genmask))) {
1211 				ret = 0;
1212 				goto next_match;
1213 			}
1214 
1215 			goto out;
1216 		}
1217 
1218 		swap(res, fill);
1219 		rp += NFT_PIPAPO_GROUPS_PADDED_SIZE(f);
1220 	}
1221 
1222 out:
1223 	if (i % 2)
1224 		raw_cpu_write(nft_pipapo_avx2_scratch_index, !map_index);
1225 	kernel_fpu_end();
1226 
1227 	return ret >= 0;
1228 }
1229