1 /*
2  * Copyright (C) 2016-2018 Netronome Systems, Inc.
3  *
4  * This software is dual licensed under the GNU General Public License Version 2,
5  * June 1991 as shown in the file COPYING in the top-level directory of this
6  * source tree or the BSD 2-Clause License provided below.  You have the
7  * option to license this software under the complete terms of either license.
8  *
9  * The BSD 2-Clause License:
10  *
11  *     Redistribution and use in source and binary forms, with or
12  *     without modification, are permitted provided that the following
13  *     conditions are met:
14  *
15  *      1. Redistributions of source code must retain the above
16  *         copyright notice, this list of conditions and the following
17  *         disclaimer.
18  *
19  *      2. Redistributions in binary form must reproduce the above
20  *         copyright notice, this list of conditions and the following
21  *         disclaimer in the documentation and/or other materials
22  *         provided with the distribution.
23  *
24  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
27  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
28  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
29  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
30  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
31  * SOFTWARE.
32  */
33 
34 #define pr_fmt(fmt)	"NFP net bpf: " fmt
35 
36 #include <linux/bug.h>
37 #include <linux/bpf.h>
38 #include <linux/filter.h>
39 #include <linux/kernel.h>
40 #include <linux/pkt_cls.h>
41 #include <linux/reciprocal_div.h>
42 #include <linux/unistd.h>
43 
44 #include "main.h"
45 #include "../nfp_asm.h"
46 #include "../nfp_net_ctrl.h"
47 
48 /* --- NFP prog --- */
49 /* The foreach macros walking "multiple" entries provide pos and next<n>
50  * pointers.  It's safe to modify the next pointers (but not pos).
51  */
52 #define nfp_for_each_insn_walk2(nfp_prog, pos, next)			\
53 	for (pos = list_first_entry(&(nfp_prog)->insns, typeof(*pos), l), \
54 	     next = list_next_entry(pos, l);			\
55 	     &(nfp_prog)->insns != &pos->l &&			\
56 	     &(nfp_prog)->insns != &next->l;			\
57 	     pos = nfp_meta_next(pos),				\
58 	     next = nfp_meta_next(pos))
59 
60 #define nfp_for_each_insn_walk3(nfp_prog, pos, next, next2)		\
61 	for (pos = list_first_entry(&(nfp_prog)->insns, typeof(*pos), l), \
62 	     next = list_next_entry(pos, l),			\
63 	     next2 = list_next_entry(next, l);			\
64 	     &(nfp_prog)->insns != &pos->l &&			\
65 	     &(nfp_prog)->insns != &next->l &&			\
66 	     &(nfp_prog)->insns != &next2->l;			\
67 	     pos = nfp_meta_next(pos),				\
68 	     next = nfp_meta_next(pos),				\
69 	     next2 = nfp_meta_next(next))
70 
71 static bool
72 nfp_meta_has_prev(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
73 {
74 	return meta->l.prev != &nfp_prog->insns;
75 }
76 
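/* Append one 64-bit NFP instruction word to the program image.  Records
 * -ENOSPC in nfp_prog->error if the preallocated buffer is already full.
 */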
77 static void nfp_prog_push(struct nfp_prog *nfp_prog, u64 insn)
78 {
79 	if (nfp_prog->__prog_alloc_len / sizeof(u64) == nfp_prog->prog_len) {
80 		pr_warn("instruction limit reached (%u NFP instructions)\n",
81 			nfp_prog->prog_len);
82 		nfp_prog->error = -ENOSPC;
83 		return;
84 	}
85 
86 	nfp_prog->prog[nfp_prog->prog_len] = insn;
87 	nfp_prog->prog_len++;
88 }
89 
90 static unsigned int nfp_prog_current_offset(struct nfp_prog *nfp_prog)
91 {
92 	return nfp_prog->prog_len;
93 }
94 
95 static bool
96 nfp_prog_confirm_current_offset(struct nfp_prog *nfp_prog, unsigned int off)
97 {
98 	/* If there is a recorded error we may have dropped instructions;
99 	 * that doesn't have to be due to a translator bug, and the translation
100 	 * will fail anyway, so just return OK.
101 	 */
102 	if (nfp_prog->error)
103 		return true;
104 	return !WARN_ON_ONCE(nfp_prog_current_offset(nfp_prog) != off);
105 }
106 
107 /* --- Emitters --- */
108 static void
109 __emit_cmd(struct nfp_prog *nfp_prog, enum cmd_tgt_map op,
110 	   u8 mode, u8 xfer, u8 areg, u8 breg, u8 size, enum cmd_ctx_swap ctx,
111 	   bool indir)
112 {
113 	u64 insn;
114 
115 	insn =	FIELD_PREP(OP_CMD_A_SRC, areg) |
116 		FIELD_PREP(OP_CMD_CTX, ctx) |
117 		FIELD_PREP(OP_CMD_B_SRC, breg) |
118 		FIELD_PREP(OP_CMD_TOKEN, cmd_tgt_act[op].token) |
119 		FIELD_PREP(OP_CMD_XFER, xfer) |
120 		FIELD_PREP(OP_CMD_CNT, size) |
121 		FIELD_PREP(OP_CMD_SIG, ctx != CMD_CTX_NO_SWAP) |
122 		FIELD_PREP(OP_CMD_TGT_CMD, cmd_tgt_act[op].tgt_cmd) |
123 		FIELD_PREP(OP_CMD_INDIR, indir) |
124 		FIELD_PREP(OP_CMD_MODE, mode);
125 
126 	nfp_prog_push(nfp_prog, insn);
127 }
128 
129 static void
130 emit_cmd_any(struct nfp_prog *nfp_prog, enum cmd_tgt_map op, u8 mode, u8 xfer,
131 	     swreg lreg, swreg rreg, u8 size, enum cmd_ctx_swap ctx, bool indir)
132 {
133 	struct nfp_insn_re_regs reg;
134 	int err;
135 
136 	err = swreg_to_restricted(reg_none(), lreg, rreg, &reg, false);
137 	if (err) {
138 		nfp_prog->error = err;
139 		return;
140 	}
141 	if (reg.swap) {
142 		pr_err("cmd can't swap arguments\n");
143 		nfp_prog->error = -EFAULT;
144 		return;
145 	}
146 	if (reg.dst_lmextn || reg.src_lmextn) {
147 		pr_err("cmd can't use LMextn\n");
148 		nfp_prog->error = -EFAULT;
149 		return;
150 	}
151 
152 	__emit_cmd(nfp_prog, op, mode, xfer, reg.areg, reg.breg, size, ctx,
153 		   indir);
154 }
155 
156 static void
157 emit_cmd(struct nfp_prog *nfp_prog, enum cmd_tgt_map op, u8 mode, u8 xfer,
158 	 swreg lreg, swreg rreg, u8 size, enum cmd_ctx_swap ctx)
159 {
160 	emit_cmd_any(nfp_prog, op, mode, xfer, lreg, rreg, size, ctx, false);
161 }
162 
163 static void
164 emit_cmd_indir(struct nfp_prog *nfp_prog, enum cmd_tgt_map op, u8 mode, u8 xfer,
165 	       swreg lreg, swreg rreg, u8 size, enum cmd_ctx_swap ctx)
166 {
167 	emit_cmd_any(nfp_prog, op, mode, xfer, lreg, rreg, size, ctx, true);
168 }
169 
170 static void
171 __emit_br(struct nfp_prog *nfp_prog, enum br_mask mask, enum br_ev_pip ev_pip,
172 	  enum br_ctx_signal_state css, u16 addr, u8 defer)
173 {
174 	u16 addr_lo, addr_hi;
175 	u64 insn;
176 
177 	addr_lo = addr & (OP_BR_ADDR_LO >> __bf_shf(OP_BR_ADDR_LO));
178 	addr_hi = addr != addr_lo;
179 
180 	insn = OP_BR_BASE |
181 		FIELD_PREP(OP_BR_MASK, mask) |
182 		FIELD_PREP(OP_BR_EV_PIP, ev_pip) |
183 		FIELD_PREP(OP_BR_CSS, css) |
184 		FIELD_PREP(OP_BR_DEFBR, defer) |
185 		FIELD_PREP(OP_BR_ADDR_LO, addr_lo) |
186 		FIELD_PREP(OP_BR_ADDR_HI, addr_hi);
187 
188 	nfp_prog_push(nfp_prog, insn);
189 }
190 
191 static void
192 emit_br_relo(struct nfp_prog *nfp_prog, enum br_mask mask, u16 addr, u8 defer,
193 	     enum nfp_relo_type relo)
194 {
195 	if (mask == BR_UNC && defer > 2) {
196 		pr_err("BUG: branch defer out of bounds %d\n", defer);
197 		nfp_prog->error = -EFAULT;
198 		return;
199 	}
200 
201 	__emit_br(nfp_prog, mask,
202 		  mask != BR_UNC ? BR_EV_PIP_COND : BR_EV_PIP_UNCOND,
203 		  BR_CSS_NONE, addr, defer);
204 
205 	nfp_prog->prog[nfp_prog->prog_len - 1] |=
206 		FIELD_PREP(OP_RELO_TYPE, relo);
207 }
208 
209 static void
210 emit_br(struct nfp_prog *nfp_prog, enum br_mask mask, u16 addr, u8 defer)
211 {
212 	emit_br_relo(nfp_prog, mask, addr, defer, RELO_BR_REL);
213 }
214 
215 static void
216 __emit_br_bit(struct nfp_prog *nfp_prog, u16 areg, u16 breg, u16 addr, u8 defer,
217 	      bool set, bool src_lmextn)
218 {
219 	u16 addr_lo, addr_hi;
220 	u64 insn;
221 
222 	addr_lo = addr & (OP_BR_BIT_ADDR_LO >> __bf_shf(OP_BR_BIT_ADDR_LO));
223 	addr_hi = addr != addr_lo;
224 
225 	insn = OP_BR_BIT_BASE |
226 		FIELD_PREP(OP_BR_BIT_A_SRC, areg) |
227 		FIELD_PREP(OP_BR_BIT_B_SRC, breg) |
228 		FIELD_PREP(OP_BR_BIT_BV, set) |
229 		FIELD_PREP(OP_BR_BIT_DEFBR, defer) |
230 		FIELD_PREP(OP_BR_BIT_ADDR_LO, addr_lo) |
231 		FIELD_PREP(OP_BR_BIT_ADDR_HI, addr_hi) |
232 		FIELD_PREP(OP_BR_BIT_SRC_LMEXTN, src_lmextn);
233 
234 	nfp_prog_push(nfp_prog, insn);
235 }
236 
237 static void
238 emit_br_bit_relo(struct nfp_prog *nfp_prog, swreg src, u8 bit, u16 addr,
239 		 u8 defer, bool set, enum nfp_relo_type relo)
240 {
241 	struct nfp_insn_re_regs reg;
242 	int err;
243 
244 	/* NOTE: The bit to test is specified as a rotation amount, such that
245 	 *	 the bit to test will be placed on the MSB of the result when
246 	 *	 doing a rotate right. For bit X, we need right rotate X + 1.
247 	 */
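	/* For example, testing bit 0 requires a right rotate by 1 so that
	 * bit 0 ends up in the MSB.
	 */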
248 	bit += 1;
249 
250 	err = swreg_to_restricted(reg_none(), src, reg_imm(bit), &reg, false);
251 	if (err) {
252 		nfp_prog->error = err;
253 		return;
254 	}
255 
256 	__emit_br_bit(nfp_prog, reg.areg, reg.breg, addr, defer, set,
257 		      reg.src_lmextn);
258 
259 	nfp_prog->prog[nfp_prog->prog_len - 1] |=
260 		FIELD_PREP(OP_RELO_TYPE, relo);
261 }
262 
263 static void
264 emit_br_bset(struct nfp_prog *nfp_prog, swreg src, u8 bit, u16 addr, u8 defer)
265 {
266 	emit_br_bit_relo(nfp_prog, src, bit, addr, defer, true, RELO_BR_REL);
267 }
268 
269 static void
270 __emit_immed(struct nfp_prog *nfp_prog, u16 areg, u16 breg, u16 imm_hi,
271 	     enum immed_width width, bool invert,
272 	     enum immed_shift shift, bool wr_both,
273 	     bool dst_lmextn, bool src_lmextn)
274 {
275 	u64 insn;
276 
277 	insn = OP_IMMED_BASE |
278 		FIELD_PREP(OP_IMMED_A_SRC, areg) |
279 		FIELD_PREP(OP_IMMED_B_SRC, breg) |
280 		FIELD_PREP(OP_IMMED_IMM, imm_hi) |
281 		FIELD_PREP(OP_IMMED_WIDTH, width) |
282 		FIELD_PREP(OP_IMMED_INV, invert) |
283 		FIELD_PREP(OP_IMMED_SHIFT, shift) |
284 		FIELD_PREP(OP_IMMED_WR_AB, wr_both) |
285 		FIELD_PREP(OP_IMMED_SRC_LMEXTN, src_lmextn) |
286 		FIELD_PREP(OP_IMMED_DST_LMEXTN, dst_lmextn);
287 
288 	nfp_prog_push(nfp_prog, insn);
289 }
290 
291 static void
292 emit_immed(struct nfp_prog *nfp_prog, swreg dst, u16 imm,
293 	   enum immed_width width, bool invert, enum immed_shift shift)
294 {
295 	struct nfp_insn_ur_regs reg;
296 	int err;
297 
298 	if (swreg_type(dst) == NN_REG_IMM) {
299 		nfp_prog->error = -EFAULT;
300 		return;
301 	}
302 
303 	err = swreg_to_unrestricted(dst, dst, reg_imm(imm & 0xff), &reg);
304 	if (err) {
305 		nfp_prog->error = err;
306 		return;
307 	}
308 
309 	/* Use reg.dst when destination is No-Dest. */
310 	__emit_immed(nfp_prog,
311 		     swreg_type(dst) == NN_REG_NONE ? reg.dst : reg.areg,
312 		     reg.breg, imm >> 8, width, invert, shift,
313 		     reg.wr_both, reg.dst_lmextn, reg.src_lmextn);
314 }
315 
316 static void
317 __emit_shf(struct nfp_prog *nfp_prog, u16 dst, enum alu_dst_ab dst_ab,
318 	   enum shf_sc sc, u8 shift,
319 	   u16 areg, enum shf_op op, u16 breg, bool i8, bool sw, bool wr_both,
320 	   bool dst_lmextn, bool src_lmextn)
321 {
322 	u64 insn;
323 
324 	if (!FIELD_FIT(OP_SHF_SHIFT, shift)) {
325 		nfp_prog->error = -EFAULT;
326 		return;
327 	}
328 
329 	if (sc == SHF_SC_L_SHF)
330 		shift = 32 - shift;
331 
332 	insn = OP_SHF_BASE |
333 		FIELD_PREP(OP_SHF_A_SRC, areg) |
334 		FIELD_PREP(OP_SHF_SC, sc) |
335 		FIELD_PREP(OP_SHF_B_SRC, breg) |
336 		FIELD_PREP(OP_SHF_I8, i8) |
337 		FIELD_PREP(OP_SHF_SW, sw) |
338 		FIELD_PREP(OP_SHF_DST, dst) |
339 		FIELD_PREP(OP_SHF_SHIFT, shift) |
340 		FIELD_PREP(OP_SHF_OP, op) |
341 		FIELD_PREP(OP_SHF_DST_AB, dst_ab) |
342 		FIELD_PREP(OP_SHF_WR_AB, wr_both) |
343 		FIELD_PREP(OP_SHF_SRC_LMEXTN, src_lmextn) |
344 		FIELD_PREP(OP_SHF_DST_LMEXTN, dst_lmextn);
345 
346 	nfp_prog_push(nfp_prog, insn);
347 }
348 
349 static void
350 emit_shf(struct nfp_prog *nfp_prog, swreg dst,
351 	 swreg lreg, enum shf_op op, swreg rreg, enum shf_sc sc, u8 shift)
352 {
353 	struct nfp_insn_re_regs reg;
354 	int err;
355 
356 	err = swreg_to_restricted(dst, lreg, rreg, &reg, true);
357 	if (err) {
358 		nfp_prog->error = err;
359 		return;
360 	}
361 
362 	__emit_shf(nfp_prog, reg.dst, reg.dst_ab, sc, shift,
363 		   reg.areg, op, reg.breg, reg.i8, reg.swap, reg.wr_both,
364 		   reg.dst_lmextn, reg.src_lmextn);
365 }
366 
367 static void
368 emit_shf_indir(struct nfp_prog *nfp_prog, swreg dst,
369 	       swreg lreg, enum shf_op op, swreg rreg, enum shf_sc sc)
370 {
371 	if (sc == SHF_SC_R_ROT) {
372 		pr_err("indirect shift is not allowed on rotation\n");
373 		nfp_prog->error = -EFAULT;
374 		return;
375 	}
376 
377 	emit_shf(nfp_prog, dst, lreg, op, rreg, sc, 0);
378 }
379 
380 static void
381 __emit_alu(struct nfp_prog *nfp_prog, u16 dst, enum alu_dst_ab dst_ab,
382 	   u16 areg, enum alu_op op, u16 breg, bool swap, bool wr_both,
383 	   bool dst_lmextn, bool src_lmextn)
384 {
385 	u64 insn;
386 
387 	insn = OP_ALU_BASE |
388 		FIELD_PREP(OP_ALU_A_SRC, areg) |
389 		FIELD_PREP(OP_ALU_B_SRC, breg) |
390 		FIELD_PREP(OP_ALU_DST, dst) |
391 		FIELD_PREP(OP_ALU_SW, swap) |
392 		FIELD_PREP(OP_ALU_OP, op) |
393 		FIELD_PREP(OP_ALU_DST_AB, dst_ab) |
394 		FIELD_PREP(OP_ALU_WR_AB, wr_both) |
395 		FIELD_PREP(OP_ALU_SRC_LMEXTN, src_lmextn) |
396 		FIELD_PREP(OP_ALU_DST_LMEXTN, dst_lmextn);
397 
398 	nfp_prog_push(nfp_prog, insn);
399 }
400 
401 static void
402 emit_alu(struct nfp_prog *nfp_prog, swreg dst,
403 	 swreg lreg, enum alu_op op, swreg rreg)
404 {
405 	struct nfp_insn_ur_regs reg;
406 	int err;
407 
408 	err = swreg_to_unrestricted(dst, lreg, rreg, &reg);
409 	if (err) {
410 		nfp_prog->error = err;
411 		return;
412 	}
413 
414 	__emit_alu(nfp_prog, reg.dst, reg.dst_ab,
415 		   reg.areg, op, reg.breg, reg.swap, reg.wr_both,
416 		   reg.dst_lmextn, reg.src_lmextn);
417 }
418 
419 static void
420 __emit_mul(struct nfp_prog *nfp_prog, enum alu_dst_ab dst_ab, u16 areg,
421 	   enum mul_type type, enum mul_step step, u16 breg, bool swap,
422 	   bool wr_both, bool dst_lmextn, bool src_lmextn)
423 {
424 	u64 insn;
425 
426 	insn = OP_MUL_BASE |
427 		FIELD_PREP(OP_MUL_A_SRC, areg) |
428 		FIELD_PREP(OP_MUL_B_SRC, breg) |
429 		FIELD_PREP(OP_MUL_STEP, step) |
430 		FIELD_PREP(OP_MUL_DST_AB, dst_ab) |
431 		FIELD_PREP(OP_MUL_SW, swap) |
432 		FIELD_PREP(OP_MUL_TYPE, type) |
433 		FIELD_PREP(OP_MUL_WR_AB, wr_both) |
434 		FIELD_PREP(OP_MUL_SRC_LMEXTN, src_lmextn) |
435 		FIELD_PREP(OP_MUL_DST_LMEXTN, dst_lmextn);
436 
437 	nfp_prog_push(nfp_prog, insn);
438 }
439 
440 static void
441 emit_mul(struct nfp_prog *nfp_prog, swreg lreg, enum mul_type type,
442 	 enum mul_step step, swreg rreg)
443 {
444 	struct nfp_insn_ur_regs reg;
445 	u16 areg;
446 	int err;
447 
448 	if (type == MUL_TYPE_START && step != MUL_STEP_NONE) {
449 		nfp_prog->error = -EINVAL;
450 		return;
451 	}
452 
453 	if (step == MUL_LAST || step == MUL_LAST_2) {
454 		/* For the MUL_LAST and MUL_LAST_2 steps the left source is
455 		 * used as the destination.
456 		 */
457 		err = swreg_to_unrestricted(lreg, reg_none(), rreg, &reg);
458 		areg = reg.dst;
459 	} else {
460 		err = swreg_to_unrestricted(reg_none(), lreg, rreg, &reg);
461 		areg = reg.areg;
462 	}
463 
464 	if (err) {
465 		nfp_prog->error = err;
466 		return;
467 	}
468 
469 	__emit_mul(nfp_prog, reg.dst_ab, areg, type, step, reg.breg, reg.swap,
470 		   reg.wr_both, reg.dst_lmextn, reg.src_lmextn);
471 }
472 
473 static void
474 __emit_ld_field(struct nfp_prog *nfp_prog, enum shf_sc sc,
475 		u8 areg, u8 bmask, u8 breg, u8 shift, bool imm8,
476 		bool zero, bool swap, bool wr_both,
477 		bool dst_lmextn, bool src_lmextn)
478 {
479 	u64 insn;
480 
481 	insn = OP_LDF_BASE |
482 		FIELD_PREP(OP_LDF_A_SRC, areg) |
483 		FIELD_PREP(OP_LDF_SC, sc) |
484 		FIELD_PREP(OP_LDF_B_SRC, breg) |
485 		FIELD_PREP(OP_LDF_I8, imm8) |
486 		FIELD_PREP(OP_LDF_SW, swap) |
487 		FIELD_PREP(OP_LDF_ZF, zero) |
488 		FIELD_PREP(OP_LDF_BMASK, bmask) |
489 		FIELD_PREP(OP_LDF_SHF, shift) |
490 		FIELD_PREP(OP_LDF_WR_AB, wr_both) |
491 		FIELD_PREP(OP_LDF_SRC_LMEXTN, src_lmextn) |
492 		FIELD_PREP(OP_LDF_DST_LMEXTN, dst_lmextn);
493 
494 	nfp_prog_push(nfp_prog, insn);
495 }
496 
497 static void
498 emit_ld_field_any(struct nfp_prog *nfp_prog, swreg dst, u8 bmask, swreg src,
499 		  enum shf_sc sc, u8 shift, bool zero)
500 {
501 	struct nfp_insn_re_regs reg;
502 	int err;
503 
504 	/* Note: ld_field is special as it uses one of the src regs as dst */
505 	err = swreg_to_restricted(dst, dst, src, &reg, true);
506 	if (err) {
507 		nfp_prog->error = err;
508 		return;
509 	}
510 
511 	__emit_ld_field(nfp_prog, sc, reg.areg, bmask, reg.breg, shift,
512 			reg.i8, zero, reg.swap, reg.wr_both,
513 			reg.dst_lmextn, reg.src_lmextn);
514 }
515 
516 static void
517 emit_ld_field(struct nfp_prog *nfp_prog, swreg dst, u8 bmask, swreg src,
518 	      enum shf_sc sc, u8 shift)
519 {
520 	emit_ld_field_any(nfp_prog, dst, bmask, src, sc, shift, false);
521 }
522 
523 static void
524 __emit_lcsr(struct nfp_prog *nfp_prog, u16 areg, u16 breg, bool wr, u16 addr,
525 	    bool dst_lmextn, bool src_lmextn)
526 {
527 	u64 insn;
528 
529 	insn = OP_LCSR_BASE |
530 		FIELD_PREP(OP_LCSR_A_SRC, areg) |
531 		FIELD_PREP(OP_LCSR_B_SRC, breg) |
532 		FIELD_PREP(OP_LCSR_WRITE, wr) |
533 		FIELD_PREP(OP_LCSR_ADDR, addr / 4) |
534 		FIELD_PREP(OP_LCSR_SRC_LMEXTN, src_lmextn) |
535 		FIELD_PREP(OP_LCSR_DST_LMEXTN, dst_lmextn);
536 
537 	nfp_prog_push(nfp_prog, insn);
538 }
539 
540 static void emit_csr_wr(struct nfp_prog *nfp_prog, swreg src, u16 addr)
541 {
542 	struct nfp_insn_ur_regs reg;
543 	int err;
544 
545 	/* This instruction takes immeds instead of reg_none() for the ignored
546 	 * operand, but we can't encode 2 immeds in one instr with our normal
547 	 * swreg infra so if param is an immed, we encode as reg_none() and
548 	 * copy the immed to both operands.
549 	 */
550 	if (swreg_type(src) == NN_REG_IMM) {
551 		err = swreg_to_unrestricted(reg_none(), src, reg_none(), &reg);
552 		reg.breg = reg.areg;
553 	} else {
554 		err = swreg_to_unrestricted(reg_none(), src, reg_imm(0), &reg);
555 	}
556 	if (err) {
557 		nfp_prog->error = err;
558 		return;
559 	}
560 
561 	__emit_lcsr(nfp_prog, reg.areg, reg.breg, true, addr,
562 		    false, reg.src_lmextn);
563 }
564 
565 /* CSR value is read in the following immed[gpr, 0] */
566 static void __emit_csr_rd(struct nfp_prog *nfp_prog, u16 addr)
567 {
568 	__emit_lcsr(nfp_prog, 0, 0, false, addr, false, false);
569 }
570 
571 static void emit_nop(struct nfp_prog *nfp_prog)
572 {
573 	__emit_immed(nfp_prog, UR_REG_IMM, UR_REG_IMM, 0, 0, 0, 0, 0, 0, 0);
574 }
575 
576 /* --- Wrappers --- */
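/* pack_immed() - check if @imm can be encoded as a 16-bit value shifted left
 * by 0, 1 or 2 bytes and, if so, return the packed value and shift.
 * E.g. 0x00abcd00 packs as *val = 0xabcd with *shift = IMMED_SHIFT_1B.
 */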
577 static bool pack_immed(u32 imm, u16 *val, enum immed_shift *shift)
578 {
579 	if (!(imm & 0xffff0000)) {
580 		*val = imm;
581 		*shift = IMMED_SHIFT_0B;
582 	} else if (!(imm & 0xff0000ff)) {
583 		*val = imm >> 8;
584 		*shift = IMMED_SHIFT_1B;
585 	} else if (!(imm & 0x0000ffff)) {
586 		*val = imm >> 16;
587 		*shift = IMMED_SHIFT_2B;
588 	} else {
589 		return false;
590 	}
591 
592 	return true;
593 }
594 
595 static void wrp_immed(struct nfp_prog *nfp_prog, swreg dst, u32 imm)
596 {
597 	enum immed_shift shift;
598 	u16 val;
599 
600 	if (pack_immed(imm, &val, &shift)) {
601 		emit_immed(nfp_prog, dst, val, IMMED_WIDTH_ALL, false, shift);
602 	} else if (pack_immed(~imm, &val, &shift)) {
603 		emit_immed(nfp_prog, dst, val, IMMED_WIDTH_ALL, true, shift);
604 	} else {
605 		emit_immed(nfp_prog, dst, imm & 0xffff, IMMED_WIDTH_ALL,
606 			   false, IMMED_SHIFT_0B);
607 		emit_immed(nfp_prog, dst, imm >> 16, IMMED_WIDTH_WORD,
608 			   false, IMMED_SHIFT_2B);
609 	}
610 }
611 
612 static void
613 wrp_immed_relo(struct nfp_prog *nfp_prog, swreg dst, u32 imm,
614 	       enum nfp_relo_type relo)
615 {
616 	if (imm > 0xffff) {
617 		pr_err("relocation of a large immediate!\n");
618 		nfp_prog->error = -EFAULT;
619 		return;
620 	}
621 	emit_immed(nfp_prog, dst, imm, IMMED_WIDTH_ALL, false, IMMED_SHIFT_0B);
622 
623 	nfp_prog->prog[nfp_prog->prog_len - 1] |=
624 		FIELD_PREP(OP_RELO_TYPE, relo);
625 }
626 
627 /* ur_load_imm_any() - encode immediate or use tmp register (unrestricted)
628  * If @imm is small enough, encode it directly in the operand and return it,
629  * otherwise load @imm into a spare register and return its encoding.
630  */
631 static swreg ur_load_imm_any(struct nfp_prog *nfp_prog, u32 imm, swreg tmp_reg)
632 {
633 	if (FIELD_FIT(UR_REG_IMM_MAX, imm))
634 		return reg_imm(imm);
635 
636 	wrp_immed(nfp_prog, tmp_reg, imm);
637 	return tmp_reg;
638 }
639 
640 /* re_load_imm_any() - encode immediate or use tmp register (restricted)
641  * If @imm is small enough, encode it directly in the operand and return it,
642  * otherwise load @imm into a spare register and return its encoding.
643  */
644 static swreg re_load_imm_any(struct nfp_prog *nfp_prog, u32 imm, swreg tmp_reg)
645 {
646 	if (FIELD_FIT(RE_REG_IMM_MAX, imm))
647 		return reg_imm(imm);
648 
649 	wrp_immed(nfp_prog, tmp_reg, imm);
650 	return tmp_reg;
651 }
652 
653 static void wrp_nops(struct nfp_prog *nfp_prog, unsigned int count)
654 {
655 	while (count--)
656 		emit_nop(nfp_prog);
657 }
658 
659 static void wrp_mov(struct nfp_prog *nfp_prog, swreg dst, swreg src)
660 {
661 	emit_alu(nfp_prog, dst, reg_none(), ALU_OP_NONE, src);
662 }
663 
664 static void wrp_reg_mov(struct nfp_prog *nfp_prog, u16 dst, u16 src)
665 {
666 	wrp_mov(nfp_prog, reg_both(dst), reg_b(src));
667 }
668 
669 /* wrp_reg_subpart() - load @field_len bytes from @offset of @src and write
670  * the result to the low end of @dst.
671  */
672 static void
673 wrp_reg_subpart(struct nfp_prog *nfp_prog, swreg dst, swreg src, u8 field_len,
674 		u8 offset)
675 {
676 	enum shf_sc sc = offset ? SHF_SC_R_SHF : SHF_SC_NONE;
677 	u8 mask = (1 << field_len) - 1;
678 
679 	emit_ld_field_any(nfp_prog, dst, mask, src, sc, offset * 8, true);
680 }
681 
682 /* wrp_reg_or_subpart() - load @field_len bytes from the low end of @src and
683  * OR the result into @dst at @offset; the other bits of @dst are unchanged.
684  */
685 static void
686 wrp_reg_or_subpart(struct nfp_prog *nfp_prog, swreg dst, swreg src,
687 		   u8 field_len, u8 offset)
688 {
689 	enum shf_sc sc = offset ? SHF_SC_L_SHF : SHF_SC_NONE;
690 	u8 mask = ((1 << field_len) - 1) << offset;
691 
692 	emit_ld_field(nfp_prog, dst, mask, src, sc, 32 - offset * 8);
693 }
694 
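/* addr40_offset() - compute the A/B operands for a 40-bit addressed access.
 * The base address lives in the GPR pair @src_gpr (low word) and
 * @src_gpr + 1 (high word).  A zero @offset uses the pair directly, otherwise
 * the offset is added with carry propagation using imm_a()/imm_b() as
 * scratch and those registers are returned instead.
 */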
695 static void
696 addr40_offset(struct nfp_prog *nfp_prog, u8 src_gpr, swreg offset,
697 	      swreg *rega, swreg *regb)
698 {
699 	if (offset == reg_imm(0)) {
700 		*rega = reg_a(src_gpr);
701 		*regb = reg_b(src_gpr + 1);
702 		return;
703 	}
704 
705 	emit_alu(nfp_prog, imm_a(nfp_prog), reg_a(src_gpr), ALU_OP_ADD, offset);
706 	emit_alu(nfp_prog, imm_b(nfp_prog), reg_b(src_gpr + 1), ALU_OP_ADD_C,
707 		 reg_imm(0));
708 	*rega = imm_a(nfp_prog);
709 	*regb = imm_b(nfp_prog);
710 }
711 
712 /* The NFP has a Command Push Pull (CPP) bus which supports bulk memory operations. */
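/* The copy below reads the source into the transfer registers (using an
 * indirect reference to override the length when more than 32 bytes are
 * gathered), copies transfer-in to transfer-out, and then writes the data
 * back out with the narrowest command sequence the length and alignment
 * allow.
 */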
713 static int nfp_cpp_memcpy(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
714 {
715 	bool descending_seq = meta->ldst_gather_len < 0;
716 	s16 len = abs(meta->ldst_gather_len);
717 	swreg src_base, off;
718 	bool src_40bit_addr;
719 	unsigned int i;
720 	u8 xfer_num;
721 
722 	off = re_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog));
723 	src_40bit_addr = meta->ptr.type == PTR_TO_MAP_VALUE;
724 	src_base = reg_a(meta->insn.src_reg * 2);
725 	xfer_num = round_up(len, 4) / 4;
726 
727 	if (src_40bit_addr)
728 		addr40_offset(nfp_prog, meta->insn.src_reg * 2, off, &src_base,
729 			      &off);
730 
731 	/* Set up the PREV_ALU fields to override the memory read length. */
732 	if (len > 32)
733 		wrp_immed(nfp_prog, reg_none(),
734 			  CMD_OVE_LEN | FIELD_PREP(CMD_OV_LEN, xfer_num - 1));
735 
736 	/* Memory read from source addr into transfer-in registers. */
737 	emit_cmd_any(nfp_prog, CMD_TGT_READ32_SWAP,
738 		     src_40bit_addr ? CMD_MODE_40b_BA : CMD_MODE_32b, 0,
739 		     src_base, off, xfer_num - 1, CMD_CTX_SWAP, len > 32);
740 
741 	/* Move from transfer-in to transfer-out. */
742 	for (i = 0; i < xfer_num; i++)
743 		wrp_mov(nfp_prog, reg_xfer(i), reg_xfer(i));
744 
745 	off = re_load_imm_any(nfp_prog, meta->paired_st->off, imm_b(nfp_prog));
746 
747 	if (len <= 8) {
748 		/* Use single direct_ref write8. */
749 		emit_cmd(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 0,
750 			 reg_a(meta->paired_st->dst_reg * 2), off, len - 1,
751 			 CMD_CTX_SWAP);
752 	} else if (len <= 32 && IS_ALIGNED(len, 4)) {
753 		/* Use single direct_ref write32. */
754 		emit_cmd(nfp_prog, CMD_TGT_WRITE32_SWAP, CMD_MODE_32b, 0,
755 			 reg_a(meta->paired_st->dst_reg * 2), off, xfer_num - 1,
756 			 CMD_CTX_SWAP);
757 	} else if (len <= 32) {
758 		/* Use single indirect_ref write8. */
759 		wrp_immed(nfp_prog, reg_none(),
760 			  CMD_OVE_LEN | FIELD_PREP(CMD_OV_LEN, len - 1));
761 		emit_cmd_indir(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 0,
762 			       reg_a(meta->paired_st->dst_reg * 2), off,
763 			       len - 1, CMD_CTX_SWAP);
764 	} else if (IS_ALIGNED(len, 4)) {
765 		/* Use single indirect_ref write32. */
766 		wrp_immed(nfp_prog, reg_none(),
767 			  CMD_OVE_LEN | FIELD_PREP(CMD_OV_LEN, xfer_num - 1));
768 		emit_cmd_indir(nfp_prog, CMD_TGT_WRITE32_SWAP, CMD_MODE_32b, 0,
769 			       reg_a(meta->paired_st->dst_reg * 2), off,
770 			       xfer_num - 1, CMD_CTX_SWAP);
771 	} else if (len <= 40) {
772 		/* Use one direct_ref write32 to write the first 32 bytes, then
773 		 * another direct_ref write8 to write the remaining bytes.
774 		 */
775 		emit_cmd(nfp_prog, CMD_TGT_WRITE32_SWAP, CMD_MODE_32b, 0,
776 			 reg_a(meta->paired_st->dst_reg * 2), off, 7,
777 			 CMD_CTX_SWAP);
778 
779 		off = re_load_imm_any(nfp_prog, meta->paired_st->off + 32,
780 				      imm_b(nfp_prog));
781 		emit_cmd(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 8,
782 			 reg_a(meta->paired_st->dst_reg * 2), off, len - 33,
783 			 CMD_CTX_SWAP);
784 	} else {
785 		/* Use one indirect_ref write32 to write the 4-byte aligned part of
786 		 * the length, then another direct_ref write8 to write the remaining bytes.
787 		 */
788 		u8 new_off;
789 
790 		wrp_immed(nfp_prog, reg_none(),
791 			  CMD_OVE_LEN | FIELD_PREP(CMD_OV_LEN, xfer_num - 2));
792 		emit_cmd_indir(nfp_prog, CMD_TGT_WRITE32_SWAP, CMD_MODE_32b, 0,
793 			       reg_a(meta->paired_st->dst_reg * 2), off,
794 			       xfer_num - 2, CMD_CTX_SWAP);
795 		new_off = meta->paired_st->off + (xfer_num - 1) * 4;
796 		off = re_load_imm_any(nfp_prog, new_off, imm_b(nfp_prog));
797 		emit_cmd(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b,
798 			 xfer_num - 1, reg_a(meta->paired_st->dst_reg * 2), off,
799 			 (len & 0x3) - 1, CMD_CTX_SWAP);
800 	}
801 
802 	/* TODO: The following extra load is to make sure the data flow is identical
803 	 *  before and after the memory copy optimization.
804 	 *
805 	 *  The load destination register is not guaranteed to be dead, so we
806 	 *  need to make sure it is loaded with the value the same as before
807 	 *  this transformation.
808 	 *
809 	 *  These extra loads could be removed once we have accurate register
810 	 *  usage information.
811 	 */
812 	if (descending_seq)
813 		xfer_num = 0;
814 	else if (BPF_SIZE(meta->insn.code) != BPF_DW)
815 		xfer_num = xfer_num - 1;
816 	else
817 		xfer_num = xfer_num - 2;
818 
819 	switch (BPF_SIZE(meta->insn.code)) {
820 	case BPF_B:
821 		wrp_reg_subpart(nfp_prog, reg_both(meta->insn.dst_reg * 2),
822 				reg_xfer(xfer_num), 1,
823 				IS_ALIGNED(len, 4) ? 3 : (len & 3) - 1);
824 		break;
825 	case BPF_H:
826 		wrp_reg_subpart(nfp_prog, reg_both(meta->insn.dst_reg * 2),
827 				reg_xfer(xfer_num), 2, (len & 3) ^ 2);
828 		break;
829 	case BPF_W:
830 		wrp_mov(nfp_prog, reg_both(meta->insn.dst_reg * 2),
831 			reg_xfer(0));
832 		break;
833 	case BPF_DW:
834 		wrp_mov(nfp_prog, reg_both(meta->insn.dst_reg * 2),
835 			reg_xfer(xfer_num));
836 		wrp_mov(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1),
837 			reg_xfer(xfer_num + 1));
838 		break;
839 	}
840 
841 	if (BPF_SIZE(meta->insn.code) != BPF_DW)
842 		wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1), 0);
843 
844 	return 0;
845 }
846 
847 static int
848 data_ld(struct nfp_prog *nfp_prog, swreg offset, u8 dst_gpr, int size)
849 {
850 	unsigned int i;
851 	u16 shift, sz;
852 
853 	/* We load the value from the address indicated in @offset and then
854 	 * shift out the data we don't need.  Note: this is big endian!
855 	 */
856 	sz = max(size, 4);
857 	shift = size < 4 ? 4 - size : 0;
858 
859 	emit_cmd(nfp_prog, CMD_TGT_READ8, CMD_MODE_32b, 0,
860 		 pptr_reg(nfp_prog), offset, sz - 1, CMD_CTX_SWAP);
861 
862 	i = 0;
863 	if (shift)
864 		emit_shf(nfp_prog, reg_both(dst_gpr), reg_none(), SHF_OP_NONE,
865 			 reg_xfer(0), SHF_SC_R_SHF, shift * 8);
866 	else
867 		for (; i * 4 < size; i++)
868 			wrp_mov(nfp_prog, reg_both(dst_gpr + i), reg_xfer(i));
869 
870 	if (i < 2)
871 		wrp_immed(nfp_prog, reg_both(dst_gpr + 1), 0);
872 
873 	return 0;
874 }
875 
876 static int
877 data_ld_host_order(struct nfp_prog *nfp_prog, u8 dst_gpr,
878 		   swreg lreg, swreg rreg, int size, enum cmd_mode mode)
879 {
880 	unsigned int i;
881 	u8 mask, sz;
882 
883 	/* We load the value from the address indicated in rreg + lreg and then
884 	 * mask out the data we don't need.  Note: this is little endian!
885 	 */
886 	sz = max(size, 4);
887 	mask = size < 4 ? GENMASK(size - 1, 0) : 0;
888 
889 	emit_cmd(nfp_prog, CMD_TGT_READ32_SWAP, mode, 0,
890 		 lreg, rreg, sz / 4 - 1, CMD_CTX_SWAP);
891 
892 	i = 0;
893 	if (mask)
894 		emit_ld_field_any(nfp_prog, reg_both(dst_gpr), mask,
895 				  reg_xfer(0), SHF_SC_NONE, 0, true);
896 	else
897 		for (; i * 4 < size; i++)
898 			wrp_mov(nfp_prog, reg_both(dst_gpr + i), reg_xfer(i));
899 
900 	if (i < 2)
901 		wrp_immed(nfp_prog, reg_both(dst_gpr + 1), 0);
902 
903 	return 0;
904 }
905 
906 static int
907 data_ld_host_order_addr32(struct nfp_prog *nfp_prog, u8 src_gpr, swreg offset,
908 			  u8 dst_gpr, u8 size)
909 {
910 	return data_ld_host_order(nfp_prog, dst_gpr, reg_a(src_gpr), offset,
911 				  size, CMD_MODE_32b);
912 }
913 
914 static int
915 data_ld_host_order_addr40(struct nfp_prog *nfp_prog, u8 src_gpr, swreg offset,
916 			  u8 dst_gpr, u8 size)
917 {
918 	swreg rega, regb;
919 
920 	addr40_offset(nfp_prog, src_gpr, offset, &rega, &regb);
921 
922 	return data_ld_host_order(nfp_prog, dst_gpr, rega, regb,
923 				  size, CMD_MODE_40b_BA);
924 }
925 
926 static int
927 construct_data_ind_ld(struct nfp_prog *nfp_prog, u16 offset, u16 src, u8 size)
928 {
929 	swreg tmp_reg;
930 
931 	/* Calculate the true offset (src_reg + imm) */
932 	tmp_reg = ur_load_imm_any(nfp_prog, offset, imm_b(nfp_prog));
933 	emit_alu(nfp_prog, imm_both(nfp_prog), reg_a(src), ALU_OP_ADD, tmp_reg);
934 
935 	/* Check packet length (size guaranteed to fit b/c it's u8) */
936 	emit_alu(nfp_prog, imm_a(nfp_prog),
937 		 imm_a(nfp_prog), ALU_OP_ADD, reg_imm(size));
938 	emit_alu(nfp_prog, reg_none(),
939 		 plen_reg(nfp_prog), ALU_OP_SUB, imm_a(nfp_prog));
940 	emit_br_relo(nfp_prog, BR_BLO, BR_OFF_RELO, 0, RELO_BR_GO_ABORT);
941 
942 	/* Load data */
943 	return data_ld(nfp_prog, imm_b(nfp_prog), 0, size);
944 }
945 
946 static int construct_data_ld(struct nfp_prog *nfp_prog, u16 offset, u8 size)
947 {
948 	swreg tmp_reg;
949 
950 	/* Check packet length */
951 	tmp_reg = ur_load_imm_any(nfp_prog, offset + size, imm_a(nfp_prog));
952 	emit_alu(nfp_prog, reg_none(), plen_reg(nfp_prog), ALU_OP_SUB, tmp_reg);
953 	emit_br_relo(nfp_prog, BR_BLO, BR_OFF_RELO, 0, RELO_BR_GO_ABORT);
954 
955 	/* Load data */
956 	tmp_reg = re_load_imm_any(nfp_prog, offset, imm_b(nfp_prog));
957 	return data_ld(nfp_prog, tmp_reg, 0, size);
958 }
959 
960 static int
961 data_stx_host_order(struct nfp_prog *nfp_prog, u8 dst_gpr, swreg offset,
962 		    u8 src_gpr, u8 size)
963 {
964 	unsigned int i;
965 
966 	for (i = 0; i * 4 < size; i++)
967 		wrp_mov(nfp_prog, reg_xfer(i), reg_a(src_gpr + i));
968 
969 	emit_cmd(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 0,
970 		 reg_a(dst_gpr), offset, size - 1, CMD_CTX_SWAP);
971 
972 	return 0;
973 }
974 
975 static int
976 data_st_host_order(struct nfp_prog *nfp_prog, u8 dst_gpr, swreg offset,
977 		   u64 imm, u8 size)
978 {
979 	wrp_immed(nfp_prog, reg_xfer(0), imm);
980 	if (size == 8)
981 		wrp_immed(nfp_prog, reg_xfer(1), imm >> 32);
982 
983 	emit_cmd(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 0,
984 		 reg_a(dst_gpr), offset, size - 1, CMD_CTX_SWAP);
985 
986 	return 0;
987 }
988 
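/* lmem_step - callback invoked by mem_op_stack() for each slice of a stack
 * access.  A slice covers at most 4 bytes and never crosses a 4-byte LMEM
 * word boundary; the flags describe whether this is the first/last slice,
 * whether a new GPR starts here, whether LM3 or LM0 addressing is in use,
 * and whether the LM pointer is post-incremented.
 */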
989 typedef int
990 (*lmem_step)(struct nfp_prog *nfp_prog, u8 gpr, u8 gpr_byte, s32 off,
991 	     unsigned int size, bool first, bool new_gpr, bool last, bool lm3,
992 	     bool needs_inc);
993 
994 static int
995 wrp_lmem_load(struct nfp_prog *nfp_prog, u8 dst, u8 dst_byte, s32 off,
996 	      unsigned int size, bool first, bool new_gpr, bool last, bool lm3,
997 	      bool needs_inc)
998 {
999 	bool should_inc = needs_inc && new_gpr && !last;
1000 	u32 idx, src_byte;
1001 	enum shf_sc sc;
1002 	swreg reg;
1003 	int shf;
1004 	u8 mask;
1005 
1006 	if (WARN_ON_ONCE(dst_byte + size > 4 || off % 4 + size > 4))
1007 		return -EOPNOTSUPP;
1008 
1009 	idx = off / 4;
1010 
1011 	/* Move the entire word */
1012 	if (size == 4) {
1013 		wrp_mov(nfp_prog, reg_both(dst),
1014 			should_inc ? reg_lm_inc(3) : reg_lm(lm3 ? 3 : 0, idx));
1015 		return 0;
1016 	}
1017 
1018 	if (WARN_ON_ONCE(lm3 && idx > RE_REG_LM_IDX_MAX))
1019 		return -EOPNOTSUPP;
1020 
1021 	src_byte = off % 4;
1022 
1023 	mask = (1 << size) - 1;
1024 	mask <<= dst_byte;
1025 
1026 	if (WARN_ON_ONCE(mask > 0xf))
1027 		return -EOPNOTSUPP;
1028 
1029 	shf = abs(src_byte - dst_byte) * 8;
1030 	if (src_byte == dst_byte) {
1031 		sc = SHF_SC_NONE;
1032 	} else if (src_byte < dst_byte) {
1033 		shf = 32 - shf;
1034 		sc = SHF_SC_L_SHF;
1035 	} else {
1036 		sc = SHF_SC_R_SHF;
1037 	}
1038 
1039 	/* ld_field can address fewer indexes; if the offset is too large do RMW.
1040 	 * Because we RMW twice we waste 2 cycles on unaligned 8 byte writes.
1041 	 */
1042 	if (idx <= RE_REG_LM_IDX_MAX) {
1043 		reg = reg_lm(lm3 ? 3 : 0, idx);
1044 	} else {
1045 		reg = imm_a(nfp_prog);
1046 		/* If it's not the first part of the load and we start a new GPR
1047 		 * that means we are loading a second part of the LMEM word into
1048 		 * a new GPR.  IOW we've already read that LMEM word and
1049 		 * therefore it has been loaded into imm_a().
1050 		 */
1051 		if (first || !new_gpr)
1052 			wrp_mov(nfp_prog, reg, reg_lm(0, idx));
1053 	}
1054 
1055 	emit_ld_field_any(nfp_prog, reg_both(dst), mask, reg, sc, shf, new_gpr);
1056 
1057 	if (should_inc)
1058 		wrp_mov(nfp_prog, reg_none(), reg_lm_inc(3));
1059 
1060 	return 0;
1061 }
1062 
1063 static int
1064 wrp_lmem_store(struct nfp_prog *nfp_prog, u8 src, u8 src_byte, s32 off,
1065 	       unsigned int size, bool first, bool new_gpr, bool last, bool lm3,
1066 	       bool needs_inc)
1067 {
1068 	bool should_inc = needs_inc && new_gpr && !last;
1069 	u32 idx, dst_byte;
1070 	enum shf_sc sc;
1071 	swreg reg;
1072 	int shf;
1073 	u8 mask;
1074 
1075 	if (WARN_ON_ONCE(src_byte + size > 4 || off % 4 + size > 4))
1076 		return -EOPNOTSUPP;
1077 
1078 	idx = off / 4;
1079 
1080 	/* Move the entire word */
1081 	if (size == 4) {
1082 		wrp_mov(nfp_prog,
1083 			should_inc ? reg_lm_inc(3) : reg_lm(lm3 ? 3 : 0, idx),
1084 			reg_b(src));
1085 		return 0;
1086 	}
1087 
1088 	if (WARN_ON_ONCE(lm3 && idx > RE_REG_LM_IDX_MAX))
1089 		return -EOPNOTSUPP;
1090 
1091 	dst_byte = off % 4;
1092 
1093 	mask = (1 << size) - 1;
1094 	mask <<= dst_byte;
1095 
1096 	if (WARN_ON_ONCE(mask > 0xf))
1097 		return -EOPNOTSUPP;
1098 
1099 	shf = abs(src_byte - dst_byte) * 8;
1100 	if (src_byte == dst_byte) {
1101 		sc = SHF_SC_NONE;
1102 	} else if (src_byte < dst_byte) {
1103 		shf = 32 - shf;
1104 		sc = SHF_SC_L_SHF;
1105 	} else {
1106 		sc = SHF_SC_R_SHF;
1107 	}
1108 
1109 	/* ld_field can address fewer indexes; if the offset is too large do RMW.
1110 	 * Because we RMW twice we waste 2 cycles on unaligned 8 byte writes.
1111 	 */
1112 	if (idx <= RE_REG_LM_IDX_MAX) {
1113 		reg = reg_lm(lm3 ? 3 : 0, idx);
1114 	} else {
1115 		reg = imm_a(nfp_prog);
1116 		/* Only first and last LMEM locations are going to need RMW,
1117 		 * the middle location will be overwritten fully.
1118 		 */
1119 		if (first || last)
1120 			wrp_mov(nfp_prog, reg, reg_lm(0, idx));
1121 	}
1122 
1123 	emit_ld_field(nfp_prog, reg, mask, reg_b(src), sc, shf);
1124 
1125 	if (new_gpr || last) {
1126 		if (idx > RE_REG_LM_IDX_MAX)
1127 			wrp_mov(nfp_prog, reg_lm(0, idx), reg);
1128 		if (should_inc)
1129 			wrp_mov(nfp_prog, reg_none(), reg_lm_inc(3));
1130 	}
1131 
1132 	return 0;
1133 }
1134 
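/* mem_op_stack() - split a stack access into slices of at most 4 bytes and
 * apply @step to each.  Accesses within the bottom 64 bytes go through
 * LMaddr0 directly; otherwise LMaddr3 is set up, either to a fixed 32-byte
 * aligned window or, for pointers which are not constant and for accesses
 * crossing such a window, with post-increment addressing.
 */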
1135 static int
1136 mem_op_stack(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
1137 	     unsigned int size, unsigned int ptr_off, u8 gpr, u8 ptr_gpr,
1138 	     bool clr_gpr, lmem_step step)
1139 {
1140 	s32 off = nfp_prog->stack_depth + meta->insn.off + ptr_off;
1141 	bool first = true, last;
1142 	bool needs_inc = false;
1143 	swreg stack_off_reg;
1144 	u8 prev_gpr = 255;
1145 	u32 gpr_byte = 0;
1146 	bool lm3 = true;
1147 	int ret;
1148 
1149 	if (meta->ptr_not_const) {
1150 		/* Use of the last encountered ptr_off is OK, they all have
1151 		 * the same alignment.  We depend on the low bits of the value
1152 		 * being discarded when written to the LMaddr register.
1153 		 */
1154 		stack_off_reg = ur_load_imm_any(nfp_prog, meta->insn.off,
1155 						stack_imm(nfp_prog));
1156 
1157 		emit_alu(nfp_prog, imm_b(nfp_prog),
1158 			 reg_a(ptr_gpr), ALU_OP_ADD, stack_off_reg);
1159 
1160 		needs_inc = true;
1161 	} else if (off + size <= 64) {
1162 		/* We can reach bottom 64B with LMaddr0 */
1163 		lm3 = false;
1164 	} else if (round_down(off, 32) == round_down(off + size - 1, 32)) {
1165 		/* We have to set up a new pointer.  If we know the offset
1166 		 * and the entire access falls into a single 32 byte aligned
1167 		 * window we won't have to increment the LM pointer.
1168 		 * The 32 byte alignment is important because the offset is ORed in,
1169 		 * not added, when doing *l$indexN[off].
1170 		 */
1171 		stack_off_reg = ur_load_imm_any(nfp_prog, round_down(off, 32),
1172 						stack_imm(nfp_prog));
1173 		emit_alu(nfp_prog, imm_b(nfp_prog),
1174 			 stack_reg(nfp_prog), ALU_OP_ADD, stack_off_reg);
1175 
1176 		off %= 32;
1177 	} else {
1178 		stack_off_reg = ur_load_imm_any(nfp_prog, round_down(off, 4),
1179 						stack_imm(nfp_prog));
1180 
1181 		emit_alu(nfp_prog, imm_b(nfp_prog),
1182 			 stack_reg(nfp_prog), ALU_OP_ADD, stack_off_reg);
1183 
1184 		needs_inc = true;
1185 	}
1186 	if (lm3) {
1187 		emit_csr_wr(nfp_prog, imm_b(nfp_prog), NFP_CSR_ACT_LM_ADDR3);
1188 		/* For size < 4 one slot will be filled by zeroing of the upper half. */
1189 		wrp_nops(nfp_prog, clr_gpr && size < 8 ? 2 : 3);
1190 	}
1191 
1192 	if (clr_gpr && size < 8)
1193 		wrp_immed(nfp_prog, reg_both(gpr + 1), 0);
1194 
1195 	while (size) {
1196 		u32 slice_end;
1197 		u8 slice_size;
1198 
1199 		slice_size = min(size, 4 - gpr_byte);
1200 		slice_end = min(off + slice_size, round_up(off + 1, 4));
1201 		slice_size = slice_end - off;
1202 
1203 		last = slice_size == size;
1204 
1205 		if (needs_inc)
1206 			off %= 4;
1207 
1208 		ret = step(nfp_prog, gpr, gpr_byte, off, slice_size,
1209 			   first, gpr != prev_gpr, last, lm3, needs_inc);
1210 		if (ret)
1211 			return ret;
1212 
1213 		prev_gpr = gpr;
1214 		first = false;
1215 
1216 		gpr_byte += slice_size;
1217 		if (gpr_byte >= 4) {
1218 			gpr_byte -= 4;
1219 			gpr++;
1220 		}
1221 
1222 		size -= slice_size;
1223 		off += slice_size;
1224 	}
1225 
1226 	return 0;
1227 }
1228 
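/* wrp_alu_imm() - 32-bit ALU op with an immediate operand.  Trivial
 * immediates are special cased: AND 0 loads zero, OR ~0 loads all-ones,
 * XOR ~0 becomes a NOT, and no-op immediates (AND ~0, OR 0, XOR 0) emit
 * nothing at all.
 */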
1229 static void
1230 wrp_alu_imm(struct nfp_prog *nfp_prog, u8 dst, enum alu_op alu_op, u32 imm)
1231 {
1232 	swreg tmp_reg;
1233 
1234 	if (alu_op == ALU_OP_AND) {
1235 		if (!imm)
1236 			wrp_immed(nfp_prog, reg_both(dst), 0);
1237 		if (!imm || !~imm)
1238 			return;
1239 	}
1240 	if (alu_op == ALU_OP_OR) {
1241 		if (!~imm)
1242 			wrp_immed(nfp_prog, reg_both(dst), ~0U);
1243 		if (!imm || !~imm)
1244 			return;
1245 	}
1246 	if (alu_op == ALU_OP_XOR) {
1247 		if (!~imm)
1248 			emit_alu(nfp_prog, reg_both(dst), reg_none(),
1249 				 ALU_OP_NOT, reg_b(dst));
1250 		if (!imm || !~imm)
1251 			return;
1252 	}
1253 
1254 	tmp_reg = ur_load_imm_any(nfp_prog, imm, imm_b(nfp_prog));
1255 	emit_alu(nfp_prog, reg_both(dst), reg_a(dst), alu_op, tmp_reg);
1256 }
1257 
1258 static int
1259 wrp_alu64_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
1260 	      enum alu_op alu_op, bool skip)
1261 {
1262 	const struct bpf_insn *insn = &meta->insn;
1263 	u64 imm = insn->imm; /* sign extend */
1264 
1265 	if (skip) {
1266 		meta->skip = true;
1267 		return 0;
1268 	}
1269 
1270 	wrp_alu_imm(nfp_prog, insn->dst_reg * 2, alu_op, imm & ~0U);
1271 	wrp_alu_imm(nfp_prog, insn->dst_reg * 2 + 1, alu_op, imm >> 32);
1272 
1273 	return 0;
1274 }
1275 
1276 static int
1277 wrp_alu64_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
1278 	      enum alu_op alu_op)
1279 {
1280 	u8 dst = meta->insn.dst_reg * 2, src = meta->insn.src_reg * 2;
1281 
1282 	emit_alu(nfp_prog, reg_both(dst), reg_a(dst), alu_op, reg_b(src));
1283 	emit_alu(nfp_prog, reg_both(dst + 1),
1284 		 reg_a(dst + 1), alu_op, reg_b(src + 1));
1285 
1286 	return 0;
1287 }
1288 
1289 static int
1290 wrp_alu32_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
1291 	      enum alu_op alu_op, bool skip)
1292 {
1293 	const struct bpf_insn *insn = &meta->insn;
1294 
1295 	if (skip) {
1296 		meta->skip = true;
1297 		return 0;
1298 	}
1299 
1300 	wrp_alu_imm(nfp_prog, insn->dst_reg * 2, alu_op, insn->imm);
1301 	wrp_immed(nfp_prog, reg_both(insn->dst_reg * 2 + 1), 0);
1302 
1303 	return 0;
1304 }
1305 
1306 static int
1307 wrp_alu32_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
1308 	      enum alu_op alu_op)
1309 {
1310 	u8 dst = meta->insn.dst_reg * 2, src = meta->insn.src_reg * 2;
1311 
1312 	emit_alu(nfp_prog, reg_both(dst), reg_a(dst), alu_op, reg_b(src));
1313 	wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1), 0);
1314 
1315 	return 0;
1316 }
1317 
1318 static void
1319 wrp_test_reg_one(struct nfp_prog *nfp_prog, u8 dst, enum alu_op alu_op, u8 src,
1320 		 enum br_mask br_mask, u16 off)
1321 {
1322 	emit_alu(nfp_prog, reg_none(), reg_a(dst), alu_op, reg_b(src));
1323 	emit_br(nfp_prog, br_mask, off, 0);
1324 }
1325 
1326 static int
1327 wrp_test_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
1328 	     enum alu_op alu_op, enum br_mask br_mask)
1329 {
1330 	const struct bpf_insn *insn = &meta->insn;
1331 
1332 	wrp_test_reg_one(nfp_prog, insn->dst_reg * 2, alu_op,
1333 			 insn->src_reg * 2, br_mask, insn->off);
1334 	wrp_test_reg_one(nfp_prog, insn->dst_reg * 2 + 1, alu_op,
1335 			 insn->src_reg * 2 + 1, br_mask, insn->off);
1336 
1337 	return 0;
1338 }
1339 
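/* Map the BPF conditional jump opcode (BPF_OP >> 4) to the NFP branch mask
 * to use after the subtract-based comparison.  @swap means the operands
 * have to be swapped so the mirrored condition can reuse the same mask,
 * e.g. JGT is implemented as JLT with swapped operands.
 */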
1340 static const struct jmp_code_map {
1341 	enum br_mask br_mask;
1342 	bool swap;
1343 } jmp_code_map[] = {
1344 	[BPF_JGT >> 4]	= { BR_BLO, true },
1345 	[BPF_JGE >> 4]	= { BR_BHS, false },
1346 	[BPF_JLT >> 4]	= { BR_BLO, false },
1347 	[BPF_JLE >> 4]	= { BR_BHS, true },
1348 	[BPF_JSGT >> 4]	= { BR_BLT, true },
1349 	[BPF_JSGE >> 4]	= { BR_BGE, false },
1350 	[BPF_JSLT >> 4]	= { BR_BLT, false },
1351 	[BPF_JSLE >> 4]	= { BR_BGE, true },
1352 };
1353 
1354 static const struct jmp_code_map *nfp_jmp_code_get(struct nfp_insn_meta *meta)
1355 {
1356 	unsigned int op;
1357 
1358 	op = BPF_OP(meta->insn.code) >> 4;
1359 	/* br_mask of 0 is BR_BEQ which we don't use in jump code table */
1360 	if (WARN_ONCE(op >= ARRAY_SIZE(jmp_code_map) ||
1361 		      !jmp_code_map[op].br_mask,
1362 		      "no code found for jump instruction"))
1363 		return NULL;
1364 
1365 	return &jmp_code_map[op];
1366 }
1367 
1368 static int cmp_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1369 {
1370 	const struct bpf_insn *insn = &meta->insn;
1371 	u64 imm = insn->imm; /* sign extend */
1372 	const struct jmp_code_map *code;
1373 	enum alu_op alu_op, carry_op;
1374 	u8 reg = insn->dst_reg * 2;
1375 	swreg tmp_reg;
1376 
1377 	code = nfp_jmp_code_get(meta);
1378 	if (!code)
1379 		return -EINVAL;
1380 
1381 	alu_op = meta->jump_neg_op ? ALU_OP_ADD : ALU_OP_SUB;
1382 	carry_op = meta->jump_neg_op ? ALU_OP_ADD_C : ALU_OP_SUB_C;
1383 
1384 	tmp_reg = ur_load_imm_any(nfp_prog, imm & ~0U, imm_b(nfp_prog));
1385 	if (!code->swap)
1386 		emit_alu(nfp_prog, reg_none(), reg_a(reg), alu_op, tmp_reg);
1387 	else
1388 		emit_alu(nfp_prog, reg_none(), tmp_reg, alu_op, reg_a(reg));
1389 
1390 	tmp_reg = ur_load_imm_any(nfp_prog, imm >> 32, imm_b(nfp_prog));
1391 	if (!code->swap)
1392 		emit_alu(nfp_prog, reg_none(),
1393 			 reg_a(reg + 1), carry_op, tmp_reg);
1394 	else
1395 		emit_alu(nfp_prog, reg_none(),
1396 			 tmp_reg, carry_op, reg_a(reg + 1));
1397 
1398 	emit_br(nfp_prog, code->br_mask, insn->off, 0);
1399 
1400 	return 0;
1401 }
1402 
1403 static int cmp_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1404 {
1405 	const struct bpf_insn *insn = &meta->insn;
1406 	const struct jmp_code_map *code;
1407 	u8 areg, breg;
1408 
1409 	code = nfp_jmp_code_get(meta);
1410 	if (!code)
1411 		return -EINVAL;
1412 
1413 	areg = insn->dst_reg * 2;
1414 	breg = insn->src_reg * 2;
1415 
1416 	if (code->swap) {
1417 		areg ^= breg;
1418 		breg ^= areg;
1419 		areg ^= breg;
1420 	}
1421 
1422 	emit_alu(nfp_prog, reg_none(), reg_a(areg), ALU_OP_SUB, reg_b(breg));
1423 	emit_alu(nfp_prog, reg_none(),
1424 		 reg_a(areg + 1), ALU_OP_SUB_C, reg_b(breg + 1));
1425 	emit_br(nfp_prog, code->br_mask, insn->off, 0);
1426 
1427 	return 0;
1428 }
1429 
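/* wrp_end32() - byte swap the 32-bit value in @reg_in into @gpr_out using
 * two masked rotate ld_field operations.
 */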
1430 static void wrp_end32(struct nfp_prog *nfp_prog, swreg reg_in, u8 gpr_out)
1431 {
1432 	emit_ld_field(nfp_prog, reg_both(gpr_out), 0xf, reg_in,
1433 		      SHF_SC_R_ROT, 8);
1434 	emit_ld_field(nfp_prog, reg_both(gpr_out), 0x5, reg_a(gpr_out),
1435 		      SHF_SC_R_ROT, 16);
1436 }
1437 
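/* wrp_mul_u32() - full 32x32 multiplication using the multi-step multiplier.
 * @dst_lo receives the low 32 bits of the product; @dst_hi receives the high
 * 32 bits when @gen_high_half is set and is zeroed otherwise.
 */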
1438 static void
1439 wrp_mul_u32(struct nfp_prog *nfp_prog, swreg dst_hi, swreg dst_lo, swreg lreg,
1440 	    swreg rreg, bool gen_high_half)
1441 {
1442 	emit_mul(nfp_prog, lreg, MUL_TYPE_START, MUL_STEP_NONE, rreg);
1443 	emit_mul(nfp_prog, lreg, MUL_TYPE_STEP_32x32, MUL_STEP_1, rreg);
1444 	emit_mul(nfp_prog, lreg, MUL_TYPE_STEP_32x32, MUL_STEP_2, rreg);
1445 	emit_mul(nfp_prog, lreg, MUL_TYPE_STEP_32x32, MUL_STEP_3, rreg);
1446 	emit_mul(nfp_prog, lreg, MUL_TYPE_STEP_32x32, MUL_STEP_4, rreg);
1447 	emit_mul(nfp_prog, dst_lo, MUL_TYPE_STEP_32x32, MUL_LAST, reg_none());
1448 	if (gen_high_half)
1449 		emit_mul(nfp_prog, dst_hi, MUL_TYPE_STEP_32x32, MUL_LAST_2,
1450 			 reg_none());
1451 	else
1452 		wrp_immed(nfp_prog, dst_hi, 0);
1453 }
1454 
1455 static void
1456 wrp_mul_u16(struct nfp_prog *nfp_prog, swreg dst_hi, swreg dst_lo, swreg lreg,
1457 	    swreg rreg)
1458 {
1459 	emit_mul(nfp_prog, lreg, MUL_TYPE_START, MUL_STEP_NONE, rreg);
1460 	emit_mul(nfp_prog, lreg, MUL_TYPE_STEP_16x16, MUL_STEP_1, rreg);
1461 	emit_mul(nfp_prog, lreg, MUL_TYPE_STEP_16x16, MUL_STEP_2, rreg);
1462 	emit_mul(nfp_prog, dst_lo, MUL_TYPE_STEP_16x16, MUL_LAST, reg_none());
1463 }
1464 
1465 static int
1466 wrp_mul(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
1467 	bool gen_high_half, bool ropnd_from_reg)
1468 {
1469 	swreg multiplier, multiplicand, dst_hi, dst_lo;
1470 	const struct bpf_insn *insn = &meta->insn;
1471 	u32 lopnd_max, ropnd_max;
1472 	u8 dst_reg;
1473 
1474 	dst_reg = insn->dst_reg;
1475 	multiplicand = reg_a(dst_reg * 2);
1476 	dst_hi = reg_both(dst_reg * 2 + 1);
1477 	dst_lo = reg_both(dst_reg * 2);
1478 	lopnd_max = meta->umax_dst;
1479 	if (ropnd_from_reg) {
1480 		multiplier = reg_b(insn->src_reg * 2);
1481 		ropnd_max = meta->umax_src;
1482 	} else {
1483 		u32 imm = insn->imm;
1484 
1485 		multiplier = ur_load_imm_any(nfp_prog, imm, imm_b(nfp_prog));
1486 		ropnd_max = imm;
1487 	}
1488 	if (lopnd_max > U16_MAX || ropnd_max > U16_MAX)
1489 		wrp_mul_u32(nfp_prog, dst_hi, dst_lo, multiplicand, multiplier,
1490 			    gen_high_half);
1491 	else
1492 		wrp_mul_u16(nfp_prog, dst_hi, dst_lo, multiplicand, multiplier);
1493 
1494 	return 0;
1495 }
1496 
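/* wrp_div_imm() - unsigned divide of the 32-bit value in @dst by constant
 * @imm via reciprocal multiplication (reciprocal_value_adv()).  Divisors
 * larger than U32_MAX make the result zero, divisors above 2^31 reduce to an
 * unsigned "dst >= imm" comparison, and power-of-two divisors become a plain
 * right shift.
 */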
1497 static int wrp_div_imm(struct nfp_prog *nfp_prog, u8 dst, u64 imm)
1498 {
1499 	swreg dst_both = reg_both(dst), dst_a = reg_a(dst), dst_b = reg_a(dst);
1500 	struct reciprocal_value_adv rvalue;
1501 	u8 pre_shift, exp;
1502 	swreg magic;
1503 
1504 	if (imm > U32_MAX) {
1505 		wrp_immed(nfp_prog, dst_both, 0);
1506 		return 0;
1507 	}
1508 
1509 	/* NOTE: because we are using "reciprocal_value_adv" which doesn't
1510 	 * support "divisor > (1u << 31)", we need to JIT a separate NFP
1511 	 * sequence for that case.  The result then equals the unsigned
1512 	 * comparison "dst >= imm", which can be calculated using the
1513 	 * following NFP sequence:
1514 	 *
1515 	 *  alu[--, dst, -, imm]
1516 	 *  immed[imm, 0]
1517 	 *  alu[dst, imm, +carry, 0]
1518 	 *
1519 	 */
1520 	if (imm > 1U << 31) {
1521 		swreg tmp_b = ur_load_imm_any(nfp_prog, imm, imm_b(nfp_prog));
1522 
1523 		emit_alu(nfp_prog, reg_none(), dst_a, ALU_OP_SUB, tmp_b);
1524 		wrp_immed(nfp_prog, imm_a(nfp_prog), 0);
1525 		emit_alu(nfp_prog, dst_both, imm_a(nfp_prog), ALU_OP_ADD_C,
1526 			 reg_imm(0));
1527 		return 0;
1528 	}
1529 
1530 	rvalue = reciprocal_value_adv(imm, 32);
1531 	exp = rvalue.exp;
1532 	if (rvalue.is_wide_m && !(imm & 1)) {
1533 		pre_shift = fls(imm & -imm) - 1;
1534 		rvalue = reciprocal_value_adv(imm >> pre_shift, 32 - pre_shift);
1535 	} else {
1536 		pre_shift = 0;
1537 	}
1538 	magic = ur_load_imm_any(nfp_prog, rvalue.m, imm_b(nfp_prog));
1539 	if (imm == 1U << exp) {
1540 		emit_shf(nfp_prog, dst_both, reg_none(), SHF_OP_NONE, dst_b,
1541 			 SHF_SC_R_SHF, exp);
1542 	} else if (rvalue.is_wide_m) {
1543 		wrp_mul_u32(nfp_prog, imm_both(nfp_prog), reg_none(), dst_a,
1544 			    magic, true);
1545 		emit_alu(nfp_prog, dst_both, dst_a, ALU_OP_SUB,
1546 			 imm_b(nfp_prog));
1547 		emit_shf(nfp_prog, dst_both, reg_none(), SHF_OP_NONE, dst_b,
1548 			 SHF_SC_R_SHF, 1);
1549 		emit_alu(nfp_prog, dst_both, dst_a, ALU_OP_ADD,
1550 			 imm_b(nfp_prog));
1551 		emit_shf(nfp_prog, dst_both, reg_none(), SHF_OP_NONE, dst_b,
1552 			 SHF_SC_R_SHF, rvalue.sh - 1);
1553 	} else {
1554 		if (pre_shift)
1555 			emit_shf(nfp_prog, dst_both, reg_none(), SHF_OP_NONE,
1556 				 dst_b, SHF_SC_R_SHF, pre_shift);
1557 		wrp_mul_u32(nfp_prog, dst_both, reg_none(), dst_a, magic, true);
1558 		emit_shf(nfp_prog, dst_both, reg_none(), SHF_OP_NONE,
1559 			 dst_b, SHF_SC_R_SHF, rvalue.sh);
1560 	}
1561 
1562 	return 0;
1563 }
1564 
1565 static int adjust_head(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1566 {
1567 	swreg tmp = imm_a(nfp_prog), tmp_len = imm_b(nfp_prog);
1568 	struct nfp_bpf_cap_adjust_head *adjust_head;
1569 	u32 ret_einval, end;
1570 
1571 	adjust_head = &nfp_prog->bpf->adjust_head;
1572 
1573 	/* Optimized version - 5 vs 14 cycles */
1574 	if (nfp_prog->adjust_head_location != UINT_MAX) {
1575 		if (WARN_ON_ONCE(nfp_prog->adjust_head_location != meta->n))
1576 			return -EINVAL;
1577 
1578 		emit_alu(nfp_prog, pptr_reg(nfp_prog),
1579 			 reg_a(2 * 2), ALU_OP_ADD, pptr_reg(nfp_prog));
1580 		emit_alu(nfp_prog, plen_reg(nfp_prog),
1581 			 plen_reg(nfp_prog), ALU_OP_SUB, reg_a(2 * 2));
1582 		emit_alu(nfp_prog, pv_len(nfp_prog),
1583 			 pv_len(nfp_prog), ALU_OP_SUB, reg_a(2 * 2));
1584 
1585 		wrp_immed(nfp_prog, reg_both(0), 0);
1586 		wrp_immed(nfp_prog, reg_both(1), 0);
1587 
1588 		/* TODO: when adjust head is guaranteed to succeed we can
1589 		 * also eliminate the following if (r0 == 0) branch.
1590 		 */
1591 
1592 		return 0;
1593 	}
1594 
1595 	ret_einval = nfp_prog_current_offset(nfp_prog) + 14;
1596 	end = ret_einval + 2;
1597 
1598 	/* We need to use a temp because offset is just a part of the pkt ptr */
1599 	emit_alu(nfp_prog, tmp,
1600 		 reg_a(2 * 2), ALU_OP_ADD_2B, pptr_reg(nfp_prog));
1601 
1602 	/* Validate result will fit within FW datapath constraints */
1603 	emit_alu(nfp_prog, reg_none(),
1604 		 tmp, ALU_OP_SUB, reg_imm(adjust_head->off_min));
1605 	emit_br(nfp_prog, BR_BLO, ret_einval, 0);
1606 	emit_alu(nfp_prog, reg_none(),
1607 		 reg_imm(adjust_head->off_max), ALU_OP_SUB, tmp);
1608 	emit_br(nfp_prog, BR_BLO, ret_einval, 0);
1609 
1610 	/* Validate the length is at least ETH_HLEN */
1611 	emit_alu(nfp_prog, tmp_len,
1612 		 plen_reg(nfp_prog), ALU_OP_SUB, reg_a(2 * 2));
1613 	emit_alu(nfp_prog, reg_none(),
1614 		 tmp_len, ALU_OP_SUB, reg_imm(ETH_HLEN));
1615 	emit_br(nfp_prog, BR_BMI, ret_einval, 0);
1616 
1617 	/* Load the ret code */
1618 	wrp_immed(nfp_prog, reg_both(0), 0);
1619 	wrp_immed(nfp_prog, reg_both(1), 0);
1620 
1621 	/* Modify the packet metadata */
1622 	emit_ld_field(nfp_prog, pptr_reg(nfp_prog), 0x3, tmp, SHF_SC_NONE, 0);
1623 
1624 	/* Skip over the -EINVAL ret code (defer 2) */
1625 	emit_br(nfp_prog, BR_UNC, end, 2);
1626 
1627 	emit_alu(nfp_prog, plen_reg(nfp_prog),
1628 		 plen_reg(nfp_prog), ALU_OP_SUB, reg_a(2 * 2));
1629 	emit_alu(nfp_prog, pv_len(nfp_prog),
1630 		 pv_len(nfp_prog), ALU_OP_SUB, reg_a(2 * 2));
1631 
1632 	/* return -EINVAL target */
1633 	if (!nfp_prog_confirm_current_offset(nfp_prog, ret_einval))
1634 		return -EINVAL;
1635 
1636 	wrp_immed(nfp_prog, reg_both(0), -22);
1637 	wrp_immed(nfp_prog, reg_both(1), ~0);
1638 
1639 	if (!nfp_prog_confirm_current_offset(nfp_prog, end))
1640 		return -EINVAL;
1641 
1642 	return 0;
1643 }
1644 
1645 static int
1646 map_call_stack_common(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1647 {
1648 	bool load_lm_ptr;
1649 	u32 ret_tgt;
1650 	s64 lm_off;
1651 
1652 	/* We only have to reload LM0 if the key is not at start of stack */
1653 	lm_off = nfp_prog->stack_depth;
1654 	lm_off += meta->arg2.reg.var_off.value + meta->arg2.reg.off;
1655 	load_lm_ptr = meta->arg2.var_off || lm_off;
1656 
1657 	/* Set LM0 to start of key */
1658 	if (load_lm_ptr)
1659 		emit_csr_wr(nfp_prog, reg_b(2 * 2), NFP_CSR_ACT_LM_ADDR0);
1660 	if (meta->func_id == BPF_FUNC_map_update_elem)
1661 		emit_csr_wr(nfp_prog, reg_b(3 * 2), NFP_CSR_ACT_LM_ADDR2);
1662 
1663 	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO + meta->func_id,
1664 		     2, RELO_BR_HELPER);
1665 	ret_tgt = nfp_prog_current_offset(nfp_prog) + 2;
1666 
1667 	/* Load map ID into A0 */
1668 	wrp_mov(nfp_prog, reg_a(0), reg_a(2));
1669 
1670 	/* Load the return address into B0 */
1671 	wrp_immed_relo(nfp_prog, reg_b(0), ret_tgt, RELO_IMMED_REL);
1672 
1673 	if (!nfp_prog_confirm_current_offset(nfp_prog, ret_tgt))
1674 		return -EINVAL;
1675 
1676 	/* Reset the LM0 pointer */
1677 	if (!load_lm_ptr)
1678 		return 0;
1679 
1680 	emit_csr_wr(nfp_prog, stack_reg(nfp_prog), NFP_CSR_ACT_LM_ADDR0);
1681 	wrp_nops(nfp_prog, 3);
1682 
1683 	return 0;
1684 }
1685 
1686 static int
1687 nfp_get_prandom_u32(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1688 {
1689 	__emit_csr_rd(nfp_prog, NFP_CSR_PSEUDO_RND_NUM);
1690 	/* CSR value is read in the following immed[gpr, 0] */
1691 	emit_immed(nfp_prog, reg_both(0), 0,
1692 		   IMMED_WIDTH_ALL, false, IMMED_SHIFT_0B);
1693 	emit_immed(nfp_prog, reg_both(1), 0,
1694 		   IMMED_WIDTH_ALL, false, IMMED_SHIFT_0B);
1695 	return 0;
1696 }
1697 
1698 static int
1699 nfp_perf_event_output(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1700 {
1701 	swreg ptr_type;
1702 	u32 ret_tgt;
1703 
1704 	ptr_type = ur_load_imm_any(nfp_prog, meta->arg1.type, imm_a(nfp_prog));
1705 
1706 	ret_tgt = nfp_prog_current_offset(nfp_prog) + 3;
1707 
1708 	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO + meta->func_id,
1709 		     2, RELO_BR_HELPER);
1710 
1711 	/* Load ptr type into A1 */
1712 	wrp_mov(nfp_prog, reg_a(1), ptr_type);
1713 
1714 	/* Load the return address into B0 */
1715 	wrp_immed_relo(nfp_prog, reg_b(0), ret_tgt, RELO_IMMED_REL);
1716 
1717 	if (!nfp_prog_confirm_current_offset(nfp_prog, ret_tgt))
1718 		return -EINVAL;
1719 
1720 	return 0;
1721 }
1722 
1723 static int
1724 nfp_queue_select(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1725 {
1726 	u32 jmp_tgt;
1727 
1728 	jmp_tgt = nfp_prog_current_offset(nfp_prog) + 5;
1729 
1730 	/* Make sure the queue id fits into FW field */
1731 	emit_alu(nfp_prog, reg_none(), reg_a(meta->insn.src_reg * 2),
1732 		 ALU_OP_AND_NOT_B, reg_imm(0xff));
1733 	emit_br(nfp_prog, BR_BEQ, jmp_tgt, 2);
1734 
1735 	/* Set the 'queue selected' bit and the queue value */
1736 	emit_shf(nfp_prog, pv_qsel_set(nfp_prog),
1737 		 pv_qsel_set(nfp_prog), SHF_OP_OR, reg_imm(1),
1738 		 SHF_SC_L_SHF, PKT_VEL_QSEL_SET_BIT);
1739 	emit_ld_field(nfp_prog,
1740 		      pv_qsel_val(nfp_prog), 0x1, reg_b(meta->insn.src_reg * 2),
1741 		      SHF_SC_NONE, 0);
1742 	/* Delay slots end here; we will jump over the next instruction if the
1743 	 * queue value fits into the field.
1744 	 */
1745 	emit_ld_field(nfp_prog,
1746 		      pv_qsel_val(nfp_prog), 0x1, reg_imm(NFP_NET_RXR_MAX),
1747 		      SHF_SC_NONE, 0);
1748 
1749 	if (!nfp_prog_confirm_current_offset(nfp_prog, jmp_tgt))
1750 		return -EINVAL;
1751 
1752 	return 0;
1753 }
1754 
1755 /* --- Callbacks --- */
1756 static int mov_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1757 {
1758 	const struct bpf_insn *insn = &meta->insn;
1759 	u8 dst = insn->dst_reg * 2;
1760 	u8 src = insn->src_reg * 2;
1761 
1762 	if (insn->src_reg == BPF_REG_10) {
1763 		swreg stack_depth_reg;
1764 
1765 		stack_depth_reg = ur_load_imm_any(nfp_prog,
1766 						  nfp_prog->stack_depth,
1767 						  stack_imm(nfp_prog));
1768 		emit_alu(nfp_prog, reg_both(dst),
1769 			 stack_reg(nfp_prog), ALU_OP_ADD, stack_depth_reg);
1770 		wrp_immed(nfp_prog, reg_both(dst + 1), 0);
1771 	} else {
1772 		wrp_reg_mov(nfp_prog, dst, src);
1773 		wrp_reg_mov(nfp_prog, dst + 1, src + 1);
1774 	}
1775 
1776 	return 0;
1777 }
1778 
1779 static int mov_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1780 {
1781 	u64 imm = meta->insn.imm; /* sign extend */
1782 
1783 	wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2), imm & ~0U);
1784 	wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1), imm >> 32);
1785 
1786 	return 0;
1787 }
1788 
1789 static int xor_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1790 {
1791 	return wrp_alu64_reg(nfp_prog, meta, ALU_OP_XOR);
1792 }
1793 
1794 static int xor_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1795 {
1796 	return wrp_alu64_imm(nfp_prog, meta, ALU_OP_XOR, !meta->insn.imm);
1797 }
1798 
1799 static int and_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1800 {
1801 	return wrp_alu64_reg(nfp_prog, meta, ALU_OP_AND);
1802 }
1803 
1804 static int and_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1805 {
1806 	return wrp_alu64_imm(nfp_prog, meta, ALU_OP_AND, !~meta->insn.imm);
1807 }
1808 
1809 static int or_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1810 {
1811 	return wrp_alu64_reg(nfp_prog, meta, ALU_OP_OR);
1812 }
1813 
1814 static int or_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1815 {
1816 	return wrp_alu64_imm(nfp_prog, meta, ALU_OP_OR, !meta->insn.imm);
1817 }
1818 
1819 static int add_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1820 {
1821 	const struct bpf_insn *insn = &meta->insn;
1822 
1823 	emit_alu(nfp_prog, reg_both(insn->dst_reg * 2),
1824 		 reg_a(insn->dst_reg * 2), ALU_OP_ADD,
1825 		 reg_b(insn->src_reg * 2));
1826 	emit_alu(nfp_prog, reg_both(insn->dst_reg * 2 + 1),
1827 		 reg_a(insn->dst_reg * 2 + 1), ALU_OP_ADD_C,
1828 		 reg_b(insn->src_reg * 2 + 1));
1829 
1830 	return 0;
1831 }
1832 
1833 static int add_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1834 {
1835 	const struct bpf_insn *insn = &meta->insn;
1836 	u64 imm = insn->imm; /* sign extend */
1837 
1838 	wrp_alu_imm(nfp_prog, insn->dst_reg * 2, ALU_OP_ADD, imm & ~0U);
1839 	wrp_alu_imm(nfp_prog, insn->dst_reg * 2 + 1, ALU_OP_ADD_C, imm >> 32);
1840 
1841 	return 0;
1842 }
1843 
1844 static int sub_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1845 {
1846 	const struct bpf_insn *insn = &meta->insn;
1847 
1848 	emit_alu(nfp_prog, reg_both(insn->dst_reg * 2),
1849 		 reg_a(insn->dst_reg * 2), ALU_OP_SUB,
1850 		 reg_b(insn->src_reg * 2));
1851 	emit_alu(nfp_prog, reg_both(insn->dst_reg * 2 + 1),
1852 		 reg_a(insn->dst_reg * 2 + 1), ALU_OP_SUB_C,
1853 		 reg_b(insn->src_reg * 2 + 1));
1854 
1855 	return 0;
1856 }
1857 
1858 static int sub_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1859 {
1860 	const struct bpf_insn *insn = &meta->insn;
1861 	u64 imm = insn->imm; /* sign extend */
1862 
1863 	wrp_alu_imm(nfp_prog, insn->dst_reg * 2, ALU_OP_SUB, imm & ~0U);
1864 	wrp_alu_imm(nfp_prog, insn->dst_reg * 2 + 1, ALU_OP_SUB_C, imm >> 32);
1865 
1866 	return 0;
1867 }
1868 
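/* Multiply helpers: wrp_mul()'s first flag selects a 64-bit (ALU64) multiply,
 * which also produces the high half of the result, and its second flag selects
 * a register (rather than immediate) right operand.
 */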
1869 static int mul_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1870 {
1871 	return wrp_mul(nfp_prog, meta, true, true);
1872 }
1873 
1874 static int mul_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1875 {
1876 	return wrp_mul(nfp_prog, meta, true, false);
1877 }
1878 
1879 static int div_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1880 {
1881 	const struct bpf_insn *insn = &meta->insn;
1882 
1883 	return wrp_div_imm(nfp_prog, insn->dst_reg * 2, insn->imm);
1884 }
1885 
1886 static int div_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1887 {
	/* NOTE: the verifier hook has already rejected cases for which the
	 * verifier doesn't know whether the source operand is constant.
	 */
1891 	return wrp_div_imm(nfp_prog, meta->insn.dst_reg * 2, meta->umin_src);
1892 }
1893 
1894 static int neg_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1895 {
1896 	const struct bpf_insn *insn = &meta->insn;
1897 
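	/* 64-bit negate: 0 - dst_lo, then 0 - dst_hi with borrow. */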
1898 	emit_alu(nfp_prog, reg_both(insn->dst_reg * 2), reg_imm(0),
1899 		 ALU_OP_SUB, reg_b(insn->dst_reg * 2));
1900 	emit_alu(nfp_prog, reg_both(insn->dst_reg * 2 + 1), reg_imm(0),
1901 		 ALU_OP_SUB_C, reg_b(insn->dst_reg * 2 + 1));
1902 
1903 	return 0;
1904 }
1905 
1906 /* Pseudo code:
1907  *   if shift_amt >= 32
1908  *     dst_high = dst_low << shift_amt[4:0]
1909  *     dst_low = 0;
1910  *   else
1911  *     dst_high = (dst_high, dst_low) >> (32 - shift_amt)
1912  *     dst_low = dst_low << shift_amt
1913  *
1914  * The indirect shift will use the same logic at runtime.
1915  */
1916 static int __shl_imm64(struct nfp_prog *nfp_prog, u8 dst, u8 shift_amt)
1917 {
1918 	if (shift_amt < 32) {
1919 		emit_shf(nfp_prog, reg_both(dst + 1), reg_a(dst + 1),
1920 			 SHF_OP_NONE, reg_b(dst), SHF_SC_R_DSHF,
1921 			 32 - shift_amt);
1922 		emit_shf(nfp_prog, reg_both(dst), reg_none(), SHF_OP_NONE,
1923 			 reg_b(dst), SHF_SC_L_SHF, shift_amt);
1924 	} else if (shift_amt == 32) {
1925 		wrp_reg_mov(nfp_prog, dst + 1, dst);
1926 		wrp_immed(nfp_prog, reg_both(dst), 0);
1927 	} else if (shift_amt > 32) {
1928 		emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_NONE,
1929 			 reg_b(dst), SHF_SC_L_SHF, shift_amt - 32);
1930 		wrp_immed(nfp_prog, reg_both(dst), 0);
1931 	}
1932 
1933 	return 0;
1934 }
1935 
1936 static int shl_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1937 {
1938 	const struct bpf_insn *insn = &meta->insn;
1939 	u8 dst = insn->dst_reg * 2;
1940 
1941 	return __shl_imm64(nfp_prog, dst, insn->imm);
1942 }
1943 
1944 static void shl_reg64_lt32_high(struct nfp_prog *nfp_prog, u8 dst, u8 src)
1945 {
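	/* The indirect double-shift takes its shift amount from the previous
	 * ALU result (source A), so compute 32 - shift_amt and latch it with
	 * a dummy OR before the shift itself.
	 */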
1946 	emit_alu(nfp_prog, imm_both(nfp_prog), reg_imm(32), ALU_OP_SUB,
1947 		 reg_b(src));
1948 	emit_alu(nfp_prog, reg_none(), imm_a(nfp_prog), ALU_OP_OR, reg_imm(0));
1949 	emit_shf_indir(nfp_prog, reg_both(dst + 1), reg_a(dst + 1), SHF_OP_NONE,
1950 		       reg_b(dst), SHF_SC_R_DSHF);
1951 }
1952 
1953 /* NOTE: for indirect left shift, HIGH part should be calculated first. */
1954 static void shl_reg64_lt32_low(struct nfp_prog *nfp_prog, u8 dst, u8 src)
1955 {
1956 	emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_imm(0));
1957 	emit_shf_indir(nfp_prog, reg_both(dst), reg_none(), SHF_OP_NONE,
1958 		       reg_b(dst), SHF_SC_L_SHF);
1959 }
1960 
1961 static void shl_reg64_lt32(struct nfp_prog *nfp_prog, u8 dst, u8 src)
1962 {
1963 	shl_reg64_lt32_high(nfp_prog, dst, src);
1964 	shl_reg64_lt32_low(nfp_prog, dst, src);
1965 }
1966 
1967 static void shl_reg64_ge32(struct nfp_prog *nfp_prog, u8 dst, u8 src)
1968 {
1969 	emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_imm(0));
1970 	emit_shf_indir(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_NONE,
1971 		       reg_b(dst), SHF_SC_L_SHF);
1972 	wrp_immed(nfp_prog, reg_both(dst), 0);
1973 }
1974 
1975 static int shl_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1976 {
1977 	const struct bpf_insn *insn = &meta->insn;
1978 	u64 umin, umax;
1979 	u8 dst, src;
1980 
1981 	dst = insn->dst_reg * 2;
1982 	umin = meta->umin_src;
1983 	umax = meta->umax_src;
1984 	if (umin == umax)
1985 		return __shl_imm64(nfp_prog, dst, umin);
1986 
1987 	src = insn->src_reg * 2;
1988 	if (umax < 32) {
1989 		shl_reg64_lt32(nfp_prog, dst, src);
1990 	} else if (umin >= 32) {
1991 		shl_reg64_ge32(nfp_prog, dst, src);
1992 	} else {
1993 		/* Generate different instruction sequences depending on runtime
1994 		 * value of shift amount.
1995 		 */
1996 		u16 label_ge32, label_end;
1997 
1998 		label_ge32 = nfp_prog_current_offset(nfp_prog) + 7;
1999 		emit_br_bset(nfp_prog, reg_a(src), 5, label_ge32, 0);
2000 
2001 		shl_reg64_lt32_high(nfp_prog, dst, src);
2002 		label_end = nfp_prog_current_offset(nfp_prog) + 6;
2003 		emit_br(nfp_prog, BR_UNC, label_end, 2);
2004 		/* shl_reg64_lt32_low packed in delay slot. */
2005 		shl_reg64_lt32_low(nfp_prog, dst, src);
2006 
2007 		if (!nfp_prog_confirm_current_offset(nfp_prog, label_ge32))
2008 			return -EINVAL;
2009 		shl_reg64_ge32(nfp_prog, dst, src);
2010 
2011 		if (!nfp_prog_confirm_current_offset(nfp_prog, label_end))
2012 			return -EINVAL;
2013 	}
2014 
2015 	return 0;
2016 }
2017 
2018 /* Pseudo code:
2019  *   if shift_amt >= 32
 *     dst_low = dst_high >> shift_amt[4:0]
 *     dst_high = 0;
 *   else
 *     dst_low = (dst_high, dst_low) >> shift_amt
 *     dst_high = dst_high >> shift_amt
2025  *
2026  * The indirect shift will use the same logic at runtime.
2027  */
2028 static int __shr_imm64(struct nfp_prog *nfp_prog, u8 dst, u8 shift_amt)
2029 {
2030 	if (shift_amt < 32) {
2031 		emit_shf(nfp_prog, reg_both(dst), reg_a(dst + 1), SHF_OP_NONE,
2032 			 reg_b(dst), SHF_SC_R_DSHF, shift_amt);
2033 		emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_NONE,
2034 			 reg_b(dst + 1), SHF_SC_R_SHF, shift_amt);
2035 	} else if (shift_amt == 32) {
2036 		wrp_reg_mov(nfp_prog, dst, dst + 1);
2037 		wrp_immed(nfp_prog, reg_both(dst + 1), 0);
2038 	} else if (shift_amt > 32) {
2039 		emit_shf(nfp_prog, reg_both(dst), reg_none(), SHF_OP_NONE,
2040 			 reg_b(dst + 1), SHF_SC_R_SHF, shift_amt - 32);
2041 		wrp_immed(nfp_prog, reg_both(dst + 1), 0);
2042 	}
2043 
2044 	return 0;
2045 }
2046 
2047 static int shr_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2048 {
2049 	const struct bpf_insn *insn = &meta->insn;
2050 	u8 dst = insn->dst_reg * 2;
2051 
2052 	return __shr_imm64(nfp_prog, dst, insn->imm);
2053 }
2054 
2055 /* NOTE: for indirect right shift, LOW part should be calculated first. */
2056 static void shr_reg64_lt32_high(struct nfp_prog *nfp_prog, u8 dst, u8 src)
2057 {
2058 	emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_imm(0));
2059 	emit_shf_indir(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_NONE,
2060 		       reg_b(dst + 1), SHF_SC_R_SHF);
2061 }
2062 
2063 static void shr_reg64_lt32_low(struct nfp_prog *nfp_prog, u8 dst, u8 src)
2064 {
2065 	emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_imm(0));
2066 	emit_shf_indir(nfp_prog, reg_both(dst), reg_a(dst + 1), SHF_OP_NONE,
2067 		       reg_b(dst), SHF_SC_R_DSHF);
2068 }
2069 
2070 static void shr_reg64_lt32(struct nfp_prog *nfp_prog, u8 dst, u8 src)
2071 {
2072 	shr_reg64_lt32_low(nfp_prog, dst, src);
2073 	shr_reg64_lt32_high(nfp_prog, dst, src);
2074 }
2075 
2076 static void shr_reg64_ge32(struct nfp_prog *nfp_prog, u8 dst, u8 src)
2077 {
2078 	emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_imm(0));
2079 	emit_shf_indir(nfp_prog, reg_both(dst), reg_none(), SHF_OP_NONE,
2080 		       reg_b(dst + 1), SHF_SC_R_SHF);
2081 	wrp_immed(nfp_prog, reg_both(dst + 1), 0);
2082 }
2083 
2084 static int shr_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2085 {
2086 	const struct bpf_insn *insn = &meta->insn;
2087 	u64 umin, umax;
2088 	u8 dst, src;
2089 
2090 	dst = insn->dst_reg * 2;
2091 	umin = meta->umin_src;
2092 	umax = meta->umax_src;
2093 	if (umin == umax)
2094 		return __shr_imm64(nfp_prog, dst, umin);
2095 
2096 	src = insn->src_reg * 2;
2097 	if (umax < 32) {
2098 		shr_reg64_lt32(nfp_prog, dst, src);
2099 	} else if (umin >= 32) {
2100 		shr_reg64_ge32(nfp_prog, dst, src);
2101 	} else {
2102 		/* Generate different instruction sequences depending on runtime
2103 		 * value of shift amount.
2104 		 */
2105 		u16 label_ge32, label_end;
2106 
2107 		label_ge32 = nfp_prog_current_offset(nfp_prog) + 6;
2108 		emit_br_bset(nfp_prog, reg_a(src), 5, label_ge32, 0);
2109 		shr_reg64_lt32_low(nfp_prog, dst, src);
2110 		label_end = nfp_prog_current_offset(nfp_prog) + 6;
2111 		emit_br(nfp_prog, BR_UNC, label_end, 2);
2112 		/* shr_reg64_lt32_high packed in delay slot. */
2113 		shr_reg64_lt32_high(nfp_prog, dst, src);
2114 
2115 		if (!nfp_prog_confirm_current_offset(nfp_prog, label_ge32))
2116 			return -EINVAL;
2117 		shr_reg64_ge32(nfp_prog, dst, src);
2118 
2119 		if (!nfp_prog_confirm_current_offset(nfp_prog, label_end))
2120 			return -EINVAL;
2121 	}
2122 
2123 	return 0;
2124 }
2125 
/* Code logic is the same as __shr_imm64 except that ashr requires the
 * signedness bit to be conveyed through the PREV_ALU result.
 */
2129 static int __ashr_imm64(struct nfp_prog *nfp_prog, u8 dst, u8 shift_amt)
2130 {
2131 	if (shift_amt < 32) {
2132 		emit_shf(nfp_prog, reg_both(dst), reg_a(dst + 1), SHF_OP_NONE,
2133 			 reg_b(dst), SHF_SC_R_DSHF, shift_amt);
2134 		/* Set signedness bit. */
2135 		emit_alu(nfp_prog, reg_none(), reg_a(dst + 1), ALU_OP_OR,
2136 			 reg_imm(0));
2137 		emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR,
2138 			 reg_b(dst + 1), SHF_SC_R_SHF, shift_amt);
2139 	} else if (shift_amt == 32) {
		/* NOTE: this also helps set the signedness bit. */
2141 		wrp_reg_mov(nfp_prog, dst, dst + 1);
2142 		emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR,
2143 			 reg_b(dst + 1), SHF_SC_R_SHF, 31);
2144 	} else if (shift_amt > 32) {
2145 		emit_alu(nfp_prog, reg_none(), reg_a(dst + 1), ALU_OP_OR,
2146 			 reg_imm(0));
2147 		emit_shf(nfp_prog, reg_both(dst), reg_none(), SHF_OP_ASHR,
2148 			 reg_b(dst + 1), SHF_SC_R_SHF, shift_amt - 32);
2149 		emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR,
2150 			 reg_b(dst + 1), SHF_SC_R_SHF, 31);
2151 	}
2152 
2153 	return 0;
2154 }
2155 
2156 static int ashr_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2157 {
2158 	const struct bpf_insn *insn = &meta->insn;
2159 	u8 dst = insn->dst_reg * 2;
2160 
2161 	return __ashr_imm64(nfp_prog, dst, insn->imm);
2162 }
2163 
2164 static void ashr_reg64_lt32_high(struct nfp_prog *nfp_prog, u8 dst, u8 src)
2165 {
2166 	/* NOTE: the first insn will set both indirect shift amount (source A)
2167 	 * and signedness bit (MSB of result).
2168 	 */
2169 	emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_b(dst + 1));
2170 	emit_shf_indir(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR,
2171 		       reg_b(dst + 1), SHF_SC_R_SHF);
2172 }
2173 
2174 static void ashr_reg64_lt32_low(struct nfp_prog *nfp_prog, u8 dst, u8 src)
2175 {
	/* NOTE: this is the same as a logical shift because we don't need to
	 * shift in the signedness bit when the shift amount is less than 32.
	 */
2179 	return shr_reg64_lt32_low(nfp_prog, dst, src);
2180 }
2181 
2182 static void ashr_reg64_lt32(struct nfp_prog *nfp_prog, u8 dst, u8 src)
2183 {
2184 	ashr_reg64_lt32_low(nfp_prog, dst, src);
2185 	ashr_reg64_lt32_high(nfp_prog, dst, src);
2186 }
2187 
2188 static void ashr_reg64_ge32(struct nfp_prog *nfp_prog, u8 dst, u8 src)
2189 {
2190 	emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_b(dst + 1));
2191 	emit_shf_indir(nfp_prog, reg_both(dst), reg_none(), SHF_OP_ASHR,
2192 		       reg_b(dst + 1), SHF_SC_R_SHF);
2193 	emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR,
2194 		 reg_b(dst + 1), SHF_SC_R_SHF, 31);
2195 }
2196 
2197 /* Like ashr_imm64, but need to use indirect shift. */
2198 static int ashr_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2199 {
2200 	const struct bpf_insn *insn = &meta->insn;
2201 	u64 umin, umax;
2202 	u8 dst, src;
2203 
2204 	dst = insn->dst_reg * 2;
2205 	umin = meta->umin_src;
2206 	umax = meta->umax_src;
2207 	if (umin == umax)
2208 		return __ashr_imm64(nfp_prog, dst, umin);
2209 
2210 	src = insn->src_reg * 2;
2211 	if (umax < 32) {
2212 		ashr_reg64_lt32(nfp_prog, dst, src);
2213 	} else if (umin >= 32) {
2214 		ashr_reg64_ge32(nfp_prog, dst, src);
2215 	} else {
2216 		u16 label_ge32, label_end;
2217 
2218 		label_ge32 = nfp_prog_current_offset(nfp_prog) + 6;
2219 		emit_br_bset(nfp_prog, reg_a(src), 5, label_ge32, 0);
2220 		ashr_reg64_lt32_low(nfp_prog, dst, src);
2221 		label_end = nfp_prog_current_offset(nfp_prog) + 6;
2222 		emit_br(nfp_prog, BR_UNC, label_end, 2);
2223 		/* ashr_reg64_lt32_high packed in delay slot. */
2224 		ashr_reg64_lt32_high(nfp_prog, dst, src);
2225 
2226 		if (!nfp_prog_confirm_current_offset(nfp_prog, label_ge32))
2227 			return -EINVAL;
2228 		ashr_reg64_ge32(nfp_prog, dst, src);
2229 
2230 		if (!nfp_prog_confirm_current_offset(nfp_prog, label_end))
2231 			return -EINVAL;
2232 	}
2233 
2234 	return 0;
2235 }
2236 
2237 static int mov_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2238 {
2239 	const struct bpf_insn *insn = &meta->insn;
2240 
2241 	wrp_reg_mov(nfp_prog, insn->dst_reg * 2,  insn->src_reg * 2);
2242 	wrp_immed(nfp_prog, reg_both(insn->dst_reg * 2 + 1), 0);
2243 
2244 	return 0;
2245 }
2246 
2247 static int mov_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2248 {
2249 	const struct bpf_insn *insn = &meta->insn;
2250 
2251 	wrp_immed(nfp_prog, reg_both(insn->dst_reg * 2), insn->imm);
2252 	wrp_immed(nfp_prog, reg_both(insn->dst_reg * 2 + 1), 0);
2253 
2254 	return 0;
2255 }
2256 
2257 static int xor_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2258 {
2259 	return wrp_alu32_reg(nfp_prog, meta, ALU_OP_XOR);
2260 }
2261 
2262 static int xor_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2263 {
	return wrp_alu32_imm(nfp_prog, meta, ALU_OP_XOR, !meta->insn.imm);
2265 }
2266 
2267 static int and_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2268 {
2269 	return wrp_alu32_reg(nfp_prog, meta, ALU_OP_AND);
2270 }
2271 
2272 static int and_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2273 {
2274 	return wrp_alu32_imm(nfp_prog, meta, ALU_OP_AND, !~meta->insn.imm);
2275 }
2276 
2277 static int or_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2278 {
2279 	return wrp_alu32_reg(nfp_prog, meta, ALU_OP_OR);
2280 }
2281 
2282 static int or_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2283 {
2284 	return wrp_alu32_imm(nfp_prog, meta, ALU_OP_OR, !meta->insn.imm);
2285 }
2286 
2287 static int add_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2288 {
2289 	return wrp_alu32_reg(nfp_prog, meta, ALU_OP_ADD);
2290 }
2291 
2292 static int add_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2293 {
2294 	return wrp_alu32_imm(nfp_prog, meta, ALU_OP_ADD, !meta->insn.imm);
2295 }
2296 
2297 static int sub_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2298 {
2299 	return wrp_alu32_reg(nfp_prog, meta, ALU_OP_SUB);
2300 }
2301 
2302 static int sub_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2303 {
2304 	return wrp_alu32_imm(nfp_prog, meta, ALU_OP_SUB, !meta->insn.imm);
2305 }
2306 
2307 static int mul_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2308 {
2309 	return wrp_mul(nfp_prog, meta, false, true);
2310 }
2311 
2312 static int mul_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2313 {
2314 	return wrp_mul(nfp_prog, meta, false, false);
2315 }
2316 
2317 static int div_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2318 {
2319 	return div_reg64(nfp_prog, meta);
2320 }
2321 
2322 static int div_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2323 {
2324 	return div_imm64(nfp_prog, meta);
2325 }
2326 
2327 static int neg_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2328 {
2329 	u8 dst = meta->insn.dst_reg * 2;
2330 
2331 	emit_alu(nfp_prog, reg_both(dst), reg_imm(0), ALU_OP_SUB, reg_b(dst));
2332 	wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1), 0);
2333 
2334 	return 0;
2335 }
2336 
2337 static int shl_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2338 {
2339 	const struct bpf_insn *insn = &meta->insn;
2340 
2341 	if (!insn->imm)
2342 		return 1; /* TODO: zero shift means indirect */
2343 
2344 	emit_shf(nfp_prog, reg_both(insn->dst_reg * 2),
2345 		 reg_none(), SHF_OP_NONE, reg_b(insn->dst_reg * 2),
2346 		 SHF_SC_L_SHF, insn->imm);
2347 	wrp_immed(nfp_prog, reg_both(insn->dst_reg * 2 + 1), 0);
2348 
2349 	return 0;
2350 }
2351 
2352 static int end_reg32(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2353 {
2354 	const struct bpf_insn *insn = &meta->insn;
2355 	u8 gpr = insn->dst_reg * 2;
2356 
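	/* Endianness conversion (BPF_END): the 16-bit swap exchanges the two
	 * low bytes and clears everything above, the 32-bit swap byte-swaps
	 * the low word, and the 64-bit swap byte-swaps each word and
	 * exchanges the halves (using imm_a as a temporary).
	 */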
2357 	switch (insn->imm) {
2358 	case 16:
2359 		emit_ld_field(nfp_prog, reg_both(gpr), 0x9, reg_b(gpr),
2360 			      SHF_SC_R_ROT, 8);
2361 		emit_ld_field(nfp_prog, reg_both(gpr), 0xe, reg_a(gpr),
2362 			      SHF_SC_R_SHF, 16);
2363 
2364 		wrp_immed(nfp_prog, reg_both(gpr + 1), 0);
2365 		break;
2366 	case 32:
2367 		wrp_end32(nfp_prog, reg_a(gpr), gpr);
2368 		wrp_immed(nfp_prog, reg_both(gpr + 1), 0);
2369 		break;
2370 	case 64:
2371 		wrp_mov(nfp_prog, imm_a(nfp_prog), reg_b(gpr + 1));
2372 
2373 		wrp_end32(nfp_prog, reg_a(gpr), gpr + 1);
2374 		wrp_end32(nfp_prog, imm_a(nfp_prog), gpr);
2375 		break;
2376 	}
2377 
2378 	return 0;
2379 }
2380 
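/* BPF_LD | BPF_IMM | BPF_DW carries a 64-bit immediate in two BPF insns.
 * imm_ld8() only registers a callback; imm_ld8_part2() runs on the second
 * insn, once both halves of the immediate are available.
 */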
2381 static int imm_ld8_part2(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2382 {
2383 	struct nfp_insn_meta *prev = nfp_meta_prev(meta);
2384 	u32 imm_lo, imm_hi;
2385 	u8 dst;
2386 
2387 	dst = prev->insn.dst_reg * 2;
2388 	imm_lo = prev->insn.imm;
2389 	imm_hi = meta->insn.imm;
2390 
2391 	wrp_immed(nfp_prog, reg_both(dst), imm_lo);
2392 
	/* mov is always 1 insn while load imm may take two, so prefer mov */
2394 	if (imm_hi == imm_lo)
2395 		wrp_mov(nfp_prog, reg_both(dst + 1), reg_a(dst));
2396 	else
2397 		wrp_immed(nfp_prog, reg_both(dst + 1), imm_hi);
2398 
2399 	return 0;
2400 }
2401 
2402 static int imm_ld8(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2403 {
2404 	meta->double_cb = imm_ld8_part2;
2405 	return 0;
2406 }
2407 
2408 static int data_ld1(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2409 {
2410 	return construct_data_ld(nfp_prog, meta->insn.imm, 1);
2411 }
2412 
2413 static int data_ld2(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2414 {
2415 	return construct_data_ld(nfp_prog, meta->insn.imm, 2);
2416 }
2417 
2418 static int data_ld4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2419 {
2420 	return construct_data_ld(nfp_prog, meta->insn.imm, 4);
2421 }
2422 
2423 static int data_ind_ld1(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2424 {
2425 	return construct_data_ind_ld(nfp_prog, meta->insn.imm,
2426 				     meta->insn.src_reg * 2, 1);
2427 }
2428 
2429 static int data_ind_ld2(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2430 {
2431 	return construct_data_ind_ld(nfp_prog, meta->insn.imm,
2432 				     meta->insn.src_reg * 2, 2);
2433 }
2434 
2435 static int data_ind_ld4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2436 {
2437 	return construct_data_ind_ld(nfp_prog, meta->insn.imm,
2438 				     meta->insn.src_reg * 2, 4);
2439 }
2440 
2441 static int
2442 mem_ldx_stack(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2443 	      unsigned int size, unsigned int ptr_off)
2444 {
2445 	return mem_op_stack(nfp_prog, meta, size, ptr_off,
2446 			    meta->insn.dst_reg * 2, meta->insn.src_reg * 2,
2447 			    true, wrp_lmem_load);
2448 }
2449 
2450 static int mem_ldx_skb(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2451 		       u8 size)
2452 {
2453 	swreg dst = reg_both(meta->insn.dst_reg * 2);
2454 
2455 	switch (meta->insn.off) {
2456 	case offsetof(struct __sk_buff, len):
2457 		if (size != FIELD_SIZEOF(struct __sk_buff, len))
2458 			return -EOPNOTSUPP;
2459 		wrp_mov(nfp_prog, dst, plen_reg(nfp_prog));
2460 		break;
2461 	case offsetof(struct __sk_buff, data):
2462 		if (size != FIELD_SIZEOF(struct __sk_buff, data))
2463 			return -EOPNOTSUPP;
2464 		wrp_mov(nfp_prog, dst, pptr_reg(nfp_prog));
2465 		break;
2466 	case offsetof(struct __sk_buff, data_end):
2467 		if (size != FIELD_SIZEOF(struct __sk_buff, data_end))
2468 			return -EOPNOTSUPP;
2469 		emit_alu(nfp_prog, dst,
2470 			 plen_reg(nfp_prog), ALU_OP_ADD, pptr_reg(nfp_prog));
2471 		break;
2472 	default:
2473 		return -EOPNOTSUPP;
2474 	}
2475 
2476 	wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1), 0);
2477 
2478 	return 0;
2479 }
2480 
2481 static int mem_ldx_xdp(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2482 		       u8 size)
2483 {
2484 	swreg dst = reg_both(meta->insn.dst_reg * 2);
2485 
2486 	switch (meta->insn.off) {
2487 	case offsetof(struct xdp_md, data):
2488 		if (size != FIELD_SIZEOF(struct xdp_md, data))
2489 			return -EOPNOTSUPP;
2490 		wrp_mov(nfp_prog, dst, pptr_reg(nfp_prog));
2491 		break;
2492 	case offsetof(struct xdp_md, data_end):
2493 		if (size != FIELD_SIZEOF(struct xdp_md, data_end))
2494 			return -EOPNOTSUPP;
2495 		emit_alu(nfp_prog, dst,
2496 			 plen_reg(nfp_prog), ALU_OP_ADD, pptr_reg(nfp_prog));
2497 		break;
2498 	default:
2499 		return -EOPNOTSUPP;
2500 	}
2501 
2502 	wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1), 0);
2503 
2504 	return 0;
2505 }
2506 
2507 static int
2508 mem_ldx_data(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2509 	     unsigned int size)
2510 {
2511 	swreg tmp_reg;
2512 
2513 	tmp_reg = re_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog));
2514 
2515 	return data_ld_host_order_addr32(nfp_prog, meta->insn.src_reg * 2,
2516 					 tmp_reg, meta->insn.dst_reg * 2, size);
2517 }
2518 
2519 static int
2520 mem_ldx_emem(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2521 	     unsigned int size)
2522 {
2523 	swreg tmp_reg;
2524 
2525 	tmp_reg = re_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog));
2526 
2527 	return data_ld_host_order_addr40(nfp_prog, meta->insn.src_reg * 2,
2528 					 tmp_reg, meta->insn.dst_reg * 2, size);
2529 }
2530 
2531 static void
2532 mem_ldx_data_init_pktcache(struct nfp_prog *nfp_prog,
2533 			   struct nfp_insn_meta *meta)
2534 {
2535 	s16 range_start = meta->pkt_cache.range_start;
2536 	s16 range_end = meta->pkt_cache.range_end;
2537 	swreg src_base, off;
2538 	u8 xfer_num, len;
2539 	bool indir;
2540 
2541 	off = re_load_imm_any(nfp_prog, range_start, imm_b(nfp_prog));
2542 	src_base = reg_a(meta->insn.src_reg * 2);
2543 	len = range_end - range_start;
2544 	xfer_num = round_up(len, REG_WIDTH) / REG_WIDTH;
2545 
2546 	indir = len > 8 * REG_WIDTH;
	/* Set up PREV_ALU for indirect mode. */
2548 	if (indir)
2549 		wrp_immed(nfp_prog, reg_none(),
2550 			  CMD_OVE_LEN | FIELD_PREP(CMD_OV_LEN, xfer_num - 1));
2551 
2552 	/* Cache memory into transfer-in registers. */
2553 	emit_cmd_any(nfp_prog, CMD_TGT_READ32_SWAP, CMD_MODE_32b, 0, src_base,
2554 		     off, xfer_num - 1, CMD_CTX_SWAP, indir);
2555 }
2556 
2557 static int
2558 mem_ldx_data_from_pktcache_unaligned(struct nfp_prog *nfp_prog,
2559 				     struct nfp_insn_meta *meta,
2560 				     unsigned int size)
2561 {
2562 	s16 range_start = meta->pkt_cache.range_start;
2563 	s16 insn_off = meta->insn.off - range_start;
2564 	swreg dst_lo, dst_hi, src_lo, src_mid;
2565 	u8 dst_gpr = meta->insn.dst_reg * 2;
2566 	u8 len_lo = size, len_mid = 0;
2567 	u8 idx = insn_off / REG_WIDTH;
2568 	u8 off = insn_off % REG_WIDTH;
2569 
2570 	dst_hi = reg_both(dst_gpr + 1);
2571 	dst_lo = reg_both(dst_gpr);
2572 	src_lo = reg_xfer(idx);
2573 
2574 	/* The read length could involve as many as three registers. */
2575 	if (size > REG_WIDTH - off) {
2576 		/* Calculate the part in the second register. */
2577 		len_lo = REG_WIDTH - off;
2578 		len_mid = size - len_lo;
2579 
2580 		/* Calculate the part in the third register. */
2581 		if (size > 2 * REG_WIDTH - off)
2582 			len_mid = REG_WIDTH;
2583 	}
2584 
2585 	wrp_reg_subpart(nfp_prog, dst_lo, src_lo, len_lo, off);
2586 
2587 	if (!len_mid) {
2588 		wrp_immed(nfp_prog, dst_hi, 0);
2589 		return 0;
2590 	}
2591 
2592 	src_mid = reg_xfer(idx + 1);
2593 
2594 	if (size <= REG_WIDTH) {
2595 		wrp_reg_or_subpart(nfp_prog, dst_lo, src_mid, len_mid, len_lo);
2596 		wrp_immed(nfp_prog, dst_hi, 0);
2597 	} else {
2598 		swreg src_hi = reg_xfer(idx + 2);
2599 
2600 		wrp_reg_or_subpart(nfp_prog, dst_lo, src_mid,
2601 				   REG_WIDTH - len_lo, len_lo);
2602 		wrp_reg_subpart(nfp_prog, dst_hi, src_mid, len_lo,
2603 				REG_WIDTH - len_lo);
2604 		wrp_reg_or_subpart(nfp_prog, dst_hi, src_hi, REG_WIDTH - len_lo,
2605 				   len_lo);
2606 	}
2607 
2608 	return 0;
2609 }
2610 
2611 static int
2612 mem_ldx_data_from_pktcache_aligned(struct nfp_prog *nfp_prog,
2613 				   struct nfp_insn_meta *meta,
2614 				   unsigned int size)
2615 {
2616 	swreg dst_lo, dst_hi, src_lo;
2617 	u8 dst_gpr, idx;
2618 
2619 	idx = (meta->insn.off - meta->pkt_cache.range_start) / REG_WIDTH;
2620 	dst_gpr = meta->insn.dst_reg * 2;
2621 	dst_hi = reg_both(dst_gpr + 1);
2622 	dst_lo = reg_both(dst_gpr);
2623 	src_lo = reg_xfer(idx);
2624 
2625 	if (size < REG_WIDTH) {
2626 		wrp_reg_subpart(nfp_prog, dst_lo, src_lo, size, 0);
2627 		wrp_immed(nfp_prog, dst_hi, 0);
2628 	} else if (size == REG_WIDTH) {
2629 		wrp_mov(nfp_prog, dst_lo, src_lo);
2630 		wrp_immed(nfp_prog, dst_hi, 0);
2631 	} else {
2632 		swreg src_hi = reg_xfer(idx + 1);
2633 
2634 		wrp_mov(nfp_prog, dst_lo, src_lo);
2635 		wrp_mov(nfp_prog, dst_hi, src_hi);
2636 	}
2637 
2638 	return 0;
2639 }
2640 
2641 static int
2642 mem_ldx_data_from_pktcache(struct nfp_prog *nfp_prog,
2643 			   struct nfp_insn_meta *meta, unsigned int size)
2644 {
2645 	u8 off = meta->insn.off - meta->pkt_cache.range_start;
2646 
2647 	if (IS_ALIGNED(off, REG_WIDTH))
2648 		return mem_ldx_data_from_pktcache_aligned(nfp_prog, meta, size);
2649 
2650 	return mem_ldx_data_from_pktcache_unaligned(nfp_prog, meta, size);
2651 }
2652 
2653 static int
2654 mem_ldx(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2655 	unsigned int size)
2656 {
2657 	if (meta->ldst_gather_len)
2658 		return nfp_cpp_memcpy(nfp_prog, meta);
2659 
2660 	if (meta->ptr.type == PTR_TO_CTX) {
2661 		if (nfp_prog->type == BPF_PROG_TYPE_XDP)
2662 			return mem_ldx_xdp(nfp_prog, meta, size);
2663 		else
2664 			return mem_ldx_skb(nfp_prog, meta, size);
2665 	}
2666 
2667 	if (meta->ptr.type == PTR_TO_PACKET) {
2668 		if (meta->pkt_cache.range_end) {
2669 			if (meta->pkt_cache.do_init)
2670 				mem_ldx_data_init_pktcache(nfp_prog, meta);
2671 
2672 			return mem_ldx_data_from_pktcache(nfp_prog, meta, size);
2673 		} else {
2674 			return mem_ldx_data(nfp_prog, meta, size);
2675 		}
2676 	}
2677 
2678 	if (meta->ptr.type == PTR_TO_STACK)
2679 		return mem_ldx_stack(nfp_prog, meta, size,
2680 				     meta->ptr.off + meta->ptr.var_off.value);
2681 
2682 	if (meta->ptr.type == PTR_TO_MAP_VALUE)
2683 		return mem_ldx_emem(nfp_prog, meta, size);
2684 
2685 	return -EOPNOTSUPP;
2686 }
2687 
2688 static int mem_ldx1(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2689 {
2690 	return mem_ldx(nfp_prog, meta, 1);
2691 }
2692 
2693 static int mem_ldx2(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2694 {
2695 	return mem_ldx(nfp_prog, meta, 2);
2696 }
2697 
2698 static int mem_ldx4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2699 {
2700 	return mem_ldx(nfp_prog, meta, 4);
2701 }
2702 
2703 static int mem_ldx8(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2704 {
2705 	return mem_ldx(nfp_prog, meta, 8);
2706 }
2707 
2708 static int
2709 mem_st_data(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2710 	    unsigned int size)
2711 {
2712 	u64 imm = meta->insn.imm; /* sign extend */
2713 	swreg off_reg;
2714 
2715 	off_reg = re_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog));
2716 
2717 	return data_st_host_order(nfp_prog, meta->insn.dst_reg * 2, off_reg,
2718 				  imm, size);
2719 }
2720 
2721 static int mem_st(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2722 		  unsigned int size)
2723 {
2724 	if (meta->ptr.type == PTR_TO_PACKET)
2725 		return mem_st_data(nfp_prog, meta, size);
2726 
2727 	return -EOPNOTSUPP;
2728 }
2729 
2730 static int mem_st1(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2731 {
2732 	return mem_st(nfp_prog, meta, 1);
2733 }
2734 
2735 static int mem_st2(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2736 {
2737 	return mem_st(nfp_prog, meta, 2);
2738 }
2739 
2740 static int mem_st4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2741 {
2742 	return mem_st(nfp_prog, meta, 4);
2743 }
2744 
2745 static int mem_st8(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2746 {
2747 	return mem_st(nfp_prog, meta, 8);
2748 }
2749 
2750 static int
2751 mem_stx_data(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2752 	     unsigned int size)
2753 {
2754 	swreg off_reg;
2755 
2756 	off_reg = re_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog));
2757 
2758 	return data_stx_host_order(nfp_prog, meta->insn.dst_reg * 2, off_reg,
2759 				   meta->insn.src_reg * 2, size);
2760 }
2761 
2762 static int
2763 mem_stx_stack(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2764 	      unsigned int size, unsigned int ptr_off)
2765 {
2766 	return mem_op_stack(nfp_prog, meta, size, ptr_off,
2767 			    meta->insn.src_reg * 2, meta->insn.dst_reg * 2,
2768 			    false, wrp_lmem_store);
2769 }
2770 
2771 static int mem_stx_xdp(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2772 {
2773 	switch (meta->insn.off) {
2774 	case offsetof(struct xdp_md, rx_queue_index):
2775 		return nfp_queue_select(nfp_prog, meta);
2776 	}
2777 
2778 	WARN_ON_ONCE(1); /* verifier should have rejected bad accesses */
2779 	return -EOPNOTSUPP;
2780 }
2781 
2782 static int
2783 mem_stx(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2784 	unsigned int size)
2785 {
2786 	if (meta->ptr.type == PTR_TO_PACKET)
2787 		return mem_stx_data(nfp_prog, meta, size);
2788 
2789 	if (meta->ptr.type == PTR_TO_STACK)
2790 		return mem_stx_stack(nfp_prog, meta, size,
2791 				     meta->ptr.off + meta->ptr.var_off.value);
2792 
2793 	return -EOPNOTSUPP;
2794 }
2795 
2796 static int mem_stx1(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2797 {
2798 	return mem_stx(nfp_prog, meta, 1);
2799 }
2800 
2801 static int mem_stx2(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2802 {
2803 	return mem_stx(nfp_prog, meta, 2);
2804 }
2805 
2806 static int mem_stx4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2807 {
2808 	if (meta->ptr.type == PTR_TO_CTX)
2809 		if (nfp_prog->type == BPF_PROG_TYPE_XDP)
2810 			return mem_stx_xdp(nfp_prog, meta);
2811 	return mem_stx(nfp_prog, meta, 4);
2812 }
2813 
2814 static int mem_stx8(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2815 {
2816 	return mem_stx(nfp_prog, meta, 8);
2817 }
2818 
2819 static int
2820 mem_xadd(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, bool is64)
2821 {
2822 	u8 dst_gpr = meta->insn.dst_reg * 2;
2823 	u8 src_gpr = meta->insn.src_reg * 2;
2824 	unsigned int full_add, out;
2825 	swreg addra, addrb, off;
2826 
2827 	off = ur_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog));
2828 
	/* We can fit 16 bits into the command immediate.  If we know the
	 * immediate is guaranteed to either always or never fit into 16 bits,
	 * we only generate code to handle that particular case; otherwise we
	 * generate code for both.
	 */
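	/* 'full_add' is the offset of the full (non-immediate) add sequence,
	 * 'out' the first instruction after the whole construct; both are
	 * computed up front from the number of instructions each optional
	 * part emits.
	 */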
2834 	out = nfp_prog_current_offset(nfp_prog);
2835 	full_add = nfp_prog_current_offset(nfp_prog);
2836 
2837 	if (meta->insn.off) {
2838 		out += 2;
2839 		full_add += 2;
2840 	}
2841 	if (meta->xadd_maybe_16bit) {
2842 		out += 3;
2843 		full_add += 3;
2844 	}
2845 	if (meta->xadd_over_16bit)
2846 		out += 2 + is64;
2847 	if (meta->xadd_maybe_16bit && meta->xadd_over_16bit) {
2848 		out += 5;
2849 		full_add += 5;
2850 	}
2851 
2852 	/* Generate the branch for choosing add_imm vs add */
2853 	if (meta->xadd_maybe_16bit && meta->xadd_over_16bit) {
2854 		swreg max_imm = imm_a(nfp_prog);
2855 
2856 		wrp_immed(nfp_prog, max_imm, 0xffff);
2857 		emit_alu(nfp_prog, reg_none(),
2858 			 max_imm, ALU_OP_SUB, reg_b(src_gpr));
2859 		emit_alu(nfp_prog, reg_none(),
2860 			 reg_imm(0), ALU_OP_SUB_C, reg_b(src_gpr + 1));
2861 		emit_br(nfp_prog, BR_BLO, full_add, meta->insn.off ? 2 : 0);
2862 		/* defer for add */
2863 	}
2864 
	/* If the insn has an offset, add it to the address */
2866 	if (!meta->insn.off) {
2867 		addra = reg_a(dst_gpr);
2868 		addrb = reg_b(dst_gpr + 1);
2869 	} else {
2870 		emit_alu(nfp_prog, imma_a(nfp_prog),
2871 			 reg_a(dst_gpr), ALU_OP_ADD, off);
2872 		emit_alu(nfp_prog, imma_b(nfp_prog),
2873 			 reg_a(dst_gpr + 1), ALU_OP_ADD_C, reg_imm(0));
2874 		addra = imma_a(nfp_prog);
2875 		addrb = imma_b(nfp_prog);
2876 	}
2877 
2878 	/* Generate the add_imm if 16 bits are possible */
2879 	if (meta->xadd_maybe_16bit) {
2880 		swreg prev_alu = imm_a(nfp_prog);
2881 
2882 		wrp_immed(nfp_prog, prev_alu,
2883 			  FIELD_PREP(CMD_OVE_DATA, 2) |
2884 			  CMD_OVE_LEN |
2885 			  FIELD_PREP(CMD_OV_LEN, 0x8 | is64 << 2));
2886 		wrp_reg_or_subpart(nfp_prog, prev_alu, reg_b(src_gpr), 2, 2);
2887 		emit_cmd_indir(nfp_prog, CMD_TGT_ADD_IMM, CMD_MODE_40b_BA, 0,
2888 			       addra, addrb, 0, CMD_CTX_NO_SWAP);
2889 
2890 		if (meta->xadd_over_16bit)
2891 			emit_br(nfp_prog, BR_UNC, out, 0);
2892 	}
2893 
2894 	if (!nfp_prog_confirm_current_offset(nfp_prog, full_add))
2895 		return -EINVAL;
2896 
2897 	/* Generate the add if 16 bits are not guaranteed */
2898 	if (meta->xadd_over_16bit) {
2899 		emit_cmd(nfp_prog, CMD_TGT_ADD, CMD_MODE_40b_BA, 0,
2900 			 addra, addrb, is64 << 2,
2901 			 is64 ? CMD_CTX_SWAP_DEFER2 : CMD_CTX_SWAP_DEFER1);
2902 
2903 		wrp_mov(nfp_prog, reg_xfer(0), reg_a(src_gpr));
2904 		if (is64)
2905 			wrp_mov(nfp_prog, reg_xfer(1), reg_a(src_gpr + 1));
2906 	}
2907 
2908 	if (!nfp_prog_confirm_current_offset(nfp_prog, out))
2909 		return -EINVAL;
2910 
2911 	return 0;
2912 }
2913 
2914 static int mem_xadd4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2915 {
2916 	return mem_xadd(nfp_prog, meta, false);
2917 }
2918 
2919 static int mem_xadd8(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2920 {
2921 	return mem_xadd(nfp_prog, meta, true);
2922 }
2923 
2924 static int jump(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2925 {
2926 	emit_br(nfp_prog, BR_UNC, meta->insn.off, 0);
2927 
2928 	return 0;
2929 }
2930 
2931 static int jeq_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2932 {
2933 	const struct bpf_insn *insn = &meta->insn;
2934 	u64 imm = insn->imm; /* sign extend */
2935 	swreg or1, or2, tmp_reg;
2936 
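	/* 64-bit compare against an immediate: XOR each 32-bit half with the
	 * corresponding half of the immediate (halves known to be zero are
	 * used as-is), OR the results together and branch if the whole thing
	 * is zero.
	 */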
2937 	or1 = reg_a(insn->dst_reg * 2);
2938 	or2 = reg_b(insn->dst_reg * 2 + 1);
2939 
2940 	if (imm & ~0U) {
2941 		tmp_reg = ur_load_imm_any(nfp_prog, imm & ~0U, imm_b(nfp_prog));
2942 		emit_alu(nfp_prog, imm_a(nfp_prog),
2943 			 reg_a(insn->dst_reg * 2), ALU_OP_XOR, tmp_reg);
2944 		or1 = imm_a(nfp_prog);
2945 	}
2946 
2947 	if (imm >> 32) {
2948 		tmp_reg = ur_load_imm_any(nfp_prog, imm >> 32, imm_b(nfp_prog));
2949 		emit_alu(nfp_prog, imm_b(nfp_prog),
2950 			 reg_a(insn->dst_reg * 2 + 1), ALU_OP_XOR, tmp_reg);
2951 		or2 = imm_b(nfp_prog);
2952 	}
2953 
2954 	emit_alu(nfp_prog, reg_none(), or1, ALU_OP_OR, or2);
2955 	emit_br(nfp_prog, BR_BEQ, insn->off, 0);
2956 
2957 	return 0;
2958 }
2959 
2960 static int jset_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2961 {
2962 	const struct bpf_insn *insn = &meta->insn;
2963 	u64 imm = insn->imm; /* sign extend */
2964 	swreg tmp_reg;
2965 
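	/* JSET with a zero mask can never be taken, skip the instruction. */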
2966 	if (!imm) {
2967 		meta->skip = true;
2968 		return 0;
2969 	}
2970 
2971 	if (imm & ~0U) {
2972 		tmp_reg = ur_load_imm_any(nfp_prog, imm & ~0U, imm_b(nfp_prog));
2973 		emit_alu(nfp_prog, reg_none(),
2974 			 reg_a(insn->dst_reg * 2), ALU_OP_AND, tmp_reg);
2975 		emit_br(nfp_prog, BR_BNE, insn->off, 0);
2976 	}
2977 
2978 	if (imm >> 32) {
2979 		tmp_reg = ur_load_imm_any(nfp_prog, imm >> 32, imm_b(nfp_prog));
2980 		emit_alu(nfp_prog, reg_none(),
2981 			 reg_a(insn->dst_reg * 2 + 1), ALU_OP_AND, tmp_reg);
2982 		emit_br(nfp_prog, BR_BNE, insn->off, 0);
2983 	}
2984 
2985 	return 0;
2986 }
2987 
2988 static int jne_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2989 {
2990 	const struct bpf_insn *insn = &meta->insn;
2991 	u64 imm = insn->imm; /* sign extend */
2992 	swreg tmp_reg;
2993 
2994 	if (!imm) {
2995 		emit_alu(nfp_prog, reg_none(), reg_a(insn->dst_reg * 2),
2996 			 ALU_OP_OR, reg_b(insn->dst_reg * 2 + 1));
2997 		emit_br(nfp_prog, BR_BNE, insn->off, 0);
2998 		return 0;
2999 	}
3000 
3001 	tmp_reg = ur_load_imm_any(nfp_prog, imm & ~0U, imm_b(nfp_prog));
3002 	emit_alu(nfp_prog, reg_none(),
3003 		 reg_a(insn->dst_reg * 2), ALU_OP_XOR, tmp_reg);
3004 	emit_br(nfp_prog, BR_BNE, insn->off, 0);
3005 
3006 	tmp_reg = ur_load_imm_any(nfp_prog, imm >> 32, imm_b(nfp_prog));
3007 	emit_alu(nfp_prog, reg_none(),
3008 		 reg_a(insn->dst_reg * 2 + 1), ALU_OP_XOR, tmp_reg);
3009 	emit_br(nfp_prog, BR_BNE, insn->off, 0);
3010 
3011 	return 0;
3012 }
3013 
3014 static int jeq_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3015 {
3016 	const struct bpf_insn *insn = &meta->insn;
3017 
3018 	emit_alu(nfp_prog, imm_a(nfp_prog), reg_a(insn->dst_reg * 2),
3019 		 ALU_OP_XOR, reg_b(insn->src_reg * 2));
3020 	emit_alu(nfp_prog, imm_b(nfp_prog), reg_a(insn->dst_reg * 2 + 1),
3021 		 ALU_OP_XOR, reg_b(insn->src_reg * 2 + 1));
3022 	emit_alu(nfp_prog, reg_none(),
3023 		 imm_a(nfp_prog), ALU_OP_OR, imm_b(nfp_prog));
3024 	emit_br(nfp_prog, BR_BEQ, insn->off, 0);
3025 
3026 	return 0;
3027 }
3028 
3029 static int jset_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3030 {
3031 	return wrp_test_reg(nfp_prog, meta, ALU_OP_AND, BR_BNE);
3032 }
3033 
3034 static int jne_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3035 {
3036 	return wrp_test_reg(nfp_prog, meta, ALU_OP_XOR, BR_BNE);
3037 }
3038 
3039 static int call(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3040 {
3041 	switch (meta->insn.imm) {
3042 	case BPF_FUNC_xdp_adjust_head:
3043 		return adjust_head(nfp_prog, meta);
3044 	case BPF_FUNC_map_lookup_elem:
3045 	case BPF_FUNC_map_update_elem:
3046 	case BPF_FUNC_map_delete_elem:
3047 		return map_call_stack_common(nfp_prog, meta);
3048 	case BPF_FUNC_get_prandom_u32:
3049 		return nfp_get_prandom_u32(nfp_prog, meta);
3050 	case BPF_FUNC_perf_event_output:
3051 		return nfp_perf_event_output(nfp_prog, meta);
3052 	default:
3053 		WARN_ONCE(1, "verifier allowed unsupported function\n");
3054 		return -EOPNOTSUPP;
3055 	}
3056 }
3057 
3058 static int goto_out(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3059 {
3060 	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 0, RELO_BR_GO_OUT);
3061 
3062 	return 0;
3063 }
3064 
3065 static const instr_cb_t instr_cb[256] = {
3066 	[BPF_ALU64 | BPF_MOV | BPF_X] =	mov_reg64,
3067 	[BPF_ALU64 | BPF_MOV | BPF_K] =	mov_imm64,
3068 	[BPF_ALU64 | BPF_XOR | BPF_X] =	xor_reg64,
3069 	[BPF_ALU64 | BPF_XOR | BPF_K] =	xor_imm64,
3070 	[BPF_ALU64 | BPF_AND | BPF_X] =	and_reg64,
3071 	[BPF_ALU64 | BPF_AND | BPF_K] =	and_imm64,
3072 	[BPF_ALU64 | BPF_OR | BPF_X] =	or_reg64,
3073 	[BPF_ALU64 | BPF_OR | BPF_K] =	or_imm64,
3074 	[BPF_ALU64 | BPF_ADD | BPF_X] =	add_reg64,
3075 	[BPF_ALU64 | BPF_ADD | BPF_K] =	add_imm64,
3076 	[BPF_ALU64 | BPF_SUB | BPF_X] =	sub_reg64,
3077 	[BPF_ALU64 | BPF_SUB | BPF_K] =	sub_imm64,
3078 	[BPF_ALU64 | BPF_MUL | BPF_X] =	mul_reg64,
3079 	[BPF_ALU64 | BPF_MUL | BPF_K] =	mul_imm64,
3080 	[BPF_ALU64 | BPF_DIV | BPF_X] =	div_reg64,
3081 	[BPF_ALU64 | BPF_DIV | BPF_K] =	div_imm64,
3082 	[BPF_ALU64 | BPF_NEG] =		neg_reg64,
3083 	[BPF_ALU64 | BPF_LSH | BPF_X] =	shl_reg64,
3084 	[BPF_ALU64 | BPF_LSH | BPF_K] =	shl_imm64,
3085 	[BPF_ALU64 | BPF_RSH | BPF_X] =	shr_reg64,
3086 	[BPF_ALU64 | BPF_RSH | BPF_K] =	shr_imm64,
3087 	[BPF_ALU64 | BPF_ARSH | BPF_X] = ashr_reg64,
3088 	[BPF_ALU64 | BPF_ARSH | BPF_K] = ashr_imm64,
3089 	[BPF_ALU | BPF_MOV | BPF_X] =	mov_reg,
3090 	[BPF_ALU | BPF_MOV | BPF_K] =	mov_imm,
3091 	[BPF_ALU | BPF_XOR | BPF_X] =	xor_reg,
3092 	[BPF_ALU | BPF_XOR | BPF_K] =	xor_imm,
3093 	[BPF_ALU | BPF_AND | BPF_X] =	and_reg,
3094 	[BPF_ALU | BPF_AND | BPF_K] =	and_imm,
3095 	[BPF_ALU | BPF_OR | BPF_X] =	or_reg,
3096 	[BPF_ALU | BPF_OR | BPF_K] =	or_imm,
3097 	[BPF_ALU | BPF_ADD | BPF_X] =	add_reg,
3098 	[BPF_ALU | BPF_ADD | BPF_K] =	add_imm,
3099 	[BPF_ALU | BPF_SUB | BPF_X] =	sub_reg,
3100 	[BPF_ALU | BPF_SUB | BPF_K] =	sub_imm,
3101 	[BPF_ALU | BPF_MUL | BPF_X] =	mul_reg,
3102 	[BPF_ALU | BPF_MUL | BPF_K] =	mul_imm,
3103 	[BPF_ALU | BPF_DIV | BPF_X] =	div_reg,
3104 	[BPF_ALU | BPF_DIV | BPF_K] =	div_imm,
3105 	[BPF_ALU | BPF_NEG] =		neg_reg,
3106 	[BPF_ALU | BPF_LSH | BPF_K] =	shl_imm,
3107 	[BPF_ALU | BPF_END | BPF_X] =	end_reg32,
3108 	[BPF_LD | BPF_IMM | BPF_DW] =	imm_ld8,
3109 	[BPF_LD | BPF_ABS | BPF_B] =	data_ld1,
3110 	[BPF_LD | BPF_ABS | BPF_H] =	data_ld2,
3111 	[BPF_LD | BPF_ABS | BPF_W] =	data_ld4,
3112 	[BPF_LD | BPF_IND | BPF_B] =	data_ind_ld1,
3113 	[BPF_LD | BPF_IND | BPF_H] =	data_ind_ld2,
3114 	[BPF_LD | BPF_IND | BPF_W] =	data_ind_ld4,
3115 	[BPF_LDX | BPF_MEM | BPF_B] =	mem_ldx1,
3116 	[BPF_LDX | BPF_MEM | BPF_H] =	mem_ldx2,
3117 	[BPF_LDX | BPF_MEM | BPF_W] =	mem_ldx4,
3118 	[BPF_LDX | BPF_MEM | BPF_DW] =	mem_ldx8,
3119 	[BPF_STX | BPF_MEM | BPF_B] =	mem_stx1,
3120 	[BPF_STX | BPF_MEM | BPF_H] =	mem_stx2,
3121 	[BPF_STX | BPF_MEM | BPF_W] =	mem_stx4,
3122 	[BPF_STX | BPF_MEM | BPF_DW] =	mem_stx8,
3123 	[BPF_STX | BPF_XADD | BPF_W] =	mem_xadd4,
3124 	[BPF_STX | BPF_XADD | BPF_DW] =	mem_xadd8,
3125 	[BPF_ST | BPF_MEM | BPF_B] =	mem_st1,
3126 	[BPF_ST | BPF_MEM | BPF_H] =	mem_st2,
3127 	[BPF_ST | BPF_MEM | BPF_W] =	mem_st4,
3128 	[BPF_ST | BPF_MEM | BPF_DW] =	mem_st8,
3129 	[BPF_JMP | BPF_JA | BPF_K] =	jump,
3130 	[BPF_JMP | BPF_JEQ | BPF_K] =	jeq_imm,
3131 	[BPF_JMP | BPF_JGT | BPF_K] =	cmp_imm,
3132 	[BPF_JMP | BPF_JGE | BPF_K] =	cmp_imm,
3133 	[BPF_JMP | BPF_JLT | BPF_K] =	cmp_imm,
3134 	[BPF_JMP | BPF_JLE | BPF_K] =	cmp_imm,
3135 	[BPF_JMP | BPF_JSGT | BPF_K] =  cmp_imm,
3136 	[BPF_JMP | BPF_JSGE | BPF_K] =  cmp_imm,
3137 	[BPF_JMP | BPF_JSLT | BPF_K] =  cmp_imm,
3138 	[BPF_JMP | BPF_JSLE | BPF_K] =  cmp_imm,
3139 	[BPF_JMP | BPF_JSET | BPF_K] =	jset_imm,
3140 	[BPF_JMP | BPF_JNE | BPF_K] =	jne_imm,
3141 	[BPF_JMP | BPF_JEQ | BPF_X] =	jeq_reg,
3142 	[BPF_JMP | BPF_JGT | BPF_X] =	cmp_reg,
3143 	[BPF_JMP | BPF_JGE | BPF_X] =	cmp_reg,
3144 	[BPF_JMP | BPF_JLT | BPF_X] =	cmp_reg,
3145 	[BPF_JMP | BPF_JLE | BPF_X] =	cmp_reg,
3146 	[BPF_JMP | BPF_JSGT | BPF_X] =  cmp_reg,
3147 	[BPF_JMP | BPF_JSGE | BPF_X] =  cmp_reg,
3148 	[BPF_JMP | BPF_JSLT | BPF_X] =  cmp_reg,
3149 	[BPF_JMP | BPF_JSLE | BPF_X] =  cmp_reg,
3150 	[BPF_JMP | BPF_JSET | BPF_X] =	jset_reg,
3151 	[BPF_JMP | BPF_JNE | BPF_X] =	jne_reg,
3152 	[BPF_JMP | BPF_CALL] =		call,
3153 	[BPF_JMP | BPF_EXIT] =		goto_out,
3154 };
3155 
3156 /* --- Assembler logic --- */
3157 static int nfp_fixup_branches(struct nfp_prog *nfp_prog)
3158 {
3159 	struct nfp_insn_meta *meta, *jmp_dst;
3160 	u32 idx, br_idx;
3161 
3162 	list_for_each_entry(meta, &nfp_prog->insns, l) {
3163 		if (meta->skip)
3164 			continue;
3165 		if (meta->insn.code == (BPF_JMP | BPF_CALL))
3166 			continue;
3167 		if (BPF_CLASS(meta->insn.code) != BPF_JMP)
3168 			continue;
3169 
3170 		if (list_is_last(&meta->l, &nfp_prog->insns))
3171 			br_idx = nfp_prog->last_bpf_off;
3172 		else
3173 			br_idx = list_next_entry(meta, l)->off - 1;
3174 
3175 		if (!nfp_is_br(nfp_prog->prog[br_idx])) {
3176 			pr_err("Fixup found block not ending in branch %d %02x %016llx!!\n",
3177 			       br_idx, meta->insn.code, nfp_prog->prog[br_idx]);
3178 			return -ELOOP;
3179 		}
3180 		/* Leave special branches for later */
3181 		if (FIELD_GET(OP_RELO_TYPE, nfp_prog->prog[br_idx]) !=
3182 		    RELO_BR_REL)
3183 			continue;
3184 
3185 		if (!meta->jmp_dst) {
3186 			pr_err("Non-exit jump doesn't have destination info recorded!!\n");
3187 			return -ELOOP;
3188 		}
3189 
3190 		jmp_dst = meta->jmp_dst;
3191 
3192 		if (jmp_dst->skip) {
3193 			pr_err("Branch landing on removed instruction!!\n");
3194 			return -ELOOP;
3195 		}
3196 
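		/* A single BPF jump may have been expanded into several NFP
		 * branches (e.g. 64-bit compares), point every branch in the
		 * block at the translated destination.
		 */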
3197 		for (idx = meta->off; idx <= br_idx; idx++) {
3198 			if (!nfp_is_br(nfp_prog->prog[idx]))
3199 				continue;
3200 			br_set_offset(&nfp_prog->prog[idx], jmp_dst->off);
3201 		}
3202 	}
3203 
3204 	return 0;
3205 }
3206 
3207 static void nfp_intro(struct nfp_prog *nfp_prog)
3208 {
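	/* Extract the packet length from the packet vector: mask the length
	 * field (low 14 bits) into the length register.
	 */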
3209 	wrp_immed(nfp_prog, plen_reg(nfp_prog), GENMASK(13, 0));
3210 	emit_alu(nfp_prog, plen_reg(nfp_prog),
3211 		 plen_reg(nfp_prog), ALU_OP_AND, pv_len(nfp_prog));
3212 }
3213 
3214 static void nfp_outro_tc_da(struct nfp_prog *nfp_prog)
3215 {
3216 	/* TC direct-action mode:
3217 	 *   0,1   ok        NOT SUPPORTED[1]
3218 	 *   2   drop  0x22 -> drop,  count as stat1
3219 	 *   4,5 nuke  0x02 -> drop
3220 	 *   7  redir  0x44 -> redir, count as stat2
3221 	 *   * unspec  0x11 -> pass,  count as stat0
3222 	 *
3223 	 * [1] We can't support OK and RECLASSIFY because we can't tell TC
3224 	 *     the exact decision made.  We are forced to support UNSPEC
3225 	 *     to handle aborts so that's the only one we handle for passing
3226 	 *     packets up the stack.
3227 	 */
3228 	/* Target for aborts */
3229 	nfp_prog->tgt_abort = nfp_prog_current_offset(nfp_prog);
3230 
3231 	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 2, RELO_BR_NEXT_PKT);
3232 
3233 	wrp_mov(nfp_prog, reg_a(0), NFP_BPF_ABI_FLAGS);
3234 	emit_ld_field(nfp_prog, reg_a(0), 0xc, reg_imm(0x11), SHF_SC_L_SHF, 16);
3235 
3236 	/* Target for normal exits */
3237 	nfp_prog->tgt_out = nfp_prog_current_offset(nfp_prog);
3238 
3239 	/* if R0 > 7 jump to abort */
3240 	emit_alu(nfp_prog, reg_none(), reg_imm(7), ALU_OP_SUB, reg_b(0));
3241 	emit_br(nfp_prog, BR_BLO, nfp_prog->tgt_abort, 0);
3242 	wrp_mov(nfp_prog, reg_a(0), NFP_BPF_ABI_FLAGS);
3243 
3244 	wrp_immed(nfp_prog, reg_b(2), 0x41221211);
3245 	wrp_immed(nfp_prog, reg_b(3), 0x41001211);
3246 
3247 	emit_shf(nfp_prog, reg_a(1),
3248 		 reg_none(), SHF_OP_NONE, reg_b(0), SHF_SC_L_SHF, 2);
3249 
3250 	emit_alu(nfp_prog, reg_none(), reg_a(1), ALU_OP_OR, reg_imm(0));
3251 	emit_shf(nfp_prog, reg_a(2),
3252 		 reg_imm(0xf), SHF_OP_AND, reg_b(2), SHF_SC_R_SHF, 0);
3253 
3254 	emit_alu(nfp_prog, reg_none(), reg_a(1), ALU_OP_OR, reg_imm(0));
3255 	emit_shf(nfp_prog, reg_b(2),
3256 		 reg_imm(0xf), SHF_OP_AND, reg_b(3), SHF_SC_R_SHF, 0);
3257 
3258 	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 2, RELO_BR_NEXT_PKT);
3259 
3260 	emit_shf(nfp_prog, reg_b(2),
3261 		 reg_a(2), SHF_OP_OR, reg_b(2), SHF_SC_L_SHF, 4);
3262 	emit_ld_field(nfp_prog, reg_a(0), 0xc, reg_b(2), SHF_SC_L_SHF, 16);
3263 }
3264 
3265 static void nfp_outro_xdp(struct nfp_prog *nfp_prog)
3266 {
3267 	/* XDP return codes:
3268 	 *   0 aborted  0x82 -> drop,  count as stat3
3269 	 *   1    drop  0x22 -> drop,  count as stat1
3270 	 *   2    pass  0x11 -> pass,  count as stat0
3271 	 *   3      tx  0x44 -> redir, count as stat2
3272 	 *   * unknown  0x82 -> drop,  count as stat3
3273 	 */
3274 	/* Target for aborts */
3275 	nfp_prog->tgt_abort = nfp_prog_current_offset(nfp_prog);
3276 
3277 	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 2, RELO_BR_NEXT_PKT);
3278 
3279 	wrp_mov(nfp_prog, reg_a(0), NFP_BPF_ABI_FLAGS);
3280 	emit_ld_field(nfp_prog, reg_a(0), 0xc, reg_imm(0x82), SHF_SC_L_SHF, 16);
3281 
3282 	/* Target for normal exits */
3283 	nfp_prog->tgt_out = nfp_prog_current_offset(nfp_prog);
3284 
3285 	/* if R0 > 3 jump to abort */
3286 	emit_alu(nfp_prog, reg_none(), reg_imm(3), ALU_OP_SUB, reg_b(0));
3287 	emit_br(nfp_prog, BR_BLO, nfp_prog->tgt_abort, 0);
3288 
3289 	wrp_immed(nfp_prog, reg_b(2), 0x44112282);
3290 
3291 	emit_shf(nfp_prog, reg_a(1),
3292 		 reg_none(), SHF_OP_NONE, reg_b(0), SHF_SC_L_SHF, 3);
3293 
3294 	emit_alu(nfp_prog, reg_none(), reg_a(1), ALU_OP_OR, reg_imm(0));
3295 	emit_shf(nfp_prog, reg_b(2),
3296 		 reg_imm(0xff), SHF_OP_AND, reg_b(2), SHF_SC_R_SHF, 0);
3297 
3298 	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 2, RELO_BR_NEXT_PKT);
3299 
3300 	wrp_mov(nfp_prog, reg_a(0), NFP_BPF_ABI_FLAGS);
3301 	emit_ld_field(nfp_prog, reg_a(0), 0xc, reg_b(2), SHF_SC_L_SHF, 16);
3302 }
3303 
3304 static void nfp_outro(struct nfp_prog *nfp_prog)
3305 {
3306 	switch (nfp_prog->type) {
3307 	case BPF_PROG_TYPE_SCHED_CLS:
3308 		nfp_outro_tc_da(nfp_prog);
3309 		break;
3310 	case BPF_PROG_TYPE_XDP:
3311 		nfp_outro_xdp(nfp_prog);
3312 		break;
3313 	default:
3314 		WARN_ON(1);
3315 	}
3316 }
3317 
3318 static int nfp_translate(struct nfp_prog *nfp_prog)
3319 {
3320 	struct nfp_insn_meta *meta;
3321 	int err;
3322 
3323 	nfp_intro(nfp_prog);
3324 	if (nfp_prog->error)
3325 		return nfp_prog->error;
3326 
3327 	list_for_each_entry(meta, &nfp_prog->insns, l) {
3328 		instr_cb_t cb = instr_cb[meta->insn.code];
3329 
3330 		meta->off = nfp_prog_current_offset(nfp_prog);
3331 
3332 		if (meta->skip) {
3333 			nfp_prog->n_translated++;
3334 			continue;
3335 		}
3336 
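		/* Two-part instructions (64-bit immediate loads) register a
		 * callback on their first half which translates the pair once
		 * the second half is reached.
		 */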
3337 		if (nfp_meta_has_prev(nfp_prog, meta) &&
3338 		    nfp_meta_prev(meta)->double_cb)
3339 			cb = nfp_meta_prev(meta)->double_cb;
3340 		if (!cb)
3341 			return -ENOENT;
3342 		err = cb(nfp_prog, meta);
3343 		if (err)
3344 			return err;
3345 		if (nfp_prog->error)
3346 			return nfp_prog->error;
3347 
3348 		nfp_prog->n_translated++;
3349 	}
3350 
3351 	nfp_prog->last_bpf_off = nfp_prog_current_offset(nfp_prog) - 1;
3352 
3353 	nfp_outro(nfp_prog);
3354 	if (nfp_prog->error)
3355 		return nfp_prog->error;
3356 
3357 	wrp_nops(nfp_prog, NFP_USTORE_PREFETCH_WINDOW);
3358 	if (nfp_prog->error)
3359 		return nfp_prog->error;
3360 
3361 	return nfp_fixup_branches(nfp_prog);
3362 }
3363 
3364 /* --- Optimizations --- */
3365 static void nfp_bpf_opt_reg_init(struct nfp_prog *nfp_prog)
3366 {
3367 	struct nfp_insn_meta *meta;
3368 
3369 	list_for_each_entry(meta, &nfp_prog->insns, l) {
3370 		struct bpf_insn insn = meta->insn;
3371 
3372 		/* Programs converted from cBPF start with register xoring */
3373 		if (insn.code == (BPF_ALU64 | BPF_XOR | BPF_X) &&
3374 		    insn.src_reg == insn.dst_reg)
3375 			continue;
3376 
3377 		/* Programs start with R6 = R1 but we ignore the skb pointer */
3378 		if (insn.code == (BPF_ALU64 | BPF_MOV | BPF_X) &&
3379 		    insn.src_reg == 1 && insn.dst_reg == 6)
3380 			meta->skip = true;
3381 
3382 		/* Return as soon as something doesn't match */
3383 		if (!meta->skip)
3384 			return;
3385 	}
3386 }
3387 
/* abs(insn.imm) will fit better into an unrestricted reg immediate -
 * convert an add/sub of a negative number into a sub/add of a positive one.
 */
3391 static void nfp_bpf_opt_neg_add_sub(struct nfp_prog *nfp_prog)
3392 {
3393 	struct nfp_insn_meta *meta;
3394 
3395 	list_for_each_entry(meta, &nfp_prog->insns, l) {
3396 		struct bpf_insn insn = meta->insn;
3397 
3398 		if (meta->skip)
3399 			continue;
3400 
3401 		if (BPF_CLASS(insn.code) != BPF_ALU &&
3402 		    BPF_CLASS(insn.code) != BPF_ALU64 &&
3403 		    BPF_CLASS(insn.code) != BPF_JMP)
3404 			continue;
3405 		if (BPF_SRC(insn.code) != BPF_K)
3406 			continue;
3407 		if (insn.imm >= 0)
3408 			continue;
3409 
3410 		if (BPF_CLASS(insn.code) == BPF_JMP) {
3411 			switch (BPF_OP(insn.code)) {
3412 			case BPF_JGE:
3413 			case BPF_JSGE:
3414 			case BPF_JLT:
3415 			case BPF_JSLT:
3416 				meta->jump_neg_op = true;
3417 				break;
3418 			default:
3419 				continue;
3420 			}
3421 		} else {
3422 			if (BPF_OP(insn.code) == BPF_ADD)
3423 				insn.code = BPF_CLASS(insn.code) | BPF_SUB;
3424 			else if (BPF_OP(insn.code) == BPF_SUB)
3425 				insn.code = BPF_CLASS(insn.code) | BPF_ADD;
3426 			else
3427 				continue;
3428 
3429 			meta->insn.code = insn.code | BPF_K;
3430 		}
3431 
3432 		meta->insn.imm = -insn.imm;
3433 	}
3434 }
3435 
3436 /* Remove masking after load since our load guarantees this is not needed */
3437 static void nfp_bpf_opt_ld_mask(struct nfp_prog *nfp_prog)
3438 {
3439 	struct nfp_insn_meta *meta1, *meta2;
3440 	const s32 exp_mask[] = {
3441 		[BPF_B] = 0x000000ffU,
3442 		[BPF_H] = 0x0000ffffU,
3443 		[BPF_W] = 0xffffffffU,
3444 	};
3445 
3446 	nfp_for_each_insn_walk2(nfp_prog, meta1, meta2) {
3447 		struct bpf_insn insn, next;
3448 
3449 		insn = meta1->insn;
3450 		next = meta2->insn;
3451 
3452 		if (BPF_CLASS(insn.code) != BPF_LD)
3453 			continue;
3454 		if (BPF_MODE(insn.code) != BPF_ABS &&
3455 		    BPF_MODE(insn.code) != BPF_IND)
3456 			continue;
3457 
3458 		if (next.code != (BPF_ALU64 | BPF_AND | BPF_K))
3459 			continue;
3460 
3461 		if (!exp_mask[BPF_SIZE(insn.code)])
3462 			continue;
3463 		if (exp_mask[BPF_SIZE(insn.code)] != next.imm)
3464 			continue;
3465 
3466 		if (next.src_reg || next.dst_reg)
3467 			continue;
3468 
3469 		if (meta2->flags & FLAG_INSN_IS_JUMP_DST)
3470 			continue;
3471 
3472 		meta2->skip = true;
3473 	}
3474 }
3475 
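/* Like the masking optimization above: the pair of shifts by 32 emitted after
 * a 32-bit BPF_ABS/IND load is not needed with our loads, so skip the pair
 * unless either shift is a jump target.
 */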
3476 static void nfp_bpf_opt_ld_shift(struct nfp_prog *nfp_prog)
3477 {
3478 	struct nfp_insn_meta *meta1, *meta2, *meta3;
3479 
3480 	nfp_for_each_insn_walk3(nfp_prog, meta1, meta2, meta3) {
3481 		struct bpf_insn insn, next1, next2;
3482 
3483 		insn = meta1->insn;
3484 		next1 = meta2->insn;
3485 		next2 = meta3->insn;
3486 
3487 		if (BPF_CLASS(insn.code) != BPF_LD)
3488 			continue;
3489 		if (BPF_MODE(insn.code) != BPF_ABS &&
3490 		    BPF_MODE(insn.code) != BPF_IND)
3491 			continue;
3492 		if (BPF_SIZE(insn.code) != BPF_W)
3493 			continue;
3494 
3495 		if (!(next1.code == (BPF_LSH | BPF_K | BPF_ALU64) &&
3496 		      next2.code == (BPF_RSH | BPF_K | BPF_ALU64)) &&
3497 		    !(next1.code == (BPF_RSH | BPF_K | BPF_ALU64) &&
3498 		      next2.code == (BPF_LSH | BPF_K | BPF_ALU64)))
3499 			continue;
3500 
3501 		if (next1.src_reg || next1.dst_reg ||
3502 		    next2.src_reg || next2.dst_reg)
3503 			continue;
3504 
3505 		if (next1.imm != 0x20 || next2.imm != 0x20)
3506 			continue;
3507 
3508 		if (meta2->flags & FLAG_INSN_IS_JUMP_DST ||
3509 		    meta3->flags & FLAG_INSN_IS_JUMP_DST)
3510 			continue;
3511 
3512 		meta2->skip = true;
3513 		meta3->skip = true;
3514 	}
3515 }
3516 
/* A load/store pair that forms a memory copy should look like the following:
 *
 *   ld_width R, [addr_src + offset_src]
 *   st_width [addr_dest + offset_dest], R
 *
 * The destination register of the load and the source register of the
 * store should be the same, and the load and store should also operate
 * at the same width.  If either addr_src or addr_dest is the stack
 * pointer, we don't do the CPP optimization as the stack is modelled by
 * registers on the NFP.
 */
3527 static bool
3528 curr_pair_is_memcpy(struct nfp_insn_meta *ld_meta,
3529 		    struct nfp_insn_meta *st_meta)
3530 {
3531 	struct bpf_insn *ld = &ld_meta->insn;
3532 	struct bpf_insn *st = &st_meta->insn;
3533 
3534 	if (!is_mbpf_load(ld_meta) || !is_mbpf_store(st_meta))
3535 		return false;
3536 
3537 	if (ld_meta->ptr.type != PTR_TO_PACKET &&
3538 	    ld_meta->ptr.type != PTR_TO_MAP_VALUE)
3539 		return false;
3540 
3541 	if (st_meta->ptr.type != PTR_TO_PACKET)
3542 		return false;
3543 
3544 	if (BPF_SIZE(ld->code) != BPF_SIZE(st->code))
3545 		return false;
3546 
3547 	if (ld->dst_reg != st->src_reg)
3548 		return false;
3549 
	/* There is a jump to the store insn in this pair. */
3551 	if (st_meta->flags & FLAG_INSN_IS_JUMP_DST)
3552 		return false;
3553 
3554 	return true;
3555 }
3556 
3557 /* Currently, we only support chaining load/store pairs if:
3558  *
3559  *  - Their address base registers are the same.
3560  *  - Their address offsets are in the same order.
3561  *  - They operate at the same memory width.
3562  *  - There is no jump into the middle of them.
3563  */
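/* For example (illustrative only), the following two pairs chain: same base
 * registers, same width, offsets ascending by the access size:
 *
 *   r0 = *(u32 *)(r8 + 0)      *(u32 *)(r9 + 0) = r0
 *   r0 = *(u32 *)(r8 + 4)      *(u32 *)(r9 + 4) = r0
 */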
3564 static bool
3565 curr_pair_chain_with_previous(struct nfp_insn_meta *ld_meta,
3566 			      struct nfp_insn_meta *st_meta,
3567 			      struct bpf_insn *prev_ld,
3568 			      struct bpf_insn *prev_st)
3569 {
3570 	u8 prev_size, curr_size, prev_ld_base, prev_st_base, prev_ld_dst;
3571 	struct bpf_insn *ld = &ld_meta->insn;
3572 	struct bpf_insn *st = &st_meta->insn;
3573 	s16 prev_ld_off, prev_st_off;
3574 
3575 	/* This pair is the start pair. */
3576 	if (!prev_ld)
3577 		return true;
3578 
3579 	prev_size = BPF_LDST_BYTES(prev_ld);
3580 	curr_size = BPF_LDST_BYTES(ld);
3581 	prev_ld_base = prev_ld->src_reg;
3582 	prev_st_base = prev_st->dst_reg;
3583 	prev_ld_dst = prev_ld->dst_reg;
3584 	prev_ld_off = prev_ld->off;
3585 	prev_st_off = prev_st->off;
3586 
3587 	if (ld->dst_reg != prev_ld_dst)
3588 		return false;
3589 
3590 	if (ld->src_reg != prev_ld_base || st->dst_reg != prev_st_base)
3591 		return false;
3592 
3593 	if (curr_size != prev_size)
3594 		return false;
3595 
	/* There is a jump to the head of this pair. */
3597 	if (ld_meta->flags & FLAG_INSN_IS_JUMP_DST)
3598 		return false;
3599 
3600 	/* Both in ascending order. */
3601 	if (prev_ld_off + prev_size == ld->off &&
3602 	    prev_st_off + prev_size == st->off)
3603 		return true;
3604 
3605 	/* Both in descending order. */
3606 	if (ld->off + curr_size == prev_ld_off &&
3607 	    st->off + curr_size == prev_st_off)
3608 		return true;
3609 
3610 	return false;
3611 }
3612 
/* Return TRUE if a cross memory access happens.  A cross memory access
 * means the store area overlaps with the load area, so a later load might
 * read a value written by a previous store; in that case we can't treat
 * the sequence as a memory copy.
 */
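/* For example (illustrative only), with the same base pointer:
 *
 *   head load at off 0, head store at off 4, later load at off 4
 *
 * the later load may read bytes written by the head store, so the sequence
 * is not a pure copy (the "ascending order cross" case below).
 */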
3618 static bool
3619 cross_mem_access(struct bpf_insn *ld, struct nfp_insn_meta *head_ld_meta,
3620 		 struct nfp_insn_meta *head_st_meta)
3621 {
3622 	s16 head_ld_off, head_st_off, ld_off;
3623 
	/* Different pointer types do not overlap. */
3625 	if (head_ld_meta->ptr.type != head_st_meta->ptr.type)
3626 		return false;
3627 
3628 	/* load and store are both PTR_TO_PACKET, check ID info.  */
3629 	if (head_ld_meta->ptr.id != head_st_meta->ptr.id)
3630 		return true;
3631 
	/* Canonicalize the offsets: express all of them against the
	 * original base register.
	 */
3635 	head_ld_off = head_ld_meta->insn.off + head_ld_meta->ptr.off;
3636 	head_st_off = head_st_meta->insn.off + head_st_meta->ptr.off;
3637 	ld_off = ld->off + head_ld_meta->ptr.off;
3638 
3639 	/* Ascending order cross. */
3640 	if (ld_off > head_ld_off &&
3641 	    head_ld_off < head_st_off && ld_off >= head_st_off)
3642 		return true;
3643 
3644 	/* Descending order cross. */
3645 	if (ld_off < head_ld_off &&
3646 	    head_ld_off > head_st_off && ld_off <= head_st_off)
3647 		return true;
3648 
3649 	return false;
3650 }
3651 
/* This pass tries to identify the following instruction sequences.
3653  *
3654  *   load R, [regA + offA]
3655  *   store [regB + offB], R
3656  *   load R, [regA + offA + const_imm_A]
3657  *   store [regB + offB + const_imm_A], R
3658  *   load R, [regA + offA + 2 * const_imm_A]
3659  *   store [regB + offB + 2 * const_imm_A], R
3660  *   ...
3661  *
 * The above sequence is typically generated by the compiler when lowering
 * memcpy.  The NFP prefers using CPP instructions to accelerate it.
3664  */
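/* When a chain of more than one pair is found, only the head load is kept:
 * head_ld_meta->ldst_gather_len accumulates the total copy length (negated
 * when the chain runs in descending offset order),
 * head_ld_meta->paired_st points at the matching store, and all other
 * instructions in the chain are marked for skipping.
 */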
3665 static void nfp_bpf_opt_ldst_gather(struct nfp_prog *nfp_prog)
3666 {
3667 	struct nfp_insn_meta *head_ld_meta = NULL;
3668 	struct nfp_insn_meta *head_st_meta = NULL;
3669 	struct nfp_insn_meta *meta1, *meta2;
3670 	struct bpf_insn *prev_ld = NULL;
3671 	struct bpf_insn *prev_st = NULL;
3672 	u8 count = 0;
3673 
3674 	nfp_for_each_insn_walk2(nfp_prog, meta1, meta2) {
3675 		struct bpf_insn *ld = &meta1->insn;
3676 		struct bpf_insn *st = &meta2->insn;
3677 
		/* Reset record status if any of the following is true:
		 *   - The current insn pair is not load/store.
		 *   - The load/store pair doesn't chain with the previous one.
		 *   - The chained load/store pair crosses with the previous
		 *     pair.
		 *   - The chained load/store pair has a total memory copy
		 *     size beyond 128 bytes, which is the maximum length a
		 *     single NFP CPP command can transfer.
		 */
3686 		if (!curr_pair_is_memcpy(meta1, meta2) ||
3687 		    !curr_pair_chain_with_previous(meta1, meta2, prev_ld,
3688 						   prev_st) ||
3689 		    (head_ld_meta && (cross_mem_access(ld, head_ld_meta,
3690 						       head_st_meta) ||
3691 				      head_ld_meta->ldst_gather_len >= 128))) {
3692 			if (!count)
3693 				continue;
3694 
3695 			if (count > 1) {
3696 				s16 prev_ld_off = prev_ld->off;
3697 				s16 prev_st_off = prev_st->off;
3698 				s16 head_ld_off = head_ld_meta->insn.off;
3699 
3700 				if (prev_ld_off < head_ld_off) {
3701 					head_ld_meta->insn.off = prev_ld_off;
3702 					head_st_meta->insn.off = prev_st_off;
3703 					head_ld_meta->ldst_gather_len =
3704 						-head_ld_meta->ldst_gather_len;
3705 				}
3706 
3707 				head_ld_meta->paired_st = &head_st_meta->insn;
3708 				head_st_meta->skip = true;
3709 			} else {
3710 				head_ld_meta->ldst_gather_len = 0;
3711 			}
3712 
			/* If the chain is ended by a load/store pair then it
			 * could serve as the new head of the next chain.
			 */
3716 			if (curr_pair_is_memcpy(meta1, meta2)) {
3717 				head_ld_meta = meta1;
3718 				head_st_meta = meta2;
3719 				head_ld_meta->ldst_gather_len =
3720 					BPF_LDST_BYTES(ld);
3721 				meta1 = nfp_meta_next(meta1);
3722 				meta2 = nfp_meta_next(meta2);
3723 				prev_ld = ld;
3724 				prev_st = st;
3725 				count = 1;
3726 			} else {
3727 				head_ld_meta = NULL;
3728 				head_st_meta = NULL;
3729 				prev_ld = NULL;
3730 				prev_st = NULL;
3731 				count = 0;
3732 			}
3733 
3734 			continue;
3735 		}
3736 
3737 		if (!head_ld_meta) {
3738 			head_ld_meta = meta1;
3739 			head_st_meta = meta2;
3740 		} else {
3741 			meta1->skip = true;
3742 			meta2->skip = true;
3743 		}
3744 
3745 		head_ld_meta->ldst_gather_len += BPF_LDST_BYTES(ld);
3746 		meta1 = nfp_meta_next(meta1);
3747 		meta2 = nfp_meta_next(meta2);
3748 		prev_ld = ld;
3749 		prev_st = st;
3750 		count++;
3751 	}
3752 }
3753 
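/* Group packet loads which share the same packet-pointer ID and constant
 * offset into ranges of at most 64 bytes.  The first load of a range gets
 * pkt_cache.do_init set together with the range bounds; later loads in the
 * range inherit the bounds in the second walk below.  Jump destinations,
 * helper calls, stores to the packet and classic loads/stores invalidate
 * the cached data and end the current range.
 */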
3754 static void nfp_bpf_opt_pkt_cache(struct nfp_prog *nfp_prog)
3755 {
3756 	struct nfp_insn_meta *meta, *range_node = NULL;
3757 	s16 range_start = 0, range_end = 0;
3758 	bool cache_avail = false;
3759 	struct bpf_insn *insn;
3760 	s32 range_ptr_off = 0;
3761 	u32 range_ptr_id = 0;
3762 
3763 	list_for_each_entry(meta, &nfp_prog->insns, l) {
3764 		if (meta->flags & FLAG_INSN_IS_JUMP_DST)
3765 			cache_avail = false;
3766 
3767 		if (meta->skip)
3768 			continue;
3769 
3770 		insn = &meta->insn;
3771 
3772 		if (is_mbpf_store_pkt(meta) ||
3773 		    insn->code == (BPF_JMP | BPF_CALL) ||
3774 		    is_mbpf_classic_store_pkt(meta) ||
3775 		    is_mbpf_classic_load(meta)) {
3776 			cache_avail = false;
3777 			continue;
3778 		}
3779 
3780 		if (!is_mbpf_load(meta))
3781 			continue;
3782 
3783 		if (meta->ptr.type != PTR_TO_PACKET || meta->ldst_gather_len) {
3784 			cache_avail = false;
3785 			continue;
3786 		}
3787 
3788 		if (!cache_avail) {
3789 			cache_avail = true;
3790 			if (range_node)
3791 				goto end_current_then_start_new;
3792 			goto start_new;
3793 		}
3794 
		/* Check ID to make sure two reads share the same
		 * variable offset against PTR_TO_PACKET, and check OFF
		 * to make sure they also share the same constant
		 * offset.
		 *
		 * OFFs don't strictly need to be the same, because they
		 * are constant offsets against PTR_TO_PACKET, so for
		 * different OFFs we could canonicalize them to offsets
		 * against the original packet pointer.  We don't
		 * support that yet.
		 */
3806 		if (meta->ptr.id == range_ptr_id &&
3807 		    meta->ptr.off == range_ptr_off) {
3808 			s16 new_start = range_start;
3809 			s16 end, off = insn->off;
3810 			s16 new_end = range_end;
3811 			bool changed = false;
3812 
3813 			if (off < range_start) {
3814 				new_start = off;
3815 				changed = true;
3816 			}
3817 
3818 			end = off + BPF_LDST_BYTES(insn);
3819 			if (end > range_end) {
3820 				new_end = end;
3821 				changed = true;
3822 			}
3823 
3824 			if (!changed)
3825 				continue;
3826 
3827 			if (new_end - new_start <= 64) {
3828 				/* Install new range. */
3829 				range_start = new_start;
3830 				range_end = new_end;
3831 				continue;
3832 			}
3833 		}
3834 
3835 end_current_then_start_new:
3836 		range_node->pkt_cache.range_start = range_start;
3837 		range_node->pkt_cache.range_end = range_end;
3838 start_new:
3839 		range_node = meta;
3840 		range_node->pkt_cache.do_init = true;
3841 		range_ptr_id = range_node->ptr.id;
3842 		range_ptr_off = range_node->ptr.off;
3843 		range_start = insn->off;
3844 		range_end = insn->off + BPF_LDST_BYTES(insn);
3845 	}
3846 
3847 	if (range_node) {
3848 		range_node->pkt_cache.range_start = range_start;
3849 		range_node->pkt_cache.range_end = range_end;
3850 	}
3851 
3852 	list_for_each_entry(meta, &nfp_prog->insns, l) {
3853 		if (meta->skip)
3854 			continue;
3855 
3856 		if (is_mbpf_load_pkt(meta) && !meta->ldst_gather_len) {
3857 			if (meta->pkt_cache.do_init) {
3858 				range_start = meta->pkt_cache.range_start;
3859 				range_end = meta->pkt_cache.range_end;
3860 			} else {
3861 				meta->pkt_cache.range_start = range_start;
3862 				meta->pkt_cache.range_end = range_end;
3863 			}
3864 		}
3865 	}
3866 }
3867 
3868 static int nfp_bpf_optimize(struct nfp_prog *nfp_prog)
3869 {
3870 	nfp_bpf_opt_reg_init(nfp_prog);
3871 
3872 	nfp_bpf_opt_neg_add_sub(nfp_prog);
3873 	nfp_bpf_opt_ld_mask(nfp_prog);
3874 	nfp_bpf_opt_ld_shift(nfp_prog);
3875 	nfp_bpf_opt_ldst_gather(nfp_prog);
3876 	nfp_bpf_opt_pkt_cache(nfp_prog);
3877 
3878 	return 0;
3879 }
3880 
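/* Rewrite BPF_LD | BPF_IMM | BPF_DW map pointer loads (src_reg set to
 * BPF_PSEUDO_MAP_FD): the 64-bit map pointer held in the two immediates is
 * replaced by the id the firmware understands (the NFP table id for
 * offloaded maps, or the kernel map id for offload-neutral maps) in the
 * low word, with the high word cleared.
 */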
3881 static int nfp_bpf_replace_map_ptrs(struct nfp_prog *nfp_prog)
3882 {
3883 	struct nfp_insn_meta *meta1, *meta2;
3884 	struct nfp_bpf_map *nfp_map;
3885 	struct bpf_map *map;
3886 	u32 id;
3887 
3888 	nfp_for_each_insn_walk2(nfp_prog, meta1, meta2) {
3889 		if (meta1->skip || meta2->skip)
3890 			continue;
3891 
3892 		if (meta1->insn.code != (BPF_LD | BPF_IMM | BPF_DW) ||
3893 		    meta1->insn.src_reg != BPF_PSEUDO_MAP_FD)
3894 			continue;
3895 
3896 		map = (void *)(unsigned long)((u32)meta1->insn.imm |
3897 					      (u64)meta2->insn.imm << 32);
3898 		if (bpf_map_offload_neutral(map)) {
3899 			id = map->id;
3900 		} else {
3901 			nfp_map = map_to_offmap(map)->dev_priv;
3902 			id = nfp_map->tid;
3903 		}
3904 
3905 		meta1->insn.imm = id;
3906 		meta2->insn.imm = 0;
3907 	}
3908 
3909 	return 0;
3910 }
3911 
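/* Convert the program to ustore format in place: validate each instruction
 * and store it little-endian with its ECC bits filled in.
 */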
3912 static int nfp_bpf_ustore_calc(u64 *prog, unsigned int len)
3913 {
3914 	__le64 *ustore = (__force __le64 *)prog;
3915 	int i;
3916 
3917 	for (i = 0; i < len; i++) {
3918 		int err;
3919 
3920 		err = nfp_ustore_check_valid_no_ecc(prog[i]);
3921 		if (err)
3922 			return err;
3923 
3924 		ustore[i] = cpu_to_le64(nfp_ustore_calc_ecc_insn(prog[i]));
3925 	}
3926 
3927 	return 0;
3928 }
3929 
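/* Shrink the instruction buffer to the final program length. */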
3930 static void nfp_bpf_prog_trim(struct nfp_prog *nfp_prog)
3931 {
3932 	void *prog;
3933 
3934 	prog = kvmalloc_array(nfp_prog->prog_len, sizeof(u64), GFP_KERNEL);
3935 	if (!prog)
3936 		return;
3937 
3938 	nfp_prog->__prog_alloc_len = nfp_prog->prog_len * sizeof(u64);
3939 	memcpy(prog, nfp_prog->prog, nfp_prog->__prog_alloc_len);
3940 	kvfree(nfp_prog->prog);
3941 	nfp_prog->prog = prog;
3942 }
3943 
3944 int nfp_bpf_jit(struct nfp_prog *nfp_prog)
3945 {
3946 	int ret;
3947 
3948 	ret = nfp_bpf_replace_map_ptrs(nfp_prog);
3949 	if (ret)
3950 		return ret;
3951 
3952 	ret = nfp_bpf_optimize(nfp_prog);
3953 	if (ret)
3954 		return ret;
3955 
3956 	ret = nfp_translate(nfp_prog);
3957 	if (ret) {
3958 		pr_err("Translation failed with error %d (translated: %u)\n",
3959 		       ret, nfp_prog->n_translated);
3960 		return -EINVAL;
3961 	}
3962 
3963 	nfp_bpf_prog_trim(nfp_prog);
3964 
3965 	return ret;
3966 }
3967 
3968 void nfp_bpf_jit_prepare(struct nfp_prog *nfp_prog, unsigned int cnt)
3969 {
3970 	struct nfp_insn_meta *meta;
3971 
3972 	/* Another pass to record jump information. */
3973 	list_for_each_entry(meta, &nfp_prog->insns, l) {
3974 		u64 code = meta->insn.code;
3975 
3976 		if (BPF_CLASS(code) == BPF_JMP && BPF_OP(code) != BPF_EXIT &&
3977 		    BPF_OP(code) != BPF_CALL) {
3978 			struct nfp_insn_meta *dst_meta;
3979 			unsigned short dst_indx;
3980 
3981 			dst_indx = meta->n + 1 + meta->insn.off;
3982 			dst_meta = nfp_bpf_goto_meta(nfp_prog, meta, dst_indx,
3983 						     cnt);
3984 
3985 			meta->jmp_dst = dst_meta;
3986 			dst_meta->flags |= FLAG_INSN_IS_JUMP_DST;
3987 		}
3988 	}
3989 }
3990 
3991 bool nfp_bpf_supported_opcode(u8 code)
3992 {
3993 	return !!instr_cb[code];
3994 }
3995 
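/* Produce a per-vNIC copy of the program with relocations resolved:
 * relative branches and immediates are offset by the vNIC's start offset,
 * exit/abort/next-packet branches get their per-vNIC targets, and helper
 * call branches are pointed at the firmware helper entry points.  The copy
 * is then converted to ustore format.  Returns the buffer or an ERR_PTR().
 */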
3996 void *nfp_bpf_relo_for_vnic(struct nfp_prog *nfp_prog, struct nfp_bpf_vnic *bv)
3997 {
3998 	unsigned int i;
3999 	u64 *prog;
4000 	int err;
4001 
4002 	prog = kmemdup(nfp_prog->prog, nfp_prog->prog_len * sizeof(u64),
4003 		       GFP_KERNEL);
4004 	if (!prog)
4005 		return ERR_PTR(-ENOMEM);
4006 
4007 	for (i = 0; i < nfp_prog->prog_len; i++) {
4008 		enum nfp_relo_type special;
4009 		u32 val;
4010 
4011 		special = FIELD_GET(OP_RELO_TYPE, prog[i]);
4012 		switch (special) {
4013 		case RELO_NONE:
4014 			continue;
4015 		case RELO_BR_REL:
4016 			br_add_offset(&prog[i], bv->start_off);
4017 			break;
4018 		case RELO_BR_GO_OUT:
4019 			br_set_offset(&prog[i],
4020 				      nfp_prog->tgt_out + bv->start_off);
4021 			break;
4022 		case RELO_BR_GO_ABORT:
4023 			br_set_offset(&prog[i],
4024 				      nfp_prog->tgt_abort + bv->start_off);
4025 			break;
4026 		case RELO_BR_NEXT_PKT:
4027 			br_set_offset(&prog[i], bv->tgt_done);
4028 			break;
4029 		case RELO_BR_HELPER:
4030 			val = br_get_offset(prog[i]);
4031 			val -= BR_OFF_RELO;
4032 			switch (val) {
4033 			case BPF_FUNC_map_lookup_elem:
4034 				val = nfp_prog->bpf->helpers.map_lookup;
4035 				break;
4036 			case BPF_FUNC_map_update_elem:
4037 				val = nfp_prog->bpf->helpers.map_update;
4038 				break;
4039 			case BPF_FUNC_map_delete_elem:
4040 				val = nfp_prog->bpf->helpers.map_delete;
4041 				break;
4042 			case BPF_FUNC_perf_event_output:
4043 				val = nfp_prog->bpf->helpers.perf_event_output;
4044 				break;
4045 			default:
4046 				pr_err("relocation of unknown helper %d\n",
4047 				       val);
4048 				err = -EINVAL;
4049 				goto err_free_prog;
4050 			}
4051 			br_set_offset(&prog[i], val);
4052 			break;
4053 		case RELO_IMMED_REL:
4054 			immed_add_value(&prog[i], bv->start_off);
4055 			break;
4056 		}
4057 
4058 		prog[i] &= ~OP_RELO_TYPE;
4059 	}
4060 
4061 	err = nfp_bpf_ustore_calc(prog, nfp_prog->prog_len);
4062 	if (err)
4063 		goto err_free_prog;
4064 
4065 	return prog;
4066 
4067 err_free_prog:
4068 	kfree(prog);
4069 	return ERR_PTR(err);
4070 }
4071