1 /*
2  * Copyright (C) 2016-2018 Netronome Systems, Inc.
3  *
 * This software is dual licensed under the GNU General Public License Version 2,
5  * June 1991 as shown in the file COPYING in the top-level directory of this
6  * source tree or the BSD 2-Clause License provided below.  You have the
7  * option to license this software under the complete terms of either license.
8  *
9  * The BSD 2-Clause License:
10  *
11  *     Redistribution and use in source and binary forms, with or
12  *     without modification, are permitted provided that the following
13  *     conditions are met:
14  *
15  *      1. Redistributions of source code must retain the above
16  *         copyright notice, this list of conditions and the following
17  *         disclaimer.
18  *
19  *      2. Redistributions in binary form must reproduce the above
20  *         copyright notice, this list of conditions and the following
21  *         disclaimer in the documentation and/or other materials
22  *         provided with the distribution.
23  *
24  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
27  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
28  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
29  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
30  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
31  * SOFTWARE.
32  */
33 
34 #define pr_fmt(fmt)	"NFP net bpf: " fmt
35 
36 #include <linux/bug.h>
37 #include <linux/bpf.h>
38 #include <linux/filter.h>
39 #include <linux/kernel.h>
40 #include <linux/pkt_cls.h>
41 #include <linux/reciprocal_div.h>
42 #include <linux/unistd.h>
43 
44 #include "main.h"
45 #include "../nfp_asm.h"
46 #include "../nfp_net_ctrl.h"
47 
48 /* --- NFP prog --- */
/* The "for each" walk macros iterate over multiple entries at once, providing
 * pos and next<n> pointers.  It's safe to modify the next pointers (but not
 * pos).
 */
52 #define nfp_for_each_insn_walk2(nfp_prog, pos, next)			\
53 	for (pos = list_first_entry(&(nfp_prog)->insns, typeof(*pos), l), \
54 	     next = list_next_entry(pos, l);			\
55 	     &(nfp_prog)->insns != &pos->l &&			\
56 	     &(nfp_prog)->insns != &next->l;			\
57 	     pos = nfp_meta_next(pos),				\
58 	     next = nfp_meta_next(pos))
59 
60 #define nfp_for_each_insn_walk3(nfp_prog, pos, next, next2)		\
61 	for (pos = list_first_entry(&(nfp_prog)->insns, typeof(*pos), l), \
62 	     next = list_next_entry(pos, l),			\
63 	     next2 = list_next_entry(next, l);			\
64 	     &(nfp_prog)->insns != &pos->l &&			\
65 	     &(nfp_prog)->insns != &next->l &&			\
66 	     &(nfp_prog)->insns != &next2->l;			\
67 	     pos = nfp_meta_next(pos),				\
68 	     next = nfp_meta_next(pos),				\
69 	     next2 = nfp_meta_next(next))
70 
71 static bool
72 nfp_meta_has_prev(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
73 {
74 	return meta->l.prev != &nfp_prog->insns;
75 }
76 
77 static void nfp_prog_push(struct nfp_prog *nfp_prog, u64 insn)
78 {
79 	if (nfp_prog->__prog_alloc_len / sizeof(u64) == nfp_prog->prog_len) {
80 		pr_warn("instruction limit reached (%u NFP instructions)\n",
81 			nfp_prog->prog_len);
82 		nfp_prog->error = -ENOSPC;
83 		return;
84 	}
85 
86 	nfp_prog->prog[nfp_prog->prog_len] = insn;
87 	nfp_prog->prog_len++;
88 }
89 
90 static unsigned int nfp_prog_current_offset(struct nfp_prog *nfp_prog)
91 {
92 	return nfp_prog->prog_len;
93 }
94 
95 static bool
96 nfp_prog_confirm_current_offset(struct nfp_prog *nfp_prog, unsigned int off)
97 {
	/* If there is a recorded error we may have dropped instructions;
	 * that doesn't have to be due to a translator bug, and the translation
	 * will fail anyway, so just return OK.
	 */
102 	if (nfp_prog->error)
103 		return true;
104 	return !WARN_ON_ONCE(nfp_prog_current_offset(nfp_prog) != off);
105 }
106 
107 /* --- Emitters --- */
108 static void
109 __emit_cmd(struct nfp_prog *nfp_prog, enum cmd_tgt_map op,
110 	   u8 mode, u8 xfer, u8 areg, u8 breg, u8 size, enum cmd_ctx_swap ctx,
111 	   bool indir)
112 {
113 	u64 insn;
114 
115 	insn =	FIELD_PREP(OP_CMD_A_SRC, areg) |
116 		FIELD_PREP(OP_CMD_CTX, ctx) |
117 		FIELD_PREP(OP_CMD_B_SRC, breg) |
118 		FIELD_PREP(OP_CMD_TOKEN, cmd_tgt_act[op].token) |
119 		FIELD_PREP(OP_CMD_XFER, xfer) |
120 		FIELD_PREP(OP_CMD_CNT, size) |
121 		FIELD_PREP(OP_CMD_SIG, ctx != CMD_CTX_NO_SWAP) |
122 		FIELD_PREP(OP_CMD_TGT_CMD, cmd_tgt_act[op].tgt_cmd) |
123 		FIELD_PREP(OP_CMD_INDIR, indir) |
124 		FIELD_PREP(OP_CMD_MODE, mode);
125 
126 	nfp_prog_push(nfp_prog, insn);
127 }
128 
129 static void
130 emit_cmd_any(struct nfp_prog *nfp_prog, enum cmd_tgt_map op, u8 mode, u8 xfer,
131 	     swreg lreg, swreg rreg, u8 size, enum cmd_ctx_swap ctx, bool indir)
132 {
133 	struct nfp_insn_re_regs reg;
134 	int err;
135 
136 	err = swreg_to_restricted(reg_none(), lreg, rreg, &reg, false);
137 	if (err) {
138 		nfp_prog->error = err;
139 		return;
140 	}
141 	if (reg.swap) {
142 		pr_err("cmd can't swap arguments\n");
143 		nfp_prog->error = -EFAULT;
144 		return;
145 	}
146 	if (reg.dst_lmextn || reg.src_lmextn) {
147 		pr_err("cmd can't use LMextn\n");
148 		nfp_prog->error = -EFAULT;
149 		return;
150 	}
151 
152 	__emit_cmd(nfp_prog, op, mode, xfer, reg.areg, reg.breg, size, ctx,
153 		   indir);
154 }
155 
156 static void
157 emit_cmd(struct nfp_prog *nfp_prog, enum cmd_tgt_map op, u8 mode, u8 xfer,
158 	 swreg lreg, swreg rreg, u8 size, enum cmd_ctx_swap ctx)
159 {
160 	emit_cmd_any(nfp_prog, op, mode, xfer, lreg, rreg, size, ctx, false);
161 }
162 
163 static void
164 emit_cmd_indir(struct nfp_prog *nfp_prog, enum cmd_tgt_map op, u8 mode, u8 xfer,
165 	       swreg lreg, swreg rreg, u8 size, enum cmd_ctx_swap ctx)
166 {
167 	emit_cmd_any(nfp_prog, op, mode, xfer, lreg, rreg, size, ctx, true);
168 }
169 
170 static void
171 __emit_br(struct nfp_prog *nfp_prog, enum br_mask mask, enum br_ev_pip ev_pip,
172 	  enum br_ctx_signal_state css, u16 addr, u8 defer)
173 {
174 	u16 addr_lo, addr_hi;
175 	u64 insn;
176 
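	/* The low bits of the target go into the ADDR_LO field; ADDR_HI is
	 * simply a flag noting that the target has bits beyond ADDR_LO.
	 */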
177 	addr_lo = addr & (OP_BR_ADDR_LO >> __bf_shf(OP_BR_ADDR_LO));
178 	addr_hi = addr != addr_lo;
179 
180 	insn = OP_BR_BASE |
181 		FIELD_PREP(OP_BR_MASK, mask) |
182 		FIELD_PREP(OP_BR_EV_PIP, ev_pip) |
183 		FIELD_PREP(OP_BR_CSS, css) |
184 		FIELD_PREP(OP_BR_DEFBR, defer) |
185 		FIELD_PREP(OP_BR_ADDR_LO, addr_lo) |
186 		FIELD_PREP(OP_BR_ADDR_HI, addr_hi);
187 
188 	nfp_prog_push(nfp_prog, insn);
189 }
190 
191 static void
192 emit_br_relo(struct nfp_prog *nfp_prog, enum br_mask mask, u16 addr, u8 defer,
193 	     enum nfp_relo_type relo)
194 {
195 	if (mask == BR_UNC && defer > 2) {
196 		pr_err("BUG: branch defer out of bounds %d\n", defer);
197 		nfp_prog->error = -EFAULT;
198 		return;
199 	}
200 
201 	__emit_br(nfp_prog, mask,
202 		  mask != BR_UNC ? BR_EV_PIP_COND : BR_EV_PIP_UNCOND,
203 		  BR_CSS_NONE, addr, defer);
204 
205 	nfp_prog->prog[nfp_prog->prog_len - 1] |=
206 		FIELD_PREP(OP_RELO_TYPE, relo);
207 }
208 
209 static void
210 emit_br(struct nfp_prog *nfp_prog, enum br_mask mask, u16 addr, u8 defer)
211 {
212 	emit_br_relo(nfp_prog, mask, addr, defer, RELO_BR_REL);
213 }
214 
215 static void
216 __emit_br_bit(struct nfp_prog *nfp_prog, u16 areg, u16 breg, u16 addr, u8 defer,
217 	      bool set, bool src_lmextn)
218 {
219 	u16 addr_lo, addr_hi;
220 	u64 insn;
221 
222 	addr_lo = addr & (OP_BR_BIT_ADDR_LO >> __bf_shf(OP_BR_BIT_ADDR_LO));
223 	addr_hi = addr != addr_lo;
224 
225 	insn = OP_BR_BIT_BASE |
226 		FIELD_PREP(OP_BR_BIT_A_SRC, areg) |
227 		FIELD_PREP(OP_BR_BIT_B_SRC, breg) |
228 		FIELD_PREP(OP_BR_BIT_BV, set) |
229 		FIELD_PREP(OP_BR_BIT_DEFBR, defer) |
230 		FIELD_PREP(OP_BR_BIT_ADDR_LO, addr_lo) |
231 		FIELD_PREP(OP_BR_BIT_ADDR_HI, addr_hi) |
232 		FIELD_PREP(OP_BR_BIT_SRC_LMEXTN, src_lmextn);
233 
234 	nfp_prog_push(nfp_prog, insn);
235 }
236 
237 static void
238 emit_br_bit_relo(struct nfp_prog *nfp_prog, swreg src, u8 bit, u16 addr,
239 		 u8 defer, bool set, enum nfp_relo_type relo)
240 {
241 	struct nfp_insn_re_regs reg;
242 	int err;
243 
	/* NOTE: The bit to test is specified as a rotation amount, such that
	 *	 the bit to test ends up in the MSB of the result when doing
	 *	 a rotate right.  For bit X we need a right rotation by X + 1.
	 */
248 	bit += 1;
249 
250 	err = swreg_to_restricted(reg_none(), src, reg_imm(bit), &reg, false);
251 	if (err) {
252 		nfp_prog->error = err;
253 		return;
254 	}
255 
256 	__emit_br_bit(nfp_prog, reg.areg, reg.breg, addr, defer, set,
257 		      reg.src_lmextn);
258 
259 	nfp_prog->prog[nfp_prog->prog_len - 1] |=
260 		FIELD_PREP(OP_RELO_TYPE, relo);
261 }
262 
263 static void
264 emit_br_bset(struct nfp_prog *nfp_prog, swreg src, u8 bit, u16 addr, u8 defer)
265 {
266 	emit_br_bit_relo(nfp_prog, src, bit, addr, defer, true, RELO_BR_REL);
267 }
268 
269 static void
270 __emit_immed(struct nfp_prog *nfp_prog, u16 areg, u16 breg, u16 imm_hi,
271 	     enum immed_width width, bool invert,
272 	     enum immed_shift shift, bool wr_both,
273 	     bool dst_lmextn, bool src_lmextn)
274 {
275 	u64 insn;
276 
277 	insn = OP_IMMED_BASE |
278 		FIELD_PREP(OP_IMMED_A_SRC, areg) |
279 		FIELD_PREP(OP_IMMED_B_SRC, breg) |
280 		FIELD_PREP(OP_IMMED_IMM, imm_hi) |
281 		FIELD_PREP(OP_IMMED_WIDTH, width) |
282 		FIELD_PREP(OP_IMMED_INV, invert) |
283 		FIELD_PREP(OP_IMMED_SHIFT, shift) |
284 		FIELD_PREP(OP_IMMED_WR_AB, wr_both) |
285 		FIELD_PREP(OP_IMMED_SRC_LMEXTN, src_lmextn) |
286 		FIELD_PREP(OP_IMMED_DST_LMEXTN, dst_lmextn);
287 
288 	nfp_prog_push(nfp_prog, insn);
289 }
290 
291 static void
292 emit_immed(struct nfp_prog *nfp_prog, swreg dst, u16 imm,
293 	   enum immed_width width, bool invert, enum immed_shift shift)
294 {
295 	struct nfp_insn_ur_regs reg;
296 	int err;
297 
298 	if (swreg_type(dst) == NN_REG_IMM) {
299 		nfp_prog->error = -EFAULT;
300 		return;
301 	}
302 
303 	err = swreg_to_unrestricted(dst, dst, reg_imm(imm & 0xff), &reg);
304 	if (err) {
305 		nfp_prog->error = err;
306 		return;
307 	}
308 
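	/* The 16-bit immediate is split: the low byte is passed as the B
	 * operand, the high byte goes into the instruction's IMM field.
	 */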
309 	/* Use reg.dst when destination is No-Dest. */
310 	__emit_immed(nfp_prog,
311 		     swreg_type(dst) == NN_REG_NONE ? reg.dst : reg.areg,
312 		     reg.breg, imm >> 8, width, invert, shift,
313 		     reg.wr_both, reg.dst_lmextn, reg.src_lmextn);
314 }
315 
316 static void
317 __emit_shf(struct nfp_prog *nfp_prog, u16 dst, enum alu_dst_ab dst_ab,
318 	   enum shf_sc sc, u8 shift,
319 	   u16 areg, enum shf_op op, u16 breg, bool i8, bool sw, bool wr_both,
320 	   bool dst_lmextn, bool src_lmextn)
321 {
322 	u64 insn;
323 
324 	if (!FIELD_FIT(OP_SHF_SHIFT, shift)) {
325 		nfp_prog->error = -EFAULT;
326 		return;
327 	}
328 
329 	if (sc == SHF_SC_L_SHF)
330 		shift = 32 - shift;
331 
332 	insn = OP_SHF_BASE |
333 		FIELD_PREP(OP_SHF_A_SRC, areg) |
334 		FIELD_PREP(OP_SHF_SC, sc) |
335 		FIELD_PREP(OP_SHF_B_SRC, breg) |
336 		FIELD_PREP(OP_SHF_I8, i8) |
337 		FIELD_PREP(OP_SHF_SW, sw) |
338 		FIELD_PREP(OP_SHF_DST, dst) |
339 		FIELD_PREP(OP_SHF_SHIFT, shift) |
340 		FIELD_PREP(OP_SHF_OP, op) |
341 		FIELD_PREP(OP_SHF_DST_AB, dst_ab) |
342 		FIELD_PREP(OP_SHF_WR_AB, wr_both) |
343 		FIELD_PREP(OP_SHF_SRC_LMEXTN, src_lmextn) |
344 		FIELD_PREP(OP_SHF_DST_LMEXTN, dst_lmextn);
345 
346 	nfp_prog_push(nfp_prog, insn);
347 }
348 
349 static void
350 emit_shf(struct nfp_prog *nfp_prog, swreg dst,
351 	 swreg lreg, enum shf_op op, swreg rreg, enum shf_sc sc, u8 shift)
352 {
353 	struct nfp_insn_re_regs reg;
354 	int err;
355 
356 	err = swreg_to_restricted(dst, lreg, rreg, &reg, true);
357 	if (err) {
358 		nfp_prog->error = err;
359 		return;
360 	}
361 
362 	__emit_shf(nfp_prog, reg.dst, reg.dst_ab, sc, shift,
363 		   reg.areg, op, reg.breg, reg.i8, reg.swap, reg.wr_both,
364 		   reg.dst_lmextn, reg.src_lmextn);
365 }
366 
367 static void
368 emit_shf_indir(struct nfp_prog *nfp_prog, swreg dst,
369 	       swreg lreg, enum shf_op op, swreg rreg, enum shf_sc sc)
370 {
371 	if (sc == SHF_SC_R_ROT) {
372 		pr_err("indirect shift is not allowed on rotation\n");
373 		nfp_prog->error = -EFAULT;
374 		return;
375 	}
376 
377 	emit_shf(nfp_prog, dst, lreg, op, rreg, sc, 0);
378 }
379 
380 static void
381 __emit_alu(struct nfp_prog *nfp_prog, u16 dst, enum alu_dst_ab dst_ab,
382 	   u16 areg, enum alu_op op, u16 breg, bool swap, bool wr_both,
383 	   bool dst_lmextn, bool src_lmextn)
384 {
385 	u64 insn;
386 
387 	insn = OP_ALU_BASE |
388 		FIELD_PREP(OP_ALU_A_SRC, areg) |
389 		FIELD_PREP(OP_ALU_B_SRC, breg) |
390 		FIELD_PREP(OP_ALU_DST, dst) |
391 		FIELD_PREP(OP_ALU_SW, swap) |
392 		FIELD_PREP(OP_ALU_OP, op) |
393 		FIELD_PREP(OP_ALU_DST_AB, dst_ab) |
394 		FIELD_PREP(OP_ALU_WR_AB, wr_both) |
395 		FIELD_PREP(OP_ALU_SRC_LMEXTN, src_lmextn) |
396 		FIELD_PREP(OP_ALU_DST_LMEXTN, dst_lmextn);
397 
398 	nfp_prog_push(nfp_prog, insn);
399 }
400 
401 static void
402 emit_alu(struct nfp_prog *nfp_prog, swreg dst,
403 	 swreg lreg, enum alu_op op, swreg rreg)
404 {
405 	struct nfp_insn_ur_regs reg;
406 	int err;
407 
408 	err = swreg_to_unrestricted(dst, lreg, rreg, &reg);
409 	if (err) {
410 		nfp_prog->error = err;
411 		return;
412 	}
413 
414 	__emit_alu(nfp_prog, reg.dst, reg.dst_ab,
415 		   reg.areg, op, reg.breg, reg.swap, reg.wr_both,
416 		   reg.dst_lmextn, reg.src_lmextn);
417 }
418 
419 static void
420 __emit_mul(struct nfp_prog *nfp_prog, enum alu_dst_ab dst_ab, u16 areg,
421 	   enum mul_type type, enum mul_step step, u16 breg, bool swap,
422 	   bool wr_both, bool dst_lmextn, bool src_lmextn)
423 {
424 	u64 insn;
425 
426 	insn = OP_MUL_BASE |
427 		FIELD_PREP(OP_MUL_A_SRC, areg) |
428 		FIELD_PREP(OP_MUL_B_SRC, breg) |
429 		FIELD_PREP(OP_MUL_STEP, step) |
430 		FIELD_PREP(OP_MUL_DST_AB, dst_ab) |
431 		FIELD_PREP(OP_MUL_SW, swap) |
432 		FIELD_PREP(OP_MUL_TYPE, type) |
433 		FIELD_PREP(OP_MUL_WR_AB, wr_both) |
434 		FIELD_PREP(OP_MUL_SRC_LMEXTN, src_lmextn) |
435 		FIELD_PREP(OP_MUL_DST_LMEXTN, dst_lmextn);
436 
437 	nfp_prog_push(nfp_prog, insn);
438 }
439 
440 static void
441 emit_mul(struct nfp_prog *nfp_prog, swreg lreg, enum mul_type type,
442 	 enum mul_step step, swreg rreg)
443 {
444 	struct nfp_insn_ur_regs reg;
445 	u16 areg;
446 	int err;
447 
448 	if (type == MUL_TYPE_START && step != MUL_STEP_NONE) {
449 		nfp_prog->error = -EINVAL;
450 		return;
451 	}
452 
453 	if (step == MUL_LAST || step == MUL_LAST_2) {
		/* For the MUL_LAST and MUL_LAST_2 steps the left source is
		 * used as the destination.
		 */
457 		err = swreg_to_unrestricted(lreg, reg_none(), rreg, &reg);
458 		areg = reg.dst;
459 	} else {
460 		err = swreg_to_unrestricted(reg_none(), lreg, rreg, &reg);
461 		areg = reg.areg;
462 	}
463 
464 	if (err) {
465 		nfp_prog->error = err;
466 		return;
467 	}
468 
469 	__emit_mul(nfp_prog, reg.dst_ab, areg, type, step, reg.breg, reg.swap,
470 		   reg.wr_both, reg.dst_lmextn, reg.src_lmextn);
471 }
472 
473 static void
474 __emit_ld_field(struct nfp_prog *nfp_prog, enum shf_sc sc,
475 		u8 areg, u8 bmask, u8 breg, u8 shift, bool imm8,
476 		bool zero, bool swap, bool wr_both,
477 		bool dst_lmextn, bool src_lmextn)
478 {
479 	u64 insn;
480 
481 	insn = OP_LDF_BASE |
482 		FIELD_PREP(OP_LDF_A_SRC, areg) |
483 		FIELD_PREP(OP_LDF_SC, sc) |
484 		FIELD_PREP(OP_LDF_B_SRC, breg) |
485 		FIELD_PREP(OP_LDF_I8, imm8) |
486 		FIELD_PREP(OP_LDF_SW, swap) |
487 		FIELD_PREP(OP_LDF_ZF, zero) |
488 		FIELD_PREP(OP_LDF_BMASK, bmask) |
489 		FIELD_PREP(OP_LDF_SHF, shift) |
490 		FIELD_PREP(OP_LDF_WR_AB, wr_both) |
491 		FIELD_PREP(OP_LDF_SRC_LMEXTN, src_lmextn) |
492 		FIELD_PREP(OP_LDF_DST_LMEXTN, dst_lmextn);
493 
494 	nfp_prog_push(nfp_prog, insn);
495 }
496 
497 static void
498 emit_ld_field_any(struct nfp_prog *nfp_prog, swreg dst, u8 bmask, swreg src,
499 		  enum shf_sc sc, u8 shift, bool zero)
500 {
501 	struct nfp_insn_re_regs reg;
502 	int err;
503 
504 	/* Note: ld_field is special as it uses one of the src regs as dst */
505 	err = swreg_to_restricted(dst, dst, src, &reg, true);
506 	if (err) {
507 		nfp_prog->error = err;
508 		return;
509 	}
510 
511 	__emit_ld_field(nfp_prog, sc, reg.areg, bmask, reg.breg, shift,
512 			reg.i8, zero, reg.swap, reg.wr_both,
513 			reg.dst_lmextn, reg.src_lmextn);
514 }
515 
516 static void
517 emit_ld_field(struct nfp_prog *nfp_prog, swreg dst, u8 bmask, swreg src,
518 	      enum shf_sc sc, u8 shift)
519 {
520 	emit_ld_field_any(nfp_prog, dst, bmask, src, sc, shift, false);
521 }
522 
523 static void
524 __emit_lcsr(struct nfp_prog *nfp_prog, u16 areg, u16 breg, bool wr, u16 addr,
525 	    bool dst_lmextn, bool src_lmextn)
526 {
527 	u64 insn;
528 
529 	insn = OP_LCSR_BASE |
530 		FIELD_PREP(OP_LCSR_A_SRC, areg) |
531 		FIELD_PREP(OP_LCSR_B_SRC, breg) |
532 		FIELD_PREP(OP_LCSR_WRITE, wr) |
533 		FIELD_PREP(OP_LCSR_ADDR, addr / 4) |
534 		FIELD_PREP(OP_LCSR_SRC_LMEXTN, src_lmextn) |
535 		FIELD_PREP(OP_LCSR_DST_LMEXTN, dst_lmextn);
536 
537 	nfp_prog_push(nfp_prog, insn);
538 }
539 
540 static void emit_csr_wr(struct nfp_prog *nfp_prog, swreg src, u16 addr)
541 {
542 	struct nfp_insn_ur_regs reg;
543 	int err;
544 
	/* This instruction takes immediates instead of reg_none() for the
	 * ignored operand, but we can't encode two immediates in one
	 * instruction with our normal swreg infra, so if the param is an
	 * immediate we encode it as reg_none() and copy the immediate to
	 * both operands.
	 */
550 	if (swreg_type(src) == NN_REG_IMM) {
551 		err = swreg_to_unrestricted(reg_none(), src, reg_none(), &reg);
552 		reg.breg = reg.areg;
553 	} else {
554 		err = swreg_to_unrestricted(reg_none(), src, reg_imm(0), &reg);
555 	}
556 	if (err) {
557 		nfp_prog->error = err;
558 		return;
559 	}
560 
561 	__emit_lcsr(nfp_prog, reg.areg, reg.breg, true, addr,
562 		    false, reg.src_lmextn);
563 }
564 
565 /* CSR value is read in following immed[gpr, 0] */
566 static void __emit_csr_rd(struct nfp_prog *nfp_prog, u16 addr)
567 {
568 	__emit_lcsr(nfp_prog, 0, 0, false, addr, false, false);
569 }
570 
571 static void emit_nop(struct nfp_prog *nfp_prog)
572 {
573 	__emit_immed(nfp_prog, UR_REG_IMM, UR_REG_IMM, 0, 0, 0, 0, 0, 0, 0);
574 }
575 
576 /* --- Wrappers --- */
577 static bool pack_immed(u32 imm, u16 *val, enum immed_shift *shift)
578 {
579 	if (!(imm & 0xffff0000)) {
580 		*val = imm;
581 		*shift = IMMED_SHIFT_0B;
582 	} else if (!(imm & 0xff0000ff)) {
583 		*val = imm >> 8;
584 		*shift = IMMED_SHIFT_1B;
585 	} else if (!(imm & 0x0000ffff)) {
586 		*val = imm >> 16;
587 		*shift = IMMED_SHIFT_2B;
588 	} else {
589 		return false;
590 	}
591 
592 	return true;
593 }
594 
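/* Write a 32-bit immediate to @dst.  If the value (or its bitwise inverse)
 * packs into a single shifted 16-bit immed, e.g. 0x00abcd00 packs as val
 * 0xabcd with IMMED_SHIFT_1B and 0x12340000 as 0x1234 with IMMED_SHIFT_2B,
 * a single instruction suffices; otherwise two immed instructions are used.
 */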
595 static void wrp_immed(struct nfp_prog *nfp_prog, swreg dst, u32 imm)
596 {
597 	enum immed_shift shift;
598 	u16 val;
599 
600 	if (pack_immed(imm, &val, &shift)) {
601 		emit_immed(nfp_prog, dst, val, IMMED_WIDTH_ALL, false, shift);
602 	} else if (pack_immed(~imm, &val, &shift)) {
603 		emit_immed(nfp_prog, dst, val, IMMED_WIDTH_ALL, true, shift);
604 	} else {
605 		emit_immed(nfp_prog, dst, imm & 0xffff, IMMED_WIDTH_ALL,
606 			   false, IMMED_SHIFT_0B);
607 		emit_immed(nfp_prog, dst, imm >> 16, IMMED_WIDTH_WORD,
608 			   false, IMMED_SHIFT_2B);
609 	}
610 }
611 
612 static void
613 wrp_immed_relo(struct nfp_prog *nfp_prog, swreg dst, u32 imm,
614 	       enum nfp_relo_type relo)
615 {
616 	if (imm > 0xffff) {
617 		pr_err("relocation of a large immediate!\n");
618 		nfp_prog->error = -EFAULT;
619 		return;
620 	}
621 	emit_immed(nfp_prog, dst, imm, IMMED_WIDTH_ALL, false, IMMED_SHIFT_0B);
622 
623 	nfp_prog->prog[nfp_prog->prog_len - 1] |=
624 		FIELD_PREP(OP_RELO_TYPE, relo);
625 }
626 
/* ur_load_imm_any() - encode immediate or use tmp register (unrestricted)
 * If @imm is small enough, encode it directly in the operand and return it;
 * otherwise load @imm into a spare register and return its encoding.
 */
631 static swreg ur_load_imm_any(struct nfp_prog *nfp_prog, u32 imm, swreg tmp_reg)
632 {
633 	if (FIELD_FIT(UR_REG_IMM_MAX, imm))
634 		return reg_imm(imm);
635 
636 	wrp_immed(nfp_prog, tmp_reg, imm);
637 	return tmp_reg;
638 }
639 
/* re_load_imm_any() - encode immediate or use tmp register (restricted)
 * If @imm is small enough, encode it directly in the operand and return it;
 * otherwise load @imm into a spare register and return its encoding.
 */
644 static swreg re_load_imm_any(struct nfp_prog *nfp_prog, u32 imm, swreg tmp_reg)
645 {
646 	if (FIELD_FIT(RE_REG_IMM_MAX, imm))
647 		return reg_imm(imm);
648 
649 	wrp_immed(nfp_prog, tmp_reg, imm);
650 	return tmp_reg;
651 }
652 
653 static void wrp_nops(struct nfp_prog *nfp_prog, unsigned int count)
654 {
655 	while (count--)
656 		emit_nop(nfp_prog);
657 }
658 
659 static void wrp_mov(struct nfp_prog *nfp_prog, swreg dst, swreg src)
660 {
661 	emit_alu(nfp_prog, dst, reg_none(), ALU_OP_NONE, src);
662 }
663 
664 static void wrp_reg_mov(struct nfp_prog *nfp_prog, u16 dst, u16 src)
665 {
666 	wrp_mov(nfp_prog, reg_both(dst), reg_b(src));
667 }
668 
/* wrp_reg_subpart() - load @field_len bytes from @offset of @src and write
 * the result to the low end of @dst.
 */
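/* E.g. field_len = 2, offset = 1 places bytes 1-2 of @src into the low
 * 2 bytes of @dst; the remaining bytes of @dst are cleared, since the
 * zero/clear form of ld_field is used.
 */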
672 static void
673 wrp_reg_subpart(struct nfp_prog *nfp_prog, swreg dst, swreg src, u8 field_len,
674 		u8 offset)
675 {
676 	enum shf_sc sc = offset ? SHF_SC_R_SHF : SHF_SC_NONE;
677 	u8 mask = (1 << field_len) - 1;
678 
679 	emit_ld_field_any(nfp_prog, dst, mask, src, sc, offset * 8, true);
680 }
681 
/* wrp_reg_or_subpart() - load @field_len bytes from the low end of @src and
 * OR them into @dst at @offset; the other bits of @dst are unchanged.
 */
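/* E.g. field_len = 1, offset = 3 places the low byte of @src into the top
 * byte of @dst, leaving the other bytes of @dst untouched.
 */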
685 static void
686 wrp_reg_or_subpart(struct nfp_prog *nfp_prog, swreg dst, swreg src,
687 		   u8 field_len, u8 offset)
688 {
689 	enum shf_sc sc = offset ? SHF_SC_L_SHF : SHF_SC_NONE;
690 	u8 mask = ((1 << field_len) - 1) << offset;
691 
692 	emit_ld_field(nfp_prog, dst, mask, src, sc, 32 - offset * 8);
693 }
694 
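/* A 40-bit address is held in a pair of GPRs: the low 32 bits in @src_gpr
 * and the upper bits in @src_gpr + 1.  Adding @offset therefore takes an add
 * on the low word plus an add-with-carry on the high word.
 */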
695 static void
696 addr40_offset(struct nfp_prog *nfp_prog, u8 src_gpr, swreg offset,
697 	      swreg *rega, swreg *regb)
698 {
699 	if (offset == reg_imm(0)) {
700 		*rega = reg_a(src_gpr);
701 		*regb = reg_b(src_gpr + 1);
702 		return;
703 	}
704 
705 	emit_alu(nfp_prog, imm_a(nfp_prog), reg_a(src_gpr), ALU_OP_ADD, offset);
706 	emit_alu(nfp_prog, imm_b(nfp_prog), reg_b(src_gpr + 1), ALU_OP_ADD_C,
707 		 reg_imm(0));
708 	*rega = imm_a(nfp_prog);
709 	*regb = imm_b(nfp_prog);
710 }
711 
/* NFP has a Command Push Pull (CPP) bus which supports bulk memory
 * operations.
 */
713 static int nfp_cpp_memcpy(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
714 {
715 	bool descending_seq = meta->ldst_gather_len < 0;
716 	s16 len = abs(meta->ldst_gather_len);
717 	swreg src_base, off;
718 	bool src_40bit_addr;
719 	unsigned int i;
720 	u8 xfer_num;
721 
722 	off = re_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog));
723 	src_40bit_addr = meta->ptr.type == PTR_TO_MAP_VALUE;
724 	src_base = reg_a(meta->insn.src_reg * 2);
725 	xfer_num = round_up(len, 4) / 4;
726 
727 	if (src_40bit_addr)
728 		addr40_offset(nfp_prog, meta->insn.src_reg * 2, off, &src_base,
729 			      &off);
730 
731 	/* Setup PREV_ALU fields to override memory read length. */
732 	if (len > 32)
733 		wrp_immed(nfp_prog, reg_none(),
734 			  CMD_OVE_LEN | FIELD_PREP(CMD_OV_LEN, xfer_num - 1));
735 
736 	/* Memory read from source addr into transfer-in registers. */
737 	emit_cmd_any(nfp_prog, CMD_TGT_READ32_SWAP,
738 		     src_40bit_addr ? CMD_MODE_40b_BA : CMD_MODE_32b, 0,
739 		     src_base, off, xfer_num - 1, CMD_CTX_SWAP, len > 32);
740 
741 	/* Move from transfer-in to transfer-out. */
742 	for (i = 0; i < xfer_num; i++)
743 		wrp_mov(nfp_prog, reg_xfer(i), reg_xfer(i));
744 
745 	off = re_load_imm_any(nfp_prog, meta->paired_st->off, imm_b(nfp_prog));
746 
747 	if (len <= 8) {
748 		/* Use single direct_ref write8. */
749 		emit_cmd(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 0,
750 			 reg_a(meta->paired_st->dst_reg * 2), off, len - 1,
751 			 CMD_CTX_SWAP);
752 	} else if (len <= 32 && IS_ALIGNED(len, 4)) {
753 		/* Use single direct_ref write32. */
754 		emit_cmd(nfp_prog, CMD_TGT_WRITE32_SWAP, CMD_MODE_32b, 0,
755 			 reg_a(meta->paired_st->dst_reg * 2), off, xfer_num - 1,
756 			 CMD_CTX_SWAP);
757 	} else if (len <= 32) {
758 		/* Use single indirect_ref write8. */
759 		wrp_immed(nfp_prog, reg_none(),
760 			  CMD_OVE_LEN | FIELD_PREP(CMD_OV_LEN, len - 1));
761 		emit_cmd_indir(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 0,
762 			       reg_a(meta->paired_st->dst_reg * 2), off,
763 			       len - 1, CMD_CTX_SWAP);
764 	} else if (IS_ALIGNED(len, 4)) {
765 		/* Use single indirect_ref write32. */
766 		wrp_immed(nfp_prog, reg_none(),
767 			  CMD_OVE_LEN | FIELD_PREP(CMD_OV_LEN, xfer_num - 1));
768 		emit_cmd_indir(nfp_prog, CMD_TGT_WRITE32_SWAP, CMD_MODE_32b, 0,
769 			       reg_a(meta->paired_st->dst_reg * 2), off,
770 			       xfer_num - 1, CMD_CTX_SWAP);
771 	} else if (len <= 40) {
		/* Use one direct_ref write32 to write the first 32 bytes, then
		 * another direct_ref write8 to write the remaining bytes.
		 */
775 		emit_cmd(nfp_prog, CMD_TGT_WRITE32_SWAP, CMD_MODE_32b, 0,
776 			 reg_a(meta->paired_st->dst_reg * 2), off, 7,
777 			 CMD_CTX_SWAP);
778 
779 		off = re_load_imm_any(nfp_prog, meta->paired_st->off + 32,
780 				      imm_b(nfp_prog));
781 		emit_cmd(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 8,
782 			 reg_a(meta->paired_st->dst_reg * 2), off, len - 33,
783 			 CMD_CTX_SWAP);
784 	} else {
		/* Use one indirect_ref write32 to write the 4-byte aligned part
		 * of the length, then another direct_ref write8 to write the
		 * remaining bytes.
		 */
788 		u8 new_off;
789 
790 		wrp_immed(nfp_prog, reg_none(),
791 			  CMD_OVE_LEN | FIELD_PREP(CMD_OV_LEN, xfer_num - 2));
792 		emit_cmd_indir(nfp_prog, CMD_TGT_WRITE32_SWAP, CMD_MODE_32b, 0,
793 			       reg_a(meta->paired_st->dst_reg * 2), off,
794 			       xfer_num - 2, CMD_CTX_SWAP);
795 		new_off = meta->paired_st->off + (xfer_num - 1) * 4;
796 		off = re_load_imm_any(nfp_prog, new_off, imm_b(nfp_prog));
797 		emit_cmd(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b,
798 			 xfer_num - 1, reg_a(meta->paired_st->dst_reg * 2), off,
799 			 (len & 0x3) - 1, CMD_CTX_SWAP);
800 	}
801 
	/* TODO: The following extra load is to make sure data flow is
	 *  identical before and after we do the memory copy optimization.
	 *
	 *  The load destination register is not guaranteed to be dead, so we
	 *  need to make sure it is loaded with the same value it would have
	 *  held before this transformation.
	 *
	 *  These extra loads could be removed once we have accurate register
	 *  usage information.
	 */
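	/* Pick the transfer register(s) holding the bytes the original last
	 * load would have read: the start of the region for a descending
	 * sequence, otherwise its tail (one xfer reg, or two for BPF_DW).
	 */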
812 	if (descending_seq)
813 		xfer_num = 0;
814 	else if (BPF_SIZE(meta->insn.code) != BPF_DW)
815 		xfer_num = xfer_num - 1;
816 	else
817 		xfer_num = xfer_num - 2;
818 
819 	switch (BPF_SIZE(meta->insn.code)) {
820 	case BPF_B:
821 		wrp_reg_subpart(nfp_prog, reg_both(meta->insn.dst_reg * 2),
822 				reg_xfer(xfer_num), 1,
823 				IS_ALIGNED(len, 4) ? 3 : (len & 3) - 1);
824 		break;
825 	case BPF_H:
826 		wrp_reg_subpart(nfp_prog, reg_both(meta->insn.dst_reg * 2),
827 				reg_xfer(xfer_num), 2, (len & 3) ^ 2);
828 		break;
829 	case BPF_W:
830 		wrp_mov(nfp_prog, reg_both(meta->insn.dst_reg * 2),
831 			reg_xfer(0));
832 		break;
833 	case BPF_DW:
834 		wrp_mov(nfp_prog, reg_both(meta->insn.dst_reg * 2),
835 			reg_xfer(xfer_num));
836 		wrp_mov(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1),
837 			reg_xfer(xfer_num + 1));
838 		break;
839 	}
840 
841 	if (BPF_SIZE(meta->insn.code) != BPF_DW)
842 		wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1), 0);
843 
844 	return 0;
845 }
846 
847 static int
848 data_ld(struct nfp_prog *nfp_prog, swreg offset, u8 dst_gpr, int size)
849 {
850 	unsigned int i;
851 	u16 shift, sz;
852 
853 	/* We load the value from the address indicated in @offset and then
854 	 * shift out the data we don't need.  Note: this is big endian!
855 	 */
856 	sz = max(size, 4);
857 	shift = size < 4 ? 4 - size : 0;
858 
859 	emit_cmd(nfp_prog, CMD_TGT_READ8, CMD_MODE_32b, 0,
860 		 pptr_reg(nfp_prog), offset, sz - 1, CMD_CTX_SWAP);
861 
862 	i = 0;
863 	if (shift)
864 		emit_shf(nfp_prog, reg_both(dst_gpr), reg_none(), SHF_OP_NONE,
865 			 reg_xfer(0), SHF_SC_R_SHF, shift * 8);
866 	else
867 		for (; i * 4 < size; i++)
868 			wrp_mov(nfp_prog, reg_both(dst_gpr + i), reg_xfer(i));
869 
870 	if (i < 2)
871 		wrp_immed(nfp_prog, reg_both(dst_gpr + 1), 0);
872 
873 	return 0;
874 }
875 
876 static int
877 data_ld_host_order(struct nfp_prog *nfp_prog, u8 dst_gpr,
878 		   swreg lreg, swreg rreg, int size, enum cmd_mode mode)
879 {
880 	unsigned int i;
881 	u8 mask, sz;
882 
883 	/* We load the value from the address indicated in rreg + lreg and then
884 	 * mask out the data we don't need.  Note: this is little endian!
885 	 */
886 	sz = max(size, 4);
887 	mask = size < 4 ? GENMASK(size - 1, 0) : 0;
888 
889 	emit_cmd(nfp_prog, CMD_TGT_READ32_SWAP, mode, 0,
890 		 lreg, rreg, sz / 4 - 1, CMD_CTX_SWAP);
891 
892 	i = 0;
893 	if (mask)
894 		emit_ld_field_any(nfp_prog, reg_both(dst_gpr), mask,
895 				  reg_xfer(0), SHF_SC_NONE, 0, true);
896 	else
897 		for (; i * 4 < size; i++)
898 			wrp_mov(nfp_prog, reg_both(dst_gpr + i), reg_xfer(i));
899 
900 	if (i < 2)
901 		wrp_immed(nfp_prog, reg_both(dst_gpr + 1), 0);
902 
903 	return 0;
904 }
905 
906 static int
907 data_ld_host_order_addr32(struct nfp_prog *nfp_prog, u8 src_gpr, swreg offset,
908 			  u8 dst_gpr, u8 size)
909 {
910 	return data_ld_host_order(nfp_prog, dst_gpr, reg_a(src_gpr), offset,
911 				  size, CMD_MODE_32b);
912 }
913 
914 static int
915 data_ld_host_order_addr40(struct nfp_prog *nfp_prog, u8 src_gpr, swreg offset,
916 			  u8 dst_gpr, u8 size)
917 {
918 	swreg rega, regb;
919 
920 	addr40_offset(nfp_prog, src_gpr, offset, &rega, &regb);
921 
922 	return data_ld_host_order(nfp_prog, dst_gpr, rega, regb,
923 				  size, CMD_MODE_40b_BA);
924 }
925 
926 static int
927 construct_data_ind_ld(struct nfp_prog *nfp_prog, u16 offset, u16 src, u8 size)
928 {
929 	swreg tmp_reg;
930 
931 	/* Calculate the true offset (src_reg + imm) */
932 	tmp_reg = ur_load_imm_any(nfp_prog, offset, imm_b(nfp_prog));
933 	emit_alu(nfp_prog, imm_both(nfp_prog), reg_a(src), ALU_OP_ADD, tmp_reg);
934 
935 	/* Check packet length (size guaranteed to fit b/c it's u8) */
936 	emit_alu(nfp_prog, imm_a(nfp_prog),
937 		 imm_a(nfp_prog), ALU_OP_ADD, reg_imm(size));
938 	emit_alu(nfp_prog, reg_none(),
939 		 plen_reg(nfp_prog), ALU_OP_SUB, imm_a(nfp_prog));
940 	emit_br_relo(nfp_prog, BR_BLO, BR_OFF_RELO, 0, RELO_BR_GO_ABORT);
941 
942 	/* Load data */
943 	return data_ld(nfp_prog, imm_b(nfp_prog), 0, size);
944 }
945 
946 static int construct_data_ld(struct nfp_prog *nfp_prog, u16 offset, u8 size)
947 {
948 	swreg tmp_reg;
949 
950 	/* Check packet length */
951 	tmp_reg = ur_load_imm_any(nfp_prog, offset + size, imm_a(nfp_prog));
952 	emit_alu(nfp_prog, reg_none(), plen_reg(nfp_prog), ALU_OP_SUB, tmp_reg);
953 	emit_br_relo(nfp_prog, BR_BLO, BR_OFF_RELO, 0, RELO_BR_GO_ABORT);
954 
955 	/* Load data */
956 	tmp_reg = re_load_imm_any(nfp_prog, offset, imm_b(nfp_prog));
957 	return data_ld(nfp_prog, tmp_reg, 0, size);
958 }
959 
960 static int
961 data_stx_host_order(struct nfp_prog *nfp_prog, u8 dst_gpr, swreg offset,
962 		    u8 src_gpr, u8 size)
963 {
964 	unsigned int i;
965 
966 	for (i = 0; i * 4 < size; i++)
967 		wrp_mov(nfp_prog, reg_xfer(i), reg_a(src_gpr + i));
968 
969 	emit_cmd(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 0,
970 		 reg_a(dst_gpr), offset, size - 1, CMD_CTX_SWAP);
971 
972 	return 0;
973 }
974 
975 static int
976 data_st_host_order(struct nfp_prog *nfp_prog, u8 dst_gpr, swreg offset,
977 		   u64 imm, u8 size)
978 {
979 	wrp_immed(nfp_prog, reg_xfer(0), imm);
980 	if (size == 8)
981 		wrp_immed(nfp_prog, reg_xfer(1), imm >> 32);
982 
983 	emit_cmd(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 0,
984 		 reg_a(dst_gpr), offset, size - 1, CMD_CTX_SWAP);
985 
986 	return 0;
987 }
988 
989 typedef int
990 (*lmem_step)(struct nfp_prog *nfp_prog, u8 gpr, u8 gpr_byte, s32 off,
991 	     unsigned int size, bool first, bool new_gpr, bool last, bool lm3,
992 	     bool needs_inc);
993 
994 static int
995 wrp_lmem_load(struct nfp_prog *nfp_prog, u8 dst, u8 dst_byte, s32 off,
996 	      unsigned int size, bool first, bool new_gpr, bool last, bool lm3,
997 	      bool needs_inc)
998 {
999 	bool should_inc = needs_inc && new_gpr && !last;
1000 	u32 idx, src_byte;
1001 	enum shf_sc sc;
1002 	swreg reg;
1003 	int shf;
1004 	u8 mask;
1005 
1006 	if (WARN_ON_ONCE(dst_byte + size > 4 || off % 4 + size > 4))
1007 		return -EOPNOTSUPP;
1008 
1009 	idx = off / 4;
1010 
1011 	/* Move the entire word */
1012 	if (size == 4) {
1013 		wrp_mov(nfp_prog, reg_both(dst),
1014 			should_inc ? reg_lm_inc(3) : reg_lm(lm3 ? 3 : 0, idx));
1015 		return 0;
1016 	}
1017 
1018 	if (WARN_ON_ONCE(lm3 && idx > RE_REG_LM_IDX_MAX))
1019 		return -EOPNOTSUPP;
1020 
1021 	src_byte = off % 4;
1022 
1023 	mask = (1 << size) - 1;
1024 	mask <<= dst_byte;
1025 
1026 	if (WARN_ON_ONCE(mask > 0xf))
1027 		return -EOPNOTSUPP;
1028 
1029 	shf = abs(src_byte - dst_byte) * 8;
1030 	if (src_byte == dst_byte) {
1031 		sc = SHF_SC_NONE;
1032 	} else if (src_byte < dst_byte) {
1033 		shf = 32 - shf;
1034 		sc = SHF_SC_L_SHF;
1035 	} else {
1036 		sc = SHF_SC_R_SHF;
1037 	}
1038 
	/* ld_field can address fewer indexes; if the offset is too large do
	 * RMW.  Because we RMW twice we waste 2 cycles on unaligned 8 byte
	 * writes.
	 */
1042 	if (idx <= RE_REG_LM_IDX_MAX) {
1043 		reg = reg_lm(lm3 ? 3 : 0, idx);
1044 	} else {
1045 		reg = imm_a(nfp_prog);
		/* If it's not the first part of the load and we start a new GPR
		 * that means we are loading a second part of the LMEM word into
		 * a new GPR.  IOW we've already read that LMEM word and
		 * therefore it has already been loaded into imm_a().
		 */
1051 		if (first || !new_gpr)
1052 			wrp_mov(nfp_prog, reg, reg_lm(0, idx));
1053 	}
1054 
1055 	emit_ld_field_any(nfp_prog, reg_both(dst), mask, reg, sc, shf, new_gpr);
1056 
1057 	if (should_inc)
1058 		wrp_mov(nfp_prog, reg_none(), reg_lm_inc(3));
1059 
1060 	return 0;
1061 }
1062 
1063 static int
1064 wrp_lmem_store(struct nfp_prog *nfp_prog, u8 src, u8 src_byte, s32 off,
1065 	       unsigned int size, bool first, bool new_gpr, bool last, bool lm3,
1066 	       bool needs_inc)
1067 {
1068 	bool should_inc = needs_inc && new_gpr && !last;
1069 	u32 idx, dst_byte;
1070 	enum shf_sc sc;
1071 	swreg reg;
1072 	int shf;
1073 	u8 mask;
1074 
1075 	if (WARN_ON_ONCE(src_byte + size > 4 || off % 4 + size > 4))
1076 		return -EOPNOTSUPP;
1077 
1078 	idx = off / 4;
1079 
1080 	/* Move the entire word */
1081 	if (size == 4) {
1082 		wrp_mov(nfp_prog,
1083 			should_inc ? reg_lm_inc(3) : reg_lm(lm3 ? 3 : 0, idx),
1084 			reg_b(src));
1085 		return 0;
1086 	}
1087 
1088 	if (WARN_ON_ONCE(lm3 && idx > RE_REG_LM_IDX_MAX))
1089 		return -EOPNOTSUPP;
1090 
1091 	dst_byte = off % 4;
1092 
1093 	mask = (1 << size) - 1;
1094 	mask <<= dst_byte;
1095 
1096 	if (WARN_ON_ONCE(mask > 0xf))
1097 		return -EOPNOTSUPP;
1098 
1099 	shf = abs(src_byte - dst_byte) * 8;
1100 	if (src_byte == dst_byte) {
1101 		sc = SHF_SC_NONE;
1102 	} else if (src_byte < dst_byte) {
1103 		shf = 32 - shf;
1104 		sc = SHF_SC_L_SHF;
1105 	} else {
1106 		sc = SHF_SC_R_SHF;
1107 	}
1108 
	/* ld_field can address fewer indexes; if the offset is too large do
	 * RMW.  Because we RMW twice we waste 2 cycles on unaligned 8 byte
	 * writes.
	 */
1112 	if (idx <= RE_REG_LM_IDX_MAX) {
1113 		reg = reg_lm(lm3 ? 3 : 0, idx);
1114 	} else {
1115 		reg = imm_a(nfp_prog);
1116 		/* Only first and last LMEM locations are going to need RMW,
1117 		 * the middle location will be overwritten fully.
1118 		 */
1119 		if (first || last)
1120 			wrp_mov(nfp_prog, reg, reg_lm(0, idx));
1121 	}
1122 
1123 	emit_ld_field(nfp_prog, reg, mask, reg_b(src), sc, shf);
1124 
1125 	if (new_gpr || last) {
1126 		if (idx > RE_REG_LM_IDX_MAX)
1127 			wrp_mov(nfp_prog, reg_lm(0, idx), reg);
1128 		if (should_inc)
1129 			wrp_mov(nfp_prog, reg_none(), reg_lm_inc(3));
1130 	}
1131 
1132 	return 0;
1133 }
1134 
1135 static int
1136 mem_op_stack(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
1137 	     unsigned int size, unsigned int ptr_off, u8 gpr, u8 ptr_gpr,
1138 	     bool clr_gpr, lmem_step step)
1139 {
1140 	s32 off = nfp_prog->stack_frame_depth + meta->insn.off + ptr_off;
1141 	bool first = true, last;
1142 	bool needs_inc = false;
1143 	swreg stack_off_reg;
1144 	u8 prev_gpr = 255;
1145 	u32 gpr_byte = 0;
1146 	bool lm3 = true;
1147 	int ret;
1148 
1149 	if (meta->ptr_not_const) {
		/* Use of the last encountered ptr_off is OK, they all have
		 * the same alignment.  We depend on the low bits of the value
		 * being discarded when it is written to the LMaddr register.
		 */
1154 		stack_off_reg = ur_load_imm_any(nfp_prog, meta->insn.off,
1155 						stack_imm(nfp_prog));
1156 
1157 		emit_alu(nfp_prog, imm_b(nfp_prog),
1158 			 reg_a(ptr_gpr), ALU_OP_ADD, stack_off_reg);
1159 
1160 		needs_inc = true;
1161 	} else if (off + size <= 64) {
1162 		/* We can reach bottom 64B with LMaddr0 */
1163 		lm3 = false;
1164 	} else if (round_down(off, 32) == round_down(off + size - 1, 32)) {
1165 		/* We have to set up a new pointer.  If we know the offset
1166 		 * and the entire access falls into a single 32 byte aligned
1167 		 * window we won't have to increment the LM pointer.
		 * The 32 byte alignment is important because the offset is
		 * ORed in, not added, when doing *l$indexN[off].
1170 		 */
1171 		stack_off_reg = ur_load_imm_any(nfp_prog, round_down(off, 32),
1172 						stack_imm(nfp_prog));
1173 		emit_alu(nfp_prog, imm_b(nfp_prog),
1174 			 stack_reg(nfp_prog), ALU_OP_ADD, stack_off_reg);
1175 
1176 		off %= 32;
1177 	} else {
1178 		stack_off_reg = ur_load_imm_any(nfp_prog, round_down(off, 4),
1179 						stack_imm(nfp_prog));
1180 
1181 		emit_alu(nfp_prog, imm_b(nfp_prog),
1182 			 stack_reg(nfp_prog), ALU_OP_ADD, stack_off_reg);
1183 
1184 		needs_inc = true;
1185 	}
1186 	if (lm3) {
1187 		emit_csr_wr(nfp_prog, imm_b(nfp_prog), NFP_CSR_ACT_LM_ADDR3);
		/* For size < 8 one slot will be filled by the zeroing of the
		 * upper half.
		 */
1189 		wrp_nops(nfp_prog, clr_gpr && size < 8 ? 2 : 3);
1190 	}
1191 
1192 	if (clr_gpr && size < 8)
1193 		wrp_immed(nfp_prog, reg_both(gpr + 1), 0);
1194 
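	/* Copy the data in slices sized so that no slice crosses a 4 byte
	 * boundary in either the GPRs or the LMEM words; @step does the
	 * per-slice load or store.
	 */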
1195 	while (size) {
1196 		u32 slice_end;
1197 		u8 slice_size;
1198 
1199 		slice_size = min(size, 4 - gpr_byte);
1200 		slice_end = min(off + slice_size, round_up(off + 1, 4));
1201 		slice_size = slice_end - off;
1202 
1203 		last = slice_size == size;
1204 
1205 		if (needs_inc)
1206 			off %= 4;
1207 
1208 		ret = step(nfp_prog, gpr, gpr_byte, off, slice_size,
1209 			   first, gpr != prev_gpr, last, lm3, needs_inc);
1210 		if (ret)
1211 			return ret;
1212 
1213 		prev_gpr = gpr;
1214 		first = false;
1215 
1216 		gpr_byte += slice_size;
1217 		if (gpr_byte >= 4) {
1218 			gpr_byte -= 4;
1219 			gpr++;
1220 		}
1221 
1222 		size -= slice_size;
1223 		off += slice_size;
1224 	}
1225 
1226 	return 0;
1227 }
1228 
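/* 32-bit ALU op with an immediate operand.  Trivial cases are short-cut:
 * AND with 0 and OR with ~0 just set the destination, AND with ~0, OR with 0
 * and XOR with 0 are no-ops, and XOR with ~0 becomes a NOT.
 */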
1229 static void
1230 wrp_alu_imm(struct nfp_prog *nfp_prog, u8 dst, enum alu_op alu_op, u32 imm)
1231 {
1232 	swreg tmp_reg;
1233 
1234 	if (alu_op == ALU_OP_AND) {
1235 		if (!imm)
1236 			wrp_immed(nfp_prog, reg_both(dst), 0);
1237 		if (!imm || !~imm)
1238 			return;
1239 	}
1240 	if (alu_op == ALU_OP_OR) {
1241 		if (!~imm)
1242 			wrp_immed(nfp_prog, reg_both(dst), ~0U);
1243 		if (!imm || !~imm)
1244 			return;
1245 	}
1246 	if (alu_op == ALU_OP_XOR) {
1247 		if (!~imm)
1248 			emit_alu(nfp_prog, reg_both(dst), reg_none(),
1249 				 ALU_OP_NOT, reg_b(dst));
1250 		if (!imm || !~imm)
1251 			return;
1252 	}
1253 
1254 	tmp_reg = ur_load_imm_any(nfp_prog, imm, imm_b(nfp_prog));
1255 	emit_alu(nfp_prog, reg_both(dst), reg_a(dst), alu_op, tmp_reg);
1256 }
1257 
1258 static int
1259 wrp_alu64_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
1260 	      enum alu_op alu_op, bool skip)
1261 {
1262 	const struct bpf_insn *insn = &meta->insn;
1263 	u64 imm = insn->imm; /* sign extend */
1264 
1265 	if (skip) {
1266 		meta->skip = true;
1267 		return 0;
1268 	}
1269 
1270 	wrp_alu_imm(nfp_prog, insn->dst_reg * 2, alu_op, imm & ~0U);
1271 	wrp_alu_imm(nfp_prog, insn->dst_reg * 2 + 1, alu_op, imm >> 32);
1272 
1273 	return 0;
1274 }
1275 
1276 static int
1277 wrp_alu64_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
1278 	      enum alu_op alu_op)
1279 {
1280 	u8 dst = meta->insn.dst_reg * 2, src = meta->insn.src_reg * 2;
1281 
1282 	emit_alu(nfp_prog, reg_both(dst), reg_a(dst), alu_op, reg_b(src));
1283 	emit_alu(nfp_prog, reg_both(dst + 1),
1284 		 reg_a(dst + 1), alu_op, reg_b(src + 1));
1285 
1286 	return 0;
1287 }
1288 
1289 static int
1290 wrp_alu32_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
1291 	      enum alu_op alu_op, bool skip)
1292 {
1293 	const struct bpf_insn *insn = &meta->insn;
1294 
1295 	if (skip) {
1296 		meta->skip = true;
1297 		return 0;
1298 	}
1299 
1300 	wrp_alu_imm(nfp_prog, insn->dst_reg * 2, alu_op, insn->imm);
1301 	wrp_immed(nfp_prog, reg_both(insn->dst_reg * 2 + 1), 0);
1302 
1303 	return 0;
1304 }
1305 
1306 static int
1307 wrp_alu32_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
1308 	      enum alu_op alu_op)
1309 {
1310 	u8 dst = meta->insn.dst_reg * 2, src = meta->insn.src_reg * 2;
1311 
1312 	emit_alu(nfp_prog, reg_both(dst), reg_a(dst), alu_op, reg_b(src));
1313 	wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1), 0);
1314 
1315 	return 0;
1316 }
1317 
1318 static void
1319 wrp_test_reg_one(struct nfp_prog *nfp_prog, u8 dst, enum alu_op alu_op, u8 src,
1320 		 enum br_mask br_mask, u16 off)
1321 {
1322 	emit_alu(nfp_prog, reg_none(), reg_a(dst), alu_op, reg_b(src));
1323 	emit_br(nfp_prog, br_mask, off, 0);
1324 }
1325 
1326 static int
1327 wrp_test_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
1328 	     enum alu_op alu_op, enum br_mask br_mask)
1329 {
1330 	const struct bpf_insn *insn = &meta->insn;
1331 
1332 	wrp_test_reg_one(nfp_prog, insn->dst_reg * 2, alu_op,
1333 			 insn->src_reg * 2, br_mask, insn->off);
1334 	wrp_test_reg_one(nfp_prog, insn->dst_reg * 2 + 1, alu_op,
1335 			 insn->src_reg * 2 + 1, br_mask, insn->off);
1336 
1337 	return 0;
1338 }
1339 
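/* Map of BPF jump conditions to NFP branch masks.  "swap" means the compare
 * operands are exchanged so that only the lower/higher-or-same (unsigned) and
 * less-than/greater-or-equal (signed) branch masks are needed, e.g. JGT a, b
 * is emitted as the swapped b < a check using BR_BLO.
 */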
1340 static const struct jmp_code_map {
1341 	enum br_mask br_mask;
1342 	bool swap;
1343 } jmp_code_map[] = {
1344 	[BPF_JGT >> 4]	= { BR_BLO, true },
1345 	[BPF_JGE >> 4]	= { BR_BHS, false },
1346 	[BPF_JLT >> 4]	= { BR_BLO, false },
1347 	[BPF_JLE >> 4]	= { BR_BHS, true },
1348 	[BPF_JSGT >> 4]	= { BR_BLT, true },
1349 	[BPF_JSGE >> 4]	= { BR_BGE, false },
1350 	[BPF_JSLT >> 4]	= { BR_BLT, false },
1351 	[BPF_JSLE >> 4]	= { BR_BGE, true },
1352 };
1353 
1354 static const struct jmp_code_map *nfp_jmp_code_get(struct nfp_insn_meta *meta)
1355 {
1356 	unsigned int op;
1357 
1358 	op = BPF_OP(meta->insn.code) >> 4;
1359 	/* br_mask of 0 is BR_BEQ which we don't use in jump code table */
1360 	if (WARN_ONCE(op >= ARRAY_SIZE(jmp_code_map) ||
1361 		      !jmp_code_map[op].br_mask,
1362 		      "no code found for jump instruction"))
1363 		return NULL;
1364 
1365 	return &jmp_code_map[op];
1366 }
1367 
1368 static int cmp_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1369 {
1370 	const struct bpf_insn *insn = &meta->insn;
1371 	u64 imm = insn->imm; /* sign extend */
1372 	const struct jmp_code_map *code;
1373 	enum alu_op alu_op, carry_op;
1374 	u8 reg = insn->dst_reg * 2;
1375 	swreg tmp_reg;
1376 
1377 	code = nfp_jmp_code_get(meta);
1378 	if (!code)
1379 		return -EINVAL;
1380 
1381 	alu_op = meta->jump_neg_op ? ALU_OP_ADD : ALU_OP_SUB;
1382 	carry_op = meta->jump_neg_op ? ALU_OP_ADD_C : ALU_OP_SUB_C;
1383 
1384 	tmp_reg = ur_load_imm_any(nfp_prog, imm & ~0U, imm_b(nfp_prog));
1385 	if (!code->swap)
1386 		emit_alu(nfp_prog, reg_none(), reg_a(reg), alu_op, tmp_reg);
1387 	else
1388 		emit_alu(nfp_prog, reg_none(), tmp_reg, alu_op, reg_a(reg));
1389 
1390 	tmp_reg = ur_load_imm_any(nfp_prog, imm >> 32, imm_b(nfp_prog));
1391 	if (!code->swap)
1392 		emit_alu(nfp_prog, reg_none(),
1393 			 reg_a(reg + 1), carry_op, tmp_reg);
1394 	else
1395 		emit_alu(nfp_prog, reg_none(),
1396 			 tmp_reg, carry_op, reg_a(reg + 1));
1397 
1398 	emit_br(nfp_prog, code->br_mask, insn->off, 0);
1399 
1400 	return 0;
1401 }
1402 
1403 static int cmp_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1404 {
1405 	const struct bpf_insn *insn = &meta->insn;
1406 	const struct jmp_code_map *code;
1407 	u8 areg, breg;
1408 
1409 	code = nfp_jmp_code_get(meta);
1410 	if (!code)
1411 		return -EINVAL;
1412 
1413 	areg = insn->dst_reg * 2;
1414 	breg = insn->src_reg * 2;
1415 
1416 	if (code->swap) {
1417 		areg ^= breg;
1418 		breg ^= areg;
1419 		areg ^= breg;
1420 	}
1421 
1422 	emit_alu(nfp_prog, reg_none(), reg_a(areg), ALU_OP_SUB, reg_b(breg));
1423 	emit_alu(nfp_prog, reg_none(),
1424 		 reg_a(areg + 1), ALU_OP_SUB_C, reg_b(breg + 1));
1425 	emit_br(nfp_prog, code->br_mask, insn->off, 0);
1426 
1427 	return 0;
1428 }
1429 
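/* Byte swap the low 32 bits of @reg_in into @gpr_out (32-bit BPF endianness
 * conversion).  A rotate right by 8 writes all four bytes, then a rotate
 * right by 16 re-writes bytes 0 and 2, e.g. 0xaabbccdd becomes 0xddccbbaa.
 */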
1430 static void wrp_end32(struct nfp_prog *nfp_prog, swreg reg_in, u8 gpr_out)
1431 {
1432 	emit_ld_field(nfp_prog, reg_both(gpr_out), 0xf, reg_in,
1433 		      SHF_SC_R_ROT, 8);
1434 	emit_ld_field(nfp_prog, reg_both(gpr_out), 0x5, reg_a(gpr_out),
1435 		      SHF_SC_R_ROT, 16);
1436 }
1437 
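/* Full 32x32 multiply.  The NFP multiplier is driven in steps: a start step,
 * four 32x32 partial steps, then MUL_LAST to read the low 32 bits of the
 * product and (optionally) MUL_LAST_2 to read the high 32 bits.
 */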
1438 static void
1439 wrp_mul_u32(struct nfp_prog *nfp_prog, swreg dst_hi, swreg dst_lo, swreg lreg,
1440 	    swreg rreg, bool gen_high_half)
1441 {
1442 	emit_mul(nfp_prog, lreg, MUL_TYPE_START, MUL_STEP_NONE, rreg);
1443 	emit_mul(nfp_prog, lreg, MUL_TYPE_STEP_32x32, MUL_STEP_1, rreg);
1444 	emit_mul(nfp_prog, lreg, MUL_TYPE_STEP_32x32, MUL_STEP_2, rreg);
1445 	emit_mul(nfp_prog, lreg, MUL_TYPE_STEP_32x32, MUL_STEP_3, rreg);
1446 	emit_mul(nfp_prog, lreg, MUL_TYPE_STEP_32x32, MUL_STEP_4, rreg);
1447 	emit_mul(nfp_prog, dst_lo, MUL_TYPE_STEP_32x32, MUL_LAST, reg_none());
1448 	if (gen_high_half)
1449 		emit_mul(nfp_prog, dst_hi, MUL_TYPE_STEP_32x32, MUL_LAST_2,
1450 			 reg_none());
1451 	else
1452 		wrp_immed(nfp_prog, dst_hi, 0);
1453 }
1454 
1455 static void
1456 wrp_mul_u16(struct nfp_prog *nfp_prog, swreg dst_hi, swreg dst_lo, swreg lreg,
1457 	    swreg rreg)
1458 {
1459 	emit_mul(nfp_prog, lreg, MUL_TYPE_START, MUL_STEP_NONE, rreg);
1460 	emit_mul(nfp_prog, lreg, MUL_TYPE_STEP_16x16, MUL_STEP_1, rreg);
1461 	emit_mul(nfp_prog, lreg, MUL_TYPE_STEP_16x16, MUL_STEP_2, rreg);
1462 	emit_mul(nfp_prog, dst_lo, MUL_TYPE_STEP_16x16, MUL_LAST, reg_none());
1463 }
1464 
1465 static int
1466 wrp_mul(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
1467 	bool gen_high_half, bool ropnd_from_reg)
1468 {
1469 	swreg multiplier, multiplicand, dst_hi, dst_lo;
1470 	const struct bpf_insn *insn = &meta->insn;
1471 	u32 lopnd_max, ropnd_max;
1472 	u8 dst_reg;
1473 
1474 	dst_reg = insn->dst_reg;
1475 	multiplicand = reg_a(dst_reg * 2);
1476 	dst_hi = reg_both(dst_reg * 2 + 1);
1477 	dst_lo = reg_both(dst_reg * 2);
1478 	lopnd_max = meta->umax_dst;
1479 	if (ropnd_from_reg) {
1480 		multiplier = reg_b(insn->src_reg * 2);
1481 		ropnd_max = meta->umax_src;
1482 	} else {
1483 		u32 imm = insn->imm;
1484 
1485 		multiplier = ur_load_imm_any(nfp_prog, imm, imm_b(nfp_prog));
1486 		ropnd_max = imm;
1487 	}
1488 	if (lopnd_max > U16_MAX || ropnd_max > U16_MAX)
1489 		wrp_mul_u32(nfp_prog, dst_hi, dst_lo, multiplicand, multiplier,
1490 			    gen_high_half);
1491 	else
1492 		wrp_mul_u16(nfp_prog, dst_hi, dst_lo, multiplicand, multiplier);
1493 
1494 	return 0;
1495 }
1496 
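/* Unsigned divide of a 32-bit value by a constant, done as a multiplication
 * by a precomputed reciprocal (reciprocal_value_adv()) followed by shifts.
 * Power-of-two divisors reduce to a plain shift, and the "wide m" case uses
 * the usual add/shift fix-up so the magic number still fits in 32 bits.
 */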
1497 static int wrp_div_imm(struct nfp_prog *nfp_prog, u8 dst, u64 imm)
1498 {
1499 	swreg dst_both = reg_both(dst), dst_a = reg_a(dst), dst_b = reg_a(dst);
1500 	struct reciprocal_value_adv rvalue;
1501 	u8 pre_shift, exp;
1502 	swreg magic;
1503 
1504 	if (imm > U32_MAX) {
1505 		wrp_immed(nfp_prog, dst_both, 0);
1506 		return 0;
1507 	}
1508 
	/* NOTE: because we are using "reciprocal_value_adv" which doesn't
	 * support "divisor > (1u << 31)", we need to JIT a separate NFP
	 * sequence to handle such a case.  Its result equals the result of
	 * the unsigned comparison "dst >= imm", which can be calculated
	 * using the following NFP sequence:
	 *
	 *  alu[--, dst, -, imm]
	 *  immed[imm, 0]
	 *  alu[dst, imm, +carry, 0]
	 */
1520 	if (imm > 1U << 31) {
1521 		swreg tmp_b = ur_load_imm_any(nfp_prog, imm, imm_b(nfp_prog));
1522 
1523 		emit_alu(nfp_prog, reg_none(), dst_a, ALU_OP_SUB, tmp_b);
1524 		wrp_immed(nfp_prog, imm_a(nfp_prog), 0);
1525 		emit_alu(nfp_prog, dst_both, imm_a(nfp_prog), ALU_OP_ADD_C,
1526 			 reg_imm(0));
1527 		return 0;
1528 	}
1529 
1530 	rvalue = reciprocal_value_adv(imm, 32);
1531 	exp = rvalue.exp;
1532 	if (rvalue.is_wide_m && !(imm & 1)) {
1533 		pre_shift = fls(imm & -imm) - 1;
1534 		rvalue = reciprocal_value_adv(imm >> pre_shift, 32 - pre_shift);
1535 	} else {
1536 		pre_shift = 0;
1537 	}
1538 	magic = ur_load_imm_any(nfp_prog, rvalue.m, imm_b(nfp_prog));
1539 	if (imm == 1U << exp) {
1540 		emit_shf(nfp_prog, dst_both, reg_none(), SHF_OP_NONE, dst_b,
1541 			 SHF_SC_R_SHF, exp);
1542 	} else if (rvalue.is_wide_m) {
1543 		wrp_mul_u32(nfp_prog, imm_both(nfp_prog), reg_none(), dst_a,
1544 			    magic, true);
1545 		emit_alu(nfp_prog, dst_both, dst_a, ALU_OP_SUB,
1546 			 imm_b(nfp_prog));
1547 		emit_shf(nfp_prog, dst_both, reg_none(), SHF_OP_NONE, dst_b,
1548 			 SHF_SC_R_SHF, 1);
1549 		emit_alu(nfp_prog, dst_both, dst_a, ALU_OP_ADD,
1550 			 imm_b(nfp_prog));
1551 		emit_shf(nfp_prog, dst_both, reg_none(), SHF_OP_NONE, dst_b,
1552 			 SHF_SC_R_SHF, rvalue.sh - 1);
1553 	} else {
1554 		if (pre_shift)
1555 			emit_shf(nfp_prog, dst_both, reg_none(), SHF_OP_NONE,
1556 				 dst_b, SHF_SC_R_SHF, pre_shift);
1557 		wrp_mul_u32(nfp_prog, dst_both, reg_none(), dst_a, magic, true);
1558 		emit_shf(nfp_prog, dst_both, reg_none(), SHF_OP_NONE,
1559 			 dst_b, SHF_SC_R_SHF, rvalue.sh);
1560 	}
1561 
1562 	return 0;
1563 }
1564 
1565 static int adjust_head(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1566 {
1567 	swreg tmp = imm_a(nfp_prog), tmp_len = imm_b(nfp_prog);
1568 	struct nfp_bpf_cap_adjust_head *adjust_head;
1569 	u32 ret_einval, end;
1570 
1571 	adjust_head = &nfp_prog->bpf->adjust_head;
1572 
1573 	/* Optimized version - 5 vs 14 cycles */
1574 	if (nfp_prog->adjust_head_location != UINT_MAX) {
1575 		if (WARN_ON_ONCE(nfp_prog->adjust_head_location != meta->n))
1576 			return -EINVAL;
1577 
1578 		emit_alu(nfp_prog, pptr_reg(nfp_prog),
1579 			 reg_a(2 * 2), ALU_OP_ADD, pptr_reg(nfp_prog));
1580 		emit_alu(nfp_prog, plen_reg(nfp_prog),
1581 			 plen_reg(nfp_prog), ALU_OP_SUB, reg_a(2 * 2));
1582 		emit_alu(nfp_prog, pv_len(nfp_prog),
1583 			 pv_len(nfp_prog), ALU_OP_SUB, reg_a(2 * 2));
1584 
1585 		wrp_immed(nfp_prog, reg_both(0), 0);
1586 		wrp_immed(nfp_prog, reg_both(1), 0);
1587 
1588 		/* TODO: when adjust head is guaranteed to succeed we can
1589 		 * also eliminate the following if (r0 == 0) branch.
1590 		 */
1591 
1592 		return 0;
1593 	}
1594 
1595 	ret_einval = nfp_prog_current_offset(nfp_prog) + 14;
1596 	end = ret_einval + 2;
1597 
1598 	/* We need to use a temp because offset is just a part of the pkt ptr */
1599 	emit_alu(nfp_prog, tmp,
1600 		 reg_a(2 * 2), ALU_OP_ADD_2B, pptr_reg(nfp_prog));
1601 
1602 	/* Validate result will fit within FW datapath constraints */
1603 	emit_alu(nfp_prog, reg_none(),
1604 		 tmp, ALU_OP_SUB, reg_imm(adjust_head->off_min));
1605 	emit_br(nfp_prog, BR_BLO, ret_einval, 0);
1606 	emit_alu(nfp_prog, reg_none(),
1607 		 reg_imm(adjust_head->off_max), ALU_OP_SUB, tmp);
1608 	emit_br(nfp_prog, BR_BLO, ret_einval, 0);
1609 
1610 	/* Validate the length is at least ETH_HLEN */
1611 	emit_alu(nfp_prog, tmp_len,
1612 		 plen_reg(nfp_prog), ALU_OP_SUB, reg_a(2 * 2));
1613 	emit_alu(nfp_prog, reg_none(),
1614 		 tmp_len, ALU_OP_SUB, reg_imm(ETH_HLEN));
1615 	emit_br(nfp_prog, BR_BMI, ret_einval, 0);
1616 
1617 	/* Load the ret code */
1618 	wrp_immed(nfp_prog, reg_both(0), 0);
1619 	wrp_immed(nfp_prog, reg_both(1), 0);
1620 
1621 	/* Modify the packet metadata */
1622 	emit_ld_field(nfp_prog, pptr_reg(nfp_prog), 0x3, tmp, SHF_SC_NONE, 0);
1623 
1624 	/* Skip over the -EINVAL ret code (defer 2) */
1625 	emit_br(nfp_prog, BR_UNC, end, 2);
1626 
1627 	emit_alu(nfp_prog, plen_reg(nfp_prog),
1628 		 plen_reg(nfp_prog), ALU_OP_SUB, reg_a(2 * 2));
1629 	emit_alu(nfp_prog, pv_len(nfp_prog),
1630 		 pv_len(nfp_prog), ALU_OP_SUB, reg_a(2 * 2));
1631 
1632 	/* return -EINVAL target */
1633 	if (!nfp_prog_confirm_current_offset(nfp_prog, ret_einval))
1634 		return -EINVAL;
1635 
1636 	wrp_immed(nfp_prog, reg_both(0), -22);
1637 	wrp_immed(nfp_prog, reg_both(1), ~0);
1638 
1639 	if (!nfp_prog_confirm_current_offset(nfp_prog, end))
1640 		return -EINVAL;
1641 
1642 	return 0;
1643 }
1644 
1645 static int adjust_tail(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1646 {
1647 	u32 ret_einval, end;
1648 	swreg plen, delta;
1649 
1650 	BUILD_BUG_ON(plen_reg(nfp_prog) != reg_b(STATIC_REG_PKT_LEN));
1651 
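	/* BPF argument registers map to GPR pairs (RN -> GPRs 2N/2N + 1), so
	 * reg_a(2 * 2) below is the low word of R2, i.e. the length delta.
	 */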
1652 	plen = imm_a(nfp_prog);
1653 	delta = reg_a(2 * 2);
1654 
1655 	ret_einval = nfp_prog_current_offset(nfp_prog) + 9;
1656 	end = nfp_prog_current_offset(nfp_prog) + 11;
1657 
1658 	/* Calculate resulting length */
1659 	emit_alu(nfp_prog, plen, plen_reg(nfp_prog), ALU_OP_ADD, delta);
1660 	/* delta == 0 is not allowed by the kernel, add must overflow to make
1661 	 * length smaller.
1662 	 */
1663 	emit_br(nfp_prog, BR_BCC, ret_einval, 0);
1664 
1665 	/* if (new_len < 14) then -EINVAL */
1666 	emit_alu(nfp_prog, reg_none(), plen, ALU_OP_SUB, reg_imm(ETH_HLEN));
1667 	emit_br(nfp_prog, BR_BMI, ret_einval, 0);
1668 
1669 	emit_alu(nfp_prog, plen_reg(nfp_prog),
1670 		 plen_reg(nfp_prog), ALU_OP_ADD, delta);
1671 	emit_alu(nfp_prog, pv_len(nfp_prog),
1672 		 pv_len(nfp_prog), ALU_OP_ADD, delta);
1673 
1674 	emit_br(nfp_prog, BR_UNC, end, 2);
1675 	wrp_immed(nfp_prog, reg_both(0), 0);
1676 	wrp_immed(nfp_prog, reg_both(1), 0);
1677 
1678 	if (!nfp_prog_confirm_current_offset(nfp_prog, ret_einval))
1679 		return -EINVAL;
1680 
1681 	wrp_immed(nfp_prog, reg_both(0), -22);
1682 	wrp_immed(nfp_prog, reg_both(1), ~0);
1683 
1684 	if (!nfp_prog_confirm_current_offset(nfp_prog, end))
1685 		return -EINVAL;
1686 
1687 	return 0;
1688 }
1689 
1690 static int
1691 map_call_stack_common(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1692 {
1693 	bool load_lm_ptr;
1694 	u32 ret_tgt;
1695 	s64 lm_off;
1696 
1697 	/* We only have to reload LM0 if the key is not at start of stack */
1698 	lm_off = nfp_prog->stack_frame_depth;
1699 	lm_off += meta->arg2.reg.var_off.value + meta->arg2.reg.off;
1700 	load_lm_ptr = meta->arg2.var_off || lm_off;
1701 
1702 	/* Set LM0 to start of key */
1703 	if (load_lm_ptr)
1704 		emit_csr_wr(nfp_prog, reg_b(2 * 2), NFP_CSR_ACT_LM_ADDR0);
1705 	if (meta->func_id == BPF_FUNC_map_update_elem)
1706 		emit_csr_wr(nfp_prog, reg_b(3 * 2), NFP_CSR_ACT_LM_ADDR2);
1707 
1708 	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO + meta->func_id,
1709 		     2, RELO_BR_HELPER);
1710 	ret_tgt = nfp_prog_current_offset(nfp_prog) + 2;
1711 
1712 	/* Load map ID into A0 */
1713 	wrp_mov(nfp_prog, reg_a(0), reg_a(2));
1714 
1715 	/* Load the return address into B0 */
1716 	wrp_immed_relo(nfp_prog, reg_b(0), ret_tgt, RELO_IMMED_REL);
1717 
1718 	if (!nfp_prog_confirm_current_offset(nfp_prog, ret_tgt))
1719 		return -EINVAL;
1720 
1721 	/* Reset the LM0 pointer */
1722 	if (!load_lm_ptr)
1723 		return 0;
1724 
1725 	emit_csr_wr(nfp_prog, stack_reg(nfp_prog), NFP_CSR_ACT_LM_ADDR0);
1726 	wrp_nops(nfp_prog, 3);
1727 
1728 	return 0;
1729 }
1730 
1731 static int
1732 nfp_get_prandom_u32(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1733 {
1734 	__emit_csr_rd(nfp_prog, NFP_CSR_PSEUDO_RND_NUM);
1735 	/* CSR value is read in following immed[gpr, 0] */
1736 	emit_immed(nfp_prog, reg_both(0), 0,
1737 		   IMMED_WIDTH_ALL, false, IMMED_SHIFT_0B);
1738 	emit_immed(nfp_prog, reg_both(1), 0,
1739 		   IMMED_WIDTH_ALL, false, IMMED_SHIFT_0B);
1740 	return 0;
1741 }
1742 
1743 static int
1744 nfp_perf_event_output(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1745 {
1746 	swreg ptr_type;
1747 	u32 ret_tgt;
1748 
1749 	ptr_type = ur_load_imm_any(nfp_prog, meta->arg1.type, imm_a(nfp_prog));
1750 
1751 	ret_tgt = nfp_prog_current_offset(nfp_prog) + 3;
1752 
1753 	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO + meta->func_id,
1754 		     2, RELO_BR_HELPER);
1755 
1756 	/* Load ptr type into A1 */
1757 	wrp_mov(nfp_prog, reg_a(1), ptr_type);
1758 
1759 	/* Load the return address into B0 */
1760 	wrp_immed_relo(nfp_prog, reg_b(0), ret_tgt, RELO_IMMED_REL);
1761 
1762 	if (!nfp_prog_confirm_current_offset(nfp_prog, ret_tgt))
1763 		return -EINVAL;
1764 
1765 	return 0;
1766 }
1767 
1768 static int
1769 nfp_queue_select(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1770 {
1771 	u32 jmp_tgt;
1772 
1773 	jmp_tgt = nfp_prog_current_offset(nfp_prog) + 5;
1774 
1775 	/* Make sure the queue id fits into FW field */
1776 	emit_alu(nfp_prog, reg_none(), reg_a(meta->insn.src_reg * 2),
1777 		 ALU_OP_AND_NOT_B, reg_imm(0xff));
1778 	emit_br(nfp_prog, BR_BEQ, jmp_tgt, 2);
1779 
1780 	/* Set the 'queue selected' bit and the queue value */
1781 	emit_shf(nfp_prog, pv_qsel_set(nfp_prog),
1782 		 pv_qsel_set(nfp_prog), SHF_OP_OR, reg_imm(1),
1783 		 SHF_SC_L_SHF, PKT_VEL_QSEL_SET_BIT);
1784 	emit_ld_field(nfp_prog,
1785 		      pv_qsel_val(nfp_prog), 0x1, reg_b(meta->insn.src_reg * 2),
1786 		      SHF_SC_NONE, 0);
	/* Delay slots end here; we will jump over the next instruction if the
	 * queue value fits into the field.
	 */
1790 	emit_ld_field(nfp_prog,
1791 		      pv_qsel_val(nfp_prog), 0x1, reg_imm(NFP_NET_RXR_MAX),
1792 		      SHF_SC_NONE, 0);
1793 
1794 	if (!nfp_prog_confirm_current_offset(nfp_prog, jmp_tgt))
1795 		return -EINVAL;
1796 
1797 	return 0;
1798 }
1799 
1800 /* --- Callbacks --- */
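/* Each 64-bit eBPF register is mapped onto a pair of adjacent 32-bit NFP
 * GPRs, hence the "* 2" indexing throughout: the low word lives in
 * reg N * 2 and the high word in reg N * 2 + 1.
 */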
1801 static int mov_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1802 {
1803 	const struct bpf_insn *insn = &meta->insn;
1804 	u8 dst = insn->dst_reg * 2;
1805 	u8 src = insn->src_reg * 2;
1806 
1807 	if (insn->src_reg == BPF_REG_10) {
1808 		swreg stack_depth_reg;
1809 
1810 		stack_depth_reg = ur_load_imm_any(nfp_prog,
1811 						  nfp_prog->stack_frame_depth,
1812 						  stack_imm(nfp_prog));
1813 		emit_alu(nfp_prog, reg_both(dst), stack_reg(nfp_prog),
1814 			 ALU_OP_ADD, stack_depth_reg);
1815 		wrp_immed(nfp_prog, reg_both(dst + 1), 0);
1816 	} else {
1817 		wrp_reg_mov(nfp_prog, dst, src);
1818 		wrp_reg_mov(nfp_prog, dst + 1, src + 1);
1819 	}
1820 
1821 	return 0;
1822 }
1823 
1824 static int mov_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1825 {
1826 	u64 imm = meta->insn.imm; /* sign extend */
1827 
1828 	wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2), imm & ~0U);
1829 	wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1), imm >> 32);
1830 
1831 	return 0;
1832 }
1833 
1834 static int xor_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1835 {
1836 	return wrp_alu64_reg(nfp_prog, meta, ALU_OP_XOR);
1837 }
1838 
1839 static int xor_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1840 {
1841 	return wrp_alu64_imm(nfp_prog, meta, ALU_OP_XOR, !meta->insn.imm);
1842 }
1843 
1844 static int and_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1845 {
1846 	return wrp_alu64_reg(nfp_prog, meta, ALU_OP_AND);
1847 }
1848 
1849 static int and_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1850 {
1851 	return wrp_alu64_imm(nfp_prog, meta, ALU_OP_AND, !~meta->insn.imm);
1852 }
1853 
1854 static int or_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1855 {
1856 	return wrp_alu64_reg(nfp_prog, meta, ALU_OP_OR);
1857 }
1858 
1859 static int or_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1860 {
1861 	return wrp_alu64_imm(nfp_prog, meta, ALU_OP_OR, !meta->insn.imm);
1862 }
1863 
1864 static int add_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1865 {
1866 	const struct bpf_insn *insn = &meta->insn;
1867 
1868 	emit_alu(nfp_prog, reg_both(insn->dst_reg * 2),
1869 		 reg_a(insn->dst_reg * 2), ALU_OP_ADD,
1870 		 reg_b(insn->src_reg * 2));
1871 	emit_alu(nfp_prog, reg_both(insn->dst_reg * 2 + 1),
1872 		 reg_a(insn->dst_reg * 2 + 1), ALU_OP_ADD_C,
1873 		 reg_b(insn->src_reg * 2 + 1));
1874 
1875 	return 0;
1876 }
1877 
1878 static int add_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1879 {
1880 	const struct bpf_insn *insn = &meta->insn;
1881 	u64 imm = insn->imm; /* sign extend */
1882 
1883 	wrp_alu_imm(nfp_prog, insn->dst_reg * 2, ALU_OP_ADD, imm & ~0U);
1884 	wrp_alu_imm(nfp_prog, insn->dst_reg * 2 + 1, ALU_OP_ADD_C, imm >> 32);
1885 
1886 	return 0;
1887 }
1888 
1889 static int sub_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1890 {
1891 	const struct bpf_insn *insn = &meta->insn;
1892 
1893 	emit_alu(nfp_prog, reg_both(insn->dst_reg * 2),
1894 		 reg_a(insn->dst_reg * 2), ALU_OP_SUB,
1895 		 reg_b(insn->src_reg * 2));
1896 	emit_alu(nfp_prog, reg_both(insn->dst_reg * 2 + 1),
1897 		 reg_a(insn->dst_reg * 2 + 1), ALU_OP_SUB_C,
1898 		 reg_b(insn->src_reg * 2 + 1));
1899 
1900 	return 0;
1901 }
1902 
1903 static int sub_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1904 {
1905 	const struct bpf_insn *insn = &meta->insn;
1906 	u64 imm = insn->imm; /* sign extend */
1907 
1908 	wrp_alu_imm(nfp_prog, insn->dst_reg * 2, ALU_OP_SUB, imm & ~0U);
1909 	wrp_alu_imm(nfp_prog, insn->dst_reg * 2 + 1, ALU_OP_SUB_C, imm >> 32);
1910 
1911 	return 0;
1912 }
1913 
1914 static int mul_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1915 {
1916 	return wrp_mul(nfp_prog, meta, true, true);
1917 }
1918 
1919 static int mul_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1920 {
1921 	return wrp_mul(nfp_prog, meta, true, false);
1922 }
1923 
1924 static int div_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1925 {
1926 	const struct bpf_insn *insn = &meta->insn;
1927 
1928 	return wrp_div_imm(nfp_prog, insn->dst_reg * 2, insn->imm);
1929 }
1930 
1931 static int div_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1932 {
	/* NOTE: the verifier hook has rejected cases where the verifier
	 * doesn't know whether the source operand is constant or not.
	 */
1936 	return wrp_div_imm(nfp_prog, meta->insn.dst_reg * 2, meta->umin_src);
1937 }
1938 
1939 static int neg_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1940 {
1941 	const struct bpf_insn *insn = &meta->insn;
1942 
1943 	emit_alu(nfp_prog, reg_both(insn->dst_reg * 2), reg_imm(0),
1944 		 ALU_OP_SUB, reg_b(insn->dst_reg * 2));
1945 	emit_alu(nfp_prog, reg_both(insn->dst_reg * 2 + 1), reg_imm(0),
1946 		 ALU_OP_SUB_C, reg_b(insn->dst_reg * 2 + 1));
1947 
1948 	return 0;
1949 }
1950 
1951 /* Pseudo code:
1952  *   if shift_amt >= 32
1953  *     dst_high = dst_low << shift_amt[4:0]
1954  *     dst_low = 0;
1955  *   else
1956  *     dst_high = (dst_high, dst_low) >> (32 - shift_amt)
1957  *     dst_low = dst_low << shift_amt
1958  *
1959  * The indirect shift will use the same logic at runtime.
1960  */
1961 static int __shl_imm64(struct nfp_prog *nfp_prog, u8 dst, u8 shift_amt)
1962 {
1963 	if (shift_amt < 32) {
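		/* Double-register shift: concatenate (dst_high, dst_low) and
		 * shift right by 32 - shift_amt to form the new high word.
		 */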
1964 		emit_shf(nfp_prog, reg_both(dst + 1), reg_a(dst + 1),
1965 			 SHF_OP_NONE, reg_b(dst), SHF_SC_R_DSHF,
1966 			 32 - shift_amt);
1967 		emit_shf(nfp_prog, reg_both(dst), reg_none(), SHF_OP_NONE,
1968 			 reg_b(dst), SHF_SC_L_SHF, shift_amt);
1969 	} else if (shift_amt == 32) {
1970 		wrp_reg_mov(nfp_prog, dst + 1, dst);
1971 		wrp_immed(nfp_prog, reg_both(dst), 0);
1972 	} else if (shift_amt > 32) {
1973 		emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_NONE,
1974 			 reg_b(dst), SHF_SC_L_SHF, shift_amt - 32);
1975 		wrp_immed(nfp_prog, reg_both(dst), 0);
1976 	}
1977 
1978 	return 0;
1979 }
1980 
1981 static int shl_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1982 {
1983 	const struct bpf_insn *insn = &meta->insn;
1984 	u8 dst = insn->dst_reg * 2;
1985 
1986 	return __shl_imm64(nfp_prog, dst, insn->imm);
1987 }
1988 
1989 static void shl_reg64_lt32_high(struct nfp_prog *nfp_prog, u8 dst, u8 src)
1990 {
1991 	emit_alu(nfp_prog, imm_both(nfp_prog), reg_imm(32), ALU_OP_SUB,
1992 		 reg_b(src));
1993 	emit_alu(nfp_prog, reg_none(), imm_a(nfp_prog), ALU_OP_OR, reg_imm(0));
1994 	emit_shf_indir(nfp_prog, reg_both(dst + 1), reg_a(dst + 1), SHF_OP_NONE,
1995 		       reg_b(dst), SHF_SC_R_DSHF);
1996 }
1997 
1998 /* NOTE: for indirect left shift, HIGH part should be calculated first. */
1999 static void shl_reg64_lt32_low(struct nfp_prog *nfp_prog, u8 dst, u8 src)
2000 {
2001 	emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_imm(0));
2002 	emit_shf_indir(nfp_prog, reg_both(dst), reg_none(), SHF_OP_NONE,
2003 		       reg_b(dst), SHF_SC_L_SHF);
2004 }
2005 
2006 static void shl_reg64_lt32(struct nfp_prog *nfp_prog, u8 dst, u8 src)
2007 {
2008 	shl_reg64_lt32_high(nfp_prog, dst, src);
2009 	shl_reg64_lt32_low(nfp_prog, dst, src);
2010 }
2011 
2012 static void shl_reg64_ge32(struct nfp_prog *nfp_prog, u8 dst, u8 src)
2013 {
2014 	emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_imm(0));
2015 	emit_shf_indir(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_NONE,
2016 		       reg_b(dst), SHF_SC_L_SHF);
2017 	wrp_immed(nfp_prog, reg_both(dst), 0);
2018 }
2019 
2020 static int shl_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2021 {
2022 	const struct bpf_insn *insn = &meta->insn;
2023 	u64 umin, umax;
2024 	u8 dst, src;
2025 
2026 	dst = insn->dst_reg * 2;
2027 	umin = meta->umin_src;
2028 	umax = meta->umax_src;
2029 	if (umin == umax)
2030 		return __shl_imm64(nfp_prog, dst, umin);
2031 
2032 	src = insn->src_reg * 2;
2033 	if (umax < 32) {
2034 		shl_reg64_lt32(nfp_prog, dst, src);
2035 	} else if (umin >= 32) {
2036 		shl_reg64_ge32(nfp_prog, dst, src);
2037 	} else {
2038 		/* Generate different instruction sequences depending on runtime
2039 		 * value of shift amount.
2040 		 */
2041 		u16 label_ge32, label_end;
2042 
2043 		label_ge32 = nfp_prog_current_offset(nfp_prog) + 7;
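		/* Bit 5 of the shift amount set means shift_amt >= 32. */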
2044 		emit_br_bset(nfp_prog, reg_a(src), 5, label_ge32, 0);
2045 
2046 		shl_reg64_lt32_high(nfp_prog, dst, src);
2047 		label_end = nfp_prog_current_offset(nfp_prog) + 6;
2048 		emit_br(nfp_prog, BR_UNC, label_end, 2);
2049 		/* shl_reg64_lt32_low packed in delay slot. */
2050 		shl_reg64_lt32_low(nfp_prog, dst, src);
2051 
2052 		if (!nfp_prog_confirm_current_offset(nfp_prog, label_ge32))
2053 			return -EINVAL;
2054 		shl_reg64_ge32(nfp_prog, dst, src);
2055 
2056 		if (!nfp_prog_confirm_current_offset(nfp_prog, label_end))
2057 			return -EINVAL;
2058 	}
2059 
2060 	return 0;
2061 }
2062 
/* Pseudo code:
 *   if shift_amt >= 32
 *     dst_low = dst_high >> shift_amt[4:0]
 *     dst_high = 0
 *   else
 *     dst_low = (dst_high, dst_low) >> shift_amt
 *     dst_high = dst_high >> shift_amt
 *
 * The indirect shift will use the same logic at runtime.
 */
2073 static int __shr_imm64(struct nfp_prog *nfp_prog, u8 dst, u8 shift_amt)
2074 {
2075 	if (shift_amt < 32) {
2076 		emit_shf(nfp_prog, reg_both(dst), reg_a(dst + 1), SHF_OP_NONE,
2077 			 reg_b(dst), SHF_SC_R_DSHF, shift_amt);
2078 		emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_NONE,
2079 			 reg_b(dst + 1), SHF_SC_R_SHF, shift_amt);
2080 	} else if (shift_amt == 32) {
2081 		wrp_reg_mov(nfp_prog, dst, dst + 1);
2082 		wrp_immed(nfp_prog, reg_both(dst + 1), 0);
2083 	} else if (shift_amt > 32) {
2084 		emit_shf(nfp_prog, reg_both(dst), reg_none(), SHF_OP_NONE,
2085 			 reg_b(dst + 1), SHF_SC_R_SHF, shift_amt - 32);
2086 		wrp_immed(nfp_prog, reg_both(dst + 1), 0);
2087 	}
2088 
2089 	return 0;
2090 }
2091 
2092 static int shr_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2093 {
2094 	const struct bpf_insn *insn = &meta->insn;
2095 	u8 dst = insn->dst_reg * 2;
2096 
2097 	return __shr_imm64(nfp_prog, dst, insn->imm);
2098 }
2099 
2100 /* NOTE: for indirect right shift, LOW part should be calculated first. */
2101 static void shr_reg64_lt32_high(struct nfp_prog *nfp_prog, u8 dst, u8 src)
2102 {
2103 	emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_imm(0));
2104 	emit_shf_indir(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_NONE,
2105 		       reg_b(dst + 1), SHF_SC_R_SHF);
2106 }
2107 
2108 static void shr_reg64_lt32_low(struct nfp_prog *nfp_prog, u8 dst, u8 src)
2109 {
2110 	emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_imm(0));
2111 	emit_shf_indir(nfp_prog, reg_both(dst), reg_a(dst + 1), SHF_OP_NONE,
2112 		       reg_b(dst), SHF_SC_R_DSHF);
2113 }
2114 
2115 static void shr_reg64_lt32(struct nfp_prog *nfp_prog, u8 dst, u8 src)
2116 {
2117 	shr_reg64_lt32_low(nfp_prog, dst, src);
2118 	shr_reg64_lt32_high(nfp_prog, dst, src);
2119 }
2120 
2121 static void shr_reg64_ge32(struct nfp_prog *nfp_prog, u8 dst, u8 src)
2122 {
2123 	emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_imm(0));
2124 	emit_shf_indir(nfp_prog, reg_both(dst), reg_none(), SHF_OP_NONE,
2125 		       reg_b(dst + 1), SHF_SC_R_SHF);
2126 	wrp_immed(nfp_prog, reg_both(dst + 1), 0);
2127 }
2128 
2129 static int shr_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2130 {
2131 	const struct bpf_insn *insn = &meta->insn;
2132 	u64 umin, umax;
2133 	u8 dst, src;
2134 
2135 	dst = insn->dst_reg * 2;
2136 	umin = meta->umin_src;
2137 	umax = meta->umax_src;
2138 	if (umin == umax)
2139 		return __shr_imm64(nfp_prog, dst, umin);
2140 
2141 	src = insn->src_reg * 2;
2142 	if (umax < 32) {
2143 		shr_reg64_lt32(nfp_prog, dst, src);
2144 	} else if (umin >= 32) {
2145 		shr_reg64_ge32(nfp_prog, dst, src);
2146 	} else {
2147 		/* Generate different instruction sequences depending on runtime
2148 		 * value of shift amount.
2149 		 */
2150 		u16 label_ge32, label_end;
2151 
2152 		label_ge32 = nfp_prog_current_offset(nfp_prog) + 6;
2153 		emit_br_bset(nfp_prog, reg_a(src), 5, label_ge32, 0);
2154 		shr_reg64_lt32_low(nfp_prog, dst, src);
2155 		label_end = nfp_prog_current_offset(nfp_prog) + 6;
2156 		emit_br(nfp_prog, BR_UNC, label_end, 2);
2157 		/* shr_reg64_lt32_high packed in delay slot. */
2158 		shr_reg64_lt32_high(nfp_prog, dst, src);
2159 
2160 		if (!nfp_prog_confirm_current_offset(nfp_prog, label_ge32))
2161 			return -EINVAL;
2162 		shr_reg64_ge32(nfp_prog, dst, src);
2163 
2164 		if (!nfp_prog_confirm_current_offset(nfp_prog, label_end))
2165 			return -EINVAL;
2166 	}
2167 
2168 	return 0;
2169 }
2170 
/* Code logic is the same as __shr_imm64, except that arithmetic shift right
 * needs the signedness bit, which is supplied to the shifter through the
 * PREV_ALU result.
 */
2174 static int __ashr_imm64(struct nfp_prog *nfp_prog, u8 dst, u8 shift_amt)
2175 {
2176 	if (shift_amt < 32) {
2177 		emit_shf(nfp_prog, reg_both(dst), reg_a(dst + 1), SHF_OP_NONE,
2178 			 reg_b(dst), SHF_SC_R_DSHF, shift_amt);
2179 		/* Set signedness bit. */
2180 		emit_alu(nfp_prog, reg_none(), reg_a(dst + 1), ALU_OP_OR,
2181 			 reg_imm(0));
2182 		emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR,
2183 			 reg_b(dst + 1), SHF_SC_R_SHF, shift_amt);
2184 	} else if (shift_amt == 32) {
		/* NOTE: this also helps set the signedness bit. */
2186 		wrp_reg_mov(nfp_prog, dst, dst + 1);
2187 		emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR,
2188 			 reg_b(dst + 1), SHF_SC_R_SHF, 31);
2189 	} else if (shift_amt > 32) {
2190 		emit_alu(nfp_prog, reg_none(), reg_a(dst + 1), ALU_OP_OR,
2191 			 reg_imm(0));
2192 		emit_shf(nfp_prog, reg_both(dst), reg_none(), SHF_OP_ASHR,
2193 			 reg_b(dst + 1), SHF_SC_R_SHF, shift_amt - 32);
2194 		emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR,
2195 			 reg_b(dst + 1), SHF_SC_R_SHF, 31);
2196 	}
2197 
2198 	return 0;
2199 }
2200 
2201 static int ashr_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2202 {
2203 	const struct bpf_insn *insn = &meta->insn;
2204 	u8 dst = insn->dst_reg * 2;
2205 
2206 	return __ashr_imm64(nfp_prog, dst, insn->imm);
2207 }
2208 
2209 static void ashr_reg64_lt32_high(struct nfp_prog *nfp_prog, u8 dst, u8 src)
2210 {
	/* NOTE: the first insn will set both the indirect shift amount
	 * (source A) and the signedness bit (MSB of the result).
	 */
2214 	emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_b(dst + 1));
2215 	emit_shf_indir(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR,
2216 		       reg_b(dst + 1), SHF_SC_R_SHF);
2217 }
2218 
2219 static void ashr_reg64_lt32_low(struct nfp_prog *nfp_prog, u8 dst, u8 src)
2220 {
	/* NOTE: this is the same as a logical shift because we don't need to
	 * shift in the signedness bit when the shift amount is less than 32.
	 */
2224 	return shr_reg64_lt32_low(nfp_prog, dst, src);
2225 }
2226 
2227 static void ashr_reg64_lt32(struct nfp_prog *nfp_prog, u8 dst, u8 src)
2228 {
2229 	ashr_reg64_lt32_low(nfp_prog, dst, src);
2230 	ashr_reg64_lt32_high(nfp_prog, dst, src);
2231 }
2232 
2233 static void ashr_reg64_ge32(struct nfp_prog *nfp_prog, u8 dst, u8 src)
2234 {
2235 	emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_b(dst + 1));
2236 	emit_shf_indir(nfp_prog, reg_both(dst), reg_none(), SHF_OP_ASHR,
2237 		       reg_b(dst + 1), SHF_SC_R_SHF);
2238 	emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR,
2239 		 reg_b(dst + 1), SHF_SC_R_SHF, 31);
2240 }
2241 
/* Like ashr_imm64, but needs to use an indirect shift. */
2243 static int ashr_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2244 {
2245 	const struct bpf_insn *insn = &meta->insn;
2246 	u64 umin, umax;
2247 	u8 dst, src;
2248 
2249 	dst = insn->dst_reg * 2;
2250 	umin = meta->umin_src;
2251 	umax = meta->umax_src;
2252 	if (umin == umax)
2253 		return __ashr_imm64(nfp_prog, dst, umin);
2254 
2255 	src = insn->src_reg * 2;
2256 	if (umax < 32) {
2257 		ashr_reg64_lt32(nfp_prog, dst, src);
2258 	} else if (umin >= 32) {
2259 		ashr_reg64_ge32(nfp_prog, dst, src);
2260 	} else {
2261 		u16 label_ge32, label_end;
2262 
2263 		label_ge32 = nfp_prog_current_offset(nfp_prog) + 6;
2264 		emit_br_bset(nfp_prog, reg_a(src), 5, label_ge32, 0);
2265 		ashr_reg64_lt32_low(nfp_prog, dst, src);
2266 		label_end = nfp_prog_current_offset(nfp_prog) + 6;
2267 		emit_br(nfp_prog, BR_UNC, label_end, 2);
2268 		/* ashr_reg64_lt32_high packed in delay slot. */
2269 		ashr_reg64_lt32_high(nfp_prog, dst, src);
2270 
2271 		if (!nfp_prog_confirm_current_offset(nfp_prog, label_ge32))
2272 			return -EINVAL;
2273 		ashr_reg64_ge32(nfp_prog, dst, src);
2274 
2275 		if (!nfp_prog_confirm_current_offset(nfp_prog, label_end))
2276 			return -EINVAL;
2277 	}
2278 
2279 	return 0;
2280 }
2281 
2282 static int mov_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2283 {
2284 	const struct bpf_insn *insn = &meta->insn;
2285 
2286 	wrp_reg_mov(nfp_prog, insn->dst_reg * 2,  insn->src_reg * 2);
2287 	wrp_immed(nfp_prog, reg_both(insn->dst_reg * 2 + 1), 0);
2288 
2289 	return 0;
2290 }
2291 
2292 static int mov_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2293 {
2294 	const struct bpf_insn *insn = &meta->insn;
2295 
2296 	wrp_immed(nfp_prog, reg_both(insn->dst_reg * 2), insn->imm);
2297 	wrp_immed(nfp_prog, reg_both(insn->dst_reg * 2 + 1), 0);
2298 
2299 	return 0;
2300 }
2301 
2302 static int xor_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2303 {
2304 	return wrp_alu32_reg(nfp_prog, meta, ALU_OP_XOR);
2305 }
2306 
2307 static int xor_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2308 {
2309 	return wrp_alu32_imm(nfp_prog, meta, ALU_OP_XOR, !~meta->insn.imm);
2310 }
2311 
2312 static int and_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2313 {
2314 	return wrp_alu32_reg(nfp_prog, meta, ALU_OP_AND);
2315 }
2316 
2317 static int and_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2318 {
2319 	return wrp_alu32_imm(nfp_prog, meta, ALU_OP_AND, !~meta->insn.imm);
2320 }
2321 
2322 static int or_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2323 {
2324 	return wrp_alu32_reg(nfp_prog, meta, ALU_OP_OR);
2325 }
2326 
2327 static int or_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2328 {
2329 	return wrp_alu32_imm(nfp_prog, meta, ALU_OP_OR, !meta->insn.imm);
2330 }
2331 
2332 static int add_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2333 {
2334 	return wrp_alu32_reg(nfp_prog, meta, ALU_OP_ADD);
2335 }
2336 
2337 static int add_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2338 {
2339 	return wrp_alu32_imm(nfp_prog, meta, ALU_OP_ADD, !meta->insn.imm);
2340 }
2341 
2342 static int sub_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2343 {
2344 	return wrp_alu32_reg(nfp_prog, meta, ALU_OP_SUB);
2345 }
2346 
2347 static int sub_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2348 {
2349 	return wrp_alu32_imm(nfp_prog, meta, ALU_OP_SUB, !meta->insn.imm);
2350 }
2351 
2352 static int mul_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2353 {
2354 	return wrp_mul(nfp_prog, meta, false, true);
2355 }
2356 
2357 static int mul_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2358 {
2359 	return wrp_mul(nfp_prog, meta, false, false);
2360 }
2361 
2362 static int div_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2363 {
2364 	return div_reg64(nfp_prog, meta);
2365 }
2366 
2367 static int div_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2368 {
2369 	return div_imm64(nfp_prog, meta);
2370 }
2371 
2372 static int neg_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2373 {
2374 	u8 dst = meta->insn.dst_reg * 2;
2375 
2376 	emit_alu(nfp_prog, reg_both(dst), reg_imm(0), ALU_OP_SUB, reg_b(dst));
2377 	wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1), 0);
2378 
2379 	return 0;
2380 }
2381 
2382 static int shl_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2383 {
2384 	const struct bpf_insn *insn = &meta->insn;
2385 
2386 	if (!insn->imm)
2387 		return 1; /* TODO: zero shift means indirect */
2388 
2389 	emit_shf(nfp_prog, reg_both(insn->dst_reg * 2),
2390 		 reg_none(), SHF_OP_NONE, reg_b(insn->dst_reg * 2),
2391 		 SHF_SC_L_SHF, insn->imm);
2392 	wrp_immed(nfp_prog, reg_both(insn->dst_reg * 2 + 1), 0);
2393 
2394 	return 0;
2395 }
2396 
2397 static int end_reg32(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2398 {
2399 	const struct bpf_insn *insn = &meta->insn;
2400 	u8 gpr = insn->dst_reg * 2;
2401 
2402 	switch (insn->imm) {
2403 	case 16:
2404 		emit_ld_field(nfp_prog, reg_both(gpr), 0x9, reg_b(gpr),
2405 			      SHF_SC_R_ROT, 8);
2406 		emit_ld_field(nfp_prog, reg_both(gpr), 0xe, reg_a(gpr),
2407 			      SHF_SC_R_SHF, 16);
2408 
2409 		wrp_immed(nfp_prog, reg_both(gpr + 1), 0);
2410 		break;
2411 	case 32:
2412 		wrp_end32(nfp_prog, reg_a(gpr), gpr);
2413 		wrp_immed(nfp_prog, reg_both(gpr + 1), 0);
2414 		break;
2415 	case 64:
2416 		wrp_mov(nfp_prog, imm_a(nfp_prog), reg_b(gpr + 1));
2417 
2418 		wrp_end32(nfp_prog, reg_a(gpr), gpr + 1);
2419 		wrp_end32(nfp_prog, imm_a(nfp_prog), gpr);
2420 		break;
2421 	}
2422 
2423 	return 0;
2424 }
2425 
2426 static int imm_ld8_part2(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2427 {
2428 	struct nfp_insn_meta *prev = nfp_meta_prev(meta);
2429 	u32 imm_lo, imm_hi;
2430 	u8 dst;
2431 
2432 	dst = prev->insn.dst_reg * 2;
2433 	imm_lo = prev->insn.imm;
2434 	imm_hi = meta->insn.imm;
2435 
2436 	wrp_immed(nfp_prog, reg_both(dst), imm_lo);
2437 
2438 	/* mov is always 1 insn, load imm may be two, so try to use mov */
2439 	if (imm_hi == imm_lo)
2440 		wrp_mov(nfp_prog, reg_both(dst + 1), reg_a(dst));
2441 	else
2442 		wrp_immed(nfp_prog, reg_both(dst + 1), imm_hi);
2443 
2444 	return 0;
2445 }
2446 
2447 static int imm_ld8(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2448 {
2449 	meta->double_cb = imm_ld8_part2;
2450 	return 0;
2451 }
2452 
2453 static int data_ld1(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2454 {
2455 	return construct_data_ld(nfp_prog, meta->insn.imm, 1);
2456 }
2457 
2458 static int data_ld2(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2459 {
2460 	return construct_data_ld(nfp_prog, meta->insn.imm, 2);
2461 }
2462 
2463 static int data_ld4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2464 {
2465 	return construct_data_ld(nfp_prog, meta->insn.imm, 4);
2466 }
2467 
2468 static int data_ind_ld1(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2469 {
2470 	return construct_data_ind_ld(nfp_prog, meta->insn.imm,
2471 				     meta->insn.src_reg * 2, 1);
2472 }
2473 
2474 static int data_ind_ld2(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2475 {
2476 	return construct_data_ind_ld(nfp_prog, meta->insn.imm,
2477 				     meta->insn.src_reg * 2, 2);
2478 }
2479 
2480 static int data_ind_ld4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2481 {
2482 	return construct_data_ind_ld(nfp_prog, meta->insn.imm,
2483 				     meta->insn.src_reg * 2, 4);
2484 }
2485 
2486 static int
2487 mem_ldx_stack(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2488 	      unsigned int size, unsigned int ptr_off)
2489 {
2490 	return mem_op_stack(nfp_prog, meta, size, ptr_off,
2491 			    meta->insn.dst_reg * 2, meta->insn.src_reg * 2,
2492 			    true, wrp_lmem_load);
2493 }
2494 
2495 static int mem_ldx_skb(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2496 		       u8 size)
2497 {
2498 	swreg dst = reg_both(meta->insn.dst_reg * 2);
2499 
2500 	switch (meta->insn.off) {
2501 	case offsetof(struct __sk_buff, len):
2502 		if (size != FIELD_SIZEOF(struct __sk_buff, len))
2503 			return -EOPNOTSUPP;
2504 		wrp_mov(nfp_prog, dst, plen_reg(nfp_prog));
2505 		break;
2506 	case offsetof(struct __sk_buff, data):
2507 		if (size != FIELD_SIZEOF(struct __sk_buff, data))
2508 			return -EOPNOTSUPP;
2509 		wrp_mov(nfp_prog, dst, pptr_reg(nfp_prog));
2510 		break;
2511 	case offsetof(struct __sk_buff, data_end):
2512 		if (size != FIELD_SIZEOF(struct __sk_buff, data_end))
2513 			return -EOPNOTSUPP;
2514 		emit_alu(nfp_prog, dst,
2515 			 plen_reg(nfp_prog), ALU_OP_ADD, pptr_reg(nfp_prog));
2516 		break;
2517 	default:
2518 		return -EOPNOTSUPP;
2519 	}
2520 
2521 	wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1), 0);
2522 
2523 	return 0;
2524 }
2525 
2526 static int mem_ldx_xdp(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2527 		       u8 size)
2528 {
2529 	swreg dst = reg_both(meta->insn.dst_reg * 2);
2530 
2531 	switch (meta->insn.off) {
2532 	case offsetof(struct xdp_md, data):
2533 		if (size != FIELD_SIZEOF(struct xdp_md, data))
2534 			return -EOPNOTSUPP;
2535 		wrp_mov(nfp_prog, dst, pptr_reg(nfp_prog));
2536 		break;
2537 	case offsetof(struct xdp_md, data_end):
2538 		if (size != FIELD_SIZEOF(struct xdp_md, data_end))
2539 			return -EOPNOTSUPP;
2540 		emit_alu(nfp_prog, dst,
2541 			 plen_reg(nfp_prog), ALU_OP_ADD, pptr_reg(nfp_prog));
2542 		break;
2543 	default:
2544 		return -EOPNOTSUPP;
2545 	}
2546 
2547 	wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1), 0);
2548 
2549 	return 0;
2550 }
2551 
2552 static int
2553 mem_ldx_data(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2554 	     unsigned int size)
2555 {
2556 	swreg tmp_reg;
2557 
2558 	tmp_reg = re_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog));
2559 
2560 	return data_ld_host_order_addr32(nfp_prog, meta->insn.src_reg * 2,
2561 					 tmp_reg, meta->insn.dst_reg * 2, size);
2562 }
2563 
2564 static int
2565 mem_ldx_emem(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2566 	     unsigned int size)
2567 {
2568 	swreg tmp_reg;
2569 
2570 	tmp_reg = re_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog));
2571 
2572 	return data_ld_host_order_addr40(nfp_prog, meta->insn.src_reg * 2,
2573 					 tmp_reg, meta->insn.dst_reg * 2, size);
2574 }
2575 
2576 static void
2577 mem_ldx_data_init_pktcache(struct nfp_prog *nfp_prog,
2578 			   struct nfp_insn_meta *meta)
2579 {
2580 	s16 range_start = meta->pkt_cache.range_start;
2581 	s16 range_end = meta->pkt_cache.range_end;
2582 	swreg src_base, off;
2583 	u8 xfer_num, len;
2584 	bool indir;
2585 
2586 	off = re_load_imm_any(nfp_prog, range_start, imm_b(nfp_prog));
2587 	src_base = reg_a(meta->insn.src_reg * 2);
2588 	len = range_end - range_start;
2589 	xfer_num = round_up(len, REG_WIDTH) / REG_WIDTH;
2590 
2591 	indir = len > 8 * REG_WIDTH;
	/* Set up PREV_ALU for indirect mode. */
2593 	if (indir)
2594 		wrp_immed(nfp_prog, reg_none(),
2595 			  CMD_OVE_LEN | FIELD_PREP(CMD_OV_LEN, xfer_num - 1));
2596 
2597 	/* Cache memory into transfer-in registers. */
2598 	emit_cmd_any(nfp_prog, CMD_TGT_READ32_SWAP, CMD_MODE_32b, 0, src_base,
2599 		     off, xfer_num - 1, CMD_CTX_SWAP, indir);
2600 }
2601 
2602 static int
2603 mem_ldx_data_from_pktcache_unaligned(struct nfp_prog *nfp_prog,
2604 				     struct nfp_insn_meta *meta,
2605 				     unsigned int size)
2606 {
2607 	s16 range_start = meta->pkt_cache.range_start;
2608 	s16 insn_off = meta->insn.off - range_start;
2609 	swreg dst_lo, dst_hi, src_lo, src_mid;
2610 	u8 dst_gpr = meta->insn.dst_reg * 2;
2611 	u8 len_lo = size, len_mid = 0;
2612 	u8 idx = insn_off / REG_WIDTH;
2613 	u8 off = insn_off % REG_WIDTH;
2614 
2615 	dst_hi = reg_both(dst_gpr + 1);
2616 	dst_lo = reg_both(dst_gpr);
2617 	src_lo = reg_xfer(idx);
2618 
2619 	/* The read length could involve as many as three registers. */
2620 	if (size > REG_WIDTH - off) {
2621 		/* Calculate the part in the second register. */
2622 		len_lo = REG_WIDTH - off;
2623 		len_mid = size - len_lo;
2624 
2625 		/* Calculate the part in the third register. */
2626 		if (size > 2 * REG_WIDTH - off)
2627 			len_mid = REG_WIDTH;
2628 	}
2629 
2630 	wrp_reg_subpart(nfp_prog, dst_lo, src_lo, len_lo, off);
2631 
2632 	if (!len_mid) {
2633 		wrp_immed(nfp_prog, dst_hi, 0);
2634 		return 0;
2635 	}
2636 
2637 	src_mid = reg_xfer(idx + 1);
2638 
2639 	if (size <= REG_WIDTH) {
2640 		wrp_reg_or_subpart(nfp_prog, dst_lo, src_mid, len_mid, len_lo);
2641 		wrp_immed(nfp_prog, dst_hi, 0);
2642 	} else {
2643 		swreg src_hi = reg_xfer(idx + 2);
2644 
2645 		wrp_reg_or_subpart(nfp_prog, dst_lo, src_mid,
2646 				   REG_WIDTH - len_lo, len_lo);
2647 		wrp_reg_subpart(nfp_prog, dst_hi, src_mid, len_lo,
2648 				REG_WIDTH - len_lo);
2649 		wrp_reg_or_subpart(nfp_prog, dst_hi, src_hi, REG_WIDTH - len_lo,
2650 				   len_lo);
2651 	}
2652 
2653 	return 0;
2654 }
2655 
2656 static int
2657 mem_ldx_data_from_pktcache_aligned(struct nfp_prog *nfp_prog,
2658 				   struct nfp_insn_meta *meta,
2659 				   unsigned int size)
2660 {
2661 	swreg dst_lo, dst_hi, src_lo;
2662 	u8 dst_gpr, idx;
2663 
2664 	idx = (meta->insn.off - meta->pkt_cache.range_start) / REG_WIDTH;
2665 	dst_gpr = meta->insn.dst_reg * 2;
2666 	dst_hi = reg_both(dst_gpr + 1);
2667 	dst_lo = reg_both(dst_gpr);
2668 	src_lo = reg_xfer(idx);
2669 
2670 	if (size < REG_WIDTH) {
2671 		wrp_reg_subpart(nfp_prog, dst_lo, src_lo, size, 0);
2672 		wrp_immed(nfp_prog, dst_hi, 0);
2673 	} else if (size == REG_WIDTH) {
2674 		wrp_mov(nfp_prog, dst_lo, src_lo);
2675 		wrp_immed(nfp_prog, dst_hi, 0);
2676 	} else {
2677 		swreg src_hi = reg_xfer(idx + 1);
2678 
2679 		wrp_mov(nfp_prog, dst_lo, src_lo);
2680 		wrp_mov(nfp_prog, dst_hi, src_hi);
2681 	}
2682 
2683 	return 0;
2684 }
2685 
2686 static int
2687 mem_ldx_data_from_pktcache(struct nfp_prog *nfp_prog,
2688 			   struct nfp_insn_meta *meta, unsigned int size)
2689 {
2690 	u8 off = meta->insn.off - meta->pkt_cache.range_start;
2691 
2692 	if (IS_ALIGNED(off, REG_WIDTH))
2693 		return mem_ldx_data_from_pktcache_aligned(nfp_prog, meta, size);
2694 
2695 	return mem_ldx_data_from_pktcache_unaligned(nfp_prog, meta, size);
2696 }
2697 
2698 static int
2699 mem_ldx(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2700 	unsigned int size)
2701 {
2702 	if (meta->ldst_gather_len)
2703 		return nfp_cpp_memcpy(nfp_prog, meta);
2704 
2705 	if (meta->ptr.type == PTR_TO_CTX) {
2706 		if (nfp_prog->type == BPF_PROG_TYPE_XDP)
2707 			return mem_ldx_xdp(nfp_prog, meta, size);
2708 		else
2709 			return mem_ldx_skb(nfp_prog, meta, size);
2710 	}
2711 
2712 	if (meta->ptr.type == PTR_TO_PACKET) {
2713 		if (meta->pkt_cache.range_end) {
2714 			if (meta->pkt_cache.do_init)
2715 				mem_ldx_data_init_pktcache(nfp_prog, meta);
2716 
2717 			return mem_ldx_data_from_pktcache(nfp_prog, meta, size);
2718 		} else {
2719 			return mem_ldx_data(nfp_prog, meta, size);
2720 		}
2721 	}
2722 
2723 	if (meta->ptr.type == PTR_TO_STACK)
2724 		return mem_ldx_stack(nfp_prog, meta, size,
2725 				     meta->ptr.off + meta->ptr.var_off.value);
2726 
2727 	if (meta->ptr.type == PTR_TO_MAP_VALUE)
2728 		return mem_ldx_emem(nfp_prog, meta, size);
2729 
2730 	return -EOPNOTSUPP;
2731 }
2732 
2733 static int mem_ldx1(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2734 {
2735 	return mem_ldx(nfp_prog, meta, 1);
2736 }
2737 
2738 static int mem_ldx2(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2739 {
2740 	return mem_ldx(nfp_prog, meta, 2);
2741 }
2742 
2743 static int mem_ldx4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2744 {
2745 	return mem_ldx(nfp_prog, meta, 4);
2746 }
2747 
2748 static int mem_ldx8(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2749 {
2750 	return mem_ldx(nfp_prog, meta, 8);
2751 }
2752 
2753 static int
2754 mem_st_data(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2755 	    unsigned int size)
2756 {
2757 	u64 imm = meta->insn.imm; /* sign extend */
2758 	swreg off_reg;
2759 
2760 	off_reg = re_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog));
2761 
2762 	return data_st_host_order(nfp_prog, meta->insn.dst_reg * 2, off_reg,
2763 				  imm, size);
2764 }
2765 
2766 static int mem_st(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2767 		  unsigned int size)
2768 {
2769 	if (meta->ptr.type == PTR_TO_PACKET)
2770 		return mem_st_data(nfp_prog, meta, size);
2771 
2772 	return -EOPNOTSUPP;
2773 }
2774 
2775 static int mem_st1(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2776 {
2777 	return mem_st(nfp_prog, meta, 1);
2778 }
2779 
2780 static int mem_st2(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2781 {
2782 	return mem_st(nfp_prog, meta, 2);
2783 }
2784 
2785 static int mem_st4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2786 {
2787 	return mem_st(nfp_prog, meta, 4);
2788 }
2789 
2790 static int mem_st8(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2791 {
2792 	return mem_st(nfp_prog, meta, 8);
2793 }
2794 
2795 static int
2796 mem_stx_data(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2797 	     unsigned int size)
2798 {
2799 	swreg off_reg;
2800 
2801 	off_reg = re_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog));
2802 
2803 	return data_stx_host_order(nfp_prog, meta->insn.dst_reg * 2, off_reg,
2804 				   meta->insn.src_reg * 2, size);
2805 }
2806 
2807 static int
2808 mem_stx_stack(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2809 	      unsigned int size, unsigned int ptr_off)
2810 {
2811 	return mem_op_stack(nfp_prog, meta, size, ptr_off,
2812 			    meta->insn.src_reg * 2, meta->insn.dst_reg * 2,
2813 			    false, wrp_lmem_store);
2814 }
2815 
2816 static int mem_stx_xdp(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2817 {
2818 	switch (meta->insn.off) {
2819 	case offsetof(struct xdp_md, rx_queue_index):
2820 		return nfp_queue_select(nfp_prog, meta);
2821 	}
2822 
2823 	WARN_ON_ONCE(1); /* verifier should have rejected bad accesses */
2824 	return -EOPNOTSUPP;
2825 }
2826 
2827 static int
2828 mem_stx(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2829 	unsigned int size)
2830 {
2831 	if (meta->ptr.type == PTR_TO_PACKET)
2832 		return mem_stx_data(nfp_prog, meta, size);
2833 
2834 	if (meta->ptr.type == PTR_TO_STACK)
2835 		return mem_stx_stack(nfp_prog, meta, size,
2836 				     meta->ptr.off + meta->ptr.var_off.value);
2837 
2838 	return -EOPNOTSUPP;
2839 }
2840 
2841 static int mem_stx1(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2842 {
2843 	return mem_stx(nfp_prog, meta, 1);
2844 }
2845 
2846 static int mem_stx2(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2847 {
2848 	return mem_stx(nfp_prog, meta, 2);
2849 }
2850 
2851 static int mem_stx4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2852 {
2853 	if (meta->ptr.type == PTR_TO_CTX)
2854 		if (nfp_prog->type == BPF_PROG_TYPE_XDP)
2855 			return mem_stx_xdp(nfp_prog, meta);
2856 	return mem_stx(nfp_prog, meta, 4);
2857 }
2858 
2859 static int mem_stx8(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2860 {
2861 	return mem_stx(nfp_prog, meta, 8);
2862 }
2863 
2864 static int
2865 mem_xadd(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, bool is64)
2866 {
2867 	u8 dst_gpr = meta->insn.dst_reg * 2;
2868 	u8 src_gpr = meta->insn.src_reg * 2;
2869 	unsigned int full_add, out;
2870 	swreg addra, addrb, off;
2871 
2872 	off = ur_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog));
2873 
	/* We can fit 16 bits into the command immediate.  If we know the
	 * immediate is guaranteed to either always or never fit into 16 bits
	 * we only generate code to handle that particular case, otherwise we
	 * generate code for both.
	 */
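	/* Precompute the offsets of the full-width add sequence (full_add)
	 * and of the first instruction past the whole construct (out), based
	 * on which of the sequences below will be emitted; both offsets are
	 * verified with nfp_prog_confirm_current_offset().
	 */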
2879 	out = nfp_prog_current_offset(nfp_prog);
2880 	full_add = nfp_prog_current_offset(nfp_prog);
2881 
2882 	if (meta->insn.off) {
2883 		out += 2;
2884 		full_add += 2;
2885 	}
2886 	if (meta->xadd_maybe_16bit) {
2887 		out += 3;
2888 		full_add += 3;
2889 	}
2890 	if (meta->xadd_over_16bit)
2891 		out += 2 + is64;
2892 	if (meta->xadd_maybe_16bit && meta->xadd_over_16bit) {
2893 		out += 5;
2894 		full_add += 5;
2895 	}
2896 
2897 	/* Generate the branch for choosing add_imm vs add */
2898 	if (meta->xadd_maybe_16bit && meta->xadd_over_16bit) {
2899 		swreg max_imm = imm_a(nfp_prog);
2900 
2901 		wrp_immed(nfp_prog, max_imm, 0xffff);
2902 		emit_alu(nfp_prog, reg_none(),
2903 			 max_imm, ALU_OP_SUB, reg_b(src_gpr));
2904 		emit_alu(nfp_prog, reg_none(),
2905 			 reg_imm(0), ALU_OP_SUB_C, reg_b(src_gpr + 1));
2906 		emit_br(nfp_prog, BR_BLO, full_add, meta->insn.off ? 2 : 0);
2907 		/* defer for add */
2908 	}
2909 
	/* If the insn has an offset, add it to the address */
2911 	if (!meta->insn.off) {
2912 		addra = reg_a(dst_gpr);
2913 		addrb = reg_b(dst_gpr + 1);
2914 	} else {
2915 		emit_alu(nfp_prog, imma_a(nfp_prog),
2916 			 reg_a(dst_gpr), ALU_OP_ADD, off);
2917 		emit_alu(nfp_prog, imma_b(nfp_prog),
2918 			 reg_a(dst_gpr + 1), ALU_OP_ADD_C, reg_imm(0));
2919 		addra = imma_a(nfp_prog);
2920 		addrb = imma_b(nfp_prog);
2921 	}
2922 
2923 	/* Generate the add_imm if 16 bits are possible */
2924 	if (meta->xadd_maybe_16bit) {
2925 		swreg prev_alu = imm_a(nfp_prog);
2926 
2927 		wrp_immed(nfp_prog, prev_alu,
2928 			  FIELD_PREP(CMD_OVE_DATA, 2) |
2929 			  CMD_OVE_LEN |
2930 			  FIELD_PREP(CMD_OV_LEN, 0x8 | is64 << 2));
2931 		wrp_reg_or_subpart(nfp_prog, prev_alu, reg_b(src_gpr), 2, 2);
2932 		emit_cmd_indir(nfp_prog, CMD_TGT_ADD_IMM, CMD_MODE_40b_BA, 0,
2933 			       addra, addrb, 0, CMD_CTX_NO_SWAP);
2934 
2935 		if (meta->xadd_over_16bit)
2936 			emit_br(nfp_prog, BR_UNC, out, 0);
2937 	}
2938 
2939 	if (!nfp_prog_confirm_current_offset(nfp_prog, full_add))
2940 		return -EINVAL;
2941 
2942 	/* Generate the add if 16 bits are not guaranteed */
2943 	if (meta->xadd_over_16bit) {
2944 		emit_cmd(nfp_prog, CMD_TGT_ADD, CMD_MODE_40b_BA, 0,
2945 			 addra, addrb, is64 << 2,
2946 			 is64 ? CMD_CTX_SWAP_DEFER2 : CMD_CTX_SWAP_DEFER1);
2947 
2948 		wrp_mov(nfp_prog, reg_xfer(0), reg_a(src_gpr));
2949 		if (is64)
2950 			wrp_mov(nfp_prog, reg_xfer(1), reg_a(src_gpr + 1));
2951 	}
2952 
2953 	if (!nfp_prog_confirm_current_offset(nfp_prog, out))
2954 		return -EINVAL;
2955 
2956 	return 0;
2957 }
2958 
2959 static int mem_xadd4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2960 {
2961 	return mem_xadd(nfp_prog, meta, false);
2962 }
2963 
2964 static int mem_xadd8(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2965 {
2966 	return mem_xadd(nfp_prog, meta, true);
2967 }
2968 
2969 static int jump(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2970 {
2971 	emit_br(nfp_prog, BR_UNC, meta->insn.off, 0);
2972 
2973 	return 0;
2974 }
2975 
2976 static int jeq_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2977 {
2978 	const struct bpf_insn *insn = &meta->insn;
2979 	u64 imm = insn->imm; /* sign extend */
2980 	swreg or1, or2, tmp_reg;
2981 
2982 	or1 = reg_a(insn->dst_reg * 2);
2983 	or2 = reg_b(insn->dst_reg * 2 + 1);
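	/* Compare the 64 bits word by word: XOR each half with the matching
	 * immediate half (a zero half needs no XOR, the register word is used
	 * directly), then OR the two results and branch if the OR is zero.
	 */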
2984 
2985 	if (imm & ~0U) {
2986 		tmp_reg = ur_load_imm_any(nfp_prog, imm & ~0U, imm_b(nfp_prog));
2987 		emit_alu(nfp_prog, imm_a(nfp_prog),
2988 			 reg_a(insn->dst_reg * 2), ALU_OP_XOR, tmp_reg);
2989 		or1 = imm_a(nfp_prog);
2990 	}
2991 
2992 	if (imm >> 32) {
2993 		tmp_reg = ur_load_imm_any(nfp_prog, imm >> 32, imm_b(nfp_prog));
2994 		emit_alu(nfp_prog, imm_b(nfp_prog),
2995 			 reg_a(insn->dst_reg * 2 + 1), ALU_OP_XOR, tmp_reg);
2996 		or2 = imm_b(nfp_prog);
2997 	}
2998 
2999 	emit_alu(nfp_prog, reg_none(), or1, ALU_OP_OR, or2);
3000 	emit_br(nfp_prog, BR_BEQ, insn->off, 0);
3001 
3002 	return 0;
3003 }
3004 
3005 static int jset_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3006 {
3007 	const struct bpf_insn *insn = &meta->insn;
3008 	u64 imm = insn->imm; /* sign extend */
3009 	swreg tmp_reg;
3010 
3011 	if (!imm) {
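		/* A zero mask can never produce a non-zero AND result, so the
		 * jump is never taken and the instruction can be skipped.
		 */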
3012 		meta->skip = true;
3013 		return 0;
3014 	}
3015 
3016 	if (imm & ~0U) {
3017 		tmp_reg = ur_load_imm_any(nfp_prog, imm & ~0U, imm_b(nfp_prog));
3018 		emit_alu(nfp_prog, reg_none(),
3019 			 reg_a(insn->dst_reg * 2), ALU_OP_AND, tmp_reg);
3020 		emit_br(nfp_prog, BR_BNE, insn->off, 0);
3021 	}
3022 
3023 	if (imm >> 32) {
3024 		tmp_reg = ur_load_imm_any(nfp_prog, imm >> 32, imm_b(nfp_prog));
3025 		emit_alu(nfp_prog, reg_none(),
3026 			 reg_a(insn->dst_reg * 2 + 1), ALU_OP_AND, tmp_reg);
3027 		emit_br(nfp_prog, BR_BNE, insn->off, 0);
3028 	}
3029 
3030 	return 0;
3031 }
3032 
3033 static int jne_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3034 {
3035 	const struct bpf_insn *insn = &meta->insn;
3036 	u64 imm = insn->imm; /* sign extend */
3037 	swreg tmp_reg;
3038 
3039 	if (!imm) {
3040 		emit_alu(nfp_prog, reg_none(), reg_a(insn->dst_reg * 2),
3041 			 ALU_OP_OR, reg_b(insn->dst_reg * 2 + 1));
3042 		emit_br(nfp_prog, BR_BNE, insn->off, 0);
3043 		return 0;
3044 	}
3045 
3046 	tmp_reg = ur_load_imm_any(nfp_prog, imm & ~0U, imm_b(nfp_prog));
3047 	emit_alu(nfp_prog, reg_none(),
3048 		 reg_a(insn->dst_reg * 2), ALU_OP_XOR, tmp_reg);
3049 	emit_br(nfp_prog, BR_BNE, insn->off, 0);
3050 
3051 	tmp_reg = ur_load_imm_any(nfp_prog, imm >> 32, imm_b(nfp_prog));
3052 	emit_alu(nfp_prog, reg_none(),
3053 		 reg_a(insn->dst_reg * 2 + 1), ALU_OP_XOR, tmp_reg);
3054 	emit_br(nfp_prog, BR_BNE, insn->off, 0);
3055 
3056 	return 0;
3057 }
3058 
3059 static int jeq_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3060 {
3061 	const struct bpf_insn *insn = &meta->insn;
3062 
3063 	emit_alu(nfp_prog, imm_a(nfp_prog), reg_a(insn->dst_reg * 2),
3064 		 ALU_OP_XOR, reg_b(insn->src_reg * 2));
3065 	emit_alu(nfp_prog, imm_b(nfp_prog), reg_a(insn->dst_reg * 2 + 1),
3066 		 ALU_OP_XOR, reg_b(insn->src_reg * 2 + 1));
3067 	emit_alu(nfp_prog, reg_none(),
3068 		 imm_a(nfp_prog), ALU_OP_OR, imm_b(nfp_prog));
3069 	emit_br(nfp_prog, BR_BEQ, insn->off, 0);
3070 
3071 	return 0;
3072 }
3073 
3074 static int jset_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3075 {
3076 	return wrp_test_reg(nfp_prog, meta, ALU_OP_AND, BR_BNE);
3077 }
3078 
3079 static int jne_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3080 {
3081 	return wrp_test_reg(nfp_prog, meta, ALU_OP_XOR, BR_BNE);
3082 }
3083 
3084 static int call(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3085 {
3086 	switch (meta->insn.imm) {
3087 	case BPF_FUNC_xdp_adjust_head:
3088 		return adjust_head(nfp_prog, meta);
3089 	case BPF_FUNC_xdp_adjust_tail:
3090 		return adjust_tail(nfp_prog, meta);
3091 	case BPF_FUNC_map_lookup_elem:
3092 	case BPF_FUNC_map_update_elem:
3093 	case BPF_FUNC_map_delete_elem:
3094 		return map_call_stack_common(nfp_prog, meta);
3095 	case BPF_FUNC_get_prandom_u32:
3096 		return nfp_get_prandom_u32(nfp_prog, meta);
3097 	case BPF_FUNC_perf_event_output:
3098 		return nfp_perf_event_output(nfp_prog, meta);
3099 	default:
3100 		WARN_ONCE(1, "verifier allowed unsupported function\n");
3101 		return -EOPNOTSUPP;
3102 	}
3103 }
3104 
3105 static int goto_out(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3106 {
3107 	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 0, RELO_BR_GO_OUT);
3108 
3109 	return 0;
3110 }
3111 
3112 static const instr_cb_t instr_cb[256] = {
3113 	[BPF_ALU64 | BPF_MOV | BPF_X] =	mov_reg64,
3114 	[BPF_ALU64 | BPF_MOV | BPF_K] =	mov_imm64,
3115 	[BPF_ALU64 | BPF_XOR | BPF_X] =	xor_reg64,
3116 	[BPF_ALU64 | BPF_XOR | BPF_K] =	xor_imm64,
3117 	[BPF_ALU64 | BPF_AND | BPF_X] =	and_reg64,
3118 	[BPF_ALU64 | BPF_AND | BPF_K] =	and_imm64,
3119 	[BPF_ALU64 | BPF_OR | BPF_X] =	or_reg64,
3120 	[BPF_ALU64 | BPF_OR | BPF_K] =	or_imm64,
3121 	[BPF_ALU64 | BPF_ADD | BPF_X] =	add_reg64,
3122 	[BPF_ALU64 | BPF_ADD | BPF_K] =	add_imm64,
3123 	[BPF_ALU64 | BPF_SUB | BPF_X] =	sub_reg64,
3124 	[BPF_ALU64 | BPF_SUB | BPF_K] =	sub_imm64,
3125 	[BPF_ALU64 | BPF_MUL | BPF_X] =	mul_reg64,
3126 	[BPF_ALU64 | BPF_MUL | BPF_K] =	mul_imm64,
3127 	[BPF_ALU64 | BPF_DIV | BPF_X] =	div_reg64,
3128 	[BPF_ALU64 | BPF_DIV | BPF_K] =	div_imm64,
3129 	[BPF_ALU64 | BPF_NEG] =		neg_reg64,
3130 	[BPF_ALU64 | BPF_LSH | BPF_X] =	shl_reg64,
3131 	[BPF_ALU64 | BPF_LSH | BPF_K] =	shl_imm64,
3132 	[BPF_ALU64 | BPF_RSH | BPF_X] =	shr_reg64,
3133 	[BPF_ALU64 | BPF_RSH | BPF_K] =	shr_imm64,
3134 	[BPF_ALU64 | BPF_ARSH | BPF_X] = ashr_reg64,
3135 	[BPF_ALU64 | BPF_ARSH | BPF_K] = ashr_imm64,
3136 	[BPF_ALU | BPF_MOV | BPF_X] =	mov_reg,
3137 	[BPF_ALU | BPF_MOV | BPF_K] =	mov_imm,
3138 	[BPF_ALU | BPF_XOR | BPF_X] =	xor_reg,
3139 	[BPF_ALU | BPF_XOR | BPF_K] =	xor_imm,
3140 	[BPF_ALU | BPF_AND | BPF_X] =	and_reg,
3141 	[BPF_ALU | BPF_AND | BPF_K] =	and_imm,
3142 	[BPF_ALU | BPF_OR | BPF_X] =	or_reg,
3143 	[BPF_ALU | BPF_OR | BPF_K] =	or_imm,
3144 	[BPF_ALU | BPF_ADD | BPF_X] =	add_reg,
3145 	[BPF_ALU | BPF_ADD | BPF_K] =	add_imm,
3146 	[BPF_ALU | BPF_SUB | BPF_X] =	sub_reg,
3147 	[BPF_ALU | BPF_SUB | BPF_K] =	sub_imm,
3148 	[BPF_ALU | BPF_MUL | BPF_X] =	mul_reg,
3149 	[BPF_ALU | BPF_MUL | BPF_K] =	mul_imm,
3150 	[BPF_ALU | BPF_DIV | BPF_X] =	div_reg,
3151 	[BPF_ALU | BPF_DIV | BPF_K] =	div_imm,
3152 	[BPF_ALU | BPF_NEG] =		neg_reg,
3153 	[BPF_ALU | BPF_LSH | BPF_K] =	shl_imm,
3154 	[BPF_ALU | BPF_END | BPF_X] =	end_reg32,
3155 	[BPF_LD | BPF_IMM | BPF_DW] =	imm_ld8,
3156 	[BPF_LD | BPF_ABS | BPF_B] =	data_ld1,
3157 	[BPF_LD | BPF_ABS | BPF_H] =	data_ld2,
3158 	[BPF_LD | BPF_ABS | BPF_W] =	data_ld4,
3159 	[BPF_LD | BPF_IND | BPF_B] =	data_ind_ld1,
3160 	[BPF_LD | BPF_IND | BPF_H] =	data_ind_ld2,
3161 	[BPF_LD | BPF_IND | BPF_W] =	data_ind_ld4,
3162 	[BPF_LDX | BPF_MEM | BPF_B] =	mem_ldx1,
3163 	[BPF_LDX | BPF_MEM | BPF_H] =	mem_ldx2,
3164 	[BPF_LDX | BPF_MEM | BPF_W] =	mem_ldx4,
3165 	[BPF_LDX | BPF_MEM | BPF_DW] =	mem_ldx8,
3166 	[BPF_STX | BPF_MEM | BPF_B] =	mem_stx1,
3167 	[BPF_STX | BPF_MEM | BPF_H] =	mem_stx2,
3168 	[BPF_STX | BPF_MEM | BPF_W] =	mem_stx4,
3169 	[BPF_STX | BPF_MEM | BPF_DW] =	mem_stx8,
3170 	[BPF_STX | BPF_XADD | BPF_W] =	mem_xadd4,
3171 	[BPF_STX | BPF_XADD | BPF_DW] =	mem_xadd8,
3172 	[BPF_ST | BPF_MEM | BPF_B] =	mem_st1,
3173 	[BPF_ST | BPF_MEM | BPF_H] =	mem_st2,
3174 	[BPF_ST | BPF_MEM | BPF_W] =	mem_st4,
3175 	[BPF_ST | BPF_MEM | BPF_DW] =	mem_st8,
3176 	[BPF_JMP | BPF_JA | BPF_K] =	jump,
3177 	[BPF_JMP | BPF_JEQ | BPF_K] =	jeq_imm,
3178 	[BPF_JMP | BPF_JGT | BPF_K] =	cmp_imm,
3179 	[BPF_JMP | BPF_JGE | BPF_K] =	cmp_imm,
3180 	[BPF_JMP | BPF_JLT | BPF_K] =	cmp_imm,
3181 	[BPF_JMP | BPF_JLE | BPF_K] =	cmp_imm,
3182 	[BPF_JMP | BPF_JSGT | BPF_K] =  cmp_imm,
3183 	[BPF_JMP | BPF_JSGE | BPF_K] =  cmp_imm,
3184 	[BPF_JMP | BPF_JSLT | BPF_K] =  cmp_imm,
3185 	[BPF_JMP | BPF_JSLE | BPF_K] =  cmp_imm,
3186 	[BPF_JMP | BPF_JSET | BPF_K] =	jset_imm,
3187 	[BPF_JMP | BPF_JNE | BPF_K] =	jne_imm,
3188 	[BPF_JMP | BPF_JEQ | BPF_X] =	jeq_reg,
3189 	[BPF_JMP | BPF_JGT | BPF_X] =	cmp_reg,
3190 	[BPF_JMP | BPF_JGE | BPF_X] =	cmp_reg,
3191 	[BPF_JMP | BPF_JLT | BPF_X] =	cmp_reg,
3192 	[BPF_JMP | BPF_JLE | BPF_X] =	cmp_reg,
3193 	[BPF_JMP | BPF_JSGT | BPF_X] =  cmp_reg,
3194 	[BPF_JMP | BPF_JSGE | BPF_X] =  cmp_reg,
3195 	[BPF_JMP | BPF_JSLT | BPF_X] =  cmp_reg,
3196 	[BPF_JMP | BPF_JSLE | BPF_X] =  cmp_reg,
3197 	[BPF_JMP | BPF_JSET | BPF_X] =	jset_reg,
3198 	[BPF_JMP | BPF_JNE | BPF_X] =	jne_reg,
3199 	[BPF_JMP | BPF_CALL] =		call,
3200 	[BPF_JMP | BPF_EXIT] =		goto_out,
3201 };
3202 
3203 /* --- Assembler logic --- */
3204 static int nfp_fixup_branches(struct nfp_prog *nfp_prog)
3205 {
3206 	struct nfp_insn_meta *meta, *jmp_dst;
3207 	u32 idx, br_idx;
3208 
3209 	list_for_each_entry(meta, &nfp_prog->insns, l) {
3210 		if (meta->skip)
3211 			continue;
3212 		if (meta->insn.code == (BPF_JMP | BPF_CALL))
3213 			continue;
3214 		if (BPF_CLASS(meta->insn.code) != BPF_JMP)
3215 			continue;
3216 
3217 		if (list_is_last(&meta->l, &nfp_prog->insns))
3218 			br_idx = nfp_prog->last_bpf_off;
3219 		else
3220 			br_idx = list_next_entry(meta, l)->off - 1;
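		/* br_idx is the last NFP instruction generated for this BPF
		 * jump; the branch is always emitted last in the translated
		 * block.
		 */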
3221 
3222 		if (!nfp_is_br(nfp_prog->prog[br_idx])) {
3223 			pr_err("Fixup found block not ending in branch %d %02x %016llx!!\n",
3224 			       br_idx, meta->insn.code, nfp_prog->prog[br_idx]);
3225 			return -ELOOP;
3226 		}
3227 		/* Leave special branches for later */
3228 		if (FIELD_GET(OP_RELO_TYPE, nfp_prog->prog[br_idx]) !=
3229 		    RELO_BR_REL)
3230 			continue;
3231 
3232 		if (!meta->jmp_dst) {
3233 			pr_err("Non-exit jump doesn't have destination info recorded!!\n");
3234 			return -ELOOP;
3235 		}
3236 
3237 		jmp_dst = meta->jmp_dst;
3238 
3239 		if (jmp_dst->skip) {
3240 			pr_err("Branch landing on removed instruction!!\n");
3241 			return -ELOOP;
3242 		}
3243 
3244 		for (idx = meta->off; idx <= br_idx; idx++) {
3245 			if (!nfp_is_br(nfp_prog->prog[idx]))
3246 				continue;
3247 			br_set_offset(&nfp_prog->prog[idx], jmp_dst->off);
3248 		}
3249 	}
3250 
3251 	return 0;
3252 }
3253 
3254 static void nfp_intro(struct nfp_prog *nfp_prog)
3255 {
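	/* Extract the 14-bit packet length from the length word of the packet
	 * vector into the packet length register.
	 */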
3256 	wrp_immed(nfp_prog, plen_reg(nfp_prog), GENMASK(13, 0));
3257 	emit_alu(nfp_prog, plen_reg(nfp_prog),
3258 		 plen_reg(nfp_prog), ALU_OP_AND, pv_len(nfp_prog));
3259 }
3260 
3261 static void nfp_outro_tc_da(struct nfp_prog *nfp_prog)
3262 {
3263 	/* TC direct-action mode:
3264 	 *   0,1   ok        NOT SUPPORTED[1]
3265 	 *   2   drop  0x22 -> drop,  count as stat1
3266 	 *   4,5 nuke  0x02 -> drop
3267 	 *   7  redir  0x44 -> redir, count as stat2
3268 	 *   * unspec  0x11 -> pass,  count as stat0
3269 	 *
3270 	 * [1] We can't support OK and RECLASSIFY because we can't tell TC
3271 	 *     the exact decision made.  We are forced to support UNSPEC
3272 	 *     to handle aborts so that's the only one we handle for passing
3273 	 *     packets up the stack.
3274 	 */
3275 	/* Target for aborts */
3276 	nfp_prog->tgt_abort = nfp_prog_current_offset(nfp_prog);
3277 
3278 	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 2, RELO_BR_NEXT_PKT);
3279 
3280 	wrp_mov(nfp_prog, reg_a(0), NFP_BPF_ABI_FLAGS);
3281 	emit_ld_field(nfp_prog, reg_a(0), 0xc, reg_imm(0x11), SHF_SC_L_SHF, 16);
3282 
3283 	/* Target for normal exits */
3284 	nfp_prog->tgt_out = nfp_prog_current_offset(nfp_prog);
3285 
3286 	/* if R0 > 7 jump to abort */
3287 	emit_alu(nfp_prog, reg_none(), reg_imm(7), ALU_OP_SUB, reg_b(0));
3288 	emit_br(nfp_prog, BR_BLO, nfp_prog->tgt_abort, 0);
3289 	wrp_mov(nfp_prog, reg_a(0), NFP_BPF_ABI_FLAGS);
3290 
3291 	wrp_immed(nfp_prog, reg_b(2), 0x41221211);
3292 	wrp_immed(nfp_prog, reg_b(3), 0x41001211);
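	/* regs 2 and 3 hold per-return-code result nibbles matching the table
	 * above; R0 * 4, computed below, selects the right nibble via an
	 * indirect shift.
	 */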
3293 
3294 	emit_shf(nfp_prog, reg_a(1),
3295 		 reg_none(), SHF_OP_NONE, reg_b(0), SHF_SC_L_SHF, 2);
3296 
3297 	emit_alu(nfp_prog, reg_none(), reg_a(1), ALU_OP_OR, reg_imm(0));
3298 	emit_shf(nfp_prog, reg_a(2),
3299 		 reg_imm(0xf), SHF_OP_AND, reg_b(2), SHF_SC_R_SHF, 0);
3300 
3301 	emit_alu(nfp_prog, reg_none(), reg_a(1), ALU_OP_OR, reg_imm(0));
3302 	emit_shf(nfp_prog, reg_b(2),
3303 		 reg_imm(0xf), SHF_OP_AND, reg_b(3), SHF_SC_R_SHF, 0);
3304 
3305 	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 2, RELO_BR_NEXT_PKT);
3306 
3307 	emit_shf(nfp_prog, reg_b(2),
3308 		 reg_a(2), SHF_OP_OR, reg_b(2), SHF_SC_L_SHF, 4);
3309 	emit_ld_field(nfp_prog, reg_a(0), 0xc, reg_b(2), SHF_SC_L_SHF, 16);
3310 }
3311 
3312 static void nfp_outro_xdp(struct nfp_prog *nfp_prog)
3313 {
3314 	/* XDP return codes:
3315 	 *   0 aborted  0x82 -> drop,  count as stat3
3316 	 *   1    drop  0x22 -> drop,  count as stat1
3317 	 *   2    pass  0x11 -> pass,  count as stat0
3318 	 *   3      tx  0x44 -> redir, count as stat2
3319 	 *   * unknown  0x82 -> drop,  count as stat3
3320 	 */
3321 	/* Target for aborts */
3322 	nfp_prog->tgt_abort = nfp_prog_current_offset(nfp_prog);
3323 
3324 	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 2, RELO_BR_NEXT_PKT);
3325 
3326 	wrp_mov(nfp_prog, reg_a(0), NFP_BPF_ABI_FLAGS);
3327 	emit_ld_field(nfp_prog, reg_a(0), 0xc, reg_imm(0x82), SHF_SC_L_SHF, 16);
3328 
3329 	/* Target for normal exits */
3330 	nfp_prog->tgt_out = nfp_prog_current_offset(nfp_prog);
3331 
3332 	/* if R0 > 3 jump to abort */
3333 	emit_alu(nfp_prog, reg_none(), reg_imm(3), ALU_OP_SUB, reg_b(0));
3334 	emit_br(nfp_prog, BR_BLO, nfp_prog->tgt_abort, 0);
3335 
3336 	wrp_immed(nfp_prog, reg_b(2), 0x44112282);
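	/* reg 2 holds one result byte per XDP return code, matching the table
	 * above; R0 * 8, computed below, selects the right byte via an
	 * indirect shift.
	 */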
3337 
3338 	emit_shf(nfp_prog, reg_a(1),
3339 		 reg_none(), SHF_OP_NONE, reg_b(0), SHF_SC_L_SHF, 3);
3340 
3341 	emit_alu(nfp_prog, reg_none(), reg_a(1), ALU_OP_OR, reg_imm(0));
3342 	emit_shf(nfp_prog, reg_b(2),
3343 		 reg_imm(0xff), SHF_OP_AND, reg_b(2), SHF_SC_R_SHF, 0);
3344 
3345 	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 2, RELO_BR_NEXT_PKT);
3346 
3347 	wrp_mov(nfp_prog, reg_a(0), NFP_BPF_ABI_FLAGS);
3348 	emit_ld_field(nfp_prog, reg_a(0), 0xc, reg_b(2), SHF_SC_L_SHF, 16);
3349 }
3350 
3351 static void nfp_outro(struct nfp_prog *nfp_prog)
3352 {
3353 	switch (nfp_prog->type) {
3354 	case BPF_PROG_TYPE_SCHED_CLS:
3355 		nfp_outro_tc_da(nfp_prog);
3356 		break;
3357 	case BPF_PROG_TYPE_XDP:
3358 		nfp_outro_xdp(nfp_prog);
3359 		break;
3360 	default:
3361 		WARN_ON(1);
3362 	}
3363 }
3364 
3365 static int nfp_translate(struct nfp_prog *nfp_prog)
3366 {
3367 	struct nfp_insn_meta *meta;
3368 	int err;
3369 
3370 	nfp_intro(nfp_prog);
3371 	if (nfp_prog->error)
3372 		return nfp_prog->error;
3373 
3374 	list_for_each_entry(meta, &nfp_prog->insns, l) {
3375 		instr_cb_t cb = instr_cb[meta->insn.code];
3376 
3377 		meta->off = nfp_prog_current_offset(nfp_prog);
3378 
3379 		if (meta->skip) {
3380 			nfp_prog->n_translated++;
3381 			continue;
3382 		}
3383 
3384 		if (nfp_meta_has_prev(nfp_prog, meta) &&
3385 		    nfp_meta_prev(meta)->double_cb)
3386 			cb = nfp_meta_prev(meta)->double_cb;
3387 		if (!cb)
3388 			return -ENOENT;
3389 		err = cb(nfp_prog, meta);
3390 		if (err)
3391 			return err;
3392 		if (nfp_prog->error)
3393 			return nfp_prog->error;
3394 
3395 		nfp_prog->n_translated++;
3396 	}
3397 
3398 	nfp_prog->last_bpf_off = nfp_prog_current_offset(nfp_prog) - 1;
3399 
3400 	nfp_outro(nfp_prog);
3401 	if (nfp_prog->error)
3402 		return nfp_prog->error;
3403 
3404 	wrp_nops(nfp_prog, NFP_USTORE_PREFETCH_WINDOW);
3405 	if (nfp_prog->error)
3406 		return nfp_prog->error;
3407 
3408 	return nfp_fixup_branches(nfp_prog);
3409 }
3410 
3411 /* --- Optimizations --- */
3412 static void nfp_bpf_opt_reg_init(struct nfp_prog *nfp_prog)
3413 {
3414 	struct nfp_insn_meta *meta;
3415 
3416 	list_for_each_entry(meta, &nfp_prog->insns, l) {
3417 		struct bpf_insn insn = meta->insn;
3418 
3419 		/* Programs converted from cBPF start with register xoring */
3420 		if (insn.code == (BPF_ALU64 | BPF_XOR | BPF_X) &&
3421 		    insn.src_reg == insn.dst_reg)
3422 			continue;
3423 
3424 		/* Programs start with R6 = R1 but we ignore the skb pointer */
3425 		if (insn.code == (BPF_ALU64 | BPF_MOV | BPF_X) &&
3426 		    insn.src_reg == 1 && insn.dst_reg == 6)
3427 			meta->skip = true;
3428 
3429 		/* Return as soon as something doesn't match */
3430 		if (!meta->skip)
3431 			return;
3432 	}
3433 }
3434 
/* abs(insn.imm) will fit better into the unrestricted reg immediate -
 * convert an add/sub of a negative number into a sub/add of a positive one.
 */
3438 static void nfp_bpf_opt_neg_add_sub(struct nfp_prog *nfp_prog)
3439 {
3440 	struct nfp_insn_meta *meta;
3441 
3442 	list_for_each_entry(meta, &nfp_prog->insns, l) {
3443 		struct bpf_insn insn = meta->insn;
3444 
3445 		if (meta->skip)
3446 			continue;
3447 
3448 		if (BPF_CLASS(insn.code) != BPF_ALU &&
3449 		    BPF_CLASS(insn.code) != BPF_ALU64 &&
3450 		    BPF_CLASS(insn.code) != BPF_JMP)
3451 			continue;
3452 		if (BPF_SRC(insn.code) != BPF_K)
3453 			continue;
3454 		if (insn.imm >= 0)
3455 			continue;
3456 
3457 		if (BPF_CLASS(insn.code) == BPF_JMP) {
3458 			switch (BPF_OP(insn.code)) {
3459 			case BPF_JGE:
3460 			case BPF_JSGE:
3461 			case BPF_JLT:
3462 			case BPF_JSLT:
3463 				meta->jump_neg_op = true;
3464 				break;
3465 			default:
3466 				continue;
3467 			}
3468 		} else {
3469 			if (BPF_OP(insn.code) == BPF_ADD)
3470 				insn.code = BPF_CLASS(insn.code) | BPF_SUB;
3471 			else if (BPF_OP(insn.code) == BPF_SUB)
3472 				insn.code = BPF_CLASS(insn.code) | BPF_ADD;
3473 			else
3474 				continue;
3475 
3476 			meta->insn.code = insn.code | BPF_K;
3477 		}
3478 
3479 		meta->insn.imm = -insn.imm;
3480 	}
3481 }
3482 
3483 /* Remove masking after load since our load guarantees this is not needed */
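/* For illustration only, a hypothetical cBPF-derived sequence this applies to:
 *
 *   r0 = *(u8 *)skb[off]    (BPF_LD | BPF_ABS | BPF_B)
 *   r0 &= 0xff              (BPF_ALU64 | BPF_AND | BPF_K)
 *
 * The AND only re-applies the width mask which the data load already
 * guarantees, so the AND is marked as skipped, provided it is not a
 * jump destination.
 */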
3484 static void nfp_bpf_opt_ld_mask(struct nfp_prog *nfp_prog)
3485 {
3486 	struct nfp_insn_meta *meta1, *meta2;
3487 	const s32 exp_mask[] = {
3488 		[BPF_B] = 0x000000ffU,
3489 		[BPF_H] = 0x0000ffffU,
3490 		[BPF_W] = 0xffffffffU,
3491 	};
3492 
3493 	nfp_for_each_insn_walk2(nfp_prog, meta1, meta2) {
3494 		struct bpf_insn insn, next;
3495 
3496 		insn = meta1->insn;
3497 		next = meta2->insn;
3498 
3499 		if (BPF_CLASS(insn.code) != BPF_LD)
3500 			continue;
3501 		if (BPF_MODE(insn.code) != BPF_ABS &&
3502 		    BPF_MODE(insn.code) != BPF_IND)
3503 			continue;
3504 
3505 		if (next.code != (BPF_ALU64 | BPF_AND | BPF_K))
3506 			continue;
3507 
3508 		if (!exp_mask[BPF_SIZE(insn.code)])
3509 			continue;
3510 		if (exp_mask[BPF_SIZE(insn.code)] != next.imm)
3511 			continue;
3512 
3513 		if (next.src_reg || next.dst_reg)
3514 			continue;
3515 
3516 		if (meta2->flags & FLAG_INSN_IS_JUMP_DST)
3517 			continue;
3518 
3519 		meta2->skip = true;
3520 	}
3521 }
3522 
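/* Remove a redundant pair of 32-bit shifts following a 32-bit classic load.
 * For illustration only, a hypothetical sequence this matches:
 *
 *   r0 = *(u32 *)skb[off]   (BPF_LD | BPF_ABS | BPF_W)
 *   r0 <<= 0x20             (BPF_ALU64 | BPF_LSH | BPF_K)
 *   r0 >>= 0x20             (BPF_ALU64 | BPF_RSH | BPF_K)
 *
 * The shift pair only clears the upper 32 bits, which the 32-bit data
 * load leaves clear anyway, so both shifts are marked as skipped unless
 * either of them is a jump destination.
 */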
3523 static void nfp_bpf_opt_ld_shift(struct nfp_prog *nfp_prog)
3524 {
3525 	struct nfp_insn_meta *meta1, *meta2, *meta3;
3526 
3527 	nfp_for_each_insn_walk3(nfp_prog, meta1, meta2, meta3) {
3528 		struct bpf_insn insn, next1, next2;
3529 
3530 		insn = meta1->insn;
3531 		next1 = meta2->insn;
3532 		next2 = meta3->insn;
3533 
3534 		if (BPF_CLASS(insn.code) != BPF_LD)
3535 			continue;
3536 		if (BPF_MODE(insn.code) != BPF_ABS &&
3537 		    BPF_MODE(insn.code) != BPF_IND)
3538 			continue;
3539 		if (BPF_SIZE(insn.code) != BPF_W)
3540 			continue;
3541 
3542 		if (!(next1.code == (BPF_LSH | BPF_K | BPF_ALU64) &&
3543 		      next2.code == (BPF_RSH | BPF_K | BPF_ALU64)) &&
3544 		    !(next1.code == (BPF_RSH | BPF_K | BPF_ALU64) &&
3545 		      next2.code == (BPF_LSH | BPF_K | BPF_ALU64)))
3546 			continue;
3547 
3548 		if (next1.src_reg || next1.dst_reg ||
3549 		    next2.src_reg || next2.dst_reg)
3550 			continue;
3551 
3552 		if (next1.imm != 0x20 || next2.imm != 0x20)
3553 			continue;
3554 
3555 		if (meta2->flags & FLAG_INSN_IS_JUMP_DST ||
3556 		    meta3->flags & FLAG_INSN_IS_JUMP_DST)
3557 			continue;
3558 
3559 		meta2->skip = true;
3560 		meta3->skip = true;
3561 	}
3562 }
3563 
/* A load/store pair that forms a memory copy should look like the following:
3565  *
3566  *   ld_width R, [addr_src + offset_src]
3567  *   st_width [addr_dest + offset_dest], R
3568  *
 * The destination register of the load and the source register of the
 * store should be the same, and the load and store should also operate
 * at the same width.  If either addr_src or addr_dest is the stack
 * pointer, we don't do the CPP optimization, as the stack is modelled
 * by registers on the NFP.
3573  */
3574 static bool
3575 curr_pair_is_memcpy(struct nfp_insn_meta *ld_meta,
3576 		    struct nfp_insn_meta *st_meta)
3577 {
3578 	struct bpf_insn *ld = &ld_meta->insn;
3579 	struct bpf_insn *st = &st_meta->insn;
3580 
3581 	if (!is_mbpf_load(ld_meta) || !is_mbpf_store(st_meta))
3582 		return false;
3583 
3584 	if (ld_meta->ptr.type != PTR_TO_PACKET &&
3585 	    ld_meta->ptr.type != PTR_TO_MAP_VALUE)
3586 		return false;
3587 
3588 	if (st_meta->ptr.type != PTR_TO_PACKET)
3589 		return false;
3590 
3591 	if (BPF_SIZE(ld->code) != BPF_SIZE(st->code))
3592 		return false;
3593 
3594 	if (ld->dst_reg != st->src_reg)
3595 		return false;
3596 
	/* There is a jump to the store insn in this pair. */
3598 	if (st_meta->flags & FLAG_INSN_IS_JUMP_DST)
3599 		return false;
3600 
3601 	return true;
3602 }
3603 
3604 /* Currently, we only support chaining load/store pairs if:
3605  *
3606  *  - Their address base registers are the same.
3607  *  - Their address offsets are in the same order.
3608  *  - They operate at the same memory width.
3609  *  - There is no jump into the middle of them.
3610  */
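/* For illustration only, a hypothetical chain of two pairs in ascending order
 * (assuming r1 and r2 both point into the packet):
 *
 *   r0 = *(u32 *)(r1 + 0)
 *   *(u32 *)(r2 + 0) = r0
 *   r0 = *(u32 *)(r1 + 4)
 *   *(u32 *)(r2 + 4) = r0
 *
 * The base registers (r1, r2) and the access width (4 bytes) match, and
 * both the load and the store offsets advance by exactly that width, so
 * the second pair chains with the first.
 */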
3611 static bool
3612 curr_pair_chain_with_previous(struct nfp_insn_meta *ld_meta,
3613 			      struct nfp_insn_meta *st_meta,
3614 			      struct bpf_insn *prev_ld,
3615 			      struct bpf_insn *prev_st)
3616 {
3617 	u8 prev_size, curr_size, prev_ld_base, prev_st_base, prev_ld_dst;
3618 	struct bpf_insn *ld = &ld_meta->insn;
3619 	struct bpf_insn *st = &st_meta->insn;
3620 	s16 prev_ld_off, prev_st_off;
3621 
3622 	/* This pair is the start pair. */
3623 	if (!prev_ld)
3624 		return true;
3625 
3626 	prev_size = BPF_LDST_BYTES(prev_ld);
3627 	curr_size = BPF_LDST_BYTES(ld);
3628 	prev_ld_base = prev_ld->src_reg;
3629 	prev_st_base = prev_st->dst_reg;
3630 	prev_ld_dst = prev_ld->dst_reg;
3631 	prev_ld_off = prev_ld->off;
3632 	prev_st_off = prev_st->off;
3633 
3634 	if (ld->dst_reg != prev_ld_dst)
3635 		return false;
3636 
3637 	if (ld->src_reg != prev_ld_base || st->dst_reg != prev_st_base)
3638 		return false;
3639 
3640 	if (curr_size != prev_size)
3641 		return false;
3642 
	/* There is a jump to the head of this pair. */
3644 	if (ld_meta->flags & FLAG_INSN_IS_JUMP_DST)
3645 		return false;
3646 
3647 	/* Both in ascending order. */
3648 	if (prev_ld_off + prev_size == ld->off &&
3649 	    prev_st_off + prev_size == st->off)
3650 		return true;
3651 
3652 	/* Both in descending order. */
3653 	if (ld->off + curr_size == prev_ld_off &&
3654 	    st->off + curr_size == prev_st_off)
3655 		return true;
3656 
3657 	return false;
3658 }
3659 
/* Return TRUE if a cross memory access happens.  A cross memory access
 * means the store area overlaps the load area, so a later load might read
 * a value written by an earlier store; in that case we can't treat the
 * sequence as a memory copy.
3664  */
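/* For illustration only, a hypothetical ascending-order cross (offsets shown
 * after canonicalization against the original base register):
 *
 *   r0 = *(u32 *)(pkt + 0)     head load,  head_ld_off = 0
 *   *(u32 *)(pkt + 4) = r0     head store, head_st_off = 4
 *   r0 = *(u32 *)(pkt + 4)     later load, ld_off = 4 >= head_st_off
 *
 * The later load would observe the value written by the earlier store,
 * so the sequence is not a plain memory copy.
 */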
3665 static bool
3666 cross_mem_access(struct bpf_insn *ld, struct nfp_insn_meta *head_ld_meta,
3667 		 struct nfp_insn_meta *head_st_meta)
3668 {
3669 	s16 head_ld_off, head_st_off, ld_off;
3670 
	/* Different pointer types do not overlap. */
3672 	if (head_ld_meta->ptr.type != head_st_meta->ptr.type)
3673 		return false;
3674 
	/* Load and store are both PTR_TO_PACKET, check ID info. */
3676 	if (head_ld_meta->ptr.id != head_st_meta->ptr.id)
3677 		return true;
3678 
	/* Canonicalize the offsets: express all of them against the
	 * original base register.
3681 	 */
3682 	head_ld_off = head_ld_meta->insn.off + head_ld_meta->ptr.off;
3683 	head_st_off = head_st_meta->insn.off + head_st_meta->ptr.off;
3684 	ld_off = ld->off + head_ld_meta->ptr.off;
3685 
3686 	/* Ascending order cross. */
3687 	if (ld_off > head_ld_off &&
3688 	    head_ld_off < head_st_off && ld_off >= head_st_off)
3689 		return true;
3690 
3691 	/* Descending order cross. */
3692 	if (ld_off < head_ld_off &&
3693 	    head_ld_off > head_st_off && ld_off <= head_st_off)
3694 		return true;
3695 
3696 	return false;
3697 }
3698 
/* This pass tries to identify the following instruction sequences.
3700  *
3701  *   load R, [regA + offA]
3702  *   store [regB + offB], R
3703  *   load R, [regA + offA + const_imm_A]
3704  *   store [regB + offB + const_imm_A], R
3705  *   load R, [regA + offA + 2 * const_imm_A]
3706  *   store [regB + offB + 2 * const_imm_A], R
3707  *   ...
3708  *
 * The above sequence is typically generated by the compiler when lowering
 * memcpy.  The NFP prefers using CPP instructions to accelerate it.
3711  */
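/* When such a chain is recognized, the head load keeps the accumulated copy
 * length in ldst_gather_len (negated for descending offsets), its paired_st
 * points at the head store, and every other insn of the chain is marked as
 * skipped, so code generation can emit a single CPP transfer of at most
 * 128 bytes instead of the individual loads and stores.
 */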
3712 static void nfp_bpf_opt_ldst_gather(struct nfp_prog *nfp_prog)
3713 {
3714 	struct nfp_insn_meta *head_ld_meta = NULL;
3715 	struct nfp_insn_meta *head_st_meta = NULL;
3716 	struct nfp_insn_meta *meta1, *meta2;
3717 	struct bpf_insn *prev_ld = NULL;
3718 	struct bpf_insn *prev_st = NULL;
3719 	u8 count = 0;
3720 
3721 	nfp_for_each_insn_walk2(nfp_prog, meta1, meta2) {
3722 		struct bpf_insn *ld = &meta1->insn;
3723 		struct bpf_insn *st = &meta2->insn;
3724 
		/* Reset record status if any of the following is true:
		 *   - The current insn pair is not load/store.
		 *   - The load/store pair doesn't chain with the previous one.
		 *   - The chained load/store pair crosses the previous pair.
		 *   - The chained load/store pair has a total memory copy
		 *     size beyond 128 bytes, which is the maximum length a
		 *     single NFP CPP command can transfer.
3732 		 */
3733 		if (!curr_pair_is_memcpy(meta1, meta2) ||
3734 		    !curr_pair_chain_with_previous(meta1, meta2, prev_ld,
3735 						   prev_st) ||
3736 		    (head_ld_meta && (cross_mem_access(ld, head_ld_meta,
3737 						       head_st_meta) ||
3738 				      head_ld_meta->ldst_gather_len >= 128))) {
3739 			if (!count)
3740 				continue;
3741 
3742 			if (count > 1) {
3743 				s16 prev_ld_off = prev_ld->off;
3744 				s16 prev_st_off = prev_st->off;
3745 				s16 head_ld_off = head_ld_meta->insn.off;
3746 
3747 				if (prev_ld_off < head_ld_off) {
3748 					head_ld_meta->insn.off = prev_ld_off;
3749 					head_st_meta->insn.off = prev_st_off;
3750 					head_ld_meta->ldst_gather_len =
3751 						-head_ld_meta->ldst_gather_len;
3752 				}
3753 
3754 				head_ld_meta->paired_st = &head_st_meta->insn;
3755 				head_st_meta->skip = true;
3756 			} else {
3757 				head_ld_meta->ldst_gather_len = 0;
3758 			}
3759 
			/* If the chain is ended by a load/store pair then this
			 * pair could serve as the new head of the next chain.
3762 			 */
3763 			if (curr_pair_is_memcpy(meta1, meta2)) {
3764 				head_ld_meta = meta1;
3765 				head_st_meta = meta2;
3766 				head_ld_meta->ldst_gather_len =
3767 					BPF_LDST_BYTES(ld);
3768 				meta1 = nfp_meta_next(meta1);
3769 				meta2 = nfp_meta_next(meta2);
3770 				prev_ld = ld;
3771 				prev_st = st;
3772 				count = 1;
3773 			} else {
3774 				head_ld_meta = NULL;
3775 				head_st_meta = NULL;
3776 				prev_ld = NULL;
3777 				prev_st = NULL;
3778 				count = 0;
3779 			}
3780 
3781 			continue;
3782 		}
3783 
3784 		if (!head_ld_meta) {
3785 			head_ld_meta = meta1;
3786 			head_st_meta = meta2;
3787 		} else {
3788 			meta1->skip = true;
3789 			meta2->skip = true;
3790 		}
3791 
3792 		head_ld_meta->ldst_gather_len += BPF_LDST_BYTES(ld);
3793 		meta1 = nfp_meta_next(meta1);
3794 		meta2 = nfp_meta_next(meta2);
3795 		prev_ld = ld;
3796 		prev_st = st;
3797 		count++;
3798 	}
3799 }
3800 
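/* Group packet loads which share the same packet pointer ID and variable
 * offset into ranges of at most 64 bytes.  The first load of each range has
 * pkt_cache.do_init set and records the range bounds; the remaining loads
 * of the range inherit those bounds, so code generation can serve them from
 * one cached read of the packet data.  A jump destination, a helper call,
 * a packet store or a classic load/store ends the current range.
 */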
3801 static void nfp_bpf_opt_pkt_cache(struct nfp_prog *nfp_prog)
3802 {
3803 	struct nfp_insn_meta *meta, *range_node = NULL;
3804 	s16 range_start = 0, range_end = 0;
3805 	bool cache_avail = false;
3806 	struct bpf_insn *insn;
3807 	s32 range_ptr_off = 0;
3808 	u32 range_ptr_id = 0;
3809 
3810 	list_for_each_entry(meta, &nfp_prog->insns, l) {
3811 		if (meta->flags & FLAG_INSN_IS_JUMP_DST)
3812 			cache_avail = false;
3813 
3814 		if (meta->skip)
3815 			continue;
3816 
3817 		insn = &meta->insn;
3818 
3819 		if (is_mbpf_store_pkt(meta) ||
3820 		    insn->code == (BPF_JMP | BPF_CALL) ||
3821 		    is_mbpf_classic_store_pkt(meta) ||
3822 		    is_mbpf_classic_load(meta)) {
3823 			cache_avail = false;
3824 			continue;
3825 		}
3826 
3827 		if (!is_mbpf_load(meta))
3828 			continue;
3829 
3830 		if (meta->ptr.type != PTR_TO_PACKET || meta->ldst_gather_len) {
3831 			cache_avail = false;
3832 			continue;
3833 		}
3834 
3835 		if (!cache_avail) {
3836 			cache_avail = true;
3837 			if (range_node)
3838 				goto end_current_then_start_new;
3839 			goto start_new;
3840 		}
3841 
3842 		/* Check ID to make sure two reads share the same
3843 		 * variable offset against PTR_TO_PACKET, and check OFF
3844 		 * to make sure they also share the same constant
3845 		 * offset.
3846 		 *
3847 		 * OFFs don't really need to be the same, because they
3848 		 * are the constant offsets against PTR_TO_PACKET, so
3849 		 * for different OFFs, we could canonicalize them to
		 * offsets against the original packet pointer.  We don't
		 * support this.
3852 		 */
3853 		if (meta->ptr.id == range_ptr_id &&
3854 		    meta->ptr.off == range_ptr_off) {
3855 			s16 new_start = range_start;
3856 			s16 end, off = insn->off;
3857 			s16 new_end = range_end;
3858 			bool changed = false;
3859 
3860 			if (off < range_start) {
3861 				new_start = off;
3862 				changed = true;
3863 			}
3864 
3865 			end = off + BPF_LDST_BYTES(insn);
3866 			if (end > range_end) {
3867 				new_end = end;
3868 				changed = true;
3869 			}
3870 
3871 			if (!changed)
3872 				continue;
3873 
3874 			if (new_end - new_start <= 64) {
3875 				/* Install new range. */
3876 				range_start = new_start;
3877 				range_end = new_end;
3878 				continue;
3879 			}
3880 		}
3881 
3882 end_current_then_start_new:
3883 		range_node->pkt_cache.range_start = range_start;
3884 		range_node->pkt_cache.range_end = range_end;
3885 start_new:
3886 		range_node = meta;
3887 		range_node->pkt_cache.do_init = true;
3888 		range_ptr_id = range_node->ptr.id;
3889 		range_ptr_off = range_node->ptr.off;
3890 		range_start = insn->off;
3891 		range_end = insn->off + BPF_LDST_BYTES(insn);
3892 	}
3893 
3894 	if (range_node) {
3895 		range_node->pkt_cache.range_start = range_start;
3896 		range_node->pkt_cache.range_end = range_end;
3897 	}
3898 
3899 	list_for_each_entry(meta, &nfp_prog->insns, l) {
3900 		if (meta->skip)
3901 			continue;
3902 
3903 		if (is_mbpf_load_pkt(meta) && !meta->ldst_gather_len) {
3904 			if (meta->pkt_cache.do_init) {
3905 				range_start = meta->pkt_cache.range_start;
3906 				range_end = meta->pkt_cache.range_end;
3907 			} else {
3908 				meta->pkt_cache.range_start = range_start;
3909 				meta->pkt_cache.range_end = range_end;
3910 			}
3911 		}
3912 	}
3913 }
3914 
3915 static int nfp_bpf_optimize(struct nfp_prog *nfp_prog)
3916 {
3917 	nfp_bpf_opt_reg_init(nfp_prog);
3918 
3919 	nfp_bpf_opt_neg_add_sub(nfp_prog);
3920 	nfp_bpf_opt_ld_mask(nfp_prog);
3921 	nfp_bpf_opt_ld_shift(nfp_prog);
3922 	nfp_bpf_opt_ldst_gather(nfp_prog);
3923 	nfp_bpf_opt_pkt_cache(nfp_prog);
3924 
3925 	return 0;
3926 }
3927 
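/* Replace the 64-bit map pointer carried by each BPF_LD | BPF_IMM | BPF_DW
 * pseudo map-FD instruction pair with an ID the firmware can use: the NFP
 * table ID for offloaded maps, or the kernel map ID for offload-neutral
 * maps.
 */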
3928 static int nfp_bpf_replace_map_ptrs(struct nfp_prog *nfp_prog)
3929 {
3930 	struct nfp_insn_meta *meta1, *meta2;
3931 	struct nfp_bpf_map *nfp_map;
3932 	struct bpf_map *map;
3933 	u32 id;
3934 
3935 	nfp_for_each_insn_walk2(nfp_prog, meta1, meta2) {
3936 		if (meta1->skip || meta2->skip)
3937 			continue;
3938 
3939 		if (meta1->insn.code != (BPF_LD | BPF_IMM | BPF_DW) ||
3940 		    meta1->insn.src_reg != BPF_PSEUDO_MAP_FD)
3941 			continue;
3942 
3943 		map = (void *)(unsigned long)((u32)meta1->insn.imm |
3944 					      (u64)meta2->insn.imm << 32);
3945 		if (bpf_map_offload_neutral(map)) {
3946 			id = map->id;
3947 		} else {
3948 			nfp_map = map_to_offmap(map)->dev_priv;
3949 			id = nfp_map->tid;
3950 		}
3951 
3952 		meta1->insn.imm = id;
3953 		meta2->insn.imm = 0;
3954 	}
3955 
3956 	return 0;
3957 }
3958 
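/* Validate every instruction for the NFP code store and convert it in place
 * to little-endian with the ECC bits calculated.
 */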
3959 static int nfp_bpf_ustore_calc(u64 *prog, unsigned int len)
3960 {
3961 	__le64 *ustore = (__force __le64 *)prog;
3962 	int i;
3963 
3964 	for (i = 0; i < len; i++) {
3965 		int err;
3966 
3967 		err = nfp_ustore_check_valid_no_ecc(prog[i]);
3968 		if (err)
3969 			return err;
3970 
3971 		ustore[i] = cpu_to_le64(nfp_ustore_calc_ecc_insn(prog[i]));
3972 	}
3973 
3974 	return 0;
3975 }
3976 
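/* Shrink the instruction buffer to the final program length.  On allocation
 * failure the original, larger buffer is simply kept.
 */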
3977 static void nfp_bpf_prog_trim(struct nfp_prog *nfp_prog)
3978 {
3979 	void *prog;
3980 
3981 	prog = kvmalloc_array(nfp_prog->prog_len, sizeof(u64), GFP_KERNEL);
3982 	if (!prog)
3983 		return;
3984 
3985 	nfp_prog->__prog_alloc_len = nfp_prog->prog_len * sizeof(u64);
3986 	memcpy(prog, nfp_prog->prog, nfp_prog->__prog_alloc_len);
3987 	kvfree(nfp_prog->prog);
3988 	nfp_prog->prog = prog;
3989 }
3990 
3991 int nfp_bpf_jit(struct nfp_prog *nfp_prog)
3992 {
3993 	int ret;
3994 
3995 	ret = nfp_bpf_replace_map_ptrs(nfp_prog);
3996 	if (ret)
3997 		return ret;
3998 
3999 	ret = nfp_bpf_optimize(nfp_prog);
4000 	if (ret)
4001 		return ret;
4002 
4003 	ret = nfp_translate(nfp_prog);
4004 	if (ret) {
4005 		pr_err("Translation failed with error %d (translated: %u)\n",
4006 		       ret, nfp_prog->n_translated);
4007 		return -EINVAL;
4008 	}
4009 
4010 	nfp_bpf_prog_trim(nfp_prog);
4011 
4012 	return ret;
4013 }
4014 
4015 void nfp_bpf_jit_prepare(struct nfp_prog *nfp_prog, unsigned int cnt)
4016 {
4017 	struct nfp_insn_meta *meta;
4018 
4019 	/* Another pass to record jump information. */
4020 	list_for_each_entry(meta, &nfp_prog->insns, l) {
4021 		struct nfp_insn_meta *dst_meta;
4022 		u64 code = meta->insn.code;
4023 		unsigned int dst_idx;
4024 		bool pseudo_call;
4025 
4026 		if (BPF_CLASS(code) != BPF_JMP)
4027 			continue;
4028 		if (BPF_OP(code) == BPF_EXIT)
4029 			continue;
4030 		if (is_mbpf_helper_call(meta))
4031 			continue;
4032 
		/* If the opcode is BPF_CALL at this point, this can only be a
		 * BPF-to-BPF call (a.k.a. pseudo call).
4035 		 */
4036 		pseudo_call = BPF_OP(code) == BPF_CALL;
4037 
4038 		if (pseudo_call)
4039 			dst_idx = meta->n + 1 + meta->insn.imm;
4040 		else
4041 			dst_idx = meta->n + 1 + meta->insn.off;
4042 
4043 		dst_meta = nfp_bpf_goto_meta(nfp_prog, meta, dst_idx, cnt);
4044 
4045 		if (pseudo_call)
4046 			dst_meta->flags |= FLAG_INSN_IS_SUBPROG_START;
4047 
4048 		dst_meta->flags |= FLAG_INSN_IS_JUMP_DST;
4049 		meta->jmp_dst = dst_meta;
4050 	}
4051 }
4052 
4053 bool nfp_bpf_supported_opcode(u8 code)
4054 {
4055 	return !!instr_cb[code];
4056 }
4057 
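/* Produce a relocated copy of the translated program for a single vNIC:
 * relative branches and immediates are adjusted by the vNIC's start offset,
 * exit and abort branches are pointed at the program's outro targets,
 * next-packet branches and helper calls at the firmware-provided addresses,
 * and the result is ECC-annotated for the code store.  Returns the new
 * buffer or an ERR_PTR() on failure.
 */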
4058 void *nfp_bpf_relo_for_vnic(struct nfp_prog *nfp_prog, struct nfp_bpf_vnic *bv)
4059 {
4060 	unsigned int i;
4061 	u64 *prog;
4062 	int err;
4063 
4064 	prog = kmemdup(nfp_prog->prog, nfp_prog->prog_len * sizeof(u64),
4065 		       GFP_KERNEL);
4066 	if (!prog)
4067 		return ERR_PTR(-ENOMEM);
4068 
4069 	for (i = 0; i < nfp_prog->prog_len; i++) {
4070 		enum nfp_relo_type special;
4071 		u32 val;
4072 
4073 		special = FIELD_GET(OP_RELO_TYPE, prog[i]);
4074 		switch (special) {
4075 		case RELO_NONE:
4076 			continue;
4077 		case RELO_BR_REL:
4078 			br_add_offset(&prog[i], bv->start_off);
4079 			break;
4080 		case RELO_BR_GO_OUT:
4081 			br_set_offset(&prog[i],
4082 				      nfp_prog->tgt_out + bv->start_off);
4083 			break;
4084 		case RELO_BR_GO_ABORT:
4085 			br_set_offset(&prog[i],
4086 				      nfp_prog->tgt_abort + bv->start_off);
4087 			break;
4088 		case RELO_BR_NEXT_PKT:
4089 			br_set_offset(&prog[i], bv->tgt_done);
4090 			break;
4091 		case RELO_BR_HELPER:
4092 			val = br_get_offset(prog[i]);
4093 			val -= BR_OFF_RELO;
4094 			switch (val) {
4095 			case BPF_FUNC_map_lookup_elem:
4096 				val = nfp_prog->bpf->helpers.map_lookup;
4097 				break;
4098 			case BPF_FUNC_map_update_elem:
4099 				val = nfp_prog->bpf->helpers.map_update;
4100 				break;
4101 			case BPF_FUNC_map_delete_elem:
4102 				val = nfp_prog->bpf->helpers.map_delete;
4103 				break;
4104 			case BPF_FUNC_perf_event_output:
4105 				val = nfp_prog->bpf->helpers.perf_event_output;
4106 				break;
4107 			default:
4108 				pr_err("relocation of unknown helper %d\n",
4109 				       val);
4110 				err = -EINVAL;
4111 				goto err_free_prog;
4112 			}
4113 			br_set_offset(&prog[i], val);
4114 			break;
4115 		case RELO_IMMED_REL:
4116 			immed_add_value(&prog[i], bv->start_off);
4117 			break;
4118 		}
4119 
4120 		prog[i] &= ~OP_RELO_TYPE;
4121 	}
4122 
4123 	err = nfp_bpf_ustore_calc(prog, nfp_prog->prog_len);
4124 	if (err)
4125 		goto err_free_prog;
4126 
4127 	return prog;
4128 
4129 err_free_prog:
4130 	kfree(prog);
4131 	return ERR_PTR(err);
4132 }
4133