xref: /openbmc/linux/drivers/net/ethernet/netronome/nfp/bpf/jit.c (revision 5ef12cb4a3a78ffb331c03a795a15eea4ae35155)
1 /*
2  * Copyright (C) 2016-2017 Netronome Systems, Inc.
3  *
4  * This software is dual licensed under the GNU General License Version 2,
5  * June 1991 as shown in the file COPYING in the top-level directory of this
6  * source tree or the BSD 2-Clause License provided below.  You have the
7  * option to license this software under the complete terms of either license.
8  *
9  * The BSD 2-Clause License:
10  *
11  *     Redistribution and use in source and binary forms, with or
12  *     without modification, are permitted provided that the following
13  *     conditions are met:
14  *
15  *      1. Redistributions of source code must retain the above
16  *         copyright notice, this list of conditions and the following
17  *         disclaimer.
18  *
19  *      2. Redistributions in binary form must reproduce the above
20  *         copyright notice, this list of conditions and the following
21  *         disclaimer in the documentation and/or other materials
22  *         provided with the distribution.
23  *
24  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
27  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
28  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
29  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
30  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
31  * SOFTWARE.
32  */
33 
34 #define pr_fmt(fmt)	"NFP net bpf: " fmt
35 
36 #include <linux/bug.h>
37 #include <linux/kernel.h>
38 #include <linux/bpf.h>
39 #include <linux/filter.h>
40 #include <linux/pkt_cls.h>
41 #include <linux/unistd.h>
42 
43 #include "main.h"
44 #include "../nfp_asm.h"
45 
46 /* --- NFP prog --- */
/* Foreach "multiple" entries macros provide pos and next<n> pointers.
 * It's safe to modify the next pointers (but not pos).
 *
 * nfp_for_each_insn_walk2() walks the instruction list of @nfp_prog with two
 * cursors: @pos, and @next which starts out as the entry after @pos.  The
 * loop ends as soon as either cursor points back at the list head.  On every
 * step @pos is advanced with nfp_meta_next() and @next is re-derived from the
 * new @pos, which is why the loop body may change @next but must leave @pos
 * alone.
 */
#define nfp_for_each_insn_walk2(nfp_prog, pos, next)			\
	for (pos = list_first_entry(&(nfp_prog)->insns, typeof(*pos), l), \
	     next = list_next_entry(pos, l);			\
	     &(nfp_prog)->insns != &pos->l &&			\
	     &(nfp_prog)->insns != &next->l;			\
	     pos = nfp_meta_next(pos),				\
	     next = nfp_meta_next(pos))
57 
/* nfp_for_each_insn_walk3() walks the instruction list of @nfp_prog with
 * three cursors: @pos, @next (initially the entry after @pos) and @next2
 * (initially the entry after @next).  The loop ends as soon as any of the
 * three points back at the list head.  On every step @pos is advanced with
 * nfp_meta_next(), then @next is re-derived from @pos and @next2 from @next.
 */
#define nfp_for_each_insn_walk3(nfp_prog, pos, next, next2)		\
	for (pos = list_first_entry(&(nfp_prog)->insns, typeof(*pos), l), \
	     next = list_next_entry(pos, l),			\
	     next2 = list_next_entry(next, l);			\
	     &(nfp_prog)->insns != &pos->l &&			\
	     &(nfp_prog)->insns != &next->l &&			\
	     &(nfp_prog)->insns != &next2->l;			\
	     pos = nfp_meta_next(pos),				\
	     next = nfp_meta_next(pos),				\
	     next2 = nfp_meta_next(next))
68 
69 static bool
70 nfp_meta_has_prev(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
71 {
72 	return meta->l.prev != &nfp_prog->insns;
73 }
74 
75 static void nfp_prog_push(struct nfp_prog *nfp_prog, u64 insn)
76 {
77 	if (nfp_prog->__prog_alloc_len / sizeof(u64) == nfp_prog->prog_len) {
78 		pr_warn("instruction limit reached (%u NFP instructions)\n",
79 			nfp_prog->prog_len);
80 		nfp_prog->error = -ENOSPC;
81 		return;
82 	}
83 
84 	nfp_prog->prog[nfp_prog->prog_len] = insn;
85 	nfp_prog->prog_len++;
86 }
87 
88 static unsigned int nfp_prog_current_offset(struct nfp_prog *nfp_prog)
89 {
90 	return nfp_prog->prog_len;
91 }
92 
93 static bool
94 nfp_prog_confirm_current_offset(struct nfp_prog *nfp_prog, unsigned int off)
95 {
96 	/* If there is a recorded error we may have dropped instructions;
97 	 * that doesn't have to be due to translator bug, and the translation
98 	 * will fail anyway, so just return OK.
99 	 */
100 	if (nfp_prog->error)
101 		return true;
102 	return !WARN_ON_ONCE(nfp_prog_current_offset(nfp_prog) != off);
103 }
104 
105 /* --- Emitters --- */
106 static void
107 __emit_cmd(struct nfp_prog *nfp_prog, enum cmd_tgt_map op,
108 	   u8 mode, u8 xfer, u8 areg, u8 breg, u8 size, enum cmd_ctx_swap ctx,
109 	   bool indir)
110 {
111 	u64 insn;
112 
113 	insn =	FIELD_PREP(OP_CMD_A_SRC, areg) |
114 		FIELD_PREP(OP_CMD_CTX, ctx) |
115 		FIELD_PREP(OP_CMD_B_SRC, breg) |
116 		FIELD_PREP(OP_CMD_TOKEN, cmd_tgt_act[op].token) |
117 		FIELD_PREP(OP_CMD_XFER, xfer) |
118 		FIELD_PREP(OP_CMD_CNT, size) |
119 		FIELD_PREP(OP_CMD_SIG, ctx != CMD_CTX_NO_SWAP) |
120 		FIELD_PREP(OP_CMD_TGT_CMD, cmd_tgt_act[op].tgt_cmd) |
121 		FIELD_PREP(OP_CMD_INDIR, indir) |
122 		FIELD_PREP(OP_CMD_MODE, mode);
123 
124 	nfp_prog_push(nfp_prog, insn);
125 }
126 
127 static void
128 emit_cmd_any(struct nfp_prog *nfp_prog, enum cmd_tgt_map op, u8 mode, u8 xfer,
129 	     swreg lreg, swreg rreg, u8 size, enum cmd_ctx_swap ctx, bool indir)
130 {
131 	struct nfp_insn_re_regs reg;
132 	int err;
133 
134 	err = swreg_to_restricted(reg_none(), lreg, rreg, &reg, false);
135 	if (err) {
136 		nfp_prog->error = err;
137 		return;
138 	}
139 	if (reg.swap) {
140 		pr_err("cmd can't swap arguments\n");
141 		nfp_prog->error = -EFAULT;
142 		return;
143 	}
144 	if (reg.dst_lmextn || reg.src_lmextn) {
145 		pr_err("cmd can't use LMextn\n");
146 		nfp_prog->error = -EFAULT;
147 		return;
148 	}
149 
150 	__emit_cmd(nfp_prog, op, mode, xfer, reg.areg, reg.breg, size, ctx,
151 		   indir);
152 }
153 
154 static void
155 emit_cmd(struct nfp_prog *nfp_prog, enum cmd_tgt_map op, u8 mode, u8 xfer,
156 	 swreg lreg, swreg rreg, u8 size, enum cmd_ctx_swap ctx)
157 {
158 	emit_cmd_any(nfp_prog, op, mode, xfer, lreg, rreg, size, ctx, false);
159 }
160 
161 static void
162 emit_cmd_indir(struct nfp_prog *nfp_prog, enum cmd_tgt_map op, u8 mode, u8 xfer,
163 	       swreg lreg, swreg rreg, u8 size, enum cmd_ctx_swap ctx)
164 {
165 	emit_cmd_any(nfp_prog, op, mode, xfer, lreg, rreg, size, ctx, true);
166 }
167 
168 static void
169 __emit_br(struct nfp_prog *nfp_prog, enum br_mask mask, enum br_ev_pip ev_pip,
170 	  enum br_ctx_signal_state css, u16 addr, u8 defer)
171 {
172 	u16 addr_lo, addr_hi;
173 	u64 insn;
174 
175 	addr_lo = addr & (OP_BR_ADDR_LO >> __bf_shf(OP_BR_ADDR_LO));
176 	addr_hi = addr != addr_lo;
177 
178 	insn = OP_BR_BASE |
179 		FIELD_PREP(OP_BR_MASK, mask) |
180 		FIELD_PREP(OP_BR_EV_PIP, ev_pip) |
181 		FIELD_PREP(OP_BR_CSS, css) |
182 		FIELD_PREP(OP_BR_DEFBR, defer) |
183 		FIELD_PREP(OP_BR_ADDR_LO, addr_lo) |
184 		FIELD_PREP(OP_BR_ADDR_HI, addr_hi);
185 
186 	nfp_prog_push(nfp_prog, insn);
187 }
188 
189 static void
190 emit_br_relo(struct nfp_prog *nfp_prog, enum br_mask mask, u16 addr, u8 defer,
191 	     enum nfp_relo_type relo)
192 {
193 	if (mask == BR_UNC && defer > 2) {
194 		pr_err("BUG: branch defer out of bounds %d\n", defer);
195 		nfp_prog->error = -EFAULT;
196 		return;
197 	}
198 
199 	__emit_br(nfp_prog, mask,
200 		  mask != BR_UNC ? BR_EV_PIP_COND : BR_EV_PIP_UNCOND,
201 		  BR_CSS_NONE, addr, defer);
202 
203 	nfp_prog->prog[nfp_prog->prog_len - 1] |=
204 		FIELD_PREP(OP_RELO_TYPE, relo);
205 }
206 
207 static void
208 emit_br(struct nfp_prog *nfp_prog, enum br_mask mask, u16 addr, u8 defer)
209 {
210 	emit_br_relo(nfp_prog, mask, addr, defer, RELO_BR_REL);
211 }
212 
213 static void
214 __emit_immed(struct nfp_prog *nfp_prog, u16 areg, u16 breg, u16 imm_hi,
215 	     enum immed_width width, bool invert,
216 	     enum immed_shift shift, bool wr_both,
217 	     bool dst_lmextn, bool src_lmextn)
218 {
219 	u64 insn;
220 
221 	insn = OP_IMMED_BASE |
222 		FIELD_PREP(OP_IMMED_A_SRC, areg) |
223 		FIELD_PREP(OP_IMMED_B_SRC, breg) |
224 		FIELD_PREP(OP_IMMED_IMM, imm_hi) |
225 		FIELD_PREP(OP_IMMED_WIDTH, width) |
226 		FIELD_PREP(OP_IMMED_INV, invert) |
227 		FIELD_PREP(OP_IMMED_SHIFT, shift) |
228 		FIELD_PREP(OP_IMMED_WR_AB, wr_both) |
229 		FIELD_PREP(OP_IMMED_SRC_LMEXTN, src_lmextn) |
230 		FIELD_PREP(OP_IMMED_DST_LMEXTN, dst_lmextn);
231 
232 	nfp_prog_push(nfp_prog, insn);
233 }
234 
235 static void
236 emit_immed(struct nfp_prog *nfp_prog, swreg dst, u16 imm,
237 	   enum immed_width width, bool invert, enum immed_shift shift)
238 {
239 	struct nfp_insn_ur_regs reg;
240 	int err;
241 
242 	if (swreg_type(dst) == NN_REG_IMM) {
243 		nfp_prog->error = -EFAULT;
244 		return;
245 	}
246 
247 	err = swreg_to_unrestricted(dst, dst, reg_imm(imm & 0xff), &reg);
248 	if (err) {
249 		nfp_prog->error = err;
250 		return;
251 	}
252 
253 	/* Use reg.dst when destination is No-Dest. */
254 	__emit_immed(nfp_prog,
255 		     swreg_type(dst) == NN_REG_NONE ? reg.dst : reg.areg,
256 		     reg.breg, imm >> 8, width, invert, shift,
257 		     reg.wr_both, reg.dst_lmextn, reg.src_lmextn);
258 }
259 
260 static void
261 __emit_shf(struct nfp_prog *nfp_prog, u16 dst, enum alu_dst_ab dst_ab,
262 	   enum shf_sc sc, u8 shift,
263 	   u16 areg, enum shf_op op, u16 breg, bool i8, bool sw, bool wr_both,
264 	   bool dst_lmextn, bool src_lmextn)
265 {
266 	u64 insn;
267 
268 	if (!FIELD_FIT(OP_SHF_SHIFT, shift)) {
269 		nfp_prog->error = -EFAULT;
270 		return;
271 	}
272 
273 	if (sc == SHF_SC_L_SHF)
274 		shift = 32 - shift;
275 
276 	insn = OP_SHF_BASE |
277 		FIELD_PREP(OP_SHF_A_SRC, areg) |
278 		FIELD_PREP(OP_SHF_SC, sc) |
279 		FIELD_PREP(OP_SHF_B_SRC, breg) |
280 		FIELD_PREP(OP_SHF_I8, i8) |
281 		FIELD_PREP(OP_SHF_SW, sw) |
282 		FIELD_PREP(OP_SHF_DST, dst) |
283 		FIELD_PREP(OP_SHF_SHIFT, shift) |
284 		FIELD_PREP(OP_SHF_OP, op) |
285 		FIELD_PREP(OP_SHF_DST_AB, dst_ab) |
286 		FIELD_PREP(OP_SHF_WR_AB, wr_both) |
287 		FIELD_PREP(OP_SHF_SRC_LMEXTN, src_lmextn) |
288 		FIELD_PREP(OP_SHF_DST_LMEXTN, dst_lmextn);
289 
290 	nfp_prog_push(nfp_prog, insn);
291 }
292 
293 static void
294 emit_shf(struct nfp_prog *nfp_prog, swreg dst,
295 	 swreg lreg, enum shf_op op, swreg rreg, enum shf_sc sc, u8 shift)
296 {
297 	struct nfp_insn_re_regs reg;
298 	int err;
299 
300 	err = swreg_to_restricted(dst, lreg, rreg, &reg, true);
301 	if (err) {
302 		nfp_prog->error = err;
303 		return;
304 	}
305 
306 	__emit_shf(nfp_prog, reg.dst, reg.dst_ab, sc, shift,
307 		   reg.areg, op, reg.breg, reg.i8, reg.swap, reg.wr_both,
308 		   reg.dst_lmextn, reg.src_lmextn);
309 }
310 
311 static void
312 __emit_alu(struct nfp_prog *nfp_prog, u16 dst, enum alu_dst_ab dst_ab,
313 	   u16 areg, enum alu_op op, u16 breg, bool swap, bool wr_both,
314 	   bool dst_lmextn, bool src_lmextn)
315 {
316 	u64 insn;
317 
318 	insn = OP_ALU_BASE |
319 		FIELD_PREP(OP_ALU_A_SRC, areg) |
320 		FIELD_PREP(OP_ALU_B_SRC, breg) |
321 		FIELD_PREP(OP_ALU_DST, dst) |
322 		FIELD_PREP(OP_ALU_SW, swap) |
323 		FIELD_PREP(OP_ALU_OP, op) |
324 		FIELD_PREP(OP_ALU_DST_AB, dst_ab) |
325 		FIELD_PREP(OP_ALU_WR_AB, wr_both) |
326 		FIELD_PREP(OP_ALU_SRC_LMEXTN, src_lmextn) |
327 		FIELD_PREP(OP_ALU_DST_LMEXTN, dst_lmextn);
328 
329 	nfp_prog_push(nfp_prog, insn);
330 }
331 
332 static void
333 emit_alu(struct nfp_prog *nfp_prog, swreg dst,
334 	 swreg lreg, enum alu_op op, swreg rreg)
335 {
336 	struct nfp_insn_ur_regs reg;
337 	int err;
338 
339 	err = swreg_to_unrestricted(dst, lreg, rreg, &reg);
340 	if (err) {
341 		nfp_prog->error = err;
342 		return;
343 	}
344 
345 	__emit_alu(nfp_prog, reg.dst, reg.dst_ab,
346 		   reg.areg, op, reg.breg, reg.swap, reg.wr_both,
347 		   reg.dst_lmextn, reg.src_lmextn);
348 }
349 
350 static void
351 __emit_ld_field(struct nfp_prog *nfp_prog, enum shf_sc sc,
352 		u8 areg, u8 bmask, u8 breg, u8 shift, bool imm8,
353 		bool zero, bool swap, bool wr_both,
354 		bool dst_lmextn, bool src_lmextn)
355 {
356 	u64 insn;
357 
358 	insn = OP_LDF_BASE |
359 		FIELD_PREP(OP_LDF_A_SRC, areg) |
360 		FIELD_PREP(OP_LDF_SC, sc) |
361 		FIELD_PREP(OP_LDF_B_SRC, breg) |
362 		FIELD_PREP(OP_LDF_I8, imm8) |
363 		FIELD_PREP(OP_LDF_SW, swap) |
364 		FIELD_PREP(OP_LDF_ZF, zero) |
365 		FIELD_PREP(OP_LDF_BMASK, bmask) |
366 		FIELD_PREP(OP_LDF_SHF, shift) |
367 		FIELD_PREP(OP_LDF_WR_AB, wr_both) |
368 		FIELD_PREP(OP_LDF_SRC_LMEXTN, src_lmextn) |
369 		FIELD_PREP(OP_LDF_DST_LMEXTN, dst_lmextn);
370 
371 	nfp_prog_push(nfp_prog, insn);
372 }
373 
374 static void
375 emit_ld_field_any(struct nfp_prog *nfp_prog, swreg dst, u8 bmask, swreg src,
376 		  enum shf_sc sc, u8 shift, bool zero)
377 {
378 	struct nfp_insn_re_regs reg;
379 	int err;
380 
381 	/* Note: ld_field is special as it uses one of the src regs as dst */
382 	err = swreg_to_restricted(dst, dst, src, &reg, true);
383 	if (err) {
384 		nfp_prog->error = err;
385 		return;
386 	}
387 
388 	__emit_ld_field(nfp_prog, sc, reg.areg, bmask, reg.breg, shift,
389 			reg.i8, zero, reg.swap, reg.wr_both,
390 			reg.dst_lmextn, reg.src_lmextn);
391 }
392 
393 static void
394 emit_ld_field(struct nfp_prog *nfp_prog, swreg dst, u8 bmask, swreg src,
395 	      enum shf_sc sc, u8 shift)
396 {
397 	emit_ld_field_any(nfp_prog, dst, bmask, src, sc, shift, false);
398 }
399 
400 static void
401 __emit_lcsr(struct nfp_prog *nfp_prog, u16 areg, u16 breg, bool wr, u16 addr,
402 	    bool dst_lmextn, bool src_lmextn)
403 {
404 	u64 insn;
405 
406 	insn = OP_LCSR_BASE |
407 		FIELD_PREP(OP_LCSR_A_SRC, areg) |
408 		FIELD_PREP(OP_LCSR_B_SRC, breg) |
409 		FIELD_PREP(OP_LCSR_WRITE, wr) |
410 		FIELD_PREP(OP_LCSR_ADDR, addr / 4) |
411 		FIELD_PREP(OP_LCSR_SRC_LMEXTN, src_lmextn) |
412 		FIELD_PREP(OP_LCSR_DST_LMEXTN, dst_lmextn);
413 
414 	nfp_prog_push(nfp_prog, insn);
415 }
416 
417 static void emit_csr_wr(struct nfp_prog *nfp_prog, swreg src, u16 addr)
418 {
419 	struct nfp_insn_ur_regs reg;
420 	int err;
421 
422 	/* This instruction takes immeds instead of reg_none() for the ignored
423 	 * operand, but we can't encode 2 immeds in one instr with our normal
424 	 * swreg infra so if param is an immed, we encode as reg_none() and
425 	 * copy the immed to both operands.
426 	 */
427 	if (swreg_type(src) == NN_REG_IMM) {
428 		err = swreg_to_unrestricted(reg_none(), src, reg_none(), &reg);
429 		reg.breg = reg.areg;
430 	} else {
431 		err = swreg_to_unrestricted(reg_none(), src, reg_imm(0), &reg);
432 	}
433 	if (err) {
434 		nfp_prog->error = err;
435 		return;
436 	}
437 
438 	__emit_lcsr(nfp_prog, reg.areg, reg.breg, true, addr,
439 		    false, reg.src_lmextn);
440 }
441 
442 /* CSR value is read in following immed[gpr, 0] */
443 static void __emit_csr_rd(struct nfp_prog *nfp_prog, u16 addr)
444 {
445 	__emit_lcsr(nfp_prog, 0, 0, false, addr, false, false);
446 }
447 
448 static void emit_nop(struct nfp_prog *nfp_prog)
449 {
450 	__emit_immed(nfp_prog, UR_REG_IMM, UR_REG_IMM, 0, 0, 0, 0, 0, 0, 0);
451 }
452 
453 /* --- Wrappers --- */
454 static bool pack_immed(u32 imm, u16 *val, enum immed_shift *shift)
455 {
456 	if (!(imm & 0xffff0000)) {
457 		*val = imm;
458 		*shift = IMMED_SHIFT_0B;
459 	} else if (!(imm & 0xff0000ff)) {
460 		*val = imm >> 8;
461 		*shift = IMMED_SHIFT_1B;
462 	} else if (!(imm & 0x0000ffff)) {
463 		*val = imm >> 16;
464 		*shift = IMMED_SHIFT_2B;
465 	} else {
466 		return false;
467 	}
468 
469 	return true;
470 }
471 
472 static void wrp_immed(struct nfp_prog *nfp_prog, swreg dst, u32 imm)
473 {
474 	enum immed_shift shift;
475 	u16 val;
476 
477 	if (pack_immed(imm, &val, &shift)) {
478 		emit_immed(nfp_prog, dst, val, IMMED_WIDTH_ALL, false, shift);
479 	} else if (pack_immed(~imm, &val, &shift)) {
480 		emit_immed(nfp_prog, dst, val, IMMED_WIDTH_ALL, true, shift);
481 	} else {
482 		emit_immed(nfp_prog, dst, imm & 0xffff, IMMED_WIDTH_ALL,
483 			   false, IMMED_SHIFT_0B);
484 		emit_immed(nfp_prog, dst, imm >> 16, IMMED_WIDTH_WORD,
485 			   false, IMMED_SHIFT_2B);
486 	}
487 }
488 
489 static void
490 wrp_immed_relo(struct nfp_prog *nfp_prog, swreg dst, u32 imm,
491 	       enum nfp_relo_type relo)
492 {
493 	if (imm > 0xffff) {
494 		pr_err("relocation of a large immediate!\n");
495 		nfp_prog->error = -EFAULT;
496 		return;
497 	}
498 	emit_immed(nfp_prog, dst, imm, IMMED_WIDTH_ALL, false, IMMED_SHIFT_0B);
499 
500 	nfp_prog->prog[nfp_prog->prog_len - 1] |=
501 		FIELD_PREP(OP_RELO_TYPE, relo);
502 }
503 
504 /* ur_load_imm_any() - encode immediate or use tmp register (unrestricted)
505  * If the @imm is small enough encode it directly in operand and return
506  * otherwise load @imm to a spare register and return its encoding.
507  */
508 static swreg ur_load_imm_any(struct nfp_prog *nfp_prog, u32 imm, swreg tmp_reg)
509 {
510 	if (FIELD_FIT(UR_REG_IMM_MAX, imm))
511 		return reg_imm(imm);
512 
513 	wrp_immed(nfp_prog, tmp_reg, imm);
514 	return tmp_reg;
515 }
516 
517 /* re_load_imm_any() - encode immediate or use tmp register (restricted)
518  * If the @imm is small enough encode it directly in operand and return
519  * otherwise load @imm to a spare register and return its encoding.
520  */
521 static swreg re_load_imm_any(struct nfp_prog *nfp_prog, u32 imm, swreg tmp_reg)
522 {
523 	if (FIELD_FIT(RE_REG_IMM_MAX, imm))
524 		return reg_imm(imm);
525 
526 	wrp_immed(nfp_prog, tmp_reg, imm);
527 	return tmp_reg;
528 }
529 
/* Emit @count no-op instructions. */
static void wrp_nops(struct nfp_prog *nfp_prog, unsigned int count)
{
	unsigned int i;

	for (i = 0; i < count; i++)
		emit_nop(nfp_prog);
}
535 
536 static void wrp_mov(struct nfp_prog *nfp_prog, swreg dst, swreg src)
537 {
538 	emit_alu(nfp_prog, dst, reg_none(), ALU_OP_NONE, src);
539 }
540 
541 static void wrp_reg_mov(struct nfp_prog *nfp_prog, u16 dst, u16 src)
542 {
543 	wrp_mov(nfp_prog, reg_both(dst), reg_b(src));
544 }
545 
546 /* wrp_reg_subpart() - load @field_len bytes from @offset of @src, write the
547  * result to @dst from low end.
548  */
549 static void
550 wrp_reg_subpart(struct nfp_prog *nfp_prog, swreg dst, swreg src, u8 field_len,
551 		u8 offset)
552 {
553 	enum shf_sc sc = offset ? SHF_SC_R_SHF : SHF_SC_NONE;
554 	u8 mask = (1 << field_len) - 1;
555 
556 	emit_ld_field_any(nfp_prog, dst, mask, src, sc, offset * 8, true);
557 }
558 
559 /* wrp_reg_or_subpart() - load @field_len bytes from low end of @src, or the
560  * result to @dst from offset, there is no change on the other bits of @dst.
561  */
562 static void
563 wrp_reg_or_subpart(struct nfp_prog *nfp_prog, swreg dst, swreg src,
564 		   u8 field_len, u8 offset)
565 {
566 	enum shf_sc sc = offset ? SHF_SC_L_SHF : SHF_SC_NONE;
567 	u8 mask = ((1 << field_len) - 1) << offset;
568 
569 	emit_ld_field(nfp_prog, dst, mask, src, sc, 32 - offset * 8);
570 }
571 
572 static void
573 addr40_offset(struct nfp_prog *nfp_prog, u8 src_gpr, swreg offset,
574 	      swreg *rega, swreg *regb)
575 {
576 	if (offset == reg_imm(0)) {
577 		*rega = reg_a(src_gpr);
578 		*regb = reg_b(src_gpr + 1);
579 		return;
580 	}
581 
582 	emit_alu(nfp_prog, imm_a(nfp_prog), reg_a(src_gpr), ALU_OP_ADD, offset);
583 	emit_alu(nfp_prog, imm_b(nfp_prog), reg_b(src_gpr + 1), ALU_OP_ADD_C,
584 		 reg_imm(0));
585 	*rega = imm_a(nfp_prog);
586 	*regb = imm_b(nfp_prog);
587 }
588 
589 /* NFP has Command Push Pull bus which supports bluk memory operations. */
590 static int nfp_cpp_memcpy(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
591 {
592 	bool descending_seq = meta->ldst_gather_len < 0;
593 	s16 len = abs(meta->ldst_gather_len);
594 	swreg src_base, off;
595 	bool src_40bit_addr;
596 	unsigned int i;
597 	u8 xfer_num;
598 
599 	off = re_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog));
600 	src_40bit_addr = meta->ptr.type == PTR_TO_MAP_VALUE;
601 	src_base = reg_a(meta->insn.src_reg * 2);
602 	xfer_num = round_up(len, 4) / 4;
603 
604 	if (src_40bit_addr)
605 		addr40_offset(nfp_prog, meta->insn.src_reg, off, &src_base,
606 			      &off);
607 
608 	/* Setup PREV_ALU fields to override memory read length. */
609 	if (len > 32)
610 		wrp_immed(nfp_prog, reg_none(),
611 			  CMD_OVE_LEN | FIELD_PREP(CMD_OV_LEN, xfer_num - 1));
612 
613 	/* Memory read from source addr into transfer-in registers. */
614 	emit_cmd_any(nfp_prog, CMD_TGT_READ32_SWAP,
615 		     src_40bit_addr ? CMD_MODE_40b_BA : CMD_MODE_32b, 0,
616 		     src_base, off, xfer_num - 1, CMD_CTX_SWAP, len > 32);
617 
618 	/* Move from transfer-in to transfer-out. */
619 	for (i = 0; i < xfer_num; i++)
620 		wrp_mov(nfp_prog, reg_xfer(i), reg_xfer(i));
621 
622 	off = re_load_imm_any(nfp_prog, meta->paired_st->off, imm_b(nfp_prog));
623 
624 	if (len <= 8) {
625 		/* Use single direct_ref write8. */
626 		emit_cmd(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 0,
627 			 reg_a(meta->paired_st->dst_reg * 2), off, len - 1,
628 			 CMD_CTX_SWAP);
629 	} else if (len <= 32 && IS_ALIGNED(len, 4)) {
630 		/* Use single direct_ref write32. */
631 		emit_cmd(nfp_prog, CMD_TGT_WRITE32_SWAP, CMD_MODE_32b, 0,
632 			 reg_a(meta->paired_st->dst_reg * 2), off, xfer_num - 1,
633 			 CMD_CTX_SWAP);
634 	} else if (len <= 32) {
635 		/* Use single indirect_ref write8. */
636 		wrp_immed(nfp_prog, reg_none(),
637 			  CMD_OVE_LEN | FIELD_PREP(CMD_OV_LEN, len - 1));
638 		emit_cmd_indir(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 0,
639 			       reg_a(meta->paired_st->dst_reg * 2), off,
640 			       len - 1, CMD_CTX_SWAP);
641 	} else if (IS_ALIGNED(len, 4)) {
642 		/* Use single indirect_ref write32. */
643 		wrp_immed(nfp_prog, reg_none(),
644 			  CMD_OVE_LEN | FIELD_PREP(CMD_OV_LEN, xfer_num - 1));
645 		emit_cmd_indir(nfp_prog, CMD_TGT_WRITE32_SWAP, CMD_MODE_32b, 0,
646 			       reg_a(meta->paired_st->dst_reg * 2), off,
647 			       xfer_num - 1, CMD_CTX_SWAP);
648 	} else if (len <= 40) {
649 		/* Use one direct_ref write32 to write the first 32-bytes, then
650 		 * another direct_ref write8 to write the remaining bytes.
651 		 */
652 		emit_cmd(nfp_prog, CMD_TGT_WRITE32_SWAP, CMD_MODE_32b, 0,
653 			 reg_a(meta->paired_st->dst_reg * 2), off, 7,
654 			 CMD_CTX_SWAP);
655 
656 		off = re_load_imm_any(nfp_prog, meta->paired_st->off + 32,
657 				      imm_b(nfp_prog));
658 		emit_cmd(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 8,
659 			 reg_a(meta->paired_st->dst_reg * 2), off, len - 33,
660 			 CMD_CTX_SWAP);
661 	} else {
662 		/* Use one indirect_ref write32 to write 4-bytes aligned length,
663 		 * then another direct_ref write8 to write the remaining bytes.
664 		 */
665 		u8 new_off;
666 
667 		wrp_immed(nfp_prog, reg_none(),
668 			  CMD_OVE_LEN | FIELD_PREP(CMD_OV_LEN, xfer_num - 2));
669 		emit_cmd_indir(nfp_prog, CMD_TGT_WRITE32_SWAP, CMD_MODE_32b, 0,
670 			       reg_a(meta->paired_st->dst_reg * 2), off,
671 			       xfer_num - 2, CMD_CTX_SWAP);
672 		new_off = meta->paired_st->off + (xfer_num - 1) * 4;
673 		off = re_load_imm_any(nfp_prog, new_off, imm_b(nfp_prog));
674 		emit_cmd(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b,
675 			 xfer_num - 1, reg_a(meta->paired_st->dst_reg * 2), off,
676 			 (len & 0x3) - 1, CMD_CTX_SWAP);
677 	}
678 
679 	/* TODO: The following extra load is to make sure data flow be identical
680 	 *  before and after we do memory copy optimization.
681 	 *
682 	 *  The load destination register is not guaranteed to be dead, so we
683 	 *  need to make sure it is loaded with the value the same as before
684 	 *  this transformation.
685 	 *
686 	 *  These extra loads could be removed once we have accurate register
687 	 *  usage information.
688 	 */
689 	if (descending_seq)
690 		xfer_num = 0;
691 	else if (BPF_SIZE(meta->insn.code) != BPF_DW)
692 		xfer_num = xfer_num - 1;
693 	else
694 		xfer_num = xfer_num - 2;
695 
696 	switch (BPF_SIZE(meta->insn.code)) {
697 	case BPF_B:
698 		wrp_reg_subpart(nfp_prog, reg_both(meta->insn.dst_reg * 2),
699 				reg_xfer(xfer_num), 1,
700 				IS_ALIGNED(len, 4) ? 3 : (len & 3) - 1);
701 		break;
702 	case BPF_H:
703 		wrp_reg_subpart(nfp_prog, reg_both(meta->insn.dst_reg * 2),
704 				reg_xfer(xfer_num), 2, (len & 3) ^ 2);
705 		break;
706 	case BPF_W:
707 		wrp_mov(nfp_prog, reg_both(meta->insn.dst_reg * 2),
708 			reg_xfer(0));
709 		break;
710 	case BPF_DW:
711 		wrp_mov(nfp_prog, reg_both(meta->insn.dst_reg * 2),
712 			reg_xfer(xfer_num));
713 		wrp_mov(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1),
714 			reg_xfer(xfer_num + 1));
715 		break;
716 	}
717 
718 	if (BPF_SIZE(meta->insn.code) != BPF_DW)
719 		wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1), 0);
720 
721 	return 0;
722 }
723 
724 static int
725 data_ld(struct nfp_prog *nfp_prog, swreg offset, u8 dst_gpr, int size)
726 {
727 	unsigned int i;
728 	u16 shift, sz;
729 
730 	/* We load the value from the address indicated in @offset and then
731 	 * shift out the data we don't need.  Note: this is big endian!
732 	 */
733 	sz = max(size, 4);
734 	shift = size < 4 ? 4 - size : 0;
735 
736 	emit_cmd(nfp_prog, CMD_TGT_READ8, CMD_MODE_32b, 0,
737 		 pptr_reg(nfp_prog), offset, sz - 1, CMD_CTX_SWAP);
738 
739 	i = 0;
740 	if (shift)
741 		emit_shf(nfp_prog, reg_both(dst_gpr), reg_none(), SHF_OP_NONE,
742 			 reg_xfer(0), SHF_SC_R_SHF, shift * 8);
743 	else
744 		for (; i * 4 < size; i++)
745 			wrp_mov(nfp_prog, reg_both(dst_gpr + i), reg_xfer(i));
746 
747 	if (i < 2)
748 		wrp_immed(nfp_prog, reg_both(dst_gpr + 1), 0);
749 
750 	return 0;
751 }
752 
753 static int
754 data_ld_host_order(struct nfp_prog *nfp_prog, u8 dst_gpr,
755 		   swreg lreg, swreg rreg, int size, enum cmd_mode mode)
756 {
757 	unsigned int i;
758 	u8 mask, sz;
759 
760 	/* We load the value from the address indicated in rreg + lreg and then
761 	 * mask out the data we don't need.  Note: this is little endian!
762 	 */
763 	sz = max(size, 4);
764 	mask = size < 4 ? GENMASK(size - 1, 0) : 0;
765 
766 	emit_cmd(nfp_prog, CMD_TGT_READ32_SWAP, mode, 0,
767 		 lreg, rreg, sz / 4 - 1, CMD_CTX_SWAP);
768 
769 	i = 0;
770 	if (mask)
771 		emit_ld_field_any(nfp_prog, reg_both(dst_gpr), mask,
772 				  reg_xfer(0), SHF_SC_NONE, 0, true);
773 	else
774 		for (; i * 4 < size; i++)
775 			wrp_mov(nfp_prog, reg_both(dst_gpr + i), reg_xfer(i));
776 
777 	if (i < 2)
778 		wrp_immed(nfp_prog, reg_both(dst_gpr + 1), 0);
779 
780 	return 0;
781 }
782 
783 static int
784 data_ld_host_order_addr32(struct nfp_prog *nfp_prog, u8 src_gpr, swreg offset,
785 			  u8 dst_gpr, u8 size)
786 {
787 	return data_ld_host_order(nfp_prog, dst_gpr, reg_a(src_gpr), offset,
788 				  size, CMD_MODE_32b);
789 }
790 
791 static int
792 data_ld_host_order_addr40(struct nfp_prog *nfp_prog, u8 src_gpr, swreg offset,
793 			  u8 dst_gpr, u8 size)
794 {
795 	swreg rega, regb;
796 
797 	addr40_offset(nfp_prog, src_gpr, offset, &rega, &regb);
798 
799 	return data_ld_host_order(nfp_prog, dst_gpr, rega, regb,
800 				  size, CMD_MODE_40b_BA);
801 }
802 
803 static int
804 construct_data_ind_ld(struct nfp_prog *nfp_prog, u16 offset, u16 src, u8 size)
805 {
806 	swreg tmp_reg;
807 
808 	/* Calculate the true offset (src_reg + imm) */
809 	tmp_reg = ur_load_imm_any(nfp_prog, offset, imm_b(nfp_prog));
810 	emit_alu(nfp_prog, imm_both(nfp_prog), reg_a(src), ALU_OP_ADD, tmp_reg);
811 
812 	/* Check packet length (size guaranteed to fit b/c it's u8) */
813 	emit_alu(nfp_prog, imm_a(nfp_prog),
814 		 imm_a(nfp_prog), ALU_OP_ADD, reg_imm(size));
815 	emit_alu(nfp_prog, reg_none(),
816 		 plen_reg(nfp_prog), ALU_OP_SUB, imm_a(nfp_prog));
817 	emit_br_relo(nfp_prog, BR_BLO, BR_OFF_RELO, 0, RELO_BR_GO_ABORT);
818 
819 	/* Load data */
820 	return data_ld(nfp_prog, imm_b(nfp_prog), 0, size);
821 }
822 
823 static int construct_data_ld(struct nfp_prog *nfp_prog, u16 offset, u8 size)
824 {
825 	swreg tmp_reg;
826 
827 	/* Check packet length */
828 	tmp_reg = ur_load_imm_any(nfp_prog, offset + size, imm_a(nfp_prog));
829 	emit_alu(nfp_prog, reg_none(), plen_reg(nfp_prog), ALU_OP_SUB, tmp_reg);
830 	emit_br_relo(nfp_prog, BR_BLO, BR_OFF_RELO, 0, RELO_BR_GO_ABORT);
831 
832 	/* Load data */
833 	tmp_reg = re_load_imm_any(nfp_prog, offset, imm_b(nfp_prog));
834 	return data_ld(nfp_prog, tmp_reg, 0, size);
835 }
836 
837 static int
838 data_stx_host_order(struct nfp_prog *nfp_prog, u8 dst_gpr, swreg offset,
839 		    u8 src_gpr, u8 size)
840 {
841 	unsigned int i;
842 
843 	for (i = 0; i * 4 < size; i++)
844 		wrp_mov(nfp_prog, reg_xfer(i), reg_a(src_gpr + i));
845 
846 	emit_cmd(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 0,
847 		 reg_a(dst_gpr), offset, size - 1, CMD_CTX_SWAP);
848 
849 	return 0;
850 }
851 
852 static int
853 data_st_host_order(struct nfp_prog *nfp_prog, u8 dst_gpr, swreg offset,
854 		   u64 imm, u8 size)
855 {
856 	wrp_immed(nfp_prog, reg_xfer(0), imm);
857 	if (size == 8)
858 		wrp_immed(nfp_prog, reg_xfer(1), imm >> 32);
859 
860 	emit_cmd(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 0,
861 		 reg_a(dst_gpr), offset, size - 1, CMD_CTX_SWAP);
862 
863 	return 0;
864 }
865 
/* Callback handling one piece of an LMEM access; wrp_lmem_load() and
 * wrp_lmem_store() both have this signature.
 *
 * @gpr:	GPR being loaded into or stored from
 * @gpr_byte:	byte position within @gpr where this piece starts
 * @off:	byte offset of this piece within LMEM
 * @size:	number of bytes in this piece; the implementations reject
 *		pieces crossing a 4 byte boundary on either side
 * @first:	this is the first piece of the access
 * @new_gpr:	this piece starts a new GPR
 * @last:	this is the last piece of the access
 * @lm3:	address LMEM through LM pointer 3 rather than 0
 * @needs_inc:	the LM pointer may have to be post-incremented
 *		(presumably set by the caller when the stack pointer is not
 *		constant - confirm against mem_op_stack())
 *
 * Returns 0 on success or a negative errno.
 */
typedef int
(*lmem_step)(struct nfp_prog *nfp_prog, u8 gpr, u8 gpr_byte, s32 off,
	     unsigned int size, bool first, bool new_gpr, bool last, bool lm3,
	     bool needs_inc);
870 
871 static int
872 wrp_lmem_load(struct nfp_prog *nfp_prog, u8 dst, u8 dst_byte, s32 off,
873 	      unsigned int size, bool first, bool new_gpr, bool last, bool lm3,
874 	      bool needs_inc)
875 {
876 	bool should_inc = needs_inc && new_gpr && !last;
877 	u32 idx, src_byte;
878 	enum shf_sc sc;
879 	swreg reg;
880 	int shf;
881 	u8 mask;
882 
883 	if (WARN_ON_ONCE(dst_byte + size > 4 || off % 4 + size > 4))
884 		return -EOPNOTSUPP;
885 
886 	idx = off / 4;
887 
888 	/* Move the entire word */
889 	if (size == 4) {
890 		wrp_mov(nfp_prog, reg_both(dst),
891 			should_inc ? reg_lm_inc(3) : reg_lm(lm3 ? 3 : 0, idx));
892 		return 0;
893 	}
894 
895 	if (WARN_ON_ONCE(lm3 && idx > RE_REG_LM_IDX_MAX))
896 		return -EOPNOTSUPP;
897 
898 	src_byte = off % 4;
899 
900 	mask = (1 << size) - 1;
901 	mask <<= dst_byte;
902 
903 	if (WARN_ON_ONCE(mask > 0xf))
904 		return -EOPNOTSUPP;
905 
906 	shf = abs(src_byte - dst_byte) * 8;
907 	if (src_byte == dst_byte) {
908 		sc = SHF_SC_NONE;
909 	} else if (src_byte < dst_byte) {
910 		shf = 32 - shf;
911 		sc = SHF_SC_L_SHF;
912 	} else {
913 		sc = SHF_SC_R_SHF;
914 	}
915 
916 	/* ld_field can address fewer indexes, if offset too large do RMW.
917 	 * Because we RMV twice we waste 2 cycles on unaligned 8 byte writes.
918 	 */
919 	if (idx <= RE_REG_LM_IDX_MAX) {
920 		reg = reg_lm(lm3 ? 3 : 0, idx);
921 	} else {
922 		reg = imm_a(nfp_prog);
923 		/* If it's not the first part of the load and we start a new GPR
924 		 * that means we are loading a second part of the LMEM word into
925 		 * a new GPR.  IOW we've already looked that LMEM word and
926 		 * therefore it has been loaded into imm_a().
927 		 */
928 		if (first || !new_gpr)
929 			wrp_mov(nfp_prog, reg, reg_lm(0, idx));
930 	}
931 
932 	emit_ld_field_any(nfp_prog, reg_both(dst), mask, reg, sc, shf, new_gpr);
933 
934 	if (should_inc)
935 		wrp_mov(nfp_prog, reg_none(), reg_lm_inc(3));
936 
937 	return 0;
938 }
939 
940 static int
941 wrp_lmem_store(struct nfp_prog *nfp_prog, u8 src, u8 src_byte, s32 off,
942 	       unsigned int size, bool first, bool new_gpr, bool last, bool lm3,
943 	       bool needs_inc)
944 {
945 	bool should_inc = needs_inc && new_gpr && !last;
946 	u32 idx, dst_byte;
947 	enum shf_sc sc;
948 	swreg reg;
949 	int shf;
950 	u8 mask;
951 
952 	if (WARN_ON_ONCE(src_byte + size > 4 || off % 4 + size > 4))
953 		return -EOPNOTSUPP;
954 
955 	idx = off / 4;
956 
957 	/* Move the entire word */
958 	if (size == 4) {
959 		wrp_mov(nfp_prog,
960 			should_inc ? reg_lm_inc(3) : reg_lm(lm3 ? 3 : 0, idx),
961 			reg_b(src));
962 		return 0;
963 	}
964 
965 	if (WARN_ON_ONCE(lm3 && idx > RE_REG_LM_IDX_MAX))
966 		return -EOPNOTSUPP;
967 
968 	dst_byte = off % 4;
969 
970 	mask = (1 << size) - 1;
971 	mask <<= dst_byte;
972 
973 	if (WARN_ON_ONCE(mask > 0xf))
974 		return -EOPNOTSUPP;
975 
976 	shf = abs(src_byte - dst_byte) * 8;
977 	if (src_byte == dst_byte) {
978 		sc = SHF_SC_NONE;
979 	} else if (src_byte < dst_byte) {
980 		shf = 32 - shf;
981 		sc = SHF_SC_L_SHF;
982 	} else {
983 		sc = SHF_SC_R_SHF;
984 	}
985 
986 	/* ld_field can address fewer indexes, if offset too large do RMW.
987 	 * Because we RMV twice we waste 2 cycles on unaligned 8 byte writes.
988 	 */
989 	if (idx <= RE_REG_LM_IDX_MAX) {
990 		reg = reg_lm(lm3 ? 3 : 0, idx);
991 	} else {
992 		reg = imm_a(nfp_prog);
993 		/* Only first and last LMEM locations are going to need RMW,
994 		 * the middle location will be overwritten fully.
995 		 */
996 		if (first || last)
997 			wrp_mov(nfp_prog, reg, reg_lm(0, idx));
998 	}
999 
1000 	emit_ld_field(nfp_prog, reg, mask, reg_b(src), sc, shf);
1001 
1002 	if (new_gpr || last) {
1003 		if (idx > RE_REG_LM_IDX_MAX)
1004 			wrp_mov(nfp_prog, reg_lm(0, idx), reg);
1005 		if (should_inc)
1006 			wrp_mov(nfp_prog, reg_none(), reg_lm_inc(3));
1007 	}
1008 
1009 	return 0;
1010 }
1011 
1012 static int
1013 mem_op_stack(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
1014 	     unsigned int size, unsigned int ptr_off, u8 gpr, u8 ptr_gpr,
1015 	     bool clr_gpr, lmem_step step)
1016 {
1017 	s32 off = nfp_prog->stack_depth + meta->insn.off + ptr_off;
1018 	bool first = true, last;
1019 	bool needs_inc = false;
1020 	swreg stack_off_reg;
1021 	u8 prev_gpr = 255;
1022 	u32 gpr_byte = 0;
1023 	bool lm3 = true;
1024 	int ret;
1025 
1026 	if (meta->ptr_not_const) {
1027 		/* Use of the last encountered ptr_off is OK, they all have
1028 		 * the same alignment.  Depend on low bits of value being
1029 		 * discarded when written to LMaddr register.
1030 		 */
1031 		stack_off_reg = ur_load_imm_any(nfp_prog, meta->insn.off,
1032 						stack_imm(nfp_prog));
1033 
1034 		emit_alu(nfp_prog, imm_b(nfp_prog),
1035 			 reg_a(ptr_gpr), ALU_OP_ADD, stack_off_reg);
1036 
1037 		needs_inc = true;
1038 	} else if (off + size <= 64) {
1039 		/* We can reach bottom 64B with LMaddr0 */
1040 		lm3 = false;
1041 	} else if (round_down(off, 32) == round_down(off + size - 1, 32)) {
1042 		/* We have to set up a new pointer.  If we know the offset
1043 		 * and the entire access falls into a single 32 byte aligned
1044 		 * window we won't have to increment the LM pointer.
1045 		 * The 32 byte alignment is imporant because offset is ORed in
1046 		 * not added when doing *l$indexN[off].
1047 		 */
1048 		stack_off_reg = ur_load_imm_any(nfp_prog, round_down(off, 32),
1049 						stack_imm(nfp_prog));
1050 		emit_alu(nfp_prog, imm_b(nfp_prog),
1051 			 stack_reg(nfp_prog), ALU_OP_ADD, stack_off_reg);
1052 
1053 		off %= 32;
1054 	} else {
1055 		stack_off_reg = ur_load_imm_any(nfp_prog, round_down(off, 4),
1056 						stack_imm(nfp_prog));
1057 
1058 		emit_alu(nfp_prog, imm_b(nfp_prog),
1059 			 stack_reg(nfp_prog), ALU_OP_ADD, stack_off_reg);
1060 
1061 		needs_inc = true;
1062 	}
1063 	if (lm3) {
1064 		emit_csr_wr(nfp_prog, imm_b(nfp_prog), NFP_CSR_ACT_LM_ADDR3);
1065 		/* For size < 4 one slot will be filled by zeroing of upper. */
1066 		wrp_nops(nfp_prog, clr_gpr && size < 8 ? 2 : 3);
1067 	}
1068 
1069 	if (clr_gpr && size < 8)
1070 		wrp_immed(nfp_prog, reg_both(gpr + 1), 0);
1071 
1072 	while (size) {
1073 		u32 slice_end;
1074 		u8 slice_size;
1075 
1076 		slice_size = min(size, 4 - gpr_byte);
1077 		slice_end = min(off + slice_size, round_up(off + 1, 4));
1078 		slice_size = slice_end - off;
1079 
1080 		last = slice_size == size;
1081 
1082 		if (needs_inc)
1083 			off %= 4;
1084 
1085 		ret = step(nfp_prog, gpr, gpr_byte, off, slice_size,
1086 			   first, gpr != prev_gpr, last, lm3, needs_inc);
1087 		if (ret)
1088 			return ret;
1089 
1090 		prev_gpr = gpr;
1091 		first = false;
1092 
1093 		gpr_byte += slice_size;
1094 		if (gpr_byte >= 4) {
1095 			gpr_byte -= 4;
1096 			gpr++;
1097 		}
1098 
1099 		size -= slice_size;
1100 		off += slice_size;
1101 	}
1102 
1103 	return 0;
1104 }
1105 
1106 static void
1107 wrp_alu_imm(struct nfp_prog *nfp_prog, u8 dst, enum alu_op alu_op, u32 imm)
1108 {
1109 	swreg tmp_reg;
1110 
1111 	if (alu_op == ALU_OP_AND) {
1112 		if (!imm)
1113 			wrp_immed(nfp_prog, reg_both(dst), 0);
1114 		if (!imm || !~imm)
1115 			return;
1116 	}
1117 	if (alu_op == ALU_OP_OR) {
1118 		if (!~imm)
1119 			wrp_immed(nfp_prog, reg_both(dst), ~0U);
1120 		if (!imm || !~imm)
1121 			return;
1122 	}
1123 	if (alu_op == ALU_OP_XOR) {
1124 		if (!~imm)
1125 			emit_alu(nfp_prog, reg_both(dst), reg_none(),
1126 				 ALU_OP_NOT, reg_b(dst));
1127 		if (!imm || !~imm)
1128 			return;
1129 	}
1130 
1131 	tmp_reg = ur_load_imm_any(nfp_prog, imm, imm_b(nfp_prog));
1132 	emit_alu(nfp_prog, reg_both(dst), reg_a(dst), alu_op, tmp_reg);
1133 }
1134 
1135 static int
1136 wrp_alu64_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
1137 	      enum alu_op alu_op, bool skip)
1138 {
1139 	const struct bpf_insn *insn = &meta->insn;
1140 	u64 imm = insn->imm; /* sign extend */
1141 
1142 	if (skip) {
1143 		meta->skip = true;
1144 		return 0;
1145 	}
1146 
1147 	wrp_alu_imm(nfp_prog, insn->dst_reg * 2, alu_op, imm & ~0U);
1148 	wrp_alu_imm(nfp_prog, insn->dst_reg * 2 + 1, alu_op, imm >> 32);
1149 
1150 	return 0;
1151 }
1152 
1153 static int
1154 wrp_alu64_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
1155 	      enum alu_op alu_op)
1156 {
1157 	u8 dst = meta->insn.dst_reg * 2, src = meta->insn.src_reg * 2;
1158 
1159 	emit_alu(nfp_prog, reg_both(dst), reg_a(dst), alu_op, reg_b(src));
1160 	emit_alu(nfp_prog, reg_both(dst + 1),
1161 		 reg_a(dst + 1), alu_op, reg_b(src + 1));
1162 
1163 	return 0;
1164 }
1165 
1166 static int
1167 wrp_alu32_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
1168 	      enum alu_op alu_op, bool skip)
1169 {
1170 	const struct bpf_insn *insn = &meta->insn;
1171 
1172 	if (skip) {
1173 		meta->skip = true;
1174 		return 0;
1175 	}
1176 
1177 	wrp_alu_imm(nfp_prog, insn->dst_reg * 2, alu_op, insn->imm);
1178 	wrp_immed(nfp_prog, reg_both(insn->dst_reg * 2 + 1), 0);
1179 
1180 	return 0;
1181 }
1182 
1183 static int
1184 wrp_alu32_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
1185 	      enum alu_op alu_op)
1186 {
1187 	u8 dst = meta->insn.dst_reg * 2, src = meta->insn.src_reg * 2;
1188 
1189 	emit_alu(nfp_prog, reg_both(dst), reg_a(dst), alu_op, reg_b(src));
1190 	wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1), 0);
1191 
1192 	return 0;
1193 }
1194 
1195 static void
1196 wrp_test_reg_one(struct nfp_prog *nfp_prog, u8 dst, enum alu_op alu_op, u8 src,
1197 		 enum br_mask br_mask, u16 off)
1198 {
1199 	emit_alu(nfp_prog, reg_none(), reg_a(dst), alu_op, reg_b(src));
1200 	emit_br(nfp_prog, br_mask, off, 0);
1201 }
1202 
1203 static int
1204 wrp_test_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
1205 	     enum alu_op alu_op, enum br_mask br_mask)
1206 {
1207 	const struct bpf_insn *insn = &meta->insn;
1208 
1209 	wrp_test_reg_one(nfp_prog, insn->dst_reg * 2, alu_op,
1210 			 insn->src_reg * 2, br_mask, insn->off);
1211 	wrp_test_reg_one(nfp_prog, insn->dst_reg * 2 + 1, alu_op,
1212 			 insn->src_reg * 2 + 1, br_mask, insn->off);
1213 
1214 	return 0;
1215 }
1216 
1217 static int
1218 wrp_cmp_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
1219 	    enum br_mask br_mask, bool swap)
1220 {
1221 	const struct bpf_insn *insn = &meta->insn;
1222 	u64 imm = insn->imm; /* sign extend */
1223 	u8 reg = insn->dst_reg * 2;
1224 	swreg tmp_reg;
1225 
1226 	tmp_reg = ur_load_imm_any(nfp_prog, imm & ~0U, imm_b(nfp_prog));
1227 	if (!swap)
1228 		emit_alu(nfp_prog, reg_none(), reg_a(reg), ALU_OP_SUB, tmp_reg);
1229 	else
1230 		emit_alu(nfp_prog, reg_none(), tmp_reg, ALU_OP_SUB, reg_a(reg));
1231 
1232 	tmp_reg = ur_load_imm_any(nfp_prog, imm >> 32, imm_b(nfp_prog));
1233 	if (!swap)
1234 		emit_alu(nfp_prog, reg_none(),
1235 			 reg_a(reg + 1), ALU_OP_SUB_C, tmp_reg);
1236 	else
1237 		emit_alu(nfp_prog, reg_none(),
1238 			 tmp_reg, ALU_OP_SUB_C, reg_a(reg + 1));
1239 
1240 	emit_br(nfp_prog, br_mask, insn->off, 0);
1241 
1242 	return 0;
1243 }
1244 
1245 static int
1246 wrp_cmp_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
1247 	    enum br_mask br_mask, bool swap)
1248 {
1249 	const struct bpf_insn *insn = &meta->insn;
1250 	u8 areg, breg;
1251 
1252 	areg = insn->dst_reg * 2;
1253 	breg = insn->src_reg * 2;
1254 
1255 	if (swap) {
1256 		areg ^= breg;
1257 		breg ^= areg;
1258 		areg ^= breg;
1259 	}
1260 
1261 	emit_alu(nfp_prog, reg_none(), reg_a(areg), ALU_OP_SUB, reg_b(breg));
1262 	emit_alu(nfp_prog, reg_none(),
1263 		 reg_a(areg + 1), ALU_OP_SUB_C, reg_b(breg + 1));
1264 	emit_br(nfp_prog, br_mask, insn->off, 0);
1265 
1266 	return 0;
1267 }
1268 
1269 static void wrp_end32(struct nfp_prog *nfp_prog, swreg reg_in, u8 gpr_out)
1270 {
1271 	emit_ld_field(nfp_prog, reg_both(gpr_out), 0xf, reg_in,
1272 		      SHF_SC_R_ROT, 8);
1273 	emit_ld_field(nfp_prog, reg_both(gpr_out), 0x5, reg_a(gpr_out),
1274 		      SHF_SC_R_ROT, 16);
1275 }
1276 
1277 static int adjust_head(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1278 {
1279 	swreg tmp = imm_a(nfp_prog), tmp_len = imm_b(nfp_prog);
1280 	struct nfp_bpf_cap_adjust_head *adjust_head;
1281 	u32 ret_einval, end;
1282 
1283 	adjust_head = &nfp_prog->bpf->adjust_head;
1284 
1285 	/* Optimized version - 5 vs 14 cycles */
1286 	if (nfp_prog->adjust_head_location != UINT_MAX) {
1287 		if (WARN_ON_ONCE(nfp_prog->adjust_head_location != meta->n))
1288 			return -EINVAL;
1289 
1290 		emit_alu(nfp_prog, pptr_reg(nfp_prog),
1291 			 reg_a(2 * 2), ALU_OP_ADD, pptr_reg(nfp_prog));
1292 		emit_alu(nfp_prog, plen_reg(nfp_prog),
1293 			 plen_reg(nfp_prog), ALU_OP_SUB, reg_a(2 * 2));
1294 		emit_alu(nfp_prog, pv_len(nfp_prog),
1295 			 pv_len(nfp_prog), ALU_OP_SUB, reg_a(2 * 2));
1296 
1297 		wrp_immed(nfp_prog, reg_both(0), 0);
1298 		wrp_immed(nfp_prog, reg_both(1), 0);
1299 
1300 		/* TODO: when adjust head is guaranteed to succeed we can
1301 		 * also eliminate the following if (r0 == 0) branch.
1302 		 */
1303 
1304 		return 0;
1305 	}
1306 
1307 	ret_einval = nfp_prog_current_offset(nfp_prog) + 14;
1308 	end = ret_einval + 2;
1309 
1310 	/* We need to use a temp because offset is just a part of the pkt ptr */
1311 	emit_alu(nfp_prog, tmp,
1312 		 reg_a(2 * 2), ALU_OP_ADD_2B, pptr_reg(nfp_prog));
1313 
1314 	/* Validate result will fit within FW datapath constraints */
1315 	emit_alu(nfp_prog, reg_none(),
1316 		 tmp, ALU_OP_SUB, reg_imm(adjust_head->off_min));
1317 	emit_br(nfp_prog, BR_BLO, ret_einval, 0);
1318 	emit_alu(nfp_prog, reg_none(),
1319 		 reg_imm(adjust_head->off_max), ALU_OP_SUB, tmp);
1320 	emit_br(nfp_prog, BR_BLO, ret_einval, 0);
1321 
1322 	/* Validate the length is at least ETH_HLEN */
1323 	emit_alu(nfp_prog, tmp_len,
1324 		 plen_reg(nfp_prog), ALU_OP_SUB, reg_a(2 * 2));
1325 	emit_alu(nfp_prog, reg_none(),
1326 		 tmp_len, ALU_OP_SUB, reg_imm(ETH_HLEN));
1327 	emit_br(nfp_prog, BR_BMI, ret_einval, 0);
1328 
1329 	/* Load the ret code */
1330 	wrp_immed(nfp_prog, reg_both(0), 0);
1331 	wrp_immed(nfp_prog, reg_both(1), 0);
1332 
1333 	/* Modify the packet metadata */
1334 	emit_ld_field(nfp_prog, pptr_reg(nfp_prog), 0x3, tmp, SHF_SC_NONE, 0);
1335 
1336 	/* Skip over the -EINVAL ret code (defer 2) */
1337 	emit_br(nfp_prog, BR_UNC, end, 2);
1338 
1339 	emit_alu(nfp_prog, plen_reg(nfp_prog),
1340 		 plen_reg(nfp_prog), ALU_OP_SUB, reg_a(2 * 2));
1341 	emit_alu(nfp_prog, pv_len(nfp_prog),
1342 		 pv_len(nfp_prog), ALU_OP_SUB, reg_a(2 * 2));
1343 
1344 	/* return -EINVAL target */
1345 	if (!nfp_prog_confirm_current_offset(nfp_prog, ret_einval))
1346 		return -EINVAL;
1347 
1348 	wrp_immed(nfp_prog, reg_both(0), -22);
1349 	wrp_immed(nfp_prog, reg_both(1), ~0);
1350 
1351 	if (!nfp_prog_confirm_current_offset(nfp_prog, end))
1352 		return -EINVAL;
1353 
1354 	return 0;
1355 }
1356 
1357 static int
1358 map_call_stack_common(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1359 {
1360 	struct bpf_offloaded_map *offmap;
1361 	struct nfp_bpf_map *nfp_map;
1362 	bool load_lm_ptr;
1363 	u32 ret_tgt;
1364 	s64 lm_off;
1365 	swreg tid;
1366 
1367 	offmap = (struct bpf_offloaded_map *)meta->arg1.map_ptr;
1368 	nfp_map = offmap->dev_priv;
1369 
1370 	/* We only have to reload LM0 if the key is not at start of stack */
1371 	lm_off = nfp_prog->stack_depth;
1372 	lm_off += meta->arg2.reg.var_off.value + meta->arg2.reg.off;
1373 	load_lm_ptr = meta->arg2.var_off || lm_off;
1374 
1375 	/* Set LM0 to start of key */
1376 	if (load_lm_ptr)
1377 		emit_csr_wr(nfp_prog, reg_b(2 * 2), NFP_CSR_ACT_LM_ADDR0);
1378 	if (meta->func_id == BPF_FUNC_map_update_elem)
1379 		emit_csr_wr(nfp_prog, reg_b(3 * 2), NFP_CSR_ACT_LM_ADDR2);
1380 
1381 	/* Load map ID into a register, it should actually fit as an immediate
1382 	 * but in case it doesn't deal with it here, not in the delay slots.
1383 	 */
1384 	tid = ur_load_imm_any(nfp_prog, nfp_map->tid, imm_a(nfp_prog));
1385 
1386 	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO + meta->func_id,
1387 		     2, RELO_BR_HELPER);
1388 	ret_tgt = nfp_prog_current_offset(nfp_prog) + 2;
1389 
1390 	/* Load map ID into A0 */
1391 	wrp_mov(nfp_prog, reg_a(0), tid);
1392 
1393 	/* Load the return address into B0 */
1394 	wrp_immed_relo(nfp_prog, reg_b(0), ret_tgt, RELO_IMMED_REL);
1395 
1396 	if (!nfp_prog_confirm_current_offset(nfp_prog, ret_tgt))
1397 		return -EINVAL;
1398 
1399 	/* Reset the LM0 pointer */
1400 	if (!load_lm_ptr)
1401 		return 0;
1402 
1403 	emit_csr_wr(nfp_prog, stack_reg(nfp_prog),  NFP_CSR_ACT_LM_ADDR0);
1404 	wrp_nops(nfp_prog, 3);
1405 
1406 	return 0;
1407 }
1408 
1409 static int
1410 nfp_get_prandom_u32(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1411 {
1412 	__emit_csr_rd(nfp_prog, NFP_CSR_PSEUDO_RND_NUM);
1413 	/* CSR value is read in following immed[gpr, 0] */
1414 	emit_immed(nfp_prog, reg_both(0), 0,
1415 		   IMMED_WIDTH_ALL, false, IMMED_SHIFT_0B);
1416 	emit_immed(nfp_prog, reg_both(1), 0,
1417 		   IMMED_WIDTH_ALL, false, IMMED_SHIFT_0B);
1418 	return 0;
1419 }
1420 
1421 /* --- Callbacks --- */
1422 static int mov_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1423 {
1424 	const struct bpf_insn *insn = &meta->insn;
1425 	u8 dst = insn->dst_reg * 2;
1426 	u8 src = insn->src_reg * 2;
1427 
1428 	if (insn->src_reg == BPF_REG_10) {
1429 		swreg stack_depth_reg;
1430 
1431 		stack_depth_reg = ur_load_imm_any(nfp_prog,
1432 						  nfp_prog->stack_depth,
1433 						  stack_imm(nfp_prog));
1434 		emit_alu(nfp_prog, reg_both(dst),
1435 			 stack_reg(nfp_prog), ALU_OP_ADD, stack_depth_reg);
1436 		wrp_immed(nfp_prog, reg_both(dst + 1), 0);
1437 	} else {
1438 		wrp_reg_mov(nfp_prog, dst, src);
1439 		wrp_reg_mov(nfp_prog, dst + 1, src + 1);
1440 	}
1441 
1442 	return 0;
1443 }
1444 
1445 static int mov_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1446 {
1447 	u64 imm = meta->insn.imm; /* sign extend */
1448 
1449 	wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2), imm & ~0U);
1450 	wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1), imm >> 32);
1451 
1452 	return 0;
1453 }
1454 
1455 static int xor_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1456 {
1457 	return wrp_alu64_reg(nfp_prog, meta, ALU_OP_XOR);
1458 }
1459 
1460 static int xor_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1461 {
1462 	return wrp_alu64_imm(nfp_prog, meta, ALU_OP_XOR, !meta->insn.imm);
1463 }
1464 
1465 static int and_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1466 {
1467 	return wrp_alu64_reg(nfp_prog, meta, ALU_OP_AND);
1468 }
1469 
1470 static int and_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1471 {
1472 	return wrp_alu64_imm(nfp_prog, meta, ALU_OP_AND, !~meta->insn.imm);
1473 }
1474 
1475 static int or_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1476 {
1477 	return wrp_alu64_reg(nfp_prog, meta, ALU_OP_OR);
1478 }
1479 
1480 static int or_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1481 {
1482 	return wrp_alu64_imm(nfp_prog, meta, ALU_OP_OR, !meta->insn.imm);
1483 }
1484 
1485 static int add_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1486 {
1487 	const struct bpf_insn *insn = &meta->insn;
1488 
1489 	emit_alu(nfp_prog, reg_both(insn->dst_reg * 2),
1490 		 reg_a(insn->dst_reg * 2), ALU_OP_ADD,
1491 		 reg_b(insn->src_reg * 2));
1492 	emit_alu(nfp_prog, reg_both(insn->dst_reg * 2 + 1),
1493 		 reg_a(insn->dst_reg * 2 + 1), ALU_OP_ADD_C,
1494 		 reg_b(insn->src_reg * 2 + 1));
1495 
1496 	return 0;
1497 }
1498 
1499 static int add_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1500 {
1501 	const struct bpf_insn *insn = &meta->insn;
1502 	u64 imm = insn->imm; /* sign extend */
1503 
1504 	wrp_alu_imm(nfp_prog, insn->dst_reg * 2, ALU_OP_ADD, imm & ~0U);
1505 	wrp_alu_imm(nfp_prog, insn->dst_reg * 2 + 1, ALU_OP_ADD_C, imm >> 32);
1506 
1507 	return 0;
1508 }
1509 
1510 static int sub_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1511 {
1512 	const struct bpf_insn *insn = &meta->insn;
1513 
1514 	emit_alu(nfp_prog, reg_both(insn->dst_reg * 2),
1515 		 reg_a(insn->dst_reg * 2), ALU_OP_SUB,
1516 		 reg_b(insn->src_reg * 2));
1517 	emit_alu(nfp_prog, reg_both(insn->dst_reg * 2 + 1),
1518 		 reg_a(insn->dst_reg * 2 + 1), ALU_OP_SUB_C,
1519 		 reg_b(insn->src_reg * 2 + 1));
1520 
1521 	return 0;
1522 }
1523 
1524 static int sub_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1525 {
1526 	const struct bpf_insn *insn = &meta->insn;
1527 	u64 imm = insn->imm; /* sign extend */
1528 
1529 	wrp_alu_imm(nfp_prog, insn->dst_reg * 2, ALU_OP_SUB, imm & ~0U);
1530 	wrp_alu_imm(nfp_prog, insn->dst_reg * 2 + 1, ALU_OP_SUB_C, imm >> 32);
1531 
1532 	return 0;
1533 }
1534 
1535 static int neg_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1536 {
1537 	const struct bpf_insn *insn = &meta->insn;
1538 
1539 	emit_alu(nfp_prog, reg_both(insn->dst_reg * 2), reg_imm(0),
1540 		 ALU_OP_SUB, reg_b(insn->dst_reg * 2));
1541 	emit_alu(nfp_prog, reg_both(insn->dst_reg * 2 + 1), reg_imm(0),
1542 		 ALU_OP_SUB_C, reg_b(insn->dst_reg * 2 + 1));
1543 
1544 	return 0;
1545 }
1546 
1547 static int shl_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1548 {
1549 	const struct bpf_insn *insn = &meta->insn;
1550 	u8 dst = insn->dst_reg * 2;
1551 
1552 	if (insn->imm < 32) {
1553 		emit_shf(nfp_prog, reg_both(dst + 1),
1554 			 reg_a(dst + 1), SHF_OP_NONE, reg_b(dst),
1555 			 SHF_SC_R_DSHF, 32 - insn->imm);
1556 		emit_shf(nfp_prog, reg_both(dst),
1557 			 reg_none(), SHF_OP_NONE, reg_b(dst),
1558 			 SHF_SC_L_SHF, insn->imm);
1559 	} else if (insn->imm == 32) {
1560 		wrp_reg_mov(nfp_prog, dst + 1, dst);
1561 		wrp_immed(nfp_prog, reg_both(dst), 0);
1562 	} else if (insn->imm > 32) {
1563 		emit_shf(nfp_prog, reg_both(dst + 1),
1564 			 reg_none(), SHF_OP_NONE, reg_b(dst),
1565 			 SHF_SC_L_SHF, insn->imm - 32);
1566 		wrp_immed(nfp_prog, reg_both(dst), 0);
1567 	}
1568 
1569 	return 0;
1570 }
1571 
1572 static int shr_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1573 {
1574 	const struct bpf_insn *insn = &meta->insn;
1575 	u8 dst = insn->dst_reg * 2;
1576 
1577 	if (insn->imm < 32) {
1578 		emit_shf(nfp_prog, reg_both(dst),
1579 			 reg_a(dst + 1), SHF_OP_NONE, reg_b(dst),
1580 			 SHF_SC_R_DSHF, insn->imm);
1581 		emit_shf(nfp_prog, reg_both(dst + 1),
1582 			 reg_none(), SHF_OP_NONE, reg_b(dst + 1),
1583 			 SHF_SC_R_SHF, insn->imm);
1584 	} else if (insn->imm == 32) {
1585 		wrp_reg_mov(nfp_prog, dst, dst + 1);
1586 		wrp_immed(nfp_prog, reg_both(dst + 1), 0);
1587 	} else if (insn->imm > 32) {
1588 		emit_shf(nfp_prog, reg_both(dst),
1589 			 reg_none(), SHF_OP_NONE, reg_b(dst + 1),
1590 			 SHF_SC_R_SHF, insn->imm - 32);
1591 		wrp_immed(nfp_prog, reg_both(dst + 1), 0);
1592 	}
1593 
1594 	return 0;
1595 }
1596 
1597 static int mov_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1598 {
1599 	const struct bpf_insn *insn = &meta->insn;
1600 
1601 	wrp_reg_mov(nfp_prog, insn->dst_reg * 2,  insn->src_reg * 2);
1602 	wrp_immed(nfp_prog, reg_both(insn->dst_reg * 2 + 1), 0);
1603 
1604 	return 0;
1605 }
1606 
1607 static int mov_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1608 {
1609 	const struct bpf_insn *insn = &meta->insn;
1610 
1611 	wrp_immed(nfp_prog, reg_both(insn->dst_reg * 2), insn->imm);
1612 	wrp_immed(nfp_prog, reg_both(insn->dst_reg * 2 + 1), 0);
1613 
1614 	return 0;
1615 }
1616 
1617 static int xor_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1618 {
1619 	return wrp_alu32_reg(nfp_prog, meta, ALU_OP_XOR);
1620 }
1621 
1622 static int xor_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1623 {
1624 	return wrp_alu32_imm(nfp_prog, meta, ALU_OP_XOR, !~meta->insn.imm);
1625 }
1626 
1627 static int and_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1628 {
1629 	return wrp_alu32_reg(nfp_prog, meta, ALU_OP_AND);
1630 }
1631 
1632 static int and_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1633 {
1634 	return wrp_alu32_imm(nfp_prog, meta, ALU_OP_AND, !~meta->insn.imm);
1635 }
1636 
1637 static int or_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1638 {
1639 	return wrp_alu32_reg(nfp_prog, meta, ALU_OP_OR);
1640 }
1641 
1642 static int or_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1643 {
1644 	return wrp_alu32_imm(nfp_prog, meta, ALU_OP_OR, !meta->insn.imm);
1645 }
1646 
1647 static int add_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1648 {
1649 	return wrp_alu32_reg(nfp_prog, meta, ALU_OP_ADD);
1650 }
1651 
1652 static int add_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1653 {
1654 	return wrp_alu32_imm(nfp_prog, meta, ALU_OP_ADD, !meta->insn.imm);
1655 }
1656 
1657 static int sub_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1658 {
1659 	return wrp_alu32_reg(nfp_prog, meta, ALU_OP_SUB);
1660 }
1661 
1662 static int sub_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1663 {
1664 	return wrp_alu32_imm(nfp_prog, meta, ALU_OP_SUB, !meta->insn.imm);
1665 }
1666 
1667 static int neg_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1668 {
1669 	u8 dst = meta->insn.dst_reg * 2;
1670 
1671 	emit_alu(nfp_prog, reg_both(dst), reg_imm(0), ALU_OP_SUB, reg_b(dst));
1672 	wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1), 0);
1673 
1674 	return 0;
1675 }
1676 
1677 static int shl_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1678 {
1679 	const struct bpf_insn *insn = &meta->insn;
1680 
1681 	if (!insn->imm)
1682 		return 1; /* TODO: zero shift means indirect */
1683 
1684 	emit_shf(nfp_prog, reg_both(insn->dst_reg * 2),
1685 		 reg_none(), SHF_OP_NONE, reg_b(insn->dst_reg * 2),
1686 		 SHF_SC_L_SHF, insn->imm);
1687 	wrp_immed(nfp_prog, reg_both(insn->dst_reg * 2 + 1), 0);
1688 
1689 	return 0;
1690 }
1691 
1692 static int end_reg32(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1693 {
1694 	const struct bpf_insn *insn = &meta->insn;
1695 	u8 gpr = insn->dst_reg * 2;
1696 
1697 	switch (insn->imm) {
1698 	case 16:
1699 		emit_ld_field(nfp_prog, reg_both(gpr), 0x9, reg_b(gpr),
1700 			      SHF_SC_R_ROT, 8);
1701 		emit_ld_field(nfp_prog, reg_both(gpr), 0xe, reg_a(gpr),
1702 			      SHF_SC_R_SHF, 16);
1703 
1704 		wrp_immed(nfp_prog, reg_both(gpr + 1), 0);
1705 		break;
1706 	case 32:
1707 		wrp_end32(nfp_prog, reg_a(gpr), gpr);
1708 		wrp_immed(nfp_prog, reg_both(gpr + 1), 0);
1709 		break;
1710 	case 64:
1711 		wrp_mov(nfp_prog, imm_a(nfp_prog), reg_b(gpr + 1));
1712 
1713 		wrp_end32(nfp_prog, reg_a(gpr), gpr + 1);
1714 		wrp_end32(nfp_prog, imm_a(nfp_prog), gpr);
1715 		break;
1716 	}
1717 
1718 	return 0;
1719 }
1720 
1721 static int imm_ld8_part2(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1722 {
1723 	struct nfp_insn_meta *prev = nfp_meta_prev(meta);
1724 	u32 imm_lo, imm_hi;
1725 	u8 dst;
1726 
1727 	dst = prev->insn.dst_reg * 2;
1728 	imm_lo = prev->insn.imm;
1729 	imm_hi = meta->insn.imm;
1730 
1731 	wrp_immed(nfp_prog, reg_both(dst), imm_lo);
1732 
1733 	/* mov is always 1 insn, load imm may be two, so try to use mov */
1734 	if (imm_hi == imm_lo)
1735 		wrp_mov(nfp_prog, reg_both(dst + 1), reg_a(dst));
1736 	else
1737 		wrp_immed(nfp_prog, reg_both(dst + 1), imm_hi);
1738 
1739 	return 0;
1740 }
1741 
1742 static int imm_ld8(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1743 {
1744 	meta->double_cb = imm_ld8_part2;
1745 	return 0;
1746 }
1747 
1748 static int data_ld1(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1749 {
1750 	return construct_data_ld(nfp_prog, meta->insn.imm, 1);
1751 }
1752 
1753 static int data_ld2(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1754 {
1755 	return construct_data_ld(nfp_prog, meta->insn.imm, 2);
1756 }
1757 
1758 static int data_ld4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1759 {
1760 	return construct_data_ld(nfp_prog, meta->insn.imm, 4);
1761 }
1762 
1763 static int data_ind_ld1(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1764 {
1765 	return construct_data_ind_ld(nfp_prog, meta->insn.imm,
1766 				     meta->insn.src_reg * 2, 1);
1767 }
1768 
1769 static int data_ind_ld2(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1770 {
1771 	return construct_data_ind_ld(nfp_prog, meta->insn.imm,
1772 				     meta->insn.src_reg * 2, 2);
1773 }
1774 
1775 static int data_ind_ld4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1776 {
1777 	return construct_data_ind_ld(nfp_prog, meta->insn.imm,
1778 				     meta->insn.src_reg * 2, 4);
1779 }
1780 
1781 static int
1782 mem_ldx_stack(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
1783 	      unsigned int size, unsigned int ptr_off)
1784 {
1785 	return mem_op_stack(nfp_prog, meta, size, ptr_off,
1786 			    meta->insn.dst_reg * 2, meta->insn.src_reg * 2,
1787 			    true, wrp_lmem_load);
1788 }
1789 
1790 static int mem_ldx_skb(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
1791 		       u8 size)
1792 {
1793 	swreg dst = reg_both(meta->insn.dst_reg * 2);
1794 
1795 	switch (meta->insn.off) {
1796 	case offsetof(struct __sk_buff, len):
1797 		if (size != FIELD_SIZEOF(struct __sk_buff, len))
1798 			return -EOPNOTSUPP;
1799 		wrp_mov(nfp_prog, dst, plen_reg(nfp_prog));
1800 		break;
1801 	case offsetof(struct __sk_buff, data):
1802 		if (size != FIELD_SIZEOF(struct __sk_buff, data))
1803 			return -EOPNOTSUPP;
1804 		wrp_mov(nfp_prog, dst, pptr_reg(nfp_prog));
1805 		break;
1806 	case offsetof(struct __sk_buff, data_end):
1807 		if (size != FIELD_SIZEOF(struct __sk_buff, data_end))
1808 			return -EOPNOTSUPP;
1809 		emit_alu(nfp_prog, dst,
1810 			 plen_reg(nfp_prog), ALU_OP_ADD, pptr_reg(nfp_prog));
1811 		break;
1812 	default:
1813 		return -EOPNOTSUPP;
1814 	}
1815 
1816 	wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1), 0);
1817 
1818 	return 0;
1819 }
1820 
1821 static int mem_ldx_xdp(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
1822 		       u8 size)
1823 {
1824 	swreg dst = reg_both(meta->insn.dst_reg * 2);
1825 
1826 	switch (meta->insn.off) {
1827 	case offsetof(struct xdp_md, data):
1828 		if (size != FIELD_SIZEOF(struct xdp_md, data))
1829 			return -EOPNOTSUPP;
1830 		wrp_mov(nfp_prog, dst, pptr_reg(nfp_prog));
1831 		break;
1832 	case offsetof(struct xdp_md, data_end):
1833 		if (size != FIELD_SIZEOF(struct xdp_md, data_end))
1834 			return -EOPNOTSUPP;
1835 		emit_alu(nfp_prog, dst,
1836 			 plen_reg(nfp_prog), ALU_OP_ADD, pptr_reg(nfp_prog));
1837 		break;
1838 	default:
1839 		return -EOPNOTSUPP;
1840 	}
1841 
1842 	wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1), 0);
1843 
1844 	return 0;
1845 }
1846 
1847 static int
1848 mem_ldx_data(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
1849 	     unsigned int size)
1850 {
1851 	swreg tmp_reg;
1852 
1853 	tmp_reg = re_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog));
1854 
1855 	return data_ld_host_order_addr32(nfp_prog, meta->insn.src_reg * 2,
1856 					 tmp_reg, meta->insn.dst_reg * 2, size);
1857 }
1858 
1859 static int
1860 mem_ldx_emem(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
1861 	     unsigned int size)
1862 {
1863 	swreg tmp_reg;
1864 
1865 	tmp_reg = re_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog));
1866 
1867 	return data_ld_host_order_addr40(nfp_prog, meta->insn.src_reg * 2,
1868 					 tmp_reg, meta->insn.dst_reg * 2, size);
1869 }
1870 
1871 static void
1872 mem_ldx_data_init_pktcache(struct nfp_prog *nfp_prog,
1873 			   struct nfp_insn_meta *meta)
1874 {
1875 	s16 range_start = meta->pkt_cache.range_start;
1876 	s16 range_end = meta->pkt_cache.range_end;
1877 	swreg src_base, off;
1878 	u8 xfer_num, len;
1879 	bool indir;
1880 
1881 	off = re_load_imm_any(nfp_prog, range_start, imm_b(nfp_prog));
1882 	src_base = reg_a(meta->insn.src_reg * 2);
1883 	len = range_end - range_start;
1884 	xfer_num = round_up(len, REG_WIDTH) / REG_WIDTH;
1885 
1886 	indir = len > 8 * REG_WIDTH;
1887 	/* Setup PREV_ALU for indirect mode. */
1888 	if (indir)
1889 		wrp_immed(nfp_prog, reg_none(),
1890 			  CMD_OVE_LEN | FIELD_PREP(CMD_OV_LEN, xfer_num - 1));
1891 
1892 	/* Cache memory into transfer-in registers. */
1893 	emit_cmd_any(nfp_prog, CMD_TGT_READ32_SWAP, CMD_MODE_32b, 0, src_base,
1894 		     off, xfer_num - 1, CMD_CTX_SWAP, indir);
1895 }
1896 
1897 static int
1898 mem_ldx_data_from_pktcache_unaligned(struct nfp_prog *nfp_prog,
1899 				     struct nfp_insn_meta *meta,
1900 				     unsigned int size)
1901 {
1902 	s16 range_start = meta->pkt_cache.range_start;
1903 	s16 insn_off = meta->insn.off - range_start;
1904 	swreg dst_lo, dst_hi, src_lo, src_mid;
1905 	u8 dst_gpr = meta->insn.dst_reg * 2;
1906 	u8 len_lo = size, len_mid = 0;
1907 	u8 idx = insn_off / REG_WIDTH;
1908 	u8 off = insn_off % REG_WIDTH;
1909 
1910 	dst_hi = reg_both(dst_gpr + 1);
1911 	dst_lo = reg_both(dst_gpr);
1912 	src_lo = reg_xfer(idx);
1913 
1914 	/* The read length could involve as many as three registers. */
1915 	if (size > REG_WIDTH - off) {
1916 		/* Calculate the part in the second register. */
1917 		len_lo = REG_WIDTH - off;
1918 		len_mid = size - len_lo;
1919 
1920 		/* Calculate the part in the third register. */
1921 		if (size > 2 * REG_WIDTH - off)
1922 			len_mid = REG_WIDTH;
1923 	}
1924 
1925 	wrp_reg_subpart(nfp_prog, dst_lo, src_lo, len_lo, off);
1926 
1927 	if (!len_mid) {
1928 		wrp_immed(nfp_prog, dst_hi, 0);
1929 		return 0;
1930 	}
1931 
1932 	src_mid = reg_xfer(idx + 1);
1933 
1934 	if (size <= REG_WIDTH) {
1935 		wrp_reg_or_subpart(nfp_prog, dst_lo, src_mid, len_mid, len_lo);
1936 		wrp_immed(nfp_prog, dst_hi, 0);
1937 	} else {
1938 		swreg src_hi = reg_xfer(idx + 2);
1939 
1940 		wrp_reg_or_subpart(nfp_prog, dst_lo, src_mid,
1941 				   REG_WIDTH - len_lo, len_lo);
1942 		wrp_reg_subpart(nfp_prog, dst_hi, src_mid, len_lo,
1943 				REG_WIDTH - len_lo);
1944 		wrp_reg_or_subpart(nfp_prog, dst_hi, src_hi, REG_WIDTH - len_lo,
1945 				   len_lo);
1946 	}
1947 
1948 	return 0;
1949 }
1950 
1951 static int
1952 mem_ldx_data_from_pktcache_aligned(struct nfp_prog *nfp_prog,
1953 				   struct nfp_insn_meta *meta,
1954 				   unsigned int size)
1955 {
1956 	swreg dst_lo, dst_hi, src_lo;
1957 	u8 dst_gpr, idx;
1958 
1959 	idx = (meta->insn.off - meta->pkt_cache.range_start) / REG_WIDTH;
1960 	dst_gpr = meta->insn.dst_reg * 2;
1961 	dst_hi = reg_both(dst_gpr + 1);
1962 	dst_lo = reg_both(dst_gpr);
1963 	src_lo = reg_xfer(idx);
1964 
1965 	if (size < REG_WIDTH) {
1966 		wrp_reg_subpart(nfp_prog, dst_lo, src_lo, size, 0);
1967 		wrp_immed(nfp_prog, dst_hi, 0);
1968 	} else if (size == REG_WIDTH) {
1969 		wrp_mov(nfp_prog, dst_lo, src_lo);
1970 		wrp_immed(nfp_prog, dst_hi, 0);
1971 	} else {
1972 		swreg src_hi = reg_xfer(idx + 1);
1973 
1974 		wrp_mov(nfp_prog, dst_lo, src_lo);
1975 		wrp_mov(nfp_prog, dst_hi, src_hi);
1976 	}
1977 
1978 	return 0;
1979 }
1980 
1981 static int
1982 mem_ldx_data_from_pktcache(struct nfp_prog *nfp_prog,
1983 			   struct nfp_insn_meta *meta, unsigned int size)
1984 {
1985 	u8 off = meta->insn.off - meta->pkt_cache.range_start;
1986 
1987 	if (IS_ALIGNED(off, REG_WIDTH))
1988 		return mem_ldx_data_from_pktcache_aligned(nfp_prog, meta, size);
1989 
1990 	return mem_ldx_data_from_pktcache_unaligned(nfp_prog, meta, size);
1991 }
1992 
1993 static int
1994 mem_ldx(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
1995 	unsigned int size)
1996 {
1997 	if (meta->ldst_gather_len)
1998 		return nfp_cpp_memcpy(nfp_prog, meta);
1999 
2000 	if (meta->ptr.type == PTR_TO_CTX) {
2001 		if (nfp_prog->type == BPF_PROG_TYPE_XDP)
2002 			return mem_ldx_xdp(nfp_prog, meta, size);
2003 		else
2004 			return mem_ldx_skb(nfp_prog, meta, size);
2005 	}
2006 
2007 	if (meta->ptr.type == PTR_TO_PACKET) {
2008 		if (meta->pkt_cache.range_end) {
2009 			if (meta->pkt_cache.do_init)
2010 				mem_ldx_data_init_pktcache(nfp_prog, meta);
2011 
2012 			return mem_ldx_data_from_pktcache(nfp_prog, meta, size);
2013 		} else {
2014 			return mem_ldx_data(nfp_prog, meta, size);
2015 		}
2016 	}
2017 
2018 	if (meta->ptr.type == PTR_TO_STACK)
2019 		return mem_ldx_stack(nfp_prog, meta, size,
2020 				     meta->ptr.off + meta->ptr.var_off.value);
2021 
2022 	if (meta->ptr.type == PTR_TO_MAP_VALUE)
2023 		return mem_ldx_emem(nfp_prog, meta, size);
2024 
2025 	return -EOPNOTSUPP;
2026 }
2027 
2028 static int mem_ldx1(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2029 {
2030 	return mem_ldx(nfp_prog, meta, 1);
2031 }
2032 
2033 static int mem_ldx2(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2034 {
2035 	return mem_ldx(nfp_prog, meta, 2);
2036 }
2037 
2038 static int mem_ldx4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2039 {
2040 	return mem_ldx(nfp_prog, meta, 4);
2041 }
2042 
2043 static int mem_ldx8(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2044 {
2045 	return mem_ldx(nfp_prog, meta, 8);
2046 }
2047 
2048 static int
2049 mem_st_data(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2050 	    unsigned int size)
2051 {
2052 	u64 imm = meta->insn.imm; /* sign extend */
2053 	swreg off_reg;
2054 
2055 	off_reg = re_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog));
2056 
2057 	return data_st_host_order(nfp_prog, meta->insn.dst_reg * 2, off_reg,
2058 				  imm, size);
2059 }
2060 
2061 static int mem_st(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2062 		  unsigned int size)
2063 {
2064 	if (meta->ptr.type == PTR_TO_PACKET)
2065 		return mem_st_data(nfp_prog, meta, size);
2066 
2067 	return -EOPNOTSUPP;
2068 }
2069 
2070 static int mem_st1(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2071 {
2072 	return mem_st(nfp_prog, meta, 1);
2073 }
2074 
2075 static int mem_st2(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2076 {
2077 	return mem_st(nfp_prog, meta, 2);
2078 }
2079 
2080 static int mem_st4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2081 {
2082 	return mem_st(nfp_prog, meta, 4);
2083 }
2084 
2085 static int mem_st8(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2086 {
2087 	return mem_st(nfp_prog, meta, 8);
2088 }
2089 
2090 static int
2091 mem_stx_data(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2092 	     unsigned int size)
2093 {
2094 	swreg off_reg;
2095 
2096 	off_reg = re_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog));
2097 
2098 	return data_stx_host_order(nfp_prog, meta->insn.dst_reg * 2, off_reg,
2099 				   meta->insn.src_reg * 2, size);
2100 }
2101 
2102 static int
2103 mem_stx_stack(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2104 	      unsigned int size, unsigned int ptr_off)
2105 {
2106 	return mem_op_stack(nfp_prog, meta, size, ptr_off,
2107 			    meta->insn.src_reg * 2, meta->insn.dst_reg * 2,
2108 			    false, wrp_lmem_store);
2109 }
2110 
2111 static int
2112 mem_stx(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2113 	unsigned int size)
2114 {
2115 	if (meta->ptr.type == PTR_TO_PACKET)
2116 		return mem_stx_data(nfp_prog, meta, size);
2117 
2118 	if (meta->ptr.type == PTR_TO_STACK)
2119 		return mem_stx_stack(nfp_prog, meta, size,
2120 				     meta->ptr.off + meta->ptr.var_off.value);
2121 
2122 	return -EOPNOTSUPP;
2123 }
2124 
2125 static int mem_stx1(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2126 {
2127 	return mem_stx(nfp_prog, meta, 1);
2128 }
2129 
2130 static int mem_stx2(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2131 {
2132 	return mem_stx(nfp_prog, meta, 2);
2133 }
2134 
2135 static int mem_stx4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2136 {
2137 	return mem_stx(nfp_prog, meta, 4);
2138 }
2139 
2140 static int mem_stx8(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2141 {
2142 	return mem_stx(nfp_prog, meta, 8);
2143 }
2144 
2145 static int
2146 mem_xadd(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, bool is64)
2147 {
2148 	u8 dst_gpr = meta->insn.dst_reg * 2;
2149 	u8 src_gpr = meta->insn.src_reg * 2;
2150 	unsigned int full_add, out;
2151 	swreg addra, addrb, off;
2152 
2153 	off = ur_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog));
2154 
2155 	/* We can fit 16 bits into command immediate, if we know the immediate
2156 	 * is guaranteed to either always or never fit into 16 bit we only
2157 	 * generate code to handle that particular case, otherwise generate
2158 	 * code for both.
2159 	 */
2160 	out = nfp_prog_current_offset(nfp_prog);
2161 	full_add = nfp_prog_current_offset(nfp_prog);
2162 
2163 	if (meta->insn.off) {
2164 		out += 2;
2165 		full_add += 2;
2166 	}
2167 	if (meta->xadd_maybe_16bit) {
2168 		out += 3;
2169 		full_add += 3;
2170 	}
2171 	if (meta->xadd_over_16bit)
2172 		out += 2 + is64;
2173 	if (meta->xadd_maybe_16bit && meta->xadd_over_16bit) {
2174 		out += 5;
2175 		full_add += 5;
2176 	}
2177 
2178 	/* Generate the branch for choosing add_imm vs add */
2179 	if (meta->xadd_maybe_16bit && meta->xadd_over_16bit) {
2180 		swreg max_imm = imm_a(nfp_prog);
2181 
2182 		wrp_immed(nfp_prog, max_imm, 0xffff);
2183 		emit_alu(nfp_prog, reg_none(),
2184 			 max_imm, ALU_OP_SUB, reg_b(src_gpr));
2185 		emit_alu(nfp_prog, reg_none(),
2186 			 reg_imm(0), ALU_OP_SUB_C, reg_b(src_gpr + 1));
2187 		emit_br(nfp_prog, BR_BLO, full_add, meta->insn.off ? 2 : 0);
2188 		/* defer for add */
2189 	}
2190 
2191 	/* If insn has an offset add to the address */
2192 	if (!meta->insn.off) {
2193 		addra = reg_a(dst_gpr);
2194 		addrb = reg_b(dst_gpr + 1);
2195 	} else {
2196 		emit_alu(nfp_prog, imma_a(nfp_prog),
2197 			 reg_a(dst_gpr), ALU_OP_ADD, off);
2198 		emit_alu(nfp_prog, imma_b(nfp_prog),
2199 			 reg_a(dst_gpr + 1), ALU_OP_ADD_C, reg_imm(0));
2200 		addra = imma_a(nfp_prog);
2201 		addrb = imma_b(nfp_prog);
2202 	}
2203 
2204 	/* Generate the add_imm if 16 bits are possible */
2205 	if (meta->xadd_maybe_16bit) {
2206 		swreg prev_alu = imm_a(nfp_prog);
2207 
2208 		wrp_immed(nfp_prog, prev_alu,
2209 			  FIELD_PREP(CMD_OVE_DATA, 2) |
2210 			  CMD_OVE_LEN |
2211 			  FIELD_PREP(CMD_OV_LEN, 0x8 | is64 << 2));
2212 		wrp_reg_or_subpart(nfp_prog, prev_alu, reg_b(src_gpr), 2, 2);
2213 		emit_cmd_indir(nfp_prog, CMD_TGT_ADD_IMM, CMD_MODE_40b_BA, 0,
2214 			       addra, addrb, 0, CMD_CTX_NO_SWAP);
2215 
2216 		if (meta->xadd_over_16bit)
2217 			emit_br(nfp_prog, BR_UNC, out, 0);
2218 	}
2219 
2220 	if (!nfp_prog_confirm_current_offset(nfp_prog, full_add))
2221 		return -EINVAL;
2222 
2223 	/* Generate the add if 16 bits are not guaranteed */
2224 	if (meta->xadd_over_16bit) {
2225 		emit_cmd(nfp_prog, CMD_TGT_ADD, CMD_MODE_40b_BA, 0,
2226 			 addra, addrb, is64 << 2,
2227 			 is64 ? CMD_CTX_SWAP_DEFER2 : CMD_CTX_SWAP_DEFER1);
2228 
2229 		wrp_mov(nfp_prog, reg_xfer(0), reg_a(src_gpr));
2230 		if (is64)
2231 			wrp_mov(nfp_prog, reg_xfer(1), reg_a(src_gpr + 1));
2232 	}
2233 
2234 	if (!nfp_prog_confirm_current_offset(nfp_prog, out))
2235 		return -EINVAL;
2236 
2237 	return 0;
2238 }
2239 
2240 static int mem_xadd4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2241 {
2242 	return mem_xadd(nfp_prog, meta, false);
2243 }
2244 
2245 static int mem_xadd8(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2246 {
2247 	return mem_xadd(nfp_prog, meta, true);
2248 }
2249 
2250 static int jump(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2251 {
2252 	emit_br(nfp_prog, BR_UNC, meta->insn.off, 0);
2253 
2254 	return 0;
2255 }
2256 
2257 static int jeq_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2258 {
2259 	const struct bpf_insn *insn = &meta->insn;
2260 	u64 imm = insn->imm; /* sign extend */
2261 	swreg or1, or2, tmp_reg;
2262 
2263 	or1 = reg_a(insn->dst_reg * 2);
2264 	or2 = reg_b(insn->dst_reg * 2 + 1);
2265 
2266 	if (imm & ~0U) {
2267 		tmp_reg = ur_load_imm_any(nfp_prog, imm & ~0U, imm_b(nfp_prog));
2268 		emit_alu(nfp_prog, imm_a(nfp_prog),
2269 			 reg_a(insn->dst_reg * 2), ALU_OP_XOR, tmp_reg);
2270 		or1 = imm_a(nfp_prog);
2271 	}
2272 
2273 	if (imm >> 32) {
2274 		tmp_reg = ur_load_imm_any(nfp_prog, imm >> 32, imm_b(nfp_prog));
2275 		emit_alu(nfp_prog, imm_b(nfp_prog),
2276 			 reg_a(insn->dst_reg * 2 + 1), ALU_OP_XOR, tmp_reg);
2277 		or2 = imm_b(nfp_prog);
2278 	}
2279 
2280 	emit_alu(nfp_prog, reg_none(), or1, ALU_OP_OR, or2);
2281 	emit_br(nfp_prog, BR_BEQ, insn->off, 0);
2282 
2283 	return 0;
2284 }
2285 
2286 static int jgt_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2287 {
2288 	return wrp_cmp_imm(nfp_prog, meta, BR_BLO, true);
2289 }
2290 
2291 static int jge_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2292 {
2293 	return wrp_cmp_imm(nfp_prog, meta, BR_BHS, false);
2294 }
2295 
2296 static int jlt_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2297 {
2298 	return wrp_cmp_imm(nfp_prog, meta, BR_BLO, false);
2299 }
2300 
2301 static int jle_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2302 {
2303 	return wrp_cmp_imm(nfp_prog, meta, BR_BHS, true);
2304 }
2305 
2306 static int jsgt_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2307 {
2308 	return wrp_cmp_imm(nfp_prog, meta, BR_BLT, true);
2309 }
2310 
2311 static int jsge_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2312 {
2313 	return wrp_cmp_imm(nfp_prog, meta, BR_BGE, false);
2314 }
2315 
2316 static int jslt_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2317 {
2318 	return wrp_cmp_imm(nfp_prog, meta, BR_BLT, false);
2319 }
2320 
2321 static int jsle_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2322 {
2323 	return wrp_cmp_imm(nfp_prog, meta, BR_BGE, true);
2324 }
2325 
2326 static int jset_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2327 {
2328 	const struct bpf_insn *insn = &meta->insn;
2329 	u64 imm = insn->imm; /* sign extend */
2330 	swreg tmp_reg;
2331 
2332 	if (!imm) {
2333 		meta->skip = true;
2334 		return 0;
2335 	}
2336 
2337 	if (imm & ~0U) {
2338 		tmp_reg = ur_load_imm_any(nfp_prog, imm & ~0U, imm_b(nfp_prog));
2339 		emit_alu(nfp_prog, reg_none(),
2340 			 reg_a(insn->dst_reg * 2), ALU_OP_AND, tmp_reg);
2341 		emit_br(nfp_prog, BR_BNE, insn->off, 0);
2342 	}
2343 
2344 	if (imm >> 32) {
2345 		tmp_reg = ur_load_imm_any(nfp_prog, imm >> 32, imm_b(nfp_prog));
2346 		emit_alu(nfp_prog, reg_none(),
2347 			 reg_a(insn->dst_reg * 2 + 1), ALU_OP_AND, tmp_reg);
2348 		emit_br(nfp_prog, BR_BNE, insn->off, 0);
2349 	}
2350 
2351 	return 0;
2352 }
2353 
2354 static int jne_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2355 {
2356 	const struct bpf_insn *insn = &meta->insn;
2357 	u64 imm = insn->imm; /* sign extend */
2358 	swreg tmp_reg;
2359 
2360 	if (!imm) {
2361 		emit_alu(nfp_prog, reg_none(), reg_a(insn->dst_reg * 2),
2362 			 ALU_OP_OR, reg_b(insn->dst_reg * 2 + 1));
2363 		emit_br(nfp_prog, BR_BNE, insn->off, 0);
2364 		return 0;
2365 	}
2366 
2367 	tmp_reg = ur_load_imm_any(nfp_prog, imm & ~0U, imm_b(nfp_prog));
2368 	emit_alu(nfp_prog, reg_none(),
2369 		 reg_a(insn->dst_reg * 2), ALU_OP_XOR, tmp_reg);
2370 	emit_br(nfp_prog, BR_BNE, insn->off, 0);
2371 
2372 	tmp_reg = ur_load_imm_any(nfp_prog, imm >> 32, imm_b(nfp_prog));
2373 	emit_alu(nfp_prog, reg_none(),
2374 		 reg_a(insn->dst_reg * 2 + 1), ALU_OP_XOR, tmp_reg);
2375 	emit_br(nfp_prog, BR_BNE, insn->off, 0);
2376 
2377 	return 0;
2378 }
2379 
2380 static int jeq_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2381 {
2382 	const struct bpf_insn *insn = &meta->insn;
2383 
2384 	emit_alu(nfp_prog, imm_a(nfp_prog), reg_a(insn->dst_reg * 2),
2385 		 ALU_OP_XOR, reg_b(insn->src_reg * 2));
2386 	emit_alu(nfp_prog, imm_b(nfp_prog), reg_a(insn->dst_reg * 2 + 1),
2387 		 ALU_OP_XOR, reg_b(insn->src_reg * 2 + 1));
2388 	emit_alu(nfp_prog, reg_none(),
2389 		 imm_a(nfp_prog), ALU_OP_OR, imm_b(nfp_prog));
2390 	emit_br(nfp_prog, BR_BEQ, insn->off, 0);
2391 
2392 	return 0;
2393 }
2394 
2395 static int jgt_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2396 {
2397 	return wrp_cmp_reg(nfp_prog, meta, BR_BLO, true);
2398 }
2399 
2400 static int jge_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2401 {
2402 	return wrp_cmp_reg(nfp_prog, meta, BR_BHS, false);
2403 }
2404 
2405 static int jlt_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2406 {
2407 	return wrp_cmp_reg(nfp_prog, meta, BR_BLO, false);
2408 }
2409 
2410 static int jle_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2411 {
2412 	return wrp_cmp_reg(nfp_prog, meta, BR_BHS, true);
2413 }
2414 
2415 static int jsgt_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2416 {
2417 	return wrp_cmp_reg(nfp_prog, meta, BR_BLT, true);
2418 }
2419 
2420 static int jsge_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2421 {
2422 	return wrp_cmp_reg(nfp_prog, meta, BR_BGE, false);
2423 }
2424 
2425 static int jslt_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2426 {
2427 	return wrp_cmp_reg(nfp_prog, meta, BR_BLT, false);
2428 }
2429 
2430 static int jsle_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2431 {
2432 	return wrp_cmp_reg(nfp_prog, meta, BR_BGE, true);
2433 }
2434 
2435 static int jset_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2436 {
2437 	return wrp_test_reg(nfp_prog, meta, ALU_OP_AND, BR_BNE);
2438 }
2439 
2440 static int jne_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2441 {
2442 	return wrp_test_reg(nfp_prog, meta, ALU_OP_XOR, BR_BNE);
2443 }
2444 
2445 static int call(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2446 {
2447 	switch (meta->insn.imm) {
2448 	case BPF_FUNC_xdp_adjust_head:
2449 		return adjust_head(nfp_prog, meta);
2450 	case BPF_FUNC_map_lookup_elem:
2451 	case BPF_FUNC_map_update_elem:
2452 	case BPF_FUNC_map_delete_elem:
2453 		return map_call_stack_common(nfp_prog, meta);
2454 	case BPF_FUNC_get_prandom_u32:
2455 		return nfp_get_prandom_u32(nfp_prog, meta);
2456 	default:
2457 		WARN_ONCE(1, "verifier allowed unsupported function\n");
2458 		return -EOPNOTSUPP;
2459 	}
2460 }
2461 
2462 static int goto_out(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2463 {
2464 	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 0, RELO_BR_GO_OUT);
2465 
2466 	return 0;
2467 }
2468 
2469 static const instr_cb_t instr_cb[256] = {
2470 	[BPF_ALU64 | BPF_MOV | BPF_X] =	mov_reg64,
2471 	[BPF_ALU64 | BPF_MOV | BPF_K] =	mov_imm64,
2472 	[BPF_ALU64 | BPF_XOR | BPF_X] =	xor_reg64,
2473 	[BPF_ALU64 | BPF_XOR | BPF_K] =	xor_imm64,
2474 	[BPF_ALU64 | BPF_AND | BPF_X] =	and_reg64,
2475 	[BPF_ALU64 | BPF_AND | BPF_K] =	and_imm64,
2476 	[BPF_ALU64 | BPF_OR | BPF_X] =	or_reg64,
2477 	[BPF_ALU64 | BPF_OR | BPF_K] =	or_imm64,
2478 	[BPF_ALU64 | BPF_ADD | BPF_X] =	add_reg64,
2479 	[BPF_ALU64 | BPF_ADD | BPF_K] =	add_imm64,
2480 	[BPF_ALU64 | BPF_SUB | BPF_X] =	sub_reg64,
2481 	[BPF_ALU64 | BPF_SUB | BPF_K] =	sub_imm64,
2482 	[BPF_ALU64 | BPF_NEG] =		neg_reg64,
2483 	[BPF_ALU64 | BPF_LSH | BPF_K] =	shl_imm64,
2484 	[BPF_ALU64 | BPF_RSH | BPF_K] =	shr_imm64,
2485 	[BPF_ALU | BPF_MOV | BPF_X] =	mov_reg,
2486 	[BPF_ALU | BPF_MOV | BPF_K] =	mov_imm,
2487 	[BPF_ALU | BPF_XOR | BPF_X] =	xor_reg,
2488 	[BPF_ALU | BPF_XOR | BPF_K] =	xor_imm,
2489 	[BPF_ALU | BPF_AND | BPF_X] =	and_reg,
2490 	[BPF_ALU | BPF_AND | BPF_K] =	and_imm,
2491 	[BPF_ALU | BPF_OR | BPF_X] =	or_reg,
2492 	[BPF_ALU | BPF_OR | BPF_K] =	or_imm,
2493 	[BPF_ALU | BPF_ADD | BPF_X] =	add_reg,
2494 	[BPF_ALU | BPF_ADD | BPF_K] =	add_imm,
2495 	[BPF_ALU | BPF_SUB | BPF_X] =	sub_reg,
2496 	[BPF_ALU | BPF_SUB | BPF_K] =	sub_imm,
2497 	[BPF_ALU | BPF_NEG] =		neg_reg,
2498 	[BPF_ALU | BPF_LSH | BPF_K] =	shl_imm,
2499 	[BPF_ALU | BPF_END | BPF_X] =	end_reg32,
2500 	[BPF_LD | BPF_IMM | BPF_DW] =	imm_ld8,
2501 	[BPF_LD | BPF_ABS | BPF_B] =	data_ld1,
2502 	[BPF_LD | BPF_ABS | BPF_H] =	data_ld2,
2503 	[BPF_LD | BPF_ABS | BPF_W] =	data_ld4,
2504 	[BPF_LD | BPF_IND | BPF_B] =	data_ind_ld1,
2505 	[BPF_LD | BPF_IND | BPF_H] =	data_ind_ld2,
2506 	[BPF_LD | BPF_IND | BPF_W] =	data_ind_ld4,
2507 	[BPF_LDX | BPF_MEM | BPF_B] =	mem_ldx1,
2508 	[BPF_LDX | BPF_MEM | BPF_H] =	mem_ldx2,
2509 	[BPF_LDX | BPF_MEM | BPF_W] =	mem_ldx4,
2510 	[BPF_LDX | BPF_MEM | BPF_DW] =	mem_ldx8,
2511 	[BPF_STX | BPF_MEM | BPF_B] =	mem_stx1,
2512 	[BPF_STX | BPF_MEM | BPF_H] =	mem_stx2,
2513 	[BPF_STX | BPF_MEM | BPF_W] =	mem_stx4,
2514 	[BPF_STX | BPF_MEM | BPF_DW] =	mem_stx8,
2515 	[BPF_STX | BPF_XADD | BPF_W] =	mem_xadd4,
2516 	[BPF_STX | BPF_XADD | BPF_DW] =	mem_xadd8,
2517 	[BPF_ST | BPF_MEM | BPF_B] =	mem_st1,
2518 	[BPF_ST | BPF_MEM | BPF_H] =	mem_st2,
2519 	[BPF_ST | BPF_MEM | BPF_W] =	mem_st4,
2520 	[BPF_ST | BPF_MEM | BPF_DW] =	mem_st8,
2521 	[BPF_JMP | BPF_JA | BPF_K] =	jump,
2522 	[BPF_JMP | BPF_JEQ | BPF_K] =	jeq_imm,
2523 	[BPF_JMP | BPF_JGT | BPF_K] =	jgt_imm,
2524 	[BPF_JMP | BPF_JGE | BPF_K] =	jge_imm,
2525 	[BPF_JMP | BPF_JLT | BPF_K] =	jlt_imm,
2526 	[BPF_JMP | BPF_JLE | BPF_K] =	jle_imm,
2527 	[BPF_JMP | BPF_JSGT | BPF_K] =  jsgt_imm,
2528 	[BPF_JMP | BPF_JSGE | BPF_K] =  jsge_imm,
2529 	[BPF_JMP | BPF_JSLT | BPF_K] =  jslt_imm,
2530 	[BPF_JMP | BPF_JSLE | BPF_K] =  jsle_imm,
2531 	[BPF_JMP | BPF_JSET | BPF_K] =	jset_imm,
2532 	[BPF_JMP | BPF_JNE | BPF_K] =	jne_imm,
2533 	[BPF_JMP | BPF_JEQ | BPF_X] =	jeq_reg,
2534 	[BPF_JMP | BPF_JGT | BPF_X] =	jgt_reg,
2535 	[BPF_JMP | BPF_JGE | BPF_X] =	jge_reg,
2536 	[BPF_JMP | BPF_JLT | BPF_X] =	jlt_reg,
2537 	[BPF_JMP | BPF_JLE | BPF_X] =	jle_reg,
2538 	[BPF_JMP | BPF_JSGT | BPF_X] =  jsgt_reg,
2539 	[BPF_JMP | BPF_JSGE | BPF_X] =  jsge_reg,
2540 	[BPF_JMP | BPF_JSLT | BPF_X] =  jslt_reg,
2541 	[BPF_JMP | BPF_JSLE | BPF_X] =  jsle_reg,
2542 	[BPF_JMP | BPF_JSET | BPF_X] =	jset_reg,
2543 	[BPF_JMP | BPF_JNE | BPF_X] =	jne_reg,
2544 	[BPF_JMP | BPF_CALL] =		call,
2545 	[BPF_JMP | BPF_EXIT] =		goto_out,
2546 };
2547 
2548 /* --- Assembler logic --- */
2549 static int nfp_fixup_branches(struct nfp_prog *nfp_prog)
2550 {
2551 	struct nfp_insn_meta *meta, *jmp_dst;
2552 	u32 idx, br_idx;
2553 
2554 	list_for_each_entry(meta, &nfp_prog->insns, l) {
2555 		if (meta->skip)
2556 			continue;
2557 		if (meta->insn.code == (BPF_JMP | BPF_CALL))
2558 			continue;
2559 		if (BPF_CLASS(meta->insn.code) != BPF_JMP)
2560 			continue;
2561 
2562 		if (list_is_last(&meta->l, &nfp_prog->insns))
2563 			br_idx = nfp_prog->last_bpf_off;
2564 		else
2565 			br_idx = list_next_entry(meta, l)->off - 1;
2566 
2567 		if (!nfp_is_br(nfp_prog->prog[br_idx])) {
2568 			pr_err("Fixup found block not ending in branch %d %02x %016llx!!\n",
2569 			       br_idx, meta->insn.code, nfp_prog->prog[br_idx]);
2570 			return -ELOOP;
2571 		}
2572 		/* Leave special branches for later */
2573 		if (FIELD_GET(OP_RELO_TYPE, nfp_prog->prog[br_idx]) !=
2574 		    RELO_BR_REL)
2575 			continue;
2576 
2577 		if (!meta->jmp_dst) {
2578 			pr_err("Non-exit jump doesn't have destination info recorded!!\n");
2579 			return -ELOOP;
2580 		}
2581 
2582 		jmp_dst = meta->jmp_dst;
2583 
2584 		if (jmp_dst->skip) {
2585 			pr_err("Branch landing on removed instruction!!\n");
2586 			return -ELOOP;
2587 		}
2588 
2589 		for (idx = meta->off; idx <= br_idx; idx++) {
2590 			if (!nfp_is_br(nfp_prog->prog[idx]))
2591 				continue;
2592 			br_set_offset(&nfp_prog->prog[idx], jmp_dst->off);
2593 		}
2594 	}
2595 
2596 	return 0;
2597 }
2598 
2599 static void nfp_intro(struct nfp_prog *nfp_prog)
2600 {
2601 	wrp_immed(nfp_prog, plen_reg(nfp_prog), GENMASK(13, 0));
2602 	emit_alu(nfp_prog, plen_reg(nfp_prog),
2603 		 plen_reg(nfp_prog), ALU_OP_AND, pv_len(nfp_prog));
2604 }
2605 
2606 static void nfp_outro_tc_da(struct nfp_prog *nfp_prog)
2607 {
2608 	/* TC direct-action mode:
2609 	 *   0,1   ok        NOT SUPPORTED[1]
2610 	 *   2   drop  0x22 -> drop,  count as stat1
2611 	 *   4,5 nuke  0x02 -> drop
2612 	 *   7  redir  0x44 -> redir, count as stat2
2613 	 *   * unspec  0x11 -> pass,  count as stat0
2614 	 *
2615 	 * [1] We can't support OK and RECLASSIFY because we can't tell TC
2616 	 *     the exact decision made.  We are forced to support UNSPEC
2617 	 *     to handle aborts so that's the only one we handle for passing
2618 	 *     packets up the stack.
2619 	 */
2620 	/* Target for aborts */
2621 	nfp_prog->tgt_abort = nfp_prog_current_offset(nfp_prog);
2622 
2623 	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 2, RELO_BR_NEXT_PKT);
2624 
2625 	wrp_mov(nfp_prog, reg_a(0), NFP_BPF_ABI_FLAGS);
2626 	emit_ld_field(nfp_prog, reg_a(0), 0xc, reg_imm(0x11), SHF_SC_L_SHF, 16);
2627 
2628 	/* Target for normal exits */
2629 	nfp_prog->tgt_out = nfp_prog_current_offset(nfp_prog);
2630 
2631 	/* if R0 > 7 jump to abort */
2632 	emit_alu(nfp_prog, reg_none(), reg_imm(7), ALU_OP_SUB, reg_b(0));
2633 	emit_br(nfp_prog, BR_BLO, nfp_prog->tgt_abort, 0);
2634 	wrp_mov(nfp_prog, reg_a(0), NFP_BPF_ABI_FLAGS);
2635 
2636 	wrp_immed(nfp_prog, reg_b(2), 0x41221211);
2637 	wrp_immed(nfp_prog, reg_b(3), 0x41001211);
2638 
2639 	emit_shf(nfp_prog, reg_a(1),
2640 		 reg_none(), SHF_OP_NONE, reg_b(0), SHF_SC_L_SHF, 2);
2641 
2642 	emit_alu(nfp_prog, reg_none(), reg_a(1), ALU_OP_OR, reg_imm(0));
2643 	emit_shf(nfp_prog, reg_a(2),
2644 		 reg_imm(0xf), SHF_OP_AND, reg_b(2), SHF_SC_R_SHF, 0);
2645 
2646 	emit_alu(nfp_prog, reg_none(), reg_a(1), ALU_OP_OR, reg_imm(0));
2647 	emit_shf(nfp_prog, reg_b(2),
2648 		 reg_imm(0xf), SHF_OP_AND, reg_b(3), SHF_SC_R_SHF, 0);
2649 
2650 	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 2, RELO_BR_NEXT_PKT);
2651 
2652 	emit_shf(nfp_prog, reg_b(2),
2653 		 reg_a(2), SHF_OP_OR, reg_b(2), SHF_SC_L_SHF, 4);
2654 	emit_ld_field(nfp_prog, reg_a(0), 0xc, reg_b(2), SHF_SC_L_SHF, 16);
2655 }
2656 
2657 static void nfp_outro_xdp(struct nfp_prog *nfp_prog)
2658 {
2659 	/* XDP return codes:
2660 	 *   0 aborted  0x82 -> drop,  count as stat3
2661 	 *   1    drop  0x22 -> drop,  count as stat1
2662 	 *   2    pass  0x11 -> pass,  count as stat0
2663 	 *   3      tx  0x44 -> redir, count as stat2
2664 	 *   * unknown  0x82 -> drop,  count as stat3
2665 	 */
2666 	/* Target for aborts */
2667 	nfp_prog->tgt_abort = nfp_prog_current_offset(nfp_prog);
2668 
2669 	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 2, RELO_BR_NEXT_PKT);
2670 
2671 	wrp_mov(nfp_prog, reg_a(0), NFP_BPF_ABI_FLAGS);
2672 	emit_ld_field(nfp_prog, reg_a(0), 0xc, reg_imm(0x82), SHF_SC_L_SHF, 16);
2673 
2674 	/* Target for normal exits */
2675 	nfp_prog->tgt_out = nfp_prog_current_offset(nfp_prog);
2676 
2677 	/* if R0 > 3 jump to abort */
2678 	emit_alu(nfp_prog, reg_none(), reg_imm(3), ALU_OP_SUB, reg_b(0));
2679 	emit_br(nfp_prog, BR_BLO, nfp_prog->tgt_abort, 0);
2680 
2681 	wrp_immed(nfp_prog, reg_b(2), 0x44112282);
2682 
2683 	emit_shf(nfp_prog, reg_a(1),
2684 		 reg_none(), SHF_OP_NONE, reg_b(0), SHF_SC_L_SHF, 3);
2685 
2686 	emit_alu(nfp_prog, reg_none(), reg_a(1), ALU_OP_OR, reg_imm(0));
2687 	emit_shf(nfp_prog, reg_b(2),
2688 		 reg_imm(0xff), SHF_OP_AND, reg_b(2), SHF_SC_R_SHF, 0);
2689 
2690 	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 2, RELO_BR_NEXT_PKT);
2691 
2692 	wrp_mov(nfp_prog, reg_a(0), NFP_BPF_ABI_FLAGS);
2693 	emit_ld_field(nfp_prog, reg_a(0), 0xc, reg_b(2), SHF_SC_L_SHF, 16);
2694 }
2695 
2696 static void nfp_outro(struct nfp_prog *nfp_prog)
2697 {
2698 	switch (nfp_prog->type) {
2699 	case BPF_PROG_TYPE_SCHED_CLS:
2700 		nfp_outro_tc_da(nfp_prog);
2701 		break;
2702 	case BPF_PROG_TYPE_XDP:
2703 		nfp_outro_xdp(nfp_prog);
2704 		break;
2705 	default:
2706 		WARN_ON(1);
2707 	}
2708 }
2709 
2710 static int nfp_translate(struct nfp_prog *nfp_prog)
2711 {
2712 	struct nfp_insn_meta *meta;
2713 	int err;
2714 
2715 	nfp_intro(nfp_prog);
2716 	if (nfp_prog->error)
2717 		return nfp_prog->error;
2718 
2719 	list_for_each_entry(meta, &nfp_prog->insns, l) {
2720 		instr_cb_t cb = instr_cb[meta->insn.code];
2721 
2722 		meta->off = nfp_prog_current_offset(nfp_prog);
2723 
2724 		if (meta->skip) {
2725 			nfp_prog->n_translated++;
2726 			continue;
2727 		}
2728 
2729 		if (nfp_meta_has_prev(nfp_prog, meta) &&
2730 		    nfp_meta_prev(meta)->double_cb)
2731 			cb = nfp_meta_prev(meta)->double_cb;
2732 		if (!cb)
2733 			return -ENOENT;
2734 		err = cb(nfp_prog, meta);
2735 		if (err)
2736 			return err;
2737 		if (nfp_prog->error)
2738 			return nfp_prog->error;
2739 
2740 		nfp_prog->n_translated++;
2741 	}
2742 
2743 	nfp_prog->last_bpf_off = nfp_prog_current_offset(nfp_prog) - 1;
2744 
2745 	nfp_outro(nfp_prog);
2746 	if (nfp_prog->error)
2747 		return nfp_prog->error;
2748 
2749 	wrp_nops(nfp_prog, NFP_USTORE_PREFETCH_WINDOW);
2750 	if (nfp_prog->error)
2751 		return nfp_prog->error;
2752 
2753 	return nfp_fixup_branches(nfp_prog);
2754 }
2755 
2756 /* --- Optimizations --- */
2757 static void nfp_bpf_opt_reg_init(struct nfp_prog *nfp_prog)
2758 {
2759 	struct nfp_insn_meta *meta;
2760 
2761 	list_for_each_entry(meta, &nfp_prog->insns, l) {
2762 		struct bpf_insn insn = meta->insn;
2763 
2764 		/* Programs converted from cBPF start with register xoring */
2765 		if (insn.code == (BPF_ALU64 | BPF_XOR | BPF_X) &&
2766 		    insn.src_reg == insn.dst_reg)
2767 			continue;
2768 
2769 		/* Programs start with R6 = R1 but we ignore the skb pointer */
2770 		if (insn.code == (BPF_ALU64 | BPF_MOV | BPF_X) &&
2771 		    insn.src_reg == 1 && insn.dst_reg == 6)
2772 			meta->skip = true;
2773 
2774 		/* Return as soon as something doesn't match */
2775 		if (!meta->skip)
2776 			return;
2777 	}
2778 }
2779 
2780 /* Remove masking after load since our load guarantees this is not needed */
2781 static void nfp_bpf_opt_ld_mask(struct nfp_prog *nfp_prog)
2782 {
2783 	struct nfp_insn_meta *meta1, *meta2;
2784 	const s32 exp_mask[] = {
2785 		[BPF_B] = 0x000000ffU,
2786 		[BPF_H] = 0x0000ffffU,
2787 		[BPF_W] = 0xffffffffU,
2788 	};
2789 
2790 	nfp_for_each_insn_walk2(nfp_prog, meta1, meta2) {
2791 		struct bpf_insn insn, next;
2792 
2793 		insn = meta1->insn;
2794 		next = meta2->insn;
2795 
2796 		if (BPF_CLASS(insn.code) != BPF_LD)
2797 			continue;
2798 		if (BPF_MODE(insn.code) != BPF_ABS &&
2799 		    BPF_MODE(insn.code) != BPF_IND)
2800 			continue;
2801 
2802 		if (next.code != (BPF_ALU64 | BPF_AND | BPF_K))
2803 			continue;
2804 
2805 		if (!exp_mask[BPF_SIZE(insn.code)])
2806 			continue;
2807 		if (exp_mask[BPF_SIZE(insn.code)] != next.imm)
2808 			continue;
2809 
2810 		if (next.src_reg || next.dst_reg)
2811 			continue;
2812 
2813 		if (meta2->flags & FLAG_INSN_IS_JUMP_DST)
2814 			continue;
2815 
2816 		meta2->skip = true;
2817 	}
2818 }
2819 
2820 static void nfp_bpf_opt_ld_shift(struct nfp_prog *nfp_prog)
2821 {
2822 	struct nfp_insn_meta *meta1, *meta2, *meta3;
2823 
2824 	nfp_for_each_insn_walk3(nfp_prog, meta1, meta2, meta3) {
2825 		struct bpf_insn insn, next1, next2;
2826 
2827 		insn = meta1->insn;
2828 		next1 = meta2->insn;
2829 		next2 = meta3->insn;
2830 
2831 		if (BPF_CLASS(insn.code) != BPF_LD)
2832 			continue;
2833 		if (BPF_MODE(insn.code) != BPF_ABS &&
2834 		    BPF_MODE(insn.code) != BPF_IND)
2835 			continue;
2836 		if (BPF_SIZE(insn.code) != BPF_W)
2837 			continue;
2838 
2839 		if (!(next1.code == (BPF_LSH | BPF_K | BPF_ALU64) &&
2840 		      next2.code == (BPF_RSH | BPF_K | BPF_ALU64)) &&
2841 		    !(next1.code == (BPF_RSH | BPF_K | BPF_ALU64) &&
2842 		      next2.code == (BPF_LSH | BPF_K | BPF_ALU64)))
2843 			continue;
2844 
2845 		if (next1.src_reg || next1.dst_reg ||
2846 		    next2.src_reg || next2.dst_reg)
2847 			continue;
2848 
2849 		if (next1.imm != 0x20 || next2.imm != 0x20)
2850 			continue;
2851 
2852 		if (meta2->flags & FLAG_INSN_IS_JUMP_DST ||
2853 		    meta3->flags & FLAG_INSN_IS_JUMP_DST)
2854 			continue;
2855 
2856 		meta2->skip = true;
2857 		meta3->skip = true;
2858 	}
2859 }
2860 
2861 /* load/store pair that forms memory copy sould look like the following:
2862  *
2863  *   ld_width R, [addr_src + offset_src]
2864  *   st_width [addr_dest + offset_dest], R
2865  *
2866  * The destination register of load and source register of store should
2867  * be the same, load and store should also perform at the same width.
2868  * If either of addr_src or addr_dest is stack pointer, we don't do the
2869  * CPP optimization as stack is modelled by registers on NFP.
2870  */
2871 static bool
2872 curr_pair_is_memcpy(struct nfp_insn_meta *ld_meta,
2873 		    struct nfp_insn_meta *st_meta)
2874 {
2875 	struct bpf_insn *ld = &ld_meta->insn;
2876 	struct bpf_insn *st = &st_meta->insn;
2877 
2878 	if (!is_mbpf_load(ld_meta) || !is_mbpf_store(st_meta))
2879 		return false;
2880 
2881 	if (ld_meta->ptr.type != PTR_TO_PACKET)
2882 		return false;
2883 
2884 	if (st_meta->ptr.type != PTR_TO_PACKET)
2885 		return false;
2886 
2887 	if (BPF_SIZE(ld->code) != BPF_SIZE(st->code))
2888 		return false;
2889 
2890 	if (ld->dst_reg != st->src_reg)
2891 		return false;
2892 
2893 	/* There is jump to the store insn in this pair. */
2894 	if (st_meta->flags & FLAG_INSN_IS_JUMP_DST)
2895 		return false;
2896 
2897 	return true;
2898 }
2899 
2900 /* Currently, we only support chaining load/store pairs if:
2901  *
2902  *  - Their address base registers are the same.
2903  *  - Their address offsets are in the same order.
2904  *  - They operate at the same memory width.
2905  *  - There is no jump into the middle of them.
2906  */
2907 static bool
2908 curr_pair_chain_with_previous(struct nfp_insn_meta *ld_meta,
2909 			      struct nfp_insn_meta *st_meta,
2910 			      struct bpf_insn *prev_ld,
2911 			      struct bpf_insn *prev_st)
2912 {
2913 	u8 prev_size, curr_size, prev_ld_base, prev_st_base, prev_ld_dst;
2914 	struct bpf_insn *ld = &ld_meta->insn;
2915 	struct bpf_insn *st = &st_meta->insn;
2916 	s16 prev_ld_off, prev_st_off;
2917 
2918 	/* This pair is the start pair. */
2919 	if (!prev_ld)
2920 		return true;
2921 
2922 	prev_size = BPF_LDST_BYTES(prev_ld);
2923 	curr_size = BPF_LDST_BYTES(ld);
2924 	prev_ld_base = prev_ld->src_reg;
2925 	prev_st_base = prev_st->dst_reg;
2926 	prev_ld_dst = prev_ld->dst_reg;
2927 	prev_ld_off = prev_ld->off;
2928 	prev_st_off = prev_st->off;
2929 
2930 	if (ld->dst_reg != prev_ld_dst)
2931 		return false;
2932 
2933 	if (ld->src_reg != prev_ld_base || st->dst_reg != prev_st_base)
2934 		return false;
2935 
2936 	if (curr_size != prev_size)
2937 		return false;
2938 
2939 	/* There is jump to the head of this pair. */
2940 	if (ld_meta->flags & FLAG_INSN_IS_JUMP_DST)
2941 		return false;
2942 
2943 	/* Both in ascending order. */
2944 	if (prev_ld_off + prev_size == ld->off &&
2945 	    prev_st_off + prev_size == st->off)
2946 		return true;
2947 
2948 	/* Both in descending order. */
2949 	if (ld->off + curr_size == prev_ld_off &&
2950 	    st->off + curr_size == prev_st_off)
2951 		return true;
2952 
2953 	return false;
2954 }
2955 
2956 /* Return TRUE if cross memory access happens. Cross memory access means
2957  * store area is overlapping with load area that a later load might load
2958  * the value from previous store, for this case we can't treat the sequence
2959  * as an memory copy.
2960  */
2961 static bool
2962 cross_mem_access(struct bpf_insn *ld, struct nfp_insn_meta *head_ld_meta,
2963 		 struct nfp_insn_meta *head_st_meta)
2964 {
2965 	s16 head_ld_off, head_st_off, ld_off;
2966 
2967 	/* Different pointer types does not overlap. */
2968 	if (head_ld_meta->ptr.type != head_st_meta->ptr.type)
2969 		return false;
2970 
2971 	/* load and store are both PTR_TO_PACKET, check ID info.  */
2972 	if (head_ld_meta->ptr.id != head_st_meta->ptr.id)
2973 		return true;
2974 
2975 	/* Canonicalize the offsets. Turn all of them against the original
2976 	 * base register.
2977 	 */
2978 	head_ld_off = head_ld_meta->insn.off + head_ld_meta->ptr.off;
2979 	head_st_off = head_st_meta->insn.off + head_st_meta->ptr.off;
2980 	ld_off = ld->off + head_ld_meta->ptr.off;
2981 
2982 	/* Ascending order cross. */
2983 	if (ld_off > head_ld_off &&
2984 	    head_ld_off < head_st_off && ld_off >= head_st_off)
2985 		return true;
2986 
2987 	/* Descending order cross. */
2988 	if (ld_off < head_ld_off &&
2989 	    head_ld_off > head_st_off && ld_off <= head_st_off)
2990 		return true;
2991 
2992 	return false;
2993 }
2994 
2995 /* This pass try to identify the following instructoin sequences.
2996  *
2997  *   load R, [regA + offA]
2998  *   store [regB + offB], R
2999  *   load R, [regA + offA + const_imm_A]
3000  *   store [regB + offB + const_imm_A], R
3001  *   load R, [regA + offA + 2 * const_imm_A]
3002  *   store [regB + offB + 2 * const_imm_A], R
3003  *   ...
3004  *
3005  * Above sequence is typically generated by compiler when lowering
3006  * memcpy. NFP prefer using CPP instructions to accelerate it.
3007  */
3008 static void nfp_bpf_opt_ldst_gather(struct nfp_prog *nfp_prog)
3009 {
3010 	struct nfp_insn_meta *head_ld_meta = NULL;
3011 	struct nfp_insn_meta *head_st_meta = NULL;
3012 	struct nfp_insn_meta *meta1, *meta2;
3013 	struct bpf_insn *prev_ld = NULL;
3014 	struct bpf_insn *prev_st = NULL;
3015 	u8 count = 0;
3016 
3017 	nfp_for_each_insn_walk2(nfp_prog, meta1, meta2) {
3018 		struct bpf_insn *ld = &meta1->insn;
3019 		struct bpf_insn *st = &meta2->insn;
3020 
3021 		/* Reset record status if any of the following if true:
3022 		 *   - The current insn pair is not load/store.
3023 		 *   - The load/store pair doesn't chain with previous one.
3024 		 *   - The chained load/store pair crossed with previous pair.
3025 		 *   - The chained load/store pair has a total size of memory
3026 		 *     copy beyond 128 bytes which is the maximum length a
3027 		 *     single NFP CPP command can transfer.
3028 		 */
3029 		if (!curr_pair_is_memcpy(meta1, meta2) ||
3030 		    !curr_pair_chain_with_previous(meta1, meta2, prev_ld,
3031 						   prev_st) ||
3032 		    (head_ld_meta && (cross_mem_access(ld, head_ld_meta,
3033 						       head_st_meta) ||
3034 				      head_ld_meta->ldst_gather_len >= 128))) {
3035 			if (!count)
3036 				continue;
3037 
3038 			if (count > 1) {
3039 				s16 prev_ld_off = prev_ld->off;
3040 				s16 prev_st_off = prev_st->off;
3041 				s16 head_ld_off = head_ld_meta->insn.off;
3042 
3043 				if (prev_ld_off < head_ld_off) {
3044 					head_ld_meta->insn.off = prev_ld_off;
3045 					head_st_meta->insn.off = prev_st_off;
3046 					head_ld_meta->ldst_gather_len =
3047 						-head_ld_meta->ldst_gather_len;
3048 				}
3049 
3050 				head_ld_meta->paired_st = &head_st_meta->insn;
3051 				head_st_meta->skip = true;
3052 			} else {
3053 				head_ld_meta->ldst_gather_len = 0;
3054 			}
3055 
3056 			/* If the chain is ended by an load/store pair then this
3057 			 * could serve as the new head of the the next chain.
3058 			 */
3059 			if (curr_pair_is_memcpy(meta1, meta2)) {
3060 				head_ld_meta = meta1;
3061 				head_st_meta = meta2;
3062 				head_ld_meta->ldst_gather_len =
3063 					BPF_LDST_BYTES(ld);
3064 				meta1 = nfp_meta_next(meta1);
3065 				meta2 = nfp_meta_next(meta2);
3066 				prev_ld = ld;
3067 				prev_st = st;
3068 				count = 1;
3069 			} else {
3070 				head_ld_meta = NULL;
3071 				head_st_meta = NULL;
3072 				prev_ld = NULL;
3073 				prev_st = NULL;
3074 				count = 0;
3075 			}
3076 
3077 			continue;
3078 		}
3079 
3080 		if (!head_ld_meta) {
3081 			head_ld_meta = meta1;
3082 			head_st_meta = meta2;
3083 		} else {
3084 			meta1->skip = true;
3085 			meta2->skip = true;
3086 		}
3087 
3088 		head_ld_meta->ldst_gather_len += BPF_LDST_BYTES(ld);
3089 		meta1 = nfp_meta_next(meta1);
3090 		meta2 = nfp_meta_next(meta2);
3091 		prev_ld = ld;
3092 		prev_st = st;
3093 		count++;
3094 	}
3095 }
3096 
3097 static void nfp_bpf_opt_pkt_cache(struct nfp_prog *nfp_prog)
3098 {
3099 	struct nfp_insn_meta *meta, *range_node = NULL;
3100 	s16 range_start = 0, range_end = 0;
3101 	bool cache_avail = false;
3102 	struct bpf_insn *insn;
3103 	s32 range_ptr_off = 0;
3104 	u32 range_ptr_id = 0;
3105 
3106 	list_for_each_entry(meta, &nfp_prog->insns, l) {
3107 		if (meta->flags & FLAG_INSN_IS_JUMP_DST)
3108 			cache_avail = false;
3109 
3110 		if (meta->skip)
3111 			continue;
3112 
3113 		insn = &meta->insn;
3114 
3115 		if (is_mbpf_store_pkt(meta) ||
3116 		    insn->code == (BPF_JMP | BPF_CALL) ||
3117 		    is_mbpf_classic_store_pkt(meta) ||
3118 		    is_mbpf_classic_load(meta)) {
3119 			cache_avail = false;
3120 			continue;
3121 		}
3122 
3123 		if (!is_mbpf_load(meta))
3124 			continue;
3125 
3126 		if (meta->ptr.type != PTR_TO_PACKET || meta->ldst_gather_len) {
3127 			cache_avail = false;
3128 			continue;
3129 		}
3130 
3131 		if (!cache_avail) {
3132 			cache_avail = true;
3133 			if (range_node)
3134 				goto end_current_then_start_new;
3135 			goto start_new;
3136 		}
3137 
3138 		/* Check ID to make sure two reads share the same
3139 		 * variable offset against PTR_TO_PACKET, and check OFF
3140 		 * to make sure they also share the same constant
3141 		 * offset.
3142 		 *
3143 		 * OFFs don't really need to be the same, because they
3144 		 * are the constant offsets against PTR_TO_PACKET, so
3145 		 * for different OFFs, we could canonicalize them to
3146 		 * offsets against original packet pointer. We don't
3147 		 * support this.
3148 		 */
3149 		if (meta->ptr.id == range_ptr_id &&
3150 		    meta->ptr.off == range_ptr_off) {
3151 			s16 new_start = range_start;
3152 			s16 end, off = insn->off;
3153 			s16 new_end = range_end;
3154 			bool changed = false;
3155 
3156 			if (off < range_start) {
3157 				new_start = off;
3158 				changed = true;
3159 			}
3160 
3161 			end = off + BPF_LDST_BYTES(insn);
3162 			if (end > range_end) {
3163 				new_end = end;
3164 				changed = true;
3165 			}
3166 
3167 			if (!changed)
3168 				continue;
3169 
3170 			if (new_end - new_start <= 64) {
3171 				/* Install new range. */
3172 				range_start = new_start;
3173 				range_end = new_end;
3174 				continue;
3175 			}
3176 		}
3177 
3178 end_current_then_start_new:
3179 		range_node->pkt_cache.range_start = range_start;
3180 		range_node->pkt_cache.range_end = range_end;
3181 start_new:
3182 		range_node = meta;
3183 		range_node->pkt_cache.do_init = true;
3184 		range_ptr_id = range_node->ptr.id;
3185 		range_ptr_off = range_node->ptr.off;
3186 		range_start = insn->off;
3187 		range_end = insn->off + BPF_LDST_BYTES(insn);
3188 	}
3189 
3190 	if (range_node) {
3191 		range_node->pkt_cache.range_start = range_start;
3192 		range_node->pkt_cache.range_end = range_end;
3193 	}
3194 
3195 	list_for_each_entry(meta, &nfp_prog->insns, l) {
3196 		if (meta->skip)
3197 			continue;
3198 
3199 		if (is_mbpf_load_pkt(meta) && !meta->ldst_gather_len) {
3200 			if (meta->pkt_cache.do_init) {
3201 				range_start = meta->pkt_cache.range_start;
3202 				range_end = meta->pkt_cache.range_end;
3203 			} else {
3204 				meta->pkt_cache.range_start = range_start;
3205 				meta->pkt_cache.range_end = range_end;
3206 			}
3207 		}
3208 	}
3209 }
3210 
/* Run the optimisation passes over @nfp_prog in a fixed order: register
 * init, load mask, load shift, load/store gather and finally packet cache.
 * The packet cache pass runs after the gather pass and reads the
 * ldst_gather_len values that pass records.
 *
 * Always returns 0.
 */
static int nfp_bpf_optimize(struct nfp_prog *nfp_prog)
{
	const int status = 0;

	nfp_bpf_opt_reg_init(nfp_prog);

	nfp_bpf_opt_ld_mask(nfp_prog);
	nfp_bpf_opt_ld_shift(nfp_prog);

	nfp_bpf_opt_ldst_gather(nfp_prog);
	nfp_bpf_opt_pkt_cache(nfp_prog);

	return status;
}
3222 
3223 static int nfp_bpf_ustore_calc(u64 *prog, unsigned int len)
3224 {
3225 	__le64 *ustore = (__force __le64 *)prog;
3226 	int i;
3227 
3228 	for (i = 0; i < len; i++) {
3229 		int err;
3230 
3231 		err = nfp_ustore_check_valid_no_ecc(prog[i]);
3232 		if (err)
3233 			return err;
3234 
3235 		ustore[i] = cpu_to_le64(nfp_ustore_calc_ecc_insn(prog[i]));
3236 	}
3237 
3238 	return 0;
3239 }
3240 
3241 static void nfp_bpf_prog_trim(struct nfp_prog *nfp_prog)
3242 {
3243 	void *prog;
3244 
3245 	prog = kvmalloc_array(nfp_prog->prog_len, sizeof(u64), GFP_KERNEL);
3246 	if (!prog)
3247 		return;
3248 
3249 	nfp_prog->__prog_alloc_len = nfp_prog->prog_len * sizeof(u64);
3250 	memcpy(prog, nfp_prog->prog, nfp_prog->__prog_alloc_len);
3251 	kvfree(nfp_prog->prog);
3252 	nfp_prog->prog = prog;
3253 }
3254 
3255 int nfp_bpf_jit(struct nfp_prog *nfp_prog)
3256 {
3257 	int ret;
3258 
3259 	ret = nfp_bpf_optimize(nfp_prog);
3260 	if (ret)
3261 		return ret;
3262 
3263 	ret = nfp_translate(nfp_prog);
3264 	if (ret) {
3265 		pr_err("Translation failed with error %d (translated: %u)\n",
3266 		       ret, nfp_prog->n_translated);
3267 		return -EINVAL;
3268 	}
3269 
3270 	nfp_bpf_prog_trim(nfp_prog);
3271 
3272 	return ret;
3273 }
3274 
3275 void nfp_bpf_jit_prepare(struct nfp_prog *nfp_prog, unsigned int cnt)
3276 {
3277 	struct nfp_insn_meta *meta;
3278 
3279 	/* Another pass to record jump information. */
3280 	list_for_each_entry(meta, &nfp_prog->insns, l) {
3281 		u64 code = meta->insn.code;
3282 
3283 		if (BPF_CLASS(code) == BPF_JMP && BPF_OP(code) != BPF_EXIT &&
3284 		    BPF_OP(code) != BPF_CALL) {
3285 			struct nfp_insn_meta *dst_meta;
3286 			unsigned short dst_indx;
3287 
3288 			dst_indx = meta->n + 1 + meta->insn.off;
3289 			dst_meta = nfp_bpf_goto_meta(nfp_prog, meta, dst_indx,
3290 						     cnt);
3291 
3292 			meta->jmp_dst = dst_meta;
3293 			dst_meta->flags |= FLAG_INSN_IS_JUMP_DST;
3294 		}
3295 	}
3296 }
3297 
3298 bool nfp_bpf_supported_opcode(u8 code)
3299 {
3300 	return !!instr_cb[code];
3301 }
3302 
3303 void *nfp_bpf_relo_for_vnic(struct nfp_prog *nfp_prog, struct nfp_bpf_vnic *bv)
3304 {
3305 	unsigned int i;
3306 	u64 *prog;
3307 	int err;
3308 
3309 	prog = kmemdup(nfp_prog->prog, nfp_prog->prog_len * sizeof(u64),
3310 		       GFP_KERNEL);
3311 	if (!prog)
3312 		return ERR_PTR(-ENOMEM);
3313 
3314 	for (i = 0; i < nfp_prog->prog_len; i++) {
3315 		enum nfp_relo_type special;
3316 		u32 val;
3317 
3318 		special = FIELD_GET(OP_RELO_TYPE, prog[i]);
3319 		switch (special) {
3320 		case RELO_NONE:
3321 			continue;
3322 		case RELO_BR_REL:
3323 			br_add_offset(&prog[i], bv->start_off);
3324 			break;
3325 		case RELO_BR_GO_OUT:
3326 			br_set_offset(&prog[i],
3327 				      nfp_prog->tgt_out + bv->start_off);
3328 			break;
3329 		case RELO_BR_GO_ABORT:
3330 			br_set_offset(&prog[i],
3331 				      nfp_prog->tgt_abort + bv->start_off);
3332 			break;
3333 		case RELO_BR_NEXT_PKT:
3334 			br_set_offset(&prog[i], bv->tgt_done);
3335 			break;
3336 		case RELO_BR_HELPER:
3337 			val = br_get_offset(prog[i]);
3338 			val -= BR_OFF_RELO;
3339 			switch (val) {
3340 			case BPF_FUNC_map_lookup_elem:
3341 				val = nfp_prog->bpf->helpers.map_lookup;
3342 				break;
3343 			case BPF_FUNC_map_update_elem:
3344 				val = nfp_prog->bpf->helpers.map_update;
3345 				break;
3346 			case BPF_FUNC_map_delete_elem:
3347 				val = nfp_prog->bpf->helpers.map_delete;
3348 				break;
3349 			default:
3350 				pr_err("relocation of unknown helper %d\n",
3351 				       val);
3352 				err = -EINVAL;
3353 				goto err_free_prog;
3354 			}
3355 			br_set_offset(&prog[i], val);
3356 			break;
3357 		case RELO_IMMED_REL:
3358 			immed_add_value(&prog[i], bv->start_off);
3359 			break;
3360 		}
3361 
3362 		prog[i] &= ~OP_RELO_TYPE;
3363 	}
3364 
3365 	err = nfp_bpf_ustore_calc(prog, nfp_prog->prog_len);
3366 	if (err)
3367 		goto err_free_prog;
3368 
3369 	return prog;
3370 
3371 err_free_prog:
3372 	kfree(prog);
3373 	return ERR_PTR(err);
3374 }
3375