1 /*
2  * Copyright (C) 2016-2018 Netronome Systems, Inc.
3  *
 * This software is dual licensed under the GNU General Public License Version 2,
5  * June 1991 as shown in the file COPYING in the top-level directory of this
6  * source tree or the BSD 2-Clause License provided below.  You have the
7  * option to license this software under the complete terms of either license.
8  *
9  * The BSD 2-Clause License:
10  *
11  *     Redistribution and use in source and binary forms, with or
12  *     without modification, are permitted provided that the following
13  *     conditions are met:
14  *
15  *      1. Redistributions of source code must retain the above
16  *         copyright notice, this list of conditions and the following
17  *         disclaimer.
18  *
19  *      2. Redistributions in binary form must reproduce the above
20  *         copyright notice, this list of conditions and the following
21  *         disclaimer in the documentation and/or other materials
22  *         provided with the distribution.
23  *
24  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
27  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
28  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
29  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
30  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
31  * SOFTWARE.
32  */
33 
34 #define pr_fmt(fmt)	"NFP net bpf: " fmt
35 
36 #include <linux/bug.h>
37 #include <linux/kernel.h>
38 #include <linux/bpf.h>
39 #include <linux/filter.h>
40 #include <linux/pkt_cls.h>
41 #include <linux/unistd.h>
42 
43 #include "main.h"
44 #include "../nfp_asm.h"
45 
46 /* --- NFP prog --- */
/* The foreach macros for walking multiple entries provide pos and next<n>
 * pointers.  It's safe to modify the next pointers (but not pos).
 */
50 #define nfp_for_each_insn_walk2(nfp_prog, pos, next)			\
51 	for (pos = list_first_entry(&(nfp_prog)->insns, typeof(*pos), l), \
52 	     next = list_next_entry(pos, l);			\
53 	     &(nfp_prog)->insns != &pos->l &&			\
54 	     &(nfp_prog)->insns != &next->l;			\
55 	     pos = nfp_meta_next(pos),				\
56 	     next = nfp_meta_next(pos))
57 
58 #define nfp_for_each_insn_walk3(nfp_prog, pos, next, next2)		\
59 	for (pos = list_first_entry(&(nfp_prog)->insns, typeof(*pos), l), \
60 	     next = list_next_entry(pos, l),			\
61 	     next2 = list_next_entry(next, l);			\
62 	     &(nfp_prog)->insns != &pos->l &&			\
63 	     &(nfp_prog)->insns != &next->l &&			\
64 	     &(nfp_prog)->insns != &next2->l;			\
65 	     pos = nfp_meta_next(pos),				\
66 	     next = nfp_meta_next(pos),				\
67 	     next2 = nfp_meta_next(next))
68 
69 static bool
70 nfp_meta_has_prev(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
71 {
72 	return meta->l.prev != &nfp_prog->insns;
73 }
74 
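/* Append a single 64-bit instruction word to the program buffer.  If the
 * preallocated buffer is already full, record -ENOSPC and drop the insn.
 */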
75 static void nfp_prog_push(struct nfp_prog *nfp_prog, u64 insn)
76 {
77 	if (nfp_prog->__prog_alloc_len / sizeof(u64) == nfp_prog->prog_len) {
78 		pr_warn("instruction limit reached (%u NFP instructions)\n",
79 			nfp_prog->prog_len);
80 		nfp_prog->error = -ENOSPC;
81 		return;
82 	}
83 
84 	nfp_prog->prog[nfp_prog->prog_len] = insn;
85 	nfp_prog->prog_len++;
86 }
87 
88 static unsigned int nfp_prog_current_offset(struct nfp_prog *nfp_prog)
89 {
90 	return nfp_prog->prog_len;
91 }
92 
93 static bool
94 nfp_prog_confirm_current_offset(struct nfp_prog *nfp_prog, unsigned int off)
95 {
	/* If there is a recorded error we may have dropped instructions;
	 * that doesn't have to be due to a translator bug, and the translation
	 * will fail anyway, so just return OK.
	 */
100 	if (nfp_prog->error)
101 		return true;
102 	return !WARN_ON_ONCE(nfp_prog_current_offset(nfp_prog) != off);
103 }
104 
105 /* --- Emitters --- */
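/* The __emit_*() helpers below pack raw operand encodings into a 64-bit
 * instruction word with FIELD_PREP() and push it onto the program.  The
 * emit_*() wrappers translate swreg software registers into restricted or
 * unrestricted operand encodings first and report failures via
 * nfp_prog->error.
 */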
106 static void
107 __emit_cmd(struct nfp_prog *nfp_prog, enum cmd_tgt_map op,
108 	   u8 mode, u8 xfer, u8 areg, u8 breg, u8 size, enum cmd_ctx_swap ctx,
109 	   bool indir)
110 {
111 	u64 insn;
112 
113 	insn =	FIELD_PREP(OP_CMD_A_SRC, areg) |
114 		FIELD_PREP(OP_CMD_CTX, ctx) |
115 		FIELD_PREP(OP_CMD_B_SRC, breg) |
116 		FIELD_PREP(OP_CMD_TOKEN, cmd_tgt_act[op].token) |
117 		FIELD_PREP(OP_CMD_XFER, xfer) |
118 		FIELD_PREP(OP_CMD_CNT, size) |
119 		FIELD_PREP(OP_CMD_SIG, ctx != CMD_CTX_NO_SWAP) |
120 		FIELD_PREP(OP_CMD_TGT_CMD, cmd_tgt_act[op].tgt_cmd) |
121 		FIELD_PREP(OP_CMD_INDIR, indir) |
122 		FIELD_PREP(OP_CMD_MODE, mode);
123 
124 	nfp_prog_push(nfp_prog, insn);
125 }
126 
127 static void
128 emit_cmd_any(struct nfp_prog *nfp_prog, enum cmd_tgt_map op, u8 mode, u8 xfer,
129 	     swreg lreg, swreg rreg, u8 size, enum cmd_ctx_swap ctx, bool indir)
130 {
131 	struct nfp_insn_re_regs reg;
132 	int err;
133 
134 	err = swreg_to_restricted(reg_none(), lreg, rreg, &reg, false);
135 	if (err) {
136 		nfp_prog->error = err;
137 		return;
138 	}
139 	if (reg.swap) {
140 		pr_err("cmd can't swap arguments\n");
141 		nfp_prog->error = -EFAULT;
142 		return;
143 	}
144 	if (reg.dst_lmextn || reg.src_lmextn) {
145 		pr_err("cmd can't use LMextn\n");
146 		nfp_prog->error = -EFAULT;
147 		return;
148 	}
149 
150 	__emit_cmd(nfp_prog, op, mode, xfer, reg.areg, reg.breg, size, ctx,
151 		   indir);
152 }
153 
154 static void
155 emit_cmd(struct nfp_prog *nfp_prog, enum cmd_tgt_map op, u8 mode, u8 xfer,
156 	 swreg lreg, swreg rreg, u8 size, enum cmd_ctx_swap ctx)
157 {
158 	emit_cmd_any(nfp_prog, op, mode, xfer, lreg, rreg, size, ctx, false);
159 }
160 
161 static void
162 emit_cmd_indir(struct nfp_prog *nfp_prog, enum cmd_tgt_map op, u8 mode, u8 xfer,
163 	       swreg lreg, swreg rreg, u8 size, enum cmd_ctx_swap ctx)
164 {
165 	emit_cmd_any(nfp_prog, op, mode, xfer, lreg, rreg, size, ctx, true);
166 }
167 
168 static void
169 __emit_br(struct nfp_prog *nfp_prog, enum br_mask mask, enum br_ev_pip ev_pip,
170 	  enum br_ctx_signal_state css, u16 addr, u8 defer)
171 {
172 	u16 addr_lo, addr_hi;
173 	u64 insn;
174 
175 	addr_lo = addr & (OP_BR_ADDR_LO >> __bf_shf(OP_BR_ADDR_LO));
176 	addr_hi = addr != addr_lo;
177 
178 	insn = OP_BR_BASE |
179 		FIELD_PREP(OP_BR_MASK, mask) |
180 		FIELD_PREP(OP_BR_EV_PIP, ev_pip) |
181 		FIELD_PREP(OP_BR_CSS, css) |
182 		FIELD_PREP(OP_BR_DEFBR, defer) |
183 		FIELD_PREP(OP_BR_ADDR_LO, addr_lo) |
184 		FIELD_PREP(OP_BR_ADDR_HI, addr_hi);
185 
186 	nfp_prog_push(nfp_prog, insn);
187 }
188 
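/* Emit a branch whose final target is not known yet.  The relocation type
 * is stashed in otherwise unused high bits of the instruction word and is
 * resolved when the program is loaded onto the device.
 */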
189 static void
190 emit_br_relo(struct nfp_prog *nfp_prog, enum br_mask mask, u16 addr, u8 defer,
191 	     enum nfp_relo_type relo)
192 {
193 	if (mask == BR_UNC && defer > 2) {
194 		pr_err("BUG: branch defer out of bounds %d\n", defer);
195 		nfp_prog->error = -EFAULT;
196 		return;
197 	}
198 
199 	__emit_br(nfp_prog, mask,
200 		  mask != BR_UNC ? BR_EV_PIP_COND : BR_EV_PIP_UNCOND,
201 		  BR_CSS_NONE, addr, defer);
202 
203 	nfp_prog->prog[nfp_prog->prog_len - 1] |=
204 		FIELD_PREP(OP_RELO_TYPE, relo);
205 }
206 
207 static void
208 emit_br(struct nfp_prog *nfp_prog, enum br_mask mask, u16 addr, u8 defer)
209 {
210 	emit_br_relo(nfp_prog, mask, addr, defer, RELO_BR_REL);
211 }
212 
213 static void
214 __emit_immed(struct nfp_prog *nfp_prog, u16 areg, u16 breg, u16 imm_hi,
215 	     enum immed_width width, bool invert,
216 	     enum immed_shift shift, bool wr_both,
217 	     bool dst_lmextn, bool src_lmextn)
218 {
219 	u64 insn;
220 
221 	insn = OP_IMMED_BASE |
222 		FIELD_PREP(OP_IMMED_A_SRC, areg) |
223 		FIELD_PREP(OP_IMMED_B_SRC, breg) |
224 		FIELD_PREP(OP_IMMED_IMM, imm_hi) |
225 		FIELD_PREP(OP_IMMED_WIDTH, width) |
226 		FIELD_PREP(OP_IMMED_INV, invert) |
227 		FIELD_PREP(OP_IMMED_SHIFT, shift) |
228 		FIELD_PREP(OP_IMMED_WR_AB, wr_both) |
229 		FIELD_PREP(OP_IMMED_SRC_LMEXTN, src_lmextn) |
230 		FIELD_PREP(OP_IMMED_DST_LMEXTN, dst_lmextn);
231 
232 	nfp_prog_push(nfp_prog, insn);
233 }
234 
235 static void
236 emit_immed(struct nfp_prog *nfp_prog, swreg dst, u16 imm,
237 	   enum immed_width width, bool invert, enum immed_shift shift)
238 {
239 	struct nfp_insn_ur_regs reg;
240 	int err;
241 
242 	if (swreg_type(dst) == NN_REG_IMM) {
243 		nfp_prog->error = -EFAULT;
244 		return;
245 	}
246 
247 	err = swreg_to_unrestricted(dst, dst, reg_imm(imm & 0xff), &reg);
248 	if (err) {
249 		nfp_prog->error = err;
250 		return;
251 	}
252 
253 	/* Use reg.dst when destination is No-Dest. */
254 	__emit_immed(nfp_prog,
255 		     swreg_type(dst) == NN_REG_NONE ? reg.dst : reg.areg,
256 		     reg.breg, imm >> 8, width, invert, shift,
257 		     reg.wr_both, reg.dst_lmextn, reg.src_lmextn);
258 }
259 
260 static void
261 __emit_shf(struct nfp_prog *nfp_prog, u16 dst, enum alu_dst_ab dst_ab,
262 	   enum shf_sc sc, u8 shift,
263 	   u16 areg, enum shf_op op, u16 breg, bool i8, bool sw, bool wr_both,
264 	   bool dst_lmextn, bool src_lmextn)
265 {
266 	u64 insn;
267 
268 	if (!FIELD_FIT(OP_SHF_SHIFT, shift)) {
269 		nfp_prog->error = -EFAULT;
270 		return;
271 	}
272 
273 	if (sc == SHF_SC_L_SHF)
274 		shift = 32 - shift;
275 
276 	insn = OP_SHF_BASE |
277 		FIELD_PREP(OP_SHF_A_SRC, areg) |
278 		FIELD_PREP(OP_SHF_SC, sc) |
279 		FIELD_PREP(OP_SHF_B_SRC, breg) |
280 		FIELD_PREP(OP_SHF_I8, i8) |
281 		FIELD_PREP(OP_SHF_SW, sw) |
282 		FIELD_PREP(OP_SHF_DST, dst) |
283 		FIELD_PREP(OP_SHF_SHIFT, shift) |
284 		FIELD_PREP(OP_SHF_OP, op) |
285 		FIELD_PREP(OP_SHF_DST_AB, dst_ab) |
286 		FIELD_PREP(OP_SHF_WR_AB, wr_both) |
287 		FIELD_PREP(OP_SHF_SRC_LMEXTN, src_lmextn) |
288 		FIELD_PREP(OP_SHF_DST_LMEXTN, dst_lmextn);
289 
290 	nfp_prog_push(nfp_prog, insn);
291 }
292 
293 static void
294 emit_shf(struct nfp_prog *nfp_prog, swreg dst,
295 	 swreg lreg, enum shf_op op, swreg rreg, enum shf_sc sc, u8 shift)
296 {
297 	struct nfp_insn_re_regs reg;
298 	int err;
299 
300 	err = swreg_to_restricted(dst, lreg, rreg, &reg, true);
301 	if (err) {
302 		nfp_prog->error = err;
303 		return;
304 	}
305 
306 	__emit_shf(nfp_prog, reg.dst, reg.dst_ab, sc, shift,
307 		   reg.areg, op, reg.breg, reg.i8, reg.swap, reg.wr_both,
308 		   reg.dst_lmextn, reg.src_lmextn);
309 }
310 
311 static void
312 __emit_alu(struct nfp_prog *nfp_prog, u16 dst, enum alu_dst_ab dst_ab,
313 	   u16 areg, enum alu_op op, u16 breg, bool swap, bool wr_both,
314 	   bool dst_lmextn, bool src_lmextn)
315 {
316 	u64 insn;
317 
318 	insn = OP_ALU_BASE |
319 		FIELD_PREP(OP_ALU_A_SRC, areg) |
320 		FIELD_PREP(OP_ALU_B_SRC, breg) |
321 		FIELD_PREP(OP_ALU_DST, dst) |
322 		FIELD_PREP(OP_ALU_SW, swap) |
323 		FIELD_PREP(OP_ALU_OP, op) |
324 		FIELD_PREP(OP_ALU_DST_AB, dst_ab) |
325 		FIELD_PREP(OP_ALU_WR_AB, wr_both) |
326 		FIELD_PREP(OP_ALU_SRC_LMEXTN, src_lmextn) |
327 		FIELD_PREP(OP_ALU_DST_LMEXTN, dst_lmextn);
328 
329 	nfp_prog_push(nfp_prog, insn);
330 }
331 
332 static void
333 emit_alu(struct nfp_prog *nfp_prog, swreg dst,
334 	 swreg lreg, enum alu_op op, swreg rreg)
335 {
336 	struct nfp_insn_ur_regs reg;
337 	int err;
338 
339 	err = swreg_to_unrestricted(dst, lreg, rreg, &reg);
340 	if (err) {
341 		nfp_prog->error = err;
342 		return;
343 	}
344 
345 	__emit_alu(nfp_prog, reg.dst, reg.dst_ab,
346 		   reg.areg, op, reg.breg, reg.swap, reg.wr_both,
347 		   reg.dst_lmextn, reg.src_lmextn);
348 }
349 
350 static void
351 __emit_ld_field(struct nfp_prog *nfp_prog, enum shf_sc sc,
352 		u8 areg, u8 bmask, u8 breg, u8 shift, bool imm8,
353 		bool zero, bool swap, bool wr_both,
354 		bool dst_lmextn, bool src_lmextn)
355 {
356 	u64 insn;
357 
358 	insn = OP_LDF_BASE |
359 		FIELD_PREP(OP_LDF_A_SRC, areg) |
360 		FIELD_PREP(OP_LDF_SC, sc) |
361 		FIELD_PREP(OP_LDF_B_SRC, breg) |
362 		FIELD_PREP(OP_LDF_I8, imm8) |
363 		FIELD_PREP(OP_LDF_SW, swap) |
364 		FIELD_PREP(OP_LDF_ZF, zero) |
365 		FIELD_PREP(OP_LDF_BMASK, bmask) |
366 		FIELD_PREP(OP_LDF_SHF, shift) |
367 		FIELD_PREP(OP_LDF_WR_AB, wr_both) |
368 		FIELD_PREP(OP_LDF_SRC_LMEXTN, src_lmextn) |
369 		FIELD_PREP(OP_LDF_DST_LMEXTN, dst_lmextn);
370 
371 	nfp_prog_push(nfp_prog, insn);
372 }
373 
374 static void
375 emit_ld_field_any(struct nfp_prog *nfp_prog, swreg dst, u8 bmask, swreg src,
376 		  enum shf_sc sc, u8 shift, bool zero)
377 {
378 	struct nfp_insn_re_regs reg;
379 	int err;
380 
381 	/* Note: ld_field is special as it uses one of the src regs as dst */
382 	err = swreg_to_restricted(dst, dst, src, &reg, true);
383 	if (err) {
384 		nfp_prog->error = err;
385 		return;
386 	}
387 
388 	__emit_ld_field(nfp_prog, sc, reg.areg, bmask, reg.breg, shift,
389 			reg.i8, zero, reg.swap, reg.wr_both,
390 			reg.dst_lmextn, reg.src_lmextn);
391 }
392 
393 static void
394 emit_ld_field(struct nfp_prog *nfp_prog, swreg dst, u8 bmask, swreg src,
395 	      enum shf_sc sc, u8 shift)
396 {
397 	emit_ld_field_any(nfp_prog, dst, bmask, src, sc, shift, false);
398 }
399 
400 static void
401 __emit_lcsr(struct nfp_prog *nfp_prog, u16 areg, u16 breg, bool wr, u16 addr,
402 	    bool dst_lmextn, bool src_lmextn)
403 {
404 	u64 insn;
405 
406 	insn = OP_LCSR_BASE |
407 		FIELD_PREP(OP_LCSR_A_SRC, areg) |
408 		FIELD_PREP(OP_LCSR_B_SRC, breg) |
409 		FIELD_PREP(OP_LCSR_WRITE, wr) |
410 		FIELD_PREP(OP_LCSR_ADDR, addr / 4) |
411 		FIELD_PREP(OP_LCSR_SRC_LMEXTN, src_lmextn) |
412 		FIELD_PREP(OP_LCSR_DST_LMEXTN, dst_lmextn);
413 
414 	nfp_prog_push(nfp_prog, insn);
415 }
416 
417 static void emit_csr_wr(struct nfp_prog *nfp_prog, swreg src, u16 addr)
418 {
419 	struct nfp_insn_ur_regs reg;
420 	int err;
421 
	/* This instruction takes immediates instead of reg_none() for the
	 * ignored operand, but we can't encode two immediates in one
	 * instruction with our normal swreg infra, so if the parameter is an
	 * immediate we encode it as reg_none() and copy the immediate to both
	 * operands.
	 */
427 	if (swreg_type(src) == NN_REG_IMM) {
428 		err = swreg_to_unrestricted(reg_none(), src, reg_none(), &reg);
429 		reg.breg = reg.areg;
430 	} else {
431 		err = swreg_to_unrestricted(reg_none(), src, reg_imm(0), &reg);
432 	}
433 	if (err) {
434 		nfp_prog->error = err;
435 		return;
436 	}
437 
438 	__emit_lcsr(nfp_prog, reg.areg, reg.breg, true, addr,
439 		    false, reg.src_lmextn);
440 }
441 
442 /* CSR value is read in following immed[gpr, 0] */
443 static void __emit_csr_rd(struct nfp_prog *nfp_prog, u16 addr)
444 {
445 	__emit_lcsr(nfp_prog, 0, 0, false, addr, false, false);
446 }
447 
448 static void emit_nop(struct nfp_prog *nfp_prog)
449 {
450 	__emit_immed(nfp_prog, UR_REG_IMM, UR_REG_IMM, 0, 0, 0, 0, 0, 0, 0);
451 }
452 
453 /* --- Wrappers --- */
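/* Try to express a 32-bit immediate as a 16-bit value shifted left by 0, 1
 * or 2 bytes, as accepted by the immed instruction.  Returns false if @imm
 * cannot be represented this way.
 */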
454 static bool pack_immed(u32 imm, u16 *val, enum immed_shift *shift)
455 {
456 	if (!(imm & 0xffff0000)) {
457 		*val = imm;
458 		*shift = IMMED_SHIFT_0B;
459 	} else if (!(imm & 0xff0000ff)) {
460 		*val = imm >> 8;
461 		*shift = IMMED_SHIFT_1B;
462 	} else if (!(imm & 0x0000ffff)) {
463 		*val = imm >> 16;
464 		*shift = IMMED_SHIFT_2B;
465 	} else {
466 		return false;
467 	}
468 
469 	return true;
470 }
471 
472 static void wrp_immed(struct nfp_prog *nfp_prog, swreg dst, u32 imm)
473 {
474 	enum immed_shift shift;
475 	u16 val;
476 
477 	if (pack_immed(imm, &val, &shift)) {
478 		emit_immed(nfp_prog, dst, val, IMMED_WIDTH_ALL, false, shift);
479 	} else if (pack_immed(~imm, &val, &shift)) {
480 		emit_immed(nfp_prog, dst, val, IMMED_WIDTH_ALL, true, shift);
481 	} else {
482 		emit_immed(nfp_prog, dst, imm & 0xffff, IMMED_WIDTH_ALL,
483 			   false, IMMED_SHIFT_0B);
484 		emit_immed(nfp_prog, dst, imm >> 16, IMMED_WIDTH_WORD,
485 			   false, IMMED_SHIFT_2B);
486 	}
487 }
488 
489 static void
490 wrp_immed_relo(struct nfp_prog *nfp_prog, swreg dst, u32 imm,
491 	       enum nfp_relo_type relo)
492 {
493 	if (imm > 0xffff) {
494 		pr_err("relocation of a large immediate!\n");
495 		nfp_prog->error = -EFAULT;
496 		return;
497 	}
498 	emit_immed(nfp_prog, dst, imm, IMMED_WIDTH_ALL, false, IMMED_SHIFT_0B);
499 
500 	nfp_prog->prog[nfp_prog->prog_len - 1] |=
501 		FIELD_PREP(OP_RELO_TYPE, relo);
502 }
503 
/* ur_load_imm_any() - encode immediate or use tmp register (unrestricted)
 * If @imm is small enough, encode it directly in the operand and return it,
 * otherwise load @imm into a spare register and return its encoding.
 */
508 static swreg ur_load_imm_any(struct nfp_prog *nfp_prog, u32 imm, swreg tmp_reg)
509 {
510 	if (FIELD_FIT(UR_REG_IMM_MAX, imm))
511 		return reg_imm(imm);
512 
513 	wrp_immed(nfp_prog, tmp_reg, imm);
514 	return tmp_reg;
515 }
516 
/* re_load_imm_any() - encode immediate or use tmp register (restricted)
 * If @imm is small enough, encode it directly in the operand and return it,
 * otherwise load @imm into a spare register and return its encoding.
 */
521 static swreg re_load_imm_any(struct nfp_prog *nfp_prog, u32 imm, swreg tmp_reg)
522 {
523 	if (FIELD_FIT(RE_REG_IMM_MAX, imm))
524 		return reg_imm(imm);
525 
526 	wrp_immed(nfp_prog, tmp_reg, imm);
527 	return tmp_reg;
528 }
529 
530 static void wrp_nops(struct nfp_prog *nfp_prog, unsigned int count)
531 {
532 	while (count--)
533 		emit_nop(nfp_prog);
534 }
535 
536 static void wrp_mov(struct nfp_prog *nfp_prog, swreg dst, swreg src)
537 {
538 	emit_alu(nfp_prog, dst, reg_none(), ALU_OP_NONE, src);
539 }
540 
541 static void wrp_reg_mov(struct nfp_prog *nfp_prog, u16 dst, u16 src)
542 {
543 	wrp_mov(nfp_prog, reg_both(dst), reg_b(src));
544 }
545 
546 /* wrp_reg_subpart() - load @field_len bytes from @offset of @src, write the
547  * result to @dst from low end.
548  */
549 static void
550 wrp_reg_subpart(struct nfp_prog *nfp_prog, swreg dst, swreg src, u8 field_len,
551 		u8 offset)
552 {
553 	enum shf_sc sc = offset ? SHF_SC_R_SHF : SHF_SC_NONE;
554 	u8 mask = (1 << field_len) - 1;
555 
556 	emit_ld_field_any(nfp_prog, dst, mask, src, sc, offset * 8, true);
557 }
558 
/* wrp_reg_or_subpart() - load @field_len bytes from the low end of @src, OR
 * the result into @dst at @offset; the other bits of @dst are unchanged.
 */
562 static void
563 wrp_reg_or_subpart(struct nfp_prog *nfp_prog, swreg dst, swreg src,
564 		   u8 field_len, u8 offset)
565 {
566 	enum shf_sc sc = offset ? SHF_SC_L_SHF : SHF_SC_NONE;
567 	u8 mask = ((1 << field_len) - 1) << offset;
568 
569 	emit_ld_field(nfp_prog, dst, mask, src, sc, 32 - offset * 8);
570 }
571 
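/* Prepare the A/B operands for a command using 40-bit addressing.  The base
 * address lives in the GPR pair starting at @src_gpr; a non-zero @offset is
 * added to it with carry via the immediate scratch registers.
 */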
572 static void
573 addr40_offset(struct nfp_prog *nfp_prog, u8 src_gpr, swreg offset,
574 	      swreg *rega, swreg *regb)
575 {
576 	if (offset == reg_imm(0)) {
577 		*rega = reg_a(src_gpr);
578 		*regb = reg_b(src_gpr + 1);
579 		return;
580 	}
581 
582 	emit_alu(nfp_prog, imm_a(nfp_prog), reg_a(src_gpr), ALU_OP_ADD, offset);
583 	emit_alu(nfp_prog, imm_b(nfp_prog), reg_b(src_gpr + 1), ALU_OP_ADD_C,
584 		 reg_imm(0));
585 	*rega = imm_a(nfp_prog);
586 	*regb = imm_b(nfp_prog);
587 }
588 
/* NFP has a Command Push Pull bus which supports bulk memory operations. */
590 static int nfp_cpp_memcpy(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
591 {
592 	bool descending_seq = meta->ldst_gather_len < 0;
593 	s16 len = abs(meta->ldst_gather_len);
594 	swreg src_base, off;
595 	bool src_40bit_addr;
596 	unsigned int i;
597 	u8 xfer_num;
598 
599 	off = re_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog));
600 	src_40bit_addr = meta->ptr.type == PTR_TO_MAP_VALUE;
601 	src_base = reg_a(meta->insn.src_reg * 2);
602 	xfer_num = round_up(len, 4) / 4;
603 
604 	if (src_40bit_addr)
605 		addr40_offset(nfp_prog, meta->insn.src_reg, off, &src_base,
606 			      &off);
607 
	/* Set up the PREV_ALU fields to override the memory read length. */
609 	if (len > 32)
610 		wrp_immed(nfp_prog, reg_none(),
611 			  CMD_OVE_LEN | FIELD_PREP(CMD_OV_LEN, xfer_num - 1));
612 
613 	/* Memory read from source addr into transfer-in registers. */
614 	emit_cmd_any(nfp_prog, CMD_TGT_READ32_SWAP,
615 		     src_40bit_addr ? CMD_MODE_40b_BA : CMD_MODE_32b, 0,
616 		     src_base, off, xfer_num - 1, CMD_CTX_SWAP, len > 32);
617 
618 	/* Move from transfer-in to transfer-out. */
619 	for (i = 0; i < xfer_num; i++)
620 		wrp_mov(nfp_prog, reg_xfer(i), reg_xfer(i));
621 
622 	off = re_load_imm_any(nfp_prog, meta->paired_st->off, imm_b(nfp_prog));
623 
624 	if (len <= 8) {
625 		/* Use single direct_ref write8. */
626 		emit_cmd(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 0,
627 			 reg_a(meta->paired_st->dst_reg * 2), off, len - 1,
628 			 CMD_CTX_SWAP);
629 	} else if (len <= 32 && IS_ALIGNED(len, 4)) {
630 		/* Use single direct_ref write32. */
631 		emit_cmd(nfp_prog, CMD_TGT_WRITE32_SWAP, CMD_MODE_32b, 0,
632 			 reg_a(meta->paired_st->dst_reg * 2), off, xfer_num - 1,
633 			 CMD_CTX_SWAP);
634 	} else if (len <= 32) {
635 		/* Use single indirect_ref write8. */
636 		wrp_immed(nfp_prog, reg_none(),
637 			  CMD_OVE_LEN | FIELD_PREP(CMD_OV_LEN, len - 1));
638 		emit_cmd_indir(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 0,
639 			       reg_a(meta->paired_st->dst_reg * 2), off,
640 			       len - 1, CMD_CTX_SWAP);
641 	} else if (IS_ALIGNED(len, 4)) {
642 		/* Use single indirect_ref write32. */
643 		wrp_immed(nfp_prog, reg_none(),
644 			  CMD_OVE_LEN | FIELD_PREP(CMD_OV_LEN, xfer_num - 1));
645 		emit_cmd_indir(nfp_prog, CMD_TGT_WRITE32_SWAP, CMD_MODE_32b, 0,
646 			       reg_a(meta->paired_st->dst_reg * 2), off,
647 			       xfer_num - 1, CMD_CTX_SWAP);
648 	} else if (len <= 40) {
		/* Use one direct_ref write32 to write the first 32 bytes, then
		 * another direct_ref write8 to write the remaining bytes.
		 */
652 		emit_cmd(nfp_prog, CMD_TGT_WRITE32_SWAP, CMD_MODE_32b, 0,
653 			 reg_a(meta->paired_st->dst_reg * 2), off, 7,
654 			 CMD_CTX_SWAP);
655 
656 		off = re_load_imm_any(nfp_prog, meta->paired_st->off + 32,
657 				      imm_b(nfp_prog));
658 		emit_cmd(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 8,
659 			 reg_a(meta->paired_st->dst_reg * 2), off, len - 33,
660 			 CMD_CTX_SWAP);
661 	} else {
		/* Use one indirect_ref write32 to write the 4-byte aligned part
		 * of the length, then another direct_ref write8 to write the
		 * remaining bytes.
		 */
665 		u8 new_off;
666 
667 		wrp_immed(nfp_prog, reg_none(),
668 			  CMD_OVE_LEN | FIELD_PREP(CMD_OV_LEN, xfer_num - 2));
669 		emit_cmd_indir(nfp_prog, CMD_TGT_WRITE32_SWAP, CMD_MODE_32b, 0,
670 			       reg_a(meta->paired_st->dst_reg * 2), off,
671 			       xfer_num - 2, CMD_CTX_SWAP);
672 		new_off = meta->paired_st->off + (xfer_num - 1) * 4;
673 		off = re_load_imm_any(nfp_prog, new_off, imm_b(nfp_prog));
674 		emit_cmd(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b,
675 			 xfer_num - 1, reg_a(meta->paired_st->dst_reg * 2), off,
676 			 (len & 0x3) - 1, CMD_CTX_SWAP);
677 	}
678 
	/* TODO: The following extra load is to make sure the data flow is
	 *  identical before and after the memory copy optimization.
	 *
	 *  The load destination register is not guaranteed to be dead, so we
	 *  need to make sure it is loaded with the same value as before this
	 *  transformation.
	 *
	 *  These extra loads could be removed once we have accurate register
	 *  usage information.
	 */
689 	if (descending_seq)
690 		xfer_num = 0;
691 	else if (BPF_SIZE(meta->insn.code) != BPF_DW)
692 		xfer_num = xfer_num - 1;
693 	else
694 		xfer_num = xfer_num - 2;
695 
696 	switch (BPF_SIZE(meta->insn.code)) {
697 	case BPF_B:
698 		wrp_reg_subpart(nfp_prog, reg_both(meta->insn.dst_reg * 2),
699 				reg_xfer(xfer_num), 1,
700 				IS_ALIGNED(len, 4) ? 3 : (len & 3) - 1);
701 		break;
702 	case BPF_H:
703 		wrp_reg_subpart(nfp_prog, reg_both(meta->insn.dst_reg * 2),
704 				reg_xfer(xfer_num), 2, (len & 3) ^ 2);
705 		break;
706 	case BPF_W:
707 		wrp_mov(nfp_prog, reg_both(meta->insn.dst_reg * 2),
708 			reg_xfer(0));
709 		break;
710 	case BPF_DW:
711 		wrp_mov(nfp_prog, reg_both(meta->insn.dst_reg * 2),
712 			reg_xfer(xfer_num));
713 		wrp_mov(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1),
714 			reg_xfer(xfer_num + 1));
715 		break;
716 	}
717 
718 	if (BPF_SIZE(meta->insn.code) != BPF_DW)
719 		wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1), 0);
720 
721 	return 0;
722 }
723 
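/* Read @size bytes of packet data at @offset into the GPR pair starting at
 * @dst_gpr, zero-extending the result when fewer than 8 bytes are loaded.
 */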
724 static int
725 data_ld(struct nfp_prog *nfp_prog, swreg offset, u8 dst_gpr, int size)
726 {
727 	unsigned int i;
728 	u16 shift, sz;
729 
730 	/* We load the value from the address indicated in @offset and then
731 	 * shift out the data we don't need.  Note: this is big endian!
732 	 */
733 	sz = max(size, 4);
734 	shift = size < 4 ? 4 - size : 0;
735 
736 	emit_cmd(nfp_prog, CMD_TGT_READ8, CMD_MODE_32b, 0,
737 		 pptr_reg(nfp_prog), offset, sz - 1, CMD_CTX_SWAP);
738 
739 	i = 0;
740 	if (shift)
741 		emit_shf(nfp_prog, reg_both(dst_gpr), reg_none(), SHF_OP_NONE,
742 			 reg_xfer(0), SHF_SC_R_SHF, shift * 8);
743 	else
744 		for (; i * 4 < size; i++)
745 			wrp_mov(nfp_prog, reg_both(dst_gpr + i), reg_xfer(i));
746 
747 	if (i < 2)
748 		wrp_immed(nfp_prog, reg_both(dst_gpr + 1), 0);
749 
750 	return 0;
751 }
752 
753 static int
754 data_ld_host_order(struct nfp_prog *nfp_prog, u8 dst_gpr,
755 		   swreg lreg, swreg rreg, int size, enum cmd_mode mode)
756 {
757 	unsigned int i;
758 	u8 mask, sz;
759 
760 	/* We load the value from the address indicated in rreg + lreg and then
761 	 * mask out the data we don't need.  Note: this is little endian!
762 	 */
763 	sz = max(size, 4);
764 	mask = size < 4 ? GENMASK(size - 1, 0) : 0;
765 
766 	emit_cmd(nfp_prog, CMD_TGT_READ32_SWAP, mode, 0,
767 		 lreg, rreg, sz / 4 - 1, CMD_CTX_SWAP);
768 
769 	i = 0;
770 	if (mask)
771 		emit_ld_field_any(nfp_prog, reg_both(dst_gpr), mask,
772 				  reg_xfer(0), SHF_SC_NONE, 0, true);
773 	else
774 		for (; i * 4 < size; i++)
775 			wrp_mov(nfp_prog, reg_both(dst_gpr + i), reg_xfer(i));
776 
777 	if (i < 2)
778 		wrp_immed(nfp_prog, reg_both(dst_gpr + 1), 0);
779 
780 	return 0;
781 }
782 
783 static int
784 data_ld_host_order_addr32(struct nfp_prog *nfp_prog, u8 src_gpr, swreg offset,
785 			  u8 dst_gpr, u8 size)
786 {
787 	return data_ld_host_order(nfp_prog, dst_gpr, reg_a(src_gpr), offset,
788 				  size, CMD_MODE_32b);
789 }
790 
791 static int
792 data_ld_host_order_addr40(struct nfp_prog *nfp_prog, u8 src_gpr, swreg offset,
793 			  u8 dst_gpr, u8 size)
794 {
795 	swreg rega, regb;
796 
797 	addr40_offset(nfp_prog, src_gpr, offset, &rega, &regb);
798 
799 	return data_ld_host_order(nfp_prog, dst_gpr, rega, regb,
800 				  size, CMD_MODE_40b_BA);
801 }
802 
803 static int
804 construct_data_ind_ld(struct nfp_prog *nfp_prog, u16 offset, u16 src, u8 size)
805 {
806 	swreg tmp_reg;
807 
808 	/* Calculate the true offset (src_reg + imm) */
809 	tmp_reg = ur_load_imm_any(nfp_prog, offset, imm_b(nfp_prog));
810 	emit_alu(nfp_prog, imm_both(nfp_prog), reg_a(src), ALU_OP_ADD, tmp_reg);
811 
812 	/* Check packet length (size guaranteed to fit b/c it's u8) */
813 	emit_alu(nfp_prog, imm_a(nfp_prog),
814 		 imm_a(nfp_prog), ALU_OP_ADD, reg_imm(size));
815 	emit_alu(nfp_prog, reg_none(),
816 		 plen_reg(nfp_prog), ALU_OP_SUB, imm_a(nfp_prog));
817 	emit_br_relo(nfp_prog, BR_BLO, BR_OFF_RELO, 0, RELO_BR_GO_ABORT);
818 
819 	/* Load data */
820 	return data_ld(nfp_prog, imm_b(nfp_prog), 0, size);
821 }
822 
823 static int construct_data_ld(struct nfp_prog *nfp_prog, u16 offset, u8 size)
824 {
825 	swreg tmp_reg;
826 
827 	/* Check packet length */
828 	tmp_reg = ur_load_imm_any(nfp_prog, offset + size, imm_a(nfp_prog));
829 	emit_alu(nfp_prog, reg_none(), plen_reg(nfp_prog), ALU_OP_SUB, tmp_reg);
830 	emit_br_relo(nfp_prog, BR_BLO, BR_OFF_RELO, 0, RELO_BR_GO_ABORT);
831 
832 	/* Load data */
833 	tmp_reg = re_load_imm_any(nfp_prog, offset, imm_b(nfp_prog));
834 	return data_ld(nfp_prog, tmp_reg, 0, size);
835 }
836 
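/* Store @size bytes from the GPRs starting at @src_gpr to @dst_gpr + @offset,
 * staging the data through the transfer registers first.
 */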
837 static int
838 data_stx_host_order(struct nfp_prog *nfp_prog, u8 dst_gpr, swreg offset,
839 		    u8 src_gpr, u8 size)
840 {
841 	unsigned int i;
842 
843 	for (i = 0; i * 4 < size; i++)
844 		wrp_mov(nfp_prog, reg_xfer(i), reg_a(src_gpr + i));
845 
846 	emit_cmd(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 0,
847 		 reg_a(dst_gpr), offset, size - 1, CMD_CTX_SWAP);
848 
849 	return 0;
850 }
851 
852 static int
853 data_st_host_order(struct nfp_prog *nfp_prog, u8 dst_gpr, swreg offset,
854 		   u64 imm, u8 size)
855 {
856 	wrp_immed(nfp_prog, reg_xfer(0), imm);
857 	if (size == 8)
858 		wrp_immed(nfp_prog, reg_xfer(1), imm >> 32);
859 
860 	emit_cmd(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 0,
861 		 reg_a(dst_gpr), offset, size - 1, CMD_CTX_SWAP);
862 
863 	return 0;
864 }
865 
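/* Callback used by mem_op_stack() for each slice of a stack access.  A slice
 * never crosses a 4-byte boundary in the GPRs (@gpr, @gpr_byte) or in LMEM
 * (@off), and the flags describe where the slice falls within the access.
 */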
866 typedef int
867 (*lmem_step)(struct nfp_prog *nfp_prog, u8 gpr, u8 gpr_byte, s32 off,
868 	     unsigned int size, bool first, bool new_gpr, bool last, bool lm3,
869 	     bool needs_inc);
870 
871 static int
872 wrp_lmem_load(struct nfp_prog *nfp_prog, u8 dst, u8 dst_byte, s32 off,
873 	      unsigned int size, bool first, bool new_gpr, bool last, bool lm3,
874 	      bool needs_inc)
875 {
876 	bool should_inc = needs_inc && new_gpr && !last;
877 	u32 idx, src_byte;
878 	enum shf_sc sc;
879 	swreg reg;
880 	int shf;
881 	u8 mask;
882 
883 	if (WARN_ON_ONCE(dst_byte + size > 4 || off % 4 + size > 4))
884 		return -EOPNOTSUPP;
885 
886 	idx = off / 4;
887 
888 	/* Move the entire word */
889 	if (size == 4) {
890 		wrp_mov(nfp_prog, reg_both(dst),
891 			should_inc ? reg_lm_inc(3) : reg_lm(lm3 ? 3 : 0, idx));
892 		return 0;
893 	}
894 
895 	if (WARN_ON_ONCE(lm3 && idx > RE_REG_LM_IDX_MAX))
896 		return -EOPNOTSUPP;
897 
898 	src_byte = off % 4;
899 
900 	mask = (1 << size) - 1;
901 	mask <<= dst_byte;
902 
903 	if (WARN_ON_ONCE(mask > 0xf))
904 		return -EOPNOTSUPP;
905 
906 	shf = abs(src_byte - dst_byte) * 8;
907 	if (src_byte == dst_byte) {
908 		sc = SHF_SC_NONE;
909 	} else if (src_byte < dst_byte) {
910 		shf = 32 - shf;
911 		sc = SHF_SC_L_SHF;
912 	} else {
913 		sc = SHF_SC_R_SHF;
914 	}
915 
	/* ld_field can address fewer indexes; if the offset is too large do RMW.
	 * Because we RMW twice we waste 2 cycles on unaligned 8 byte writes.
	 */
919 	if (idx <= RE_REG_LM_IDX_MAX) {
920 		reg = reg_lm(lm3 ? 3 : 0, idx);
921 	} else {
922 		reg = imm_a(nfp_prog);
		/* If it's not the first part of the load and we start a new GPR
		 * that means we are loading a second part of the LMEM word into
		 * a new GPR.  IOW we've already read that LMEM word and
		 * therefore it has been loaded into imm_a().
		 */
928 		if (first || !new_gpr)
929 			wrp_mov(nfp_prog, reg, reg_lm(0, idx));
930 	}
931 
932 	emit_ld_field_any(nfp_prog, reg_both(dst), mask, reg, sc, shf, new_gpr);
933 
934 	if (should_inc)
935 		wrp_mov(nfp_prog, reg_none(), reg_lm_inc(3));
936 
937 	return 0;
938 }
939 
940 static int
941 wrp_lmem_store(struct nfp_prog *nfp_prog, u8 src, u8 src_byte, s32 off,
942 	       unsigned int size, bool first, bool new_gpr, bool last, bool lm3,
943 	       bool needs_inc)
944 {
945 	bool should_inc = needs_inc && new_gpr && !last;
946 	u32 idx, dst_byte;
947 	enum shf_sc sc;
948 	swreg reg;
949 	int shf;
950 	u8 mask;
951 
952 	if (WARN_ON_ONCE(src_byte + size > 4 || off % 4 + size > 4))
953 		return -EOPNOTSUPP;
954 
955 	idx = off / 4;
956 
957 	/* Move the entire word */
958 	if (size == 4) {
959 		wrp_mov(nfp_prog,
960 			should_inc ? reg_lm_inc(3) : reg_lm(lm3 ? 3 : 0, idx),
961 			reg_b(src));
962 		return 0;
963 	}
964 
965 	if (WARN_ON_ONCE(lm3 && idx > RE_REG_LM_IDX_MAX))
966 		return -EOPNOTSUPP;
967 
968 	dst_byte = off % 4;
969 
970 	mask = (1 << size) - 1;
971 	mask <<= dst_byte;
972 
973 	if (WARN_ON_ONCE(mask > 0xf))
974 		return -EOPNOTSUPP;
975 
976 	shf = abs(src_byte - dst_byte) * 8;
977 	if (src_byte == dst_byte) {
978 		sc = SHF_SC_NONE;
979 	} else if (src_byte < dst_byte) {
980 		shf = 32 - shf;
981 		sc = SHF_SC_L_SHF;
982 	} else {
983 		sc = SHF_SC_R_SHF;
984 	}
985 
	/* ld_field can address fewer indexes; if the offset is too large do RMW.
	 * Because we RMW twice we waste 2 cycles on unaligned 8 byte writes.
	 */
989 	if (idx <= RE_REG_LM_IDX_MAX) {
990 		reg = reg_lm(lm3 ? 3 : 0, idx);
991 	} else {
992 		reg = imm_a(nfp_prog);
		/* Only the first and last LMEM locations are going to need RMW,
		 * the middle locations will be overwritten fully.
		 */
996 		if (first || last)
997 			wrp_mov(nfp_prog, reg, reg_lm(0, idx));
998 	}
999 
1000 	emit_ld_field(nfp_prog, reg, mask, reg_b(src), sc, shf);
1001 
1002 	if (new_gpr || last) {
1003 		if (idx > RE_REG_LM_IDX_MAX)
1004 			wrp_mov(nfp_prog, reg_lm(0, idx), reg);
1005 		if (should_inc)
1006 			wrp_mov(nfp_prog, reg_none(), reg_lm_inc(3));
1007 	}
1008 
1009 	return 0;
1010 }
1011 
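/* Translate a BPF stack access.  The access is split into slices which fit
 * within single 32-bit words on both sides, LMaddr0 or LMaddr3 is pointed at
 * the right stack word (incrementing it when the access spans 32-byte
 * windows), and @step is called to emit the per-slice load or store.
 */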
1012 static int
1013 mem_op_stack(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
1014 	     unsigned int size, unsigned int ptr_off, u8 gpr, u8 ptr_gpr,
1015 	     bool clr_gpr, lmem_step step)
1016 {
1017 	s32 off = nfp_prog->stack_depth + meta->insn.off + ptr_off;
1018 	bool first = true, last;
1019 	bool needs_inc = false;
1020 	swreg stack_off_reg;
1021 	u8 prev_gpr = 255;
1022 	u32 gpr_byte = 0;
1023 	bool lm3 = true;
1024 	int ret;
1025 
1026 	if (meta->ptr_not_const) {
		/* Use of the last encountered ptr_off is OK, they all have
		 * the same alignment.  We depend on the low bits of the value
		 * being discarded when it is written to the LMaddr register.
		 */
1031 		stack_off_reg = ur_load_imm_any(nfp_prog, meta->insn.off,
1032 						stack_imm(nfp_prog));
1033 
1034 		emit_alu(nfp_prog, imm_b(nfp_prog),
1035 			 reg_a(ptr_gpr), ALU_OP_ADD, stack_off_reg);
1036 
1037 		needs_inc = true;
1038 	} else if (off + size <= 64) {
1039 		/* We can reach bottom 64B with LMaddr0 */
1040 		lm3 = false;
1041 	} else if (round_down(off, 32) == round_down(off + size - 1, 32)) {
		/* We have to set up a new pointer.  If we know the offset
		 * and the entire access falls into a single 32 byte aligned
		 * window we won't have to increment the LM pointer.
		 * The 32 byte alignment is important because the offset is
		 * ORed in, not added, when doing *l$indexN[off].
		 */
1048 		stack_off_reg = ur_load_imm_any(nfp_prog, round_down(off, 32),
1049 						stack_imm(nfp_prog));
1050 		emit_alu(nfp_prog, imm_b(nfp_prog),
1051 			 stack_reg(nfp_prog), ALU_OP_ADD, stack_off_reg);
1052 
1053 		off %= 32;
1054 	} else {
1055 		stack_off_reg = ur_load_imm_any(nfp_prog, round_down(off, 4),
1056 						stack_imm(nfp_prog));
1057 
1058 		emit_alu(nfp_prog, imm_b(nfp_prog),
1059 			 stack_reg(nfp_prog), ALU_OP_ADD, stack_off_reg);
1060 
1061 		needs_inc = true;
1062 	}
1063 	if (lm3) {
1064 		emit_csr_wr(nfp_prog, imm_b(nfp_prog), NFP_CSR_ACT_LM_ADDR3);
		/* For size < 8 one slot will be filled by the zeroing of the upper word. */
1066 		wrp_nops(nfp_prog, clr_gpr && size < 8 ? 2 : 3);
1067 	}
1068 
1069 	if (clr_gpr && size < 8)
1070 		wrp_immed(nfp_prog, reg_both(gpr + 1), 0);
1071 
1072 	while (size) {
1073 		u32 slice_end;
1074 		u8 slice_size;
1075 
1076 		slice_size = min(size, 4 - gpr_byte);
1077 		slice_end = min(off + slice_size, round_up(off + 1, 4));
1078 		slice_size = slice_end - off;
1079 
1080 		last = slice_size == size;
1081 
1082 		if (needs_inc)
1083 			off %= 4;
1084 
1085 		ret = step(nfp_prog, gpr, gpr_byte, off, slice_size,
1086 			   first, gpr != prev_gpr, last, lm3, needs_inc);
1087 		if (ret)
1088 			return ret;
1089 
1090 		prev_gpr = gpr;
1091 		first = false;
1092 
1093 		gpr_byte += slice_size;
1094 		if (gpr_byte >= 4) {
1095 			gpr_byte -= 4;
1096 			gpr++;
1097 		}
1098 
1099 		size -= slice_size;
1100 		off += slice_size;
1101 	}
1102 
1103 	return 0;
1104 }
1105 
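/* Emit @alu_op between @dst and a 32-bit immediate.  Trivial AND/OR/XOR
 * cases (immediate of 0 or all-ones) are reduced to a constant load, a NOT,
 * or no code at all.
 */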
1106 static void
1107 wrp_alu_imm(struct nfp_prog *nfp_prog, u8 dst, enum alu_op alu_op, u32 imm)
1108 {
1109 	swreg tmp_reg;
1110 
1111 	if (alu_op == ALU_OP_AND) {
1112 		if (!imm)
1113 			wrp_immed(nfp_prog, reg_both(dst), 0);
1114 		if (!imm || !~imm)
1115 			return;
1116 	}
1117 	if (alu_op == ALU_OP_OR) {
1118 		if (!~imm)
1119 			wrp_immed(nfp_prog, reg_both(dst), ~0U);
1120 		if (!imm || !~imm)
1121 			return;
1122 	}
1123 	if (alu_op == ALU_OP_XOR) {
1124 		if (!~imm)
1125 			emit_alu(nfp_prog, reg_both(dst), reg_none(),
1126 				 ALU_OP_NOT, reg_b(dst));
1127 		if (!imm || !~imm)
1128 			return;
1129 	}
1130 
1131 	tmp_reg = ur_load_imm_any(nfp_prog, imm, imm_b(nfp_prog));
1132 	emit_alu(nfp_prog, reg_both(dst), reg_a(dst), alu_op, tmp_reg);
1133 }
1134 
1135 static int
1136 wrp_alu64_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
1137 	      enum alu_op alu_op, bool skip)
1138 {
1139 	const struct bpf_insn *insn = &meta->insn;
1140 	u64 imm = insn->imm; /* sign extend */
1141 
1142 	if (skip) {
1143 		meta->skip = true;
1144 		return 0;
1145 	}
1146 
1147 	wrp_alu_imm(nfp_prog, insn->dst_reg * 2, alu_op, imm & ~0U);
1148 	wrp_alu_imm(nfp_prog, insn->dst_reg * 2 + 1, alu_op, imm >> 32);
1149 
1150 	return 0;
1151 }
1152 
1153 static int
1154 wrp_alu64_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
1155 	      enum alu_op alu_op)
1156 {
1157 	u8 dst = meta->insn.dst_reg * 2, src = meta->insn.src_reg * 2;
1158 
1159 	emit_alu(nfp_prog, reg_both(dst), reg_a(dst), alu_op, reg_b(src));
1160 	emit_alu(nfp_prog, reg_both(dst + 1),
1161 		 reg_a(dst + 1), alu_op, reg_b(src + 1));
1162 
1163 	return 0;
1164 }
1165 
1166 static int
1167 wrp_alu32_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
1168 	      enum alu_op alu_op, bool skip)
1169 {
1170 	const struct bpf_insn *insn = &meta->insn;
1171 
1172 	if (skip) {
1173 		meta->skip = true;
1174 		return 0;
1175 	}
1176 
1177 	wrp_alu_imm(nfp_prog, insn->dst_reg * 2, alu_op, insn->imm);
1178 	wrp_immed(nfp_prog, reg_both(insn->dst_reg * 2 + 1), 0);
1179 
1180 	return 0;
1181 }
1182 
1183 static int
1184 wrp_alu32_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
1185 	      enum alu_op alu_op)
1186 {
1187 	u8 dst = meta->insn.dst_reg * 2, src = meta->insn.src_reg * 2;
1188 
1189 	emit_alu(nfp_prog, reg_both(dst), reg_a(dst), alu_op, reg_b(src));
1190 	wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1), 0);
1191 
1192 	return 0;
1193 }
1194 
1195 static void
1196 wrp_test_reg_one(struct nfp_prog *nfp_prog, u8 dst, enum alu_op alu_op, u8 src,
1197 		 enum br_mask br_mask, u16 off)
1198 {
1199 	emit_alu(nfp_prog, reg_none(), reg_a(dst), alu_op, reg_b(src));
1200 	emit_br(nfp_prog, br_mask, off, 0);
1201 }
1202 
1203 static int
1204 wrp_test_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
1205 	     enum alu_op alu_op, enum br_mask br_mask)
1206 {
1207 	const struct bpf_insn *insn = &meta->insn;
1208 
1209 	wrp_test_reg_one(nfp_prog, insn->dst_reg * 2, alu_op,
1210 			 insn->src_reg * 2, br_mask, insn->off);
1211 	wrp_test_reg_one(nfp_prog, insn->dst_reg * 2 + 1, alu_op,
1212 			 insn->src_reg * 2 + 1, br_mask, insn->off);
1213 
1214 	return 0;
1215 }
1216 
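/* Map BPF conditional jump codes (BPF_OP() >> 4) to NFP branch masks.  When
 * @swap is set the comparison operands have to be reversed to express the
 * condition with the available branch masks.
 */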
1217 static const struct jmp_code_map {
1218 	enum br_mask br_mask;
1219 	bool swap;
1220 } jmp_code_map[] = {
1221 	[BPF_JGT >> 4]	= { BR_BLO, true },
1222 	[BPF_JGE >> 4]	= { BR_BHS, false },
1223 	[BPF_JLT >> 4]	= { BR_BLO, false },
1224 	[BPF_JLE >> 4]	= { BR_BHS, true },
1225 	[BPF_JSGT >> 4]	= { BR_BLT, true },
1226 	[BPF_JSGE >> 4]	= { BR_BGE, false },
1227 	[BPF_JSLT >> 4]	= { BR_BLT, false },
1228 	[BPF_JSLE >> 4]	= { BR_BGE, true },
1229 };
1230 
1231 static const struct jmp_code_map *nfp_jmp_code_get(struct nfp_insn_meta *meta)
1232 {
1233 	unsigned int op;
1234 
1235 	op = BPF_OP(meta->insn.code) >> 4;
	/* br_mask of 0 is BR_BEQ which we don't use in the jump code table */
1237 	if (WARN_ONCE(op >= ARRAY_SIZE(jmp_code_map) ||
1238 		      !jmp_code_map[op].br_mask,
1239 		      "no code found for jump instruction"))
1240 		return NULL;
1241 
1242 	return &jmp_code_map[op];
1243 }
1244 
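/* Compare a 64-bit register with a sign-extended immediate by subtracting
 * (or adding, when meta->jump_neg_op is set) the low and high words with
 * carry, then branch on the resulting condition codes.
 */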
1245 static int cmp_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1246 {
1247 	const struct bpf_insn *insn = &meta->insn;
1248 	u64 imm = insn->imm; /* sign extend */
1249 	const struct jmp_code_map *code;
1250 	enum alu_op alu_op, carry_op;
1251 	u8 reg = insn->dst_reg * 2;
1252 	swreg tmp_reg;
1253 
1254 	code = nfp_jmp_code_get(meta);
1255 	if (!code)
1256 		return -EINVAL;
1257 
1258 	alu_op = meta->jump_neg_op ? ALU_OP_ADD : ALU_OP_SUB;
1259 	carry_op = meta->jump_neg_op ? ALU_OP_ADD_C : ALU_OP_SUB_C;
1260 
1261 	tmp_reg = ur_load_imm_any(nfp_prog, imm & ~0U, imm_b(nfp_prog));
1262 	if (!code->swap)
1263 		emit_alu(nfp_prog, reg_none(), reg_a(reg), alu_op, tmp_reg);
1264 	else
1265 		emit_alu(nfp_prog, reg_none(), tmp_reg, alu_op, reg_a(reg));
1266 
1267 	tmp_reg = ur_load_imm_any(nfp_prog, imm >> 32, imm_b(nfp_prog));
1268 	if (!code->swap)
1269 		emit_alu(nfp_prog, reg_none(),
1270 			 reg_a(reg + 1), carry_op, tmp_reg);
1271 	else
1272 		emit_alu(nfp_prog, reg_none(),
1273 			 tmp_reg, carry_op, reg_a(reg + 1));
1274 
1275 	emit_br(nfp_prog, code->br_mask, insn->off, 0);
1276 
1277 	return 0;
1278 }
1279 
1280 static int cmp_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1281 {
1282 	const struct bpf_insn *insn = &meta->insn;
1283 	const struct jmp_code_map *code;
1284 	u8 areg, breg;
1285 
1286 	code = nfp_jmp_code_get(meta);
1287 	if (!code)
1288 		return -EINVAL;
1289 
1290 	areg = insn->dst_reg * 2;
1291 	breg = insn->src_reg * 2;
1292 
1293 	if (code->swap) {
1294 		areg ^= breg;
1295 		breg ^= areg;
1296 		areg ^= breg;
1297 	}
1298 
1299 	emit_alu(nfp_prog, reg_none(), reg_a(areg), ALU_OP_SUB, reg_b(breg));
1300 	emit_alu(nfp_prog, reg_none(),
1301 		 reg_a(areg + 1), ALU_OP_SUB_C, reg_b(breg + 1));
1302 	emit_br(nfp_prog, code->br_mask, insn->off, 0);
1303 
1304 	return 0;
1305 }
1306 
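/* Byte swap the 32-bit value in @reg_in into @gpr_out: rotate right by 8
 * writing all four bytes, then rotate right by 16 writing only bytes 0 and 2.
 */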
1307 static void wrp_end32(struct nfp_prog *nfp_prog, swreg reg_in, u8 gpr_out)
1308 {
1309 	emit_ld_field(nfp_prog, reg_both(gpr_out), 0xf, reg_in,
1310 		      SHF_SC_R_ROT, 8);
1311 	emit_ld_field(nfp_prog, reg_both(gpr_out), 0x5, reg_a(gpr_out),
1312 		      SHF_SC_R_ROT, 16);
1313 }
1314 
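/* Implement the adjust_head helper: add the delta held in R2 to the packet
 * pointer and subtract it from the packet lengths.  The full sequence checks
 * the result against the FW datapath limits; the shorter one is used when
 * the call site was validated in advance (nfp_prog->adjust_head_location).
 */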
1315 static int adjust_head(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1316 {
1317 	swreg tmp = imm_a(nfp_prog), tmp_len = imm_b(nfp_prog);
1318 	struct nfp_bpf_cap_adjust_head *adjust_head;
1319 	u32 ret_einval, end;
1320 
1321 	adjust_head = &nfp_prog->bpf->adjust_head;
1322 
1323 	/* Optimized version - 5 vs 14 cycles */
1324 	if (nfp_prog->adjust_head_location != UINT_MAX) {
1325 		if (WARN_ON_ONCE(nfp_prog->adjust_head_location != meta->n))
1326 			return -EINVAL;
1327 
1328 		emit_alu(nfp_prog, pptr_reg(nfp_prog),
1329 			 reg_a(2 * 2), ALU_OP_ADD, pptr_reg(nfp_prog));
1330 		emit_alu(nfp_prog, plen_reg(nfp_prog),
1331 			 plen_reg(nfp_prog), ALU_OP_SUB, reg_a(2 * 2));
1332 		emit_alu(nfp_prog, pv_len(nfp_prog),
1333 			 pv_len(nfp_prog), ALU_OP_SUB, reg_a(2 * 2));
1334 
1335 		wrp_immed(nfp_prog, reg_both(0), 0);
1336 		wrp_immed(nfp_prog, reg_both(1), 0);
1337 
1338 		/* TODO: when adjust head is guaranteed to succeed we can
1339 		 * also eliminate the following if (r0 == 0) branch.
1340 		 */
1341 
1342 		return 0;
1343 	}
1344 
1345 	ret_einval = nfp_prog_current_offset(nfp_prog) + 14;
1346 	end = ret_einval + 2;
1347 
1348 	/* We need to use a temp because offset is just a part of the pkt ptr */
1349 	emit_alu(nfp_prog, tmp,
1350 		 reg_a(2 * 2), ALU_OP_ADD_2B, pptr_reg(nfp_prog));
1351 
1352 	/* Validate result will fit within FW datapath constraints */
1353 	emit_alu(nfp_prog, reg_none(),
1354 		 tmp, ALU_OP_SUB, reg_imm(adjust_head->off_min));
1355 	emit_br(nfp_prog, BR_BLO, ret_einval, 0);
1356 	emit_alu(nfp_prog, reg_none(),
1357 		 reg_imm(adjust_head->off_max), ALU_OP_SUB, tmp);
1358 	emit_br(nfp_prog, BR_BLO, ret_einval, 0);
1359 
1360 	/* Validate the length is at least ETH_HLEN */
1361 	emit_alu(nfp_prog, tmp_len,
1362 		 plen_reg(nfp_prog), ALU_OP_SUB, reg_a(2 * 2));
1363 	emit_alu(nfp_prog, reg_none(),
1364 		 tmp_len, ALU_OP_SUB, reg_imm(ETH_HLEN));
1365 	emit_br(nfp_prog, BR_BMI, ret_einval, 0);
1366 
1367 	/* Load the ret code */
1368 	wrp_immed(nfp_prog, reg_both(0), 0);
1369 	wrp_immed(nfp_prog, reg_both(1), 0);
1370 
1371 	/* Modify the packet metadata */
1372 	emit_ld_field(nfp_prog, pptr_reg(nfp_prog), 0x3, tmp, SHF_SC_NONE, 0);
1373 
1374 	/* Skip over the -EINVAL ret code (defer 2) */
1375 	emit_br(nfp_prog, BR_UNC, end, 2);
1376 
1377 	emit_alu(nfp_prog, plen_reg(nfp_prog),
1378 		 plen_reg(nfp_prog), ALU_OP_SUB, reg_a(2 * 2));
1379 	emit_alu(nfp_prog, pv_len(nfp_prog),
1380 		 pv_len(nfp_prog), ALU_OP_SUB, reg_a(2 * 2));
1381 
1382 	/* return -EINVAL target */
1383 	if (!nfp_prog_confirm_current_offset(nfp_prog, ret_einval))
1384 		return -EINVAL;
1385 
1386 	wrp_immed(nfp_prog, reg_both(0), -22);
1387 	wrp_immed(nfp_prog, reg_both(1), ~0);
1388 
1389 	if (!nfp_prog_confirm_current_offset(nfp_prog, end))
1390 		return -EINVAL;
1391 
1392 	return 0;
1393 }
1394 
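/* Call a firmware map helper.  LM pointer 0 is set to the key (and LM
 * pointer 2 to the value for map updates), the map ID is passed in A0 and
 * the return address in B0, then we branch to the relocated helper address.
 */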
1395 static int
1396 map_call_stack_common(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1397 {
1398 	bool load_lm_ptr;
1399 	u32 ret_tgt;
1400 	s64 lm_off;
1401 
1402 	/* We only have to reload LM0 if the key is not at start of stack */
1403 	lm_off = nfp_prog->stack_depth;
1404 	lm_off += meta->arg2.reg.var_off.value + meta->arg2.reg.off;
1405 	load_lm_ptr = meta->arg2.var_off || lm_off;
1406 
1407 	/* Set LM0 to start of key */
1408 	if (load_lm_ptr)
1409 		emit_csr_wr(nfp_prog, reg_b(2 * 2), NFP_CSR_ACT_LM_ADDR0);
1410 	if (meta->func_id == BPF_FUNC_map_update_elem)
1411 		emit_csr_wr(nfp_prog, reg_b(3 * 2), NFP_CSR_ACT_LM_ADDR2);
1412 
1413 	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO + meta->func_id,
1414 		     2, RELO_BR_HELPER);
1415 	ret_tgt = nfp_prog_current_offset(nfp_prog) + 2;
1416 
1417 	/* Load map ID into A0 */
1418 	wrp_mov(nfp_prog, reg_a(0), reg_a(2));
1419 
1420 	/* Load the return address into B0 */
1421 	wrp_immed_relo(nfp_prog, reg_b(0), ret_tgt, RELO_IMMED_REL);
1422 
1423 	if (!nfp_prog_confirm_current_offset(nfp_prog, ret_tgt))
1424 		return -EINVAL;
1425 
1426 	/* Reset the LM0 pointer */
1427 	if (!load_lm_ptr)
1428 		return 0;
1429 
1430 	emit_csr_wr(nfp_prog, stack_reg(nfp_prog), NFP_CSR_ACT_LM_ADDR0);
1431 	wrp_nops(nfp_prog, 3);
1432 
1433 	return 0;
1434 }
1435 
1436 static int
1437 nfp_get_prandom_u32(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1438 {
1439 	__emit_csr_rd(nfp_prog, NFP_CSR_PSEUDO_RND_NUM);
1440 	/* CSR value is read in following immed[gpr, 0] */
1441 	emit_immed(nfp_prog, reg_both(0), 0,
1442 		   IMMED_WIDTH_ALL, false, IMMED_SHIFT_0B);
1443 	emit_immed(nfp_prog, reg_both(1), 0,
1444 		   IMMED_WIDTH_ALL, false, IMMED_SHIFT_0B);
1445 	return 0;
1446 }
1447 
1448 static int
1449 nfp_perf_event_output(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1450 {
1451 	swreg ptr_type;
1452 	u32 ret_tgt;
1453 
1454 	ptr_type = ur_load_imm_any(nfp_prog, meta->arg1.type, imm_a(nfp_prog));
1455 
1456 	ret_tgt = nfp_prog_current_offset(nfp_prog) + 3;
1457 
1458 	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO + meta->func_id,
1459 		     2, RELO_BR_HELPER);
1460 
1461 	/* Load ptr type into A1 */
1462 	wrp_mov(nfp_prog, reg_a(1), ptr_type);
1463 
1464 	/* Load the return address into B0 */
1465 	wrp_immed_relo(nfp_prog, reg_b(0), ret_tgt, RELO_IMMED_REL);
1466 
1467 	if (!nfp_prog_confirm_current_offset(nfp_prog, ret_tgt))
1468 		return -EINVAL;
1469 
1470 	return 0;
1471 }
1472 
1473 /* --- Callbacks --- */
1474 static int mov_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1475 {
1476 	const struct bpf_insn *insn = &meta->insn;
1477 	u8 dst = insn->dst_reg * 2;
1478 	u8 src = insn->src_reg * 2;
1479 
1480 	if (insn->src_reg == BPF_REG_10) {
1481 		swreg stack_depth_reg;
1482 
1483 		stack_depth_reg = ur_load_imm_any(nfp_prog,
1484 						  nfp_prog->stack_depth,
1485 						  stack_imm(nfp_prog));
1486 		emit_alu(nfp_prog, reg_both(dst),
1487 			 stack_reg(nfp_prog), ALU_OP_ADD, stack_depth_reg);
1488 		wrp_immed(nfp_prog, reg_both(dst + 1), 0);
1489 	} else {
1490 		wrp_reg_mov(nfp_prog, dst, src);
1491 		wrp_reg_mov(nfp_prog, dst + 1, src + 1);
1492 	}
1493 
1494 	return 0;
1495 }
1496 
1497 static int mov_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1498 {
1499 	u64 imm = meta->insn.imm; /* sign extend */
1500 
1501 	wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2), imm & ~0U);
1502 	wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1), imm >> 32);
1503 
1504 	return 0;
1505 }
1506 
1507 static int xor_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1508 {
1509 	return wrp_alu64_reg(nfp_prog, meta, ALU_OP_XOR);
1510 }
1511 
1512 static int xor_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1513 {
1514 	return wrp_alu64_imm(nfp_prog, meta, ALU_OP_XOR, !meta->insn.imm);
1515 }
1516 
1517 static int and_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1518 {
1519 	return wrp_alu64_reg(nfp_prog, meta, ALU_OP_AND);
1520 }
1521 
1522 static int and_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1523 {
1524 	return wrp_alu64_imm(nfp_prog, meta, ALU_OP_AND, !~meta->insn.imm);
1525 }
1526 
1527 static int or_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1528 {
1529 	return wrp_alu64_reg(nfp_prog, meta, ALU_OP_OR);
1530 }
1531 
1532 static int or_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1533 {
1534 	return wrp_alu64_imm(nfp_prog, meta, ALU_OP_OR, !meta->insn.imm);
1535 }
1536 
1537 static int add_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1538 {
1539 	const struct bpf_insn *insn = &meta->insn;
1540 
1541 	emit_alu(nfp_prog, reg_both(insn->dst_reg * 2),
1542 		 reg_a(insn->dst_reg * 2), ALU_OP_ADD,
1543 		 reg_b(insn->src_reg * 2));
1544 	emit_alu(nfp_prog, reg_both(insn->dst_reg * 2 + 1),
1545 		 reg_a(insn->dst_reg * 2 + 1), ALU_OP_ADD_C,
1546 		 reg_b(insn->src_reg * 2 + 1));
1547 
1548 	return 0;
1549 }
1550 
1551 static int add_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1552 {
1553 	const struct bpf_insn *insn = &meta->insn;
1554 	u64 imm = insn->imm; /* sign extend */
1555 
1556 	wrp_alu_imm(nfp_prog, insn->dst_reg * 2, ALU_OP_ADD, imm & ~0U);
1557 	wrp_alu_imm(nfp_prog, insn->dst_reg * 2 + 1, ALU_OP_ADD_C, imm >> 32);
1558 
1559 	return 0;
1560 }
1561 
1562 static int sub_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1563 {
1564 	const struct bpf_insn *insn = &meta->insn;
1565 
1566 	emit_alu(nfp_prog, reg_both(insn->dst_reg * 2),
1567 		 reg_a(insn->dst_reg * 2), ALU_OP_SUB,
1568 		 reg_b(insn->src_reg * 2));
1569 	emit_alu(nfp_prog, reg_both(insn->dst_reg * 2 + 1),
1570 		 reg_a(insn->dst_reg * 2 + 1), ALU_OP_SUB_C,
1571 		 reg_b(insn->src_reg * 2 + 1));
1572 
1573 	return 0;
1574 }
1575 
1576 static int sub_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1577 {
1578 	const struct bpf_insn *insn = &meta->insn;
1579 	u64 imm = insn->imm; /* sign extend */
1580 
1581 	wrp_alu_imm(nfp_prog, insn->dst_reg * 2, ALU_OP_SUB, imm & ~0U);
1582 	wrp_alu_imm(nfp_prog, insn->dst_reg * 2 + 1, ALU_OP_SUB_C, imm >> 32);
1583 
1584 	return 0;
1585 }
1586 
1587 static int neg_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1588 {
1589 	const struct bpf_insn *insn = &meta->insn;
1590 
1591 	emit_alu(nfp_prog, reg_both(insn->dst_reg * 2), reg_imm(0),
1592 		 ALU_OP_SUB, reg_b(insn->dst_reg * 2));
1593 	emit_alu(nfp_prog, reg_both(insn->dst_reg * 2 + 1), reg_imm(0),
1594 		 ALU_OP_SUB_C, reg_b(insn->dst_reg * 2 + 1));
1595 
1596 	return 0;
1597 }
1598 
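/* 64-bit left shift by a constant: for shifts under 32 the upper word is
 * produced with a double shift that pulls bits in from the lower word, then
 * the lower word is shifted; larger shifts move/zero whole words instead.
 */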
1599 static int shl_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1600 {
1601 	const struct bpf_insn *insn = &meta->insn;
1602 	u8 dst = insn->dst_reg * 2;
1603 
1604 	if (insn->imm < 32) {
1605 		emit_shf(nfp_prog, reg_both(dst + 1),
1606 			 reg_a(dst + 1), SHF_OP_NONE, reg_b(dst),
1607 			 SHF_SC_R_DSHF, 32 - insn->imm);
1608 		emit_shf(nfp_prog, reg_both(dst),
1609 			 reg_none(), SHF_OP_NONE, reg_b(dst),
1610 			 SHF_SC_L_SHF, insn->imm);
1611 	} else if (insn->imm == 32) {
1612 		wrp_reg_mov(nfp_prog, dst + 1, dst);
1613 		wrp_immed(nfp_prog, reg_both(dst), 0);
1614 	} else if (insn->imm > 32) {
1615 		emit_shf(nfp_prog, reg_both(dst + 1),
1616 			 reg_none(), SHF_OP_NONE, reg_b(dst),
1617 			 SHF_SC_L_SHF, insn->imm - 32);
1618 		wrp_immed(nfp_prog, reg_both(dst), 0);
1619 	}
1620 
1621 	return 0;
1622 }
1623 
1624 static int shr_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1625 {
1626 	const struct bpf_insn *insn = &meta->insn;
1627 	u8 dst = insn->dst_reg * 2;
1628 
1629 	if (insn->imm < 32) {
1630 		emit_shf(nfp_prog, reg_both(dst),
1631 			 reg_a(dst + 1), SHF_OP_NONE, reg_b(dst),
1632 			 SHF_SC_R_DSHF, insn->imm);
1633 		emit_shf(nfp_prog, reg_both(dst + 1),
1634 			 reg_none(), SHF_OP_NONE, reg_b(dst + 1),
1635 			 SHF_SC_R_SHF, insn->imm);
1636 	} else if (insn->imm == 32) {
1637 		wrp_reg_mov(nfp_prog, dst, dst + 1);
1638 		wrp_immed(nfp_prog, reg_both(dst + 1), 0);
1639 	} else if (insn->imm > 32) {
1640 		emit_shf(nfp_prog, reg_both(dst),
1641 			 reg_none(), SHF_OP_NONE, reg_b(dst + 1),
1642 			 SHF_SC_R_SHF, insn->imm - 32);
1643 		wrp_immed(nfp_prog, reg_both(dst + 1), 0);
1644 	}
1645 
1646 	return 0;
1647 }
1648 
1649 static int mov_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1650 {
1651 	const struct bpf_insn *insn = &meta->insn;
1652 
1653 	wrp_reg_mov(nfp_prog, insn->dst_reg * 2,  insn->src_reg * 2);
1654 	wrp_immed(nfp_prog, reg_both(insn->dst_reg * 2 + 1), 0);
1655 
1656 	return 0;
1657 }
1658 
1659 static int mov_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1660 {
1661 	const struct bpf_insn *insn = &meta->insn;
1662 
1663 	wrp_immed(nfp_prog, reg_both(insn->dst_reg * 2), insn->imm);
1664 	wrp_immed(nfp_prog, reg_both(insn->dst_reg * 2 + 1), 0);
1665 
1666 	return 0;
1667 }
1668 
1669 static int xor_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1670 {
1671 	return wrp_alu32_reg(nfp_prog, meta, ALU_OP_XOR);
1672 }
1673 
1674 static int xor_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1675 {
1676 	return wrp_alu32_imm(nfp_prog, meta, ALU_OP_XOR, !~meta->insn.imm);
1677 }
1678 
1679 static int and_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1680 {
1681 	return wrp_alu32_reg(nfp_prog, meta, ALU_OP_AND);
1682 }
1683 
1684 static int and_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1685 {
1686 	return wrp_alu32_imm(nfp_prog, meta, ALU_OP_AND, !~meta->insn.imm);
1687 }
1688 
1689 static int or_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1690 {
1691 	return wrp_alu32_reg(nfp_prog, meta, ALU_OP_OR);
1692 }
1693 
1694 static int or_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1695 {
1696 	return wrp_alu32_imm(nfp_prog, meta, ALU_OP_OR, !meta->insn.imm);
1697 }
1698 
1699 static int add_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1700 {
1701 	return wrp_alu32_reg(nfp_prog, meta, ALU_OP_ADD);
1702 }
1703 
1704 static int add_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1705 {
1706 	return wrp_alu32_imm(nfp_prog, meta, ALU_OP_ADD, !meta->insn.imm);
1707 }
1708 
1709 static int sub_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1710 {
1711 	return wrp_alu32_reg(nfp_prog, meta, ALU_OP_SUB);
1712 }
1713 
1714 static int sub_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1715 {
1716 	return wrp_alu32_imm(nfp_prog, meta, ALU_OP_SUB, !meta->insn.imm);
1717 }
1718 
1719 static int neg_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1720 {
1721 	u8 dst = meta->insn.dst_reg * 2;
1722 
1723 	emit_alu(nfp_prog, reg_both(dst), reg_imm(0), ALU_OP_SUB, reg_b(dst));
1724 	wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1), 0);
1725 
1726 	return 0;
1727 }
1728 
1729 static int shl_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1730 {
1731 	const struct bpf_insn *insn = &meta->insn;
1732 
1733 	if (!insn->imm)
1734 		return 1; /* TODO: zero shift means indirect */
1735 
1736 	emit_shf(nfp_prog, reg_both(insn->dst_reg * 2),
1737 		 reg_none(), SHF_OP_NONE, reg_b(insn->dst_reg * 2),
1738 		 SHF_SC_L_SHF, insn->imm);
1739 	wrp_immed(nfp_prog, reg_both(insn->dst_reg * 2 + 1), 0);
1740 
1741 	return 0;
1742 }
1743 
1744 static int end_reg32(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1745 {
1746 	const struct bpf_insn *insn = &meta->insn;
1747 	u8 gpr = insn->dst_reg * 2;
1748 
1749 	switch (insn->imm) {
1750 	case 16:
1751 		emit_ld_field(nfp_prog, reg_both(gpr), 0x9, reg_b(gpr),
1752 			      SHF_SC_R_ROT, 8);
1753 		emit_ld_field(nfp_prog, reg_both(gpr), 0xe, reg_a(gpr),
1754 			      SHF_SC_R_SHF, 16);
1755 
1756 		wrp_immed(nfp_prog, reg_both(gpr + 1), 0);
1757 		break;
1758 	case 32:
1759 		wrp_end32(nfp_prog, reg_a(gpr), gpr);
1760 		wrp_immed(nfp_prog, reg_both(gpr + 1), 0);
1761 		break;
1762 	case 64:
1763 		wrp_mov(nfp_prog, imm_a(nfp_prog), reg_b(gpr + 1));
1764 
1765 		wrp_end32(nfp_prog, reg_a(gpr), gpr + 1);
1766 		wrp_end32(nfp_prog, imm_a(nfp_prog), gpr);
1767 		break;
1768 	}
1769 
1770 	return 0;
1771 }
1772 
1773 static int imm_ld8_part2(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1774 {
1775 	struct nfp_insn_meta *prev = nfp_meta_prev(meta);
1776 	u32 imm_lo, imm_hi;
1777 	u8 dst;
1778 
1779 	dst = prev->insn.dst_reg * 2;
1780 	imm_lo = prev->insn.imm;
1781 	imm_hi = meta->insn.imm;
1782 
1783 	wrp_immed(nfp_prog, reg_both(dst), imm_lo);
1784 
1785 	/* mov is always 1 insn, load imm may be two, so try to use mov */
1786 	if (imm_hi == imm_lo)
1787 		wrp_mov(nfp_prog, reg_both(dst + 1), reg_a(dst));
1788 	else
1789 		wrp_immed(nfp_prog, reg_both(dst + 1), imm_hi);
1790 
1791 	return 0;
1792 }
1793 
1794 static int imm_ld8(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1795 {
1796 	meta->double_cb = imm_ld8_part2;
1797 	return 0;
1798 }
1799 
1800 static int data_ld1(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1801 {
1802 	return construct_data_ld(nfp_prog, meta->insn.imm, 1);
1803 }
1804 
1805 static int data_ld2(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1806 {
1807 	return construct_data_ld(nfp_prog, meta->insn.imm, 2);
1808 }
1809 
1810 static int data_ld4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1811 {
1812 	return construct_data_ld(nfp_prog, meta->insn.imm, 4);
1813 }
1814 
1815 static int data_ind_ld1(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1816 {
1817 	return construct_data_ind_ld(nfp_prog, meta->insn.imm,
1818 				     meta->insn.src_reg * 2, 1);
1819 }
1820 
1821 static int data_ind_ld2(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1822 {
1823 	return construct_data_ind_ld(nfp_prog, meta->insn.imm,
1824 				     meta->insn.src_reg * 2, 2);
1825 }
1826 
1827 static int data_ind_ld4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1828 {
1829 	return construct_data_ind_ld(nfp_prog, meta->insn.imm,
1830 				     meta->insn.src_reg * 2, 4);
1831 }
1832 
1833 static int
1834 mem_ldx_stack(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
1835 	      unsigned int size, unsigned int ptr_off)
1836 {
1837 	return mem_op_stack(nfp_prog, meta, size, ptr_off,
1838 			    meta->insn.dst_reg * 2, meta->insn.src_reg * 2,
1839 			    true, wrp_lmem_load);
1840 }
1841 
1842 static int mem_ldx_skb(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
1843 		       u8 size)
1844 {
1845 	swreg dst = reg_both(meta->insn.dst_reg * 2);
1846 
1847 	switch (meta->insn.off) {
1848 	case offsetof(struct __sk_buff, len):
1849 		if (size != FIELD_SIZEOF(struct __sk_buff, len))
1850 			return -EOPNOTSUPP;
1851 		wrp_mov(nfp_prog, dst, plen_reg(nfp_prog));
1852 		break;
1853 	case offsetof(struct __sk_buff, data):
1854 		if (size != FIELD_SIZEOF(struct __sk_buff, data))
1855 			return -EOPNOTSUPP;
1856 		wrp_mov(nfp_prog, dst, pptr_reg(nfp_prog));
1857 		break;
1858 	case offsetof(struct __sk_buff, data_end):
1859 		if (size != FIELD_SIZEOF(struct __sk_buff, data_end))
1860 			return -EOPNOTSUPP;
1861 		emit_alu(nfp_prog, dst,
1862 			 plen_reg(nfp_prog), ALU_OP_ADD, pptr_reg(nfp_prog));
1863 		break;
1864 	default:
1865 		return -EOPNOTSUPP;
1866 	}
1867 
1868 	wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1), 0);
1869 
1870 	return 0;
1871 }
1872 
1873 static int mem_ldx_xdp(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
1874 		       u8 size)
1875 {
1876 	swreg dst = reg_both(meta->insn.dst_reg * 2);
1877 
1878 	switch (meta->insn.off) {
1879 	case offsetof(struct xdp_md, data):
1880 		if (size != FIELD_SIZEOF(struct xdp_md, data))
1881 			return -EOPNOTSUPP;
1882 		wrp_mov(nfp_prog, dst, pptr_reg(nfp_prog));
1883 		break;
1884 	case offsetof(struct xdp_md, data_end):
1885 		if (size != FIELD_SIZEOF(struct xdp_md, data_end))
1886 			return -EOPNOTSUPP;
1887 		emit_alu(nfp_prog, dst,
1888 			 plen_reg(nfp_prog), ALU_OP_ADD, pptr_reg(nfp_prog));
1889 		break;
1890 	default:
1891 		return -EOPNOTSUPP;
1892 	}
1893 
1894 	wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1), 0);
1895 
1896 	return 0;
1897 }
1898 
1899 static int
1900 mem_ldx_data(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
1901 	     unsigned int size)
1902 {
1903 	swreg tmp_reg;
1904 
1905 	tmp_reg = re_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog));
1906 
1907 	return data_ld_host_order_addr32(nfp_prog, meta->insn.src_reg * 2,
1908 					 tmp_reg, meta->insn.dst_reg * 2, size);
1909 }
1910 
1911 static int
1912 mem_ldx_emem(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
1913 	     unsigned int size)
1914 {
1915 	swreg tmp_reg;
1916 
1917 	tmp_reg = re_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog));
1918 
1919 	return data_ld_host_order_addr40(nfp_prog, meta->insn.src_reg * 2,
1920 					 tmp_reg, meta->insn.dst_reg * 2, size);
1921 }
1922 
1923 static void
1924 mem_ldx_data_init_pktcache(struct nfp_prog *nfp_prog,
1925 			   struct nfp_insn_meta *meta)
1926 {
1927 	s16 range_start = meta->pkt_cache.range_start;
1928 	s16 range_end = meta->pkt_cache.range_end;
1929 	swreg src_base, off;
1930 	u8 xfer_num, len;
1931 	bool indir;
1932 
1933 	off = re_load_imm_any(nfp_prog, range_start, imm_b(nfp_prog));
1934 	src_base = reg_a(meta->insn.src_reg * 2);
1935 	len = range_end - range_start;
1936 	xfer_num = round_up(len, REG_WIDTH) / REG_WIDTH;
1937 
1938 	indir = len > 8 * REG_WIDTH;
	/* Set up PREV_ALU for indirect mode. */
1940 	if (indir)
1941 		wrp_immed(nfp_prog, reg_none(),
1942 			  CMD_OVE_LEN | FIELD_PREP(CMD_OV_LEN, xfer_num - 1));
1943 
1944 	/* Cache memory into transfer-in registers. */
1945 	emit_cmd_any(nfp_prog, CMD_TGT_READ32_SWAP, CMD_MODE_32b, 0, src_base,
1946 		     off, xfer_num - 1, CMD_CTX_SWAP, indir);
1947 }
1948 
1949 static int
1950 mem_ldx_data_from_pktcache_unaligned(struct nfp_prog *nfp_prog,
1951 				     struct nfp_insn_meta *meta,
1952 				     unsigned int size)
1953 {
1954 	s16 range_start = meta->pkt_cache.range_start;
1955 	s16 insn_off = meta->insn.off - range_start;
1956 	swreg dst_lo, dst_hi, src_lo, src_mid;
1957 	u8 dst_gpr = meta->insn.dst_reg * 2;
1958 	u8 len_lo = size, len_mid = 0;
1959 	u8 idx = insn_off / REG_WIDTH;
1960 	u8 off = insn_off % REG_WIDTH;
1961 
1962 	dst_hi = reg_both(dst_gpr + 1);
1963 	dst_lo = reg_both(dst_gpr);
1964 	src_lo = reg_xfer(idx);
1965 
1966 	/* The read length could involve as many as three registers. */
1967 	if (size > REG_WIDTH - off) {
1968 		/* Calculate the part in the second register. */
1969 		len_lo = REG_WIDTH - off;
1970 		len_mid = size - len_lo;
1971 
1972 		/* Calculate the part in the third register. */
1973 		if (size > 2 * REG_WIDTH - off)
1974 			len_mid = REG_WIDTH;
1975 	}
1976 
1977 	wrp_reg_subpart(nfp_prog, dst_lo, src_lo, len_lo, off);
1978 
1979 	if (!len_mid) {
1980 		wrp_immed(nfp_prog, dst_hi, 0);
1981 		return 0;
1982 	}
1983 
1984 	src_mid = reg_xfer(idx + 1);
1985 
1986 	if (size <= REG_WIDTH) {
1987 		wrp_reg_or_subpart(nfp_prog, dst_lo, src_mid, len_mid, len_lo);
1988 		wrp_immed(nfp_prog, dst_hi, 0);
1989 	} else {
1990 		swreg src_hi = reg_xfer(idx + 2);
1991 
1992 		wrp_reg_or_subpart(nfp_prog, dst_lo, src_mid,
1993 				   REG_WIDTH - len_lo, len_lo);
1994 		wrp_reg_subpart(nfp_prog, dst_hi, src_mid, len_lo,
1995 				REG_WIDTH - len_lo);
1996 		wrp_reg_or_subpart(nfp_prog, dst_hi, src_hi, REG_WIDTH - len_lo,
1997 				   len_lo);
1998 	}
1999 
2000 	return 0;
2001 }
2002 
2003 static int
2004 mem_ldx_data_from_pktcache_aligned(struct nfp_prog *nfp_prog,
2005 				   struct nfp_insn_meta *meta,
2006 				   unsigned int size)
2007 {
2008 	swreg dst_lo, dst_hi, src_lo;
2009 	u8 dst_gpr, idx;
2010 
2011 	idx = (meta->insn.off - meta->pkt_cache.range_start) / REG_WIDTH;
2012 	dst_gpr = meta->insn.dst_reg * 2;
2013 	dst_hi = reg_both(dst_gpr + 1);
2014 	dst_lo = reg_both(dst_gpr);
2015 	src_lo = reg_xfer(idx);
2016 
2017 	if (size < REG_WIDTH) {
2018 		wrp_reg_subpart(nfp_prog, dst_lo, src_lo, size, 0);
2019 		wrp_immed(nfp_prog, dst_hi, 0);
2020 	} else if (size == REG_WIDTH) {
2021 		wrp_mov(nfp_prog, dst_lo, src_lo);
2022 		wrp_immed(nfp_prog, dst_hi, 0);
2023 	} else {
2024 		swreg src_hi = reg_xfer(idx + 1);
2025 
2026 		wrp_mov(nfp_prog, dst_lo, src_lo);
2027 		wrp_mov(nfp_prog, dst_hi, src_hi);
2028 	}
2029 
2030 	return 0;
2031 }
2032 
2033 static int
2034 mem_ldx_data_from_pktcache(struct nfp_prog *nfp_prog,
2035 			   struct nfp_insn_meta *meta, unsigned int size)
2036 {
2037 	u8 off = meta->insn.off - meta->pkt_cache.range_start;
2038 
2039 	if (IS_ALIGNED(off, REG_WIDTH))
2040 		return mem_ldx_data_from_pktcache_aligned(nfp_prog, meta, size);
2041 
2042 	return mem_ldx_data_from_pktcache_unaligned(nfp_prog, meta, size);
2043 }
2044 
2045 static int
2046 mem_ldx(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2047 	unsigned int size)
2048 {
2049 	if (meta->ldst_gather_len)
2050 		return nfp_cpp_memcpy(nfp_prog, meta);
2051 
2052 	if (meta->ptr.type == PTR_TO_CTX) {
2053 		if (nfp_prog->type == BPF_PROG_TYPE_XDP)
2054 			return mem_ldx_xdp(nfp_prog, meta, size);
2055 		else
2056 			return mem_ldx_skb(nfp_prog, meta, size);
2057 	}
2058 
2059 	if (meta->ptr.type == PTR_TO_PACKET) {
2060 		if (meta->pkt_cache.range_end) {
2061 			if (meta->pkt_cache.do_init)
2062 				mem_ldx_data_init_pktcache(nfp_prog, meta);
2063 
2064 			return mem_ldx_data_from_pktcache(nfp_prog, meta, size);
2065 		} else {
2066 			return mem_ldx_data(nfp_prog, meta, size);
2067 		}
2068 	}
2069 
2070 	if (meta->ptr.type == PTR_TO_STACK)
2071 		return mem_ldx_stack(nfp_prog, meta, size,
2072 				     meta->ptr.off + meta->ptr.var_off.value);
2073 
2074 	if (meta->ptr.type == PTR_TO_MAP_VALUE)
2075 		return mem_ldx_emem(nfp_prog, meta, size);
2076 
2077 	return -EOPNOTSUPP;
2078 }
2079 
2080 static int mem_ldx1(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2081 {
2082 	return mem_ldx(nfp_prog, meta, 1);
2083 }
2084 
2085 static int mem_ldx2(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2086 {
2087 	return mem_ldx(nfp_prog, meta, 2);
2088 }
2089 
2090 static int mem_ldx4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2091 {
2092 	return mem_ldx(nfp_prog, meta, 4);
2093 }
2094 
2095 static int mem_ldx8(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2096 {
2097 	return mem_ldx(nfp_prog, meta, 8);
2098 }
2099 
2100 static int
2101 mem_st_data(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2102 	    unsigned int size)
2103 {
2104 	u64 imm = meta->insn.imm; /* sign extend */
2105 	swreg off_reg;
2106 
2107 	off_reg = re_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog));
2108 
2109 	return data_st_host_order(nfp_prog, meta->insn.dst_reg * 2, off_reg,
2110 				  imm, size);
2111 }
2112 
2113 static int mem_st(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2114 		  unsigned int size)
2115 {
2116 	if (meta->ptr.type == PTR_TO_PACKET)
2117 		return mem_st_data(nfp_prog, meta, size);
2118 
2119 	return -EOPNOTSUPP;
2120 }
2121 
2122 static int mem_st1(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2123 {
2124 	return mem_st(nfp_prog, meta, 1);
2125 }
2126 
2127 static int mem_st2(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2128 {
2129 	return mem_st(nfp_prog, meta, 2);
2130 }
2131 
2132 static int mem_st4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2133 {
2134 	return mem_st(nfp_prog, meta, 4);
2135 }
2136 
2137 static int mem_st8(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2138 {
2139 	return mem_st(nfp_prog, meta, 8);
2140 }
2141 
2142 static int
2143 mem_stx_data(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2144 	     unsigned int size)
2145 {
2146 	swreg off_reg;
2147 
2148 	off_reg = re_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog));
2149 
2150 	return data_stx_host_order(nfp_prog, meta->insn.dst_reg * 2, off_reg,
2151 				   meta->insn.src_reg * 2, size);
2152 }
2153 
2154 static int
2155 mem_stx_stack(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2156 	      unsigned int size, unsigned int ptr_off)
2157 {
2158 	return mem_op_stack(nfp_prog, meta, size, ptr_off,
2159 			    meta->insn.src_reg * 2, meta->insn.dst_reg * 2,
2160 			    false, wrp_lmem_store);
2161 }
2162 
2163 static int
2164 mem_stx(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2165 	unsigned int size)
2166 {
2167 	if (meta->ptr.type == PTR_TO_PACKET)
2168 		return mem_stx_data(nfp_prog, meta, size);
2169 
2170 	if (meta->ptr.type == PTR_TO_STACK)
2171 		return mem_stx_stack(nfp_prog, meta, size,
2172 				     meta->ptr.off + meta->ptr.var_off.value);
2173 
2174 	return -EOPNOTSUPP;
2175 }
2176 
2177 static int mem_stx1(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2178 {
2179 	return mem_stx(nfp_prog, meta, 1);
2180 }
2181 
2182 static int mem_stx2(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2183 {
2184 	return mem_stx(nfp_prog, meta, 2);
2185 }
2186 
2187 static int mem_stx4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2188 {
2189 	return mem_stx(nfp_prog, meta, 4);
2190 }
2191 
2192 static int mem_stx8(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2193 {
2194 	return mem_stx(nfp_prog, meta, 8);
2195 }
2196 
2197 static int
2198 mem_xadd(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, bool is64)
2199 {
2200 	u8 dst_gpr = meta->insn.dst_reg * 2;
2201 	u8 src_gpr = meta->insn.src_reg * 2;
2202 	unsigned int full_add, out;
2203 	swreg addra, addrb, off;
2204 
2205 	off = ur_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog));
2206 
	/* We can fit 16 bits into the command immediate.  If we know the
	 * immediate is guaranteed to either always or never fit into 16 bits
	 * we only generate code to handle that particular case, otherwise
	 * we generate code for both.
	 */
2212 	out = nfp_prog_current_offset(nfp_prog);
2213 	full_add = nfp_prog_current_offset(nfp_prog);
2214 
2215 	if (meta->insn.off) {
2216 		out += 2;
2217 		full_add += 2;
2218 	}
2219 	if (meta->xadd_maybe_16bit) {
2220 		out += 3;
2221 		full_add += 3;
2222 	}
2223 	if (meta->xadd_over_16bit)
2224 		out += 2 + is64;
2225 	if (meta->xadd_maybe_16bit && meta->xadd_over_16bit) {
2226 		out += 5;
2227 		full_add += 5;
2228 	}
2229 
2230 	/* Generate the branch for choosing add_imm vs add */
2231 	if (meta->xadd_maybe_16bit && meta->xadd_over_16bit) {
2232 		swreg max_imm = imm_a(nfp_prog);
2233 
2234 		wrp_immed(nfp_prog, max_imm, 0xffff);
2235 		emit_alu(nfp_prog, reg_none(),
2236 			 max_imm, ALU_OP_SUB, reg_b(src_gpr));
2237 		emit_alu(nfp_prog, reg_none(),
2238 			 reg_imm(0), ALU_OP_SUB_C, reg_b(src_gpr + 1));
2239 		emit_br(nfp_prog, BR_BLO, full_add, meta->insn.off ? 2 : 0);
2240 		/* defer for add */
2241 	}
2242 
	/* If the insn has an offset, add it to the address. */
2244 	if (!meta->insn.off) {
2245 		addra = reg_a(dst_gpr);
2246 		addrb = reg_b(dst_gpr + 1);
2247 	} else {
2248 		emit_alu(nfp_prog, imma_a(nfp_prog),
2249 			 reg_a(dst_gpr), ALU_OP_ADD, off);
2250 		emit_alu(nfp_prog, imma_b(nfp_prog),
2251 			 reg_a(dst_gpr + 1), ALU_OP_ADD_C, reg_imm(0));
2252 		addra = imma_a(nfp_prog);
2253 		addrb = imma_b(nfp_prog);
2254 	}
2255 
2256 	/* Generate the add_imm if 16 bits are possible */
2257 	if (meta->xadd_maybe_16bit) {
2258 		swreg prev_alu = imm_a(nfp_prog);
2259 
2260 		wrp_immed(nfp_prog, prev_alu,
2261 			  FIELD_PREP(CMD_OVE_DATA, 2) |
2262 			  CMD_OVE_LEN |
2263 			  FIELD_PREP(CMD_OV_LEN, 0x8 | is64 << 2));
2264 		wrp_reg_or_subpart(nfp_prog, prev_alu, reg_b(src_gpr), 2, 2);
2265 		emit_cmd_indir(nfp_prog, CMD_TGT_ADD_IMM, CMD_MODE_40b_BA, 0,
2266 			       addra, addrb, 0, CMD_CTX_NO_SWAP);
2267 
2268 		if (meta->xadd_over_16bit)
2269 			emit_br(nfp_prog, BR_UNC, out, 0);
2270 	}
2271 
2272 	if (!nfp_prog_confirm_current_offset(nfp_prog, full_add))
2273 		return -EINVAL;
2274 
2275 	/* Generate the add if 16 bits are not guaranteed */
2276 	if (meta->xadd_over_16bit) {
2277 		emit_cmd(nfp_prog, CMD_TGT_ADD, CMD_MODE_40b_BA, 0,
2278 			 addra, addrb, is64 << 2,
2279 			 is64 ? CMD_CTX_SWAP_DEFER2 : CMD_CTX_SWAP_DEFER1);
2280 
2281 		wrp_mov(nfp_prog, reg_xfer(0), reg_a(src_gpr));
2282 		if (is64)
2283 			wrp_mov(nfp_prog, reg_xfer(1), reg_a(src_gpr + 1));
2284 	}
2285 
2286 	if (!nfp_prog_confirm_current_offset(nfp_prog, out))
2287 		return -EINVAL;
2288 
2289 	return 0;
2290 }
2291 
2292 static int mem_xadd4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2293 {
2294 	return mem_xadd(nfp_prog, meta, false);
2295 }
2296 
2297 static int mem_xadd8(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2298 {
2299 	return mem_xadd(nfp_prog, meta, true);
2300 }
2301 
2302 static int jump(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2303 {
2304 	emit_br(nfp_prog, BR_UNC, meta->insn.off, 0);
2305 
2306 	return 0;
2307 }
2308 
2309 static int jeq_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2310 {
2311 	const struct bpf_insn *insn = &meta->insn;
2312 	u64 imm = insn->imm; /* sign extend */
2313 	swreg or1, or2, tmp_reg;
2314 
2315 	or1 = reg_a(insn->dst_reg * 2);
2316 	or2 = reg_b(insn->dst_reg * 2 + 1);
2317 
2318 	if (imm & ~0U) {
2319 		tmp_reg = ur_load_imm_any(nfp_prog, imm & ~0U, imm_b(nfp_prog));
2320 		emit_alu(nfp_prog, imm_a(nfp_prog),
2321 			 reg_a(insn->dst_reg * 2), ALU_OP_XOR, tmp_reg);
2322 		or1 = imm_a(nfp_prog);
2323 	}
2324 
2325 	if (imm >> 32) {
2326 		tmp_reg = ur_load_imm_any(nfp_prog, imm >> 32, imm_b(nfp_prog));
2327 		emit_alu(nfp_prog, imm_b(nfp_prog),
2328 			 reg_a(insn->dst_reg * 2 + 1), ALU_OP_XOR, tmp_reg);
2329 		or2 = imm_b(nfp_prog);
2330 	}
2331 
2332 	emit_alu(nfp_prog, reg_none(), or1, ALU_OP_OR, or2);
2333 	emit_br(nfp_prog, BR_BEQ, insn->off, 0);
2334 
2335 	return 0;
2336 }
2337 
2338 static int jset_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2339 {
2340 	const struct bpf_insn *insn = &meta->insn;
2341 	u64 imm = insn->imm; /* sign extend */
2342 	swreg tmp_reg;
2343 
2344 	if (!imm) {
2345 		meta->skip = true;
2346 		return 0;
2347 	}
2348 
2349 	if (imm & ~0U) {
2350 		tmp_reg = ur_load_imm_any(nfp_prog, imm & ~0U, imm_b(nfp_prog));
2351 		emit_alu(nfp_prog, reg_none(),
2352 			 reg_a(insn->dst_reg * 2), ALU_OP_AND, tmp_reg);
2353 		emit_br(nfp_prog, BR_BNE, insn->off, 0);
2354 	}
2355 
2356 	if (imm >> 32) {
2357 		tmp_reg = ur_load_imm_any(nfp_prog, imm >> 32, imm_b(nfp_prog));
2358 		emit_alu(nfp_prog, reg_none(),
2359 			 reg_a(insn->dst_reg * 2 + 1), ALU_OP_AND, tmp_reg);
2360 		emit_br(nfp_prog, BR_BNE, insn->off, 0);
2361 	}
2362 
2363 	return 0;
2364 }
2365 
2366 static int jne_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2367 {
2368 	const struct bpf_insn *insn = &meta->insn;
2369 	u64 imm = insn->imm; /* sign extend */
2370 	swreg tmp_reg;
2371 
2372 	if (!imm) {
2373 		emit_alu(nfp_prog, reg_none(), reg_a(insn->dst_reg * 2),
2374 			 ALU_OP_OR, reg_b(insn->dst_reg * 2 + 1));
2375 		emit_br(nfp_prog, BR_BNE, insn->off, 0);
2376 		return 0;
2377 	}
2378 
2379 	tmp_reg = ur_load_imm_any(nfp_prog, imm & ~0U, imm_b(nfp_prog));
2380 	emit_alu(nfp_prog, reg_none(),
2381 		 reg_a(insn->dst_reg * 2), ALU_OP_XOR, tmp_reg);
2382 	emit_br(nfp_prog, BR_BNE, insn->off, 0);
2383 
2384 	tmp_reg = ur_load_imm_any(nfp_prog, imm >> 32, imm_b(nfp_prog));
2385 	emit_alu(nfp_prog, reg_none(),
2386 		 reg_a(insn->dst_reg * 2 + 1), ALU_OP_XOR, tmp_reg);
2387 	emit_br(nfp_prog, BR_BNE, insn->off, 0);
2388 
2389 	return 0;
2390 }
2391 
2392 static int jeq_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2393 {
2394 	const struct bpf_insn *insn = &meta->insn;
2395 
2396 	emit_alu(nfp_prog, imm_a(nfp_prog), reg_a(insn->dst_reg * 2),
2397 		 ALU_OP_XOR, reg_b(insn->src_reg * 2));
2398 	emit_alu(nfp_prog, imm_b(nfp_prog), reg_a(insn->dst_reg * 2 + 1),
2399 		 ALU_OP_XOR, reg_b(insn->src_reg * 2 + 1));
2400 	emit_alu(nfp_prog, reg_none(),
2401 		 imm_a(nfp_prog), ALU_OP_OR, imm_b(nfp_prog));
2402 	emit_br(nfp_prog, BR_BEQ, insn->off, 0);
2403 
2404 	return 0;
2405 }
2406 
2407 static int jset_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2408 {
2409 	return wrp_test_reg(nfp_prog, meta, ALU_OP_AND, BR_BNE);
2410 }
2411 
2412 static int jne_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2413 {
2414 	return wrp_test_reg(nfp_prog, meta, ALU_OP_XOR, BR_BNE);
2415 }
2416 
2417 static int call(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2418 {
2419 	switch (meta->insn.imm) {
2420 	case BPF_FUNC_xdp_adjust_head:
2421 		return adjust_head(nfp_prog, meta);
2422 	case BPF_FUNC_map_lookup_elem:
2423 	case BPF_FUNC_map_update_elem:
2424 	case BPF_FUNC_map_delete_elem:
2425 		return map_call_stack_common(nfp_prog, meta);
2426 	case BPF_FUNC_get_prandom_u32:
2427 		return nfp_get_prandom_u32(nfp_prog, meta);
2428 	case BPF_FUNC_perf_event_output:
2429 		return nfp_perf_event_output(nfp_prog, meta);
2430 	default:
2431 		WARN_ONCE(1, "verifier allowed unsupported function\n");
2432 		return -EOPNOTSUPP;
2433 	}
2434 }
2435 
2436 static int goto_out(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2437 {
2438 	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 0, RELO_BR_GO_OUT);
2439 
2440 	return 0;
2441 }
2442 
2443 static const instr_cb_t instr_cb[256] = {
2444 	[BPF_ALU64 | BPF_MOV | BPF_X] =	mov_reg64,
2445 	[BPF_ALU64 | BPF_MOV | BPF_K] =	mov_imm64,
2446 	[BPF_ALU64 | BPF_XOR | BPF_X] =	xor_reg64,
2447 	[BPF_ALU64 | BPF_XOR | BPF_K] =	xor_imm64,
2448 	[BPF_ALU64 | BPF_AND | BPF_X] =	and_reg64,
2449 	[BPF_ALU64 | BPF_AND | BPF_K] =	and_imm64,
2450 	[BPF_ALU64 | BPF_OR | BPF_X] =	or_reg64,
2451 	[BPF_ALU64 | BPF_OR | BPF_K] =	or_imm64,
2452 	[BPF_ALU64 | BPF_ADD | BPF_X] =	add_reg64,
2453 	[BPF_ALU64 | BPF_ADD | BPF_K] =	add_imm64,
2454 	[BPF_ALU64 | BPF_SUB | BPF_X] =	sub_reg64,
2455 	[BPF_ALU64 | BPF_SUB | BPF_K] =	sub_imm64,
2456 	[BPF_ALU64 | BPF_NEG] =		neg_reg64,
2457 	[BPF_ALU64 | BPF_LSH | BPF_K] =	shl_imm64,
2458 	[BPF_ALU64 | BPF_RSH | BPF_K] =	shr_imm64,
2459 	[BPF_ALU | BPF_MOV | BPF_X] =	mov_reg,
2460 	[BPF_ALU | BPF_MOV | BPF_K] =	mov_imm,
2461 	[BPF_ALU | BPF_XOR | BPF_X] =	xor_reg,
2462 	[BPF_ALU | BPF_XOR | BPF_K] =	xor_imm,
2463 	[BPF_ALU | BPF_AND | BPF_X] =	and_reg,
2464 	[BPF_ALU | BPF_AND | BPF_K] =	and_imm,
2465 	[BPF_ALU | BPF_OR | BPF_X] =	or_reg,
2466 	[BPF_ALU | BPF_OR | BPF_K] =	or_imm,
2467 	[BPF_ALU | BPF_ADD | BPF_X] =	add_reg,
2468 	[BPF_ALU | BPF_ADD | BPF_K] =	add_imm,
2469 	[BPF_ALU | BPF_SUB | BPF_X] =	sub_reg,
2470 	[BPF_ALU | BPF_SUB | BPF_K] =	sub_imm,
2471 	[BPF_ALU | BPF_NEG] =		neg_reg,
2472 	[BPF_ALU | BPF_LSH | BPF_K] =	shl_imm,
2473 	[BPF_ALU | BPF_END | BPF_X] =	end_reg32,
2474 	[BPF_LD | BPF_IMM | BPF_DW] =	imm_ld8,
2475 	[BPF_LD | BPF_ABS | BPF_B] =	data_ld1,
2476 	[BPF_LD | BPF_ABS | BPF_H] =	data_ld2,
2477 	[BPF_LD | BPF_ABS | BPF_W] =	data_ld4,
2478 	[BPF_LD | BPF_IND | BPF_B] =	data_ind_ld1,
2479 	[BPF_LD | BPF_IND | BPF_H] =	data_ind_ld2,
2480 	[BPF_LD | BPF_IND | BPF_W] =	data_ind_ld4,
2481 	[BPF_LDX | BPF_MEM | BPF_B] =	mem_ldx1,
2482 	[BPF_LDX | BPF_MEM | BPF_H] =	mem_ldx2,
2483 	[BPF_LDX | BPF_MEM | BPF_W] =	mem_ldx4,
2484 	[BPF_LDX | BPF_MEM | BPF_DW] =	mem_ldx8,
2485 	[BPF_STX | BPF_MEM | BPF_B] =	mem_stx1,
2486 	[BPF_STX | BPF_MEM | BPF_H] =	mem_stx2,
2487 	[BPF_STX | BPF_MEM | BPF_W] =	mem_stx4,
2488 	[BPF_STX | BPF_MEM | BPF_DW] =	mem_stx8,
2489 	[BPF_STX | BPF_XADD | BPF_W] =	mem_xadd4,
2490 	[BPF_STX | BPF_XADD | BPF_DW] =	mem_xadd8,
2491 	[BPF_ST | BPF_MEM | BPF_B] =	mem_st1,
2492 	[BPF_ST | BPF_MEM | BPF_H] =	mem_st2,
2493 	[BPF_ST | BPF_MEM | BPF_W] =	mem_st4,
2494 	[BPF_ST | BPF_MEM | BPF_DW] =	mem_st8,
2495 	[BPF_JMP | BPF_JA | BPF_K] =	jump,
2496 	[BPF_JMP | BPF_JEQ | BPF_K] =	jeq_imm,
2497 	[BPF_JMP | BPF_JGT | BPF_K] =	cmp_imm,
2498 	[BPF_JMP | BPF_JGE | BPF_K] =	cmp_imm,
2499 	[BPF_JMP | BPF_JLT | BPF_K] =	cmp_imm,
2500 	[BPF_JMP | BPF_JLE | BPF_K] =	cmp_imm,
2501 	[BPF_JMP | BPF_JSGT | BPF_K] =  cmp_imm,
2502 	[BPF_JMP | BPF_JSGE | BPF_K] =  cmp_imm,
2503 	[BPF_JMP | BPF_JSLT | BPF_K] =  cmp_imm,
2504 	[BPF_JMP | BPF_JSLE | BPF_K] =  cmp_imm,
2505 	[BPF_JMP | BPF_JSET | BPF_K] =	jset_imm,
2506 	[BPF_JMP | BPF_JNE | BPF_K] =	jne_imm,
2507 	[BPF_JMP | BPF_JEQ | BPF_X] =	jeq_reg,
2508 	[BPF_JMP | BPF_JGT | BPF_X] =	cmp_reg,
2509 	[BPF_JMP | BPF_JGE | BPF_X] =	cmp_reg,
2510 	[BPF_JMP | BPF_JLT | BPF_X] =	cmp_reg,
2511 	[BPF_JMP | BPF_JLE | BPF_X] =	cmp_reg,
2512 	[BPF_JMP | BPF_JSGT | BPF_X] =  cmp_reg,
2513 	[BPF_JMP | BPF_JSGE | BPF_X] =  cmp_reg,
2514 	[BPF_JMP | BPF_JSLT | BPF_X] =  cmp_reg,
2515 	[BPF_JMP | BPF_JSLE | BPF_X] =  cmp_reg,
2516 	[BPF_JMP | BPF_JSET | BPF_X] =	jset_reg,
2517 	[BPF_JMP | BPF_JNE | BPF_X] =	jne_reg,
2518 	[BPF_JMP | BPF_CALL] =		call,
2519 	[BPF_JMP | BPF_EXIT] =		goto_out,
2520 };
2521 
2522 /* --- Assembler logic --- */
2523 static int nfp_fixup_branches(struct nfp_prog *nfp_prog)
2524 {
2525 	struct nfp_insn_meta *meta, *jmp_dst;
2526 	u32 idx, br_idx;
2527 
2528 	list_for_each_entry(meta, &nfp_prog->insns, l) {
2529 		if (meta->skip)
2530 			continue;
2531 		if (meta->insn.code == (BPF_JMP | BPF_CALL))
2532 			continue;
2533 		if (BPF_CLASS(meta->insn.code) != BPF_JMP)
2534 			continue;
2535 
2536 		if (list_is_last(&meta->l, &nfp_prog->insns))
2537 			br_idx = nfp_prog->last_bpf_off;
2538 		else
2539 			br_idx = list_next_entry(meta, l)->off - 1;
2540 
2541 		if (!nfp_is_br(nfp_prog->prog[br_idx])) {
2542 			pr_err("Fixup found block not ending in branch %d %02x %016llx!!\n",
2543 			       br_idx, meta->insn.code, nfp_prog->prog[br_idx]);
2544 			return -ELOOP;
2545 		}
2546 		/* Leave special branches for later */
2547 		if (FIELD_GET(OP_RELO_TYPE, nfp_prog->prog[br_idx]) !=
2548 		    RELO_BR_REL)
2549 			continue;
2550 
2551 		if (!meta->jmp_dst) {
2552 			pr_err("Non-exit jump doesn't have destination info recorded!!\n");
2553 			return -ELOOP;
2554 		}
2555 
2556 		jmp_dst = meta->jmp_dst;
2557 
2558 		if (jmp_dst->skip) {
2559 			pr_err("Branch landing on removed instruction!!\n");
2560 			return -ELOOP;
2561 		}
2562 
2563 		for (idx = meta->off; idx <= br_idx; idx++) {
2564 			if (!nfp_is_br(nfp_prog->prog[idx]))
2565 				continue;
2566 			br_set_offset(&nfp_prog->prog[idx], jmp_dst->off);
2567 		}
2568 	}
2569 
2570 	return 0;
2571 }
2572 
2573 static void nfp_intro(struct nfp_prog *nfp_prog)
2574 {
2575 	wrp_immed(nfp_prog, plen_reg(nfp_prog), GENMASK(13, 0));
2576 	emit_alu(nfp_prog, plen_reg(nfp_prog),
2577 		 plen_reg(nfp_prog), ALU_OP_AND, pv_len(nfp_prog));
2578 }
2579 
2580 static void nfp_outro_tc_da(struct nfp_prog *nfp_prog)
2581 {
2582 	/* TC direct-action mode:
2583 	 *   0,1   ok        NOT SUPPORTED[1]
2584 	 *   2   drop  0x22 -> drop,  count as stat1
2585 	 *   4,5 nuke  0x02 -> drop
2586 	 *   7  redir  0x44 -> redir, count as stat2
2587 	 *   * unspec  0x11 -> pass,  count as stat0
2588 	 *
2589 	 * [1] We can't support OK and RECLASSIFY because we can't tell TC
2590 	 *     the exact decision made.  We are forced to support UNSPEC
2591 	 *     to handle aborts so that's the only one we handle for passing
2592 	 *     packets up the stack.
2593 	 */
2594 	/* Target for aborts */
2595 	nfp_prog->tgt_abort = nfp_prog_current_offset(nfp_prog);
2596 
2597 	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 2, RELO_BR_NEXT_PKT);
2598 
2599 	wrp_mov(nfp_prog, reg_a(0), NFP_BPF_ABI_FLAGS);
2600 	emit_ld_field(nfp_prog, reg_a(0), 0xc, reg_imm(0x11), SHF_SC_L_SHF, 16);
2601 
2602 	/* Target for normal exits */
2603 	nfp_prog->tgt_out = nfp_prog_current_offset(nfp_prog);
2604 
2605 	/* if R0 > 7 jump to abort */
2606 	emit_alu(nfp_prog, reg_none(), reg_imm(7), ALU_OP_SUB, reg_b(0));
2607 	emit_br(nfp_prog, BR_BLO, nfp_prog->tgt_abort, 0);
2608 	wrp_mov(nfp_prog, reg_a(0), NFP_BPF_ABI_FLAGS);
2609 
2610 	wrp_immed(nfp_prog, reg_b(2), 0x41221211);
2611 	wrp_immed(nfp_prog, reg_b(3), 0x41001211);
2612 
2613 	emit_shf(nfp_prog, reg_a(1),
2614 		 reg_none(), SHF_OP_NONE, reg_b(0), SHF_SC_L_SHF, 2);
2615 
2616 	emit_alu(nfp_prog, reg_none(), reg_a(1), ALU_OP_OR, reg_imm(0));
2617 	emit_shf(nfp_prog, reg_a(2),
2618 		 reg_imm(0xf), SHF_OP_AND, reg_b(2), SHF_SC_R_SHF, 0);
2619 
2620 	emit_alu(nfp_prog, reg_none(), reg_a(1), ALU_OP_OR, reg_imm(0));
2621 	emit_shf(nfp_prog, reg_b(2),
2622 		 reg_imm(0xf), SHF_OP_AND, reg_b(3), SHF_SC_R_SHF, 0);
2623 
2624 	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 2, RELO_BR_NEXT_PKT);
2625 
2626 	emit_shf(nfp_prog, reg_b(2),
2627 		 reg_a(2), SHF_OP_OR, reg_b(2), SHF_SC_L_SHF, 4);
2628 	emit_ld_field(nfp_prog, reg_a(0), 0xc, reg_b(2), SHF_SC_L_SHF, 16);
2629 }
2630 
2631 static void nfp_outro_xdp(struct nfp_prog *nfp_prog)
2632 {
2633 	/* XDP return codes:
2634 	 *   0 aborted  0x82 -> drop,  count as stat3
2635 	 *   1    drop  0x22 -> drop,  count as stat1
2636 	 *   2    pass  0x11 -> pass,  count as stat0
2637 	 *   3      tx  0x44 -> redir, count as stat2
2638 	 *   * unknown  0x82 -> drop,  count as stat3
2639 	 */
2640 	/* Target for aborts */
2641 	nfp_prog->tgt_abort = nfp_prog_current_offset(nfp_prog);
2642 
2643 	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 2, RELO_BR_NEXT_PKT);
2644 
2645 	wrp_mov(nfp_prog, reg_a(0), NFP_BPF_ABI_FLAGS);
2646 	emit_ld_field(nfp_prog, reg_a(0), 0xc, reg_imm(0x82), SHF_SC_L_SHF, 16);
2647 
2648 	/* Target for normal exits */
2649 	nfp_prog->tgt_out = nfp_prog_current_offset(nfp_prog);
2650 
2651 	/* if R0 > 3 jump to abort */
2652 	emit_alu(nfp_prog, reg_none(), reg_imm(3), ALU_OP_SUB, reg_b(0));
2653 	emit_br(nfp_prog, BR_BLO, nfp_prog->tgt_abort, 0);
2654 
2655 	wrp_immed(nfp_prog, reg_b(2), 0x44112282);
2656 
2657 	emit_shf(nfp_prog, reg_a(1),
2658 		 reg_none(), SHF_OP_NONE, reg_b(0), SHF_SC_L_SHF, 3);
2659 
2660 	emit_alu(nfp_prog, reg_none(), reg_a(1), ALU_OP_OR, reg_imm(0));
2661 	emit_shf(nfp_prog, reg_b(2),
2662 		 reg_imm(0xff), SHF_OP_AND, reg_b(2), SHF_SC_R_SHF, 0);
2663 
2664 	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 2, RELO_BR_NEXT_PKT);
2665 
2666 	wrp_mov(nfp_prog, reg_a(0), NFP_BPF_ABI_FLAGS);
2667 	emit_ld_field(nfp_prog, reg_a(0), 0xc, reg_b(2), SHF_SC_L_SHF, 16);
2668 }
2669 
2670 static void nfp_outro(struct nfp_prog *nfp_prog)
2671 {
2672 	switch (nfp_prog->type) {
2673 	case BPF_PROG_TYPE_SCHED_CLS:
2674 		nfp_outro_tc_da(nfp_prog);
2675 		break;
2676 	case BPF_PROG_TYPE_XDP:
2677 		nfp_outro_xdp(nfp_prog);
2678 		break;
2679 	default:
2680 		WARN_ON(1);
2681 	}
2682 }
2683 
2684 static int nfp_translate(struct nfp_prog *nfp_prog)
2685 {
2686 	struct nfp_insn_meta *meta;
2687 	int err;
2688 
2689 	nfp_intro(nfp_prog);
2690 	if (nfp_prog->error)
2691 		return nfp_prog->error;
2692 
2693 	list_for_each_entry(meta, &nfp_prog->insns, l) {
2694 		instr_cb_t cb = instr_cb[meta->insn.code];
2695 
2696 		meta->off = nfp_prog_current_offset(nfp_prog);
2697 
2698 		if (meta->skip) {
2699 			nfp_prog->n_translated++;
2700 			continue;
2701 		}
2702 
2703 		if (nfp_meta_has_prev(nfp_prog, meta) &&
2704 		    nfp_meta_prev(meta)->double_cb)
2705 			cb = nfp_meta_prev(meta)->double_cb;
2706 		if (!cb)
2707 			return -ENOENT;
2708 		err = cb(nfp_prog, meta);
2709 		if (err)
2710 			return err;
2711 		if (nfp_prog->error)
2712 			return nfp_prog->error;
2713 
2714 		nfp_prog->n_translated++;
2715 	}
2716 
2717 	nfp_prog->last_bpf_off = nfp_prog_current_offset(nfp_prog) - 1;
2718 
2719 	nfp_outro(nfp_prog);
2720 	if (nfp_prog->error)
2721 		return nfp_prog->error;
2722 
2723 	wrp_nops(nfp_prog, NFP_USTORE_PREFETCH_WINDOW);
2724 	if (nfp_prog->error)
2725 		return nfp_prog->error;
2726 
2727 	return nfp_fixup_branches(nfp_prog);
2728 }
2729 
2730 /* --- Optimizations --- */
2731 static void nfp_bpf_opt_reg_init(struct nfp_prog *nfp_prog)
2732 {
2733 	struct nfp_insn_meta *meta;
2734 
2735 	list_for_each_entry(meta, &nfp_prog->insns, l) {
2736 		struct bpf_insn insn = meta->insn;
2737 
2738 		/* Programs converted from cBPF start with register xoring */
2739 		if (insn.code == (BPF_ALU64 | BPF_XOR | BPF_X) &&
2740 		    insn.src_reg == insn.dst_reg)
2741 			continue;
2742 
2743 		/* Programs start with R6 = R1 but we ignore the skb pointer */
2744 		if (insn.code == (BPF_ALU64 | BPF_MOV | BPF_X) &&
2745 		    insn.src_reg == 1 && insn.dst_reg == 6)
2746 			meta->skip = true;
2747 
2748 		/* Return as soon as something doesn't match */
2749 		if (!meta->skip)
2750 			return;
2751 	}
2752 }
2753 
2754 /* abs(insn.imm) will fit better into unrestricted reg immediate -
2755  * convert add/sub of a negative number into a sub/add of a positive one.
2756  */
2757 static void nfp_bpf_opt_neg_add_sub(struct nfp_prog *nfp_prog)
2758 {
2759 	struct nfp_insn_meta *meta;
2760 
2761 	list_for_each_entry(meta, &nfp_prog->insns, l) {
2762 		struct bpf_insn insn = meta->insn;
2763 
2764 		if (meta->skip)
2765 			continue;
2766 
2767 		if (BPF_CLASS(insn.code) != BPF_ALU &&
2768 		    BPF_CLASS(insn.code) != BPF_ALU64 &&
2769 		    BPF_CLASS(insn.code) != BPF_JMP)
2770 			continue;
2771 		if (BPF_SRC(insn.code) != BPF_K)
2772 			continue;
2773 		if (insn.imm >= 0)
2774 			continue;
2775 
2776 		if (BPF_CLASS(insn.code) == BPF_JMP) {
2777 			switch (BPF_OP(insn.code)) {
2778 			case BPF_JGE:
2779 			case BPF_JSGE:
2780 			case BPF_JLT:
2781 			case BPF_JSLT:
2782 				meta->jump_neg_op = true;
2783 				break;
2784 			default:
2785 				continue;
2786 			}
2787 		} else {
2788 			if (BPF_OP(insn.code) == BPF_ADD)
2789 				insn.code = BPF_CLASS(insn.code) | BPF_SUB;
2790 			else if (BPF_OP(insn.code) == BPF_SUB)
2791 				insn.code = BPF_CLASS(insn.code) | BPF_ADD;
2792 			else
2793 				continue;
2794 
2795 			meta->insn.code = insn.code | BPF_K;
2796 		}
2797 
2798 		meta->insn.imm = -insn.imm;
2799 	}
2800 }
2801 
2802 /* Remove masking after load since our load guarantees this is not needed */
2803 static void nfp_bpf_opt_ld_mask(struct nfp_prog *nfp_prog)
2804 {
2805 	struct nfp_insn_meta *meta1, *meta2;
2806 	const s32 exp_mask[] = {
2807 		[BPF_B] = 0x000000ffU,
2808 		[BPF_H] = 0x0000ffffU,
2809 		[BPF_W] = 0xffffffffU,
2810 	};
2811 
2812 	nfp_for_each_insn_walk2(nfp_prog, meta1, meta2) {
2813 		struct bpf_insn insn, next;
2814 
2815 		insn = meta1->insn;
2816 		next = meta2->insn;
2817 
2818 		if (BPF_CLASS(insn.code) != BPF_LD)
2819 			continue;
2820 		if (BPF_MODE(insn.code) != BPF_ABS &&
2821 		    BPF_MODE(insn.code) != BPF_IND)
2822 			continue;
2823 
2824 		if (next.code != (BPF_ALU64 | BPF_AND | BPF_K))
2825 			continue;
2826 
2827 		if (!exp_mask[BPF_SIZE(insn.code)])
2828 			continue;
2829 		if (exp_mask[BPF_SIZE(insn.code)] != next.imm)
2830 			continue;
2831 
2832 		if (next.src_reg || next.dst_reg)
2833 			continue;
2834 
2835 		if (meta2->flags & FLAG_INSN_IS_JUMP_DST)
2836 			continue;
2837 
2838 		meta2->skip = true;
2839 	}
2840 }
2841 
2842 static void nfp_bpf_opt_ld_shift(struct nfp_prog *nfp_prog)
2843 {
2844 	struct nfp_insn_meta *meta1, *meta2, *meta3;
2845 
2846 	nfp_for_each_insn_walk3(nfp_prog, meta1, meta2, meta3) {
2847 		struct bpf_insn insn, next1, next2;
2848 
2849 		insn = meta1->insn;
2850 		next1 = meta2->insn;
2851 		next2 = meta3->insn;
2852 
2853 		if (BPF_CLASS(insn.code) != BPF_LD)
2854 			continue;
2855 		if (BPF_MODE(insn.code) != BPF_ABS &&
2856 		    BPF_MODE(insn.code) != BPF_IND)
2857 			continue;
2858 		if (BPF_SIZE(insn.code) != BPF_W)
2859 			continue;
2860 
2861 		if (!(next1.code == (BPF_LSH | BPF_K | BPF_ALU64) &&
2862 		      next2.code == (BPF_RSH | BPF_K | BPF_ALU64)) &&
2863 		    !(next1.code == (BPF_RSH | BPF_K | BPF_ALU64) &&
2864 		      next2.code == (BPF_LSH | BPF_K | BPF_ALU64)))
2865 			continue;
2866 
2867 		if (next1.src_reg || next1.dst_reg ||
2868 		    next2.src_reg || next2.dst_reg)
2869 			continue;
2870 
2871 		if (next1.imm != 0x20 || next2.imm != 0x20)
2872 			continue;
2873 
2874 		if (meta2->flags & FLAG_INSN_IS_JUMP_DST ||
2875 		    meta3->flags & FLAG_INSN_IS_JUMP_DST)
2876 			continue;
2877 
2878 		meta2->skip = true;
2879 		meta3->skip = true;
2880 	}
2881 }
2882 
/* A load/store pair that forms a memory copy should look like the following:
 *
 *   ld_width R, [addr_src + offset_src]
 *   st_width [addr_dest + offset_dest], R
 *
 * The destination register of the load and the source register of the store
 * should be the same, and the load and store should also operate at the same
 * width.  If either addr_src or addr_dest is the stack pointer, we don't do
 * the CPP optimization as the stack is modelled by registers on the NFP.
 */
2893 static bool
2894 curr_pair_is_memcpy(struct nfp_insn_meta *ld_meta,
2895 		    struct nfp_insn_meta *st_meta)
2896 {
2897 	struct bpf_insn *ld = &ld_meta->insn;
2898 	struct bpf_insn *st = &st_meta->insn;
2899 
2900 	if (!is_mbpf_load(ld_meta) || !is_mbpf_store(st_meta))
2901 		return false;
2902 
2903 	if (ld_meta->ptr.type != PTR_TO_PACKET)
2904 		return false;
2905 
2906 	if (st_meta->ptr.type != PTR_TO_PACKET)
2907 		return false;
2908 
2909 	if (BPF_SIZE(ld->code) != BPF_SIZE(st->code))
2910 		return false;
2911 
2912 	if (ld->dst_reg != st->src_reg)
2913 		return false;
2914 
	/* There is a jump to the store insn in this pair. */
2916 	if (st_meta->flags & FLAG_INSN_IS_JUMP_DST)
2917 		return false;
2918 
2919 	return true;
2920 }
2921 
2922 /* Currently, we only support chaining load/store pairs if:
2923  *
2924  *  - Their address base registers are the same.
2925  *  - Their address offsets are in the same order.
2926  *  - They operate at the same memory width.
2927  *  - There is no jump into the middle of them.
2928  */
2929 static bool
2930 curr_pair_chain_with_previous(struct nfp_insn_meta *ld_meta,
2931 			      struct nfp_insn_meta *st_meta,
2932 			      struct bpf_insn *prev_ld,
2933 			      struct bpf_insn *prev_st)
2934 {
2935 	u8 prev_size, curr_size, prev_ld_base, prev_st_base, prev_ld_dst;
2936 	struct bpf_insn *ld = &ld_meta->insn;
2937 	struct bpf_insn *st = &st_meta->insn;
2938 	s16 prev_ld_off, prev_st_off;
2939 
2940 	/* This pair is the start pair. */
2941 	if (!prev_ld)
2942 		return true;
2943 
2944 	prev_size = BPF_LDST_BYTES(prev_ld);
2945 	curr_size = BPF_LDST_BYTES(ld);
2946 	prev_ld_base = prev_ld->src_reg;
2947 	prev_st_base = prev_st->dst_reg;
2948 	prev_ld_dst = prev_ld->dst_reg;
2949 	prev_ld_off = prev_ld->off;
2950 	prev_st_off = prev_st->off;
2951 
2952 	if (ld->dst_reg != prev_ld_dst)
2953 		return false;
2954 
2955 	if (ld->src_reg != prev_ld_base || st->dst_reg != prev_st_base)
2956 		return false;
2957 
2958 	if (curr_size != prev_size)
2959 		return false;
2960 
	/* There is a jump to the head of this pair. */
2962 	if (ld_meta->flags & FLAG_INSN_IS_JUMP_DST)
2963 		return false;
2964 
2965 	/* Both in ascending order. */
2966 	if (prev_ld_off + prev_size == ld->off &&
2967 	    prev_st_off + prev_size == st->off)
2968 		return true;
2969 
2970 	/* Both in descending order. */
2971 	if (ld->off + curr_size == prev_ld_off &&
2972 	    st->off + curr_size == prev_st_off)
2973 		return true;
2974 
2975 	return false;
2976 }
2977 
/* Return TRUE if a cross memory access happens.  A cross memory access means
 * the store area overlaps with the load area, so a later load might read the
 * value written by a previous store; in that case we can't treat the sequence
 * as a memory copy.
 */
2983 static bool
2984 cross_mem_access(struct bpf_insn *ld, struct nfp_insn_meta *head_ld_meta,
2985 		 struct nfp_insn_meta *head_st_meta)
2986 {
2987 	s16 head_ld_off, head_st_off, ld_off;
2988 
	/* Different pointer types do not overlap. */
2990 	if (head_ld_meta->ptr.type != head_st_meta->ptr.type)
2991 		return false;
2992 
	/* Load and store are both PTR_TO_PACKET, check the ID info. */
2994 	if (head_ld_meta->ptr.id != head_st_meta->ptr.id)
2995 		return true;
2996 
	/* Canonicalize the offsets.  Express all of them against the
	 * original base register.
	 */
3000 	head_ld_off = head_ld_meta->insn.off + head_ld_meta->ptr.off;
3001 	head_st_off = head_st_meta->insn.off + head_st_meta->ptr.off;
3002 	ld_off = ld->off + head_ld_meta->ptr.off;
3003 
3004 	/* Ascending order cross. */
3005 	if (ld_off > head_ld_off &&
3006 	    head_ld_off < head_st_off && ld_off >= head_st_off)
3007 		return true;
3008 
3009 	/* Descending order cross. */
3010 	if (ld_off < head_ld_off &&
3011 	    head_ld_off > head_st_off && ld_off <= head_st_off)
3012 		return true;
3013 
3014 	return false;
3015 }
3016 
/* This pass tries to identify the following instruction sequences.
3018  *
3019  *   load R, [regA + offA]
3020  *   store [regB + offB], R
3021  *   load R, [regA + offA + const_imm_A]
3022  *   store [regB + offB + const_imm_A], R
3023  *   load R, [regA + offA + 2 * const_imm_A]
3024  *   store [regB + offB + 2 * const_imm_A], R
3025  *   ...
3026  *
 * The above sequence is typically generated by the compiler when lowering
 * memcpy.  The NFP prefers using CPP instructions to accelerate it.
3029  */
3030 static void nfp_bpf_opt_ldst_gather(struct nfp_prog *nfp_prog)
3031 {
3032 	struct nfp_insn_meta *head_ld_meta = NULL;
3033 	struct nfp_insn_meta *head_st_meta = NULL;
3034 	struct nfp_insn_meta *meta1, *meta2;
3035 	struct bpf_insn *prev_ld = NULL;
3036 	struct bpf_insn *prev_st = NULL;
3037 	u8 count = 0;
3038 
3039 	nfp_for_each_insn_walk2(nfp_prog, meta1, meta2) {
3040 		struct bpf_insn *ld = &meta1->insn;
3041 		struct bpf_insn *st = &meta2->insn;
3042 
		/* Reset the record status if any of the following is true:
		 *   - The current insn pair is not load/store.
		 *   - The load/store pair doesn't chain with the previous one.
		 *   - The chained load/store pair crosses the previous pair.
		 *   - The chained load/store pairs have a total memory copy
		 *     size beyond 128 bytes, which is the maximum length a
		 *     single NFP CPP command can transfer.
		 */
3051 		if (!curr_pair_is_memcpy(meta1, meta2) ||
3052 		    !curr_pair_chain_with_previous(meta1, meta2, prev_ld,
3053 						   prev_st) ||
3054 		    (head_ld_meta && (cross_mem_access(ld, head_ld_meta,
3055 						       head_st_meta) ||
3056 				      head_ld_meta->ldst_gather_len >= 128))) {
3057 			if (!count)
3058 				continue;
3059 
3060 			if (count > 1) {
3061 				s16 prev_ld_off = prev_ld->off;
3062 				s16 prev_st_off = prev_st->off;
3063 				s16 head_ld_off = head_ld_meta->insn.off;
3064 
3065 				if (prev_ld_off < head_ld_off) {
3066 					head_ld_meta->insn.off = prev_ld_off;
3067 					head_st_meta->insn.off = prev_st_off;
3068 					head_ld_meta->ldst_gather_len =
3069 						-head_ld_meta->ldst_gather_len;
3070 				}
3071 
3072 				head_ld_meta->paired_st = &head_st_meta->insn;
3073 				head_st_meta->skip = true;
3074 			} else {
3075 				head_ld_meta->ldst_gather_len = 0;
3076 			}
3077 
			/* If the chain is ended by a load/store pair then it
			 * could serve as the new head of the next chain.
			 */
3081 			if (curr_pair_is_memcpy(meta1, meta2)) {
3082 				head_ld_meta = meta1;
3083 				head_st_meta = meta2;
3084 				head_ld_meta->ldst_gather_len =
3085 					BPF_LDST_BYTES(ld);
3086 				meta1 = nfp_meta_next(meta1);
3087 				meta2 = nfp_meta_next(meta2);
3088 				prev_ld = ld;
3089 				prev_st = st;
3090 				count = 1;
3091 			} else {
3092 				head_ld_meta = NULL;
3093 				head_st_meta = NULL;
3094 				prev_ld = NULL;
3095 				prev_st = NULL;
3096 				count = 0;
3097 			}
3098 
3099 			continue;
3100 		}
3101 
3102 		if (!head_ld_meta) {
3103 			head_ld_meta = meta1;
3104 			head_st_meta = meta2;
3105 		} else {
3106 			meta1->skip = true;
3107 			meta2->skip = true;
3108 		}
3109 
3110 		head_ld_meta->ldst_gather_len += BPF_LDST_BYTES(ld);
3111 		meta1 = nfp_meta_next(meta1);
3112 		meta2 = nfp_meta_next(meta2);
3113 		prev_ld = ld;
3114 		prev_st = st;
3115 		count++;
3116 	}
3117 }
3118 
3119 static void nfp_bpf_opt_pkt_cache(struct nfp_prog *nfp_prog)
3120 {
3121 	struct nfp_insn_meta *meta, *range_node = NULL;
3122 	s16 range_start = 0, range_end = 0;
3123 	bool cache_avail = false;
3124 	struct bpf_insn *insn;
3125 	s32 range_ptr_off = 0;
3126 	u32 range_ptr_id = 0;
3127 
3128 	list_for_each_entry(meta, &nfp_prog->insns, l) {
3129 		if (meta->flags & FLAG_INSN_IS_JUMP_DST)
3130 			cache_avail = false;
3131 
3132 		if (meta->skip)
3133 			continue;
3134 
3135 		insn = &meta->insn;
3136 
3137 		if (is_mbpf_store_pkt(meta) ||
3138 		    insn->code == (BPF_JMP | BPF_CALL) ||
3139 		    is_mbpf_classic_store_pkt(meta) ||
3140 		    is_mbpf_classic_load(meta)) {
3141 			cache_avail = false;
3142 			continue;
3143 		}
3144 
3145 		if (!is_mbpf_load(meta))
3146 			continue;
3147 
3148 		if (meta->ptr.type != PTR_TO_PACKET || meta->ldst_gather_len) {
3149 			cache_avail = false;
3150 			continue;
3151 		}
3152 
3153 		if (!cache_avail) {
3154 			cache_avail = true;
3155 			if (range_node)
3156 				goto end_current_then_start_new;
3157 			goto start_new;
3158 		}
3159 
3160 		/* Check ID to make sure two reads share the same
3161 		 * variable offset against PTR_TO_PACKET, and check OFF
3162 		 * to make sure they also share the same constant
3163 		 * offset.
3164 		 *
		 * OFFs don't strictly need to be the same; since they
		 * are constant offsets against PTR_TO_PACKET, different
		 * OFFs could be canonicalized into offsets against the
		 * original packet pointer.  We don't support this yet.
3170 		 */
3171 		if (meta->ptr.id == range_ptr_id &&
3172 		    meta->ptr.off == range_ptr_off) {
3173 			s16 new_start = range_start;
3174 			s16 end, off = insn->off;
3175 			s16 new_end = range_end;
3176 			bool changed = false;
3177 
3178 			if (off < range_start) {
3179 				new_start = off;
3180 				changed = true;
3181 			}
3182 
3183 			end = off + BPF_LDST_BYTES(insn);
3184 			if (end > range_end) {
3185 				new_end = end;
3186 				changed = true;
3187 			}
3188 
3189 			if (!changed)
3190 				continue;
3191 
3192 			if (new_end - new_start <= 64) {
3193 				/* Install new range. */
3194 				range_start = new_start;
3195 				range_end = new_end;
3196 				continue;
3197 			}
3198 		}
3199 
3200 end_current_then_start_new:
3201 		range_node->pkt_cache.range_start = range_start;
3202 		range_node->pkt_cache.range_end = range_end;
3203 start_new:
3204 		range_node = meta;
3205 		range_node->pkt_cache.do_init = true;
3206 		range_ptr_id = range_node->ptr.id;
3207 		range_ptr_off = range_node->ptr.off;
3208 		range_start = insn->off;
3209 		range_end = insn->off + BPF_LDST_BYTES(insn);
3210 	}
3211 
3212 	if (range_node) {
3213 		range_node->pkt_cache.range_start = range_start;
3214 		range_node->pkt_cache.range_end = range_end;
3215 	}
3216 
3217 	list_for_each_entry(meta, &nfp_prog->insns, l) {
3218 		if (meta->skip)
3219 			continue;
3220 
3221 		if (is_mbpf_load_pkt(meta) && !meta->ldst_gather_len) {
3222 			if (meta->pkt_cache.do_init) {
3223 				range_start = meta->pkt_cache.range_start;
3224 				range_end = meta->pkt_cache.range_end;
3225 			} else {
3226 				meta->pkt_cache.range_start = range_start;
3227 				meta->pkt_cache.range_end = range_end;
3228 			}
3229 		}
3230 	}
3231 }
3232 
3233 static int nfp_bpf_optimize(struct nfp_prog *nfp_prog)
3234 {
3235 	nfp_bpf_opt_reg_init(nfp_prog);
3236 
3237 	nfp_bpf_opt_neg_add_sub(nfp_prog);
3238 	nfp_bpf_opt_ld_mask(nfp_prog);
3239 	nfp_bpf_opt_ld_shift(nfp_prog);
3240 	nfp_bpf_opt_ldst_gather(nfp_prog);
3241 	nfp_bpf_opt_pkt_cache(nfp_prog);
3242 
3243 	return 0;
3244 }
3245 
3246 static int nfp_bpf_replace_map_ptrs(struct nfp_prog *nfp_prog)
3247 {
3248 	struct nfp_insn_meta *meta1, *meta2;
3249 	struct nfp_bpf_map *nfp_map;
3250 	struct bpf_map *map;
3251 
3252 	nfp_for_each_insn_walk2(nfp_prog, meta1, meta2) {
3253 		if (meta1->skip || meta2->skip)
3254 			continue;
3255 
3256 		if (meta1->insn.code != (BPF_LD | BPF_IMM | BPF_DW) ||
3257 		    meta1->insn.src_reg != BPF_PSEUDO_MAP_FD)
3258 			continue;
3259 
3260 		map = (void *)(unsigned long)((u32)meta1->insn.imm |
3261 					      (u64)meta2->insn.imm << 32);
3262 		if (bpf_map_offload_neutral(map))
3263 			continue;
3264 		nfp_map = map_to_offmap(map)->dev_priv;
3265 
3266 		meta1->insn.imm = nfp_map->tid;
3267 		meta2->insn.imm = 0;
3268 	}
3269 
3270 	return 0;
3271 }
3272 
3273 static int nfp_bpf_ustore_calc(u64 *prog, unsigned int len)
3274 {
3275 	__le64 *ustore = (__force __le64 *)prog;
3276 	int i;
3277 
3278 	for (i = 0; i < len; i++) {
3279 		int err;
3280 
3281 		err = nfp_ustore_check_valid_no_ecc(prog[i]);
3282 		if (err)
3283 			return err;
3284 
3285 		ustore[i] = cpu_to_le64(nfp_ustore_calc_ecc_insn(prog[i]));
3286 	}
3287 
3288 	return 0;
3289 }
3290 
3291 static void nfp_bpf_prog_trim(struct nfp_prog *nfp_prog)
3292 {
3293 	void *prog;
3294 
3295 	prog = kvmalloc_array(nfp_prog->prog_len, sizeof(u64), GFP_KERNEL);
3296 	if (!prog)
3297 		return;
3298 
3299 	nfp_prog->__prog_alloc_len = nfp_prog->prog_len * sizeof(u64);
3300 	memcpy(prog, nfp_prog->prog, nfp_prog->__prog_alloc_len);
3301 	kvfree(nfp_prog->prog);
3302 	nfp_prog->prog = prog;
3303 }
3304 
3305 int nfp_bpf_jit(struct nfp_prog *nfp_prog)
3306 {
3307 	int ret;
3308 
3309 	ret = nfp_bpf_replace_map_ptrs(nfp_prog);
3310 	if (ret)
3311 		return ret;
3312 
3313 	ret = nfp_bpf_optimize(nfp_prog);
3314 	if (ret)
3315 		return ret;
3316 
3317 	ret = nfp_translate(nfp_prog);
3318 	if (ret) {
3319 		pr_err("Translation failed with error %d (translated: %u)\n",
3320 		       ret, nfp_prog->n_translated);
3321 		return -EINVAL;
3322 	}
3323 
3324 	nfp_bpf_prog_trim(nfp_prog);
3325 
3326 	return ret;
3327 }
3328 
3329 void nfp_bpf_jit_prepare(struct nfp_prog *nfp_prog, unsigned int cnt)
3330 {
3331 	struct nfp_insn_meta *meta;
3332 
3333 	/* Another pass to record jump information. */
3334 	list_for_each_entry(meta, &nfp_prog->insns, l) {
3335 		u64 code = meta->insn.code;
3336 
3337 		if (BPF_CLASS(code) == BPF_JMP && BPF_OP(code) != BPF_EXIT &&
3338 		    BPF_OP(code) != BPF_CALL) {
3339 			struct nfp_insn_meta *dst_meta;
3340 			unsigned short dst_indx;
3341 
3342 			dst_indx = meta->n + 1 + meta->insn.off;
3343 			dst_meta = nfp_bpf_goto_meta(nfp_prog, meta, dst_indx,
3344 						     cnt);
3345 
3346 			meta->jmp_dst = dst_meta;
3347 			dst_meta->flags |= FLAG_INSN_IS_JUMP_DST;
3348 		}
3349 	}
3350 }
3351 
3352 bool nfp_bpf_supported_opcode(u8 code)
3353 {
3354 	return !!instr_cb[code];
3355 }
3356 
3357 void *nfp_bpf_relo_for_vnic(struct nfp_prog *nfp_prog, struct nfp_bpf_vnic *bv)
3358 {
3359 	unsigned int i;
3360 	u64 *prog;
3361 	int err;
3362 
3363 	prog = kmemdup(nfp_prog->prog, nfp_prog->prog_len * sizeof(u64),
3364 		       GFP_KERNEL);
3365 	if (!prog)
3366 		return ERR_PTR(-ENOMEM);
3367 
3368 	for (i = 0; i < nfp_prog->prog_len; i++) {
3369 		enum nfp_relo_type special;
3370 		u32 val;
3371 
3372 		special = FIELD_GET(OP_RELO_TYPE, prog[i]);
3373 		switch (special) {
3374 		case RELO_NONE:
3375 			continue;
3376 		case RELO_BR_REL:
3377 			br_add_offset(&prog[i], bv->start_off);
3378 			break;
3379 		case RELO_BR_GO_OUT:
3380 			br_set_offset(&prog[i],
3381 				      nfp_prog->tgt_out + bv->start_off);
3382 			break;
3383 		case RELO_BR_GO_ABORT:
3384 			br_set_offset(&prog[i],
3385 				      nfp_prog->tgt_abort + bv->start_off);
3386 			break;
3387 		case RELO_BR_NEXT_PKT:
3388 			br_set_offset(&prog[i], bv->tgt_done);
3389 			break;
3390 		case RELO_BR_HELPER:
3391 			val = br_get_offset(prog[i]);
3392 			val -= BR_OFF_RELO;
3393 			switch (val) {
3394 			case BPF_FUNC_map_lookup_elem:
3395 				val = nfp_prog->bpf->helpers.map_lookup;
3396 				break;
3397 			case BPF_FUNC_map_update_elem:
3398 				val = nfp_prog->bpf->helpers.map_update;
3399 				break;
3400 			case BPF_FUNC_map_delete_elem:
3401 				val = nfp_prog->bpf->helpers.map_delete;
3402 				break;
3403 			case BPF_FUNC_perf_event_output:
3404 				val = nfp_prog->bpf->helpers.perf_event_output;
3405 				break;
3406 			default:
3407 				pr_err("relocation of unknown helper %d\n",
3408 				       val);
3409 				err = -EINVAL;
3410 				goto err_free_prog;
3411 			}
3412 			br_set_offset(&prog[i], val);
3413 			break;
3414 		case RELO_IMMED_REL:
3415 			immed_add_value(&prog[i], bv->start_off);
3416 			break;
3417 		}
3418 
3419 		prog[i] &= ~OP_RELO_TYPE;
3420 	}
3421 
3422 	err = nfp_bpf_ustore_calc(prog, nfp_prog->prog_len);
3423 	if (err)
3424 		goto err_free_prog;
3425 
3426 	return prog;
3427 
3428 err_free_prog:
3429 	kfree(prog);
3430 	return ERR_PTR(err);
3431 }
3432