1 /*
2  * Copyright (C) 2016-2018 Netronome Systems, Inc.
3  *
4  * This software is dual licensed under the GNU General License Version 2,
5  * June 1991 as shown in the file COPYING in the top-level directory of this
6  * source tree or the BSD 2-Clause License provided below.  You have the
7  * option to license this software under the complete terms of either license.
8  *
9  * The BSD 2-Clause License:
10  *
11  *     Redistribution and use in source and binary forms, with or
12  *     without modification, are permitted provided that the following
13  *     conditions are met:
14  *
15  *      1. Redistributions of source code must retain the above
16  *         copyright notice, this list of conditions and the following
17  *         disclaimer.
18  *
19  *      2. Redistributions in binary form must reproduce the above
20  *         copyright notice, this list of conditions and the following
21  *         disclaimer in the documentation and/or other materials
22  *         provided with the distribution.
23  *
24  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
27  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
28  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
29  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
30  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
31  * SOFTWARE.
32  */
33 
34 #define pr_fmt(fmt)	"NFP net bpf: " fmt
35 
36 #include <linux/bug.h>
37 #include <linux/bpf.h>
38 #include <linux/filter.h>
39 #include <linux/kernel.h>
40 #include <linux/pkt_cls.h>
41 #include <linux/reciprocal_div.h>
42 #include <linux/unistd.h>
43 
44 #include "main.h"
45 #include "../nfp_asm.h"
46 #include "../nfp_net_ctrl.h"
47 
48 /* --- NFP prog --- */
/* The for-each "multiple entries" macros provide pos and next<n> pointers.
 * It's safe to modify the next pointers (but not pos).
 */
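/* A minimal usage sketch (meta1 and meta2 are illustrative names):
 *
 *	nfp_for_each_insn_walk2(nfp_prog, meta1, meta2) {
 *		... inspect the adjacent meta1/meta2 instruction pair ...
 *	}
 */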
52 #define nfp_for_each_insn_walk2(nfp_prog, pos, next)			\
53 	for (pos = list_first_entry(&(nfp_prog)->insns, typeof(*pos), l), \
54 	     next = list_next_entry(pos, l);			\
55 	     &(nfp_prog)->insns != &pos->l &&			\
56 	     &(nfp_prog)->insns != &next->l;			\
57 	     pos = nfp_meta_next(pos),				\
58 	     next = nfp_meta_next(pos))
59 
60 #define nfp_for_each_insn_walk3(nfp_prog, pos, next, next2)		\
61 	for (pos = list_first_entry(&(nfp_prog)->insns, typeof(*pos), l), \
62 	     next = list_next_entry(pos, l),			\
63 	     next2 = list_next_entry(next, l);			\
64 	     &(nfp_prog)->insns != &pos->l &&			\
65 	     &(nfp_prog)->insns != &next->l &&			\
66 	     &(nfp_prog)->insns != &next2->l;			\
67 	     pos = nfp_meta_next(pos),				\
68 	     next = nfp_meta_next(pos),				\
69 	     next2 = nfp_meta_next(next))
70 
71 static bool
72 nfp_meta_has_prev(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
73 {
74 	return meta->l.prev != &nfp_prog->insns;
75 }
76 
77 static void nfp_prog_push(struct nfp_prog *nfp_prog, u64 insn)
78 {
79 	if (nfp_prog->__prog_alloc_len / sizeof(u64) == nfp_prog->prog_len) {
80 		pr_warn("instruction limit reached (%u NFP instructions)\n",
81 			nfp_prog->prog_len);
82 		nfp_prog->error = -ENOSPC;
83 		return;
84 	}
85 
86 	nfp_prog->prog[nfp_prog->prog_len] = insn;
87 	nfp_prog->prog_len++;
88 }
89 
90 static unsigned int nfp_prog_current_offset(struct nfp_prog *nfp_prog)
91 {
92 	return nfp_prog->prog_len;
93 }
94 
95 static bool
96 nfp_prog_confirm_current_offset(struct nfp_prog *nfp_prog, unsigned int off)
97 {
	/* If there is a recorded error we may have dropped instructions;
	 * that doesn't have to be due to a translator bug, and the translation
	 * will fail anyway, so just return OK.
	 */
102 	if (nfp_prog->error)
103 		return true;
104 	return !WARN_ON_ONCE(nfp_prog_current_offset(nfp_prog) != off);
105 }
106 
107 /* --- Emitters --- */
108 static void
109 __emit_cmd(struct nfp_prog *nfp_prog, enum cmd_tgt_map op,
110 	   u8 mode, u8 xfer, u8 areg, u8 breg, u8 size, enum cmd_ctx_swap ctx,
111 	   bool indir)
112 {
113 	u64 insn;
114 
115 	insn =	FIELD_PREP(OP_CMD_A_SRC, areg) |
116 		FIELD_PREP(OP_CMD_CTX, ctx) |
117 		FIELD_PREP(OP_CMD_B_SRC, breg) |
118 		FIELD_PREP(OP_CMD_TOKEN, cmd_tgt_act[op].token) |
119 		FIELD_PREP(OP_CMD_XFER, xfer) |
120 		FIELD_PREP(OP_CMD_CNT, size) |
121 		FIELD_PREP(OP_CMD_SIG, ctx != CMD_CTX_NO_SWAP) |
122 		FIELD_PREP(OP_CMD_TGT_CMD, cmd_tgt_act[op].tgt_cmd) |
123 		FIELD_PREP(OP_CMD_INDIR, indir) |
124 		FIELD_PREP(OP_CMD_MODE, mode);
125 
126 	nfp_prog_push(nfp_prog, insn);
127 }
128 
129 static void
130 emit_cmd_any(struct nfp_prog *nfp_prog, enum cmd_tgt_map op, u8 mode, u8 xfer,
131 	     swreg lreg, swreg rreg, u8 size, enum cmd_ctx_swap ctx, bool indir)
132 {
133 	struct nfp_insn_re_regs reg;
134 	int err;
135 
136 	err = swreg_to_restricted(reg_none(), lreg, rreg, &reg, false);
137 	if (err) {
138 		nfp_prog->error = err;
139 		return;
140 	}
141 	if (reg.swap) {
142 		pr_err("cmd can't swap arguments\n");
143 		nfp_prog->error = -EFAULT;
144 		return;
145 	}
146 	if (reg.dst_lmextn || reg.src_lmextn) {
147 		pr_err("cmd can't use LMextn\n");
148 		nfp_prog->error = -EFAULT;
149 		return;
150 	}
151 
152 	__emit_cmd(nfp_prog, op, mode, xfer, reg.areg, reg.breg, size, ctx,
153 		   indir);
154 }
155 
156 static void
157 emit_cmd(struct nfp_prog *nfp_prog, enum cmd_tgt_map op, u8 mode, u8 xfer,
158 	 swreg lreg, swreg rreg, u8 size, enum cmd_ctx_swap ctx)
159 {
160 	emit_cmd_any(nfp_prog, op, mode, xfer, lreg, rreg, size, ctx, false);
161 }
162 
163 static void
164 emit_cmd_indir(struct nfp_prog *nfp_prog, enum cmd_tgt_map op, u8 mode, u8 xfer,
165 	       swreg lreg, swreg rreg, u8 size, enum cmd_ctx_swap ctx)
166 {
167 	emit_cmd_any(nfp_prog, op, mode, xfer, lreg, rreg, size, ctx, true);
168 }
169 
170 static void
171 __emit_br(struct nfp_prog *nfp_prog, enum br_mask mask, enum br_ev_pip ev_pip,
172 	  enum br_ctx_signal_state css, u16 addr, u8 defer)
173 {
174 	u16 addr_lo, addr_hi;
175 	u64 insn;
176 
177 	addr_lo = addr & (OP_BR_ADDR_LO >> __bf_shf(OP_BR_ADDR_LO));
178 	addr_hi = addr != addr_lo;
179 
180 	insn = OP_BR_BASE |
181 		FIELD_PREP(OP_BR_MASK, mask) |
182 		FIELD_PREP(OP_BR_EV_PIP, ev_pip) |
183 		FIELD_PREP(OP_BR_CSS, css) |
184 		FIELD_PREP(OP_BR_DEFBR, defer) |
185 		FIELD_PREP(OP_BR_ADDR_LO, addr_lo) |
186 		FIELD_PREP(OP_BR_ADDR_HI, addr_hi);
187 
188 	nfp_prog_push(nfp_prog, insn);
189 }
190 
191 static void
192 emit_br_relo(struct nfp_prog *nfp_prog, enum br_mask mask, u16 addr, u8 defer,
193 	     enum nfp_relo_type relo)
194 {
195 	if (mask == BR_UNC && defer > 2) {
196 		pr_err("BUG: branch defer out of bounds %d\n", defer);
197 		nfp_prog->error = -EFAULT;
198 		return;
199 	}
200 
201 	__emit_br(nfp_prog, mask,
202 		  mask != BR_UNC ? BR_EV_PIP_COND : BR_EV_PIP_UNCOND,
203 		  BR_CSS_NONE, addr, defer);
204 
205 	nfp_prog->prog[nfp_prog->prog_len - 1] |=
206 		FIELD_PREP(OP_RELO_TYPE, relo);
207 }
208 
209 static void
210 emit_br(struct nfp_prog *nfp_prog, enum br_mask mask, u16 addr, u8 defer)
211 {
212 	emit_br_relo(nfp_prog, mask, addr, defer, RELO_BR_REL);
213 }
214 
215 static void
216 __emit_br_bit(struct nfp_prog *nfp_prog, u16 areg, u16 breg, u16 addr, u8 defer,
217 	      bool set, bool src_lmextn)
218 {
219 	u16 addr_lo, addr_hi;
220 	u64 insn;
221 
222 	addr_lo = addr & (OP_BR_BIT_ADDR_LO >> __bf_shf(OP_BR_BIT_ADDR_LO));
223 	addr_hi = addr != addr_lo;
224 
225 	insn = OP_BR_BIT_BASE |
226 		FIELD_PREP(OP_BR_BIT_A_SRC, areg) |
227 		FIELD_PREP(OP_BR_BIT_B_SRC, breg) |
228 		FIELD_PREP(OP_BR_BIT_BV, set) |
229 		FIELD_PREP(OP_BR_BIT_DEFBR, defer) |
230 		FIELD_PREP(OP_BR_BIT_ADDR_LO, addr_lo) |
231 		FIELD_PREP(OP_BR_BIT_ADDR_HI, addr_hi) |
232 		FIELD_PREP(OP_BR_BIT_SRC_LMEXTN, src_lmextn);
233 
234 	nfp_prog_push(nfp_prog, insn);
235 }
236 
237 static void
238 emit_br_bit_relo(struct nfp_prog *nfp_prog, swreg src, u8 bit, u16 addr,
239 		 u8 defer, bool set, enum nfp_relo_type relo)
240 {
241 	struct nfp_insn_re_regs reg;
242 	int err;
243 
	/* NOTE: The bit to test is specified as a rotation amount, such that
	 *	 the bit to test will be placed on the MSB of the result when
	 *	 doing a rotate right.  For bit X, we need a right rotate of X + 1.
	 */
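	/* For example, testing bit 0 uses a right rotate of 1, which moves
	 * bit 0 into the MSB; testing bit 31 uses a rotate of 32, i.e. no
	 * rotation, because bit 31 already is the MSB.
	 */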
248 	bit += 1;
249 
250 	err = swreg_to_restricted(reg_none(), src, reg_imm(bit), &reg, false);
251 	if (err) {
252 		nfp_prog->error = err;
253 		return;
254 	}
255 
256 	__emit_br_bit(nfp_prog, reg.areg, reg.breg, addr, defer, set,
257 		      reg.src_lmextn);
258 
259 	nfp_prog->prog[nfp_prog->prog_len - 1] |=
260 		FIELD_PREP(OP_RELO_TYPE, relo);
261 }
262 
263 static void
264 emit_br_bset(struct nfp_prog *nfp_prog, swreg src, u8 bit, u16 addr, u8 defer)
265 {
266 	emit_br_bit_relo(nfp_prog, src, bit, addr, defer, true, RELO_BR_REL);
267 }
268 
269 static void
270 __emit_br_alu(struct nfp_prog *nfp_prog, u16 areg, u16 breg, u16 imm_hi,
271 	      u8 defer, bool dst_lmextn, bool src_lmextn)
272 {
273 	u64 insn;
274 
275 	insn = OP_BR_ALU_BASE |
276 		FIELD_PREP(OP_BR_ALU_A_SRC, areg) |
277 		FIELD_PREP(OP_BR_ALU_B_SRC, breg) |
278 		FIELD_PREP(OP_BR_ALU_DEFBR, defer) |
279 		FIELD_PREP(OP_BR_ALU_IMM_HI, imm_hi) |
280 		FIELD_PREP(OP_BR_ALU_SRC_LMEXTN, src_lmextn) |
281 		FIELD_PREP(OP_BR_ALU_DST_LMEXTN, dst_lmextn);
282 
283 	nfp_prog_push(nfp_prog, insn);
284 }
285 
286 static void emit_rtn(struct nfp_prog *nfp_prog, swreg base, u8 defer)
287 {
288 	struct nfp_insn_ur_regs reg;
289 	int err;
290 
291 	err = swreg_to_unrestricted(reg_none(), base, reg_imm(0), &reg);
292 	if (err) {
293 		nfp_prog->error = err;
294 		return;
295 	}
296 
297 	__emit_br_alu(nfp_prog, reg.areg, reg.breg, 0, defer, reg.dst_lmextn,
298 		      reg.src_lmextn);
299 }
300 
301 static void
302 __emit_immed(struct nfp_prog *nfp_prog, u16 areg, u16 breg, u16 imm_hi,
303 	     enum immed_width width, bool invert,
304 	     enum immed_shift shift, bool wr_both,
305 	     bool dst_lmextn, bool src_lmextn)
306 {
307 	u64 insn;
308 
309 	insn = OP_IMMED_BASE |
310 		FIELD_PREP(OP_IMMED_A_SRC, areg) |
311 		FIELD_PREP(OP_IMMED_B_SRC, breg) |
312 		FIELD_PREP(OP_IMMED_IMM, imm_hi) |
313 		FIELD_PREP(OP_IMMED_WIDTH, width) |
314 		FIELD_PREP(OP_IMMED_INV, invert) |
315 		FIELD_PREP(OP_IMMED_SHIFT, shift) |
316 		FIELD_PREP(OP_IMMED_WR_AB, wr_both) |
317 		FIELD_PREP(OP_IMMED_SRC_LMEXTN, src_lmextn) |
318 		FIELD_PREP(OP_IMMED_DST_LMEXTN, dst_lmextn);
319 
320 	nfp_prog_push(nfp_prog, insn);
321 }
322 
323 static void
324 emit_immed(struct nfp_prog *nfp_prog, swreg dst, u16 imm,
325 	   enum immed_width width, bool invert, enum immed_shift shift)
326 {
327 	struct nfp_insn_ur_regs reg;
328 	int err;
329 
330 	if (swreg_type(dst) == NN_REG_IMM) {
331 		nfp_prog->error = -EFAULT;
332 		return;
333 	}
334 
335 	err = swreg_to_unrestricted(dst, dst, reg_imm(imm & 0xff), &reg);
336 	if (err) {
337 		nfp_prog->error = err;
338 		return;
339 	}
340 
341 	/* Use reg.dst when destination is No-Dest. */
342 	__emit_immed(nfp_prog,
343 		     swreg_type(dst) == NN_REG_NONE ? reg.dst : reg.areg,
344 		     reg.breg, imm >> 8, width, invert, shift,
345 		     reg.wr_both, reg.dst_lmextn, reg.src_lmextn);
346 }
347 
348 static void
349 __emit_shf(struct nfp_prog *nfp_prog, u16 dst, enum alu_dst_ab dst_ab,
350 	   enum shf_sc sc, u8 shift,
351 	   u16 areg, enum shf_op op, u16 breg, bool i8, bool sw, bool wr_both,
352 	   bool dst_lmextn, bool src_lmextn)
353 {
354 	u64 insn;
355 
356 	if (!FIELD_FIT(OP_SHF_SHIFT, shift)) {
357 		nfp_prog->error = -EFAULT;
358 		return;
359 	}
360 
361 	if (sc == SHF_SC_L_SHF)
362 		shift = 32 - shift;
363 
364 	insn = OP_SHF_BASE |
365 		FIELD_PREP(OP_SHF_A_SRC, areg) |
366 		FIELD_PREP(OP_SHF_SC, sc) |
367 		FIELD_PREP(OP_SHF_B_SRC, breg) |
368 		FIELD_PREP(OP_SHF_I8, i8) |
369 		FIELD_PREP(OP_SHF_SW, sw) |
370 		FIELD_PREP(OP_SHF_DST, dst) |
371 		FIELD_PREP(OP_SHF_SHIFT, shift) |
372 		FIELD_PREP(OP_SHF_OP, op) |
373 		FIELD_PREP(OP_SHF_DST_AB, dst_ab) |
374 		FIELD_PREP(OP_SHF_WR_AB, wr_both) |
375 		FIELD_PREP(OP_SHF_SRC_LMEXTN, src_lmextn) |
376 		FIELD_PREP(OP_SHF_DST_LMEXTN, dst_lmextn);
377 
378 	nfp_prog_push(nfp_prog, insn);
379 }
380 
381 static void
382 emit_shf(struct nfp_prog *nfp_prog, swreg dst,
383 	 swreg lreg, enum shf_op op, swreg rreg, enum shf_sc sc, u8 shift)
384 {
385 	struct nfp_insn_re_regs reg;
386 	int err;
387 
388 	err = swreg_to_restricted(dst, lreg, rreg, &reg, true);
389 	if (err) {
390 		nfp_prog->error = err;
391 		return;
392 	}
393 
394 	__emit_shf(nfp_prog, reg.dst, reg.dst_ab, sc, shift,
395 		   reg.areg, op, reg.breg, reg.i8, reg.swap, reg.wr_both,
396 		   reg.dst_lmextn, reg.src_lmextn);
397 }
398 
399 static void
400 emit_shf_indir(struct nfp_prog *nfp_prog, swreg dst,
401 	       swreg lreg, enum shf_op op, swreg rreg, enum shf_sc sc)
402 {
403 	if (sc == SHF_SC_R_ROT) {
404 		pr_err("indirect shift is not allowed on rotation\n");
405 		nfp_prog->error = -EFAULT;
406 		return;
407 	}
408 
409 	emit_shf(nfp_prog, dst, lreg, op, rreg, sc, 0);
410 }
411 
412 static void
413 __emit_alu(struct nfp_prog *nfp_prog, u16 dst, enum alu_dst_ab dst_ab,
414 	   u16 areg, enum alu_op op, u16 breg, bool swap, bool wr_both,
415 	   bool dst_lmextn, bool src_lmextn)
416 {
417 	u64 insn;
418 
419 	insn = OP_ALU_BASE |
420 		FIELD_PREP(OP_ALU_A_SRC, areg) |
421 		FIELD_PREP(OP_ALU_B_SRC, breg) |
422 		FIELD_PREP(OP_ALU_DST, dst) |
423 		FIELD_PREP(OP_ALU_SW, swap) |
424 		FIELD_PREP(OP_ALU_OP, op) |
425 		FIELD_PREP(OP_ALU_DST_AB, dst_ab) |
426 		FIELD_PREP(OP_ALU_WR_AB, wr_both) |
427 		FIELD_PREP(OP_ALU_SRC_LMEXTN, src_lmextn) |
428 		FIELD_PREP(OP_ALU_DST_LMEXTN, dst_lmextn);
429 
430 	nfp_prog_push(nfp_prog, insn);
431 }
432 
433 static void
434 emit_alu(struct nfp_prog *nfp_prog, swreg dst,
435 	 swreg lreg, enum alu_op op, swreg rreg)
436 {
437 	struct nfp_insn_ur_regs reg;
438 	int err;
439 
440 	err = swreg_to_unrestricted(dst, lreg, rreg, &reg);
441 	if (err) {
442 		nfp_prog->error = err;
443 		return;
444 	}
445 
446 	__emit_alu(nfp_prog, reg.dst, reg.dst_ab,
447 		   reg.areg, op, reg.breg, reg.swap, reg.wr_both,
448 		   reg.dst_lmextn, reg.src_lmextn);
449 }
450 
451 static void
452 __emit_mul(struct nfp_prog *nfp_prog, enum alu_dst_ab dst_ab, u16 areg,
453 	   enum mul_type type, enum mul_step step, u16 breg, bool swap,
454 	   bool wr_both, bool dst_lmextn, bool src_lmextn)
455 {
456 	u64 insn;
457 
458 	insn = OP_MUL_BASE |
459 		FIELD_PREP(OP_MUL_A_SRC, areg) |
460 		FIELD_PREP(OP_MUL_B_SRC, breg) |
461 		FIELD_PREP(OP_MUL_STEP, step) |
462 		FIELD_PREP(OP_MUL_DST_AB, dst_ab) |
463 		FIELD_PREP(OP_MUL_SW, swap) |
464 		FIELD_PREP(OP_MUL_TYPE, type) |
465 		FIELD_PREP(OP_MUL_WR_AB, wr_both) |
466 		FIELD_PREP(OP_MUL_SRC_LMEXTN, src_lmextn) |
467 		FIELD_PREP(OP_MUL_DST_LMEXTN, dst_lmextn);
468 
469 	nfp_prog_push(nfp_prog, insn);
470 }
471 
472 static void
473 emit_mul(struct nfp_prog *nfp_prog, swreg lreg, enum mul_type type,
474 	 enum mul_step step, swreg rreg)
475 {
476 	struct nfp_insn_ur_regs reg;
477 	u16 areg;
478 	int err;
479 
480 	if (type == MUL_TYPE_START && step != MUL_STEP_NONE) {
481 		nfp_prog->error = -EINVAL;
482 		return;
483 	}
484 
485 	if (step == MUL_LAST || step == MUL_LAST_2) {
		/* When the step is MUL_LAST or MUL_LAST_2, the left source is
		 * used as the destination.
		 */
489 		err = swreg_to_unrestricted(lreg, reg_none(), rreg, &reg);
490 		areg = reg.dst;
491 	} else {
492 		err = swreg_to_unrestricted(reg_none(), lreg, rreg, &reg);
493 		areg = reg.areg;
494 	}
495 
496 	if (err) {
497 		nfp_prog->error = err;
498 		return;
499 	}
500 
501 	__emit_mul(nfp_prog, reg.dst_ab, areg, type, step, reg.breg, reg.swap,
502 		   reg.wr_both, reg.dst_lmextn, reg.src_lmextn);
503 }
504 
505 static void
506 __emit_ld_field(struct nfp_prog *nfp_prog, enum shf_sc sc,
507 		u8 areg, u8 bmask, u8 breg, u8 shift, bool imm8,
508 		bool zero, bool swap, bool wr_both,
509 		bool dst_lmextn, bool src_lmextn)
510 {
511 	u64 insn;
512 
513 	insn = OP_LDF_BASE |
514 		FIELD_PREP(OP_LDF_A_SRC, areg) |
515 		FIELD_PREP(OP_LDF_SC, sc) |
516 		FIELD_PREP(OP_LDF_B_SRC, breg) |
517 		FIELD_PREP(OP_LDF_I8, imm8) |
518 		FIELD_PREP(OP_LDF_SW, swap) |
519 		FIELD_PREP(OP_LDF_ZF, zero) |
520 		FIELD_PREP(OP_LDF_BMASK, bmask) |
521 		FIELD_PREP(OP_LDF_SHF, shift) |
522 		FIELD_PREP(OP_LDF_WR_AB, wr_both) |
523 		FIELD_PREP(OP_LDF_SRC_LMEXTN, src_lmextn) |
524 		FIELD_PREP(OP_LDF_DST_LMEXTN, dst_lmextn);
525 
526 	nfp_prog_push(nfp_prog, insn);
527 }
528 
529 static void
530 emit_ld_field_any(struct nfp_prog *nfp_prog, swreg dst, u8 bmask, swreg src,
531 		  enum shf_sc sc, u8 shift, bool zero)
532 {
533 	struct nfp_insn_re_regs reg;
534 	int err;
535 
536 	/* Note: ld_field is special as it uses one of the src regs as dst */
537 	err = swreg_to_restricted(dst, dst, src, &reg, true);
538 	if (err) {
539 		nfp_prog->error = err;
540 		return;
541 	}
542 
543 	__emit_ld_field(nfp_prog, sc, reg.areg, bmask, reg.breg, shift,
544 			reg.i8, zero, reg.swap, reg.wr_both,
545 			reg.dst_lmextn, reg.src_lmextn);
546 }
547 
548 static void
549 emit_ld_field(struct nfp_prog *nfp_prog, swreg dst, u8 bmask, swreg src,
550 	      enum shf_sc sc, u8 shift)
551 {
552 	emit_ld_field_any(nfp_prog, dst, bmask, src, sc, shift, false);
553 }
554 
555 static void
556 __emit_lcsr(struct nfp_prog *nfp_prog, u16 areg, u16 breg, bool wr, u16 addr,
557 	    bool dst_lmextn, bool src_lmextn)
558 {
559 	u64 insn;
560 
561 	insn = OP_LCSR_BASE |
562 		FIELD_PREP(OP_LCSR_A_SRC, areg) |
563 		FIELD_PREP(OP_LCSR_B_SRC, breg) |
564 		FIELD_PREP(OP_LCSR_WRITE, wr) |
565 		FIELD_PREP(OP_LCSR_ADDR, addr / 4) |
566 		FIELD_PREP(OP_LCSR_SRC_LMEXTN, src_lmextn) |
567 		FIELD_PREP(OP_LCSR_DST_LMEXTN, dst_lmextn);
568 
569 	nfp_prog_push(nfp_prog, insn);
570 }
571 
572 static void emit_csr_wr(struct nfp_prog *nfp_prog, swreg src, u16 addr)
573 {
574 	struct nfp_insn_ur_regs reg;
575 	int err;
576 
	/* This instruction takes immeds instead of reg_none() for the ignored
	 * operand, but we can't encode 2 immeds in one instr with our normal
	 * swreg infra, so if the parameter is an immed we encode it as
	 * reg_none() and copy the immed to both operands.
	 */
582 	if (swreg_type(src) == NN_REG_IMM) {
583 		err = swreg_to_unrestricted(reg_none(), src, reg_none(), &reg);
584 		reg.breg = reg.areg;
585 	} else {
586 		err = swreg_to_unrestricted(reg_none(), src, reg_imm(0), &reg);
587 	}
588 	if (err) {
589 		nfp_prog->error = err;
590 		return;
591 	}
592 
593 	__emit_lcsr(nfp_prog, reg.areg, reg.breg, true, addr,
594 		    false, reg.src_lmextn);
595 }
596 
597 /* CSR value is read in following immed[gpr, 0] */
598 static void __emit_csr_rd(struct nfp_prog *nfp_prog, u16 addr)
599 {
600 	__emit_lcsr(nfp_prog, 0, 0, false, addr, false, false);
601 }
602 
603 static void emit_nop(struct nfp_prog *nfp_prog)
604 {
605 	__emit_immed(nfp_prog, UR_REG_IMM, UR_REG_IMM, 0, 0, 0, 0, 0, 0, 0);
606 }
607 
608 /* --- Wrappers --- */
609 static bool pack_immed(u32 imm, u16 *val, enum immed_shift *shift)
610 {
611 	if (!(imm & 0xffff0000)) {
612 		*val = imm;
613 		*shift = IMMED_SHIFT_0B;
614 	} else if (!(imm & 0xff0000ff)) {
615 		*val = imm >> 8;
616 		*shift = IMMED_SHIFT_1B;
617 	} else if (!(imm & 0x0000ffff)) {
618 		*val = imm >> 16;
619 		*shift = IMMED_SHIFT_2B;
620 	} else {
621 		return false;
622 	}
623 
624 	return true;
625 }
626 
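/* For example, 0x0000beef packs as val 0xbeef with IMMED_SHIFT_0B and
 * 0x00abcd00 packs as val 0xabcd with IMMED_SHIFT_1B, while 0x12345678
 * cannot be packed and wrp_immed() below falls back to two instructions.
 */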
627 static void wrp_immed(struct nfp_prog *nfp_prog, swreg dst, u32 imm)
628 {
629 	enum immed_shift shift;
630 	u16 val;
631 
632 	if (pack_immed(imm, &val, &shift)) {
633 		emit_immed(nfp_prog, dst, val, IMMED_WIDTH_ALL, false, shift);
634 	} else if (pack_immed(~imm, &val, &shift)) {
635 		emit_immed(nfp_prog, dst, val, IMMED_WIDTH_ALL, true, shift);
636 	} else {
637 		emit_immed(nfp_prog, dst, imm & 0xffff, IMMED_WIDTH_ALL,
638 			   false, IMMED_SHIFT_0B);
639 		emit_immed(nfp_prog, dst, imm >> 16, IMMED_WIDTH_WORD,
640 			   false, IMMED_SHIFT_2B);
641 	}
642 }
643 
644 static void
645 wrp_immed_relo(struct nfp_prog *nfp_prog, swreg dst, u32 imm,
646 	       enum nfp_relo_type relo)
647 {
648 	if (imm > 0xffff) {
649 		pr_err("relocation of a large immediate!\n");
650 		nfp_prog->error = -EFAULT;
651 		return;
652 	}
653 	emit_immed(nfp_prog, dst, imm, IMMED_WIDTH_ALL, false, IMMED_SHIFT_0B);
654 
655 	nfp_prog->prog[nfp_prog->prog_len - 1] |=
656 		FIELD_PREP(OP_RELO_TYPE, relo);
657 }
658 
/* ur_load_imm_any() - encode immediate or use tmp register (unrestricted)
 * If the @imm is small enough encode it directly in the operand and return,
 * otherwise load @imm to a spare register and return its encoding.
 */
663 static swreg ur_load_imm_any(struct nfp_prog *nfp_prog, u32 imm, swreg tmp_reg)
664 {
665 	if (FIELD_FIT(UR_REG_IMM_MAX, imm))
666 		return reg_imm(imm);
667 
668 	wrp_immed(nfp_prog, tmp_reg, imm);
669 	return tmp_reg;
670 }
671 
/* re_load_imm_any() - encode immediate or use tmp register (restricted)
 * If the @imm is small enough encode it directly in the operand and return,
 * otherwise load @imm to a spare register and return its encoding.
 */
676 static swreg re_load_imm_any(struct nfp_prog *nfp_prog, u32 imm, swreg tmp_reg)
677 {
678 	if (FIELD_FIT(RE_REG_IMM_MAX, imm))
679 		return reg_imm(imm);
680 
681 	wrp_immed(nfp_prog, tmp_reg, imm);
682 	return tmp_reg;
683 }
684 
685 static void wrp_nops(struct nfp_prog *nfp_prog, unsigned int count)
686 {
687 	while (count--)
688 		emit_nop(nfp_prog);
689 }
690 
691 static void wrp_mov(struct nfp_prog *nfp_prog, swreg dst, swreg src)
692 {
693 	emit_alu(nfp_prog, dst, reg_none(), ALU_OP_NONE, src);
694 }
695 
696 static void wrp_reg_mov(struct nfp_prog *nfp_prog, u16 dst, u16 src)
697 {
698 	wrp_mov(nfp_prog, reg_both(dst), reg_b(src));
699 }
700 
701 /* wrp_reg_subpart() - load @field_len bytes from @offset of @src, write the
702  * result to @dst from low end.
703  */
704 static void
705 wrp_reg_subpart(struct nfp_prog *nfp_prog, swreg dst, swreg src, u8 field_len,
706 		u8 offset)
707 {
708 	enum shf_sc sc = offset ? SHF_SC_R_SHF : SHF_SC_NONE;
709 	u8 mask = (1 << field_len) - 1;
710 
711 	emit_ld_field_any(nfp_prog, dst, mask, src, sc, offset * 8, true);
712 }
713 
/* wrp_reg_or_subpart() - load @field_len bytes from the low end of @src and
 * OR the result into @dst at @offset, leaving the other bits of @dst
 * unchanged.
 */
717 static void
718 wrp_reg_or_subpart(struct nfp_prog *nfp_prog, swreg dst, swreg src,
719 		   u8 field_len, u8 offset)
720 {
721 	enum shf_sc sc = offset ? SHF_SC_L_SHF : SHF_SC_NONE;
722 	u8 mask = ((1 << field_len) - 1) << offset;
723 
724 	emit_ld_field(nfp_prog, dst, mask, src, sc, 32 - offset * 8);
725 }
726 
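/* addr40_offset() - add @offset to the 40-bit address held in the GPR pair
 * starting at @src_gpr (low 32 bits in @src_gpr, high bits in @src_gpr + 1),
 * propagating the carry into the high word.
 */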
727 static void
728 addr40_offset(struct nfp_prog *nfp_prog, u8 src_gpr, swreg offset,
729 	      swreg *rega, swreg *regb)
730 {
731 	if (offset == reg_imm(0)) {
732 		*rega = reg_a(src_gpr);
733 		*regb = reg_b(src_gpr + 1);
734 		return;
735 	}
736 
737 	emit_alu(nfp_prog, imm_a(nfp_prog), reg_a(src_gpr), ALU_OP_ADD, offset);
738 	emit_alu(nfp_prog, imm_b(nfp_prog), reg_b(src_gpr + 1), ALU_OP_ADD_C,
739 		 reg_imm(0));
740 	*rega = imm_a(nfp_prog);
741 	*regb = imm_b(nfp_prog);
742 }
743 
/* The NFP Command Push Pull (CPP) bus supports bulk memory operations. */
745 static int nfp_cpp_memcpy(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
746 {
747 	bool descending_seq = meta->ldst_gather_len < 0;
748 	s16 len = abs(meta->ldst_gather_len);
749 	swreg src_base, off;
750 	bool src_40bit_addr;
751 	unsigned int i;
752 	u8 xfer_num;
753 
754 	off = re_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog));
755 	src_40bit_addr = meta->ptr.type == PTR_TO_MAP_VALUE;
756 	src_base = reg_a(meta->insn.src_reg * 2);
757 	xfer_num = round_up(len, 4) / 4;
758 
759 	if (src_40bit_addr)
760 		addr40_offset(nfp_prog, meta->insn.src_reg * 2, off, &src_base,
761 			      &off);
762 
	/* Set up PREV_ALU fields to override the memory read length. */
764 	if (len > 32)
765 		wrp_immed(nfp_prog, reg_none(),
766 			  CMD_OVE_LEN | FIELD_PREP(CMD_OV_LEN, xfer_num - 1));
767 
768 	/* Memory read from source addr into transfer-in registers. */
769 	emit_cmd_any(nfp_prog, CMD_TGT_READ32_SWAP,
770 		     src_40bit_addr ? CMD_MODE_40b_BA : CMD_MODE_32b, 0,
771 		     src_base, off, xfer_num - 1, CMD_CTX_SWAP, len > 32);
772 
773 	/* Move from transfer-in to transfer-out. */
774 	for (i = 0; i < xfer_num; i++)
775 		wrp_mov(nfp_prog, reg_xfer(i), reg_xfer(i));
776 
777 	off = re_load_imm_any(nfp_prog, meta->paired_st->off, imm_b(nfp_prog));
778 
779 	if (len <= 8) {
780 		/* Use single direct_ref write8. */
781 		emit_cmd(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 0,
782 			 reg_a(meta->paired_st->dst_reg * 2), off, len - 1,
783 			 CMD_CTX_SWAP);
784 	} else if (len <= 32 && IS_ALIGNED(len, 4)) {
785 		/* Use single direct_ref write32. */
786 		emit_cmd(nfp_prog, CMD_TGT_WRITE32_SWAP, CMD_MODE_32b, 0,
787 			 reg_a(meta->paired_st->dst_reg * 2), off, xfer_num - 1,
788 			 CMD_CTX_SWAP);
789 	} else if (len <= 32) {
790 		/* Use single indirect_ref write8. */
791 		wrp_immed(nfp_prog, reg_none(),
792 			  CMD_OVE_LEN | FIELD_PREP(CMD_OV_LEN, len - 1));
793 		emit_cmd_indir(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 0,
794 			       reg_a(meta->paired_st->dst_reg * 2), off,
795 			       len - 1, CMD_CTX_SWAP);
796 	} else if (IS_ALIGNED(len, 4)) {
797 		/* Use single indirect_ref write32. */
798 		wrp_immed(nfp_prog, reg_none(),
799 			  CMD_OVE_LEN | FIELD_PREP(CMD_OV_LEN, xfer_num - 1));
800 		emit_cmd_indir(nfp_prog, CMD_TGT_WRITE32_SWAP, CMD_MODE_32b, 0,
801 			       reg_a(meta->paired_st->dst_reg * 2), off,
802 			       xfer_num - 1, CMD_CTX_SWAP);
803 	} else if (len <= 40) {
		/* Use one direct_ref write32 to write the first 32 bytes, then
		 * another direct_ref write8 to write the remaining bytes.
		 */
807 		emit_cmd(nfp_prog, CMD_TGT_WRITE32_SWAP, CMD_MODE_32b, 0,
808 			 reg_a(meta->paired_st->dst_reg * 2), off, 7,
809 			 CMD_CTX_SWAP);
810 
811 		off = re_load_imm_any(nfp_prog, meta->paired_st->off + 32,
812 				      imm_b(nfp_prog));
813 		emit_cmd(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 8,
814 			 reg_a(meta->paired_st->dst_reg * 2), off, len - 33,
815 			 CMD_CTX_SWAP);
816 	} else {
		/* Use one indirect_ref write32 to write the 4-byte aligned part,
		 * then another direct_ref write8 to write the remaining bytes.
		 */
820 		u8 new_off;
821 
822 		wrp_immed(nfp_prog, reg_none(),
823 			  CMD_OVE_LEN | FIELD_PREP(CMD_OV_LEN, xfer_num - 2));
824 		emit_cmd_indir(nfp_prog, CMD_TGT_WRITE32_SWAP, CMD_MODE_32b, 0,
825 			       reg_a(meta->paired_st->dst_reg * 2), off,
826 			       xfer_num - 2, CMD_CTX_SWAP);
827 		new_off = meta->paired_st->off + (xfer_num - 1) * 4;
828 		off = re_load_imm_any(nfp_prog, new_off, imm_b(nfp_prog));
829 		emit_cmd(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b,
830 			 xfer_num - 1, reg_a(meta->paired_st->dst_reg * 2), off,
831 			 (len & 0x3) - 1, CMD_CTX_SWAP);
832 	}
833 
	/* TODO: The following extra load is to make sure data flow is identical
	 *  before and after we do the memory copy optimization.
	 *
	 *  The load destination register is not guaranteed to be dead, so we
	 *  need to make sure it is loaded with the same value as before
	 *  this transformation.
	 *
	 *  These extra loads could be removed once we have accurate register
	 *  usage information.
	 */
844 	if (descending_seq)
845 		xfer_num = 0;
846 	else if (BPF_SIZE(meta->insn.code) != BPF_DW)
847 		xfer_num = xfer_num - 1;
848 	else
849 		xfer_num = xfer_num - 2;
850 
851 	switch (BPF_SIZE(meta->insn.code)) {
852 	case BPF_B:
853 		wrp_reg_subpart(nfp_prog, reg_both(meta->insn.dst_reg * 2),
854 				reg_xfer(xfer_num), 1,
855 				IS_ALIGNED(len, 4) ? 3 : (len & 3) - 1);
856 		break;
857 	case BPF_H:
858 		wrp_reg_subpart(nfp_prog, reg_both(meta->insn.dst_reg * 2),
859 				reg_xfer(xfer_num), 2, (len & 3) ^ 2);
860 		break;
861 	case BPF_W:
862 		wrp_mov(nfp_prog, reg_both(meta->insn.dst_reg * 2),
863 			reg_xfer(0));
864 		break;
865 	case BPF_DW:
866 		wrp_mov(nfp_prog, reg_both(meta->insn.dst_reg * 2),
867 			reg_xfer(xfer_num));
868 		wrp_mov(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1),
869 			reg_xfer(xfer_num + 1));
870 		break;
871 	}
872 
873 	if (BPF_SIZE(meta->insn.code) != BPF_DW)
874 		wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1), 0);
875 
876 	return 0;
877 }
878 
879 static int
880 data_ld(struct nfp_prog *nfp_prog, swreg offset, u8 dst_gpr, int size)
881 {
882 	unsigned int i;
883 	u16 shift, sz;
884 
885 	/* We load the value from the address indicated in @offset and then
886 	 * shift out the data we don't need.  Note: this is big endian!
887 	 */
888 	sz = max(size, 4);
889 	shift = size < 4 ? 4 - size : 0;
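	/* e.g. for a 2 byte load we read 4 bytes and shift the xfer register
	 * right by 16 bits, so the 2 requested bytes end up in the low half
	 * of the destination register.
	 */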
890 
891 	emit_cmd(nfp_prog, CMD_TGT_READ8, CMD_MODE_32b, 0,
892 		 pptr_reg(nfp_prog), offset, sz - 1, CMD_CTX_SWAP);
893 
894 	i = 0;
895 	if (shift)
896 		emit_shf(nfp_prog, reg_both(dst_gpr), reg_none(), SHF_OP_NONE,
897 			 reg_xfer(0), SHF_SC_R_SHF, shift * 8);
898 	else
899 		for (; i * 4 < size; i++)
900 			wrp_mov(nfp_prog, reg_both(dst_gpr + i), reg_xfer(i));
901 
902 	if (i < 2)
903 		wrp_immed(nfp_prog, reg_both(dst_gpr + 1), 0);
904 
905 	return 0;
906 }
907 
908 static int
909 data_ld_host_order(struct nfp_prog *nfp_prog, u8 dst_gpr,
910 		   swreg lreg, swreg rreg, int size, enum cmd_mode mode)
911 {
912 	unsigned int i;
913 	u8 mask, sz;
914 
915 	/* We load the value from the address indicated in rreg + lreg and then
916 	 * mask out the data we don't need.  Note: this is little endian!
917 	 */
918 	sz = max(size, 4);
919 	mask = size < 4 ? GENMASK(size - 1, 0) : 0;
920 
921 	emit_cmd(nfp_prog, CMD_TGT_READ32_SWAP, mode, 0,
922 		 lreg, rreg, sz / 4 - 1, CMD_CTX_SWAP);
923 
924 	i = 0;
925 	if (mask)
926 		emit_ld_field_any(nfp_prog, reg_both(dst_gpr), mask,
927 				  reg_xfer(0), SHF_SC_NONE, 0, true);
928 	else
929 		for (; i * 4 < size; i++)
930 			wrp_mov(nfp_prog, reg_both(dst_gpr + i), reg_xfer(i));
931 
932 	if (i < 2)
933 		wrp_immed(nfp_prog, reg_both(dst_gpr + 1), 0);
934 
935 	return 0;
936 }
937 
938 static int
939 data_ld_host_order_addr32(struct nfp_prog *nfp_prog, u8 src_gpr, swreg offset,
940 			  u8 dst_gpr, u8 size)
941 {
942 	return data_ld_host_order(nfp_prog, dst_gpr, reg_a(src_gpr), offset,
943 				  size, CMD_MODE_32b);
944 }
945 
946 static int
947 data_ld_host_order_addr40(struct nfp_prog *nfp_prog, u8 src_gpr, swreg offset,
948 			  u8 dst_gpr, u8 size)
949 {
950 	swreg rega, regb;
951 
952 	addr40_offset(nfp_prog, src_gpr, offset, &rega, &regb);
953 
954 	return data_ld_host_order(nfp_prog, dst_gpr, rega, regb,
955 				  size, CMD_MODE_40b_BA);
956 }
957 
958 static int
959 construct_data_ind_ld(struct nfp_prog *nfp_prog, u16 offset, u16 src, u8 size)
960 {
961 	swreg tmp_reg;
962 
963 	/* Calculate the true offset (src_reg + imm) */
964 	tmp_reg = ur_load_imm_any(nfp_prog, offset, imm_b(nfp_prog));
965 	emit_alu(nfp_prog, imm_both(nfp_prog), reg_a(src), ALU_OP_ADD, tmp_reg);
966 
967 	/* Check packet length (size guaranteed to fit b/c it's u8) */
968 	emit_alu(nfp_prog, imm_a(nfp_prog),
969 		 imm_a(nfp_prog), ALU_OP_ADD, reg_imm(size));
970 	emit_alu(nfp_prog, reg_none(),
971 		 plen_reg(nfp_prog), ALU_OP_SUB, imm_a(nfp_prog));
972 	emit_br_relo(nfp_prog, BR_BLO, BR_OFF_RELO, 0, RELO_BR_GO_ABORT);
973 
974 	/* Load data */
975 	return data_ld(nfp_prog, imm_b(nfp_prog), 0, size);
976 }
977 
978 static int construct_data_ld(struct nfp_prog *nfp_prog, u16 offset, u8 size)
979 {
980 	swreg tmp_reg;
981 
982 	/* Check packet length */
983 	tmp_reg = ur_load_imm_any(nfp_prog, offset + size, imm_a(nfp_prog));
984 	emit_alu(nfp_prog, reg_none(), plen_reg(nfp_prog), ALU_OP_SUB, tmp_reg);
985 	emit_br_relo(nfp_prog, BR_BLO, BR_OFF_RELO, 0, RELO_BR_GO_ABORT);
986 
987 	/* Load data */
988 	tmp_reg = re_load_imm_any(nfp_prog, offset, imm_b(nfp_prog));
989 	return data_ld(nfp_prog, tmp_reg, 0, size);
990 }
991 
992 static int
993 data_stx_host_order(struct nfp_prog *nfp_prog, u8 dst_gpr, swreg offset,
994 		    u8 src_gpr, u8 size)
995 {
996 	unsigned int i;
997 
998 	for (i = 0; i * 4 < size; i++)
999 		wrp_mov(nfp_prog, reg_xfer(i), reg_a(src_gpr + i));
1000 
1001 	emit_cmd(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 0,
1002 		 reg_a(dst_gpr), offset, size - 1, CMD_CTX_SWAP);
1003 
1004 	return 0;
1005 }
1006 
1007 static int
1008 data_st_host_order(struct nfp_prog *nfp_prog, u8 dst_gpr, swreg offset,
1009 		   u64 imm, u8 size)
1010 {
1011 	wrp_immed(nfp_prog, reg_xfer(0), imm);
1012 	if (size == 8)
1013 		wrp_immed(nfp_prog, reg_xfer(1), imm >> 32);
1014 
1015 	emit_cmd(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 0,
1016 		 reg_a(dst_gpr), offset, size - 1, CMD_CTX_SWAP);
1017 
1018 	return 0;
1019 }
1020 
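/* lmem_step - callback invoked by mem_op_stack() below for every slice (at
 * most 4 bytes, never crossing a word boundary) of a stack access;
 * wrp_lmem_load() and wrp_lmem_store() provide the load and store flavours.
 */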
1021 typedef int
1022 (*lmem_step)(struct nfp_prog *nfp_prog, u8 gpr, u8 gpr_byte, s32 off,
1023 	     unsigned int size, bool first, bool new_gpr, bool last, bool lm3,
1024 	     bool needs_inc);
1025 
1026 static int
1027 wrp_lmem_load(struct nfp_prog *nfp_prog, u8 dst, u8 dst_byte, s32 off,
1028 	      unsigned int size, bool first, bool new_gpr, bool last, bool lm3,
1029 	      bool needs_inc)
1030 {
1031 	bool should_inc = needs_inc && new_gpr && !last;
1032 	u32 idx, src_byte;
1033 	enum shf_sc sc;
1034 	swreg reg;
1035 	int shf;
1036 	u8 mask;
1037 
1038 	if (WARN_ON_ONCE(dst_byte + size > 4 || off % 4 + size > 4))
1039 		return -EOPNOTSUPP;
1040 
1041 	idx = off / 4;
1042 
1043 	/* Move the entire word */
1044 	if (size == 4) {
1045 		wrp_mov(nfp_prog, reg_both(dst),
1046 			should_inc ? reg_lm_inc(3) : reg_lm(lm3 ? 3 : 0, idx));
1047 		return 0;
1048 	}
1049 
1050 	if (WARN_ON_ONCE(lm3 && idx > RE_REG_LM_IDX_MAX))
1051 		return -EOPNOTSUPP;
1052 
1053 	src_byte = off % 4;
1054 
1055 	mask = (1 << size) - 1;
1056 	mask <<= dst_byte;
1057 
1058 	if (WARN_ON_ONCE(mask > 0xf))
1059 		return -EOPNOTSUPP;
1060 
1061 	shf = abs(src_byte - dst_byte) * 8;
1062 	if (src_byte == dst_byte) {
1063 		sc = SHF_SC_NONE;
1064 	} else if (src_byte < dst_byte) {
1065 		shf = 32 - shf;
1066 		sc = SHF_SC_L_SHF;
1067 	} else {
1068 		sc = SHF_SC_R_SHF;
1069 	}
1070 
	/* ld_field can address fewer indexes, if offset is too large do RMW.
	 * Because we RMW twice we waste 2 cycles on unaligned 8 byte writes.
	 */
1074 	if (idx <= RE_REG_LM_IDX_MAX) {
1075 		reg = reg_lm(lm3 ? 3 : 0, idx);
1076 	} else {
1077 		reg = imm_a(nfp_prog);
		/* If it's not the first part of the load and we start a new GPR
		 * that means we are loading a second part of the LMEM word into
		 * a new GPR.  IOW we've already looked at that LMEM word and
		 * therefore it has been loaded into imm_a().
		 */
1083 		if (first || !new_gpr)
1084 			wrp_mov(nfp_prog, reg, reg_lm(0, idx));
1085 	}
1086 
1087 	emit_ld_field_any(nfp_prog, reg_both(dst), mask, reg, sc, shf, new_gpr);
1088 
1089 	if (should_inc)
1090 		wrp_mov(nfp_prog, reg_none(), reg_lm_inc(3));
1091 
1092 	return 0;
1093 }
1094 
1095 static int
1096 wrp_lmem_store(struct nfp_prog *nfp_prog, u8 src, u8 src_byte, s32 off,
1097 	       unsigned int size, bool first, bool new_gpr, bool last, bool lm3,
1098 	       bool needs_inc)
1099 {
1100 	bool should_inc = needs_inc && new_gpr && !last;
1101 	u32 idx, dst_byte;
1102 	enum shf_sc sc;
1103 	swreg reg;
1104 	int shf;
1105 	u8 mask;
1106 
1107 	if (WARN_ON_ONCE(src_byte + size > 4 || off % 4 + size > 4))
1108 		return -EOPNOTSUPP;
1109 
1110 	idx = off / 4;
1111 
1112 	/* Move the entire word */
1113 	if (size == 4) {
1114 		wrp_mov(nfp_prog,
1115 			should_inc ? reg_lm_inc(3) : reg_lm(lm3 ? 3 : 0, idx),
1116 			reg_b(src));
1117 		return 0;
1118 	}
1119 
1120 	if (WARN_ON_ONCE(lm3 && idx > RE_REG_LM_IDX_MAX))
1121 		return -EOPNOTSUPP;
1122 
1123 	dst_byte = off % 4;
1124 
1125 	mask = (1 << size) - 1;
1126 	mask <<= dst_byte;
1127 
1128 	if (WARN_ON_ONCE(mask > 0xf))
1129 		return -EOPNOTSUPP;
1130 
1131 	shf = abs(src_byte - dst_byte) * 8;
1132 	if (src_byte == dst_byte) {
1133 		sc = SHF_SC_NONE;
1134 	} else if (src_byte < dst_byte) {
1135 		shf = 32 - shf;
1136 		sc = SHF_SC_L_SHF;
1137 	} else {
1138 		sc = SHF_SC_R_SHF;
1139 	}
1140 
	/* ld_field can address fewer indexes, if offset is too large do RMW.
	 * Because we RMW twice we waste 2 cycles on unaligned 8 byte writes.
	 */
1144 	if (idx <= RE_REG_LM_IDX_MAX) {
1145 		reg = reg_lm(lm3 ? 3 : 0, idx);
1146 	} else {
1147 		reg = imm_a(nfp_prog);
		/* Only the first and last LMEM locations are going to need RMW,
		 * the middle locations will be overwritten fully.
		 */
1151 		if (first || last)
1152 			wrp_mov(nfp_prog, reg, reg_lm(0, idx));
1153 	}
1154 
1155 	emit_ld_field(nfp_prog, reg, mask, reg_b(src), sc, shf);
1156 
1157 	if (new_gpr || last) {
1158 		if (idx > RE_REG_LM_IDX_MAX)
1159 			wrp_mov(nfp_prog, reg_lm(0, idx), reg);
1160 		if (should_inc)
1161 			wrp_mov(nfp_prog, reg_none(), reg_lm_inc(3));
1162 	}
1163 
1164 	return 0;
1165 }
1166 
1167 static int
1168 mem_op_stack(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
1169 	     unsigned int size, unsigned int ptr_off, u8 gpr, u8 ptr_gpr,
1170 	     bool clr_gpr, lmem_step step)
1171 {
1172 	s32 off = nfp_prog->stack_frame_depth + meta->insn.off + ptr_off;
1173 	bool first = true, last;
1174 	bool needs_inc = false;
1175 	swreg stack_off_reg;
1176 	u8 prev_gpr = 255;
1177 	u32 gpr_byte = 0;
1178 	bool lm3 = true;
1179 	int ret;
1180 
1181 	if (meta->ptr_not_const) {
		/* Use of the last encountered ptr_off is OK, they all have
		 * the same alignment.  We depend on the low bits of the value
		 * being discarded when written to the LMaddr register.
		 */
1186 		stack_off_reg = ur_load_imm_any(nfp_prog, meta->insn.off,
1187 						stack_imm(nfp_prog));
1188 
1189 		emit_alu(nfp_prog, imm_b(nfp_prog),
1190 			 reg_a(ptr_gpr), ALU_OP_ADD, stack_off_reg);
1191 
1192 		needs_inc = true;
1193 	} else if (off + size <= 64) {
1194 		/* We can reach bottom 64B with LMaddr0 */
1195 		lm3 = false;
1196 	} else if (round_down(off, 32) == round_down(off + size - 1, 32)) {
1197 		/* We have to set up a new pointer.  If we know the offset
1198 		 * and the entire access falls into a single 32 byte aligned
1199 		 * window we won't have to increment the LM pointer.
		 * The 32 byte alignment is important because the offset is
		 * ORed in, not added, when doing *l$indexN[off].
1202 		 */
1203 		stack_off_reg = ur_load_imm_any(nfp_prog, round_down(off, 32),
1204 						stack_imm(nfp_prog));
1205 		emit_alu(nfp_prog, imm_b(nfp_prog),
1206 			 stack_reg(nfp_prog), ALU_OP_ADD, stack_off_reg);
1207 
1208 		off %= 32;
1209 	} else {
1210 		stack_off_reg = ur_load_imm_any(nfp_prog, round_down(off, 4),
1211 						stack_imm(nfp_prog));
1212 
1213 		emit_alu(nfp_prog, imm_b(nfp_prog),
1214 			 stack_reg(nfp_prog), ALU_OP_ADD, stack_off_reg);
1215 
1216 		needs_inc = true;
1217 	}
1218 	if (lm3) {
1219 		emit_csr_wr(nfp_prog, imm_b(nfp_prog), NFP_CSR_ACT_LM_ADDR3);
1220 		/* For size < 4 one slot will be filled by zeroing of upper. */
1221 		wrp_nops(nfp_prog, clr_gpr && size < 8 ? 2 : 3);
1222 	}
1223 
1224 	if (clr_gpr && size < 8)
1225 		wrp_immed(nfp_prog, reg_both(gpr + 1), 0);
1226 
1227 	while (size) {
1228 		u32 slice_end;
1229 		u8 slice_size;
1230 
1231 		slice_size = min(size, 4 - gpr_byte);
1232 		slice_end = min(off + slice_size, round_up(off + 1, 4));
1233 		slice_size = slice_end - off;
1234 
1235 		last = slice_size == size;
1236 
1237 		if (needs_inc)
1238 			off %= 4;
1239 
1240 		ret = step(nfp_prog, gpr, gpr_byte, off, slice_size,
1241 			   first, gpr != prev_gpr, last, lm3, needs_inc);
1242 		if (ret)
1243 			return ret;
1244 
1245 		prev_gpr = gpr;
1246 		first = false;
1247 
1248 		gpr_byte += slice_size;
1249 		if (gpr_byte >= 4) {
1250 			gpr_byte -= 4;
1251 			gpr++;
1252 		}
1253 
1254 		size -= slice_size;
1255 		off += slice_size;
1256 	}
1257 
1258 	return 0;
1259 }
1260 
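/* Emit an ALU op on @dst with a 32-bit immediate, special-casing AND, OR and
 * XOR with 0 or ~0, where the generic load-immediate-plus-ALU sequence can be
 * replaced by a single immed or inverting ALU op, or skipped entirely.
 */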
1261 static void
1262 wrp_alu_imm(struct nfp_prog *nfp_prog, u8 dst, enum alu_op alu_op, u32 imm)
1263 {
1264 	swreg tmp_reg;
1265 
1266 	if (alu_op == ALU_OP_AND) {
1267 		if (!imm)
1268 			wrp_immed(nfp_prog, reg_both(dst), 0);
1269 		if (!imm || !~imm)
1270 			return;
1271 	}
1272 	if (alu_op == ALU_OP_OR) {
1273 		if (!~imm)
1274 			wrp_immed(nfp_prog, reg_both(dst), ~0U);
1275 		if (!imm || !~imm)
1276 			return;
1277 	}
1278 	if (alu_op == ALU_OP_XOR) {
1279 		if (!~imm)
1280 			emit_alu(nfp_prog, reg_both(dst), reg_none(),
1281 				 ALU_OP_NOT, reg_b(dst));
1282 		if (!imm || !~imm)
1283 			return;
1284 	}
1285 
1286 	tmp_reg = ur_load_imm_any(nfp_prog, imm, imm_b(nfp_prog));
1287 	emit_alu(nfp_prog, reg_both(dst), reg_a(dst), alu_op, tmp_reg);
1288 }
1289 
1290 static int
1291 wrp_alu64_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
1292 	      enum alu_op alu_op, bool skip)
1293 {
1294 	const struct bpf_insn *insn = &meta->insn;
1295 	u64 imm = insn->imm; /* sign extend */
1296 
1297 	if (skip) {
1298 		meta->skip = true;
1299 		return 0;
1300 	}
1301 
1302 	wrp_alu_imm(nfp_prog, insn->dst_reg * 2, alu_op, imm & ~0U);
1303 	wrp_alu_imm(nfp_prog, insn->dst_reg * 2 + 1, alu_op, imm >> 32);
1304 
1305 	return 0;
1306 }
1307 
1308 static int
1309 wrp_alu64_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
1310 	      enum alu_op alu_op)
1311 {
1312 	u8 dst = meta->insn.dst_reg * 2, src = meta->insn.src_reg * 2;
1313 
1314 	emit_alu(nfp_prog, reg_both(dst), reg_a(dst), alu_op, reg_b(src));
1315 	emit_alu(nfp_prog, reg_both(dst + 1),
1316 		 reg_a(dst + 1), alu_op, reg_b(src + 1));
1317 
1318 	return 0;
1319 }
1320 
1321 static int
1322 wrp_alu32_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
1323 	      enum alu_op alu_op, bool skip)
1324 {
1325 	const struct bpf_insn *insn = &meta->insn;
1326 
1327 	if (skip) {
1328 		meta->skip = true;
1329 		return 0;
1330 	}
1331 
1332 	wrp_alu_imm(nfp_prog, insn->dst_reg * 2, alu_op, insn->imm);
1333 	wrp_immed(nfp_prog, reg_both(insn->dst_reg * 2 + 1), 0);
1334 
1335 	return 0;
1336 }
1337 
1338 static int
1339 wrp_alu32_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
1340 	      enum alu_op alu_op)
1341 {
1342 	u8 dst = meta->insn.dst_reg * 2, src = meta->insn.src_reg * 2;
1343 
1344 	emit_alu(nfp_prog, reg_both(dst), reg_a(dst), alu_op, reg_b(src));
1345 	wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1), 0);
1346 
1347 	return 0;
1348 }
1349 
1350 static void
1351 wrp_test_reg_one(struct nfp_prog *nfp_prog, u8 dst, enum alu_op alu_op, u8 src,
1352 		 enum br_mask br_mask, u16 off)
1353 {
1354 	emit_alu(nfp_prog, reg_none(), reg_a(dst), alu_op, reg_b(src));
1355 	emit_br(nfp_prog, br_mask, off, 0);
1356 }
1357 
1358 static int
1359 wrp_test_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
1360 	     enum alu_op alu_op, enum br_mask br_mask)
1361 {
1362 	const struct bpf_insn *insn = &meta->insn;
1363 
1364 	wrp_test_reg_one(nfp_prog, insn->dst_reg * 2, alu_op,
1365 			 insn->src_reg * 2, br_mask, insn->off);
1366 	wrp_test_reg_one(nfp_prog, insn->dst_reg * 2 + 1, alu_op,
1367 			 insn->src_reg * 2 + 1, br_mask, insn->off);
1368 
1369 	return 0;
1370 }
1371 
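/* Map from the BPF jump op (insn code >> 4) to the NFP branch mask to use and
 * whether the operands must be swapped before the compare; e.g. BPF_JGT
 * "dst > src" is emitted as the swapped unsigned compare "src < dst" using
 * BR_BLO.
 */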
1372 static const struct jmp_code_map {
1373 	enum br_mask br_mask;
1374 	bool swap;
1375 } jmp_code_map[] = {
1376 	[BPF_JGT >> 4]	= { BR_BLO, true },
1377 	[BPF_JGE >> 4]	= { BR_BHS, false },
1378 	[BPF_JLT >> 4]	= { BR_BLO, false },
1379 	[BPF_JLE >> 4]	= { BR_BHS, true },
1380 	[BPF_JSGT >> 4]	= { BR_BLT, true },
1381 	[BPF_JSGE >> 4]	= { BR_BGE, false },
1382 	[BPF_JSLT >> 4]	= { BR_BLT, false },
1383 	[BPF_JSLE >> 4]	= { BR_BGE, true },
1384 };
1385 
1386 static const struct jmp_code_map *nfp_jmp_code_get(struct nfp_insn_meta *meta)
1387 {
1388 	unsigned int op;
1389 
1390 	op = BPF_OP(meta->insn.code) >> 4;
1391 	/* br_mask of 0 is BR_BEQ which we don't use in jump code table */
1392 	if (WARN_ONCE(op >= ARRAY_SIZE(jmp_code_map) ||
1393 		      !jmp_code_map[op].br_mask,
1394 		      "no code found for jump instruction"))
1395 		return NULL;
1396 
1397 	return &jmp_code_map[op];
1398 }
1399 
1400 static int cmp_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1401 {
1402 	const struct bpf_insn *insn = &meta->insn;
1403 	u64 imm = insn->imm; /* sign extend */
1404 	const struct jmp_code_map *code;
1405 	enum alu_op alu_op, carry_op;
1406 	u8 reg = insn->dst_reg * 2;
1407 	swreg tmp_reg;
1408 
1409 	code = nfp_jmp_code_get(meta);
1410 	if (!code)
1411 		return -EINVAL;
1412 
1413 	alu_op = meta->jump_neg_op ? ALU_OP_ADD : ALU_OP_SUB;
1414 	carry_op = meta->jump_neg_op ? ALU_OP_ADD_C : ALU_OP_SUB_C;
1415 
1416 	tmp_reg = ur_load_imm_any(nfp_prog, imm & ~0U, imm_b(nfp_prog));
1417 	if (!code->swap)
1418 		emit_alu(nfp_prog, reg_none(), reg_a(reg), alu_op, tmp_reg);
1419 	else
1420 		emit_alu(nfp_prog, reg_none(), tmp_reg, alu_op, reg_a(reg));
1421 
1422 	tmp_reg = ur_load_imm_any(nfp_prog, imm >> 32, imm_b(nfp_prog));
1423 	if (!code->swap)
1424 		emit_alu(nfp_prog, reg_none(),
1425 			 reg_a(reg + 1), carry_op, tmp_reg);
1426 	else
1427 		emit_alu(nfp_prog, reg_none(),
1428 			 tmp_reg, carry_op, reg_a(reg + 1));
1429 
1430 	emit_br(nfp_prog, code->br_mask, insn->off, 0);
1431 
1432 	return 0;
1433 }
1434 
1435 static int cmp_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1436 {
1437 	const struct bpf_insn *insn = &meta->insn;
1438 	const struct jmp_code_map *code;
1439 	u8 areg, breg;
1440 
1441 	code = nfp_jmp_code_get(meta);
1442 	if (!code)
1443 		return -EINVAL;
1444 
1445 	areg = insn->dst_reg * 2;
1446 	breg = insn->src_reg * 2;
1447 
1448 	if (code->swap) {
1449 		areg ^= breg;
1450 		breg ^= areg;
1451 		areg ^= breg;
1452 	}
1453 
1454 	emit_alu(nfp_prog, reg_none(), reg_a(areg), ALU_OP_SUB, reg_b(breg));
1455 	emit_alu(nfp_prog, reg_none(),
1456 		 reg_a(areg + 1), ALU_OP_SUB_C, reg_b(breg + 1));
1457 	emit_br(nfp_prog, code->br_mask, insn->off, 0);
1458 
1459 	return 0;
1460 }
1461 
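/* Byte swap a 32-bit word using two ld_field ops: the first writes all four
 * bytes of the source rotated right by 8, the second rotates the result by a
 * further 16 and patches bytes 0 and 2, leaving gpr_out holding the
 * endian-swapped value.
 */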
1462 static void wrp_end32(struct nfp_prog *nfp_prog, swreg reg_in, u8 gpr_out)
1463 {
1464 	emit_ld_field(nfp_prog, reg_both(gpr_out), 0xf, reg_in,
1465 		      SHF_SC_R_ROT, 8);
1466 	emit_ld_field(nfp_prog, reg_both(gpr_out), 0x5, reg_a(gpr_out),
1467 		      SHF_SC_R_ROT, 16);
1468 }
1469 
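/* Drive the NFP multiplier through its fixed 32x32 step sequence; the low
 * 32 bits of the product are read back into @dst_lo with MUL_LAST and, when
 * @gen_high_half is set, the high 32 bits into @dst_hi with MUL_LAST_2
 * (otherwise @dst_hi is simply zeroed).
 */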
1470 static void
1471 wrp_mul_u32(struct nfp_prog *nfp_prog, swreg dst_hi, swreg dst_lo, swreg lreg,
1472 	    swreg rreg, bool gen_high_half)
1473 {
1474 	emit_mul(nfp_prog, lreg, MUL_TYPE_START, MUL_STEP_NONE, rreg);
1475 	emit_mul(nfp_prog, lreg, MUL_TYPE_STEP_32x32, MUL_STEP_1, rreg);
1476 	emit_mul(nfp_prog, lreg, MUL_TYPE_STEP_32x32, MUL_STEP_2, rreg);
1477 	emit_mul(nfp_prog, lreg, MUL_TYPE_STEP_32x32, MUL_STEP_3, rreg);
1478 	emit_mul(nfp_prog, lreg, MUL_TYPE_STEP_32x32, MUL_STEP_4, rreg);
1479 	emit_mul(nfp_prog, dst_lo, MUL_TYPE_STEP_32x32, MUL_LAST, reg_none());
1480 	if (gen_high_half)
1481 		emit_mul(nfp_prog, dst_hi, MUL_TYPE_STEP_32x32, MUL_LAST_2,
1482 			 reg_none());
1483 	else
1484 		wrp_immed(nfp_prog, dst_hi, 0);
1485 }
1486 
1487 static void
1488 wrp_mul_u16(struct nfp_prog *nfp_prog, swreg dst_hi, swreg dst_lo, swreg lreg,
1489 	    swreg rreg)
1490 {
1491 	emit_mul(nfp_prog, lreg, MUL_TYPE_START, MUL_STEP_NONE, rreg);
1492 	emit_mul(nfp_prog, lreg, MUL_TYPE_STEP_16x16, MUL_STEP_1, rreg);
1493 	emit_mul(nfp_prog, lreg, MUL_TYPE_STEP_16x16, MUL_STEP_2, rreg);
1494 	emit_mul(nfp_prog, dst_lo, MUL_TYPE_STEP_16x16, MUL_LAST, reg_none());
1495 }
1496 
1497 static int
1498 wrp_mul(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
1499 	bool gen_high_half, bool ropnd_from_reg)
1500 {
1501 	swreg multiplier, multiplicand, dst_hi, dst_lo;
1502 	const struct bpf_insn *insn = &meta->insn;
1503 	u32 lopnd_max, ropnd_max;
1504 	u8 dst_reg;
1505 
1506 	dst_reg = insn->dst_reg;
1507 	multiplicand = reg_a(dst_reg * 2);
1508 	dst_hi = reg_both(dst_reg * 2 + 1);
1509 	dst_lo = reg_both(dst_reg * 2);
1510 	lopnd_max = meta->umax_dst;
1511 	if (ropnd_from_reg) {
1512 		multiplier = reg_b(insn->src_reg * 2);
1513 		ropnd_max = meta->umax_src;
1514 	} else {
1515 		u32 imm = insn->imm;
1516 
1517 		multiplier = ur_load_imm_any(nfp_prog, imm, imm_b(nfp_prog));
1518 		ropnd_max = imm;
1519 	}
1520 	if (lopnd_max > U16_MAX || ropnd_max > U16_MAX)
1521 		wrp_mul_u32(nfp_prog, dst_hi, dst_lo, multiplicand, multiplier,
1522 			    gen_high_half);
1523 	else
1524 		wrp_mul_u16(nfp_prog, dst_hi, dst_lo, multiplicand, multiplier);
1525 
1526 	return 0;
1527 }
1528 
1529 static int wrp_div_imm(struct nfp_prog *nfp_prog, u8 dst, u64 imm)
1530 {
1531 	swreg dst_both = reg_both(dst), dst_a = reg_a(dst), dst_b = reg_a(dst);
1532 	struct reciprocal_value_adv rvalue;
1533 	u8 pre_shift, exp;
1534 	swreg magic;
1535 
1536 	if (imm > U32_MAX) {
1537 		wrp_immed(nfp_prog, dst_both, 0);
1538 		return 0;
1539 	}
1540 
	/* NOTE: because we are using "reciprocal_value_adv" which doesn't
	 * support "divisor > (1u << 31)", we need to JIT a separate NFP
	 * sequence to handle such a case.  The quotient then simply equals the
	 * result of the unsigned comparison "dst >= imm", which can be
	 * calculated using the following NFP sequence:
	 *
	 *  alu[--, dst, -, imm]
	 *  immed[imm, 0]
	 *  alu[dst, imm, +carry, 0]
	 *
	 */
1552 	if (imm > 1U << 31) {
1553 		swreg tmp_b = ur_load_imm_any(nfp_prog, imm, imm_b(nfp_prog));
1554 
1555 		emit_alu(nfp_prog, reg_none(), dst_a, ALU_OP_SUB, tmp_b);
1556 		wrp_immed(nfp_prog, imm_a(nfp_prog), 0);
1557 		emit_alu(nfp_prog, dst_both, imm_a(nfp_prog), ALU_OP_ADD_C,
1558 			 reg_imm(0));
1559 		return 0;
1560 	}
1561 
1562 	rvalue = reciprocal_value_adv(imm, 32);
1563 	exp = rvalue.exp;
1564 	if (rvalue.is_wide_m && !(imm & 1)) {
1565 		pre_shift = fls(imm & -imm) - 1;
1566 		rvalue = reciprocal_value_adv(imm >> pre_shift, 32 - pre_shift);
1567 	} else {
1568 		pre_shift = 0;
1569 	}
1570 	magic = ur_load_imm_any(nfp_prog, rvalue.m, imm_b(nfp_prog));
1571 	if (imm == 1U << exp) {
1572 		emit_shf(nfp_prog, dst_both, reg_none(), SHF_OP_NONE, dst_b,
1573 			 SHF_SC_R_SHF, exp);
1574 	} else if (rvalue.is_wide_m) {
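		/* Wide magic number: with t = high32(dst * m), the quotient is
		 * (((dst - t) >> 1) + t) >> (sh - 1), which is what the
		 * sequence below computes.
		 */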
1575 		wrp_mul_u32(nfp_prog, imm_both(nfp_prog), reg_none(), dst_a,
1576 			    magic, true);
1577 		emit_alu(nfp_prog, dst_both, dst_a, ALU_OP_SUB,
1578 			 imm_b(nfp_prog));
1579 		emit_shf(nfp_prog, dst_both, reg_none(), SHF_OP_NONE, dst_b,
1580 			 SHF_SC_R_SHF, 1);
1581 		emit_alu(nfp_prog, dst_both, dst_a, ALU_OP_ADD,
1582 			 imm_b(nfp_prog));
1583 		emit_shf(nfp_prog, dst_both, reg_none(), SHF_OP_NONE, dst_b,
1584 			 SHF_SC_R_SHF, rvalue.sh - 1);
1585 	} else {
1586 		if (pre_shift)
1587 			emit_shf(nfp_prog, dst_both, reg_none(), SHF_OP_NONE,
1588 				 dst_b, SHF_SC_R_SHF, pre_shift);
1589 		wrp_mul_u32(nfp_prog, dst_both, reg_none(), dst_a, magic, true);
1590 		emit_shf(nfp_prog, dst_both, reg_none(), SHF_OP_NONE,
1591 			 dst_b, SHF_SC_R_SHF, rvalue.sh);
1592 	}
1593 
1594 	return 0;
1595 }
1596 
1597 static int adjust_head(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1598 {
1599 	swreg tmp = imm_a(nfp_prog), tmp_len = imm_b(nfp_prog);
1600 	struct nfp_bpf_cap_adjust_head *adjust_head;
1601 	u32 ret_einval, end;
1602 
1603 	adjust_head = &nfp_prog->bpf->adjust_head;
1604 
1605 	/* Optimized version - 5 vs 14 cycles */
1606 	if (nfp_prog->adjust_head_location != UINT_MAX) {
1607 		if (WARN_ON_ONCE(nfp_prog->adjust_head_location != meta->n))
1608 			return -EINVAL;
1609 
1610 		emit_alu(nfp_prog, pptr_reg(nfp_prog),
1611 			 reg_a(2 * 2), ALU_OP_ADD, pptr_reg(nfp_prog));
1612 		emit_alu(nfp_prog, plen_reg(nfp_prog),
1613 			 plen_reg(nfp_prog), ALU_OP_SUB, reg_a(2 * 2));
1614 		emit_alu(nfp_prog, pv_len(nfp_prog),
1615 			 pv_len(nfp_prog), ALU_OP_SUB, reg_a(2 * 2));
1616 
1617 		wrp_immed(nfp_prog, reg_both(0), 0);
1618 		wrp_immed(nfp_prog, reg_both(1), 0);
1619 
1620 		/* TODO: when adjust head is guaranteed to succeed we can
1621 		 * also eliminate the following if (r0 == 0) branch.
1622 		 */
1623 
1624 		return 0;
1625 	}
1626 
1627 	ret_einval = nfp_prog_current_offset(nfp_prog) + 14;
1628 	end = ret_einval + 2;
1629 
1630 	/* We need to use a temp because offset is just a part of the pkt ptr */
1631 	emit_alu(nfp_prog, tmp,
1632 		 reg_a(2 * 2), ALU_OP_ADD_2B, pptr_reg(nfp_prog));
1633 
1634 	/* Validate result will fit within FW datapath constraints */
1635 	emit_alu(nfp_prog, reg_none(),
1636 		 tmp, ALU_OP_SUB, reg_imm(adjust_head->off_min));
1637 	emit_br(nfp_prog, BR_BLO, ret_einval, 0);
1638 	emit_alu(nfp_prog, reg_none(),
1639 		 reg_imm(adjust_head->off_max), ALU_OP_SUB, tmp);
1640 	emit_br(nfp_prog, BR_BLO, ret_einval, 0);
1641 
1642 	/* Validate the length is at least ETH_HLEN */
1643 	emit_alu(nfp_prog, tmp_len,
1644 		 plen_reg(nfp_prog), ALU_OP_SUB, reg_a(2 * 2));
1645 	emit_alu(nfp_prog, reg_none(),
1646 		 tmp_len, ALU_OP_SUB, reg_imm(ETH_HLEN));
1647 	emit_br(nfp_prog, BR_BMI, ret_einval, 0);
1648 
1649 	/* Load the ret code */
1650 	wrp_immed(nfp_prog, reg_both(0), 0);
1651 	wrp_immed(nfp_prog, reg_both(1), 0);
1652 
1653 	/* Modify the packet metadata */
1654 	emit_ld_field(nfp_prog, pptr_reg(nfp_prog), 0x3, tmp, SHF_SC_NONE, 0);
1655 
1656 	/* Skip over the -EINVAL ret code (defer 2) */
1657 	emit_br(nfp_prog, BR_UNC, end, 2);
1658 
1659 	emit_alu(nfp_prog, plen_reg(nfp_prog),
1660 		 plen_reg(nfp_prog), ALU_OP_SUB, reg_a(2 * 2));
1661 	emit_alu(nfp_prog, pv_len(nfp_prog),
1662 		 pv_len(nfp_prog), ALU_OP_SUB, reg_a(2 * 2));
1663 
1664 	/* return -EINVAL target */
1665 	if (!nfp_prog_confirm_current_offset(nfp_prog, ret_einval))
1666 		return -EINVAL;
1667 
1668 	wrp_immed(nfp_prog, reg_both(0), -22);
1669 	wrp_immed(nfp_prog, reg_both(1), ~0);
1670 
1671 	if (!nfp_prog_confirm_current_offset(nfp_prog, end))
1672 		return -EINVAL;
1673 
1674 	return 0;
1675 }
1676 
1677 static int adjust_tail(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1678 {
1679 	u32 ret_einval, end;
1680 	swreg plen, delta;
1681 
1682 	BUILD_BUG_ON(plen_reg(nfp_prog) != reg_b(STATIC_REG_PKT_LEN));
1683 
1684 	plen = imm_a(nfp_prog);
1685 	delta = reg_a(2 * 2);
1686 
1687 	ret_einval = nfp_prog_current_offset(nfp_prog) + 9;
1688 	end = nfp_prog_current_offset(nfp_prog) + 11;
1689 
1690 	/* Calculate resulting length */
1691 	emit_alu(nfp_prog, plen, plen_reg(nfp_prog), ALU_OP_ADD, delta);
	/* delta == 0 is not allowed by the kernel, so the add must overflow
	 * (set carry) to make the length smaller.
	 */
1695 	emit_br(nfp_prog, BR_BCC, ret_einval, 0);
1696 
1697 	/* if (new_len < 14) then -EINVAL */
1698 	emit_alu(nfp_prog, reg_none(), plen, ALU_OP_SUB, reg_imm(ETH_HLEN));
1699 	emit_br(nfp_prog, BR_BMI, ret_einval, 0);
1700 
1701 	emit_alu(nfp_prog, plen_reg(nfp_prog),
1702 		 plen_reg(nfp_prog), ALU_OP_ADD, delta);
1703 	emit_alu(nfp_prog, pv_len(nfp_prog),
1704 		 pv_len(nfp_prog), ALU_OP_ADD, delta);
1705 
1706 	emit_br(nfp_prog, BR_UNC, end, 2);
1707 	wrp_immed(nfp_prog, reg_both(0), 0);
1708 	wrp_immed(nfp_prog, reg_both(1), 0);
1709 
1710 	if (!nfp_prog_confirm_current_offset(nfp_prog, ret_einval))
1711 		return -EINVAL;
1712 
1713 	wrp_immed(nfp_prog, reg_both(0), -22);
1714 	wrp_immed(nfp_prog, reg_both(1), ~0);
1715 
1716 	if (!nfp_prog_confirm_current_offset(nfp_prog, end))
1717 		return -EINVAL;
1718 
1719 	return 0;
1720 }
1721 
1722 static int
1723 map_call_stack_common(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1724 {
1725 	bool load_lm_ptr;
1726 	u32 ret_tgt;
1727 	s64 lm_off;
1728 
1729 	/* We only have to reload LM0 if the key is not at start of stack */
1730 	lm_off = nfp_prog->stack_frame_depth;
1731 	lm_off += meta->arg2.reg.var_off.value + meta->arg2.reg.off;
1732 	load_lm_ptr = meta->arg2.var_off || lm_off;
1733 
1734 	/* Set LM0 to start of key */
1735 	if (load_lm_ptr)
1736 		emit_csr_wr(nfp_prog, reg_b(2 * 2), NFP_CSR_ACT_LM_ADDR0);
1737 	if (meta->func_id == BPF_FUNC_map_update_elem)
1738 		emit_csr_wr(nfp_prog, reg_b(3 * 2), NFP_CSR_ACT_LM_ADDR2);
1739 
1740 	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO + meta->func_id,
1741 		     2, RELO_BR_HELPER);
1742 	ret_tgt = nfp_prog_current_offset(nfp_prog) + 2;
1743 
1744 	/* Load map ID into A0 */
1745 	wrp_mov(nfp_prog, reg_a(0), reg_a(2));
1746 
1747 	/* Load the return address into B0 */
1748 	wrp_immed_relo(nfp_prog, reg_b(0), ret_tgt, RELO_IMMED_REL);
1749 
1750 	if (!nfp_prog_confirm_current_offset(nfp_prog, ret_tgt))
1751 		return -EINVAL;
1752 
1753 	/* Reset the LM0 pointer */
1754 	if (!load_lm_ptr)
1755 		return 0;
1756 
1757 	emit_csr_wr(nfp_prog, stack_reg(nfp_prog), NFP_CSR_ACT_LM_ADDR0);
1758 	wrp_nops(nfp_prog, 3);
1759 
1760 	return 0;
1761 }
1762 
1763 static int
1764 nfp_get_prandom_u32(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1765 {
1766 	__emit_csr_rd(nfp_prog, NFP_CSR_PSEUDO_RND_NUM);
	/* CSR value is read by the following immed[gpr, 0] instructions */
1768 	emit_immed(nfp_prog, reg_both(0), 0,
1769 		   IMMED_WIDTH_ALL, false, IMMED_SHIFT_0B);
1770 	emit_immed(nfp_prog, reg_both(1), 0,
1771 		   IMMED_WIDTH_ALL, false, IMMED_SHIFT_0B);
1772 	return 0;
1773 }
1774 
1775 static int
1776 nfp_perf_event_output(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1777 {
1778 	swreg ptr_type;
1779 	u32 ret_tgt;
1780 
1781 	ptr_type = ur_load_imm_any(nfp_prog, meta->arg1.type, imm_a(nfp_prog));
1782 
1783 	ret_tgt = nfp_prog_current_offset(nfp_prog) + 3;
1784 
1785 	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO + meta->func_id,
1786 		     2, RELO_BR_HELPER);
1787 
1788 	/* Load ptr type into A1 */
1789 	wrp_mov(nfp_prog, reg_a(1), ptr_type);
1790 
1791 	/* Load the return address into B0 */
1792 	wrp_immed_relo(nfp_prog, reg_b(0), ret_tgt, RELO_IMMED_REL);
1793 
1794 	if (!nfp_prog_confirm_current_offset(nfp_prog, ret_tgt))
1795 		return -EINVAL;
1796 
1797 	return 0;
1798 }
1799 
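/* Record the RX queue selected by the program in the packet vector.  If the
 * queue id does not fit into the 8-bit FW field, the queue value is
 * overwritten with NFP_NET_RXR_MAX instead.
 */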
1800 static int
1801 nfp_queue_select(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1802 {
1803 	u32 jmp_tgt;
1804 
1805 	jmp_tgt = nfp_prog_current_offset(nfp_prog) + 5;
1806 
	/* Make sure the queue id fits into the FW field */
1808 	emit_alu(nfp_prog, reg_none(), reg_a(meta->insn.src_reg * 2),
1809 		 ALU_OP_AND_NOT_B, reg_imm(0xff));
1810 	emit_br(nfp_prog, BR_BEQ, jmp_tgt, 2);
1811 
1812 	/* Set the 'queue selected' bit and the queue value */
1813 	emit_shf(nfp_prog, pv_qsel_set(nfp_prog),
1814 		 pv_qsel_set(nfp_prog), SHF_OP_OR, reg_imm(1),
1815 		 SHF_SC_L_SHF, PKT_VEL_QSEL_SET_BIT);
1816 	emit_ld_field(nfp_prog,
1817 		      pv_qsel_val(nfp_prog), 0x1, reg_b(meta->insn.src_reg * 2),
1818 		      SHF_SC_NONE, 0);
	/* Delay slots end here; we will jump over the next instruction if the
	 * queue value fits into the field.
	 */
1822 	emit_ld_field(nfp_prog,
1823 		      pv_qsel_val(nfp_prog), 0x1, reg_imm(NFP_NET_RXR_MAX),
1824 		      SHF_SC_NONE, 0);
1825 
1826 	if (!nfp_prog_confirm_current_offset(nfp_prog, jmp_tgt))
1827 		return -EINVAL;
1828 
1829 	return 0;
1830 }
1831 
1832 /* --- Callbacks --- */
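/* Every 64-bit eBPF register N is mapped onto a pair of 32-bit NFP GPRs:
 * 2 * N holds the low word and 2 * N + 1 the high word, hence the
 * dst_reg * 2 / src_reg * 2 indexing used throughout the callbacks below.
 */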
1833 static int mov_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1834 {
1835 	const struct bpf_insn *insn = &meta->insn;
1836 	u8 dst = insn->dst_reg * 2;
1837 	u8 src = insn->src_reg * 2;
1838 
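	/* A move from the frame pointer (R10) has to materialize its value:
	 * the stack base register plus the current stack frame depth.
	 */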
1839 	if (insn->src_reg == BPF_REG_10) {
1840 		swreg stack_depth_reg;
1841 
1842 		stack_depth_reg = ur_load_imm_any(nfp_prog,
1843 						  nfp_prog->stack_frame_depth,
1844 						  stack_imm(nfp_prog));
1845 		emit_alu(nfp_prog, reg_both(dst), stack_reg(nfp_prog),
1846 			 ALU_OP_ADD, stack_depth_reg);
1847 		wrp_immed(nfp_prog, reg_both(dst + 1), 0);
1848 	} else {
1849 		wrp_reg_mov(nfp_prog, dst, src);
1850 		wrp_reg_mov(nfp_prog, dst + 1, src + 1);
1851 	}
1852 
1853 	return 0;
1854 }
1855 
1856 static int mov_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1857 {
1858 	u64 imm = meta->insn.imm; /* sign extend */
1859 
1860 	wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2), imm & ~0U);
1861 	wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1), imm >> 32);
1862 
1863 	return 0;
1864 }
1865 
1866 static int xor_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1867 {
1868 	return wrp_alu64_reg(nfp_prog, meta, ALU_OP_XOR);
1869 }
1870 
1871 static int xor_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1872 {
1873 	return wrp_alu64_imm(nfp_prog, meta, ALU_OP_XOR, !meta->insn.imm);
1874 }
1875 
1876 static int and_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1877 {
1878 	return wrp_alu64_reg(nfp_prog, meta, ALU_OP_AND);
1879 }
1880 
1881 static int and_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1882 {
1883 	return wrp_alu64_imm(nfp_prog, meta, ALU_OP_AND, !~meta->insn.imm);
1884 }
1885 
1886 static int or_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1887 {
1888 	return wrp_alu64_reg(nfp_prog, meta, ALU_OP_OR);
1889 }
1890 
1891 static int or_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1892 {
1893 	return wrp_alu64_imm(nfp_prog, meta, ALU_OP_OR, !meta->insn.imm);
1894 }
1895 
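/* 64-bit add and subtract are emitted as two 32-bit ALU operations: the low
 * words use ADD/SUB and the high words consume the carry or borrow via
 * ADD_C/SUB_C.
 */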
1896 static int add_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1897 {
1898 	const struct bpf_insn *insn = &meta->insn;
1899 
1900 	emit_alu(nfp_prog, reg_both(insn->dst_reg * 2),
1901 		 reg_a(insn->dst_reg * 2), ALU_OP_ADD,
1902 		 reg_b(insn->src_reg * 2));
1903 	emit_alu(nfp_prog, reg_both(insn->dst_reg * 2 + 1),
1904 		 reg_a(insn->dst_reg * 2 + 1), ALU_OP_ADD_C,
1905 		 reg_b(insn->src_reg * 2 + 1));
1906 
1907 	return 0;
1908 }
1909 
1910 static int add_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1911 {
1912 	const struct bpf_insn *insn = &meta->insn;
1913 	u64 imm = insn->imm; /* sign extend */
1914 
1915 	wrp_alu_imm(nfp_prog, insn->dst_reg * 2, ALU_OP_ADD, imm & ~0U);
1916 	wrp_alu_imm(nfp_prog, insn->dst_reg * 2 + 1, ALU_OP_ADD_C, imm >> 32);
1917 
1918 	return 0;
1919 }
1920 
1921 static int sub_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1922 {
1923 	const struct bpf_insn *insn = &meta->insn;
1924 
1925 	emit_alu(nfp_prog, reg_both(insn->dst_reg * 2),
1926 		 reg_a(insn->dst_reg * 2), ALU_OP_SUB,
1927 		 reg_b(insn->src_reg * 2));
1928 	emit_alu(nfp_prog, reg_both(insn->dst_reg * 2 + 1),
1929 		 reg_a(insn->dst_reg * 2 + 1), ALU_OP_SUB_C,
1930 		 reg_b(insn->src_reg * 2 + 1));
1931 
1932 	return 0;
1933 }
1934 
1935 static int sub_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1936 {
1937 	const struct bpf_insn *insn = &meta->insn;
1938 	u64 imm = insn->imm; /* sign extend */
1939 
1940 	wrp_alu_imm(nfp_prog, insn->dst_reg * 2, ALU_OP_SUB, imm & ~0U);
1941 	wrp_alu_imm(nfp_prog, insn->dst_reg * 2 + 1, ALU_OP_SUB_C, imm >> 32);
1942 
1943 	return 0;
1944 }
1945 
1946 static int mul_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1947 {
1948 	return wrp_mul(nfp_prog, meta, true, true);
1949 }
1950 
1951 static int mul_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1952 {
1953 	return wrp_mul(nfp_prog, meta, true, false);
1954 }
1955 
1956 static int div_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1957 {
1958 	const struct bpf_insn *insn = &meta->insn;
1959 
1960 	return wrp_div_imm(nfp_prog, insn->dst_reg * 2, insn->imm);
1961 }
1962 
1963 static int div_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1964 {
	/* NOTE: the verifier hook has already rejected the cases for which the
	 * verifier doesn't know whether the source operand is constant or not.
	 */
1968 	return wrp_div_imm(nfp_prog, meta->insn.dst_reg * 2, meta->umin_src);
1969 }
1970 
1971 static int neg_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1972 {
1973 	const struct bpf_insn *insn = &meta->insn;
1974 
1975 	emit_alu(nfp_prog, reg_both(insn->dst_reg * 2), reg_imm(0),
1976 		 ALU_OP_SUB, reg_b(insn->dst_reg * 2));
1977 	emit_alu(nfp_prog, reg_both(insn->dst_reg * 2 + 1), reg_imm(0),
1978 		 ALU_OP_SUB_C, reg_b(insn->dst_reg * 2 + 1));
1979 
1980 	return 0;
1981 }
1982 
1983 /* Pseudo code:
1984  *   if shift_amt >= 32
1985  *     dst_high = dst_low << shift_amt[4:0]
1986  *     dst_low = 0;
1987  *   else
1988  *     dst_high = (dst_high, dst_low) >> (32 - shift_amt)
1989  *     dst_low = dst_low << shift_amt
1990  *
1991  * The indirect shift will use the same logic at runtime.
1992  */
1993 static int __shl_imm64(struct nfp_prog *nfp_prog, u8 dst, u8 shift_amt)
1994 {
1995 	if (shift_amt < 32) {
1996 		emit_shf(nfp_prog, reg_both(dst + 1), reg_a(dst + 1),
1997 			 SHF_OP_NONE, reg_b(dst), SHF_SC_R_DSHF,
1998 			 32 - shift_amt);
1999 		emit_shf(nfp_prog, reg_both(dst), reg_none(), SHF_OP_NONE,
2000 			 reg_b(dst), SHF_SC_L_SHF, shift_amt);
2001 	} else if (shift_amt == 32) {
2002 		wrp_reg_mov(nfp_prog, dst + 1, dst);
2003 		wrp_immed(nfp_prog, reg_both(dst), 0);
2004 	} else if (shift_amt > 32) {
2005 		emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_NONE,
2006 			 reg_b(dst), SHF_SC_L_SHF, shift_amt - 32);
2007 		wrp_immed(nfp_prog, reg_both(dst), 0);
2008 	}
2009 
2010 	return 0;
2011 }
2012 
2013 static int shl_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2014 {
2015 	const struct bpf_insn *insn = &meta->insn;
2016 	u8 dst = insn->dst_reg * 2;
2017 
2018 	return __shl_imm64(nfp_prog, dst, insn->imm);
2019 }
2020 
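/* Indirect shifts take the shift amount from operand A of the preceding ALU
 * instruction, which is why the helpers below emit seemingly redundant ALU
 * ops (e.g. OR with 0) right before emit_shf_indir() - they only exist to
 * stage the amount in the PREV_ALU result.
 */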
2021 static void shl_reg64_lt32_high(struct nfp_prog *nfp_prog, u8 dst, u8 src)
2022 {
2023 	emit_alu(nfp_prog, imm_both(nfp_prog), reg_imm(32), ALU_OP_SUB,
2024 		 reg_b(src));
2025 	emit_alu(nfp_prog, reg_none(), imm_a(nfp_prog), ALU_OP_OR, reg_imm(0));
2026 	emit_shf_indir(nfp_prog, reg_both(dst + 1), reg_a(dst + 1), SHF_OP_NONE,
2027 		       reg_b(dst), SHF_SC_R_DSHF);
2028 }
2029 
2030 /* NOTE: for indirect left shift, HIGH part should be calculated first. */
2031 static void shl_reg64_lt32_low(struct nfp_prog *nfp_prog, u8 dst, u8 src)
2032 {
2033 	emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_imm(0));
2034 	emit_shf_indir(nfp_prog, reg_both(dst), reg_none(), SHF_OP_NONE,
2035 		       reg_b(dst), SHF_SC_L_SHF);
2036 }
2037 
2038 static void shl_reg64_lt32(struct nfp_prog *nfp_prog, u8 dst, u8 src)
2039 {
2040 	shl_reg64_lt32_high(nfp_prog, dst, src);
2041 	shl_reg64_lt32_low(nfp_prog, dst, src);
2042 }
2043 
2044 static void shl_reg64_ge32(struct nfp_prog *nfp_prog, u8 dst, u8 src)
2045 {
2046 	emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_imm(0));
2047 	emit_shf_indir(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_NONE,
2048 		       reg_b(dst), SHF_SC_L_SHF);
2049 	wrp_immed(nfp_prog, reg_both(dst), 0);
2050 }
2051 
2052 static int shl_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2053 {
2054 	const struct bpf_insn *insn = &meta->insn;
2055 	u64 umin, umax;
2056 	u8 dst, src;
2057 
2058 	dst = insn->dst_reg * 2;
2059 	umin = meta->umin_src;
2060 	umax = meta->umax_src;
2061 	if (umin == umax)
2062 		return __shl_imm64(nfp_prog, dst, umin);
2063 
2064 	src = insn->src_reg * 2;
2065 	if (umax < 32) {
2066 		shl_reg64_lt32(nfp_prog, dst, src);
2067 	} else if (umin >= 32) {
2068 		shl_reg64_ge32(nfp_prog, dst, src);
2069 	} else {
2070 		/* Generate different instruction sequences depending on runtime
2071 		 * value of shift amount.
2072 		 */
2073 		u16 label_ge32, label_end;
2074 
2075 		label_ge32 = nfp_prog_current_offset(nfp_prog) + 7;
2076 		emit_br_bset(nfp_prog, reg_a(src), 5, label_ge32, 0);
2077 
2078 		shl_reg64_lt32_high(nfp_prog, dst, src);
2079 		label_end = nfp_prog_current_offset(nfp_prog) + 6;
2080 		emit_br(nfp_prog, BR_UNC, label_end, 2);
2081 		/* shl_reg64_lt32_low packed in delay slot. */
2082 		shl_reg64_lt32_low(nfp_prog, dst, src);
2083 
2084 		if (!nfp_prog_confirm_current_offset(nfp_prog, label_ge32))
2085 			return -EINVAL;
2086 		shl_reg64_ge32(nfp_prog, dst, src);
2087 
2088 		if (!nfp_prog_confirm_current_offset(nfp_prog, label_end))
2089 			return -EINVAL;
2090 	}
2091 
2092 	return 0;
2093 }
2094 
2095 /* Pseudo code:
 *   if shift_amt >= 32
 *     dst_low = dst_high >> shift_amt[4:0]
 *     dst_high = 0;
 *   else
 *     dst_low = (dst_high, dst_low) >> shift_amt
 *     dst_high = dst_high >> shift_amt
2102  *
2103  * The indirect shift will use the same logic at runtime.
2104  */
2105 static int __shr_imm64(struct nfp_prog *nfp_prog, u8 dst, u8 shift_amt)
2106 {
2107 	if (shift_amt < 32) {
2108 		emit_shf(nfp_prog, reg_both(dst), reg_a(dst + 1), SHF_OP_NONE,
2109 			 reg_b(dst), SHF_SC_R_DSHF, shift_amt);
2110 		emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_NONE,
2111 			 reg_b(dst + 1), SHF_SC_R_SHF, shift_amt);
2112 	} else if (shift_amt == 32) {
2113 		wrp_reg_mov(nfp_prog, dst, dst + 1);
2114 		wrp_immed(nfp_prog, reg_both(dst + 1), 0);
2115 	} else if (shift_amt > 32) {
2116 		emit_shf(nfp_prog, reg_both(dst), reg_none(), SHF_OP_NONE,
2117 			 reg_b(dst + 1), SHF_SC_R_SHF, shift_amt - 32);
2118 		wrp_immed(nfp_prog, reg_both(dst + 1), 0);
2119 	}
2120 
2121 	return 0;
2122 }
2123 
2124 static int shr_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2125 {
2126 	const struct bpf_insn *insn = &meta->insn;
2127 	u8 dst = insn->dst_reg * 2;
2128 
2129 	return __shr_imm64(nfp_prog, dst, insn->imm);
2130 }
2131 
2132 /* NOTE: for indirect right shift, LOW part should be calculated first. */
2133 static void shr_reg64_lt32_high(struct nfp_prog *nfp_prog, u8 dst, u8 src)
2134 {
2135 	emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_imm(0));
2136 	emit_shf_indir(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_NONE,
2137 		       reg_b(dst + 1), SHF_SC_R_SHF);
2138 }
2139 
2140 static void shr_reg64_lt32_low(struct nfp_prog *nfp_prog, u8 dst, u8 src)
2141 {
2142 	emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_imm(0));
2143 	emit_shf_indir(nfp_prog, reg_both(dst), reg_a(dst + 1), SHF_OP_NONE,
2144 		       reg_b(dst), SHF_SC_R_DSHF);
2145 }
2146 
2147 static void shr_reg64_lt32(struct nfp_prog *nfp_prog, u8 dst, u8 src)
2148 {
2149 	shr_reg64_lt32_low(nfp_prog, dst, src);
2150 	shr_reg64_lt32_high(nfp_prog, dst, src);
2151 }
2152 
2153 static void shr_reg64_ge32(struct nfp_prog *nfp_prog, u8 dst, u8 src)
2154 {
2155 	emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_imm(0));
2156 	emit_shf_indir(nfp_prog, reg_both(dst), reg_none(), SHF_OP_NONE,
2157 		       reg_b(dst + 1), SHF_SC_R_SHF);
2158 	wrp_immed(nfp_prog, reg_both(dst + 1), 0);
2159 }
2160 
2161 static int shr_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2162 {
2163 	const struct bpf_insn *insn = &meta->insn;
2164 	u64 umin, umax;
2165 	u8 dst, src;
2166 
2167 	dst = insn->dst_reg * 2;
2168 	umin = meta->umin_src;
2169 	umax = meta->umax_src;
2170 	if (umin == umax)
2171 		return __shr_imm64(nfp_prog, dst, umin);
2172 
2173 	src = insn->src_reg * 2;
2174 	if (umax < 32) {
2175 		shr_reg64_lt32(nfp_prog, dst, src);
2176 	} else if (umin >= 32) {
2177 		shr_reg64_ge32(nfp_prog, dst, src);
2178 	} else {
2179 		/* Generate different instruction sequences depending on runtime
2180 		 * value of shift amount.
2181 		 */
2182 		u16 label_ge32, label_end;
2183 
2184 		label_ge32 = nfp_prog_current_offset(nfp_prog) + 6;
2185 		emit_br_bset(nfp_prog, reg_a(src), 5, label_ge32, 0);
2186 		shr_reg64_lt32_low(nfp_prog, dst, src);
2187 		label_end = nfp_prog_current_offset(nfp_prog) + 6;
2188 		emit_br(nfp_prog, BR_UNC, label_end, 2);
2189 		/* shr_reg64_lt32_high packed in delay slot. */
2190 		shr_reg64_lt32_high(nfp_prog, dst, src);
2191 
2192 		if (!nfp_prog_confirm_current_offset(nfp_prog, label_ge32))
2193 			return -EINVAL;
2194 		shr_reg64_ge32(nfp_prog, dst, src);
2195 
2196 		if (!nfp_prog_confirm_current_offset(nfp_prog, label_end))
2197 			return -EINVAL;
2198 	}
2199 
2200 	return 0;
2201 }
2202 
/* Code logic is the same as __shr_imm64 except that ashr requires the
 * signedness bit to be conveyed through the PREV_ALU result.
 */
2206 static int __ashr_imm64(struct nfp_prog *nfp_prog, u8 dst, u8 shift_amt)
2207 {
2208 	if (shift_amt < 32) {
2209 		emit_shf(nfp_prog, reg_both(dst), reg_a(dst + 1), SHF_OP_NONE,
2210 			 reg_b(dst), SHF_SC_R_DSHF, shift_amt);
2211 		/* Set signedness bit. */
2212 		emit_alu(nfp_prog, reg_none(), reg_a(dst + 1), ALU_OP_OR,
2213 			 reg_imm(0));
2214 		emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR,
2215 			 reg_b(dst + 1), SHF_SC_R_SHF, shift_amt);
2216 	} else if (shift_amt == 32) {
		/* NOTE: this also helps set the signedness bit. */
2218 		wrp_reg_mov(nfp_prog, dst, dst + 1);
2219 		emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR,
2220 			 reg_b(dst + 1), SHF_SC_R_SHF, 31);
2221 	} else if (shift_amt > 32) {
2222 		emit_alu(nfp_prog, reg_none(), reg_a(dst + 1), ALU_OP_OR,
2223 			 reg_imm(0));
2224 		emit_shf(nfp_prog, reg_both(dst), reg_none(), SHF_OP_ASHR,
2225 			 reg_b(dst + 1), SHF_SC_R_SHF, shift_amt - 32);
2226 		emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR,
2227 			 reg_b(dst + 1), SHF_SC_R_SHF, 31);
2228 	}
2229 
2230 	return 0;
2231 }
2232 
2233 static int ashr_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2234 {
2235 	const struct bpf_insn *insn = &meta->insn;
2236 	u8 dst = insn->dst_reg * 2;
2237 
2238 	return __ashr_imm64(nfp_prog, dst, insn->imm);
2239 }
2240 
2241 static void ashr_reg64_lt32_high(struct nfp_prog *nfp_prog, u8 dst, u8 src)
2242 {
2243 	/* NOTE: the first insn will set both indirect shift amount (source A)
2244 	 * and signedness bit (MSB of result).
2245 	 */
2246 	emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_b(dst + 1));
2247 	emit_shf_indir(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR,
2248 		       reg_b(dst + 1), SHF_SC_R_SHF);
2249 }
2250 
2251 static void ashr_reg64_lt32_low(struct nfp_prog *nfp_prog, u8 dst, u8 src)
2252 {
	/* NOTE: this is the same as the logical shift because we don't need to
	 * shift in the signedness bit when the shift amount is less than 32.
	 */
2256 	return shr_reg64_lt32_low(nfp_prog, dst, src);
2257 }
2258 
2259 static void ashr_reg64_lt32(struct nfp_prog *nfp_prog, u8 dst, u8 src)
2260 {
2261 	ashr_reg64_lt32_low(nfp_prog, dst, src);
2262 	ashr_reg64_lt32_high(nfp_prog, dst, src);
2263 }
2264 
2265 static void ashr_reg64_ge32(struct nfp_prog *nfp_prog, u8 dst, u8 src)
2266 {
2267 	emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_b(dst + 1));
2268 	emit_shf_indir(nfp_prog, reg_both(dst), reg_none(), SHF_OP_ASHR,
2269 		       reg_b(dst + 1), SHF_SC_R_SHF);
2270 	emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR,
2271 		 reg_b(dst + 1), SHF_SC_R_SHF, 31);
2272 }
2273 
2274 /* Like ashr_imm64, but need to use indirect shift. */
2275 static int ashr_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2276 {
2277 	const struct bpf_insn *insn = &meta->insn;
2278 	u64 umin, umax;
2279 	u8 dst, src;
2280 
2281 	dst = insn->dst_reg * 2;
2282 	umin = meta->umin_src;
2283 	umax = meta->umax_src;
2284 	if (umin == umax)
2285 		return __ashr_imm64(nfp_prog, dst, umin);
2286 
2287 	src = insn->src_reg * 2;
2288 	if (umax < 32) {
2289 		ashr_reg64_lt32(nfp_prog, dst, src);
2290 	} else if (umin >= 32) {
2291 		ashr_reg64_ge32(nfp_prog, dst, src);
2292 	} else {
2293 		u16 label_ge32, label_end;
2294 
2295 		label_ge32 = nfp_prog_current_offset(nfp_prog) + 6;
2296 		emit_br_bset(nfp_prog, reg_a(src), 5, label_ge32, 0);
2297 		ashr_reg64_lt32_low(nfp_prog, dst, src);
2298 		label_end = nfp_prog_current_offset(nfp_prog) + 6;
2299 		emit_br(nfp_prog, BR_UNC, label_end, 2);
2300 		/* ashr_reg64_lt32_high packed in delay slot. */
2301 		ashr_reg64_lt32_high(nfp_prog, dst, src);
2302 
2303 		if (!nfp_prog_confirm_current_offset(nfp_prog, label_ge32))
2304 			return -EINVAL;
2305 		ashr_reg64_ge32(nfp_prog, dst, src);
2306 
2307 		if (!nfp_prog_confirm_current_offset(nfp_prog, label_end))
2308 			return -EINVAL;
2309 	}
2310 
2311 	return 0;
2312 }
2313 
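/* 32-bit ALU callbacks operate on the low GPR only and clear the high GPR,
 * matching the eBPF requirement that 32-bit operations zero the upper half
 * of the destination register.
 */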
2314 static int mov_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2315 {
2316 	const struct bpf_insn *insn = &meta->insn;
2317 
2318 	wrp_reg_mov(nfp_prog, insn->dst_reg * 2,  insn->src_reg * 2);
2319 	wrp_immed(nfp_prog, reg_both(insn->dst_reg * 2 + 1), 0);
2320 
2321 	return 0;
2322 }
2323 
2324 static int mov_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2325 {
2326 	const struct bpf_insn *insn = &meta->insn;
2327 
2328 	wrp_immed(nfp_prog, reg_both(insn->dst_reg * 2), insn->imm);
2329 	wrp_immed(nfp_prog, reg_both(insn->dst_reg * 2 + 1), 0);
2330 
2331 	return 0;
2332 }
2333 
2334 static int xor_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2335 {
2336 	return wrp_alu32_reg(nfp_prog, meta, ALU_OP_XOR);
2337 }
2338 
2339 static int xor_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2340 {
2341 	return wrp_alu32_imm(nfp_prog, meta, ALU_OP_XOR, !~meta->insn.imm);
2342 }
2343 
2344 static int and_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2345 {
2346 	return wrp_alu32_reg(nfp_prog, meta, ALU_OP_AND);
2347 }
2348 
2349 static int and_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2350 {
2351 	return wrp_alu32_imm(nfp_prog, meta, ALU_OP_AND, !~meta->insn.imm);
2352 }
2353 
2354 static int or_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2355 {
2356 	return wrp_alu32_reg(nfp_prog, meta, ALU_OP_OR);
2357 }
2358 
2359 static int or_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2360 {
2361 	return wrp_alu32_imm(nfp_prog, meta, ALU_OP_OR, !meta->insn.imm);
2362 }
2363 
2364 static int add_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2365 {
2366 	return wrp_alu32_reg(nfp_prog, meta, ALU_OP_ADD);
2367 }
2368 
2369 static int add_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2370 {
2371 	return wrp_alu32_imm(nfp_prog, meta, ALU_OP_ADD, !meta->insn.imm);
2372 }
2373 
2374 static int sub_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2375 {
2376 	return wrp_alu32_reg(nfp_prog, meta, ALU_OP_SUB);
2377 }
2378 
2379 static int sub_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2380 {
2381 	return wrp_alu32_imm(nfp_prog, meta, ALU_OP_SUB, !meta->insn.imm);
2382 }
2383 
2384 static int mul_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2385 {
2386 	return wrp_mul(nfp_prog, meta, false, true);
2387 }
2388 
2389 static int mul_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2390 {
2391 	return wrp_mul(nfp_prog, meta, false, false);
2392 }
2393 
2394 static int div_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2395 {
2396 	return div_reg64(nfp_prog, meta);
2397 }
2398 
2399 static int div_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2400 {
2401 	return div_imm64(nfp_prog, meta);
2402 }
2403 
2404 static int neg_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2405 {
2406 	u8 dst = meta->insn.dst_reg * 2;
2407 
2408 	emit_alu(nfp_prog, reg_both(dst), reg_imm(0), ALU_OP_SUB, reg_b(dst));
2409 	wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1), 0);
2410 
2411 	return 0;
2412 }
2413 
2414 static int shl_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2415 {
2416 	const struct bpf_insn *insn = &meta->insn;
2417 
2418 	if (!insn->imm)
2419 		return 1; /* TODO: zero shift means indirect */
2420 
2421 	emit_shf(nfp_prog, reg_both(insn->dst_reg * 2),
2422 		 reg_none(), SHF_OP_NONE, reg_b(insn->dst_reg * 2),
2423 		 SHF_SC_L_SHF, insn->imm);
2424 	wrp_immed(nfp_prog, reg_both(insn->dst_reg * 2 + 1), 0);
2425 
2426 	return 0;
2427 }
2428 
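/* Endianness conversion (BPF_END).  The 16 and 32 bit swaps clear the high
 * word of the destination; the 64 bit swap byte-swaps each 32-bit half and
 * exchanges them.
 */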
2429 static int end_reg32(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2430 {
2431 	const struct bpf_insn *insn = &meta->insn;
2432 	u8 gpr = insn->dst_reg * 2;
2433 
2434 	switch (insn->imm) {
2435 	case 16:
2436 		emit_ld_field(nfp_prog, reg_both(gpr), 0x9, reg_b(gpr),
2437 			      SHF_SC_R_ROT, 8);
2438 		emit_ld_field(nfp_prog, reg_both(gpr), 0xe, reg_a(gpr),
2439 			      SHF_SC_R_SHF, 16);
2440 
2441 		wrp_immed(nfp_prog, reg_both(gpr + 1), 0);
2442 		break;
2443 	case 32:
2444 		wrp_end32(nfp_prog, reg_a(gpr), gpr);
2445 		wrp_immed(nfp_prog, reg_both(gpr + 1), 0);
2446 		break;
2447 	case 64:
2448 		wrp_mov(nfp_prog, imm_a(nfp_prog), reg_b(gpr + 1));
2449 
2450 		wrp_end32(nfp_prog, reg_a(gpr), gpr + 1);
2451 		wrp_end32(nfp_prog, imm_a(nfp_prog), gpr);
2452 		break;
2453 	}
2454 
2455 	return 0;
2456 }
2457 
2458 static int imm_ld8_part2(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2459 {
2460 	struct nfp_insn_meta *prev = nfp_meta_prev(meta);
2461 	u32 imm_lo, imm_hi;
2462 	u8 dst;
2463 
2464 	dst = prev->insn.dst_reg * 2;
2465 	imm_lo = prev->insn.imm;
2466 	imm_hi = meta->insn.imm;
2467 
2468 	wrp_immed(nfp_prog, reg_both(dst), imm_lo);
2469 
2470 	/* mov is always 1 insn, load imm may be two, so try to use mov */
2471 	if (imm_hi == imm_lo)
2472 		wrp_mov(nfp_prog, reg_both(dst + 1), reg_a(dst));
2473 	else
2474 		wrp_immed(nfp_prog, reg_both(dst + 1), imm_hi);
2475 
2476 	return 0;
2477 }
2478 
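/* BPF_LD | BPF_IMM | BPF_DW spans two BPF instructions.  The first half only
 * registers imm_ld8_part2() as the double_cb callback; the second half then
 * emits both 32-bit halves of the 64-bit immediate.
 */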
2479 static int imm_ld8(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2480 {
2481 	meta->double_cb = imm_ld8_part2;
2482 	return 0;
2483 }
2484 
2485 static int data_ld1(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2486 {
2487 	return construct_data_ld(nfp_prog, meta->insn.imm, 1);
2488 }
2489 
2490 static int data_ld2(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2491 {
2492 	return construct_data_ld(nfp_prog, meta->insn.imm, 2);
2493 }
2494 
2495 static int data_ld4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2496 {
2497 	return construct_data_ld(nfp_prog, meta->insn.imm, 4);
2498 }
2499 
2500 static int data_ind_ld1(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2501 {
2502 	return construct_data_ind_ld(nfp_prog, meta->insn.imm,
2503 				     meta->insn.src_reg * 2, 1);
2504 }
2505 
2506 static int data_ind_ld2(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2507 {
2508 	return construct_data_ind_ld(nfp_prog, meta->insn.imm,
2509 				     meta->insn.src_reg * 2, 2);
2510 }
2511 
2512 static int data_ind_ld4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2513 {
2514 	return construct_data_ind_ld(nfp_prog, meta->insn.imm,
2515 				     meta->insn.src_reg * 2, 4);
2516 }
2517 
2518 static int
2519 mem_ldx_stack(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2520 	      unsigned int size, unsigned int ptr_off)
2521 {
2522 	return mem_op_stack(nfp_prog, meta, size, ptr_off,
2523 			    meta->insn.dst_reg * 2, meta->insn.src_reg * 2,
2524 			    true, wrp_lmem_load);
2525 }
2526 
2527 static int mem_ldx_skb(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2528 		       u8 size)
2529 {
2530 	swreg dst = reg_both(meta->insn.dst_reg * 2);
2531 
2532 	switch (meta->insn.off) {
2533 	case offsetof(struct __sk_buff, len):
2534 		if (size != FIELD_SIZEOF(struct __sk_buff, len))
2535 			return -EOPNOTSUPP;
2536 		wrp_mov(nfp_prog, dst, plen_reg(nfp_prog));
2537 		break;
2538 	case offsetof(struct __sk_buff, data):
2539 		if (size != FIELD_SIZEOF(struct __sk_buff, data))
2540 			return -EOPNOTSUPP;
2541 		wrp_mov(nfp_prog, dst, pptr_reg(nfp_prog));
2542 		break;
2543 	case offsetof(struct __sk_buff, data_end):
2544 		if (size != FIELD_SIZEOF(struct __sk_buff, data_end))
2545 			return -EOPNOTSUPP;
2546 		emit_alu(nfp_prog, dst,
2547 			 plen_reg(nfp_prog), ALU_OP_ADD, pptr_reg(nfp_prog));
2548 		break;
2549 	default:
2550 		return -EOPNOTSUPP;
2551 	}
2552 
2553 	wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1), 0);
2554 
2555 	return 0;
2556 }
2557 
2558 static int mem_ldx_xdp(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2559 		       u8 size)
2560 {
2561 	swreg dst = reg_both(meta->insn.dst_reg * 2);
2562 
2563 	switch (meta->insn.off) {
2564 	case offsetof(struct xdp_md, data):
2565 		if (size != FIELD_SIZEOF(struct xdp_md, data))
2566 			return -EOPNOTSUPP;
2567 		wrp_mov(nfp_prog, dst, pptr_reg(nfp_prog));
2568 		break;
2569 	case offsetof(struct xdp_md, data_end):
2570 		if (size != FIELD_SIZEOF(struct xdp_md, data_end))
2571 			return -EOPNOTSUPP;
2572 		emit_alu(nfp_prog, dst,
2573 			 plen_reg(nfp_prog), ALU_OP_ADD, pptr_reg(nfp_prog));
2574 		break;
2575 	default:
2576 		return -EOPNOTSUPP;
2577 	}
2578 
2579 	wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1), 0);
2580 
2581 	return 0;
2582 }
2583 
2584 static int
2585 mem_ldx_data(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2586 	     unsigned int size)
2587 {
2588 	swreg tmp_reg;
2589 
2590 	tmp_reg = re_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog));
2591 
2592 	return data_ld_host_order_addr32(nfp_prog, meta->insn.src_reg * 2,
2593 					 tmp_reg, meta->insn.dst_reg * 2, size);
2594 }
2595 
2596 static int
2597 mem_ldx_emem(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2598 	     unsigned int size)
2599 {
2600 	swreg tmp_reg;
2601 
2602 	tmp_reg = re_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog));
2603 
2604 	return data_ld_host_order_addr40(nfp_prog, meta->insn.src_reg * 2,
2605 					 tmp_reg, meta->insn.dst_reg * 2, size);
2606 }
2607 
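/* Packet cache: pre-read a byte range of the packet into transfer-in
 * registers with a single memory command, so that subsequent loads falling
 * into the range are served from xfer registers.  Ranges longer than
 * 8 transfer registers require the indirect form of the command.
 */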
2608 static void
2609 mem_ldx_data_init_pktcache(struct nfp_prog *nfp_prog,
2610 			   struct nfp_insn_meta *meta)
2611 {
2612 	s16 range_start = meta->pkt_cache.range_start;
2613 	s16 range_end = meta->pkt_cache.range_end;
2614 	swreg src_base, off;
2615 	u8 xfer_num, len;
2616 	bool indir;
2617 
2618 	off = re_load_imm_any(nfp_prog, range_start, imm_b(nfp_prog));
2619 	src_base = reg_a(meta->insn.src_reg * 2);
2620 	len = range_end - range_start;
2621 	xfer_num = round_up(len, REG_WIDTH) / REG_WIDTH;
2622 
2623 	indir = len > 8 * REG_WIDTH;
2624 	/* Setup PREV_ALU for indirect mode. */
2625 	if (indir)
2626 		wrp_immed(nfp_prog, reg_none(),
2627 			  CMD_OVE_LEN | FIELD_PREP(CMD_OV_LEN, xfer_num - 1));
2628 
2629 	/* Cache memory into transfer-in registers. */
2630 	emit_cmd_any(nfp_prog, CMD_TGT_READ32_SWAP, CMD_MODE_32b, 0, src_base,
2631 		     off, xfer_num - 1, CMD_CTX_SWAP, indir);
2632 }
2633 
2634 static int
2635 mem_ldx_data_from_pktcache_unaligned(struct nfp_prog *nfp_prog,
2636 				     struct nfp_insn_meta *meta,
2637 				     unsigned int size)
2638 {
2639 	s16 range_start = meta->pkt_cache.range_start;
2640 	s16 insn_off = meta->insn.off - range_start;
2641 	swreg dst_lo, dst_hi, src_lo, src_mid;
2642 	u8 dst_gpr = meta->insn.dst_reg * 2;
2643 	u8 len_lo = size, len_mid = 0;
2644 	u8 idx = insn_off / REG_WIDTH;
2645 	u8 off = insn_off % REG_WIDTH;
2646 
2647 	dst_hi = reg_both(dst_gpr + 1);
2648 	dst_lo = reg_both(dst_gpr);
2649 	src_lo = reg_xfer(idx);
2650 
2651 	/* The read length could involve as many as three registers. */
2652 	if (size > REG_WIDTH - off) {
2653 		/* Calculate the part in the second register. */
2654 		len_lo = REG_WIDTH - off;
2655 		len_mid = size - len_lo;
2656 
2657 		/* Calculate the part in the third register. */
2658 		if (size > 2 * REG_WIDTH - off)
2659 			len_mid = REG_WIDTH;
2660 	}
2661 
2662 	wrp_reg_subpart(nfp_prog, dst_lo, src_lo, len_lo, off);
2663 
2664 	if (!len_mid) {
2665 		wrp_immed(nfp_prog, dst_hi, 0);
2666 		return 0;
2667 	}
2668 
2669 	src_mid = reg_xfer(idx + 1);
2670 
2671 	if (size <= REG_WIDTH) {
2672 		wrp_reg_or_subpart(nfp_prog, dst_lo, src_mid, len_mid, len_lo);
2673 		wrp_immed(nfp_prog, dst_hi, 0);
2674 	} else {
2675 		swreg src_hi = reg_xfer(idx + 2);
2676 
2677 		wrp_reg_or_subpart(nfp_prog, dst_lo, src_mid,
2678 				   REG_WIDTH - len_lo, len_lo);
2679 		wrp_reg_subpart(nfp_prog, dst_hi, src_mid, len_lo,
2680 				REG_WIDTH - len_lo);
2681 		wrp_reg_or_subpart(nfp_prog, dst_hi, src_hi, REG_WIDTH - len_lo,
2682 				   len_lo);
2683 	}
2684 
2685 	return 0;
2686 }
2687 
2688 static int
2689 mem_ldx_data_from_pktcache_aligned(struct nfp_prog *nfp_prog,
2690 				   struct nfp_insn_meta *meta,
2691 				   unsigned int size)
2692 {
2693 	swreg dst_lo, dst_hi, src_lo;
2694 	u8 dst_gpr, idx;
2695 
2696 	idx = (meta->insn.off - meta->pkt_cache.range_start) / REG_WIDTH;
2697 	dst_gpr = meta->insn.dst_reg * 2;
2698 	dst_hi = reg_both(dst_gpr + 1);
2699 	dst_lo = reg_both(dst_gpr);
2700 	src_lo = reg_xfer(idx);
2701 
2702 	if (size < REG_WIDTH) {
2703 		wrp_reg_subpart(nfp_prog, dst_lo, src_lo, size, 0);
2704 		wrp_immed(nfp_prog, dst_hi, 0);
2705 	} else if (size == REG_WIDTH) {
2706 		wrp_mov(nfp_prog, dst_lo, src_lo);
2707 		wrp_immed(nfp_prog, dst_hi, 0);
2708 	} else {
2709 		swreg src_hi = reg_xfer(idx + 1);
2710 
2711 		wrp_mov(nfp_prog, dst_lo, src_lo);
2712 		wrp_mov(nfp_prog, dst_hi, src_hi);
2713 	}
2714 
2715 	return 0;
2716 }
2717 
2718 static int
2719 mem_ldx_data_from_pktcache(struct nfp_prog *nfp_prog,
2720 			   struct nfp_insn_meta *meta, unsigned int size)
2721 {
2722 	u8 off = meta->insn.off - meta->pkt_cache.range_start;
2723 
2724 	if (IS_ALIGNED(off, REG_WIDTH))
2725 		return mem_ldx_data_from_pktcache_aligned(nfp_prog, meta, size);
2726 
2727 	return mem_ldx_data_from_pktcache_unaligned(nfp_prog, meta, size);
2728 }
2729 
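/* Dispatch LDX by the verifier's pointer type: context loads resolve to the
 * registers holding the corresponding fields, packet loads go to memory or
 * the packet cache, stack loads use local memory, and map value loads use
 * the 40-bit address path.
 */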
2730 static int
2731 mem_ldx(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2732 	unsigned int size)
2733 {
2734 	if (meta->ldst_gather_len)
2735 		return nfp_cpp_memcpy(nfp_prog, meta);
2736 
2737 	if (meta->ptr.type == PTR_TO_CTX) {
2738 		if (nfp_prog->type == BPF_PROG_TYPE_XDP)
2739 			return mem_ldx_xdp(nfp_prog, meta, size);
2740 		else
2741 			return mem_ldx_skb(nfp_prog, meta, size);
2742 	}
2743 
2744 	if (meta->ptr.type == PTR_TO_PACKET) {
2745 		if (meta->pkt_cache.range_end) {
2746 			if (meta->pkt_cache.do_init)
2747 				mem_ldx_data_init_pktcache(nfp_prog, meta);
2748 
2749 			return mem_ldx_data_from_pktcache(nfp_prog, meta, size);
2750 		} else {
2751 			return mem_ldx_data(nfp_prog, meta, size);
2752 		}
2753 	}
2754 
2755 	if (meta->ptr.type == PTR_TO_STACK)
2756 		return mem_ldx_stack(nfp_prog, meta, size,
2757 				     meta->ptr.off + meta->ptr.var_off.value);
2758 
2759 	if (meta->ptr.type == PTR_TO_MAP_VALUE)
2760 		return mem_ldx_emem(nfp_prog, meta, size);
2761 
2762 	return -EOPNOTSUPP;
2763 }
2764 
2765 static int mem_ldx1(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2766 {
2767 	return mem_ldx(nfp_prog, meta, 1);
2768 }
2769 
2770 static int mem_ldx2(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2771 {
2772 	return mem_ldx(nfp_prog, meta, 2);
2773 }
2774 
2775 static int mem_ldx4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2776 {
2777 	return mem_ldx(nfp_prog, meta, 4);
2778 }
2779 
2780 static int mem_ldx8(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2781 {
2782 	return mem_ldx(nfp_prog, meta, 8);
2783 }
2784 
2785 static int
2786 mem_st_data(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2787 	    unsigned int size)
2788 {
2789 	u64 imm = meta->insn.imm; /* sign extend */
2790 	swreg off_reg;
2791 
2792 	off_reg = re_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog));
2793 
2794 	return data_st_host_order(nfp_prog, meta->insn.dst_reg * 2, off_reg,
2795 				  imm, size);
2796 }
2797 
2798 static int mem_st(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2799 		  unsigned int size)
2800 {
2801 	if (meta->ptr.type == PTR_TO_PACKET)
2802 		return mem_st_data(nfp_prog, meta, size);
2803 
2804 	return -EOPNOTSUPP;
2805 }
2806 
2807 static int mem_st1(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2808 {
2809 	return mem_st(nfp_prog, meta, 1);
2810 }
2811 
2812 static int mem_st2(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2813 {
2814 	return mem_st(nfp_prog, meta, 2);
2815 }
2816 
2817 static int mem_st4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2818 {
2819 	return mem_st(nfp_prog, meta, 4);
2820 }
2821 
2822 static int mem_st8(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2823 {
2824 	return mem_st(nfp_prog, meta, 8);
2825 }
2826 
2827 static int
2828 mem_stx_data(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2829 	     unsigned int size)
2830 {
2831 	swreg off_reg;
2832 
2833 	off_reg = re_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog));
2834 
2835 	return data_stx_host_order(nfp_prog, meta->insn.dst_reg * 2, off_reg,
2836 				   meta->insn.src_reg * 2, size);
2837 }
2838 
2839 static int
2840 mem_stx_stack(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2841 	      unsigned int size, unsigned int ptr_off)
2842 {
2843 	return mem_op_stack(nfp_prog, meta, size, ptr_off,
2844 			    meta->insn.src_reg * 2, meta->insn.dst_reg * 2,
2845 			    false, wrp_lmem_store);
2846 }
2847 
2848 static int mem_stx_xdp(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2849 {
2850 	switch (meta->insn.off) {
2851 	case offsetof(struct xdp_md, rx_queue_index):
2852 		return nfp_queue_select(nfp_prog, meta);
2853 	}
2854 
2855 	WARN_ON_ONCE(1); /* verifier should have rejected bad accesses */
2856 	return -EOPNOTSUPP;
2857 }
2858 
2859 static int
2860 mem_stx(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2861 	unsigned int size)
2862 {
2863 	if (meta->ptr.type == PTR_TO_PACKET)
2864 		return mem_stx_data(nfp_prog, meta, size);
2865 
2866 	if (meta->ptr.type == PTR_TO_STACK)
2867 		return mem_stx_stack(nfp_prog, meta, size,
2868 				     meta->ptr.off + meta->ptr.var_off.value);
2869 
2870 	return -EOPNOTSUPP;
2871 }
2872 
2873 static int mem_stx1(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2874 {
2875 	return mem_stx(nfp_prog, meta, 1);
2876 }
2877 
2878 static int mem_stx2(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2879 {
2880 	return mem_stx(nfp_prog, meta, 2);
2881 }
2882 
2883 static int mem_stx4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2884 {
2885 	if (meta->ptr.type == PTR_TO_CTX)
2886 		if (nfp_prog->type == BPF_PROG_TYPE_XDP)
2887 			return mem_stx_xdp(nfp_prog, meta);
2888 	return mem_stx(nfp_prog, meta, 4);
2889 }
2890 
2891 static int mem_stx8(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2892 {
2893 	return mem_stx(nfp_prog, meta, 8);
2894 }
2895 
2896 static int
2897 mem_xadd(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, bool is64)
2898 {
2899 	u8 dst_gpr = meta->insn.dst_reg * 2;
2900 	u8 src_gpr = meta->insn.src_reg * 2;
2901 	unsigned int full_add, out;
2902 	swreg addra, addrb, off;
2903 
2904 	off = ur_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog));
2905 
	/* We can fit 16 bits into the command immediate.  If we know the
	 * immediate is guaranteed to either always or never fit into 16 bits
	 * we only generate code to handle that particular case, otherwise we
	 * generate code for both.
	 */
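	/* "full_add" is the offset where the full-width add sequence starts
	 * and "out" the first instruction after the whole construct; both are
	 * computed up front from the sizes of the optional pieces below.
	 */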
2911 	out = nfp_prog_current_offset(nfp_prog);
2912 	full_add = nfp_prog_current_offset(nfp_prog);
2913 
2914 	if (meta->insn.off) {
2915 		out += 2;
2916 		full_add += 2;
2917 	}
2918 	if (meta->xadd_maybe_16bit) {
2919 		out += 3;
2920 		full_add += 3;
2921 	}
2922 	if (meta->xadd_over_16bit)
2923 		out += 2 + is64;
2924 	if (meta->xadd_maybe_16bit && meta->xadd_over_16bit) {
2925 		out += 5;
2926 		full_add += 5;
2927 	}
2928 
2929 	/* Generate the branch for choosing add_imm vs add */
2930 	if (meta->xadd_maybe_16bit && meta->xadd_over_16bit) {
2931 		swreg max_imm = imm_a(nfp_prog);
2932 
2933 		wrp_immed(nfp_prog, max_imm, 0xffff);
2934 		emit_alu(nfp_prog, reg_none(),
2935 			 max_imm, ALU_OP_SUB, reg_b(src_gpr));
2936 		emit_alu(nfp_prog, reg_none(),
2937 			 reg_imm(0), ALU_OP_SUB_C, reg_b(src_gpr + 1));
2938 		emit_br(nfp_prog, BR_BLO, full_add, meta->insn.off ? 2 : 0);
2939 		/* defer for add */
2940 	}
2941 
	/* If the insn has an offset, add it to the address */
2943 	if (!meta->insn.off) {
2944 		addra = reg_a(dst_gpr);
2945 		addrb = reg_b(dst_gpr + 1);
2946 	} else {
2947 		emit_alu(nfp_prog, imma_a(nfp_prog),
2948 			 reg_a(dst_gpr), ALU_OP_ADD, off);
2949 		emit_alu(nfp_prog, imma_b(nfp_prog),
2950 			 reg_a(dst_gpr + 1), ALU_OP_ADD_C, reg_imm(0));
2951 		addra = imma_a(nfp_prog);
2952 		addrb = imma_b(nfp_prog);
2953 	}
2954 
2955 	/* Generate the add_imm if 16 bits are possible */
2956 	if (meta->xadd_maybe_16bit) {
2957 		swreg prev_alu = imm_a(nfp_prog);
2958 
2959 		wrp_immed(nfp_prog, prev_alu,
2960 			  FIELD_PREP(CMD_OVE_DATA, 2) |
2961 			  CMD_OVE_LEN |
2962 			  FIELD_PREP(CMD_OV_LEN, 0x8 | is64 << 2));
2963 		wrp_reg_or_subpart(nfp_prog, prev_alu, reg_b(src_gpr), 2, 2);
2964 		emit_cmd_indir(nfp_prog, CMD_TGT_ADD_IMM, CMD_MODE_40b_BA, 0,
2965 			       addra, addrb, 0, CMD_CTX_NO_SWAP);
2966 
2967 		if (meta->xadd_over_16bit)
2968 			emit_br(nfp_prog, BR_UNC, out, 0);
2969 	}
2970 
2971 	if (!nfp_prog_confirm_current_offset(nfp_prog, full_add))
2972 		return -EINVAL;
2973 
2974 	/* Generate the add if 16 bits are not guaranteed */
2975 	if (meta->xadd_over_16bit) {
2976 		emit_cmd(nfp_prog, CMD_TGT_ADD, CMD_MODE_40b_BA, 0,
2977 			 addra, addrb, is64 << 2,
2978 			 is64 ? CMD_CTX_SWAP_DEFER2 : CMD_CTX_SWAP_DEFER1);
2979 
2980 		wrp_mov(nfp_prog, reg_xfer(0), reg_a(src_gpr));
2981 		if (is64)
2982 			wrp_mov(nfp_prog, reg_xfer(1), reg_a(src_gpr + 1));
2983 	}
2984 
2985 	if (!nfp_prog_confirm_current_offset(nfp_prog, out))
2986 		return -EINVAL;
2987 
2988 	return 0;
2989 }
2990 
2991 static int mem_xadd4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2992 {
2993 	return mem_xadd(nfp_prog, meta, false);
2994 }
2995 
2996 static int mem_xadd8(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2997 {
2998 	return mem_xadd(nfp_prog, meta, true);
2999 }
3000 
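/* Jump callbacks emit branches whose target field still holds the BPF
 * instruction offset; nfp_fixup_branches() rewrites it to the NFP offset of
 * the destination once the whole program has been translated.
 */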
3001 static int jump(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3002 {
3003 	emit_br(nfp_prog, BR_UNC, meta->insn.off, 0);
3004 
3005 	return 0;
3006 }
3007 
3008 static int jeq_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3009 {
3010 	const struct bpf_insn *insn = &meta->insn;
3011 	u64 imm = insn->imm; /* sign extend */
3012 	swreg or1, or2, tmp_reg;
3013 
3014 	or1 = reg_a(insn->dst_reg * 2);
3015 	or2 = reg_b(insn->dst_reg * 2 + 1);
3016 
3017 	if (imm & ~0U) {
3018 		tmp_reg = ur_load_imm_any(nfp_prog, imm & ~0U, imm_b(nfp_prog));
3019 		emit_alu(nfp_prog, imm_a(nfp_prog),
3020 			 reg_a(insn->dst_reg * 2), ALU_OP_XOR, tmp_reg);
3021 		or1 = imm_a(nfp_prog);
3022 	}
3023 
3024 	if (imm >> 32) {
3025 		tmp_reg = ur_load_imm_any(nfp_prog, imm >> 32, imm_b(nfp_prog));
3026 		emit_alu(nfp_prog, imm_b(nfp_prog),
3027 			 reg_a(insn->dst_reg * 2 + 1), ALU_OP_XOR, tmp_reg);
3028 		or2 = imm_b(nfp_prog);
3029 	}
3030 
3031 	emit_alu(nfp_prog, reg_none(), or1, ALU_OP_OR, or2);
3032 	emit_br(nfp_prog, BR_BEQ, insn->off, 0);
3033 
3034 	return 0;
3035 }
3036 
3037 static int jset_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3038 {
3039 	const struct bpf_insn *insn = &meta->insn;
3040 	u64 imm = insn->imm; /* sign extend */
3041 	swreg tmp_reg;
3042 
3043 	if (!imm) {
3044 		meta->skip = true;
3045 		return 0;
3046 	}
3047 
3048 	if (imm & ~0U) {
3049 		tmp_reg = ur_load_imm_any(nfp_prog, imm & ~0U, imm_b(nfp_prog));
3050 		emit_alu(nfp_prog, reg_none(),
3051 			 reg_a(insn->dst_reg * 2), ALU_OP_AND, tmp_reg);
3052 		emit_br(nfp_prog, BR_BNE, insn->off, 0);
3053 	}
3054 
3055 	if (imm >> 32) {
3056 		tmp_reg = ur_load_imm_any(nfp_prog, imm >> 32, imm_b(nfp_prog));
3057 		emit_alu(nfp_prog, reg_none(),
3058 			 reg_a(insn->dst_reg * 2 + 1), ALU_OP_AND, tmp_reg);
3059 		emit_br(nfp_prog, BR_BNE, insn->off, 0);
3060 	}
3061 
3062 	return 0;
3063 }
3064 
3065 static int jne_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3066 {
3067 	const struct bpf_insn *insn = &meta->insn;
3068 	u64 imm = insn->imm; /* sign extend */
3069 	swreg tmp_reg;
3070 
3071 	if (!imm) {
3072 		emit_alu(nfp_prog, reg_none(), reg_a(insn->dst_reg * 2),
3073 			 ALU_OP_OR, reg_b(insn->dst_reg * 2 + 1));
3074 		emit_br(nfp_prog, BR_BNE, insn->off, 0);
3075 		return 0;
3076 	}
3077 
3078 	tmp_reg = ur_load_imm_any(nfp_prog, imm & ~0U, imm_b(nfp_prog));
3079 	emit_alu(nfp_prog, reg_none(),
3080 		 reg_a(insn->dst_reg * 2), ALU_OP_XOR, tmp_reg);
3081 	emit_br(nfp_prog, BR_BNE, insn->off, 0);
3082 
3083 	tmp_reg = ur_load_imm_any(nfp_prog, imm >> 32, imm_b(nfp_prog));
3084 	emit_alu(nfp_prog, reg_none(),
3085 		 reg_a(insn->dst_reg * 2 + 1), ALU_OP_XOR, tmp_reg);
3086 	emit_br(nfp_prog, BR_BNE, insn->off, 0);
3087 
3088 	return 0;
3089 }
3090 
3091 static int jeq_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3092 {
3093 	const struct bpf_insn *insn = &meta->insn;
3094 
3095 	emit_alu(nfp_prog, imm_a(nfp_prog), reg_a(insn->dst_reg * 2),
3096 		 ALU_OP_XOR, reg_b(insn->src_reg * 2));
3097 	emit_alu(nfp_prog, imm_b(nfp_prog), reg_a(insn->dst_reg * 2 + 1),
3098 		 ALU_OP_XOR, reg_b(insn->src_reg * 2 + 1));
3099 	emit_alu(nfp_prog, reg_none(),
3100 		 imm_a(nfp_prog), ALU_OP_OR, imm_b(nfp_prog));
3101 	emit_br(nfp_prog, BR_BEQ, insn->off, 0);
3102 
3103 	return 0;
3104 }
3105 
3106 static int jset_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3107 {
3108 	return wrp_test_reg(nfp_prog, meta, ALU_OP_AND, BR_BNE);
3109 }
3110 
3111 static int jne_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3112 {
3113 	return wrp_test_reg(nfp_prog, meta, ALU_OP_XOR, BR_BNE);
3114 }
3115 
3116 static int
3117 bpf_to_bpf_call(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3118 {
3119 	u32 ret_tgt, stack_depth, offset_br;
3120 	swreg tmp_reg;
3121 
3122 	stack_depth = round_up(nfp_prog->stack_frame_depth, STACK_FRAME_ALIGN);
3123 	/* Space for saving the return address is accounted for by the callee,
3124 	 * so stack_depth can be zero for the main function.
3125 	 */
3126 	if (stack_depth) {
3127 		tmp_reg = ur_load_imm_any(nfp_prog, stack_depth,
3128 					  stack_imm(nfp_prog));
3129 		emit_alu(nfp_prog, stack_reg(nfp_prog),
3130 			 stack_reg(nfp_prog), ALU_OP_ADD, tmp_reg);
3131 		emit_csr_wr(nfp_prog, stack_reg(nfp_prog),
3132 			    NFP_CSR_ACT_LM_ADDR0);
3133 	}
3134 
3135 	/* The following steps are performed:
3136 	 *     1. Put the start offset of the callee into imm_b(). This will
3137 	 *        require a fixup step, as we do not necessarily know this
3138 	 *        address yet.
3139 	 *     2. Put the return address from the callee to the caller into
3140 	 *        register ret_reg().
3141 	 *     3. (After defer slots are consumed) Jump to the subroutine that
3142 	 *        pushes the registers to the stack.
3143 	 * The subroutine acts as a trampoline, and returns to the address in
3144 	 * imm_b(), i.e. jumps to the callee.
3145 	 *
	 * Using ret_reg() to pass the return address to the callee is a
	 * convention established here. The callee can then push this address
	 * onto its stack frame in its prologue. The advantages of passing the
	 * return address through ret_reg(), instead of pushing it to the stack
	 * right here, are the following:
	 * - It looks cleaner.
	 * - If the called function is called multiple times, we get a lower
	 *   program size.
	 * - We save the two no-op instructions that would otherwise have to be
	 *   added just before the emit_br() when the stack depth is not null.
	 * - If we ever find a register to hold the return address during the
	 *   whole execution of the callee, we will not have to push the return
	 *   address to the stack for leaf functions.
3159 	 */
3160 	ret_tgt = nfp_prog_current_offset(nfp_prog) + 3;
3161 	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 2,
3162 		     RELO_BR_GO_CALL_PUSH_REGS);
3163 	offset_br = nfp_prog_current_offset(nfp_prog);
3164 	wrp_immed_relo(nfp_prog, imm_b(nfp_prog), 0, RELO_IMMED_REL);
3165 	wrp_immed_relo(nfp_prog, ret_reg(nfp_prog), ret_tgt, RELO_IMMED_REL);
3166 
3167 	if (!nfp_prog_confirm_current_offset(nfp_prog, ret_tgt))
3168 		return -EINVAL;
3169 
3170 	if (stack_depth) {
3171 		tmp_reg = ur_load_imm_any(nfp_prog, stack_depth,
3172 					  stack_imm(nfp_prog));
3173 		emit_alu(nfp_prog, stack_reg(nfp_prog),
3174 			 stack_reg(nfp_prog), ALU_OP_SUB, tmp_reg);
3175 		emit_csr_wr(nfp_prog, stack_reg(nfp_prog),
3176 			    NFP_CSR_ACT_LM_ADDR0);
3177 		wrp_nops(nfp_prog, 3);
3178 	}
3179 
3180 	meta->num_insns_after_br = nfp_prog_current_offset(nfp_prog);
3181 	meta->num_insns_after_br -= offset_br;
3182 
3183 	return 0;
3184 }
3185 
3186 static int helper_call(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3187 {
3188 	switch (meta->insn.imm) {
3189 	case BPF_FUNC_xdp_adjust_head:
3190 		return adjust_head(nfp_prog, meta);
3191 	case BPF_FUNC_xdp_adjust_tail:
3192 		return adjust_tail(nfp_prog, meta);
3193 	case BPF_FUNC_map_lookup_elem:
3194 	case BPF_FUNC_map_update_elem:
3195 	case BPF_FUNC_map_delete_elem:
3196 		return map_call_stack_common(nfp_prog, meta);
3197 	case BPF_FUNC_get_prandom_u32:
3198 		return nfp_get_prandom_u32(nfp_prog, meta);
3199 	case BPF_FUNC_perf_event_output:
3200 		return nfp_perf_event_output(nfp_prog, meta);
3201 	default:
3202 		WARN_ONCE(1, "verifier allowed unsupported function\n");
3203 		return -EOPNOTSUPP;
3204 	}
3205 }
3206 
3207 static int call(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3208 {
3209 	if (is_mbpf_pseudo_call(meta))
3210 		return bpf_to_bpf_call(nfp_prog, meta);
3211 	else
3212 		return helper_call(nfp_prog, meta);
3213 }
3214 
3215 static bool nfp_is_main_function(struct nfp_insn_meta *meta)
3216 {
3217 	return meta->subprog_idx == 0;
3218 }
3219 
3220 static int goto_out(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3221 {
3222 	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 0, RELO_BR_GO_OUT);
3223 
3224 	return 0;
3225 }
3226 
3227 static int
3228 nfp_subprog_epilogue(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3229 {
	/* Pop R6~R9 from the stack via the related subroutine.
	 * Pop the return address for the BPF-to-BPF call from the stack and
	 * load it into ret_reg() before we jump. This means that the
	 * subroutine does not come back here; we make it jump back to the
	 * subprogram caller directly!
	 */
3236 	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 1,
3237 		     RELO_BR_GO_CALL_POP_REGS);
3238 	wrp_mov(nfp_prog, ret_reg(nfp_prog), reg_lm(0, 0));
3239 
3240 	return 0;
3241 }
3242 
3243 static int jmp_exit(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3244 {
3245 	if (nfp_is_main_function(meta))
3246 		return goto_out(nfp_prog, meta);
3247 	else
3248 		return nfp_subprog_epilogue(nfp_prog, meta);
3249 }
3250 
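/* Translation table indexed by the raw BPF opcode byte; opcodes without an
 * entry are not supported by this JIT.
 */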
3251 static const instr_cb_t instr_cb[256] = {
3252 	[BPF_ALU64 | BPF_MOV | BPF_X] =	mov_reg64,
3253 	[BPF_ALU64 | BPF_MOV | BPF_K] =	mov_imm64,
3254 	[BPF_ALU64 | BPF_XOR | BPF_X] =	xor_reg64,
3255 	[BPF_ALU64 | BPF_XOR | BPF_K] =	xor_imm64,
3256 	[BPF_ALU64 | BPF_AND | BPF_X] =	and_reg64,
3257 	[BPF_ALU64 | BPF_AND | BPF_K] =	and_imm64,
3258 	[BPF_ALU64 | BPF_OR | BPF_X] =	or_reg64,
3259 	[BPF_ALU64 | BPF_OR | BPF_K] =	or_imm64,
3260 	[BPF_ALU64 | BPF_ADD | BPF_X] =	add_reg64,
3261 	[BPF_ALU64 | BPF_ADD | BPF_K] =	add_imm64,
3262 	[BPF_ALU64 | BPF_SUB | BPF_X] =	sub_reg64,
3263 	[BPF_ALU64 | BPF_SUB | BPF_K] =	sub_imm64,
3264 	[BPF_ALU64 | BPF_MUL | BPF_X] =	mul_reg64,
3265 	[BPF_ALU64 | BPF_MUL | BPF_K] =	mul_imm64,
3266 	[BPF_ALU64 | BPF_DIV | BPF_X] =	div_reg64,
3267 	[BPF_ALU64 | BPF_DIV | BPF_K] =	div_imm64,
3268 	[BPF_ALU64 | BPF_NEG] =		neg_reg64,
3269 	[BPF_ALU64 | BPF_LSH | BPF_X] =	shl_reg64,
3270 	[BPF_ALU64 | BPF_LSH | BPF_K] =	shl_imm64,
3271 	[BPF_ALU64 | BPF_RSH | BPF_X] =	shr_reg64,
3272 	[BPF_ALU64 | BPF_RSH | BPF_K] =	shr_imm64,
3273 	[BPF_ALU64 | BPF_ARSH | BPF_X] = ashr_reg64,
3274 	[BPF_ALU64 | BPF_ARSH | BPF_K] = ashr_imm64,
3275 	[BPF_ALU | BPF_MOV | BPF_X] =	mov_reg,
3276 	[BPF_ALU | BPF_MOV | BPF_K] =	mov_imm,
3277 	[BPF_ALU | BPF_XOR | BPF_X] =	xor_reg,
3278 	[BPF_ALU | BPF_XOR | BPF_K] =	xor_imm,
3279 	[BPF_ALU | BPF_AND | BPF_X] =	and_reg,
3280 	[BPF_ALU | BPF_AND | BPF_K] =	and_imm,
3281 	[BPF_ALU | BPF_OR | BPF_X] =	or_reg,
3282 	[BPF_ALU | BPF_OR | BPF_K] =	or_imm,
3283 	[BPF_ALU | BPF_ADD | BPF_X] =	add_reg,
3284 	[BPF_ALU | BPF_ADD | BPF_K] =	add_imm,
3285 	[BPF_ALU | BPF_SUB | BPF_X] =	sub_reg,
3286 	[BPF_ALU | BPF_SUB | BPF_K] =	sub_imm,
3287 	[BPF_ALU | BPF_MUL | BPF_X] =	mul_reg,
3288 	[BPF_ALU | BPF_MUL | BPF_K] =	mul_imm,
3289 	[BPF_ALU | BPF_DIV | BPF_X] =	div_reg,
3290 	[BPF_ALU | BPF_DIV | BPF_K] =	div_imm,
3291 	[BPF_ALU | BPF_NEG] =		neg_reg,
3292 	[BPF_ALU | BPF_LSH | BPF_K] =	shl_imm,
3293 	[BPF_ALU | BPF_END | BPF_X] =	end_reg32,
3294 	[BPF_LD | BPF_IMM | BPF_DW] =	imm_ld8,
3295 	[BPF_LD | BPF_ABS | BPF_B] =	data_ld1,
3296 	[BPF_LD | BPF_ABS | BPF_H] =	data_ld2,
3297 	[BPF_LD | BPF_ABS | BPF_W] =	data_ld4,
3298 	[BPF_LD | BPF_IND | BPF_B] =	data_ind_ld1,
3299 	[BPF_LD | BPF_IND | BPF_H] =	data_ind_ld2,
3300 	[BPF_LD | BPF_IND | BPF_W] =	data_ind_ld4,
3301 	[BPF_LDX | BPF_MEM | BPF_B] =	mem_ldx1,
3302 	[BPF_LDX | BPF_MEM | BPF_H] =	mem_ldx2,
3303 	[BPF_LDX | BPF_MEM | BPF_W] =	mem_ldx4,
3304 	[BPF_LDX | BPF_MEM | BPF_DW] =	mem_ldx8,
3305 	[BPF_STX | BPF_MEM | BPF_B] =	mem_stx1,
3306 	[BPF_STX | BPF_MEM | BPF_H] =	mem_stx2,
3307 	[BPF_STX | BPF_MEM | BPF_W] =	mem_stx4,
3308 	[BPF_STX | BPF_MEM | BPF_DW] =	mem_stx8,
3309 	[BPF_STX | BPF_XADD | BPF_W] =	mem_xadd4,
3310 	[BPF_STX | BPF_XADD | BPF_DW] =	mem_xadd8,
3311 	[BPF_ST | BPF_MEM | BPF_B] =	mem_st1,
3312 	[BPF_ST | BPF_MEM | BPF_H] =	mem_st2,
3313 	[BPF_ST | BPF_MEM | BPF_W] =	mem_st4,
3314 	[BPF_ST | BPF_MEM | BPF_DW] =	mem_st8,
3315 	[BPF_JMP | BPF_JA | BPF_K] =	jump,
3316 	[BPF_JMP | BPF_JEQ | BPF_K] =	jeq_imm,
3317 	[BPF_JMP | BPF_JGT | BPF_K] =	cmp_imm,
3318 	[BPF_JMP | BPF_JGE | BPF_K] =	cmp_imm,
3319 	[BPF_JMP | BPF_JLT | BPF_K] =	cmp_imm,
3320 	[BPF_JMP | BPF_JLE | BPF_K] =	cmp_imm,
3321 	[BPF_JMP | BPF_JSGT | BPF_K] =  cmp_imm,
3322 	[BPF_JMP | BPF_JSGE | BPF_K] =  cmp_imm,
3323 	[BPF_JMP | BPF_JSLT | BPF_K] =  cmp_imm,
3324 	[BPF_JMP | BPF_JSLE | BPF_K] =  cmp_imm,
3325 	[BPF_JMP | BPF_JSET | BPF_K] =	jset_imm,
3326 	[BPF_JMP | BPF_JNE | BPF_K] =	jne_imm,
3327 	[BPF_JMP | BPF_JEQ | BPF_X] =	jeq_reg,
3328 	[BPF_JMP | BPF_JGT | BPF_X] =	cmp_reg,
3329 	[BPF_JMP | BPF_JGE | BPF_X] =	cmp_reg,
3330 	[BPF_JMP | BPF_JLT | BPF_X] =	cmp_reg,
3331 	[BPF_JMP | BPF_JLE | BPF_X] =	cmp_reg,
3332 	[BPF_JMP | BPF_JSGT | BPF_X] =  cmp_reg,
3333 	[BPF_JMP | BPF_JSGE | BPF_X] =  cmp_reg,
3334 	[BPF_JMP | BPF_JSLT | BPF_X] =  cmp_reg,
3335 	[BPF_JMP | BPF_JSLE | BPF_X] =  cmp_reg,
3336 	[BPF_JMP | BPF_JSET | BPF_X] =	jset_reg,
3337 	[BPF_JMP | BPF_JNE | BPF_X] =	jne_reg,
3338 	[BPF_JMP | BPF_CALL] =		call,
3339 	[BPF_JMP | BPF_EXIT] =		jmp_exit,
3340 };
3341 
3342 /* --- Assembler logic --- */
3343 static int
3344 nfp_fixup_immed_relo(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
3345 		     struct nfp_insn_meta *jmp_dst, u32 br_idx)
3346 {
3347 	if (immed_get_value(nfp_prog->prog[br_idx + 1])) {
3348 		pr_err("BUG: failed to fix up callee register saving\n");
3349 		return -EINVAL;
3350 	}
3351 
3352 	immed_set_value(&nfp_prog->prog[br_idx + 1], jmp_dst->off);
3353 
3354 	return 0;
3355 }
3356 
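/* Walk all translated jump instructions, verify that a branch was emitted
 * where expected, patch the immed word of pseudo calls with the callee start
 * offset and rewrite relative branches to the NFP offset of their
 * destination.
 */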
3357 static int nfp_fixup_branches(struct nfp_prog *nfp_prog)
3358 {
3359 	struct nfp_insn_meta *meta, *jmp_dst;
3360 	u32 idx, br_idx;
3361 	int err;
3362 
3363 	list_for_each_entry(meta, &nfp_prog->insns, l) {
3364 		if (meta->skip)
3365 			continue;
3366 		if (BPF_CLASS(meta->insn.code) != BPF_JMP)
3367 			continue;
3368 		if (meta->insn.code == (BPF_JMP | BPF_EXIT) &&
3369 		    !nfp_is_main_function(meta))
3370 			continue;
3371 		if (is_mbpf_helper_call(meta))
3372 			continue;
3373 
3374 		if (list_is_last(&meta->l, &nfp_prog->insns))
3375 			br_idx = nfp_prog->last_bpf_off;
3376 		else
3377 			br_idx = list_next_entry(meta, l)->off - 1;
3378 
		/* For a BPF-to-BPF function call, a stack adjustment sequence
		 * is generated after the return instruction. Therefore, we
		 * must subtract the length of this sequence to have br_idx
		 * pointing to where the "branch" NFP instruction is expected
		 * to be.
		 */
3384 		if (is_mbpf_pseudo_call(meta))
3385 			br_idx -= meta->num_insns_after_br;
3386 
3387 		if (!nfp_is_br(nfp_prog->prog[br_idx])) {
3388 			pr_err("Fixup found block not ending in branch %d %02x %016llx!!\n",
3389 			       br_idx, meta->insn.code, nfp_prog->prog[br_idx]);
3390 			return -ELOOP;
3391 		}
3392 
3393 		if (meta->insn.code == (BPF_JMP | BPF_EXIT))
3394 			continue;
3395 
3396 		/* Leave special branches for later */
3397 		if (FIELD_GET(OP_RELO_TYPE, nfp_prog->prog[br_idx]) !=
3398 		    RELO_BR_REL && !is_mbpf_pseudo_call(meta))
3399 			continue;
3400 
3401 		if (!meta->jmp_dst) {
3402 			pr_err("Non-exit jump doesn't have destination info recorded!!\n");
3403 			return -ELOOP;
3404 		}
3405 
3406 		jmp_dst = meta->jmp_dst;
3407 
3408 		if (jmp_dst->skip) {
3409 			pr_err("Branch landing on removed instruction!!\n");
3410 			return -ELOOP;
3411 		}
3412 
3413 		if (is_mbpf_pseudo_call(meta)) {
3414 			err = nfp_fixup_immed_relo(nfp_prog, meta,
3415 						   jmp_dst, br_idx);
3416 			if (err)
3417 				return err;
3418 		}
3419 
3420 		if (FIELD_GET(OP_RELO_TYPE, nfp_prog->prog[br_idx]) !=
3421 		    RELO_BR_REL)
3422 			continue;
3423 
3424 		for (idx = meta->off; idx <= br_idx; idx++) {
3425 			if (!nfp_is_br(nfp_prog->prog[idx]))
3426 				continue;
3427 			br_set_offset(&nfp_prog->prog[idx], jmp_dst->off);
3428 		}
3429 	}
3430 
3431 	return 0;
3432 }
3433 
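/* Program intro: extract the packet length (low 14 bits of the packet
 * vector length word) into the statically allocated packet length register.
 */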
3434 static void nfp_intro(struct nfp_prog *nfp_prog)
3435 {
3436 	wrp_immed(nfp_prog, plen_reg(nfp_prog), GENMASK(13, 0));
3437 	emit_alu(nfp_prog, plen_reg(nfp_prog),
3438 		 plen_reg(nfp_prog), ALU_OP_AND, pv_len(nfp_prog));
3439 }
3440 
3441 static void
3442 nfp_subprog_prologue(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3443 {
	/* Save the return address on the stack. */
3445 	wrp_mov(nfp_prog, reg_lm(0, 0), ret_reg(nfp_prog));
3446 }
3447 
3448 static void
3449 nfp_start_subprog(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3450 {
3451 	unsigned int depth = nfp_prog->subprog[meta->subprog_idx].stack_depth;
3452 
3453 	nfp_prog->stack_frame_depth = round_up(depth, 4);
3454 	nfp_subprog_prologue(nfp_prog, meta);
3455 }
3456 
3457 bool nfp_is_subprog_start(struct nfp_insn_meta *meta)
3458 {
3459 	return meta->flags & FLAG_INSN_IS_SUBPROG_START;
3460 }
3461 
3462 static void nfp_outro_tc_da(struct nfp_prog *nfp_prog)
3463 {
3464 	/* TC direct-action mode:
3465 	 *   0,1   ok        NOT SUPPORTED[1]
3466 	 *   2   drop  0x22 -> drop,  count as stat1
3467 	 *   4,5 nuke  0x02 -> drop
3468 	 *   7  redir  0x44 -> redir, count as stat2
3469 	 *   * unspec  0x11 -> pass,  count as stat0
3470 	 *
3471 	 * [1] We can't support OK and RECLASSIFY because we can't tell TC
3472 	 *     the exact decision made.  We are forced to support UNSPEC
3473 	 *     to handle aborts so that's the only one we handle for passing
3474 	 *     packets up the stack.
3475 	 */
3476 	/* Target for aborts */
3477 	nfp_prog->tgt_abort = nfp_prog_current_offset(nfp_prog);
3478 
3479 	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 2, RELO_BR_NEXT_PKT);
3480 
3481 	wrp_mov(nfp_prog, reg_a(0), NFP_BPF_ABI_FLAGS);
3482 	emit_ld_field(nfp_prog, reg_a(0), 0xc, reg_imm(0x11), SHF_SC_L_SHF, 16);
3483 
3484 	/* Target for normal exits */
3485 	nfp_prog->tgt_out = nfp_prog_current_offset(nfp_prog);
3486 
3487 	/* if R0 > 7 jump to abort */
3488 	emit_alu(nfp_prog, reg_none(), reg_imm(7), ALU_OP_SUB, reg_b(0));
3489 	emit_br(nfp_prog, BR_BLO, nfp_prog->tgt_abort, 0);
3490 	wrp_mov(nfp_prog, reg_a(0), NFP_BPF_ABI_FLAGS);
3491 
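	/* A note on the constants: 0x41221211 and 0x41001211 act as
	 * nibble-indexed lookup tables. Nibble R0 of each is extracted below
	 * and the two nibbles are combined into the result byte (0x22 drop,
	 * 0x02 nuke, 0x44 redir, 0x11 pass), matching the mapping above.
	 */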
3492 	wrp_immed(nfp_prog, reg_b(2), 0x41221211);
3493 	wrp_immed(nfp_prog, reg_b(3), 0x41001211);
3494 
3495 	emit_shf(nfp_prog, reg_a(1),
3496 		 reg_none(), SHF_OP_NONE, reg_b(0), SHF_SC_L_SHF, 2);
3497 
3498 	emit_alu(nfp_prog, reg_none(), reg_a(1), ALU_OP_OR, reg_imm(0));
3499 	emit_shf(nfp_prog, reg_a(2),
3500 		 reg_imm(0xf), SHF_OP_AND, reg_b(2), SHF_SC_R_SHF, 0);
3501 
3502 	emit_alu(nfp_prog, reg_none(), reg_a(1), ALU_OP_OR, reg_imm(0));
3503 	emit_shf(nfp_prog, reg_b(2),
3504 		 reg_imm(0xf), SHF_OP_AND, reg_b(3), SHF_SC_R_SHF, 0);
3505 
3506 	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 2, RELO_BR_NEXT_PKT);
3507 
3508 	emit_shf(nfp_prog, reg_b(2),
3509 		 reg_a(2), SHF_OP_OR, reg_b(2), SHF_SC_L_SHF, 4);
3510 	emit_ld_field(nfp_prog, reg_a(0), 0xc, reg_b(2), SHF_SC_L_SHF, 16);
3511 }
3512 
3513 static void nfp_outro_xdp(struct nfp_prog *nfp_prog)
3514 {
3515 	/* XDP return codes:
3516 	 *   0 aborted  0x82 -> drop,  count as stat3
3517 	 *   1    drop  0x22 -> drop,  count as stat1
3518 	 *   2    pass  0x11 -> pass,  count as stat0
3519 	 *   3      tx  0x44 -> redir, count as stat2
3520 	 *   * unknown  0x82 -> drop,  count as stat3
3521 	 */
3522 	/* Target for aborts */
3523 	nfp_prog->tgt_abort = nfp_prog_current_offset(nfp_prog);
3524 
3525 	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 2, RELO_BR_NEXT_PKT);
3526 
3527 	wrp_mov(nfp_prog, reg_a(0), NFP_BPF_ABI_FLAGS);
3528 	emit_ld_field(nfp_prog, reg_a(0), 0xc, reg_imm(0x82), SHF_SC_L_SHF, 16);
3529 
3530 	/* Target for normal exits */
3531 	nfp_prog->tgt_out = nfp_prog_current_offset(nfp_prog);
3532 
3533 	/* if R0 > 3 jump to abort */
3534 	emit_alu(nfp_prog, reg_none(), reg_imm(3), ALU_OP_SUB, reg_b(0));
3535 	emit_br(nfp_prog, BR_BLO, nfp_prog->tgt_abort, 0);
3536 
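	/* A note on the constant: 0x44112282 acts as a byte-indexed lookup
	 * table. Byte R0 of it is extracted below (0x82 abort, 0x22 drop,
	 * 0x11 pass, 0x44 tx), matching the mapping described above.
	 */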
3537 	wrp_immed(nfp_prog, reg_b(2), 0x44112282);
3538 
3539 	emit_shf(nfp_prog, reg_a(1),
3540 		 reg_none(), SHF_OP_NONE, reg_b(0), SHF_SC_L_SHF, 3);
3541 
3542 	emit_alu(nfp_prog, reg_none(), reg_a(1), ALU_OP_OR, reg_imm(0));
3543 	emit_shf(nfp_prog, reg_b(2),
3544 		 reg_imm(0xff), SHF_OP_AND, reg_b(2), SHF_SC_R_SHF, 0);
3545 
3546 	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 2, RELO_BR_NEXT_PKT);
3547 
3548 	wrp_mov(nfp_prog, reg_a(0), NFP_BPF_ABI_FLAGS);
3549 	emit_ld_field(nfp_prog, reg_a(0), 0xc, reg_b(2), SHF_SC_L_SHF, 16);
3550 }
3551 
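/* Stack frame layout in local memory used by the two subroutines below
 * (a sketch; indices are reg_lm(0, n) slots, one 32-bit word each):
 *
 *   slot 0       return address (pushed by bpf_to_bpf_call())
 *   slots 1, 2   BPF R6 low, high word
 *   slots 3, 4   BPF R7
 *   slots 5, 6   BPF R8
 *   slots 7, 8   BPF R9
 */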
3552 static void nfp_push_callee_registers(struct nfp_prog *nfp_prog)
3553 {
3554 	u8 reg;
3555 
3556 	/* Subroutine: Save all callee saved registers (R6 ~ R9).
3557 	 * imm_b() holds the return address.
3558 	 */
3559 	nfp_prog->tgt_call_push_regs = nfp_prog_current_offset(nfp_prog);
3560 	for (reg = BPF_REG_6; reg <= BPF_REG_9; reg++) {
3561 		u8 adj = (reg - BPF_REG_0) * 2;
3562 		u8 idx = (reg - BPF_REG_6) * 2;
3563 
3564 		/* The first slot in the stack frame is used to push the return
3565 		 * address in bpf_to_bpf_call(), start just after.
3566 		 */
3567 		wrp_mov(nfp_prog, reg_lm(0, 1 + idx), reg_b(adj));
3568 
3569 		if (reg == BPF_REG_8)
3570 			/* Prepare to jump back, last 3 insns use defer slots */
3571 			emit_rtn(nfp_prog, imm_b(nfp_prog), 3);
3572 
3573 		wrp_mov(nfp_prog, reg_lm(0, 1 + idx + 1), reg_b(adj + 1));
3574 	}
3575 }
3576 
3577 static void nfp_pop_callee_registers(struct nfp_prog *nfp_prog)
3578 {
3579 	u8 reg;
3580 
3581 	/* Subroutine: Restore all callee saved registers (R6 ~ R9).
3582 	 * ret_reg() holds the return address.
3583 	 */
3584 	nfp_prog->tgt_call_pop_regs = nfp_prog_current_offset(nfp_prog);
3585 	for (reg = BPF_REG_6; reg <= BPF_REG_9; reg++) {
3586 		u8 adj = (reg - BPF_REG_0) * 2;
3587 		u8 idx = (reg - BPF_REG_6) * 2;
3588 
3589 		/* The first slot in the stack frame holds the return address,
3590 		 * start popping just after that.
3591 		 */
3592 		wrp_mov(nfp_prog, reg_both(adj), reg_lm(0, 1 + idx));
3593 
3594 		if (reg == BPF_REG_8)
3595 			/* Prepare to jump back, last 3 insns use defer slots */
3596 			emit_rtn(nfp_prog, ret_reg(nfp_prog), 3);
3597 
3598 		wrp_mov(nfp_prog, reg_both(adj + 1), reg_lm(0, 1 + idx + 1));
3599 	}
3600 }
3601 
3602 static void nfp_outro(struct nfp_prog *nfp_prog)
3603 {
3604 	switch (nfp_prog->type) {
3605 	case BPF_PROG_TYPE_SCHED_CLS:
3606 		nfp_outro_tc_da(nfp_prog);
3607 		break;
3608 	case BPF_PROG_TYPE_XDP:
3609 		nfp_outro_xdp(nfp_prog);
3610 		break;
3611 	default:
3612 		WARN_ON(1);
3613 	}
3614 
3615 	if (nfp_prog->subprog_cnt == 1)
3616 		return;
3617 
3618 	nfp_push_callee_registers(nfp_prog);
3619 	nfp_pop_callee_registers(nfp_prog);
3620 }
3621 
3622 static int nfp_translate(struct nfp_prog *nfp_prog)
3623 {
3624 	struct nfp_insn_meta *meta;
3625 	unsigned int depth;
3626 	int err;
3627 
3628 	depth = nfp_prog->subprog[0].stack_depth;
3629 	nfp_prog->stack_frame_depth = round_up(depth, 4);
3630 
3631 	nfp_intro(nfp_prog);
3632 	if (nfp_prog->error)
3633 		return nfp_prog->error;
3634 
3635 	list_for_each_entry(meta, &nfp_prog->insns, l) {
3636 		instr_cb_t cb = instr_cb[meta->insn.code];
3637 
3638 		meta->off = nfp_prog_current_offset(nfp_prog);
3639 
3640 		if (nfp_is_subprog_start(meta)) {
3641 			nfp_start_subprog(nfp_prog, meta);
3642 			if (nfp_prog->error)
3643 				return nfp_prog->error;
3644 		}
3645 
3646 		if (meta->skip) {
3647 			nfp_prog->n_translated++;
3648 			continue;
3649 		}
3650 
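		/* An instruction spanning two BPF slots (typically the 64-bit
		 * immediate load) registers a double_cb on its first half;
		 * translate the second half with that callback instead of the
		 * opcode table entry.
		 */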
3651 		if (nfp_meta_has_prev(nfp_prog, meta) &&
3652 		    nfp_meta_prev(meta)->double_cb)
3653 			cb = nfp_meta_prev(meta)->double_cb;
3654 		if (!cb)
3655 			return -ENOENT;
3656 		err = cb(nfp_prog, meta);
3657 		if (err)
3658 			return err;
3659 		if (nfp_prog->error)
3660 			return nfp_prog->error;
3661 
3662 		nfp_prog->n_translated++;
3663 	}
3664 
3665 	nfp_prog->last_bpf_off = nfp_prog_current_offset(nfp_prog) - 1;
3666 
3667 	nfp_outro(nfp_prog);
3668 	if (nfp_prog->error)
3669 		return nfp_prog->error;
3670 
3671 	wrp_nops(nfp_prog, NFP_USTORE_PREFETCH_WINDOW);
3672 	if (nfp_prog->error)
3673 		return nfp_prog->error;
3674 
3675 	return nfp_fixup_branches(nfp_prog);
3676 }
3677 
3678 /* --- Optimizations --- */
3679 static void nfp_bpf_opt_reg_init(struct nfp_prog *nfp_prog)
3680 {
3681 	struct nfp_insn_meta *meta;
3682 
3683 	list_for_each_entry(meta, &nfp_prog->insns, l) {
3684 		struct bpf_insn insn = meta->insn;
3685 
3686 		/* Programs converted from cBPF start with register xoring */
3687 		if (insn.code == (BPF_ALU64 | BPF_XOR | BPF_X) &&
3688 		    insn.src_reg == insn.dst_reg)
3689 			continue;
3690 
3691 		/* Programs start with R6 = R1 but we ignore the skb pointer */
3692 		if (insn.code == (BPF_ALU64 | BPF_MOV | BPF_X) &&
3693 		    insn.src_reg == 1 && insn.dst_reg == 6)
3694 			meta->skip = true;
3695 
3696 		/* Return as soon as something doesn't match */
3697 		if (!meta->skip)
3698 			return;
3699 	}
3700 }
3701 
3702 /* abs(insn.imm) will fit better into unrestricted reg immediate -
3703  * convert add/sub of a negative number into a sub/add of a positive one.
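 *
 * For example (illustrative):
 *
 *   r1 += -16   becomes   r1 -= 16
 *   r2 -= -8    becomes   r2 += 8
 *
 * Conditional jumps on an immediate (JGE/JSGE/JLT/JSLT) keep their opcode;
 * only the immediate is negated and jump_neg_op is recorded for code gen.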
3704  */
3705 static void nfp_bpf_opt_neg_add_sub(struct nfp_prog *nfp_prog)
3706 {
3707 	struct nfp_insn_meta *meta;
3708 
3709 	list_for_each_entry(meta, &nfp_prog->insns, l) {
3710 		struct bpf_insn insn = meta->insn;
3711 
3712 		if (meta->skip)
3713 			continue;
3714 
3715 		if (BPF_CLASS(insn.code) != BPF_ALU &&
3716 		    BPF_CLASS(insn.code) != BPF_ALU64 &&
3717 		    BPF_CLASS(insn.code) != BPF_JMP)
3718 			continue;
3719 		if (BPF_SRC(insn.code) != BPF_K)
3720 			continue;
3721 		if (insn.imm >= 0)
3722 			continue;
3723 
3724 		if (BPF_CLASS(insn.code) == BPF_JMP) {
3725 			switch (BPF_OP(insn.code)) {
3726 			case BPF_JGE:
3727 			case BPF_JSGE:
3728 			case BPF_JLT:
3729 			case BPF_JSLT:
3730 				meta->jump_neg_op = true;
3731 				break;
3732 			default:
3733 				continue;
3734 			}
3735 		} else {
3736 			if (BPF_OP(insn.code) == BPF_ADD)
3737 				insn.code = BPF_CLASS(insn.code) | BPF_SUB;
3738 			else if (BPF_OP(insn.code) == BPF_SUB)
3739 				insn.code = BPF_CLASS(insn.code) | BPF_ADD;
3740 			else
3741 				continue;
3742 
3743 			meta->insn.code = insn.code | BPF_K;
3744 		}
3745 
3746 		meta->insn.imm = -insn.imm;
3747 	}
3748 }
3749 
3750 /* Remove masking after load since our load guarantees this is not needed */
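/* A sketch of the pattern targeted below (register numbers illustrative):
 *
 *   r0 = *(u8 *)skb[off]     classic packet load (BPF_ABS/BPF_IND)
 *   r0 &= 0xff               redundant mask, skipped by this pass
 */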
3751 static void nfp_bpf_opt_ld_mask(struct nfp_prog *nfp_prog)
3752 {
3753 	struct nfp_insn_meta *meta1, *meta2;
3754 	const s32 exp_mask[] = {
3755 		[BPF_B] = 0x000000ffU,
3756 		[BPF_H] = 0x0000ffffU,
3757 		[BPF_W] = 0xffffffffU,
3758 	};
3759 
3760 	nfp_for_each_insn_walk2(nfp_prog, meta1, meta2) {
3761 		struct bpf_insn insn, next;
3762 
3763 		insn = meta1->insn;
3764 		next = meta2->insn;
3765 
3766 		if (BPF_CLASS(insn.code) != BPF_LD)
3767 			continue;
3768 		if (BPF_MODE(insn.code) != BPF_ABS &&
3769 		    BPF_MODE(insn.code) != BPF_IND)
3770 			continue;
3771 
3772 		if (next.code != (BPF_ALU64 | BPF_AND | BPF_K))
3773 			continue;
3774 
3775 		if (!exp_mask[BPF_SIZE(insn.code)])
3776 			continue;
3777 		if (exp_mask[BPF_SIZE(insn.code)] != next.imm)
3778 			continue;
3779 
3780 		if (next.src_reg || next.dst_reg)
3781 			continue;
3782 
3783 		if (meta2->flags & FLAG_INSN_IS_JUMP_DST)
3784 			continue;
3785 
3786 		meta2->skip = true;
3787 	}
3788 }
3789 
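/* Remove a pair of shifts by 32 (<< 32 then >> 32, or the reverse order)
 * following a 32-bit classic packet load, analogous to the masking case
 * above: the NFP load sequence makes the pair unnecessary. A sketch of the
 * pattern (register numbers illustrative):
 *
 *   r0 = *(u32 *)skb[off]
 *   r0 <<= 32
 *   r0 >>= 32
 */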
3790 static void nfp_bpf_opt_ld_shift(struct nfp_prog *nfp_prog)
3791 {
3792 	struct nfp_insn_meta *meta1, *meta2, *meta3;
3793 
3794 	nfp_for_each_insn_walk3(nfp_prog, meta1, meta2, meta3) {
3795 		struct bpf_insn insn, next1, next2;
3796 
3797 		insn = meta1->insn;
3798 		next1 = meta2->insn;
3799 		next2 = meta3->insn;
3800 
3801 		if (BPF_CLASS(insn.code) != BPF_LD)
3802 			continue;
3803 		if (BPF_MODE(insn.code) != BPF_ABS &&
3804 		    BPF_MODE(insn.code) != BPF_IND)
3805 			continue;
3806 		if (BPF_SIZE(insn.code) != BPF_W)
3807 			continue;
3808 
3809 		if (!(next1.code == (BPF_LSH | BPF_K | BPF_ALU64) &&
3810 		      next2.code == (BPF_RSH | BPF_K | BPF_ALU64)) &&
3811 		    !(next1.code == (BPF_RSH | BPF_K | BPF_ALU64) &&
3812 		      next2.code == (BPF_LSH | BPF_K | BPF_ALU64)))
3813 			continue;
3814 
3815 		if (next1.src_reg || next1.dst_reg ||
3816 		    next2.src_reg || next2.dst_reg)
3817 			continue;
3818 
3819 		if (next1.imm != 0x20 || next2.imm != 0x20)
3820 			continue;
3821 
3822 		if (meta2->flags & FLAG_INSN_IS_JUMP_DST ||
3823 		    meta3->flags & FLAG_INSN_IS_JUMP_DST)
3824 			continue;
3825 
3826 		meta2->skip = true;
3827 		meta3->skip = true;
3828 	}
3829 }
3830 
3831 /* A load/store pair that forms a memory copy should look like the following:
3832  *
3833  *   ld_width R, [addr_src + offset_src]
3834  *   st_width [addr_dest + offset_dest], R
3835  *
3836  * The destination register of the load and the source register of the
3837  * store must be the same, and they must operate at the same width.
3838  * If either addr_src or addr_dest is the stack pointer, we skip the CPP
3839  * optimization, as the stack is modelled by registers on the NFP.
3840  */
3841 static bool
3842 curr_pair_is_memcpy(struct nfp_insn_meta *ld_meta,
3843 		    struct nfp_insn_meta *st_meta)
3844 {
3845 	struct bpf_insn *ld = &ld_meta->insn;
3846 	struct bpf_insn *st = &st_meta->insn;
3847 
3848 	if (!is_mbpf_load(ld_meta) || !is_mbpf_store(st_meta))
3849 		return false;
3850 
3851 	if (ld_meta->ptr.type != PTR_TO_PACKET &&
3852 	    ld_meta->ptr.type != PTR_TO_MAP_VALUE)
3853 		return false;
3854 
3855 	if (st_meta->ptr.type != PTR_TO_PACKET)
3856 		return false;
3857 
3858 	if (BPF_SIZE(ld->code) != BPF_SIZE(st->code))
3859 		return false;
3860 
3861 	if (ld->dst_reg != st->src_reg)
3862 		return false;
3863 
3864 	/* There is a jump to the store insn in this pair. */
3865 	if (st_meta->flags & FLAG_INSN_IS_JUMP_DST)
3866 		return false;
3867 
3868 	return true;
3869 }
3870 
3871 /* Currently, we only support chaining load/store pairs if:
3872  *
3873  *  - Their address base registers are the same.
3874  *  - Their address offsets are in the same order.
3875  *  - They operate at the same memory width.
3876  *  - There is no jump into the middle of them.
3877  */
3878 static bool
3879 curr_pair_chain_with_previous(struct nfp_insn_meta *ld_meta,
3880 			      struct nfp_insn_meta *st_meta,
3881 			      struct bpf_insn *prev_ld,
3882 			      struct bpf_insn *prev_st)
3883 {
3884 	u8 prev_size, curr_size, prev_ld_base, prev_st_base, prev_ld_dst;
3885 	struct bpf_insn *ld = &ld_meta->insn;
3886 	struct bpf_insn *st = &st_meta->insn;
3887 	s16 prev_ld_off, prev_st_off;
3888 
3889 	/* This is the first pair, there is nothing to chain with yet. */
3890 	if (!prev_ld)
3891 		return true;
3892 
3893 	prev_size = BPF_LDST_BYTES(prev_ld);
3894 	curr_size = BPF_LDST_BYTES(ld);
3895 	prev_ld_base = prev_ld->src_reg;
3896 	prev_st_base = prev_st->dst_reg;
3897 	prev_ld_dst = prev_ld->dst_reg;
3898 	prev_ld_off = prev_ld->off;
3899 	prev_st_off = prev_st->off;
3900 
3901 	if (ld->dst_reg != prev_ld_dst)
3902 		return false;
3903 
3904 	if (ld->src_reg != prev_ld_base || st->dst_reg != prev_st_base)
3905 		return false;
3906 
3907 	if (curr_size != prev_size)
3908 		return false;
3909 
3910 	/* There is a jump to the head of this pair. */
3911 	if (ld_meta->flags & FLAG_INSN_IS_JUMP_DST)
3912 		return false;
3913 
3914 	/* Both in ascending order. */
3915 	if (prev_ld_off + prev_size == ld->off &&
3916 	    prev_st_off + prev_size == st->off)
3917 		return true;
3918 
3919 	/* Both in descending order. */
3920 	if (ld->off + curr_size == prev_ld_off &&
3921 	    st->off + curr_size == prev_st_off)
3922 		return true;
3923 
3924 	return false;
3925 }
3926 
3927 /* Return true if a cross memory access happens, i.e. the store area
3928  * overlaps the load area such that a later load might read back a value
3929  * written by a previous store. In that case we can't treat the sequence
3930  * as a memory copy.
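 *
 * Example with illustrative offsets: with the head load at off 0, the head
 * store at off 4, and a later load at off 4, that load would read back what
 * the first store wrote, so the sequence is not a pure copy.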
3931  */
3932 static bool
3933 cross_mem_access(struct bpf_insn *ld, struct nfp_insn_meta *head_ld_meta,
3934 		 struct nfp_insn_meta *head_st_meta)
3935 {
3936 	s16 head_ld_off, head_st_off, ld_off;
3937 
3938 	/* Different pointer types do not overlap. */
3939 	if (head_ld_meta->ptr.type != head_st_meta->ptr.type)
3940 		return false;
3941 
3942 	/* Load and store are both PTR_TO_PACKET, check ID info. */
3943 	if (head_ld_meta->ptr.id != head_st_meta->ptr.id)
3944 		return true;
3945 
3946 	/* Canonicalize the offsets: express them all relative to the
3947 	 * original base register.
3948 	 */
3949 	head_ld_off = head_ld_meta->insn.off + head_ld_meta->ptr.off;
3950 	head_st_off = head_st_meta->insn.off + head_st_meta->ptr.off;
3951 	ld_off = ld->off + head_ld_meta->ptr.off;
3952 
3953 	/* Ascending order cross. */
3954 	if (ld_off > head_ld_off &&
3955 	    head_ld_off < head_st_off && ld_off >= head_st_off)
3956 		return true;
3957 
3958 	/* Descending order cross. */
3959 	if (ld_off < head_ld_off &&
3960 	    head_ld_off > head_st_off && ld_off <= head_st_off)
3961 		return true;
3962 
3963 	return false;
3964 }
3965 
3966 /* This pass tries to identify the following instruction sequences.
3967  *
3968  *   load R, [regA + offA]
3969  *   store [regB + offB], R
3970  *   load R, [regA + offA + const_imm_A]
3971  *   store [regB + offB + const_imm_A], R
3972  *   load R, [regA + offA + 2 * const_imm_A]
3973  *   store [regB + offB + 2 * const_imm_A], R
3974  *   ...
3975  *
3976  * The above sequence is typically generated by the compiler when lowering
3977  * memcpy. The NFP prefers using CPP instructions to accelerate it.
3978  */
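/* When such a chain is found, the head load records the total copy length in
 * ldst_gather_len (negated for a descending copy, with the head offsets
 * rewritten to the lowest address), remembers its paired store, and the
 * remaining loads/stores of the chain are skipped.
 */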
3979 static void nfp_bpf_opt_ldst_gather(struct nfp_prog *nfp_prog)
3980 {
3981 	struct nfp_insn_meta *head_ld_meta = NULL;
3982 	struct nfp_insn_meta *head_st_meta = NULL;
3983 	struct nfp_insn_meta *meta1, *meta2;
3984 	struct bpf_insn *prev_ld = NULL;
3985 	struct bpf_insn *prev_st = NULL;
3986 	u8 count = 0;
3987 
3988 	nfp_for_each_insn_walk2(nfp_prog, meta1, meta2) {
3989 		struct bpf_insn *ld = &meta1->insn;
3990 		struct bpf_insn *st = &meta2->insn;
3991 
3992 		/* Reset the record status if any of the following is true:
3993 		 *   - The current insn pair is not a load/store pair.
3994 		 *   - The load/store pair doesn't chain with the previous one.
3995 		 *   - The chained load/store pair crosses the previous pair.
3996 		 *   - The chained load/store pair has a total copy size beyond
3997 		 *     128 bytes, which is the maximum length a single NFP CPP
3998 		 *     command can transfer.
3999 		 */
4000 		if (!curr_pair_is_memcpy(meta1, meta2) ||
4001 		    !curr_pair_chain_with_previous(meta1, meta2, prev_ld,
4002 						   prev_st) ||
4003 		    (head_ld_meta && (cross_mem_access(ld, head_ld_meta,
4004 						       head_st_meta) ||
4005 				      head_ld_meta->ldst_gather_len >= 128))) {
4006 			if (!count)
4007 				continue;
4008 
4009 			if (count > 1) {
4010 				s16 prev_ld_off = prev_ld->off;
4011 				s16 prev_st_off = prev_st->off;
4012 				s16 head_ld_off = head_ld_meta->insn.off;
4013 
4014 				if (prev_ld_off < head_ld_off) {
4015 					head_ld_meta->insn.off = prev_ld_off;
4016 					head_st_meta->insn.off = prev_st_off;
4017 					head_ld_meta->ldst_gather_len =
4018 						-head_ld_meta->ldst_gather_len;
4019 				}
4020 
4021 				head_ld_meta->paired_st = &head_st_meta->insn;
4022 				head_st_meta->skip = true;
4023 			} else {
4024 				head_ld_meta->ldst_gather_len = 0;
4025 			}
4026 
4027 			/* If the chain is ended by a load/store pair then this
4028 			 * could serve as the new head of the next chain.
4029 			 */
4030 			if (curr_pair_is_memcpy(meta1, meta2)) {
4031 				head_ld_meta = meta1;
4032 				head_st_meta = meta2;
4033 				head_ld_meta->ldst_gather_len =
4034 					BPF_LDST_BYTES(ld);
4035 				meta1 = nfp_meta_next(meta1);
4036 				meta2 = nfp_meta_next(meta2);
4037 				prev_ld = ld;
4038 				prev_st = st;
4039 				count = 1;
4040 			} else {
4041 				head_ld_meta = NULL;
4042 				head_st_meta = NULL;
4043 				prev_ld = NULL;
4044 				prev_st = NULL;
4045 				count = 0;
4046 			}
4047 
4048 			continue;
4049 		}
4050 
4051 		if (!head_ld_meta) {
4052 			head_ld_meta = meta1;
4053 			head_st_meta = meta2;
4054 		} else {
4055 			meta1->skip = true;
4056 			meta2->skip = true;
4057 		}
4058 
4059 		head_ld_meta->ldst_gather_len += BPF_LDST_BYTES(ld);
4060 		meta1 = nfp_meta_next(meta1);
4061 		meta2 = nfp_meta_next(meta2);
4062 		prev_ld = ld;
4063 		prev_st = st;
4064 		count++;
4065 	}
4066 }
4067 
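/* Group consecutive packet loads sharing the same pointer ID and constant
 * pointer offset into ranges of at most 64 bytes. The first load of each
 * range is marked with pkt_cache.do_init and carries the cached
 * [range_start, range_end) window; the later loads of the range reuse it.
 * Jump destinations, packet stores, helper calls and classic loads/stores
 * invalidate the current window, so a fresh range is started afterwards.
 */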
4068 static void nfp_bpf_opt_pkt_cache(struct nfp_prog *nfp_prog)
4069 {
4070 	struct nfp_insn_meta *meta, *range_node = NULL;
4071 	s16 range_start = 0, range_end = 0;
4072 	bool cache_avail = false;
4073 	struct bpf_insn *insn;
4074 	s32 range_ptr_off = 0;
4075 	u32 range_ptr_id = 0;
4076 
4077 	list_for_each_entry(meta, &nfp_prog->insns, l) {
4078 		if (meta->flags & FLAG_INSN_IS_JUMP_DST)
4079 			cache_avail = false;
4080 
4081 		if (meta->skip)
4082 			continue;
4083 
4084 		insn = &meta->insn;
4085 
4086 		if (is_mbpf_store_pkt(meta) ||
4087 		    insn->code == (BPF_JMP | BPF_CALL) ||
4088 		    is_mbpf_classic_store_pkt(meta) ||
4089 		    is_mbpf_classic_load(meta)) {
4090 			cache_avail = false;
4091 			continue;
4092 		}
4093 
4094 		if (!is_mbpf_load(meta))
4095 			continue;
4096 
4097 		if (meta->ptr.type != PTR_TO_PACKET || meta->ldst_gather_len) {
4098 			cache_avail = false;
4099 			continue;
4100 		}
4101 
4102 		if (!cache_avail) {
4103 			cache_avail = true;
4104 			if (range_node)
4105 				goto end_current_then_start_new;
4106 			goto start_new;
4107 		}
4108 
4109 		/* Check ID to make sure two reads share the same
4110 		 * variable offset against PTR_TO_PACKET, and check OFF
4111 		 * to make sure they also share the same constant
4112 		 * offset.
4113 		 *
4114 		 * Strictly speaking the OFFs don't need to be identical:
4115 		 * since they are constant offsets against PTR_TO_PACKET,
4116 		 * different OFFs could be canonicalized into offsets against
4117 		 * the original packet pointer. We don't support that at the
4118 		 * moment.
4119 		 */
4120 		if (meta->ptr.id == range_ptr_id &&
4121 		    meta->ptr.off == range_ptr_off) {
4122 			s16 new_start = range_start;
4123 			s16 end, off = insn->off;
4124 			s16 new_end = range_end;
4125 			bool changed = false;
4126 
4127 			if (off < range_start) {
4128 				new_start = off;
4129 				changed = true;
4130 			}
4131 
4132 			end = off + BPF_LDST_BYTES(insn);
4133 			if (end > range_end) {
4134 				new_end = end;
4135 				changed = true;
4136 			}
4137 
4138 			if (!changed)
4139 				continue;
4140 
4141 			if (new_end - new_start <= 64) {
4142 				/* Install new range. */
4143 				range_start = new_start;
4144 				range_end = new_end;
4145 				continue;
4146 			}
4147 		}
4148 
4149 end_current_then_start_new:
4150 		range_node->pkt_cache.range_start = range_start;
4151 		range_node->pkt_cache.range_end = range_end;
4152 start_new:
4153 		range_node = meta;
4154 		range_node->pkt_cache.do_init = true;
4155 		range_ptr_id = range_node->ptr.id;
4156 		range_ptr_off = range_node->ptr.off;
4157 		range_start = insn->off;
4158 		range_end = insn->off + BPF_LDST_BYTES(insn);
4159 	}
4160 
4161 	if (range_node) {
4162 		range_node->pkt_cache.range_start = range_start;
4163 		range_node->pkt_cache.range_end = range_end;
4164 	}
4165 
4166 	list_for_each_entry(meta, &nfp_prog->insns, l) {
4167 		if (meta->skip)
4168 			continue;
4169 
4170 		if (is_mbpf_load_pkt(meta) && !meta->ldst_gather_len) {
4171 			if (meta->pkt_cache.do_init) {
4172 				range_start = meta->pkt_cache.range_start;
4173 				range_end = meta->pkt_cache.range_end;
4174 			} else {
4175 				meta->pkt_cache.range_start = range_start;
4176 				meta->pkt_cache.range_end = range_end;
4177 			}
4178 		}
4179 	}
4180 }
4181 
4182 static int nfp_bpf_optimize(struct nfp_prog *nfp_prog)
4183 {
4184 	nfp_bpf_opt_reg_init(nfp_prog);
4185 
4186 	nfp_bpf_opt_neg_add_sub(nfp_prog);
4187 	nfp_bpf_opt_ld_mask(nfp_prog);
4188 	nfp_bpf_opt_ld_shift(nfp_prog);
4189 	nfp_bpf_opt_ldst_gather(nfp_prog);
4190 	nfp_bpf_opt_pkt_cache(nfp_prog);
4191 
4192 	return 0;
4193 }
4194 
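/* Rewrite each 64-bit map pointer load (BPF_PSEUDO_MAP_FD) so that its lower
 * 32 bits carry the map identifier used at run time (map->id for
 * offload-neutral maps, otherwise the NFP map's tid) and its upper 32 bits
 * are zero.
 */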
4195 static int nfp_bpf_replace_map_ptrs(struct nfp_prog *nfp_prog)
4196 {
4197 	struct nfp_insn_meta *meta1, *meta2;
4198 	struct nfp_bpf_map *nfp_map;
4199 	struct bpf_map *map;
4200 	u32 id;
4201 
4202 	nfp_for_each_insn_walk2(nfp_prog, meta1, meta2) {
4203 		if (meta1->skip || meta2->skip)
4204 			continue;
4205 
4206 		if (meta1->insn.code != (BPF_LD | BPF_IMM | BPF_DW) ||
4207 		    meta1->insn.src_reg != BPF_PSEUDO_MAP_FD)
4208 			continue;
4209 
4210 		map = (void *)(unsigned long)((u32)meta1->insn.imm |
4211 					      (u64)meta2->insn.imm << 32);
4212 		if (bpf_map_offload_neutral(map)) {
4213 			id = map->id;
4214 		} else {
4215 			nfp_map = map_to_offmap(map)->dev_priv;
4216 			id = nfp_map->tid;
4217 		}
4218 
4219 		meta1->insn.imm = id;
4220 		meta2->insn.imm = 0;
4221 	}
4222 
4223 	return 0;
4224 }
4225 
4226 static int nfp_bpf_ustore_calc(u64 *prog, unsigned int len)
4227 {
4228 	__le64 *ustore = (__force __le64 *)prog;
4229 	int i;
4230 
4231 	for (i = 0; i < len; i++) {
4232 		int err;
4233 
4234 		err = nfp_ustore_check_valid_no_ecc(prog[i]);
4235 		if (err)
4236 			return err;
4237 
4238 		ustore[i] = cpu_to_le64(nfp_ustore_calc_ecc_insn(prog[i]));
4239 	}
4240 
4241 	return 0;
4242 }
4243 
4244 static void nfp_bpf_prog_trim(struct nfp_prog *nfp_prog)
4245 {
4246 	void *prog;
4247 
4248 	prog = kvmalloc_array(nfp_prog->prog_len, sizeof(u64), GFP_KERNEL);
4249 	if (!prog)
4250 		return;
4251 
4252 	nfp_prog->__prog_alloc_len = nfp_prog->prog_len * sizeof(u64);
4253 	memcpy(prog, nfp_prog->prog, nfp_prog->__prog_alloc_len);
4254 	kvfree(nfp_prog->prog);
4255 	nfp_prog->prog = prog;
4256 }
4257 
4258 int nfp_bpf_jit(struct nfp_prog *nfp_prog)
4259 {
4260 	int ret;
4261 
4262 	ret = nfp_bpf_replace_map_ptrs(nfp_prog);
4263 	if (ret)
4264 		return ret;
4265 
4266 	ret = nfp_bpf_optimize(nfp_prog);
4267 	if (ret)
4268 		return ret;
4269 
4270 	ret = nfp_translate(nfp_prog);
4271 	if (ret) {
4272 		pr_err("Translation failed with error %d (translated: %u)\n",
4273 		       ret, nfp_prog->n_translated);
4274 		return -EINVAL;
4275 	}
4276 
4277 	nfp_bpf_prog_trim(nfp_prog);
4278 
4279 	return ret;
4280 }
4281 
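/* Resolve the destination of every jump and BPF-to-BPF (pseudo) call, mark
 * it as a jump destination (and, for pseudo calls, as a subprogram start),
 * and remember it in meta->jmp_dst for the branch fixup pass.
 */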
4282 void nfp_bpf_jit_prepare(struct nfp_prog *nfp_prog, unsigned int cnt)
4283 {
4284 	struct nfp_insn_meta *meta;
4285 
4286 	/* Another pass to record jump information. */
4287 	list_for_each_entry(meta, &nfp_prog->insns, l) {
4288 		struct nfp_insn_meta *dst_meta;
4289 		u64 code = meta->insn.code;
4290 		unsigned int dst_idx;
4291 		bool pseudo_call;
4292 
4293 		if (BPF_CLASS(code) != BPF_JMP)
4294 			continue;
4295 		if (BPF_OP(code) == BPF_EXIT)
4296 			continue;
4297 		if (is_mbpf_helper_call(meta))
4298 			continue;
4299 
4300 		/* If the opcode is BPF_CALL at this point, this can only be a
4301 		 * BPF-to-BPF call (a.k.a. a pseudo call).
4302 		 */
4303 		pseudo_call = BPF_OP(code) == BPF_CALL;
4304 
4305 		if (pseudo_call)
4306 			dst_idx = meta->n + 1 + meta->insn.imm;
4307 		else
4308 			dst_idx = meta->n + 1 + meta->insn.off;
4309 
4310 		dst_meta = nfp_bpf_goto_meta(nfp_prog, meta, dst_idx, cnt);
4311 
4312 		if (pseudo_call)
4313 			dst_meta->flags |= FLAG_INSN_IS_SUBPROG_START;
4314 
4315 		dst_meta->flags |= FLAG_INSN_IS_JUMP_DST;
4316 		meta->jmp_dst = dst_meta;
4317 	}
4318 }
4319 
4320 bool nfp_bpf_supported_opcode(u8 code)
4321 {
4322 	return !!instr_cb[code];
4323 }
4324 
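/* Produce a per-vNIC copy of the program with the relocation types recorded
 * during code generation resolved: relative branches and immediates are
 * rebased to the vNIC's start offset, abort/exit/subroutine branches get
 * their final targets, helper calls are pointed at the firmware helper
 * addresses, and the result is ECC-annotated for the ustore.
 */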
4325 void *nfp_bpf_relo_for_vnic(struct nfp_prog *nfp_prog, struct nfp_bpf_vnic *bv)
4326 {
4327 	unsigned int i;
4328 	u64 *prog;
4329 	int err;
4330 
4331 	prog = kmemdup(nfp_prog->prog, nfp_prog->prog_len * sizeof(u64),
4332 		       GFP_KERNEL);
4333 	if (!prog)
4334 		return ERR_PTR(-ENOMEM);
4335 
4336 	for (i = 0; i < nfp_prog->prog_len; i++) {
4337 		enum nfp_relo_type special;
4338 		u32 val;
4339 		u16 off;
4340 
4341 		special = FIELD_GET(OP_RELO_TYPE, prog[i]);
4342 		switch (special) {
4343 		case RELO_NONE:
4344 			continue;
4345 		case RELO_BR_REL:
4346 			br_add_offset(&prog[i], bv->start_off);
4347 			break;
4348 		case RELO_BR_GO_OUT:
4349 			br_set_offset(&prog[i],
4350 				      nfp_prog->tgt_out + bv->start_off);
4351 			break;
4352 		case RELO_BR_GO_ABORT:
4353 			br_set_offset(&prog[i],
4354 				      nfp_prog->tgt_abort + bv->start_off);
4355 			break;
4356 		case RELO_BR_GO_CALL_PUSH_REGS:
4357 			off = nfp_prog->tgt_call_push_regs + bv->start_off;
4358 			br_set_offset(&prog[i], off);
4359 			break;
4360 		case RELO_BR_GO_CALL_POP_REGS:
4361 			off = nfp_prog->tgt_call_pop_regs + bv->start_off;
4362 			br_set_offset(&prog[i], off);
4363 			break;
4364 		case RELO_BR_NEXT_PKT:
4365 			br_set_offset(&prog[i], bv->tgt_done);
4366 			break;
4367 		case RELO_BR_HELPER:
4368 			val = br_get_offset(prog[i]);
4369 			val -= BR_OFF_RELO;
4370 			switch (val) {
4371 			case BPF_FUNC_map_lookup_elem:
4372 				val = nfp_prog->bpf->helpers.map_lookup;
4373 				break;
4374 			case BPF_FUNC_map_update_elem:
4375 				val = nfp_prog->bpf->helpers.map_update;
4376 				break;
4377 			case BPF_FUNC_map_delete_elem:
4378 				val = nfp_prog->bpf->helpers.map_delete;
4379 				break;
4380 			case BPF_FUNC_perf_event_output:
4381 				val = nfp_prog->bpf->helpers.perf_event_output;
4382 				break;
4383 			default:
4384 				pr_err("relocation of unknown helper %d\n",
4385 				       val);
4386 				err = -EINVAL;
4387 				goto err_free_prog;
4388 			}
4389 			br_set_offset(&prog[i], val);
4390 			break;
4391 		case RELO_IMMED_REL:
4392 			immed_add_value(&prog[i], bv->start_off);
4393 			break;
4394 		}
4395 
4396 		prog[i] &= ~OP_RELO_TYPE;
4397 	}
4398 
4399 	err = nfp_bpf_ustore_calc(prog, nfp_prog->prog_len);
4400 	if (err)
4401 		goto err_free_prog;
4402 
4403 	return prog;
4404 
4405 err_free_prog:
4406 	kfree(prog);
4407 	return ERR_PTR(err);
4408 }
4409