1 /*
2  * Copyright (C) 2016-2018 Netronome Systems, Inc.
3  *
 * This software is dual licensed under the GNU General Public License
 * Version 2, June 1991 as shown in the file COPYING in the top-level
 * directory of this
5  * June 1991 as shown in the file COPYING in the top-level directory of this
6  * source tree or the BSD 2-Clause License provided below.  You have the
7  * option to license this software under the complete terms of either license.
8  *
9  * The BSD 2-Clause License:
10  *
11  *     Redistribution and use in source and binary forms, with or
12  *     without modification, are permitted provided that the following
13  *     conditions are met:
14  *
15  *      1. Redistributions of source code must retain the above
16  *         copyright notice, this list of conditions and the following
17  *         disclaimer.
18  *
19  *      2. Redistributions in binary form must reproduce the above
20  *         copyright notice, this list of conditions and the following
21  *         disclaimer in the documentation and/or other materials
22  *         provided with the distribution.
23  *
24  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
27  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
28  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
29  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
30  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
31  * SOFTWARE.
32  */
33 
34 #define pr_fmt(fmt)	"NFP net bpf: " fmt
35 
36 #include <linux/bug.h>
37 #include <linux/bpf.h>
38 #include <linux/filter.h>
39 #include <linux/kernel.h>
40 #include <linux/pkt_cls.h>
41 #include <linux/reciprocal_div.h>
42 #include <linux/unistd.h>
43 
44 #include "main.h"
45 #include "../nfp_asm.h"
46 #include "../nfp_net_ctrl.h"
47 
48 /* --- NFP prog --- */
/* The for-each "multiple entries" macros provide pos and next<n> pointers.
50  * It's safe to modify the next pointers (but not pos).
51  */
52 #define nfp_for_each_insn_walk2(nfp_prog, pos, next)			\
53 	for (pos = list_first_entry(&(nfp_prog)->insns, typeof(*pos), l), \
54 	     next = list_next_entry(pos, l);			\
55 	     &(nfp_prog)->insns != &pos->l &&			\
56 	     &(nfp_prog)->insns != &next->l;			\
57 	     pos = nfp_meta_next(pos),				\
58 	     next = nfp_meta_next(pos))
59 
60 #define nfp_for_each_insn_walk3(nfp_prog, pos, next, next2)		\
61 	for (pos = list_first_entry(&(nfp_prog)->insns, typeof(*pos), l), \
62 	     next = list_next_entry(pos, l),			\
63 	     next2 = list_next_entry(next, l);			\
64 	     &(nfp_prog)->insns != &pos->l &&			\
65 	     &(nfp_prog)->insns != &next->l &&			\
66 	     &(nfp_prog)->insns != &next2->l;			\
67 	     pos = nfp_meta_next(pos),				\
68 	     next = nfp_meta_next(pos),				\
69 	     next2 = nfp_meta_next(next))
70 
71 static bool
72 nfp_meta_has_prev(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
73 {
74 	return meta->l.prev != &nfp_prog->insns;
75 }
76 
77 static void nfp_prog_push(struct nfp_prog *nfp_prog, u64 insn)
78 {
79 	if (nfp_prog->__prog_alloc_len / sizeof(u64) == nfp_prog->prog_len) {
80 		pr_warn("instruction limit reached (%u NFP instructions)\n",
81 			nfp_prog->prog_len);
82 		nfp_prog->error = -ENOSPC;
83 		return;
84 	}
85 
86 	nfp_prog->prog[nfp_prog->prog_len] = insn;
87 	nfp_prog->prog_len++;
88 }
89 
90 static unsigned int nfp_prog_current_offset(struct nfp_prog *nfp_prog)
91 {
92 	return nfp_prog->prog_len;
93 }
94 
95 static bool
96 nfp_prog_confirm_current_offset(struct nfp_prog *nfp_prog, unsigned int off)
97 {
98 	/* If there is a recorded error we may have dropped instructions;
	 * that doesn't have to be due to a translator bug, and the translation
100 	 * will fail anyway, so just return OK.
101 	 */
102 	if (nfp_prog->error)
103 		return true;
104 	return !WARN_ON_ONCE(nfp_prog_current_offset(nfp_prog) != off);
105 }
106 
107 /* --- Emitters --- */
108 static void
109 __emit_cmd(struct nfp_prog *nfp_prog, enum cmd_tgt_map op,
110 	   u8 mode, u8 xfer, u8 areg, u8 breg, u8 size, enum cmd_ctx_swap ctx,
111 	   bool indir)
112 {
113 	u64 insn;
114 
115 	insn =	FIELD_PREP(OP_CMD_A_SRC, areg) |
116 		FIELD_PREP(OP_CMD_CTX, ctx) |
117 		FIELD_PREP(OP_CMD_B_SRC, breg) |
118 		FIELD_PREP(OP_CMD_TOKEN, cmd_tgt_act[op].token) |
119 		FIELD_PREP(OP_CMD_XFER, xfer) |
120 		FIELD_PREP(OP_CMD_CNT, size) |
121 		FIELD_PREP(OP_CMD_SIG, ctx != CMD_CTX_NO_SWAP) |
122 		FIELD_PREP(OP_CMD_TGT_CMD, cmd_tgt_act[op].tgt_cmd) |
123 		FIELD_PREP(OP_CMD_INDIR, indir) |
124 		FIELD_PREP(OP_CMD_MODE, mode);
125 
126 	nfp_prog_push(nfp_prog, insn);
127 }
128 
129 static void
130 emit_cmd_any(struct nfp_prog *nfp_prog, enum cmd_tgt_map op, u8 mode, u8 xfer,
131 	     swreg lreg, swreg rreg, u8 size, enum cmd_ctx_swap ctx, bool indir)
132 {
133 	struct nfp_insn_re_regs reg;
134 	int err;
135 
136 	err = swreg_to_restricted(reg_none(), lreg, rreg, &reg, false);
137 	if (err) {
138 		nfp_prog->error = err;
139 		return;
140 	}
141 	if (reg.swap) {
142 		pr_err("cmd can't swap arguments\n");
143 		nfp_prog->error = -EFAULT;
144 		return;
145 	}
146 	if (reg.dst_lmextn || reg.src_lmextn) {
147 		pr_err("cmd can't use LMextn\n");
148 		nfp_prog->error = -EFAULT;
149 		return;
150 	}
151 
152 	__emit_cmd(nfp_prog, op, mode, xfer, reg.areg, reg.breg, size, ctx,
153 		   indir);
154 }
155 
156 static void
157 emit_cmd(struct nfp_prog *nfp_prog, enum cmd_tgt_map op, u8 mode, u8 xfer,
158 	 swreg lreg, swreg rreg, u8 size, enum cmd_ctx_swap ctx)
159 {
160 	emit_cmd_any(nfp_prog, op, mode, xfer, lreg, rreg, size, ctx, false);
161 }
162 
163 static void
164 emit_cmd_indir(struct nfp_prog *nfp_prog, enum cmd_tgt_map op, u8 mode, u8 xfer,
165 	       swreg lreg, swreg rreg, u8 size, enum cmd_ctx_swap ctx)
166 {
167 	emit_cmd_any(nfp_prog, op, mode, xfer, lreg, rreg, size, ctx, true);
168 }
169 
170 static void
171 __emit_br(struct nfp_prog *nfp_prog, enum br_mask mask, enum br_ev_pip ev_pip,
172 	  enum br_ctx_signal_state css, u16 addr, u8 defer)
173 {
174 	u16 addr_lo, addr_hi;
175 	u64 insn;
176 
177 	addr_lo = addr & (OP_BR_ADDR_LO >> __bf_shf(OP_BR_ADDR_LO));
178 	addr_hi = addr != addr_lo;
179 
180 	insn = OP_BR_BASE |
181 		FIELD_PREP(OP_BR_MASK, mask) |
182 		FIELD_PREP(OP_BR_EV_PIP, ev_pip) |
183 		FIELD_PREP(OP_BR_CSS, css) |
184 		FIELD_PREP(OP_BR_DEFBR, defer) |
185 		FIELD_PREP(OP_BR_ADDR_LO, addr_lo) |
186 		FIELD_PREP(OP_BR_ADDR_HI, addr_hi);
187 
188 	nfp_prog_push(nfp_prog, insn);
189 }
190 
191 static void
192 emit_br_relo(struct nfp_prog *nfp_prog, enum br_mask mask, u16 addr, u8 defer,
193 	     enum nfp_relo_type relo)
194 {
195 	if (mask == BR_UNC && defer > 2) {
196 		pr_err("BUG: branch defer out of bounds %d\n", defer);
197 		nfp_prog->error = -EFAULT;
198 		return;
199 	}
200 
201 	__emit_br(nfp_prog, mask,
202 		  mask != BR_UNC ? BR_EV_PIP_COND : BR_EV_PIP_UNCOND,
203 		  BR_CSS_NONE, addr, defer);
204 
205 	nfp_prog->prog[nfp_prog->prog_len - 1] |=
206 		FIELD_PREP(OP_RELO_TYPE, relo);
207 }
208 
209 static void
210 emit_br(struct nfp_prog *nfp_prog, enum br_mask mask, u16 addr, u8 defer)
211 {
212 	emit_br_relo(nfp_prog, mask, addr, defer, RELO_BR_REL);
213 }
214 
215 static void
216 __emit_br_bit(struct nfp_prog *nfp_prog, u16 areg, u16 breg, u16 addr, u8 defer,
217 	      bool set, bool src_lmextn)
218 {
219 	u16 addr_lo, addr_hi;
220 	u64 insn;
221 
222 	addr_lo = addr & (OP_BR_BIT_ADDR_LO >> __bf_shf(OP_BR_BIT_ADDR_LO));
223 	addr_hi = addr != addr_lo;
224 
225 	insn = OP_BR_BIT_BASE |
226 		FIELD_PREP(OP_BR_BIT_A_SRC, areg) |
227 		FIELD_PREP(OP_BR_BIT_B_SRC, breg) |
228 		FIELD_PREP(OP_BR_BIT_BV, set) |
229 		FIELD_PREP(OP_BR_BIT_DEFBR, defer) |
230 		FIELD_PREP(OP_BR_BIT_ADDR_LO, addr_lo) |
231 		FIELD_PREP(OP_BR_BIT_ADDR_HI, addr_hi) |
232 		FIELD_PREP(OP_BR_BIT_SRC_LMEXTN, src_lmextn);
233 
234 	nfp_prog_push(nfp_prog, insn);
235 }
236 
237 static void
238 emit_br_bit_relo(struct nfp_prog *nfp_prog, swreg src, u8 bit, u16 addr,
239 		 u8 defer, bool set, enum nfp_relo_type relo)
240 {
241 	struct nfp_insn_re_regs reg;
242 	int err;
243 
	/* NOTE: The bit to test is specified as a rotation amount, such that
	 *	 the bit to test will be placed in the MSB of the result when
	 *	 doing a rotate right.  For bit X we need a right rotate by X + 1.
247 	 */
248 	bit += 1;
249 
250 	err = swreg_to_restricted(reg_none(), src, reg_imm(bit), &reg, false);
251 	if (err) {
252 		nfp_prog->error = err;
253 		return;
254 	}
255 
256 	__emit_br_bit(nfp_prog, reg.areg, reg.breg, addr, defer, set,
257 		      reg.src_lmextn);
258 
259 	nfp_prog->prog[nfp_prog->prog_len - 1] |=
260 		FIELD_PREP(OP_RELO_TYPE, relo);
261 }
262 
263 static void
264 emit_br_bset(struct nfp_prog *nfp_prog, swreg src, u8 bit, u16 addr, u8 defer)
265 {
266 	emit_br_bit_relo(nfp_prog, src, bit, addr, defer, true, RELO_BR_REL);
267 }
268 
269 static void
270 __emit_br_alu(struct nfp_prog *nfp_prog, u16 areg, u16 breg, u16 imm_hi,
271 	      u8 defer, bool dst_lmextn, bool src_lmextn)
272 {
273 	u64 insn;
274 
275 	insn = OP_BR_ALU_BASE |
276 		FIELD_PREP(OP_BR_ALU_A_SRC, areg) |
277 		FIELD_PREP(OP_BR_ALU_B_SRC, breg) |
278 		FIELD_PREP(OP_BR_ALU_DEFBR, defer) |
279 		FIELD_PREP(OP_BR_ALU_IMM_HI, imm_hi) |
280 		FIELD_PREP(OP_BR_ALU_SRC_LMEXTN, src_lmextn) |
281 		FIELD_PREP(OP_BR_ALU_DST_LMEXTN, dst_lmextn);
282 
283 	nfp_prog_push(nfp_prog, insn);
284 }
285 
286 static void emit_rtn(struct nfp_prog *nfp_prog, swreg base, u8 defer)
287 {
288 	struct nfp_insn_ur_regs reg;
289 	int err;
290 
291 	err = swreg_to_unrestricted(reg_none(), base, reg_imm(0), &reg);
292 	if (err) {
293 		nfp_prog->error = err;
294 		return;
295 	}
296 
297 	__emit_br_alu(nfp_prog, reg.areg, reg.breg, 0, defer, reg.dst_lmextn,
298 		      reg.src_lmextn);
299 }
300 
301 static void
302 __emit_immed(struct nfp_prog *nfp_prog, u16 areg, u16 breg, u16 imm_hi,
303 	     enum immed_width width, bool invert,
304 	     enum immed_shift shift, bool wr_both,
305 	     bool dst_lmextn, bool src_lmextn)
306 {
307 	u64 insn;
308 
309 	insn = OP_IMMED_BASE |
310 		FIELD_PREP(OP_IMMED_A_SRC, areg) |
311 		FIELD_PREP(OP_IMMED_B_SRC, breg) |
312 		FIELD_PREP(OP_IMMED_IMM, imm_hi) |
313 		FIELD_PREP(OP_IMMED_WIDTH, width) |
314 		FIELD_PREP(OP_IMMED_INV, invert) |
315 		FIELD_PREP(OP_IMMED_SHIFT, shift) |
316 		FIELD_PREP(OP_IMMED_WR_AB, wr_both) |
317 		FIELD_PREP(OP_IMMED_SRC_LMEXTN, src_lmextn) |
318 		FIELD_PREP(OP_IMMED_DST_LMEXTN, dst_lmextn);
319 
320 	nfp_prog_push(nfp_prog, insn);
321 }
322 
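/* emit_immed() - load a 16 bit immediate into @dst.
 * The value is split in two: the low 8 bits travel as the B operand
 * (reg_imm(imm & 0xff)) and the upper 8 bits go into the instruction's own
 * immediate field; @width and @shift control how the value is placed in @dst.
 */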
323 static void
324 emit_immed(struct nfp_prog *nfp_prog, swreg dst, u16 imm,
325 	   enum immed_width width, bool invert, enum immed_shift shift)
326 {
327 	struct nfp_insn_ur_regs reg;
328 	int err;
329 
330 	if (swreg_type(dst) == NN_REG_IMM) {
331 		nfp_prog->error = -EFAULT;
332 		return;
333 	}
334 
335 	err = swreg_to_unrestricted(dst, dst, reg_imm(imm & 0xff), &reg);
336 	if (err) {
337 		nfp_prog->error = err;
338 		return;
339 	}
340 
341 	/* Use reg.dst when destination is No-Dest. */
342 	__emit_immed(nfp_prog,
343 		     swreg_type(dst) == NN_REG_NONE ? reg.dst : reg.areg,
344 		     reg.breg, imm >> 8, width, invert, shift,
345 		     reg.wr_both, reg.dst_lmextn, reg.src_lmextn);
346 }
347 
348 static void
349 __emit_shf(struct nfp_prog *nfp_prog, u16 dst, enum alu_dst_ab dst_ab,
350 	   enum shf_sc sc, u8 shift,
351 	   u16 areg, enum shf_op op, u16 breg, bool i8, bool sw, bool wr_both,
352 	   bool dst_lmextn, bool src_lmextn)
353 {
354 	u64 insn;
355 
356 	if (!FIELD_FIT(OP_SHF_SHIFT, shift)) {
357 		nfp_prog->error = -EFAULT;
358 		return;
359 	}
360 
361 	if (sc == SHF_SC_L_SHF)
362 		shift = 32 - shift;
363 
364 	insn = OP_SHF_BASE |
365 		FIELD_PREP(OP_SHF_A_SRC, areg) |
366 		FIELD_PREP(OP_SHF_SC, sc) |
367 		FIELD_PREP(OP_SHF_B_SRC, breg) |
368 		FIELD_PREP(OP_SHF_I8, i8) |
369 		FIELD_PREP(OP_SHF_SW, sw) |
370 		FIELD_PREP(OP_SHF_DST, dst) |
371 		FIELD_PREP(OP_SHF_SHIFT, shift) |
372 		FIELD_PREP(OP_SHF_OP, op) |
373 		FIELD_PREP(OP_SHF_DST_AB, dst_ab) |
374 		FIELD_PREP(OP_SHF_WR_AB, wr_both) |
375 		FIELD_PREP(OP_SHF_SRC_LMEXTN, src_lmextn) |
376 		FIELD_PREP(OP_SHF_DST_LMEXTN, dst_lmextn);
377 
378 	nfp_prog_push(nfp_prog, insn);
379 }
380 
381 static void
382 emit_shf(struct nfp_prog *nfp_prog, swreg dst,
383 	 swreg lreg, enum shf_op op, swreg rreg, enum shf_sc sc, u8 shift)
384 {
385 	struct nfp_insn_re_regs reg;
386 	int err;
387 
388 	err = swreg_to_restricted(dst, lreg, rreg, &reg, true);
389 	if (err) {
390 		nfp_prog->error = err;
391 		return;
392 	}
393 
394 	__emit_shf(nfp_prog, reg.dst, reg.dst_ab, sc, shift,
395 		   reg.areg, op, reg.breg, reg.i8, reg.swap, reg.wr_both,
396 		   reg.dst_lmextn, reg.src_lmextn);
397 }
398 
399 static void
400 emit_shf_indir(struct nfp_prog *nfp_prog, swreg dst,
401 	       swreg lreg, enum shf_op op, swreg rreg, enum shf_sc sc)
402 {
403 	if (sc == SHF_SC_R_ROT) {
404 		pr_err("indirect shift is not allowed on rotation\n");
405 		nfp_prog->error = -EFAULT;
406 		return;
407 	}
408 
409 	emit_shf(nfp_prog, dst, lreg, op, rreg, sc, 0);
410 }
411 
412 static void
413 __emit_alu(struct nfp_prog *nfp_prog, u16 dst, enum alu_dst_ab dst_ab,
414 	   u16 areg, enum alu_op op, u16 breg, bool swap, bool wr_both,
415 	   bool dst_lmextn, bool src_lmextn)
416 {
417 	u64 insn;
418 
419 	insn = OP_ALU_BASE |
420 		FIELD_PREP(OP_ALU_A_SRC, areg) |
421 		FIELD_PREP(OP_ALU_B_SRC, breg) |
422 		FIELD_PREP(OP_ALU_DST, dst) |
423 		FIELD_PREP(OP_ALU_SW, swap) |
424 		FIELD_PREP(OP_ALU_OP, op) |
425 		FIELD_PREP(OP_ALU_DST_AB, dst_ab) |
426 		FIELD_PREP(OP_ALU_WR_AB, wr_both) |
427 		FIELD_PREP(OP_ALU_SRC_LMEXTN, src_lmextn) |
428 		FIELD_PREP(OP_ALU_DST_LMEXTN, dst_lmextn);
429 
430 	nfp_prog_push(nfp_prog, insn);
431 }
432 
433 static void
434 emit_alu(struct nfp_prog *nfp_prog, swreg dst,
435 	 swreg lreg, enum alu_op op, swreg rreg)
436 {
437 	struct nfp_insn_ur_regs reg;
438 	int err;
439 
440 	err = swreg_to_unrestricted(dst, lreg, rreg, &reg);
441 	if (err) {
442 		nfp_prog->error = err;
443 		return;
444 	}
445 
446 	__emit_alu(nfp_prog, reg.dst, reg.dst_ab,
447 		   reg.areg, op, reg.breg, reg.swap, reg.wr_both,
448 		   reg.dst_lmextn, reg.src_lmextn);
449 }
450 
451 static void
452 __emit_mul(struct nfp_prog *nfp_prog, enum alu_dst_ab dst_ab, u16 areg,
453 	   enum mul_type type, enum mul_step step, u16 breg, bool swap,
454 	   bool wr_both, bool dst_lmextn, bool src_lmextn)
455 {
456 	u64 insn;
457 
458 	insn = OP_MUL_BASE |
459 		FIELD_PREP(OP_MUL_A_SRC, areg) |
460 		FIELD_PREP(OP_MUL_B_SRC, breg) |
461 		FIELD_PREP(OP_MUL_STEP, step) |
462 		FIELD_PREP(OP_MUL_DST_AB, dst_ab) |
463 		FIELD_PREP(OP_MUL_SW, swap) |
464 		FIELD_PREP(OP_MUL_TYPE, type) |
465 		FIELD_PREP(OP_MUL_WR_AB, wr_both) |
466 		FIELD_PREP(OP_MUL_SRC_LMEXTN, src_lmextn) |
467 		FIELD_PREP(OP_MUL_DST_LMEXTN, dst_lmextn);
468 
469 	nfp_prog_push(nfp_prog, insn);
470 }
471 
472 static void
473 emit_mul(struct nfp_prog *nfp_prog, swreg lreg, enum mul_type type,
474 	 enum mul_step step, swreg rreg)
475 {
476 	struct nfp_insn_ur_regs reg;
477 	u16 areg;
478 	int err;
479 
480 	if (type == MUL_TYPE_START && step != MUL_STEP_NONE) {
481 		nfp_prog->error = -EINVAL;
482 		return;
483 	}
484 
485 	if (step == MUL_LAST || step == MUL_LAST_2) {
		/* When the type is a step and the step number is MUL_LAST or
		 * MUL_LAST_2, the left source is used as the destination.
488 		 */
489 		err = swreg_to_unrestricted(lreg, reg_none(), rreg, &reg);
490 		areg = reg.dst;
491 	} else {
492 		err = swreg_to_unrestricted(reg_none(), lreg, rreg, &reg);
493 		areg = reg.areg;
494 	}
495 
496 	if (err) {
497 		nfp_prog->error = err;
498 		return;
499 	}
500 
501 	__emit_mul(nfp_prog, reg.dst_ab, areg, type, step, reg.breg, reg.swap,
502 		   reg.wr_both, reg.dst_lmextn, reg.src_lmextn);
503 }
504 
505 static void
506 __emit_ld_field(struct nfp_prog *nfp_prog, enum shf_sc sc,
507 		u8 areg, u8 bmask, u8 breg, u8 shift, bool imm8,
508 		bool zero, bool swap, bool wr_both,
509 		bool dst_lmextn, bool src_lmextn)
510 {
511 	u64 insn;
512 
513 	insn = OP_LDF_BASE |
514 		FIELD_PREP(OP_LDF_A_SRC, areg) |
515 		FIELD_PREP(OP_LDF_SC, sc) |
516 		FIELD_PREP(OP_LDF_B_SRC, breg) |
517 		FIELD_PREP(OP_LDF_I8, imm8) |
518 		FIELD_PREP(OP_LDF_SW, swap) |
519 		FIELD_PREP(OP_LDF_ZF, zero) |
520 		FIELD_PREP(OP_LDF_BMASK, bmask) |
521 		FIELD_PREP(OP_LDF_SHF, shift) |
522 		FIELD_PREP(OP_LDF_WR_AB, wr_both) |
523 		FIELD_PREP(OP_LDF_SRC_LMEXTN, src_lmextn) |
524 		FIELD_PREP(OP_LDF_DST_LMEXTN, dst_lmextn);
525 
526 	nfp_prog_push(nfp_prog, insn);
527 }
528 
529 static void
530 emit_ld_field_any(struct nfp_prog *nfp_prog, swreg dst, u8 bmask, swreg src,
531 		  enum shf_sc sc, u8 shift, bool zero)
532 {
533 	struct nfp_insn_re_regs reg;
534 	int err;
535 
536 	/* Note: ld_field is special as it uses one of the src regs as dst */
537 	err = swreg_to_restricted(dst, dst, src, &reg, true);
538 	if (err) {
539 		nfp_prog->error = err;
540 		return;
541 	}
542 
543 	__emit_ld_field(nfp_prog, sc, reg.areg, bmask, reg.breg, shift,
544 			reg.i8, zero, reg.swap, reg.wr_both,
545 			reg.dst_lmextn, reg.src_lmextn);
546 }
547 
548 static void
549 emit_ld_field(struct nfp_prog *nfp_prog, swreg dst, u8 bmask, swreg src,
550 	      enum shf_sc sc, u8 shift)
551 {
552 	emit_ld_field_any(nfp_prog, dst, bmask, src, sc, shift, false);
553 }
554 
555 static void
556 __emit_lcsr(struct nfp_prog *nfp_prog, u16 areg, u16 breg, bool wr, u16 addr,
557 	    bool dst_lmextn, bool src_lmextn)
558 {
559 	u64 insn;
560 
561 	insn = OP_LCSR_BASE |
562 		FIELD_PREP(OP_LCSR_A_SRC, areg) |
563 		FIELD_PREP(OP_LCSR_B_SRC, breg) |
564 		FIELD_PREP(OP_LCSR_WRITE, wr) |
565 		FIELD_PREP(OP_LCSR_ADDR, addr / 4) |
566 		FIELD_PREP(OP_LCSR_SRC_LMEXTN, src_lmextn) |
567 		FIELD_PREP(OP_LCSR_DST_LMEXTN, dst_lmextn);
568 
569 	nfp_prog_push(nfp_prog, insn);
570 }
571 
572 static void emit_csr_wr(struct nfp_prog *nfp_prog, swreg src, u16 addr)
573 {
574 	struct nfp_insn_ur_regs reg;
575 	int err;
576 
577 	/* This instruction takes immeds instead of reg_none() for the ignored
578 	 * operand, but we can't encode 2 immeds in one instr with our normal
	 * swreg infra, so if the param is an immed we encode it as
	 * reg_none() and copy the immed to both operands.
581 	 */
582 	if (swreg_type(src) == NN_REG_IMM) {
583 		err = swreg_to_unrestricted(reg_none(), src, reg_none(), &reg);
584 		reg.breg = reg.areg;
585 	} else {
586 		err = swreg_to_unrestricted(reg_none(), src, reg_imm(0), &reg);
587 	}
588 	if (err) {
589 		nfp_prog->error = err;
590 		return;
591 	}
592 
593 	__emit_lcsr(nfp_prog, reg.areg, reg.breg, true, addr,
594 		    false, reg.src_lmextn);
595 }
596 
/* CSR value is read by the following immed[gpr, 0] */
598 static void __emit_csr_rd(struct nfp_prog *nfp_prog, u16 addr)
599 {
600 	__emit_lcsr(nfp_prog, 0, 0, false, addr, false, false);
601 }
602 
603 static void emit_nop(struct nfp_prog *nfp_prog)
604 {
605 	__emit_immed(nfp_prog, UR_REG_IMM, UR_REG_IMM, 0, 0, 0, 0, 0, 0, 0);
606 }
607 
608 /* --- Wrappers --- */
609 static bool pack_immed(u32 imm, u16 *val, enum immed_shift *shift)
610 {
611 	if (!(imm & 0xffff0000)) {
612 		*val = imm;
613 		*shift = IMMED_SHIFT_0B;
614 	} else if (!(imm & 0xff0000ff)) {
615 		*val = imm >> 8;
616 		*shift = IMMED_SHIFT_1B;
617 	} else if (!(imm & 0x0000ffff)) {
618 		*val = imm >> 16;
619 		*shift = IMMED_SHIFT_2B;
620 	} else {
621 		return false;
622 	}
623 
624 	return true;
625 }
626 
627 static void wrp_immed(struct nfp_prog *nfp_prog, swreg dst, u32 imm)
628 {
629 	enum immed_shift shift;
630 	u16 val;
631 
632 	if (pack_immed(imm, &val, &shift)) {
633 		emit_immed(nfp_prog, dst, val, IMMED_WIDTH_ALL, false, shift);
634 	} else if (pack_immed(~imm, &val, &shift)) {
635 		emit_immed(nfp_prog, dst, val, IMMED_WIDTH_ALL, true, shift);
636 	} else {
637 		emit_immed(nfp_prog, dst, imm & 0xffff, IMMED_WIDTH_ALL,
638 			   false, IMMED_SHIFT_0B);
639 		emit_immed(nfp_prog, dst, imm >> 16, IMMED_WIDTH_WORD,
640 			   false, IMMED_SHIFT_2B);
641 	}
642 }
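
/* For illustration: pack_immed(0x00abcd00) sees (imm & 0xff0000ff) == 0 and
 * returns val = 0xabcd with IMMED_SHIFT_1B, so wrp_immed() needs only one
 * immed instruction.  A value such as 0x12345678 matches neither packing
 * (nor its bitwise complement), so wrp_immed() falls back to two immeds:
 * the low half with IMMED_WIDTH_ALL and the high half with IMMED_WIDTH_WORD
 * shifted by two bytes.
 */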
643 
644 static void
645 wrp_immed_relo(struct nfp_prog *nfp_prog, swreg dst, u32 imm,
646 	       enum nfp_relo_type relo)
647 {
648 	if (imm > 0xffff) {
649 		pr_err("relocation of a large immediate!\n");
650 		nfp_prog->error = -EFAULT;
651 		return;
652 	}
653 	emit_immed(nfp_prog, dst, imm, IMMED_WIDTH_ALL, false, IMMED_SHIFT_0B);
654 
655 	nfp_prog->prog[nfp_prog->prog_len - 1] |=
656 		FIELD_PREP(OP_RELO_TYPE, relo);
657 }
658 
659 /* ur_load_imm_any() - encode immediate or use tmp register (unrestricted)
 * If @imm is small enough, encode it directly in the operand and return it;
 * otherwise load @imm into a spare register and return that register's
 * encoding.
662  */
663 static swreg ur_load_imm_any(struct nfp_prog *nfp_prog, u32 imm, swreg tmp_reg)
664 {
665 	if (FIELD_FIT(UR_REG_IMM_MAX, imm))
666 		return reg_imm(imm);
667 
668 	wrp_immed(nfp_prog, tmp_reg, imm);
669 	return tmp_reg;
670 }
671 
672 /* re_load_imm_any() - encode immediate or use tmp register (restricted)
 * If @imm is small enough, encode it directly in the operand and return it;
 * otherwise load @imm into a spare register and return that register's
 * encoding.
675  */
676 static swreg re_load_imm_any(struct nfp_prog *nfp_prog, u32 imm, swreg tmp_reg)
677 {
678 	if (FIELD_FIT(RE_REG_IMM_MAX, imm))
679 		return reg_imm(imm);
680 
681 	wrp_immed(nfp_prog, tmp_reg, imm);
682 	return tmp_reg;
683 }
684 
685 static void wrp_nops(struct nfp_prog *nfp_prog, unsigned int count)
686 {
687 	while (count--)
688 		emit_nop(nfp_prog);
689 }
690 
691 static void wrp_mov(struct nfp_prog *nfp_prog, swreg dst, swreg src)
692 {
693 	emit_alu(nfp_prog, dst, reg_none(), ALU_OP_NONE, src);
694 }
695 
696 static void wrp_reg_mov(struct nfp_prog *nfp_prog, u16 dst, u16 src)
697 {
698 	wrp_mov(nfp_prog, reg_both(dst), reg_b(src));
699 }
700 
/* wrp_reg_subpart() - load @field_len bytes from @offset of @src and write
 * the result to the low end of @dst.
703  */
704 static void
705 wrp_reg_subpart(struct nfp_prog *nfp_prog, swreg dst, swreg src, u8 field_len,
706 		u8 offset)
707 {
708 	enum shf_sc sc = offset ? SHF_SC_R_SHF : SHF_SC_NONE;
709 	u8 mask = (1 << field_len) - 1;
710 
711 	emit_ld_field_any(nfp_prog, dst, mask, src, sc, offset * 8, true);
712 }
713 
/* wrp_reg_or_subpart() - load @field_len bytes from the low end of @src and
 * OR the result into @dst at @offset; the other bits of @dst are unchanged.
716  */
717 static void
718 wrp_reg_or_subpart(struct nfp_prog *nfp_prog, swreg dst, swreg src,
719 		   u8 field_len, u8 offset)
720 {
721 	enum shf_sc sc = offset ? SHF_SC_L_SHF : SHF_SC_NONE;
722 	u8 mask = ((1 << field_len) - 1) << offset;
723 
724 	emit_ld_field(nfp_prog, dst, mask, src, sc, 32 - offset * 8);
725 }
726 
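/* addr40_offset() - prepare the A/B operands for a 40 bit address access.
 * The base address lives in a GPR pair (@src_gpr holds the low 32 bits,
 * @src_gpr + 1 the high bits); a non-zero @offset is added to the low word
 * and carried into the high word via the imm scratch registers.
 */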
727 static void
728 addr40_offset(struct nfp_prog *nfp_prog, u8 src_gpr, swreg offset,
729 	      swreg *rega, swreg *regb)
730 {
731 	if (offset == reg_imm(0)) {
732 		*rega = reg_a(src_gpr);
733 		*regb = reg_b(src_gpr + 1);
734 		return;
735 	}
736 
737 	emit_alu(nfp_prog, imm_a(nfp_prog), reg_a(src_gpr), ALU_OP_ADD, offset);
738 	emit_alu(nfp_prog, imm_b(nfp_prog), reg_b(src_gpr + 1), ALU_OP_ADD_C,
739 		 reg_imm(0));
740 	*rega = imm_a(nfp_prog);
741 	*regb = imm_b(nfp_prog);
742 }
743 
/* The NFP has a Command Push Pull bus which supports bulk memory operations. */
745 static int nfp_cpp_memcpy(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
746 {
747 	bool descending_seq = meta->ldst_gather_len < 0;
748 	s16 len = abs(meta->ldst_gather_len);
749 	swreg src_base, off;
750 	bool src_40bit_addr;
751 	unsigned int i;
752 	u8 xfer_num;
753 
754 	off = re_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog));
755 	src_40bit_addr = meta->ptr.type == PTR_TO_MAP_VALUE;
756 	src_base = reg_a(meta->insn.src_reg * 2);
757 	xfer_num = round_up(len, 4) / 4;
758 
759 	if (src_40bit_addr)
760 		addr40_offset(nfp_prog, meta->insn.src_reg * 2, off, &src_base,
761 			      &off);
762 
	/* Set up the PREV_ALU fields to override the memory read length. */
764 	if (len > 32)
765 		wrp_immed(nfp_prog, reg_none(),
766 			  CMD_OVE_LEN | FIELD_PREP(CMD_OV_LEN, xfer_num - 1));
767 
768 	/* Memory read from source addr into transfer-in registers. */
769 	emit_cmd_any(nfp_prog, CMD_TGT_READ32_SWAP,
770 		     src_40bit_addr ? CMD_MODE_40b_BA : CMD_MODE_32b, 0,
771 		     src_base, off, xfer_num - 1, CMD_CTX_SWAP, len > 32);
772 
773 	/* Move from transfer-in to transfer-out. */
774 	for (i = 0; i < xfer_num; i++)
775 		wrp_mov(nfp_prog, reg_xfer(i), reg_xfer(i));
776 
777 	off = re_load_imm_any(nfp_prog, meta->paired_st->off, imm_b(nfp_prog));
778 
779 	if (len <= 8) {
780 		/* Use single direct_ref write8. */
781 		emit_cmd(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 0,
782 			 reg_a(meta->paired_st->dst_reg * 2), off, len - 1,
783 			 CMD_CTX_SWAP);
784 	} else if (len <= 32 && IS_ALIGNED(len, 4)) {
785 		/* Use single direct_ref write32. */
786 		emit_cmd(nfp_prog, CMD_TGT_WRITE32_SWAP, CMD_MODE_32b, 0,
787 			 reg_a(meta->paired_st->dst_reg * 2), off, xfer_num - 1,
788 			 CMD_CTX_SWAP);
789 	} else if (len <= 32) {
790 		/* Use single indirect_ref write8. */
791 		wrp_immed(nfp_prog, reg_none(),
792 			  CMD_OVE_LEN | FIELD_PREP(CMD_OV_LEN, len - 1));
793 		emit_cmd_indir(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 0,
794 			       reg_a(meta->paired_st->dst_reg * 2), off,
795 			       len - 1, CMD_CTX_SWAP);
796 	} else if (IS_ALIGNED(len, 4)) {
797 		/* Use single indirect_ref write32. */
798 		wrp_immed(nfp_prog, reg_none(),
799 			  CMD_OVE_LEN | FIELD_PREP(CMD_OV_LEN, xfer_num - 1));
800 		emit_cmd_indir(nfp_prog, CMD_TGT_WRITE32_SWAP, CMD_MODE_32b, 0,
801 			       reg_a(meta->paired_st->dst_reg * 2), off,
802 			       xfer_num - 1, CMD_CTX_SWAP);
803 	} else if (len <= 40) {
		/* Use one direct_ref write32 to write the first 32 bytes, then
		 * another direct_ref write8 to write the remaining bytes.
806 		 */
807 		emit_cmd(nfp_prog, CMD_TGT_WRITE32_SWAP, CMD_MODE_32b, 0,
808 			 reg_a(meta->paired_st->dst_reg * 2), off, 7,
809 			 CMD_CTX_SWAP);
810 
811 		off = re_load_imm_any(nfp_prog, meta->paired_st->off + 32,
812 				      imm_b(nfp_prog));
813 		emit_cmd(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 8,
814 			 reg_a(meta->paired_st->dst_reg * 2), off, len - 33,
815 			 CMD_CTX_SWAP);
816 	} else {
		/* Use one indirect_ref write32 to write the 4-byte aligned part
		 * of the length, then another direct_ref write8 to write the
		 * remaining bytes.
819 		 */
820 		u8 new_off;
821 
822 		wrp_immed(nfp_prog, reg_none(),
823 			  CMD_OVE_LEN | FIELD_PREP(CMD_OV_LEN, xfer_num - 2));
824 		emit_cmd_indir(nfp_prog, CMD_TGT_WRITE32_SWAP, CMD_MODE_32b, 0,
825 			       reg_a(meta->paired_st->dst_reg * 2), off,
826 			       xfer_num - 2, CMD_CTX_SWAP);
827 		new_off = meta->paired_st->off + (xfer_num - 1) * 4;
828 		off = re_load_imm_any(nfp_prog, new_off, imm_b(nfp_prog));
829 		emit_cmd(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b,
830 			 xfer_num - 1, reg_a(meta->paired_st->dst_reg * 2), off,
831 			 (len & 0x3) - 1, CMD_CTX_SWAP);
832 	}
833 
	/* TODO: The following extra load is to make sure the data flow is
	 *  identical before and after the memory copy optimization.
	 *
	 *  The load destination register is not guaranteed to be dead, so we
	 *  need to make sure it holds the same value as it would have before
	 *  this transformation.
840 	 *
841 	 *  These extra loads could be removed once we have accurate register
842 	 *  usage information.
843 	 */
844 	if (descending_seq)
845 		xfer_num = 0;
846 	else if (BPF_SIZE(meta->insn.code) != BPF_DW)
847 		xfer_num = xfer_num - 1;
848 	else
849 		xfer_num = xfer_num - 2;
850 
851 	switch (BPF_SIZE(meta->insn.code)) {
852 	case BPF_B:
853 		wrp_reg_subpart(nfp_prog, reg_both(meta->insn.dst_reg * 2),
854 				reg_xfer(xfer_num), 1,
855 				IS_ALIGNED(len, 4) ? 3 : (len & 3) - 1);
856 		break;
857 	case BPF_H:
858 		wrp_reg_subpart(nfp_prog, reg_both(meta->insn.dst_reg * 2),
859 				reg_xfer(xfer_num), 2, (len & 3) ^ 2);
860 		break;
861 	case BPF_W:
862 		wrp_mov(nfp_prog, reg_both(meta->insn.dst_reg * 2),
863 			reg_xfer(0));
864 		break;
865 	case BPF_DW:
866 		wrp_mov(nfp_prog, reg_both(meta->insn.dst_reg * 2),
867 			reg_xfer(xfer_num));
868 		wrp_mov(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1),
869 			reg_xfer(xfer_num + 1));
870 		break;
871 	}
872 
873 	if (BPF_SIZE(meta->insn.code) != BPF_DW)
874 		wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1), 0);
875 
876 	return 0;
877 }
878 
879 static int
880 data_ld(struct nfp_prog *nfp_prog, swreg offset, u8 dst_gpr, int size)
881 {
882 	unsigned int i;
883 	u16 shift, sz;
884 
885 	/* We load the value from the address indicated in @offset and then
886 	 * shift out the data we don't need.  Note: this is big endian!
887 	 */
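	/* For example, a 2 byte load still reads a full 4 byte word and then
	 * shifts it right by 16 bits, leaving only the two requested
	 * (big-endian) bytes in the low half of the destination register.
	 */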
888 	sz = max(size, 4);
889 	shift = size < 4 ? 4 - size : 0;
890 
891 	emit_cmd(nfp_prog, CMD_TGT_READ8, CMD_MODE_32b, 0,
892 		 pptr_reg(nfp_prog), offset, sz - 1, CMD_CTX_SWAP);
893 
894 	i = 0;
895 	if (shift)
896 		emit_shf(nfp_prog, reg_both(dst_gpr), reg_none(), SHF_OP_NONE,
897 			 reg_xfer(0), SHF_SC_R_SHF, shift * 8);
898 	else
899 		for (; i * 4 < size; i++)
900 			wrp_mov(nfp_prog, reg_both(dst_gpr + i), reg_xfer(i));
901 
902 	if (i < 2)
903 		wrp_immed(nfp_prog, reg_both(dst_gpr + 1), 0);
904 
905 	return 0;
906 }
907 
908 static int
909 data_ld_host_order(struct nfp_prog *nfp_prog, u8 dst_gpr,
910 		   swreg lreg, swreg rreg, int size, enum cmd_mode mode)
911 {
912 	unsigned int i;
913 	u8 mask, sz;
914 
915 	/* We load the value from the address indicated in rreg + lreg and then
916 	 * mask out the data we don't need.  Note: this is little endian!
917 	 */
918 	sz = max(size, 4);
919 	mask = size < 4 ? GENMASK(size - 1, 0) : 0;
920 
921 	emit_cmd(nfp_prog, CMD_TGT_READ32_SWAP, mode, 0,
922 		 lreg, rreg, sz / 4 - 1, CMD_CTX_SWAP);
923 
924 	i = 0;
925 	if (mask)
926 		emit_ld_field_any(nfp_prog, reg_both(dst_gpr), mask,
927 				  reg_xfer(0), SHF_SC_NONE, 0, true);
928 	else
929 		for (; i * 4 < size; i++)
930 			wrp_mov(nfp_prog, reg_both(dst_gpr + i), reg_xfer(i));
931 
932 	if (i < 2)
933 		wrp_immed(nfp_prog, reg_both(dst_gpr + 1), 0);
934 
935 	return 0;
936 }
937 
938 static int
939 data_ld_host_order_addr32(struct nfp_prog *nfp_prog, u8 src_gpr, swreg offset,
940 			  u8 dst_gpr, u8 size)
941 {
942 	return data_ld_host_order(nfp_prog, dst_gpr, reg_a(src_gpr), offset,
943 				  size, CMD_MODE_32b);
944 }
945 
946 static int
947 data_ld_host_order_addr40(struct nfp_prog *nfp_prog, u8 src_gpr, swreg offset,
948 			  u8 dst_gpr, u8 size)
949 {
950 	swreg rega, regb;
951 
952 	addr40_offset(nfp_prog, src_gpr, offset, &rega, &regb);
953 
954 	return data_ld_host_order(nfp_prog, dst_gpr, rega, regb,
955 				  size, CMD_MODE_40b_BA);
956 }
957 
958 static int
959 construct_data_ind_ld(struct nfp_prog *nfp_prog, u16 offset, u16 src, u8 size)
960 {
961 	swreg tmp_reg;
962 
963 	/* Calculate the true offset (src_reg + imm) */
964 	tmp_reg = ur_load_imm_any(nfp_prog, offset, imm_b(nfp_prog));
965 	emit_alu(nfp_prog, imm_both(nfp_prog), reg_a(src), ALU_OP_ADD, tmp_reg);
966 
967 	/* Check packet length (size guaranteed to fit b/c it's u8) */
968 	emit_alu(nfp_prog, imm_a(nfp_prog),
969 		 imm_a(nfp_prog), ALU_OP_ADD, reg_imm(size));
970 	emit_alu(nfp_prog, reg_none(),
971 		 plen_reg(nfp_prog), ALU_OP_SUB, imm_a(nfp_prog));
972 	emit_br_relo(nfp_prog, BR_BLO, BR_OFF_RELO, 0, RELO_BR_GO_ABORT);
973 
974 	/* Load data */
975 	return data_ld(nfp_prog, imm_b(nfp_prog), 0, size);
976 }
977 
978 static int construct_data_ld(struct nfp_prog *nfp_prog, u16 offset, u8 size)
979 {
980 	swreg tmp_reg;
981 
982 	/* Check packet length */
983 	tmp_reg = ur_load_imm_any(nfp_prog, offset + size, imm_a(nfp_prog));
984 	emit_alu(nfp_prog, reg_none(), plen_reg(nfp_prog), ALU_OP_SUB, tmp_reg);
985 	emit_br_relo(nfp_prog, BR_BLO, BR_OFF_RELO, 0, RELO_BR_GO_ABORT);
986 
987 	/* Load data */
988 	tmp_reg = re_load_imm_any(nfp_prog, offset, imm_b(nfp_prog));
989 	return data_ld(nfp_prog, tmp_reg, 0, size);
990 }
991 
992 static int
993 data_stx_host_order(struct nfp_prog *nfp_prog, u8 dst_gpr, swreg offset,
994 		    u8 src_gpr, u8 size)
995 {
996 	unsigned int i;
997 
998 	for (i = 0; i * 4 < size; i++)
999 		wrp_mov(nfp_prog, reg_xfer(i), reg_a(src_gpr + i));
1000 
1001 	emit_cmd(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 0,
1002 		 reg_a(dst_gpr), offset, size - 1, CMD_CTX_SWAP);
1003 
1004 	return 0;
1005 }
1006 
1007 static int
1008 data_st_host_order(struct nfp_prog *nfp_prog, u8 dst_gpr, swreg offset,
1009 		   u64 imm, u8 size)
1010 {
1011 	wrp_immed(nfp_prog, reg_xfer(0), imm);
1012 	if (size == 8)
1013 		wrp_immed(nfp_prog, reg_xfer(1), imm >> 32);
1014 
1015 	emit_cmd(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 0,
1016 		 reg_a(dst_gpr), offset, size - 1, CMD_CTX_SWAP);
1017 
1018 	return 0;
1019 }
1020 
1021 typedef int
1022 (*lmem_step)(struct nfp_prog *nfp_prog, u8 gpr, u8 gpr_byte, s32 off,
1023 	     unsigned int size, bool first, bool new_gpr, bool last, bool lm3,
1024 	     bool needs_inc);
1025 
1026 static int
1027 wrp_lmem_load(struct nfp_prog *nfp_prog, u8 dst, u8 dst_byte, s32 off,
1028 	      unsigned int size, bool first, bool new_gpr, bool last, bool lm3,
1029 	      bool needs_inc)
1030 {
1031 	bool should_inc = needs_inc && new_gpr && !last;
1032 	u32 idx, src_byte;
1033 	enum shf_sc sc;
1034 	swreg reg;
1035 	int shf;
1036 	u8 mask;
1037 
1038 	if (WARN_ON_ONCE(dst_byte + size > 4 || off % 4 + size > 4))
1039 		return -EOPNOTSUPP;
1040 
1041 	idx = off / 4;
1042 
1043 	/* Move the entire word */
1044 	if (size == 4) {
1045 		wrp_mov(nfp_prog, reg_both(dst),
1046 			should_inc ? reg_lm_inc(3) : reg_lm(lm3 ? 3 : 0, idx));
1047 		return 0;
1048 	}
1049 
1050 	if (WARN_ON_ONCE(lm3 && idx > RE_REG_LM_IDX_MAX))
1051 		return -EOPNOTSUPP;
1052 
1053 	src_byte = off % 4;
1054 
1055 	mask = (1 << size) - 1;
1056 	mask <<= dst_byte;
1057 
1058 	if (WARN_ON_ONCE(mask > 0xf))
1059 		return -EOPNOTSUPP;
1060 
1061 	shf = abs(src_byte - dst_byte) * 8;
1062 	if (src_byte == dst_byte) {
1063 		sc = SHF_SC_NONE;
1064 	} else if (src_byte < dst_byte) {
1065 		shf = 32 - shf;
1066 		sc = SHF_SC_L_SHF;
1067 	} else {
1068 		sc = SHF_SC_R_SHF;
1069 	}
1070 
	/* ld_field can address fewer indexes; if the offset is too large do a
	 * RMW.  Because we RMW twice we waste 2 cycles on unaligned 8 byte
	 * writes.
1073 	 */
1074 	if (idx <= RE_REG_LM_IDX_MAX) {
1075 		reg = reg_lm(lm3 ? 3 : 0, idx);
1076 	} else {
1077 		reg = imm_a(nfp_prog);
1078 		/* If it's not the first part of the load and we start a new GPR
1079 		 * that means we are loading a second part of the LMEM word into
		 * a new GPR.  IOW we've already looked at that LMEM word and
1081 		 * therefore it has been loaded into imm_a().
1082 		 */
1083 		if (first || !new_gpr)
1084 			wrp_mov(nfp_prog, reg, reg_lm(0, idx));
1085 	}
1086 
1087 	emit_ld_field_any(nfp_prog, reg_both(dst), mask, reg, sc, shf, new_gpr);
1088 
1089 	if (should_inc)
1090 		wrp_mov(nfp_prog, reg_none(), reg_lm_inc(3));
1091 
1092 	return 0;
1093 }
1094 
1095 static int
1096 wrp_lmem_store(struct nfp_prog *nfp_prog, u8 src, u8 src_byte, s32 off,
1097 	       unsigned int size, bool first, bool new_gpr, bool last, bool lm3,
1098 	       bool needs_inc)
1099 {
1100 	bool should_inc = needs_inc && new_gpr && !last;
1101 	u32 idx, dst_byte;
1102 	enum shf_sc sc;
1103 	swreg reg;
1104 	int shf;
1105 	u8 mask;
1106 
1107 	if (WARN_ON_ONCE(src_byte + size > 4 || off % 4 + size > 4))
1108 		return -EOPNOTSUPP;
1109 
1110 	idx = off / 4;
1111 
1112 	/* Move the entire word */
1113 	if (size == 4) {
1114 		wrp_mov(nfp_prog,
1115 			should_inc ? reg_lm_inc(3) : reg_lm(lm3 ? 3 : 0, idx),
1116 			reg_b(src));
1117 		return 0;
1118 	}
1119 
1120 	if (WARN_ON_ONCE(lm3 && idx > RE_REG_LM_IDX_MAX))
1121 		return -EOPNOTSUPP;
1122 
1123 	dst_byte = off % 4;
1124 
1125 	mask = (1 << size) - 1;
1126 	mask <<= dst_byte;
1127 
1128 	if (WARN_ON_ONCE(mask > 0xf))
1129 		return -EOPNOTSUPP;
1130 
1131 	shf = abs(src_byte - dst_byte) * 8;
1132 	if (src_byte == dst_byte) {
1133 		sc = SHF_SC_NONE;
1134 	} else if (src_byte < dst_byte) {
1135 		shf = 32 - shf;
1136 		sc = SHF_SC_L_SHF;
1137 	} else {
1138 		sc = SHF_SC_R_SHF;
1139 	}
1140 
	/* ld_field can address fewer indexes; if the offset is too large do a
	 * RMW.  Because we RMW twice we waste 2 cycles on unaligned 8 byte
	 * writes.
1143 	 */
1144 	if (idx <= RE_REG_LM_IDX_MAX) {
1145 		reg = reg_lm(lm3 ? 3 : 0, idx);
1146 	} else {
1147 		reg = imm_a(nfp_prog);
1148 		/* Only first and last LMEM locations are going to need RMW,
1149 		 * the middle location will be overwritten fully.
1150 		 */
1151 		if (first || last)
1152 			wrp_mov(nfp_prog, reg, reg_lm(0, idx));
1153 	}
1154 
1155 	emit_ld_field(nfp_prog, reg, mask, reg_b(src), sc, shf);
1156 
1157 	if (new_gpr || last) {
1158 		if (idx > RE_REG_LM_IDX_MAX)
1159 			wrp_mov(nfp_prog, reg_lm(0, idx), reg);
1160 		if (should_inc)
1161 			wrp_mov(nfp_prog, reg_none(), reg_lm_inc(3));
1162 	}
1163 
1164 	return 0;
1165 }
1166 
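/* mem_op_stack() - walk a stack access and call @step for each slice.
 * The access is split so that no slice crosses a 4 byte boundary of either
 * the destination GPR or the LMEM word; @step (wrp_lmem_load() or
 * wrp_lmem_store()) then handles that slice.
 */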
1167 static int
1168 mem_op_stack(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
1169 	     unsigned int size, unsigned int ptr_off, u8 gpr, u8 ptr_gpr,
1170 	     bool clr_gpr, lmem_step step)
1171 {
1172 	s32 off = nfp_prog->stack_frame_depth + meta->insn.off + ptr_off;
1173 	bool first = true, last;
1174 	bool needs_inc = false;
1175 	swreg stack_off_reg;
1176 	u8 prev_gpr = 255;
1177 	u32 gpr_byte = 0;
1178 	bool lm3 = true;
1179 	int ret;
1180 
1181 	if (meta->ptr_not_const) {
1182 		/* Use of the last encountered ptr_off is OK, they all have
		 * the same alignment.  We depend on the low bits of the value
		 * being discarded when it is written to the LMaddr register.
1185 		 */
1186 		stack_off_reg = ur_load_imm_any(nfp_prog, meta->insn.off,
1187 						stack_imm(nfp_prog));
1188 
1189 		emit_alu(nfp_prog, imm_b(nfp_prog),
1190 			 reg_a(ptr_gpr), ALU_OP_ADD, stack_off_reg);
1191 
1192 		needs_inc = true;
1193 	} else if (off + size <= 64) {
1194 		/* We can reach bottom 64B with LMaddr0 */
1195 		lm3 = false;
1196 	} else if (round_down(off, 32) == round_down(off + size - 1, 32)) {
1197 		/* We have to set up a new pointer.  If we know the offset
1198 		 * and the entire access falls into a single 32 byte aligned
1199 		 * window we won't have to increment the LM pointer.
		 * The 32 byte alignment is important because the offset is
		 * ORed in, not added, when doing *l$indexN[off].
1202 		 */
1203 		stack_off_reg = ur_load_imm_any(nfp_prog, round_down(off, 32),
1204 						stack_imm(nfp_prog));
1205 		emit_alu(nfp_prog, imm_b(nfp_prog),
1206 			 stack_reg(nfp_prog), ALU_OP_ADD, stack_off_reg);
1207 
1208 		off %= 32;
1209 	} else {
1210 		stack_off_reg = ur_load_imm_any(nfp_prog, round_down(off, 4),
1211 						stack_imm(nfp_prog));
1212 
1213 		emit_alu(nfp_prog, imm_b(nfp_prog),
1214 			 stack_reg(nfp_prog), ALU_OP_ADD, stack_off_reg);
1215 
1216 		needs_inc = true;
1217 	}
1218 	if (lm3) {
1219 		emit_csr_wr(nfp_prog, imm_b(nfp_prog), NFP_CSR_ACT_LM_ADDR3);
		/* For size < 4 one slot is filled by zeroing the upper half. */
1221 		wrp_nops(nfp_prog, clr_gpr && size < 8 ? 2 : 3);
1222 	}
1223 
1224 	if (clr_gpr && size < 8)
1225 		wrp_immed(nfp_prog, reg_both(gpr + 1), 0);
1226 
1227 	while (size) {
1228 		u32 slice_end;
1229 		u8 slice_size;
1230 
1231 		slice_size = min(size, 4 - gpr_byte);
1232 		slice_end = min(off + slice_size, round_up(off + 1, 4));
1233 		slice_size = slice_end - off;
1234 
1235 		last = slice_size == size;
1236 
1237 		if (needs_inc)
1238 			off %= 4;
1239 
1240 		ret = step(nfp_prog, gpr, gpr_byte, off, slice_size,
1241 			   first, gpr != prev_gpr, last, lm3, needs_inc);
1242 		if (ret)
1243 			return ret;
1244 
1245 		prev_gpr = gpr;
1246 		first = false;
1247 
1248 		gpr_byte += slice_size;
1249 		if (gpr_byte >= 4) {
1250 			gpr_byte -= 4;
1251 			gpr++;
1252 		}
1253 
1254 		size -= slice_size;
1255 		off += slice_size;
1256 	}
1257 
1258 	return 0;
1259 }
1260 
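/* wrp_alu_imm() - ALU operation with a 32 bit immediate, with trivial cases
 * short-circuited: AND 0 clears @dst, OR ~0 sets @dst to ~0, XOR ~0 turns
 * into a NOT, and AND ~0 / OR 0 / XOR 0 emit nothing at all.
 */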
1261 static void
1262 wrp_alu_imm(struct nfp_prog *nfp_prog, u8 dst, enum alu_op alu_op, u32 imm)
1263 {
1264 	swreg tmp_reg;
1265 
1266 	if (alu_op == ALU_OP_AND) {
1267 		if (!imm)
1268 			wrp_immed(nfp_prog, reg_both(dst), 0);
1269 		if (!imm || !~imm)
1270 			return;
1271 	}
1272 	if (alu_op == ALU_OP_OR) {
1273 		if (!~imm)
1274 			wrp_immed(nfp_prog, reg_both(dst), ~0U);
1275 		if (!imm || !~imm)
1276 			return;
1277 	}
1278 	if (alu_op == ALU_OP_XOR) {
1279 		if (!~imm)
1280 			emit_alu(nfp_prog, reg_both(dst), reg_none(),
1281 				 ALU_OP_NOT, reg_b(dst));
1282 		if (!imm || !~imm)
1283 			return;
1284 	}
1285 
1286 	tmp_reg = ur_load_imm_any(nfp_prog, imm, imm_b(nfp_prog));
1287 	emit_alu(nfp_prog, reg_both(dst), reg_a(dst), alu_op, tmp_reg);
1288 }
1289 
1290 static int
1291 wrp_alu64_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
1292 	      enum alu_op alu_op, bool skip)
1293 {
1294 	const struct bpf_insn *insn = &meta->insn;
1295 	u64 imm = insn->imm; /* sign extend */
1296 
1297 	if (skip) {
1298 		meta->skip = true;
1299 		return 0;
1300 	}
1301 
1302 	wrp_alu_imm(nfp_prog, insn->dst_reg * 2, alu_op, imm & ~0U);
1303 	wrp_alu_imm(nfp_prog, insn->dst_reg * 2 + 1, alu_op, imm >> 32);
1304 
1305 	return 0;
1306 }
1307 
1308 static int
1309 wrp_alu64_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
1310 	      enum alu_op alu_op)
1311 {
1312 	u8 dst = meta->insn.dst_reg * 2, src = meta->insn.src_reg * 2;
1313 
1314 	emit_alu(nfp_prog, reg_both(dst), reg_a(dst), alu_op, reg_b(src));
1315 	emit_alu(nfp_prog, reg_both(dst + 1),
1316 		 reg_a(dst + 1), alu_op, reg_b(src + 1));
1317 
1318 	return 0;
1319 }
1320 
1321 static int
1322 wrp_alu32_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
1323 	      enum alu_op alu_op, bool skip)
1324 {
1325 	const struct bpf_insn *insn = &meta->insn;
1326 
1327 	if (skip) {
1328 		meta->skip = true;
1329 		return 0;
1330 	}
1331 
1332 	wrp_alu_imm(nfp_prog, insn->dst_reg * 2, alu_op, insn->imm);
1333 	wrp_immed(nfp_prog, reg_both(insn->dst_reg * 2 + 1), 0);
1334 
1335 	return 0;
1336 }
1337 
1338 static int
1339 wrp_alu32_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
1340 	      enum alu_op alu_op)
1341 {
1342 	u8 dst = meta->insn.dst_reg * 2, src = meta->insn.src_reg * 2;
1343 
1344 	emit_alu(nfp_prog, reg_both(dst), reg_a(dst), alu_op, reg_b(src));
1345 	wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1), 0);
1346 
1347 	return 0;
1348 }
1349 
1350 static void
1351 wrp_test_reg_one(struct nfp_prog *nfp_prog, u8 dst, enum alu_op alu_op, u8 src,
1352 		 enum br_mask br_mask, u16 off)
1353 {
1354 	emit_alu(nfp_prog, reg_none(), reg_a(dst), alu_op, reg_b(src));
1355 	emit_br(nfp_prog, br_mask, off, 0);
1356 }
1357 
1358 static int
1359 wrp_test_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
1360 	     enum alu_op alu_op, enum br_mask br_mask)
1361 {
1362 	const struct bpf_insn *insn = &meta->insn;
1363 
1364 	wrp_test_reg_one(nfp_prog, insn->dst_reg * 2, alu_op,
1365 			 insn->src_reg * 2, br_mask, insn->off);
1366 	wrp_test_reg_one(nfp_prog, insn->dst_reg * 2 + 1, alu_op,
1367 			 insn->src_reg * 2 + 1, br_mask, insn->off);
1368 
1369 	return 0;
1370 }
1371 
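/* Map BPF conditional jump codes (BPF_OP() >> 4) to NFP branch masks.
 * When @swap is set the compare operands are swapped so the condition can be
 * expressed with the available mask, e.g. BPF_JGT (dst > src) is emitted as
 * "src < dst" using BR_BLO.
 */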
1372 static const struct jmp_code_map {
1373 	enum br_mask br_mask;
1374 	bool swap;
1375 } jmp_code_map[] = {
1376 	[BPF_JGT >> 4]	= { BR_BLO, true },
1377 	[BPF_JGE >> 4]	= { BR_BHS, false },
1378 	[BPF_JLT >> 4]	= { BR_BLO, false },
1379 	[BPF_JLE >> 4]	= { BR_BHS, true },
1380 	[BPF_JSGT >> 4]	= { BR_BLT, true },
1381 	[BPF_JSGE >> 4]	= { BR_BGE, false },
1382 	[BPF_JSLT >> 4]	= { BR_BLT, false },
1383 	[BPF_JSLE >> 4]	= { BR_BGE, true },
1384 };
1385 
1386 static const struct jmp_code_map *nfp_jmp_code_get(struct nfp_insn_meta *meta)
1387 {
1388 	unsigned int op;
1389 
1390 	op = BPF_OP(meta->insn.code) >> 4;
	/* br_mask of 0 is BR_BEQ, which we don't use in the jump code table */
1392 	if (WARN_ONCE(op >= ARRAY_SIZE(jmp_code_map) ||
1393 		      !jmp_code_map[op].br_mask,
1394 		      "no code found for jump instruction"))
1395 		return NULL;
1396 
1397 	return &jmp_code_map[op];
1398 }
1399 
1400 static int cmp_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1401 {
1402 	const struct bpf_insn *insn = &meta->insn;
1403 	u64 imm = insn->imm; /* sign extend */
1404 	const struct jmp_code_map *code;
1405 	enum alu_op alu_op, carry_op;
1406 	u8 reg = insn->dst_reg * 2;
1407 	swreg tmp_reg;
1408 
1409 	code = nfp_jmp_code_get(meta);
1410 	if (!code)
1411 		return -EINVAL;
1412 
1413 	alu_op = meta->jump_neg_op ? ALU_OP_ADD : ALU_OP_SUB;
1414 	carry_op = meta->jump_neg_op ? ALU_OP_ADD_C : ALU_OP_SUB_C;
1415 
1416 	tmp_reg = ur_load_imm_any(nfp_prog, imm & ~0U, imm_b(nfp_prog));
1417 	if (!code->swap)
1418 		emit_alu(nfp_prog, reg_none(), reg_a(reg), alu_op, tmp_reg);
1419 	else
1420 		emit_alu(nfp_prog, reg_none(), tmp_reg, alu_op, reg_a(reg));
1421 
1422 	tmp_reg = ur_load_imm_any(nfp_prog, imm >> 32, imm_b(nfp_prog));
1423 	if (!code->swap)
1424 		emit_alu(nfp_prog, reg_none(),
1425 			 reg_a(reg + 1), carry_op, tmp_reg);
1426 	else
1427 		emit_alu(nfp_prog, reg_none(),
1428 			 tmp_reg, carry_op, reg_a(reg + 1));
1429 
1430 	emit_br(nfp_prog, code->br_mask, insn->off, 0);
1431 
1432 	return 0;
1433 }
1434 
1435 static int cmp_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1436 {
1437 	const struct bpf_insn *insn = &meta->insn;
1438 	const struct jmp_code_map *code;
1439 	u8 areg, breg;
1440 
1441 	code = nfp_jmp_code_get(meta);
1442 	if (!code)
1443 		return -EINVAL;
1444 
1445 	areg = insn->dst_reg * 2;
1446 	breg = insn->src_reg * 2;
1447 
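	/* code->swap: exchange the operands in place (XOR swap) so the
	 * condition maps onto the available branch mask.
	 */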
1448 	if (code->swap) {
1449 		areg ^= breg;
1450 		breg ^= areg;
1451 		areg ^= breg;
1452 	}
1453 
1454 	emit_alu(nfp_prog, reg_none(), reg_a(areg), ALU_OP_SUB, reg_b(breg));
1455 	emit_alu(nfp_prog, reg_none(),
1456 		 reg_a(areg + 1), ALU_OP_SUB_C, reg_b(breg + 1));
1457 	emit_br(nfp_prog, code->br_mask, insn->off, 0);
1458 
1459 	return 0;
1460 }
1461 
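/* wrp_end32() - byte swap a 32 bit word (endianness conversion) using two
 * masked ld_field rotates: a full rotate right by 8, then bytes 0 and 2 are
 * patched from the same word rotated right by 16.
 */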
1462 static void wrp_end32(struct nfp_prog *nfp_prog, swreg reg_in, u8 gpr_out)
1463 {
1464 	emit_ld_field(nfp_prog, reg_both(gpr_out), 0xf, reg_in,
1465 		      SHF_SC_R_ROT, 8);
1466 	emit_ld_field(nfp_prog, reg_both(gpr_out), 0x5, reg_a(gpr_out),
1467 		      SHF_SC_R_ROT, 16);
1468 }
1469 
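/* Full 32x32 multiply is driven as a sequence of multiplier steps: a start
 * step with both operands, four intermediate 32x32 steps, and one or two
 * "last" steps which move the low (and, if requested, high) 32 bits of the
 * product into @dst_lo/@dst_hi.
 */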
1470 static void
1471 wrp_mul_u32(struct nfp_prog *nfp_prog, swreg dst_hi, swreg dst_lo, swreg lreg,
1472 	    swreg rreg, bool gen_high_half)
1473 {
1474 	emit_mul(nfp_prog, lreg, MUL_TYPE_START, MUL_STEP_NONE, rreg);
1475 	emit_mul(nfp_prog, lreg, MUL_TYPE_STEP_32x32, MUL_STEP_1, rreg);
1476 	emit_mul(nfp_prog, lreg, MUL_TYPE_STEP_32x32, MUL_STEP_2, rreg);
1477 	emit_mul(nfp_prog, lreg, MUL_TYPE_STEP_32x32, MUL_STEP_3, rreg);
1478 	emit_mul(nfp_prog, lreg, MUL_TYPE_STEP_32x32, MUL_STEP_4, rreg);
1479 	emit_mul(nfp_prog, dst_lo, MUL_TYPE_STEP_32x32, MUL_LAST, reg_none());
1480 	if (gen_high_half)
1481 		emit_mul(nfp_prog, dst_hi, MUL_TYPE_STEP_32x32, MUL_LAST_2,
1482 			 reg_none());
1483 	else
1484 		wrp_immed(nfp_prog, dst_hi, 0);
1485 }
1486 
1487 static void
1488 wrp_mul_u16(struct nfp_prog *nfp_prog, swreg dst_hi, swreg dst_lo, swreg lreg,
1489 	    swreg rreg)
1490 {
1491 	emit_mul(nfp_prog, lreg, MUL_TYPE_START, MUL_STEP_NONE, rreg);
1492 	emit_mul(nfp_prog, lreg, MUL_TYPE_STEP_16x16, MUL_STEP_1, rreg);
1493 	emit_mul(nfp_prog, lreg, MUL_TYPE_STEP_16x16, MUL_STEP_2, rreg);
1494 	emit_mul(nfp_prog, dst_lo, MUL_TYPE_STEP_16x16, MUL_LAST, reg_none());
1495 }
1496 
1497 static int
1498 wrp_mul(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
1499 	bool gen_high_half, bool ropnd_from_reg)
1500 {
1501 	swreg multiplier, multiplicand, dst_hi, dst_lo;
1502 	const struct bpf_insn *insn = &meta->insn;
1503 	u32 lopnd_max, ropnd_max;
1504 	u8 dst_reg;
1505 
1506 	dst_reg = insn->dst_reg;
1507 	multiplicand = reg_a(dst_reg * 2);
1508 	dst_hi = reg_both(dst_reg * 2 + 1);
1509 	dst_lo = reg_both(dst_reg * 2);
1510 	lopnd_max = meta->umax_dst;
1511 	if (ropnd_from_reg) {
1512 		multiplier = reg_b(insn->src_reg * 2);
1513 		ropnd_max = meta->umax_src;
1514 	} else {
1515 		u32 imm = insn->imm;
1516 
1517 		multiplier = ur_load_imm_any(nfp_prog, imm, imm_b(nfp_prog));
1518 		ropnd_max = imm;
1519 	}
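	/* The cheaper 16x16 multiplier steps are only sufficient when both
	 * operands are known to fit in 16 bits; otherwise use the full 32x32
	 * sequence.
	 */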
1520 	if (lopnd_max > U16_MAX || ropnd_max > U16_MAX)
1521 		wrp_mul_u32(nfp_prog, dst_hi, dst_lo, multiplicand, multiplier,
1522 			    gen_high_half);
1523 	else
1524 		wrp_mul_u16(nfp_prog, dst_hi, dst_lo, multiplicand, multiplier);
1525 
1526 	return 0;
1527 }
1528 
1529 static int wrp_div_imm(struct nfp_prog *nfp_prog, u8 dst, u64 imm)
1530 {
1531 	swreg dst_both = reg_both(dst), dst_a = reg_a(dst), dst_b = reg_a(dst);
1532 	struct reciprocal_value_adv rvalue;
1533 	u8 pre_shift, exp;
1534 	swreg magic;
1535 
1536 	if (imm > U32_MAX) {
1537 		wrp_immed(nfp_prog, dst_both, 0);
1538 		return 0;
1539 	}
1540 
	/* NOTE: because we are using "reciprocal_value_adv", which doesn't
	 * support "divisor > (1u << 31)", we need to JIT a separate NFP
	 * sequence for that case.  The quotient is then either 0 or 1, and
	 * equals the result of the unsigned comparison "dst >= imm", which
	 * can be calculated using the following NFP sequence:
1546 	 *
1547 	 *  alu[--, dst, -, imm]
1548 	 *  immed[imm, 0]
1549 	 *  alu[dst, imm, +carry, 0]
1550 	 *
1551 	 */
1552 	if (imm > 1U << 31) {
1553 		swreg tmp_b = ur_load_imm_any(nfp_prog, imm, imm_b(nfp_prog));
1554 
1555 		emit_alu(nfp_prog, reg_none(), dst_a, ALU_OP_SUB, tmp_b);
1556 		wrp_immed(nfp_prog, imm_a(nfp_prog), 0);
1557 		emit_alu(nfp_prog, dst_both, imm_a(nfp_prog), ALU_OP_ADD_C,
1558 			 reg_imm(0));
1559 		return 0;
1560 	}
1561 
1562 	rvalue = reciprocal_value_adv(imm, 32);
1563 	exp = rvalue.exp;
1564 	if (rvalue.is_wide_m && !(imm & 1)) {
1565 		pre_shift = fls(imm & -imm) - 1;
1566 		rvalue = reciprocal_value_adv(imm >> pre_shift, 32 - pre_shift);
1567 	} else {
1568 		pre_shift = 0;
1569 	}
1570 	magic = ur_load_imm_any(nfp_prog, rvalue.m, imm_b(nfp_prog));
1571 	if (imm == 1U << exp) {
1572 		emit_shf(nfp_prog, dst_both, reg_none(), SHF_OP_NONE, dst_b,
1573 			 SHF_SC_R_SHF, exp);
1574 	} else if (rvalue.is_wide_m) {
1575 		wrp_mul_u32(nfp_prog, imm_both(nfp_prog), reg_none(), dst_a,
1576 			    magic, true);
1577 		emit_alu(nfp_prog, dst_both, dst_a, ALU_OP_SUB,
1578 			 imm_b(nfp_prog));
1579 		emit_shf(nfp_prog, dst_both, reg_none(), SHF_OP_NONE, dst_b,
1580 			 SHF_SC_R_SHF, 1);
1581 		emit_alu(nfp_prog, dst_both, dst_a, ALU_OP_ADD,
1582 			 imm_b(nfp_prog));
1583 		emit_shf(nfp_prog, dst_both, reg_none(), SHF_OP_NONE, dst_b,
1584 			 SHF_SC_R_SHF, rvalue.sh - 1);
1585 	} else {
1586 		if (pre_shift)
1587 			emit_shf(nfp_prog, dst_both, reg_none(), SHF_OP_NONE,
1588 				 dst_b, SHF_SC_R_SHF, pre_shift);
1589 		wrp_mul_u32(nfp_prog, dst_both, reg_none(), dst_a, magic, true);
1590 		emit_shf(nfp_prog, dst_both, reg_none(), SHF_OP_NONE,
1591 			 dst_b, SHF_SC_R_SHF, rvalue.sh);
1592 	}
1593 
1594 	return 0;
1595 }
1596 
1597 static int adjust_head(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1598 {
1599 	swreg tmp = imm_a(nfp_prog), tmp_len = imm_b(nfp_prog);
1600 	struct nfp_bpf_cap_adjust_head *adjust_head;
1601 	u32 ret_einval, end;
1602 
1603 	adjust_head = &nfp_prog->bpf->adjust_head;
1604 
1605 	/* Optimized version - 5 vs 14 cycles */
1606 	if (nfp_prog->adjust_head_location != UINT_MAX) {
1607 		if (WARN_ON_ONCE(nfp_prog->adjust_head_location != meta->n))
1608 			return -EINVAL;
1609 
1610 		emit_alu(nfp_prog, pptr_reg(nfp_prog),
1611 			 reg_a(2 * 2), ALU_OP_ADD, pptr_reg(nfp_prog));
1612 		emit_alu(nfp_prog, plen_reg(nfp_prog),
1613 			 plen_reg(nfp_prog), ALU_OP_SUB, reg_a(2 * 2));
1614 		emit_alu(nfp_prog, pv_len(nfp_prog),
1615 			 pv_len(nfp_prog), ALU_OP_SUB, reg_a(2 * 2));
1616 
1617 		wrp_immed(nfp_prog, reg_both(0), 0);
1618 		wrp_immed(nfp_prog, reg_both(1), 0);
1619 
1620 		/* TODO: when adjust head is guaranteed to succeed we can
1621 		 * also eliminate the following if (r0 == 0) branch.
1622 		 */
1623 
1624 		return 0;
1625 	}
1626 
1627 	ret_einval = nfp_prog_current_offset(nfp_prog) + 14;
1628 	end = ret_einval + 2;
1629 
1630 	/* We need to use a temp because offset is just a part of the pkt ptr */
1631 	emit_alu(nfp_prog, tmp,
1632 		 reg_a(2 * 2), ALU_OP_ADD_2B, pptr_reg(nfp_prog));
1633 
1634 	/* Validate result will fit within FW datapath constraints */
1635 	emit_alu(nfp_prog, reg_none(),
1636 		 tmp, ALU_OP_SUB, reg_imm(adjust_head->off_min));
1637 	emit_br(nfp_prog, BR_BLO, ret_einval, 0);
1638 	emit_alu(nfp_prog, reg_none(),
1639 		 reg_imm(adjust_head->off_max), ALU_OP_SUB, tmp);
1640 	emit_br(nfp_prog, BR_BLO, ret_einval, 0);
1641 
1642 	/* Validate the length is at least ETH_HLEN */
1643 	emit_alu(nfp_prog, tmp_len,
1644 		 plen_reg(nfp_prog), ALU_OP_SUB, reg_a(2 * 2));
1645 	emit_alu(nfp_prog, reg_none(),
1646 		 tmp_len, ALU_OP_SUB, reg_imm(ETH_HLEN));
1647 	emit_br(nfp_prog, BR_BMI, ret_einval, 0);
1648 
1649 	/* Load the ret code */
1650 	wrp_immed(nfp_prog, reg_both(0), 0);
1651 	wrp_immed(nfp_prog, reg_both(1), 0);
1652 
1653 	/* Modify the packet metadata */
1654 	emit_ld_field(nfp_prog, pptr_reg(nfp_prog), 0x3, tmp, SHF_SC_NONE, 0);
1655 
1656 	/* Skip over the -EINVAL ret code (defer 2) */
1657 	emit_br(nfp_prog, BR_UNC, end, 2);
1658 
1659 	emit_alu(nfp_prog, plen_reg(nfp_prog),
1660 		 plen_reg(nfp_prog), ALU_OP_SUB, reg_a(2 * 2));
1661 	emit_alu(nfp_prog, pv_len(nfp_prog),
1662 		 pv_len(nfp_prog), ALU_OP_SUB, reg_a(2 * 2));
1663 
1664 	/* return -EINVAL target */
1665 	if (!nfp_prog_confirm_current_offset(nfp_prog, ret_einval))
1666 		return -EINVAL;
1667 
1668 	wrp_immed(nfp_prog, reg_both(0), -22);
1669 	wrp_immed(nfp_prog, reg_both(1), ~0);
1670 
1671 	if (!nfp_prog_confirm_current_offset(nfp_prog, end))
1672 		return -EINVAL;
1673 
1674 	return 0;
1675 }
1676 
1677 static int adjust_tail(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1678 {
1679 	u32 ret_einval, end;
1680 	swreg plen, delta;
1681 
1682 	BUILD_BUG_ON(plen_reg(nfp_prog) != reg_b(STATIC_REG_PKT_LEN));
1683 
1684 	plen = imm_a(nfp_prog);
1685 	delta = reg_a(2 * 2);
1686 
1687 	ret_einval = nfp_prog_current_offset(nfp_prog) + 9;
1688 	end = nfp_prog_current_offset(nfp_prog) + 11;
1689 
1690 	/* Calculate resulting length */
1691 	emit_alu(nfp_prog, plen, plen_reg(nfp_prog), ALU_OP_ADD, delta);
	/* delta == 0 is not allowed by the kernel; the add must overflow (set
	 * the carry) for the length to become smaller.
1694 	 */
1695 	emit_br(nfp_prog, BR_BCC, ret_einval, 0);
1696 
1697 	/* if (new_len < 14) then -EINVAL */
1698 	emit_alu(nfp_prog, reg_none(), plen, ALU_OP_SUB, reg_imm(ETH_HLEN));
1699 	emit_br(nfp_prog, BR_BMI, ret_einval, 0);
1700 
1701 	emit_alu(nfp_prog, plen_reg(nfp_prog),
1702 		 plen_reg(nfp_prog), ALU_OP_ADD, delta);
1703 	emit_alu(nfp_prog, pv_len(nfp_prog),
1704 		 pv_len(nfp_prog), ALU_OP_ADD, delta);
1705 
1706 	emit_br(nfp_prog, BR_UNC, end, 2);
1707 	wrp_immed(nfp_prog, reg_both(0), 0);
1708 	wrp_immed(nfp_prog, reg_both(1), 0);
1709 
1710 	if (!nfp_prog_confirm_current_offset(nfp_prog, ret_einval))
1711 		return -EINVAL;
1712 
1713 	wrp_immed(nfp_prog, reg_both(0), -22);
1714 	wrp_immed(nfp_prog, reg_both(1), ~0);
1715 
1716 	if (!nfp_prog_confirm_current_offset(nfp_prog, end))
1717 		return -EINVAL;
1718 
1719 	return 0;
1720 }
1721 
1722 static int
1723 map_call_stack_common(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1724 {
1725 	bool load_lm_ptr;
1726 	u32 ret_tgt;
1727 	s64 lm_off;
1728 
1729 	/* We only have to reload LM0 if the key is not at start of stack */
1730 	lm_off = nfp_prog->stack_frame_depth;
1731 	lm_off += meta->arg2.reg.var_off.value + meta->arg2.reg.off;
1732 	load_lm_ptr = meta->arg2.var_off || lm_off;
1733 
1734 	/* Set LM0 to start of key */
1735 	if (load_lm_ptr)
1736 		emit_csr_wr(nfp_prog, reg_b(2 * 2), NFP_CSR_ACT_LM_ADDR0);
1737 	if (meta->func_id == BPF_FUNC_map_update_elem)
1738 		emit_csr_wr(nfp_prog, reg_b(3 * 2), NFP_CSR_ACT_LM_ADDR2);
1739 
1740 	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO + meta->func_id,
1741 		     2, RELO_BR_HELPER);
1742 	ret_tgt = nfp_prog_current_offset(nfp_prog) + 2;
1743 
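	/* The two loads below sit in the branch delay slots (defer 2), so they
	 * execute before control reaches the helper.
	 */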
1744 	/* Load map ID into A0 */
1745 	wrp_mov(nfp_prog, reg_a(0), reg_a(2));
1746 
1747 	/* Load the return address into B0 */
1748 	wrp_immed_relo(nfp_prog, reg_b(0), ret_tgt, RELO_IMMED_REL);
1749 
1750 	if (!nfp_prog_confirm_current_offset(nfp_prog, ret_tgt))
1751 		return -EINVAL;
1752 
1753 	/* Reset the LM0 pointer */
1754 	if (!load_lm_ptr)
1755 		return 0;
1756 
1757 	emit_csr_wr(nfp_prog, stack_reg(nfp_prog), NFP_CSR_ACT_LM_ADDR0);
1758 	wrp_nops(nfp_prog, 3);
1759 
1760 	return 0;
1761 }
1762 
1763 static int
1764 nfp_get_prandom_u32(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1765 {
1766 	__emit_csr_rd(nfp_prog, NFP_CSR_PSEUDO_RND_NUM);
	/* CSR value is read in the following immed[gpr, 0] */
1768 	emit_immed(nfp_prog, reg_both(0), 0,
1769 		   IMMED_WIDTH_ALL, false, IMMED_SHIFT_0B);
1770 	emit_immed(nfp_prog, reg_both(1), 0,
1771 		   IMMED_WIDTH_ALL, false, IMMED_SHIFT_0B);
1772 	return 0;
1773 }
1774 
1775 static int
1776 nfp_perf_event_output(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1777 {
1778 	swreg ptr_type;
1779 	u32 ret_tgt;
1780 
1781 	ptr_type = ur_load_imm_any(nfp_prog, meta->arg1.type, imm_a(nfp_prog));
1782 
1783 	ret_tgt = nfp_prog_current_offset(nfp_prog) + 3;
1784 
1785 	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO + meta->func_id,
1786 		     2, RELO_BR_HELPER);
1787 
1788 	/* Load ptr type into A1 */
1789 	wrp_mov(nfp_prog, reg_a(1), ptr_type);
1790 
1791 	/* Load the return address into B0 */
1792 	wrp_immed_relo(nfp_prog, reg_b(0), ret_tgt, RELO_IMMED_REL);
1793 
1794 	if (!nfp_prog_confirm_current_offset(nfp_prog, ret_tgt))
1795 		return -EINVAL;
1796 
1797 	return 0;
1798 }
1799 
1800 static int
1801 nfp_queue_select(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1802 {
1803 	u32 jmp_tgt;
1804 
1805 	jmp_tgt = nfp_prog_current_offset(nfp_prog) + 5;
1806 
	/* Make sure the queue id fits into the FW field */
1808 	emit_alu(nfp_prog, reg_none(), reg_a(meta->insn.src_reg * 2),
1809 		 ALU_OP_AND_NOT_B, reg_imm(0xff));
1810 	emit_br(nfp_prog, BR_BEQ, jmp_tgt, 2);
1811 
1812 	/* Set the 'queue selected' bit and the queue value */
1813 	emit_shf(nfp_prog, pv_qsel_set(nfp_prog),
1814 		 pv_qsel_set(nfp_prog), SHF_OP_OR, reg_imm(1),
1815 		 SHF_SC_L_SHF, PKT_VEL_QSEL_SET_BIT);
1816 	emit_ld_field(nfp_prog,
1817 		      pv_qsel_val(nfp_prog), 0x1, reg_b(meta->insn.src_reg * 2),
1818 		      SHF_SC_NONE, 0);
	/* Delay slots end here; we will jump over the next instruction if the
	 * queue value fits into the field.
	 */
1822 	emit_ld_field(nfp_prog,
1823 		      pv_qsel_val(nfp_prog), 0x1, reg_imm(NFP_NET_RXR_MAX),
1824 		      SHF_SC_NONE, 0);
1825 
1826 	if (!nfp_prog_confirm_current_offset(nfp_prog, jmp_tgt))
1827 		return -EINVAL;
1828 
1829 	return 0;
1830 }
1831 
1832 /* --- Callbacks --- */
1833 static int mov_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1834 {
1835 	const struct bpf_insn *insn = &meta->insn;
1836 	u8 dst = insn->dst_reg * 2;
1837 	u8 src = insn->src_reg * 2;
1838 
1839 	if (insn->src_reg == BPF_REG_10) {
1840 		swreg stack_depth_reg;
1841 
1842 		stack_depth_reg = ur_load_imm_any(nfp_prog,
1843 						  nfp_prog->stack_frame_depth,
1844 						  stack_imm(nfp_prog));
1845 		emit_alu(nfp_prog, reg_both(dst), stack_reg(nfp_prog),
1846 			 ALU_OP_ADD, stack_depth_reg);
1847 		wrp_immed(nfp_prog, reg_both(dst + 1), 0);
1848 	} else {
1849 		wrp_reg_mov(nfp_prog, dst, src);
1850 		wrp_reg_mov(nfp_prog, dst + 1, src + 1);
1851 	}
1852 
1853 	return 0;
1854 }
1855 
1856 static int mov_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1857 {
1858 	u64 imm = meta->insn.imm; /* sign extend */
1859 
1860 	wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2), imm & ~0U);
1861 	wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1), imm >> 32);
1862 
1863 	return 0;
1864 }
1865 
1866 static int xor_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1867 {
1868 	return wrp_alu64_reg(nfp_prog, meta, ALU_OP_XOR);
1869 }
1870 
1871 static int xor_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1872 {
1873 	return wrp_alu64_imm(nfp_prog, meta, ALU_OP_XOR, !meta->insn.imm);
1874 }
1875 
1876 static int and_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1877 {
1878 	return wrp_alu64_reg(nfp_prog, meta, ALU_OP_AND);
1879 }
1880 
1881 static int and_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1882 {
1883 	return wrp_alu64_imm(nfp_prog, meta, ALU_OP_AND, !~meta->insn.imm);
1884 }
1885 
1886 static int or_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1887 {
1888 	return wrp_alu64_reg(nfp_prog, meta, ALU_OP_OR);
1889 }
1890 
1891 static int or_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1892 {
1893 	return wrp_alu64_imm(nfp_prog, meta, ALU_OP_OR, !meta->insn.imm);
1894 }
1895 
1896 static int add_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1897 {
1898 	const struct bpf_insn *insn = &meta->insn;
1899 
1900 	emit_alu(nfp_prog, reg_both(insn->dst_reg * 2),
1901 		 reg_a(insn->dst_reg * 2), ALU_OP_ADD,
1902 		 reg_b(insn->src_reg * 2));
1903 	emit_alu(nfp_prog, reg_both(insn->dst_reg * 2 + 1),
1904 		 reg_a(insn->dst_reg * 2 + 1), ALU_OP_ADD_C,
1905 		 reg_b(insn->src_reg * 2 + 1));
1906 
1907 	return 0;
1908 }
1909 
1910 static int add_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1911 {
1912 	const struct bpf_insn *insn = &meta->insn;
1913 	u64 imm = insn->imm; /* sign extend */
1914 
1915 	wrp_alu_imm(nfp_prog, insn->dst_reg * 2, ALU_OP_ADD, imm & ~0U);
1916 	wrp_alu_imm(nfp_prog, insn->dst_reg * 2 + 1, ALU_OP_ADD_C, imm >> 32);
1917 
1918 	return 0;
1919 }
1920 
1921 static int sub_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1922 {
1923 	const struct bpf_insn *insn = &meta->insn;
1924 
1925 	emit_alu(nfp_prog, reg_both(insn->dst_reg * 2),
1926 		 reg_a(insn->dst_reg * 2), ALU_OP_SUB,
1927 		 reg_b(insn->src_reg * 2));
1928 	emit_alu(nfp_prog, reg_both(insn->dst_reg * 2 + 1),
1929 		 reg_a(insn->dst_reg * 2 + 1), ALU_OP_SUB_C,
1930 		 reg_b(insn->src_reg * 2 + 1));
1931 
1932 	return 0;
1933 }
1934 
1935 static int sub_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1936 {
1937 	const struct bpf_insn *insn = &meta->insn;
1938 	u64 imm = insn->imm; /* sign extend */
1939 
1940 	wrp_alu_imm(nfp_prog, insn->dst_reg * 2, ALU_OP_SUB, imm & ~0U);
1941 	wrp_alu_imm(nfp_prog, insn->dst_reg * 2 + 1, ALU_OP_SUB_C, imm >> 32);
1942 
1943 	return 0;
1944 }
1945 
1946 static int mul_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1947 {
1948 	return wrp_mul(nfp_prog, meta, true, true);
1949 }
1950 
1951 static int mul_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1952 {
1953 	return wrp_mul(nfp_prog, meta, true, false);
1954 }
1955 
1956 static int div_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1957 {
1958 	const struct bpf_insn *insn = &meta->insn;
1959 
1960 	return wrp_div_imm(nfp_prog, insn->dst_reg * 2, insn->imm);
1961 }
1962 
1963 static int div_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1964 {
	/* NOTE: the verifier hook has already rejected the cases for which the
	 * verifier doesn't know whether the source operand is constant.
	 */
1968 	return wrp_div_imm(nfp_prog, meta->insn.dst_reg * 2, meta->umin_src);
1969 }
1970 
1971 static int neg_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1972 {
1973 	const struct bpf_insn *insn = &meta->insn;
1974 
1975 	emit_alu(nfp_prog, reg_both(insn->dst_reg * 2), reg_imm(0),
1976 		 ALU_OP_SUB, reg_b(insn->dst_reg * 2));
1977 	emit_alu(nfp_prog, reg_both(insn->dst_reg * 2 + 1), reg_imm(0),
1978 		 ALU_OP_SUB_C, reg_b(insn->dst_reg * 2 + 1));
1979 
1980 	return 0;
1981 }
1982 
1983 /* Pseudo code:
1984  *   if shift_amt >= 32
1985  *     dst_high = dst_low << shift_amt[4:0]
1986  *     dst_low = 0;
1987  *   else
1988  *     dst_high = (dst_high, dst_low) >> (32 - shift_amt)
1989  *     dst_low = dst_low << shift_amt
1990  *
1991  * The indirect shift will use the same logic at runtime.
1992  */
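/* For example, a shift by 40 gives dst_high = dst_low << 8 and dst_low = 0,
 * while a shift by 12 gives dst_high = (dst_high << 12) | (dst_low >> 20)
 * (the dual-shift above) and dst_low = dst_low << 12.
 */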
1993 static int __shl_imm64(struct nfp_prog *nfp_prog, u8 dst, u8 shift_amt)
1994 {
1995 	if (shift_amt < 32) {
1996 		emit_shf(nfp_prog, reg_both(dst + 1), reg_a(dst + 1),
1997 			 SHF_OP_NONE, reg_b(dst), SHF_SC_R_DSHF,
1998 			 32 - shift_amt);
1999 		emit_shf(nfp_prog, reg_both(dst), reg_none(), SHF_OP_NONE,
2000 			 reg_b(dst), SHF_SC_L_SHF, shift_amt);
2001 	} else if (shift_amt == 32) {
2002 		wrp_reg_mov(nfp_prog, dst + 1, dst);
2003 		wrp_immed(nfp_prog, reg_both(dst), 0);
2004 	} else if (shift_amt > 32) {
2005 		emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_NONE,
2006 			 reg_b(dst), SHF_SC_L_SHF, shift_amt - 32);
2007 		wrp_immed(nfp_prog, reg_both(dst), 0);
2008 	}
2009 
2010 	return 0;
2011 }
2012 
2013 static int shl_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2014 {
2015 	const struct bpf_insn *insn = &meta->insn;
2016 	u8 dst = insn->dst_reg * 2;
2017 
2018 	return __shl_imm64(nfp_prog, dst, insn->imm);
2019 }
2020 
2021 static void shl_reg64_lt32_high(struct nfp_prog *nfp_prog, u8 dst, u8 src)
2022 {
2023 	emit_alu(nfp_prog, imm_both(nfp_prog), reg_imm(32), ALU_OP_SUB,
2024 		 reg_b(src));
2025 	emit_alu(nfp_prog, reg_none(), imm_a(nfp_prog), ALU_OP_OR, reg_imm(0));
2026 	emit_shf_indir(nfp_prog, reg_both(dst + 1), reg_a(dst + 1), SHF_OP_NONE,
2027 		       reg_b(dst), SHF_SC_R_DSHF);
2028 }
2029 
/* NOTE: for indirect left shift, the HIGH part must be calculated first,
 * because the dual-shift producing it reads the not-yet-shifted LOW part.
 */
2031 static void shl_reg64_lt32_low(struct nfp_prog *nfp_prog, u8 dst, u8 src)
2032 {
2033 	emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_imm(0));
2034 	emit_shf_indir(nfp_prog, reg_both(dst), reg_none(), SHF_OP_NONE,
2035 		       reg_b(dst), SHF_SC_L_SHF);
2036 }
2037 
2038 static void shl_reg64_lt32(struct nfp_prog *nfp_prog, u8 dst, u8 src)
2039 {
2040 	shl_reg64_lt32_high(nfp_prog, dst, src);
2041 	shl_reg64_lt32_low(nfp_prog, dst, src);
2042 }
2043 
2044 static void shl_reg64_ge32(struct nfp_prog *nfp_prog, u8 dst, u8 src)
2045 {
2046 	emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_imm(0));
2047 	emit_shf_indir(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_NONE,
2048 		       reg_b(dst), SHF_SC_L_SHF);
2049 	wrp_immed(nfp_prog, reg_both(dst), 0);
2050 }
2051 
2052 static int shl_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2053 {
2054 	const struct bpf_insn *insn = &meta->insn;
2055 	u64 umin, umax;
2056 	u8 dst, src;
2057 
2058 	dst = insn->dst_reg * 2;
2059 	umin = meta->umin_src;
2060 	umax = meta->umax_src;
2061 	if (umin == umax)
2062 		return __shl_imm64(nfp_prog, dst, umin);
2063 
2064 	src = insn->src_reg * 2;
2065 	if (umax < 32) {
2066 		shl_reg64_lt32(nfp_prog, dst, src);
2067 	} else if (umin >= 32) {
2068 		shl_reg64_ge32(nfp_prog, dst, src);
2069 	} else {
2070 		/* Generate different instruction sequences depending on runtime
2071 		 * value of shift amount.
2072 		 */
2073 		u16 label_ge32, label_end;
2074 
2075 		label_ge32 = nfp_prog_current_offset(nfp_prog) + 7;
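		/* Bit 5 of the shift amount set means shift_amt >= 32. */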
2076 		emit_br_bset(nfp_prog, reg_a(src), 5, label_ge32, 0);
2077 
2078 		shl_reg64_lt32_high(nfp_prog, dst, src);
2079 		label_end = nfp_prog_current_offset(nfp_prog) + 6;
2080 		emit_br(nfp_prog, BR_UNC, label_end, 2);
2081 		/* shl_reg64_lt32_low packed in delay slot. */
2082 		shl_reg64_lt32_low(nfp_prog, dst, src);
2083 
2084 		if (!nfp_prog_confirm_current_offset(nfp_prog, label_ge32))
2085 			return -EINVAL;
2086 		shl_reg64_ge32(nfp_prog, dst, src);
2087 
2088 		if (!nfp_prog_confirm_current_offset(nfp_prog, label_end))
2089 			return -EINVAL;
2090 	}
2091 
2092 	return 0;
2093 }
2094 
/* Pseudo code:
 *   if shift_amt >= 32
 *     dst_low = dst_high >> shift_amt[4:0]
 *     dst_high = 0;
 *   else
 *     dst_low = (dst_high, dst_low) >> shift_amt
 *     dst_high = dst_high >> shift_amt
 *
 * The indirect shift will use the same logic at runtime.
 */
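/* For example, a shift by 40 gives dst_low = dst_high >> 8 and dst_high = 0,
 * while a shift by 12 gives dst_low = (dst_high << 20) | (dst_low >> 12)
 * (the dual-shift above) and dst_high = dst_high >> 12.
 */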
2105 static int __shr_imm64(struct nfp_prog *nfp_prog, u8 dst, u8 shift_amt)
2106 {
2107 	if (shift_amt < 32) {
2108 		emit_shf(nfp_prog, reg_both(dst), reg_a(dst + 1), SHF_OP_NONE,
2109 			 reg_b(dst), SHF_SC_R_DSHF, shift_amt);
2110 		emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_NONE,
2111 			 reg_b(dst + 1), SHF_SC_R_SHF, shift_amt);
2112 	} else if (shift_amt == 32) {
2113 		wrp_reg_mov(nfp_prog, dst, dst + 1);
2114 		wrp_immed(nfp_prog, reg_both(dst + 1), 0);
2115 	} else if (shift_amt > 32) {
2116 		emit_shf(nfp_prog, reg_both(dst), reg_none(), SHF_OP_NONE,
2117 			 reg_b(dst + 1), SHF_SC_R_SHF, shift_amt - 32);
2118 		wrp_immed(nfp_prog, reg_both(dst + 1), 0);
2119 	}
2120 
2121 	return 0;
2122 }
2123 
2124 static int shr_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2125 {
2126 	const struct bpf_insn *insn = &meta->insn;
2127 	u8 dst = insn->dst_reg * 2;
2128 
2129 	return __shr_imm64(nfp_prog, dst, insn->imm);
2130 }
2131 
/* NOTE: for indirect right shift, the LOW part must be calculated first,
 * because the dual-shift producing it reads the not-yet-shifted HIGH part.
 */
2133 static void shr_reg64_lt32_high(struct nfp_prog *nfp_prog, u8 dst, u8 src)
2134 {
2135 	emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_imm(0));
2136 	emit_shf_indir(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_NONE,
2137 		       reg_b(dst + 1), SHF_SC_R_SHF);
2138 }
2139 
2140 static void shr_reg64_lt32_low(struct nfp_prog *nfp_prog, u8 dst, u8 src)
2141 {
2142 	emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_imm(0));
2143 	emit_shf_indir(nfp_prog, reg_both(dst), reg_a(dst + 1), SHF_OP_NONE,
2144 		       reg_b(dst), SHF_SC_R_DSHF);
2145 }
2146 
2147 static void shr_reg64_lt32(struct nfp_prog *nfp_prog, u8 dst, u8 src)
2148 {
2149 	shr_reg64_lt32_low(nfp_prog, dst, src);
2150 	shr_reg64_lt32_high(nfp_prog, dst, src);
2151 }
2152 
2153 static void shr_reg64_ge32(struct nfp_prog *nfp_prog, u8 dst, u8 src)
2154 {
2155 	emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_imm(0));
2156 	emit_shf_indir(nfp_prog, reg_both(dst), reg_none(), SHF_OP_NONE,
2157 		       reg_b(dst + 1), SHF_SC_R_SHF);
2158 	wrp_immed(nfp_prog, reg_both(dst + 1), 0);
2159 }
2160 
2161 static int shr_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2162 {
2163 	const struct bpf_insn *insn = &meta->insn;
2164 	u64 umin, umax;
2165 	u8 dst, src;
2166 
2167 	dst = insn->dst_reg * 2;
2168 	umin = meta->umin_src;
2169 	umax = meta->umax_src;
2170 	if (umin == umax)
2171 		return __shr_imm64(nfp_prog, dst, umin);
2172 
2173 	src = insn->src_reg * 2;
2174 	if (umax < 32) {
2175 		shr_reg64_lt32(nfp_prog, dst, src);
2176 	} else if (umin >= 32) {
2177 		shr_reg64_ge32(nfp_prog, dst, src);
2178 	} else {
2179 		/* Generate different instruction sequences depending on runtime
2180 		 * value of shift amount.
2181 		 */
2182 		u16 label_ge32, label_end;
2183 
2184 		label_ge32 = nfp_prog_current_offset(nfp_prog) + 6;
2185 		emit_br_bset(nfp_prog, reg_a(src), 5, label_ge32, 0);
2186 		shr_reg64_lt32_low(nfp_prog, dst, src);
2187 		label_end = nfp_prog_current_offset(nfp_prog) + 6;
2188 		emit_br(nfp_prog, BR_UNC, label_end, 2);
2189 		/* shr_reg64_lt32_high packed in delay slot. */
2190 		shr_reg64_lt32_high(nfp_prog, dst, src);
2191 
2192 		if (!nfp_prog_confirm_current_offset(nfp_prog, label_ge32))
2193 			return -EINVAL;
2194 		shr_reg64_ge32(nfp_prog, dst, src);
2195 
2196 		if (!nfp_prog_confirm_current_offset(nfp_prog, label_end))
2197 			return -EINVAL;
2198 	}
2199 
2200 	return 0;
2201 }
2202 
/* Code logic is the same as __shr_imm64, except that ashr requires the
 * signedness bit to be supplied through the PREV_ALU result.
 */
2206 static int __ashr_imm64(struct nfp_prog *nfp_prog, u8 dst, u8 shift_amt)
2207 {
2208 	if (shift_amt < 32) {
2209 		emit_shf(nfp_prog, reg_both(dst), reg_a(dst + 1), SHF_OP_NONE,
2210 			 reg_b(dst), SHF_SC_R_DSHF, shift_amt);
2211 		/* Set signedness bit. */
2212 		emit_alu(nfp_prog, reg_none(), reg_a(dst + 1), ALU_OP_OR,
2213 			 reg_imm(0));
2214 		emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR,
2215 			 reg_b(dst + 1), SHF_SC_R_SHF, shift_amt);
2216 	} else if (shift_amt == 32) {
		/* NOTE: this also helps set the signedness bit. */
2218 		wrp_reg_mov(nfp_prog, dst, dst + 1);
2219 		emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR,
2220 			 reg_b(dst + 1), SHF_SC_R_SHF, 31);
2221 	} else if (shift_amt > 32) {
2222 		emit_alu(nfp_prog, reg_none(), reg_a(dst + 1), ALU_OP_OR,
2223 			 reg_imm(0));
2224 		emit_shf(nfp_prog, reg_both(dst), reg_none(), SHF_OP_ASHR,
2225 			 reg_b(dst + 1), SHF_SC_R_SHF, shift_amt - 32);
2226 		emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR,
2227 			 reg_b(dst + 1), SHF_SC_R_SHF, 31);
2228 	}
2229 
2230 	return 0;
2231 }
2232 
2233 static int ashr_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2234 {
2235 	const struct bpf_insn *insn = &meta->insn;
2236 	u8 dst = insn->dst_reg * 2;
2237 
2238 	return __ashr_imm64(nfp_prog, dst, insn->imm);
2239 }
2240 
2241 static void ashr_reg64_lt32_high(struct nfp_prog *nfp_prog, u8 dst, u8 src)
2242 {
	/* NOTE: the first insn sets both the indirect shift amount (source A)
	 * and the signedness bit (MSB of the result).
	 */
2246 	emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_b(dst + 1));
2247 	emit_shf_indir(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR,
2248 		       reg_b(dst + 1), SHF_SC_R_SHF);
2249 }
2250 
2251 static void ashr_reg64_lt32_low(struct nfp_prog *nfp_prog, u8 dst, u8 src)
2252 {
	/* NOTE: this is the same as a logical shift because we don't need to
	 * shift in the signedness bit when the shift amount is less than 32.
	 */
2256 	return shr_reg64_lt32_low(nfp_prog, dst, src);
2257 }
2258 
2259 static void ashr_reg64_lt32(struct nfp_prog *nfp_prog, u8 dst, u8 src)
2260 {
2261 	ashr_reg64_lt32_low(nfp_prog, dst, src);
2262 	ashr_reg64_lt32_high(nfp_prog, dst, src);
2263 }
2264 
2265 static void ashr_reg64_ge32(struct nfp_prog *nfp_prog, u8 dst, u8 src)
2266 {
2267 	emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_b(dst + 1));
2268 	emit_shf_indir(nfp_prog, reg_both(dst), reg_none(), SHF_OP_ASHR,
2269 		       reg_b(dst + 1), SHF_SC_R_SHF);
2270 	emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR,
2271 		 reg_b(dst + 1), SHF_SC_R_SHF, 31);
2272 }
2273 
2274 /* Like ashr_imm64, but need to use indirect shift. */
2275 static int ashr_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2276 {
2277 	const struct bpf_insn *insn = &meta->insn;
2278 	u64 umin, umax;
2279 	u8 dst, src;
2280 
2281 	dst = insn->dst_reg * 2;
2282 	umin = meta->umin_src;
2283 	umax = meta->umax_src;
2284 	if (umin == umax)
2285 		return __ashr_imm64(nfp_prog, dst, umin);
2286 
2287 	src = insn->src_reg * 2;
2288 	if (umax < 32) {
2289 		ashr_reg64_lt32(nfp_prog, dst, src);
2290 	} else if (umin >= 32) {
2291 		ashr_reg64_ge32(nfp_prog, dst, src);
2292 	} else {
2293 		u16 label_ge32, label_end;
2294 
2295 		label_ge32 = nfp_prog_current_offset(nfp_prog) + 6;
2296 		emit_br_bset(nfp_prog, reg_a(src), 5, label_ge32, 0);
2297 		ashr_reg64_lt32_low(nfp_prog, dst, src);
2298 		label_end = nfp_prog_current_offset(nfp_prog) + 6;
2299 		emit_br(nfp_prog, BR_UNC, label_end, 2);
2300 		/* ashr_reg64_lt32_high packed in delay slot. */
2301 		ashr_reg64_lt32_high(nfp_prog, dst, src);
2302 
2303 		if (!nfp_prog_confirm_current_offset(nfp_prog, label_ge32))
2304 			return -EINVAL;
2305 		ashr_reg64_ge32(nfp_prog, dst, src);
2306 
2307 		if (!nfp_prog_confirm_current_offset(nfp_prog, label_end))
2308 			return -EINVAL;
2309 	}
2310 
2311 	return 0;
2312 }
2313 
2314 static int mov_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2315 {
2316 	const struct bpf_insn *insn = &meta->insn;
2317 
2318 	wrp_reg_mov(nfp_prog, insn->dst_reg * 2,  insn->src_reg * 2);
2319 	wrp_immed(nfp_prog, reg_both(insn->dst_reg * 2 + 1), 0);
2320 
2321 	return 0;
2322 }
2323 
2324 static int mov_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2325 {
2326 	const struct bpf_insn *insn = &meta->insn;
2327 
2328 	wrp_immed(nfp_prog, reg_both(insn->dst_reg * 2), insn->imm);
2329 	wrp_immed(nfp_prog, reg_both(insn->dst_reg * 2 + 1), 0);
2330 
2331 	return 0;
2332 }
2333 
2334 static int xor_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2335 {
2336 	return wrp_alu32_reg(nfp_prog, meta, ALU_OP_XOR);
2337 }
2338 
2339 static int xor_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2340 {
2341 	return wrp_alu32_imm(nfp_prog, meta, ALU_OP_XOR, !~meta->insn.imm);
2342 }
2343 
2344 static int and_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2345 {
2346 	return wrp_alu32_reg(nfp_prog, meta, ALU_OP_AND);
2347 }
2348 
2349 static int and_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2350 {
2351 	return wrp_alu32_imm(nfp_prog, meta, ALU_OP_AND, !~meta->insn.imm);
2352 }
2353 
2354 static int or_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2355 {
2356 	return wrp_alu32_reg(nfp_prog, meta, ALU_OP_OR);
2357 }
2358 
2359 static int or_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2360 {
2361 	return wrp_alu32_imm(nfp_prog, meta, ALU_OP_OR, !meta->insn.imm);
2362 }
2363 
2364 static int add_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2365 {
2366 	return wrp_alu32_reg(nfp_prog, meta, ALU_OP_ADD);
2367 }
2368 
2369 static int add_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2370 {
2371 	return wrp_alu32_imm(nfp_prog, meta, ALU_OP_ADD, !meta->insn.imm);
2372 }
2373 
2374 static int sub_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2375 {
2376 	return wrp_alu32_reg(nfp_prog, meta, ALU_OP_SUB);
2377 }
2378 
2379 static int sub_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2380 {
2381 	return wrp_alu32_imm(nfp_prog, meta, ALU_OP_SUB, !meta->insn.imm);
2382 }
2383 
2384 static int mul_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2385 {
2386 	return wrp_mul(nfp_prog, meta, false, true);
2387 }
2388 
2389 static int mul_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2390 {
2391 	return wrp_mul(nfp_prog, meta, false, false);
2392 }
2393 
2394 static int div_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2395 {
2396 	return div_reg64(nfp_prog, meta);
2397 }
2398 
2399 static int div_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2400 {
2401 	return div_imm64(nfp_prog, meta);
2402 }
2403 
2404 static int neg_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2405 {
2406 	u8 dst = meta->insn.dst_reg * 2;
2407 
2408 	emit_alu(nfp_prog, reg_both(dst), reg_imm(0), ALU_OP_SUB, reg_b(dst));
2409 	wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1), 0);
2410 
2411 	return 0;
2412 }
2413 
2414 static int shl_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2415 {
2416 	const struct bpf_insn *insn = &meta->insn;
2417 
2418 	if (!insn->imm)
2419 		return 1; /* TODO: zero shift means indirect */
2420 
2421 	emit_shf(nfp_prog, reg_both(insn->dst_reg * 2),
2422 		 reg_none(), SHF_OP_NONE, reg_b(insn->dst_reg * 2),
2423 		 SHF_SC_L_SHF, insn->imm);
2424 	wrp_immed(nfp_prog, reg_both(insn->dst_reg * 2 + 1), 0);
2425 
2426 	return 0;
2427 }
2428 
2429 static int end_reg32(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2430 {
2431 	const struct bpf_insn *insn = &meta->insn;
2432 	u8 gpr = insn->dst_reg * 2;
2433 
2434 	switch (insn->imm) {
2435 	case 16:
2436 		emit_ld_field(nfp_prog, reg_both(gpr), 0x9, reg_b(gpr),
2437 			      SHF_SC_R_ROT, 8);
2438 		emit_ld_field(nfp_prog, reg_both(gpr), 0xe, reg_a(gpr),
2439 			      SHF_SC_R_SHF, 16);
2440 
2441 		wrp_immed(nfp_prog, reg_both(gpr + 1), 0);
2442 		break;
2443 	case 32:
2444 		wrp_end32(nfp_prog, reg_a(gpr), gpr);
2445 		wrp_immed(nfp_prog, reg_both(gpr + 1), 0);
2446 		break;
2447 	case 64:
2448 		wrp_mov(nfp_prog, imm_a(nfp_prog), reg_b(gpr + 1));
2449 
2450 		wrp_end32(nfp_prog, reg_a(gpr), gpr + 1);
2451 		wrp_end32(nfp_prog, imm_a(nfp_prog), gpr);
2452 		break;
2453 	}
2454 
2455 	return 0;
2456 }
2457 
2458 static int imm_ld8_part2(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2459 {
2460 	struct nfp_insn_meta *prev = nfp_meta_prev(meta);
2461 	u32 imm_lo, imm_hi;
2462 	u8 dst;
2463 
2464 	dst = prev->insn.dst_reg * 2;
2465 	imm_lo = prev->insn.imm;
2466 	imm_hi = meta->insn.imm;
2467 
2468 	wrp_immed(nfp_prog, reg_both(dst), imm_lo);
2469 
2470 	/* mov is always 1 insn, load imm may be two, so try to use mov */
2471 	if (imm_hi == imm_lo)
2472 		wrp_mov(nfp_prog, reg_both(dst + 1), reg_a(dst));
2473 	else
2474 		wrp_immed(nfp_prog, reg_both(dst + 1), imm_hi);
2475 
2476 	return 0;
2477 }
2478 
2479 static int imm_ld8(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2480 {
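	/* BPF_LD | BPF_IMM | BPF_DW spans two BPF instructions; emit nothing
	 * here and let imm_ld8_part2() handle both halves once the second
	 * instruction is processed.
	 */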
2481 	meta->double_cb = imm_ld8_part2;
2482 	return 0;
2483 }
2484 
2485 static int data_ld1(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2486 {
2487 	return construct_data_ld(nfp_prog, meta->insn.imm, 1);
2488 }
2489 
2490 static int data_ld2(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2491 {
2492 	return construct_data_ld(nfp_prog, meta->insn.imm, 2);
2493 }
2494 
2495 static int data_ld4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2496 {
2497 	return construct_data_ld(nfp_prog, meta->insn.imm, 4);
2498 }
2499 
2500 static int data_ind_ld1(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2501 {
2502 	return construct_data_ind_ld(nfp_prog, meta->insn.imm,
2503 				     meta->insn.src_reg * 2, 1);
2504 }
2505 
2506 static int data_ind_ld2(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2507 {
2508 	return construct_data_ind_ld(nfp_prog, meta->insn.imm,
2509 				     meta->insn.src_reg * 2, 2);
2510 }
2511 
2512 static int data_ind_ld4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2513 {
2514 	return construct_data_ind_ld(nfp_prog, meta->insn.imm,
2515 				     meta->insn.src_reg * 2, 4);
2516 }
2517 
2518 static int
2519 mem_ldx_stack(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2520 	      unsigned int size, unsigned int ptr_off)
2521 {
2522 	return mem_op_stack(nfp_prog, meta, size, ptr_off,
2523 			    meta->insn.dst_reg * 2, meta->insn.src_reg * 2,
2524 			    true, wrp_lmem_load);
2525 }
2526 
2527 static int mem_ldx_skb(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2528 		       u8 size)
2529 {
2530 	swreg dst = reg_both(meta->insn.dst_reg * 2);
2531 
2532 	switch (meta->insn.off) {
2533 	case offsetof(struct __sk_buff, len):
2534 		if (size != FIELD_SIZEOF(struct __sk_buff, len))
2535 			return -EOPNOTSUPP;
2536 		wrp_mov(nfp_prog, dst, plen_reg(nfp_prog));
2537 		break;
2538 	case offsetof(struct __sk_buff, data):
2539 		if (size != FIELD_SIZEOF(struct __sk_buff, data))
2540 			return -EOPNOTSUPP;
2541 		wrp_mov(nfp_prog, dst, pptr_reg(nfp_prog));
2542 		break;
2543 	case offsetof(struct __sk_buff, data_end):
2544 		if (size != FIELD_SIZEOF(struct __sk_buff, data_end))
2545 			return -EOPNOTSUPP;
2546 		emit_alu(nfp_prog, dst,
2547 			 plen_reg(nfp_prog), ALU_OP_ADD, pptr_reg(nfp_prog));
2548 		break;
2549 	default:
2550 		return -EOPNOTSUPP;
2551 	}
2552 
2553 	wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1), 0);
2554 
2555 	return 0;
2556 }
2557 
2558 static int mem_ldx_xdp(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2559 		       u8 size)
2560 {
2561 	swreg dst = reg_both(meta->insn.dst_reg * 2);
2562 
2563 	switch (meta->insn.off) {
2564 	case offsetof(struct xdp_md, data):
2565 		if (size != FIELD_SIZEOF(struct xdp_md, data))
2566 			return -EOPNOTSUPP;
2567 		wrp_mov(nfp_prog, dst, pptr_reg(nfp_prog));
2568 		break;
2569 	case offsetof(struct xdp_md, data_end):
2570 		if (size != FIELD_SIZEOF(struct xdp_md, data_end))
2571 			return -EOPNOTSUPP;
2572 		emit_alu(nfp_prog, dst,
2573 			 plen_reg(nfp_prog), ALU_OP_ADD, pptr_reg(nfp_prog));
2574 		break;
2575 	default:
2576 		return -EOPNOTSUPP;
2577 	}
2578 
2579 	wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1), 0);
2580 
2581 	return 0;
2582 }
2583 
2584 static int
2585 mem_ldx_data(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2586 	     unsigned int size)
2587 {
2588 	swreg tmp_reg;
2589 
2590 	tmp_reg = re_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog));
2591 
2592 	return data_ld_host_order_addr32(nfp_prog, meta->insn.src_reg * 2,
2593 					 tmp_reg, meta->insn.dst_reg * 2, size);
2594 }
2595 
2596 static int
2597 mem_ldx_emem(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2598 	     unsigned int size)
2599 {
2600 	swreg tmp_reg;
2601 
2602 	tmp_reg = re_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog));
2603 
2604 	return data_ld_host_order_addr40(nfp_prog, meta->insn.src_reg * 2,
2605 					 tmp_reg, meta->insn.dst_reg * 2, size);
2606 }
2607 
2608 static void
2609 mem_ldx_data_init_pktcache(struct nfp_prog *nfp_prog,
2610 			   struct nfp_insn_meta *meta)
2611 {
2612 	s16 range_start = meta->pkt_cache.range_start;
2613 	s16 range_end = meta->pkt_cache.range_end;
2614 	swreg src_base, off;
2615 	u8 xfer_num, len;
2616 	bool indir;
2617 
2618 	off = re_load_imm_any(nfp_prog, range_start, imm_b(nfp_prog));
2619 	src_base = reg_a(meta->insn.src_reg * 2);
2620 	len = range_end - range_start;
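	/* Number of REG_WIDTH-sized transfer registers needed to cover the
	 * cached range; more than 8 requires the indirect form of the read
	 * command.
	 */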
2621 	xfer_num = round_up(len, REG_WIDTH) / REG_WIDTH;
2622 
2623 	indir = len > 8 * REG_WIDTH;
	/* Set up PREV_ALU for indirect mode. */
2625 	if (indir)
2626 		wrp_immed(nfp_prog, reg_none(),
2627 			  CMD_OVE_LEN | FIELD_PREP(CMD_OV_LEN, xfer_num - 1));
2628 
2629 	/* Cache memory into transfer-in registers. */
2630 	emit_cmd_any(nfp_prog, CMD_TGT_READ32_SWAP, CMD_MODE_32b, 0, src_base,
2631 		     off, xfer_num - 1, CMD_CTX_SWAP, indir);
2632 }
2633 
2634 static int
2635 mem_ldx_data_from_pktcache_unaligned(struct nfp_prog *nfp_prog,
2636 				     struct nfp_insn_meta *meta,
2637 				     unsigned int size)
2638 {
2639 	s16 range_start = meta->pkt_cache.range_start;
2640 	s16 insn_off = meta->insn.off - range_start;
2641 	swreg dst_lo, dst_hi, src_lo, src_mid;
2642 	u8 dst_gpr = meta->insn.dst_reg * 2;
2643 	u8 len_lo = size, len_mid = 0;
2644 	u8 idx = insn_off / REG_WIDTH;
2645 	u8 off = insn_off % REG_WIDTH;
2646 
2647 	dst_hi = reg_both(dst_gpr + 1);
2648 	dst_lo = reg_both(dst_gpr);
2649 	src_lo = reg_xfer(idx);
2650 
2651 	/* The read length could involve as many as three registers. */
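	/* E.g. (assuming REG_WIDTH == 4) an 8 byte load at off == 3 takes
	 * 1 byte from reg_xfer(idx), 4 bytes from reg_xfer(idx + 1) and
	 * 3 bytes from reg_xfer(idx + 2).
	 */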
2652 	if (size > REG_WIDTH - off) {
2653 		/* Calculate the part in the second register. */
2654 		len_lo = REG_WIDTH - off;
2655 		len_mid = size - len_lo;
2656 
2657 		/* Calculate the part in the third register. */
2658 		if (size > 2 * REG_WIDTH - off)
2659 			len_mid = REG_WIDTH;
2660 	}
2661 
2662 	wrp_reg_subpart(nfp_prog, dst_lo, src_lo, len_lo, off);
2663 
2664 	if (!len_mid) {
2665 		wrp_immed(nfp_prog, dst_hi, 0);
2666 		return 0;
2667 	}
2668 
2669 	src_mid = reg_xfer(idx + 1);
2670 
2671 	if (size <= REG_WIDTH) {
2672 		wrp_reg_or_subpart(nfp_prog, dst_lo, src_mid, len_mid, len_lo);
2673 		wrp_immed(nfp_prog, dst_hi, 0);
2674 	} else {
2675 		swreg src_hi = reg_xfer(idx + 2);
2676 
2677 		wrp_reg_or_subpart(nfp_prog, dst_lo, src_mid,
2678 				   REG_WIDTH - len_lo, len_lo);
2679 		wrp_reg_subpart(nfp_prog, dst_hi, src_mid, len_lo,
2680 				REG_WIDTH - len_lo);
2681 		wrp_reg_or_subpart(nfp_prog, dst_hi, src_hi, REG_WIDTH - len_lo,
2682 				   len_lo);
2683 	}
2684 
2685 	return 0;
2686 }
2687 
2688 static int
2689 mem_ldx_data_from_pktcache_aligned(struct nfp_prog *nfp_prog,
2690 				   struct nfp_insn_meta *meta,
2691 				   unsigned int size)
2692 {
2693 	swreg dst_lo, dst_hi, src_lo;
2694 	u8 dst_gpr, idx;
2695 
2696 	idx = (meta->insn.off - meta->pkt_cache.range_start) / REG_WIDTH;
2697 	dst_gpr = meta->insn.dst_reg * 2;
2698 	dst_hi = reg_both(dst_gpr + 1);
2699 	dst_lo = reg_both(dst_gpr);
2700 	src_lo = reg_xfer(idx);
2701 
2702 	if (size < REG_WIDTH) {
2703 		wrp_reg_subpart(nfp_prog, dst_lo, src_lo, size, 0);
2704 		wrp_immed(nfp_prog, dst_hi, 0);
2705 	} else if (size == REG_WIDTH) {
2706 		wrp_mov(nfp_prog, dst_lo, src_lo);
2707 		wrp_immed(nfp_prog, dst_hi, 0);
2708 	} else {
2709 		swreg src_hi = reg_xfer(idx + 1);
2710 
2711 		wrp_mov(nfp_prog, dst_lo, src_lo);
2712 		wrp_mov(nfp_prog, dst_hi, src_hi);
2713 	}
2714 
2715 	return 0;
2716 }
2717 
2718 static int
2719 mem_ldx_data_from_pktcache(struct nfp_prog *nfp_prog,
2720 			   struct nfp_insn_meta *meta, unsigned int size)
2721 {
2722 	u8 off = meta->insn.off - meta->pkt_cache.range_start;
2723 
2724 	if (IS_ALIGNED(off, REG_WIDTH))
2725 		return mem_ldx_data_from_pktcache_aligned(nfp_prog, meta, size);
2726 
2727 	return mem_ldx_data_from_pktcache_unaligned(nfp_prog, meta, size);
2728 }
2729 
2730 static int
2731 mem_ldx(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2732 	unsigned int size)
2733 {
2734 	if (meta->ldst_gather_len)
2735 		return nfp_cpp_memcpy(nfp_prog, meta);
2736 
2737 	if (meta->ptr.type == PTR_TO_CTX) {
2738 		if (nfp_prog->type == BPF_PROG_TYPE_XDP)
2739 			return mem_ldx_xdp(nfp_prog, meta, size);
2740 		else
2741 			return mem_ldx_skb(nfp_prog, meta, size);
2742 	}
2743 
2744 	if (meta->ptr.type == PTR_TO_PACKET) {
2745 		if (meta->pkt_cache.range_end) {
2746 			if (meta->pkt_cache.do_init)
2747 				mem_ldx_data_init_pktcache(nfp_prog, meta);
2748 
2749 			return mem_ldx_data_from_pktcache(nfp_prog, meta, size);
2750 		} else {
2751 			return mem_ldx_data(nfp_prog, meta, size);
2752 		}
2753 	}
2754 
2755 	if (meta->ptr.type == PTR_TO_STACK)
2756 		return mem_ldx_stack(nfp_prog, meta, size,
2757 				     meta->ptr.off + meta->ptr.var_off.value);
2758 
2759 	if (meta->ptr.type == PTR_TO_MAP_VALUE)
2760 		return mem_ldx_emem(nfp_prog, meta, size);
2761 
2762 	return -EOPNOTSUPP;
2763 }
2764 
2765 static int mem_ldx1(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2766 {
2767 	return mem_ldx(nfp_prog, meta, 1);
2768 }
2769 
2770 static int mem_ldx2(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2771 {
2772 	return mem_ldx(nfp_prog, meta, 2);
2773 }
2774 
2775 static int mem_ldx4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2776 {
2777 	return mem_ldx(nfp_prog, meta, 4);
2778 }
2779 
2780 static int mem_ldx8(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2781 {
2782 	return mem_ldx(nfp_prog, meta, 8);
2783 }
2784 
2785 static int
2786 mem_st_data(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2787 	    unsigned int size)
2788 {
2789 	u64 imm = meta->insn.imm; /* sign extend */
2790 	swreg off_reg;
2791 
2792 	off_reg = re_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog));
2793 
2794 	return data_st_host_order(nfp_prog, meta->insn.dst_reg * 2, off_reg,
2795 				  imm, size);
2796 }
2797 
2798 static int mem_st(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2799 		  unsigned int size)
2800 {
2801 	if (meta->ptr.type == PTR_TO_PACKET)
2802 		return mem_st_data(nfp_prog, meta, size);
2803 
2804 	return -EOPNOTSUPP;
2805 }
2806 
2807 static int mem_st1(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2808 {
2809 	return mem_st(nfp_prog, meta, 1);
2810 }
2811 
2812 static int mem_st2(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2813 {
2814 	return mem_st(nfp_prog, meta, 2);
2815 }
2816 
2817 static int mem_st4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2818 {
2819 	return mem_st(nfp_prog, meta, 4);
2820 }
2821 
2822 static int mem_st8(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2823 {
2824 	return mem_st(nfp_prog, meta, 8);
2825 }
2826 
2827 static int
2828 mem_stx_data(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2829 	     unsigned int size)
2830 {
2831 	swreg off_reg;
2832 
2833 	off_reg = re_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog));
2834 
2835 	return data_stx_host_order(nfp_prog, meta->insn.dst_reg * 2, off_reg,
2836 				   meta->insn.src_reg * 2, size);
2837 }
2838 
2839 static int
2840 mem_stx_stack(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2841 	      unsigned int size, unsigned int ptr_off)
2842 {
2843 	return mem_op_stack(nfp_prog, meta, size, ptr_off,
2844 			    meta->insn.src_reg * 2, meta->insn.dst_reg * 2,
2845 			    false, wrp_lmem_store);
2846 }
2847 
2848 static int mem_stx_xdp(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2849 {
2850 	switch (meta->insn.off) {
2851 	case offsetof(struct xdp_md, rx_queue_index):
2852 		return nfp_queue_select(nfp_prog, meta);
2853 	}
2854 
2855 	WARN_ON_ONCE(1); /* verifier should have rejected bad accesses */
2856 	return -EOPNOTSUPP;
2857 }
2858 
2859 static int
2860 mem_stx(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2861 	unsigned int size)
2862 {
2863 	if (meta->ptr.type == PTR_TO_PACKET)
2864 		return mem_stx_data(nfp_prog, meta, size);
2865 
2866 	if (meta->ptr.type == PTR_TO_STACK)
2867 		return mem_stx_stack(nfp_prog, meta, size,
2868 				     meta->ptr.off + meta->ptr.var_off.value);
2869 
2870 	return -EOPNOTSUPP;
2871 }
2872 
2873 static int mem_stx1(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2874 {
2875 	return mem_stx(nfp_prog, meta, 1);
2876 }
2877 
2878 static int mem_stx2(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2879 {
2880 	return mem_stx(nfp_prog, meta, 2);
2881 }
2882 
2883 static int mem_stx4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2884 {
2885 	if (meta->ptr.type == PTR_TO_CTX)
2886 		if (nfp_prog->type == BPF_PROG_TYPE_XDP)
2887 			return mem_stx_xdp(nfp_prog, meta);
2888 	return mem_stx(nfp_prog, meta, 4);
2889 }
2890 
2891 static int mem_stx8(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2892 {
2893 	return mem_stx(nfp_prog, meta, 8);
2894 }
2895 
2896 static int
2897 mem_xadd(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, bool is64)
2898 {
2899 	u8 dst_gpr = meta->insn.dst_reg * 2;
2900 	u8 src_gpr = meta->insn.src_reg * 2;
2901 	unsigned int full_add, out;
2902 	swreg addra, addrb, off;
2903 
2904 	off = ur_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog));
2905 
	/* We can fit 16 bits into the command immediate.  If we know the value
	 * is guaranteed to either always or never fit into 16 bits we only
	 * generate code to handle that particular case, otherwise we generate
	 * code for both.
	 */
2911 	out = nfp_prog_current_offset(nfp_prog);
2912 	full_add = nfp_prog_current_offset(nfp_prog);
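	/* full_add and out are the label offsets of the full-add path and of
	 * the join point after it; the increments below account for the
	 * instructions each optional block emits, and the
	 * nfp_prog_confirm_current_offset() checks verify them.
	 */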
2913 
2914 	if (meta->insn.off) {
2915 		out += 2;
2916 		full_add += 2;
2917 	}
2918 	if (meta->xadd_maybe_16bit) {
2919 		out += 3;
2920 		full_add += 3;
2921 	}
2922 	if (meta->xadd_over_16bit)
2923 		out += 2 + is64;
2924 	if (meta->xadd_maybe_16bit && meta->xadd_over_16bit) {
2925 		out += 5;
2926 		full_add += 5;
2927 	}
2928 
2929 	/* Generate the branch for choosing add_imm vs add */
2930 	if (meta->xadd_maybe_16bit && meta->xadd_over_16bit) {
2931 		swreg max_imm = imm_a(nfp_prog);
2932 
2933 		wrp_immed(nfp_prog, max_imm, 0xffff);
2934 		emit_alu(nfp_prog, reg_none(),
2935 			 max_imm, ALU_OP_SUB, reg_b(src_gpr));
2936 		emit_alu(nfp_prog, reg_none(),
2937 			 reg_imm(0), ALU_OP_SUB_C, reg_b(src_gpr + 1));
2938 		emit_br(nfp_prog, BR_BLO, full_add, meta->insn.off ? 2 : 0);
2939 		/* defer for add */
2940 	}
2941 
	/* If the insn has an offset, add it to the address */
2943 	if (!meta->insn.off) {
2944 		addra = reg_a(dst_gpr);
2945 		addrb = reg_b(dst_gpr + 1);
2946 	} else {
2947 		emit_alu(nfp_prog, imma_a(nfp_prog),
2948 			 reg_a(dst_gpr), ALU_OP_ADD, off);
2949 		emit_alu(nfp_prog, imma_b(nfp_prog),
2950 			 reg_a(dst_gpr + 1), ALU_OP_ADD_C, reg_imm(0));
2951 		addra = imma_a(nfp_prog);
2952 		addrb = imma_b(nfp_prog);
2953 	}
2954 
2955 	/* Generate the add_imm if 16 bits are possible */
2956 	if (meta->xadd_maybe_16bit) {
2957 		swreg prev_alu = imm_a(nfp_prog);
2958 
2959 		wrp_immed(nfp_prog, prev_alu,
2960 			  FIELD_PREP(CMD_OVE_DATA, 2) |
2961 			  CMD_OVE_LEN |
2962 			  FIELD_PREP(CMD_OV_LEN, 0x8 | is64 << 2));
2963 		wrp_reg_or_subpart(nfp_prog, prev_alu, reg_b(src_gpr), 2, 2);
2964 		emit_cmd_indir(nfp_prog, CMD_TGT_ADD_IMM, CMD_MODE_40b_BA, 0,
2965 			       addra, addrb, 0, CMD_CTX_NO_SWAP);
2966 
2967 		if (meta->xadd_over_16bit)
2968 			emit_br(nfp_prog, BR_UNC, out, 0);
2969 	}
2970 
2971 	if (!nfp_prog_confirm_current_offset(nfp_prog, full_add))
2972 		return -EINVAL;
2973 
2974 	/* Generate the add if 16 bits are not guaranteed */
2975 	if (meta->xadd_over_16bit) {
2976 		emit_cmd(nfp_prog, CMD_TGT_ADD, CMD_MODE_40b_BA, 0,
2977 			 addra, addrb, is64 << 2,
2978 			 is64 ? CMD_CTX_SWAP_DEFER2 : CMD_CTX_SWAP_DEFER1);
2979 
2980 		wrp_mov(nfp_prog, reg_xfer(0), reg_a(src_gpr));
2981 		if (is64)
2982 			wrp_mov(nfp_prog, reg_xfer(1), reg_a(src_gpr + 1));
2983 	}
2984 
2985 	if (!nfp_prog_confirm_current_offset(nfp_prog, out))
2986 		return -EINVAL;
2987 
2988 	return 0;
2989 }
2990 
2991 static int mem_xadd4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2992 {
2993 	return mem_xadd(nfp_prog, meta, false);
2994 }
2995 
2996 static int mem_xadd8(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2997 {
2998 	return mem_xadd(nfp_prog, meta, true);
2999 }
3000 
3001 static int jump(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3002 {
3003 	emit_br(nfp_prog, BR_UNC, meta->insn.off, 0);
3004 
3005 	return 0;
3006 }
3007 
3008 static int jeq_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3009 {
3010 	const struct bpf_insn *insn = &meta->insn;
3011 	u64 imm = insn->imm; /* sign extend */
3012 	swreg or1, or2, tmp_reg;
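
	/* XOR each 32-bit half with the corresponding half of the immediate
	 * (skipping halves that are zero, since x ^ 0 == x), OR the results
	 * and branch if the outcome is zero, i.e. dst == imm.
	 */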
3013 
3014 	or1 = reg_a(insn->dst_reg * 2);
3015 	or2 = reg_b(insn->dst_reg * 2 + 1);
3016 
3017 	if (imm & ~0U) {
3018 		tmp_reg = ur_load_imm_any(nfp_prog, imm & ~0U, imm_b(nfp_prog));
3019 		emit_alu(nfp_prog, imm_a(nfp_prog),
3020 			 reg_a(insn->dst_reg * 2), ALU_OP_XOR, tmp_reg);
3021 		or1 = imm_a(nfp_prog);
3022 	}
3023 
3024 	if (imm >> 32) {
3025 		tmp_reg = ur_load_imm_any(nfp_prog, imm >> 32, imm_b(nfp_prog));
3026 		emit_alu(nfp_prog, imm_b(nfp_prog),
3027 			 reg_a(insn->dst_reg * 2 + 1), ALU_OP_XOR, tmp_reg);
3028 		or2 = imm_b(nfp_prog);
3029 	}
3030 
3031 	emit_alu(nfp_prog, reg_none(), or1, ALU_OP_OR, or2);
3032 	emit_br(nfp_prog, BR_BEQ, insn->off, 0);
3033 
3034 	return 0;
3035 }
3036 
3037 static int jset_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3038 {
3039 	const struct bpf_insn *insn = &meta->insn;
3040 	u64 imm = insn->imm; /* sign extend */
3041 	swreg tmp_reg;
3042 
3043 	if (!imm) {
3044 		meta->skip = true;
3045 		return 0;
3046 	}
3047 
3048 	if (imm & ~0U) {
3049 		tmp_reg = ur_load_imm_any(nfp_prog, imm & ~0U, imm_b(nfp_prog));
3050 		emit_alu(nfp_prog, reg_none(),
3051 			 reg_a(insn->dst_reg * 2), ALU_OP_AND, tmp_reg);
3052 		emit_br(nfp_prog, BR_BNE, insn->off, 0);
3053 	}
3054 
3055 	if (imm >> 32) {
3056 		tmp_reg = ur_load_imm_any(nfp_prog, imm >> 32, imm_b(nfp_prog));
3057 		emit_alu(nfp_prog, reg_none(),
3058 			 reg_a(insn->dst_reg * 2 + 1), ALU_OP_AND, tmp_reg);
3059 		emit_br(nfp_prog, BR_BNE, insn->off, 0);
3060 	}
3061 
3062 	return 0;
3063 }
3064 
3065 static int jne_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3066 {
3067 	const struct bpf_insn *insn = &meta->insn;
3068 	u64 imm = insn->imm; /* sign extend */
3069 	swreg tmp_reg;
3070 
3071 	if (!imm) {
3072 		emit_alu(nfp_prog, reg_none(), reg_a(insn->dst_reg * 2),
3073 			 ALU_OP_OR, reg_b(insn->dst_reg * 2 + 1));
3074 		emit_br(nfp_prog, BR_BNE, insn->off, 0);
3075 		return 0;
3076 	}
3077 
3078 	tmp_reg = ur_load_imm_any(nfp_prog, imm & ~0U, imm_b(nfp_prog));
3079 	emit_alu(nfp_prog, reg_none(),
3080 		 reg_a(insn->dst_reg * 2), ALU_OP_XOR, tmp_reg);
3081 	emit_br(nfp_prog, BR_BNE, insn->off, 0);
3082 
3083 	tmp_reg = ur_load_imm_any(nfp_prog, imm >> 32, imm_b(nfp_prog));
3084 	emit_alu(nfp_prog, reg_none(),
3085 		 reg_a(insn->dst_reg * 2 + 1), ALU_OP_XOR, tmp_reg);
3086 	emit_br(nfp_prog, BR_BNE, insn->off, 0);
3087 
3088 	return 0;
3089 }
3090 
3091 static int jeq_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3092 {
3093 	const struct bpf_insn *insn = &meta->insn;
3094 
3095 	emit_alu(nfp_prog, imm_a(nfp_prog), reg_a(insn->dst_reg * 2),
3096 		 ALU_OP_XOR, reg_b(insn->src_reg * 2));
3097 	emit_alu(nfp_prog, imm_b(nfp_prog), reg_a(insn->dst_reg * 2 + 1),
3098 		 ALU_OP_XOR, reg_b(insn->src_reg * 2 + 1));
3099 	emit_alu(nfp_prog, reg_none(),
3100 		 imm_a(nfp_prog), ALU_OP_OR, imm_b(nfp_prog));
3101 	emit_br(nfp_prog, BR_BEQ, insn->off, 0);
3102 
3103 	return 0;
3104 }
3105 
3106 static int jset_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3107 {
3108 	return wrp_test_reg(nfp_prog, meta, ALU_OP_AND, BR_BNE);
3109 }
3110 
3111 static int jne_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3112 {
3113 	return wrp_test_reg(nfp_prog, meta, ALU_OP_XOR, BR_BNE);
3114 }
3115 
3116 static int
3117 bpf_to_bpf_call(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3118 {
3119 	u32 ret_tgt, stack_depth, offset_br;
3120 	swreg tmp_reg;
3121 
3122 	stack_depth = round_up(nfp_prog->stack_frame_depth, STACK_FRAME_ALIGN);
3123 	/* Space for saving the return address is accounted for by the callee,
3124 	 * so stack_depth can be zero for the main function.
3125 	 */
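	/* Before the call, move the LM0 stack pointer past the caller's frame
	 * so the callee gets its own frame; the mirror sequence after the call
	 * moves it back.
	 */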
3126 	if (stack_depth) {
3127 		tmp_reg = ur_load_imm_any(nfp_prog, stack_depth,
3128 					  stack_imm(nfp_prog));
3129 		emit_alu(nfp_prog, stack_reg(nfp_prog),
3130 			 stack_reg(nfp_prog), ALU_OP_ADD, tmp_reg);
3131 		emit_csr_wr(nfp_prog, stack_reg(nfp_prog),
3132 			    NFP_CSR_ACT_LM_ADDR0);
3133 	}
3134 
3135 	/* Two cases for jumping to the callee:
3136 	 *
3137 	 * - If callee uses and needs to save R6~R9 then:
3138 	 *     1. Put the start offset of the callee into imm_b(). This will
3139 	 *        require a fixup step, as we do not necessarily know this
3140 	 *        address yet.
3141 	 *     2. Put the return address from the callee to the caller into
3142 	 *        register ret_reg().
3143 	 *     3. (After defer slots are consumed) Jump to the subroutine that
3144 	 *        pushes the registers to the stack.
3145 	 *   The subroutine acts as a trampoline, and returns to the address in
3146 	 *   imm_b(), i.e. jumps to the callee.
3147 	 *
	 * - If the callee does not need to save R6~R9, just load the return
	 *   address to the caller into ret_reg() and jump to the callee
	 *   directly.
3151 	 *
3152 	 * Using ret_reg() to pass the return address to the callee is set here
3153 	 * as a convention. The callee can then push this address onto its
3154 	 * stack frame in its prologue. The advantages of passing the return
3155 	 * address through ret_reg(), instead of pushing it to the stack right
3156 	 * here, are the following:
3157 	 * - It looks cleaner.
	 * - If the called function is called multiple times, we get a smaller
	 *   program size.
	 * - We save the two no-op instructions that would otherwise have to be
	 *   added just before the emit_br() when the stack depth is not zero.
	 * - If we ever find a register to hold the return address during the
	 *   whole execution of the callee, we will not have to push the return
	 *   address to the stack for leaf functions.
3165 	 */
3166 	if (!meta->jmp_dst) {
3167 		pr_err("BUG: BPF-to-BPF call has no destination recorded\n");
3168 		return -ELOOP;
3169 	}
3170 	if (nfp_prog->subprog[meta->jmp_dst->subprog_idx].needs_reg_push) {
3171 		ret_tgt = nfp_prog_current_offset(nfp_prog) + 3;
3172 		emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 2,
3173 			     RELO_BR_GO_CALL_PUSH_REGS);
3174 		offset_br = nfp_prog_current_offset(nfp_prog);
3175 		wrp_immed_relo(nfp_prog, imm_b(nfp_prog), 0, RELO_IMMED_REL);
3176 	} else {
3177 		ret_tgt = nfp_prog_current_offset(nfp_prog) + 2;
3178 		emit_br(nfp_prog, BR_UNC, meta->n + 1 + meta->insn.imm, 1);
3179 		offset_br = nfp_prog_current_offset(nfp_prog);
3180 	}
3181 	wrp_immed_relo(nfp_prog, ret_reg(nfp_prog), ret_tgt, RELO_IMMED_REL);
3182 
3183 	if (!nfp_prog_confirm_current_offset(nfp_prog, ret_tgt))
3184 		return -EINVAL;
3185 
3186 	if (stack_depth) {
3187 		tmp_reg = ur_load_imm_any(nfp_prog, stack_depth,
3188 					  stack_imm(nfp_prog));
3189 		emit_alu(nfp_prog, stack_reg(nfp_prog),
3190 			 stack_reg(nfp_prog), ALU_OP_SUB, tmp_reg);
3191 		emit_csr_wr(nfp_prog, stack_reg(nfp_prog),
3192 			    NFP_CSR_ACT_LM_ADDR0);
3193 		wrp_nops(nfp_prog, 3);
3194 	}
3195 
3196 	meta->num_insns_after_br = nfp_prog_current_offset(nfp_prog);
3197 	meta->num_insns_after_br -= offset_br;
3198 
3199 	return 0;
3200 }
3201 
3202 static int helper_call(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3203 {
3204 	switch (meta->insn.imm) {
3205 	case BPF_FUNC_xdp_adjust_head:
3206 		return adjust_head(nfp_prog, meta);
3207 	case BPF_FUNC_xdp_adjust_tail:
3208 		return adjust_tail(nfp_prog, meta);
3209 	case BPF_FUNC_map_lookup_elem:
3210 	case BPF_FUNC_map_update_elem:
3211 	case BPF_FUNC_map_delete_elem:
3212 		return map_call_stack_common(nfp_prog, meta);
3213 	case BPF_FUNC_get_prandom_u32:
3214 		return nfp_get_prandom_u32(nfp_prog, meta);
3215 	case BPF_FUNC_perf_event_output:
3216 		return nfp_perf_event_output(nfp_prog, meta);
3217 	default:
3218 		WARN_ONCE(1, "verifier allowed unsupported function\n");
3219 		return -EOPNOTSUPP;
3220 	}
3221 }
3222 
3223 static int call(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3224 {
3225 	if (is_mbpf_pseudo_call(meta))
3226 		return bpf_to_bpf_call(nfp_prog, meta);
3227 	else
3228 		return helper_call(nfp_prog, meta);
3229 }
3230 
3231 static bool nfp_is_main_function(struct nfp_insn_meta *meta)
3232 {
3233 	return meta->subprog_idx == 0;
3234 }
3235 
3236 static int goto_out(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3237 {
3238 	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 0, RELO_BR_GO_OUT);
3239 
3240 	return 0;
3241 }
3242 
3243 static int
3244 nfp_subprog_epilogue(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3245 {
3246 	if (nfp_prog->subprog[meta->subprog_idx].needs_reg_push) {
		/* Pop R6~R9 from the stack via the related subroutine.
		 * We load the return address to the caller into ret_reg(),
		 * which means the subroutine does not come back here; we
		 * make it jump back to the subprogram caller directly!
3251 		 */
3252 		emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 1,
3253 			     RELO_BR_GO_CALL_POP_REGS);
3254 		/* Pop return address from the stack. */
3255 		wrp_mov(nfp_prog, ret_reg(nfp_prog), reg_lm(0, 0));
3256 	} else {
3257 		/* Pop return address from the stack. */
3258 		wrp_mov(nfp_prog, ret_reg(nfp_prog), reg_lm(0, 0));
3259 		/* Jump back to caller if no callee-saved registers were used
3260 		 * by the subprogram.
3261 		 */
3262 		emit_rtn(nfp_prog, ret_reg(nfp_prog), 0);
3263 	}
3264 
3265 	return 0;
3266 }
3267 
3268 static int jmp_exit(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3269 {
3270 	if (nfp_is_main_function(meta))
3271 		return goto_out(nfp_prog, meta);
3272 	else
3273 		return nfp_subprog_epilogue(nfp_prog, meta);
3274 }
3275 
3276 static const instr_cb_t instr_cb[256] = {
3277 	[BPF_ALU64 | BPF_MOV | BPF_X] =	mov_reg64,
3278 	[BPF_ALU64 | BPF_MOV | BPF_K] =	mov_imm64,
3279 	[BPF_ALU64 | BPF_XOR | BPF_X] =	xor_reg64,
3280 	[BPF_ALU64 | BPF_XOR | BPF_K] =	xor_imm64,
3281 	[BPF_ALU64 | BPF_AND | BPF_X] =	and_reg64,
3282 	[BPF_ALU64 | BPF_AND | BPF_K] =	and_imm64,
3283 	[BPF_ALU64 | BPF_OR | BPF_X] =	or_reg64,
3284 	[BPF_ALU64 | BPF_OR | BPF_K] =	or_imm64,
3285 	[BPF_ALU64 | BPF_ADD | BPF_X] =	add_reg64,
3286 	[BPF_ALU64 | BPF_ADD | BPF_K] =	add_imm64,
3287 	[BPF_ALU64 | BPF_SUB | BPF_X] =	sub_reg64,
3288 	[BPF_ALU64 | BPF_SUB | BPF_K] =	sub_imm64,
3289 	[BPF_ALU64 | BPF_MUL | BPF_X] =	mul_reg64,
3290 	[BPF_ALU64 | BPF_MUL | BPF_K] =	mul_imm64,
3291 	[BPF_ALU64 | BPF_DIV | BPF_X] =	div_reg64,
3292 	[BPF_ALU64 | BPF_DIV | BPF_K] =	div_imm64,
3293 	[BPF_ALU64 | BPF_NEG] =		neg_reg64,
3294 	[BPF_ALU64 | BPF_LSH | BPF_X] =	shl_reg64,
3295 	[BPF_ALU64 | BPF_LSH | BPF_K] =	shl_imm64,
3296 	[BPF_ALU64 | BPF_RSH | BPF_X] =	shr_reg64,
3297 	[BPF_ALU64 | BPF_RSH | BPF_K] =	shr_imm64,
3298 	[BPF_ALU64 | BPF_ARSH | BPF_X] = ashr_reg64,
3299 	[BPF_ALU64 | BPF_ARSH | BPF_K] = ashr_imm64,
3300 	[BPF_ALU | BPF_MOV | BPF_X] =	mov_reg,
3301 	[BPF_ALU | BPF_MOV | BPF_K] =	mov_imm,
3302 	[BPF_ALU | BPF_XOR | BPF_X] =	xor_reg,
3303 	[BPF_ALU | BPF_XOR | BPF_K] =	xor_imm,
3304 	[BPF_ALU | BPF_AND | BPF_X] =	and_reg,
3305 	[BPF_ALU | BPF_AND | BPF_K] =	and_imm,
3306 	[BPF_ALU | BPF_OR | BPF_X] =	or_reg,
3307 	[BPF_ALU | BPF_OR | BPF_K] =	or_imm,
3308 	[BPF_ALU | BPF_ADD | BPF_X] =	add_reg,
3309 	[BPF_ALU | BPF_ADD | BPF_K] =	add_imm,
3310 	[BPF_ALU | BPF_SUB | BPF_X] =	sub_reg,
3311 	[BPF_ALU | BPF_SUB | BPF_K] =	sub_imm,
3312 	[BPF_ALU | BPF_MUL | BPF_X] =	mul_reg,
3313 	[BPF_ALU | BPF_MUL | BPF_K] =	mul_imm,
3314 	[BPF_ALU | BPF_DIV | BPF_X] =	div_reg,
3315 	[BPF_ALU | BPF_DIV | BPF_K] =	div_imm,
3316 	[BPF_ALU | BPF_NEG] =		neg_reg,
3317 	[BPF_ALU | BPF_LSH | BPF_K] =	shl_imm,
3318 	[BPF_ALU | BPF_END | BPF_X] =	end_reg32,
3319 	[BPF_LD | BPF_IMM | BPF_DW] =	imm_ld8,
3320 	[BPF_LD | BPF_ABS | BPF_B] =	data_ld1,
3321 	[BPF_LD | BPF_ABS | BPF_H] =	data_ld2,
3322 	[BPF_LD | BPF_ABS | BPF_W] =	data_ld4,
3323 	[BPF_LD | BPF_IND | BPF_B] =	data_ind_ld1,
3324 	[BPF_LD | BPF_IND | BPF_H] =	data_ind_ld2,
3325 	[BPF_LD | BPF_IND | BPF_W] =	data_ind_ld4,
3326 	[BPF_LDX | BPF_MEM | BPF_B] =	mem_ldx1,
3327 	[BPF_LDX | BPF_MEM | BPF_H] =	mem_ldx2,
3328 	[BPF_LDX | BPF_MEM | BPF_W] =	mem_ldx4,
3329 	[BPF_LDX | BPF_MEM | BPF_DW] =	mem_ldx8,
3330 	[BPF_STX | BPF_MEM | BPF_B] =	mem_stx1,
3331 	[BPF_STX | BPF_MEM | BPF_H] =	mem_stx2,
3332 	[BPF_STX | BPF_MEM | BPF_W] =	mem_stx4,
3333 	[BPF_STX | BPF_MEM | BPF_DW] =	mem_stx8,
3334 	[BPF_STX | BPF_XADD | BPF_W] =	mem_xadd4,
3335 	[BPF_STX | BPF_XADD | BPF_DW] =	mem_xadd8,
3336 	[BPF_ST | BPF_MEM | BPF_B] =	mem_st1,
3337 	[BPF_ST | BPF_MEM | BPF_H] =	mem_st2,
3338 	[BPF_ST | BPF_MEM | BPF_W] =	mem_st4,
3339 	[BPF_ST | BPF_MEM | BPF_DW] =	mem_st8,
3340 	[BPF_JMP | BPF_JA | BPF_K] =	jump,
3341 	[BPF_JMP | BPF_JEQ | BPF_K] =	jeq_imm,
3342 	[BPF_JMP | BPF_JGT | BPF_K] =	cmp_imm,
3343 	[BPF_JMP | BPF_JGE | BPF_K] =	cmp_imm,
3344 	[BPF_JMP | BPF_JLT | BPF_K] =	cmp_imm,
3345 	[BPF_JMP | BPF_JLE | BPF_K] =	cmp_imm,
3346 	[BPF_JMP | BPF_JSGT | BPF_K] =  cmp_imm,
3347 	[BPF_JMP | BPF_JSGE | BPF_K] =  cmp_imm,
3348 	[BPF_JMP | BPF_JSLT | BPF_K] =  cmp_imm,
3349 	[BPF_JMP | BPF_JSLE | BPF_K] =  cmp_imm,
3350 	[BPF_JMP | BPF_JSET | BPF_K] =	jset_imm,
3351 	[BPF_JMP | BPF_JNE | BPF_K] =	jne_imm,
3352 	[BPF_JMP | BPF_JEQ | BPF_X] =	jeq_reg,
3353 	[BPF_JMP | BPF_JGT | BPF_X] =	cmp_reg,
3354 	[BPF_JMP | BPF_JGE | BPF_X] =	cmp_reg,
3355 	[BPF_JMP | BPF_JLT | BPF_X] =	cmp_reg,
3356 	[BPF_JMP | BPF_JLE | BPF_X] =	cmp_reg,
3357 	[BPF_JMP | BPF_JSGT | BPF_X] =  cmp_reg,
3358 	[BPF_JMP | BPF_JSGE | BPF_X] =  cmp_reg,
3359 	[BPF_JMP | BPF_JSLT | BPF_X] =  cmp_reg,
3360 	[BPF_JMP | BPF_JSLE | BPF_X] =  cmp_reg,
3361 	[BPF_JMP | BPF_JSET | BPF_X] =	jset_reg,
3362 	[BPF_JMP | BPF_JNE | BPF_X] =	jne_reg,
3363 	[BPF_JMP | BPF_CALL] =		call,
3364 	[BPF_JMP | BPF_EXIT] =		jmp_exit,
3365 };
3366 
3367 /* --- Assembler logic --- */
3368 static int
3369 nfp_fixup_immed_relo(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
3370 		     struct nfp_insn_meta *jmp_dst, u32 br_idx)
3371 {
3372 	if (immed_get_value(nfp_prog->prog[br_idx + 1])) {
3373 		pr_err("BUG: failed to fix up callee register saving\n");
3374 		return -EINVAL;
3375 	}
3376 
3377 	immed_set_value(&nfp_prog->prog[br_idx + 1], jmp_dst->off);
3378 
3379 	return 0;
3380 }
3381 
3382 static int nfp_fixup_branches(struct nfp_prog *nfp_prog)
3383 {
3384 	struct nfp_insn_meta *meta, *jmp_dst;
3385 	u32 idx, br_idx;
3386 	int err;
3387 
3388 	list_for_each_entry(meta, &nfp_prog->insns, l) {
3389 		if (meta->skip)
3390 			continue;
3391 		if (BPF_CLASS(meta->insn.code) != BPF_JMP)
3392 			continue;
3393 		if (meta->insn.code == (BPF_JMP | BPF_EXIT) &&
3394 		    !nfp_is_main_function(meta))
3395 			continue;
3396 		if (is_mbpf_helper_call(meta))
3397 			continue;
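
		/* br_idx is the offset of the last NFP instruction generated
		 * for this BPF insn, which is where its branch is expected
		 * to live.
		 */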
3398 
3399 		if (list_is_last(&meta->l, &nfp_prog->insns))
3400 			br_idx = nfp_prog->last_bpf_off;
3401 		else
3402 			br_idx = list_next_entry(meta, l)->off - 1;
3403 
		/* For a BPF-to-BPF function call, a stack adjustment sequence
		 * is generated after the return instruction.  Therefore, we
		 * must subtract the length of this sequence so that br_idx
		 * points to where the "branch" NFP instruction is expected
		 * to be.
		 */
3409 		if (is_mbpf_pseudo_call(meta))
3410 			br_idx -= meta->num_insns_after_br;
3411 
3412 		if (!nfp_is_br(nfp_prog->prog[br_idx])) {
3413 			pr_err("Fixup found block not ending in branch %d %02x %016llx!!\n",
3414 			       br_idx, meta->insn.code, nfp_prog->prog[br_idx]);
3415 			return -ELOOP;
3416 		}
3417 
3418 		if (meta->insn.code == (BPF_JMP | BPF_EXIT))
3419 			continue;
3420 
3421 		/* Leave special branches for later */
3422 		if (FIELD_GET(OP_RELO_TYPE, nfp_prog->prog[br_idx]) !=
3423 		    RELO_BR_REL && !is_mbpf_pseudo_call(meta))
3424 			continue;
3425 
3426 		if (!meta->jmp_dst) {
3427 			pr_err("Non-exit jump doesn't have destination info recorded!!\n");
3428 			return -ELOOP;
3429 		}
3430 
3431 		jmp_dst = meta->jmp_dst;
3432 
3433 		if (jmp_dst->skip) {
3434 			pr_err("Branch landing on removed instruction!!\n");
3435 			return -ELOOP;
3436 		}
3437 
3438 		if (is_mbpf_pseudo_call(meta) &&
3439 		    nfp_prog->subprog[jmp_dst->subprog_idx].needs_reg_push) {
3440 			err = nfp_fixup_immed_relo(nfp_prog, meta,
3441 						   jmp_dst, br_idx);
3442 			if (err)
3443 				return err;
3444 		}
3445 
3446 		if (FIELD_GET(OP_RELO_TYPE, nfp_prog->prog[br_idx]) !=
3447 		    RELO_BR_REL)
3448 			continue;
3449 
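		/* Point every NFP branch emitted for this BPF jump at the
		 * destination's NFP offset.
		 */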
3450 		for (idx = meta->off; idx <= br_idx; idx++) {
3451 			if (!nfp_is_br(nfp_prog->prog[idx]))
3452 				continue;
3453 			br_set_offset(&nfp_prog->prog[idx], jmp_dst->off);
3454 		}
3455 	}
3456 
3457 	return 0;
3458 }
3459 
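/* Program prologue: mask the packet-vector length word down to its low
 * 14 bits to leave the packet length in plen_reg for later use.
 */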
3460 static void nfp_intro(struct nfp_prog *nfp_prog)
3461 {
3462 	wrp_immed(nfp_prog, plen_reg(nfp_prog), GENMASK(13, 0));
3463 	emit_alu(nfp_prog, plen_reg(nfp_prog),
3464 		 plen_reg(nfp_prog), ALU_OP_AND, pv_len(nfp_prog));
3465 }
3466 
3467 static void
3468 nfp_subprog_prologue(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3469 {
3470 	/* Save the return address onto the stack. */
3471 	wrp_mov(nfp_prog, reg_lm(0, 0), ret_reg(nfp_prog));
3472 }
3473 
3474 static void
3475 nfp_start_subprog(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3476 {
3477 	unsigned int depth = nfp_prog->subprog[meta->subprog_idx].stack_depth;
3478 
3479 	nfp_prog->stack_frame_depth = round_up(depth, 4);
3480 	nfp_subprog_prologue(nfp_prog, meta);
3481 }
3482 
3483 bool nfp_is_subprog_start(struct nfp_insn_meta *meta)
3484 {
3485 	return meta->flags & FLAG_INSN_IS_SUBPROG_START;
3486 }
3487 
3488 static void nfp_outro_tc_da(struct nfp_prog *nfp_prog)
3489 {
3490 	/* TC direct-action mode:
3491 	 *   0,1   ok        NOT SUPPORTED[1]
3492 	 *   2   drop  0x22 -> drop,  count as stat1
3493 	 *   4,5 nuke  0x02 -> drop
3494 	 *   7  redir  0x44 -> redir, count as stat2
3495 	 *   * unspec  0x11 -> pass,  count as stat0
3496 	 *
3497 	 * [1] We can't support OK and RECLASSIFY because we can't tell TC
3498 	 *     the exact decision made.  We are forced to support UNSPEC
3499 	 *     to handle aborts so that's the only one we handle for passing
3500 	 *     packets up the stack.
3501 	 */
3502 	/* Target for aborts */
3503 	nfp_prog->tgt_abort = nfp_prog_current_offset(nfp_prog);
3504 
3505 	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 2, RELO_BR_NEXT_PKT);
3506 
3507 	wrp_mov(nfp_prog, reg_a(0), NFP_BPF_ABI_FLAGS);
3508 	emit_ld_field(nfp_prog, reg_a(0), 0xc, reg_imm(0x11), SHF_SC_L_SHF, 16);
3509 
3510 	/* Target for normal exits */
3511 	nfp_prog->tgt_out = nfp_prog_current_offset(nfp_prog);
3512 
3513 	/* if R0 > 7 jump to abort */
3514 	emit_alu(nfp_prog, reg_none(), reg_imm(7), ALU_OP_SUB, reg_b(0));
3515 	emit_br(nfp_prog, BR_BLO, nfp_prog->tgt_abort, 0);
3516 	wrp_mov(nfp_prog, reg_a(0), NFP_BPF_ABI_FLAGS);
3517 
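	/* reg_b(2) and reg_b(3) are nibble-wide lookup tables indexed by the
	 * TC return code in R0: each code selects one nibble from each table
	 * (right shift by R0 * 4, mask with 0xf), and the two nibbles are
	 * then combined into the result-flags byte from the mapping above.
	 */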
3518 	wrp_immed(nfp_prog, reg_b(2), 0x41221211);
3519 	wrp_immed(nfp_prog, reg_b(3), 0x41001211);
3520 
3521 	emit_shf(nfp_prog, reg_a(1),
3522 		 reg_none(), SHF_OP_NONE, reg_b(0), SHF_SC_L_SHF, 2);
3523 
3524 	emit_alu(nfp_prog, reg_none(), reg_a(1), ALU_OP_OR, reg_imm(0));
3525 	emit_shf(nfp_prog, reg_a(2),
3526 		 reg_imm(0xf), SHF_OP_AND, reg_b(2), SHF_SC_R_SHF, 0);
3527 
3528 	emit_alu(nfp_prog, reg_none(), reg_a(1), ALU_OP_OR, reg_imm(0));
3529 	emit_shf(nfp_prog, reg_b(2),
3530 		 reg_imm(0xf), SHF_OP_AND, reg_b(3), SHF_SC_R_SHF, 0);
3531 
3532 	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 2, RELO_BR_NEXT_PKT);
3533 
3534 	emit_shf(nfp_prog, reg_b(2),
3535 		 reg_a(2), SHF_OP_OR, reg_b(2), SHF_SC_L_SHF, 4);
3536 	emit_ld_field(nfp_prog, reg_a(0), 0xc, reg_b(2), SHF_SC_L_SHF, 16);
3537 }
3538 
3539 static void nfp_outro_xdp(struct nfp_prog *nfp_prog)
3540 {
3541 	/* XDP return codes:
3542 	 *   0 aborted  0x82 -> drop,  count as stat3
3543 	 *   1    drop  0x22 -> drop,  count as stat1
3544 	 *   2    pass  0x11 -> pass,  count as stat0
3545 	 *   3      tx  0x44 -> redir, count as stat2
3546 	 *   * unknown  0x82 -> drop,  count as stat3
3547 	 */
3548 	/* Target for aborts */
3549 	nfp_prog->tgt_abort = nfp_prog_current_offset(nfp_prog);
3550 
3551 	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 2, RELO_BR_NEXT_PKT);
3552 
3553 	wrp_mov(nfp_prog, reg_a(0), NFP_BPF_ABI_FLAGS);
3554 	emit_ld_field(nfp_prog, reg_a(0), 0xc, reg_imm(0x82), SHF_SC_L_SHF, 16);
3555 
3556 	/* Target for normal exits */
3557 	nfp_prog->tgt_out = nfp_prog_current_offset(nfp_prog);
3558 
3559 	/* if R0 > 3 jump to abort */
3560 	emit_alu(nfp_prog, reg_none(), reg_imm(3), ALU_OP_SUB, reg_b(0));
3561 	emit_br(nfp_prog, BR_BLO, nfp_prog->tgt_abort, 0);
3562 
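	/* reg_b(2) is a byte-wide lookup table indexed by the XDP return
	 * code: right shift by R0 * 8 and mask with 0xff to get the
	 * result-flags byte from the mapping above.
	 */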
3563 	wrp_immed(nfp_prog, reg_b(2), 0x44112282);
3564 
3565 	emit_shf(nfp_prog, reg_a(1),
3566 		 reg_none(), SHF_OP_NONE, reg_b(0), SHF_SC_L_SHF, 3);
3567 
3568 	emit_alu(nfp_prog, reg_none(), reg_a(1), ALU_OP_OR, reg_imm(0));
3569 	emit_shf(nfp_prog, reg_b(2),
3570 		 reg_imm(0xff), SHF_OP_AND, reg_b(2), SHF_SC_R_SHF, 0);
3571 
3572 	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 2, RELO_BR_NEXT_PKT);
3573 
3574 	wrp_mov(nfp_prog, reg_a(0), NFP_BPF_ABI_FLAGS);
3575 	emit_ld_field(nfp_prog, reg_a(0), 0xc, reg_b(2), SHF_SC_L_SHF, 16);
3576 }
3577 
3578 static bool nfp_prog_needs_callee_reg_save(struct nfp_prog *nfp_prog)
3579 {
3580 	unsigned int idx;
3581 
3582 	for (idx = 1; idx < nfp_prog->subprog_cnt; idx++)
3583 		if (nfp_prog->subprog[idx].needs_reg_push)
3584 			return true;
3585 
3586 	return false;
3587 }
3588 
3589 static void nfp_push_callee_registers(struct nfp_prog *nfp_prog)
3590 {
3591 	u8 reg;
3592 
3593 	/* Subroutine: Save all callee saved registers (R6 ~ R9).
3594 	 * imm_b() holds the return address.
3595 	 */
3596 	nfp_prog->tgt_call_push_regs = nfp_prog_current_offset(nfp_prog);
3597 	for (reg = BPF_REG_6; reg <= BPF_REG_9; reg++) {
3598 		u8 adj = (reg - BPF_REG_0) * 2;
3599 		u8 idx = (reg - BPF_REG_6) * 2;
3600 
3601 		/* The first slot in the stack frame holds the return address
3602 		 * pushed by bpf_to_bpf_call(); start just after it.
3603 		 */
3604 		wrp_mov(nfp_prog, reg_lm(0, 1 + idx), reg_b(adj));
3605 
3606 		if (reg == BPF_REG_8)
3607 			/* Prepare to jump back, last 3 insns use defer slots */
3608 			emit_rtn(nfp_prog, imm_b(nfp_prog), 3);
3609 
3610 		wrp_mov(nfp_prog, reg_lm(0, 1 + idx + 1), reg_b(adj + 1));
3611 	}
3612 }
3613 
3614 static void nfp_pop_callee_registers(struct nfp_prog *nfp_prog)
3615 {
3616 	u8 reg;
3617 
3618 	/* Subroutine: Restore all callee saved registers (R6 ~ R9).
3619 	 * ret_reg() holds the return address.
3620 	 */
3621 	nfp_prog->tgt_call_pop_regs = nfp_prog_current_offset(nfp_prog);
3622 	for (reg = BPF_REG_6; reg <= BPF_REG_9; reg++) {
3623 		u8 adj = (reg - BPF_REG_0) * 2;
3624 		u8 idx = (reg - BPF_REG_6) * 2;
3625 
3626 		/* The first slot in the stack frame holds the return address;
3627 		 * start popping just after it.
3628 		 */
3629 		wrp_mov(nfp_prog, reg_both(adj), reg_lm(0, 1 + idx));
3630 
3631 		if (reg == BPF_REG_8)
3632 			/* Prepare to jump back, last 3 insns use defer slots */
3633 			emit_rtn(nfp_prog, ret_reg(nfp_prog), 3);
3634 
3635 		wrp_mov(nfp_prog, reg_both(adj + 1), reg_lm(0, 1 + idx + 1));
3636 	}
3637 }
3638 
3639 static void nfp_outro(struct nfp_prog *nfp_prog)
3640 {
3641 	switch (nfp_prog->type) {
3642 	case BPF_PROG_TYPE_SCHED_CLS:
3643 		nfp_outro_tc_da(nfp_prog);
3644 		break;
3645 	case BPF_PROG_TYPE_XDP:
3646 		nfp_outro_xdp(nfp_prog);
3647 		break;
3648 	default:
3649 		WARN_ON(1);
3650 	}
3651 
3652 	if (!nfp_prog_needs_callee_reg_save(nfp_prog))
3653 		return;
3654 
3655 	nfp_push_callee_registers(nfp_prog);
3656 	nfp_pop_callee_registers(nfp_prog);
3657 }
3658 
3659 static int nfp_translate(struct nfp_prog *nfp_prog)
3660 {
3661 	struct nfp_insn_meta *meta;
3662 	unsigned int depth;
3663 	int err;
3664 
3665 	depth = nfp_prog->subprog[0].stack_depth;
3666 	nfp_prog->stack_frame_depth = round_up(depth, 4);
3667 
3668 	nfp_intro(nfp_prog);
3669 	if (nfp_prog->error)
3670 		return nfp_prog->error;
3671 
3672 	list_for_each_entry(meta, &nfp_prog->insns, l) {
3673 		instr_cb_t cb = instr_cb[meta->insn.code];
3674 
3675 		meta->off = nfp_prog_current_offset(nfp_prog);
3676 
3677 		if (nfp_is_subprog_start(meta)) {
3678 			nfp_start_subprog(nfp_prog, meta);
3679 			if (nfp_prog->error)
3680 				return nfp_prog->error;
3681 		}
3682 
3683 		if (meta->skip) {
3684 			nfp_prog->n_translated++;
3685 			continue;
3686 		}
3687 
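		/* The callback for the previous instruction may have claimed
		 * this one as the second half of a two-part pattern; if so,
		 * use its double_cb instead of the regular handler.
		 */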
3688 		if (nfp_meta_has_prev(nfp_prog, meta) &&
3689 		    nfp_meta_prev(meta)->double_cb)
3690 			cb = nfp_meta_prev(meta)->double_cb;
3691 		if (!cb)
3692 			return -ENOENT;
3693 		err = cb(nfp_prog, meta);
3694 		if (err)
3695 			return err;
3696 		if (nfp_prog->error)
3697 			return nfp_prog->error;
3698 
3699 		nfp_prog->n_translated++;
3700 	}
3701 
3702 	nfp_prog->last_bpf_off = nfp_prog_current_offset(nfp_prog) - 1;
3703 
3704 	nfp_outro(nfp_prog);
3705 	if (nfp_prog->error)
3706 		return nfp_prog->error;
3707 
3708 	wrp_nops(nfp_prog, NFP_USTORE_PREFETCH_WINDOW);
3709 	if (nfp_prog->error)
3710 		return nfp_prog->error;
3711 
3712 	return nfp_fixup_branches(nfp_prog);
3713 }
3714 
3715 /* --- Optimizations --- */
3716 static void nfp_bpf_opt_reg_init(struct nfp_prog *nfp_prog)
3717 {
3718 	struct nfp_insn_meta *meta;
3719 
3720 	list_for_each_entry(meta, &nfp_prog->insns, l) {
3721 		struct bpf_insn insn = meta->insn;
3722 
3723 		/* Programs converted from cBPF start with register xoring */
3724 		if (insn.code == (BPF_ALU64 | BPF_XOR | BPF_X) &&
3725 		    insn.src_reg == insn.dst_reg)
3726 			continue;
3727 
3728 		/* Programs start with R6 = R1 but we ignore the skb pointer */
3729 		if (insn.code == (BPF_ALU64 | BPF_MOV | BPF_X) &&
3730 		    insn.src_reg == 1 && insn.dst_reg == 6)
3731 			meta->skip = true;
3732 
3733 		/* Return as soon as something doesn't match */
3734 		if (!meta->skip)
3735 			return;
3736 	}
3737 }
3738 
3739 /* abs(insn.imm) will fit better into the unrestricted register immediate -
3740  * convert an add/sub of a negative number into a sub/add of a positive one.
3741  */
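/* For example, "r0 += -16" (ALU64 ADD with imm == -16) becomes "r0 -= 16".
 * For the eligible conditional jumps only the immediate is negated here;
 * jump_neg_op records that the comparison must be adjusted when the jump
 * is emitted.
 */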
3742 static void nfp_bpf_opt_neg_add_sub(struct nfp_prog *nfp_prog)
3743 {
3744 	struct nfp_insn_meta *meta;
3745 
3746 	list_for_each_entry(meta, &nfp_prog->insns, l) {
3747 		struct bpf_insn insn = meta->insn;
3748 
3749 		if (meta->skip)
3750 			continue;
3751 
3752 		if (BPF_CLASS(insn.code) != BPF_ALU &&
3753 		    BPF_CLASS(insn.code) != BPF_ALU64 &&
3754 		    BPF_CLASS(insn.code) != BPF_JMP)
3755 			continue;
3756 		if (BPF_SRC(insn.code) != BPF_K)
3757 			continue;
3758 		if (insn.imm >= 0)
3759 			continue;
3760 
3761 		if (BPF_CLASS(insn.code) == BPF_JMP) {
3762 			switch (BPF_OP(insn.code)) {
3763 			case BPF_JGE:
3764 			case BPF_JSGE:
3765 			case BPF_JLT:
3766 			case BPF_JSLT:
3767 				meta->jump_neg_op = true;
3768 				break;
3769 			default:
3770 				continue;
3771 			}
3772 		} else {
3773 			if (BPF_OP(insn.code) == BPF_ADD)
3774 				insn.code = BPF_CLASS(insn.code) | BPF_SUB;
3775 			else if (BPF_OP(insn.code) == BPF_SUB)
3776 				insn.code = BPF_CLASS(insn.code) | BPF_ADD;
3777 			else
3778 				continue;
3779 
3780 			meta->insn.code = insn.code | BPF_K;
3781 		}
3782 
3783 		meta->insn.imm = -insn.imm;
3784 	}
3785 }
3786 
3787 /* Remove masking after load since our load guarantees this is not needed */
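/* E.g. an LD_ABS byte load followed by "r0 &= 0xff": the AND is redundant
 * because the load already zero-extends, so the second instruction is
 * skipped (unless it is a jump destination).
 */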
3788 static void nfp_bpf_opt_ld_mask(struct nfp_prog *nfp_prog)
3789 {
3790 	struct nfp_insn_meta *meta1, *meta2;
3791 	const s32 exp_mask[] = {
3792 		[BPF_B] = 0x000000ffU,
3793 		[BPF_H] = 0x0000ffffU,
3794 		[BPF_W] = 0xffffffffU,
3795 	};
3796 
3797 	nfp_for_each_insn_walk2(nfp_prog, meta1, meta2) {
3798 		struct bpf_insn insn, next;
3799 
3800 		insn = meta1->insn;
3801 		next = meta2->insn;
3802 
3803 		if (BPF_CLASS(insn.code) != BPF_LD)
3804 			continue;
3805 		if (BPF_MODE(insn.code) != BPF_ABS &&
3806 		    BPF_MODE(insn.code) != BPF_IND)
3807 			continue;
3808 
3809 		if (next.code != (BPF_ALU64 | BPF_AND | BPF_K))
3810 			continue;
3811 
3812 		if (!exp_mask[BPF_SIZE(insn.code)])
3813 			continue;
3814 		if (exp_mask[BPF_SIZE(insn.code)] != next.imm)
3815 			continue;
3816 
3817 		if (next.src_reg || next.dst_reg)
3818 			continue;
3819 
3820 		if (meta2->flags & FLAG_INSN_IS_JUMP_DST)
3821 			continue;
3822 
3823 		meta2->skip = true;
3824 	}
3825 }
3826 
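/* Similarly, remove the shift-left/shift-right by 32 pair which may follow
 * a 32-bit ABS/IND load: the load already provides the value the shifts are
 * meant to normalize, so both instructions can be skipped.
 */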
3827 static void nfp_bpf_opt_ld_shift(struct nfp_prog *nfp_prog)
3828 {
3829 	struct nfp_insn_meta *meta1, *meta2, *meta3;
3830 
3831 	nfp_for_each_insn_walk3(nfp_prog, meta1, meta2, meta3) {
3832 		struct bpf_insn insn, next1, next2;
3833 
3834 		insn = meta1->insn;
3835 		next1 = meta2->insn;
3836 		next2 = meta3->insn;
3837 
3838 		if (BPF_CLASS(insn.code) != BPF_LD)
3839 			continue;
3840 		if (BPF_MODE(insn.code) != BPF_ABS &&
3841 		    BPF_MODE(insn.code) != BPF_IND)
3842 			continue;
3843 		if (BPF_SIZE(insn.code) != BPF_W)
3844 			continue;
3845 
3846 		if (!(next1.code == (BPF_LSH | BPF_K | BPF_ALU64) &&
3847 		      next2.code == (BPF_RSH | BPF_K | BPF_ALU64)) &&
3848 		    !(next1.code == (BPF_RSH | BPF_K | BPF_ALU64) &&
3849 		      next2.code == (BPF_LSH | BPF_K | BPF_ALU64)))
3850 			continue;
3851 
3852 		if (next1.src_reg || next1.dst_reg ||
3853 		    next2.src_reg || next2.dst_reg)
3854 			continue;
3855 
3856 		if (next1.imm != 0x20 || next2.imm != 0x20)
3857 			continue;
3858 
3859 		if (meta2->flags & FLAG_INSN_IS_JUMP_DST ||
3860 		    meta3->flags & FLAG_INSN_IS_JUMP_DST)
3861 			continue;
3862 
3863 		meta2->skip = true;
3864 		meta3->skip = true;
3865 	}
3866 }
3867 
3868 /* A load/store pair that forms a memory copy should look like this:
3869  *
3870  *   ld_width R, [addr_src + offset_src]
3871  *   st_width [addr_dest + offset_dest], R
3872  *
3873  * The destination register of the load and the source register of the
3874  * store must be the same, and the load and store must operate at the
3875  * same width.  If either addr_src or addr_dest is the stack pointer, we
3876  * skip the CPP optimization, as the stack is modelled by registers on the NFP.
3877  */
3878 static bool
3879 curr_pair_is_memcpy(struct nfp_insn_meta *ld_meta,
3880 		    struct nfp_insn_meta *st_meta)
3881 {
3882 	struct bpf_insn *ld = &ld_meta->insn;
3883 	struct bpf_insn *st = &st_meta->insn;
3884 
3885 	if (!is_mbpf_load(ld_meta) || !is_mbpf_store(st_meta))
3886 		return false;
3887 
3888 	if (ld_meta->ptr.type != PTR_TO_PACKET &&
3889 	    ld_meta->ptr.type != PTR_TO_MAP_VALUE)
3890 		return false;
3891 
3892 	if (st_meta->ptr.type != PTR_TO_PACKET)
3893 		return false;
3894 
3895 	if (BPF_SIZE(ld->code) != BPF_SIZE(st->code))
3896 		return false;
3897 
3898 	if (ld->dst_reg != st->src_reg)
3899 		return false;
3900 
3901 	/* There is a jump to the store insn in this pair. */
3902 	if (st_meta->flags & FLAG_INSN_IS_JUMP_DST)
3903 		return false;
3904 
3905 	return true;
3906 }
3907 
3908 /* Currently, we only support chaining load/store pairs if:
3909  *
3910  *  - Their address base registers are the same.
3911  *  - Their address offsets are in the same order.
3912  *  - They operate at the same memory width.
3913  *  - There is no jump into the middle of them.
3914  */
3915 static bool
3916 curr_pair_chain_with_previous(struct nfp_insn_meta *ld_meta,
3917 			      struct nfp_insn_meta *st_meta,
3918 			      struct bpf_insn *prev_ld,
3919 			      struct bpf_insn *prev_st)
3920 {
3921 	u8 prev_size, curr_size, prev_ld_base, prev_st_base, prev_ld_dst;
3922 	struct bpf_insn *ld = &ld_meta->insn;
3923 	struct bpf_insn *st = &st_meta->insn;
3924 	s16 prev_ld_off, prev_st_off;
3925 
3926 	/* This pair is the start pair. */
3927 	if (!prev_ld)
3928 		return true;
3929 
3930 	prev_size = BPF_LDST_BYTES(prev_ld);
3931 	curr_size = BPF_LDST_BYTES(ld);
3932 	prev_ld_base = prev_ld->src_reg;
3933 	prev_st_base = prev_st->dst_reg;
3934 	prev_ld_dst = prev_ld->dst_reg;
3935 	prev_ld_off = prev_ld->off;
3936 	prev_st_off = prev_st->off;
3937 
3938 	if (ld->dst_reg != prev_ld_dst)
3939 		return false;
3940 
3941 	if (ld->src_reg != prev_ld_base || st->dst_reg != prev_st_base)
3942 		return false;
3943 
3944 	if (curr_size != prev_size)
3945 		return false;
3946 
3947 	/* There is a jump to the head of this pair. */
3948 	if (ld_meta->flags & FLAG_INSN_IS_JUMP_DST)
3949 		return false;
3950 
3951 	/* Both in ascending order. */
3952 	if (prev_ld_off + prev_size == ld->off &&
3953 	    prev_st_off + prev_size == st->off)
3954 		return true;
3955 
3956 	/* Both in descending order. */
3957 	if (ld->off + curr_size == prev_ld_off &&
3958 	    st->off + curr_size == prev_st_off)
3959 		return true;
3960 
3961 	return false;
3962 }
3963 
3964 /* Return TRUE if a cross memory access happens. A cross memory access
3965  * means the store area overlaps the load area, so a later load might
3966  * read the value written by a previous store; in that case we can't
3967  * treat the sequence as a memory copy.
3968  */
3969 static bool
3970 cross_mem_access(struct bpf_insn *ld, struct nfp_insn_meta *head_ld_meta,
3971 		 struct nfp_insn_meta *head_st_meta)
3972 {
3973 	s16 head_ld_off, head_st_off, ld_off;
3974 
3975 	/* Different pointer types do not overlap. */
3976 	if (head_ld_meta->ptr.type != head_st_meta->ptr.type)
3977 		return false;
3978 
3979 	/* Load and store are both PTR_TO_PACKET, check ID info. */
3980 	if (head_ld_meta->ptr.id != head_st_meta->ptr.id)
3981 		return true;
3982 
3983 	/* Canonicalize the offsets. Turn all of them against the original
3984 	 * base register.
3985 	 */
3986 	head_ld_off = head_ld_meta->insn.off + head_ld_meta->ptr.off;
3987 	head_st_off = head_st_meta->insn.off + head_st_meta->ptr.off;
3988 	ld_off = ld->off + head_ld_meta->ptr.off;
3989 
3990 	/* Ascending order cross. */
3991 	if (ld_off > head_ld_off &&
3992 	    head_ld_off < head_st_off && ld_off >= head_st_off)
3993 		return true;
3994 
3995 	/* Descending order cross. */
3996 	if (ld_off < head_ld_off &&
3997 	    head_ld_off > head_st_off && ld_off <= head_st_off)
3998 		return true;
3999 
4000 	return false;
4001 }
4002 
4003 /* This pass tries to identify the following instruction sequences.
4004  *
4005  *   load R, [regA + offA]
4006  *   store [regB + offB], R
4007  *   load R, [regA + offA + const_imm_A]
4008  *   store [regB + offB + const_imm_A], R
4009  *   load R, [regA + offA + 2 * const_imm_A]
4010  *   store [regB + offB + 2 * const_imm_A], R
4011  *   ...
4012  *
4013  * The above sequence is typically generated by the compiler when lowering
4014  * memcpy. The NFP prefers using CPP instructions to accelerate it.
4015  */
4016 static void nfp_bpf_opt_ldst_gather(struct nfp_prog *nfp_prog)
4017 {
4018 	struct nfp_insn_meta *head_ld_meta = NULL;
4019 	struct nfp_insn_meta *head_st_meta = NULL;
4020 	struct nfp_insn_meta *meta1, *meta2;
4021 	struct bpf_insn *prev_ld = NULL;
4022 	struct bpf_insn *prev_st = NULL;
4023 	u8 count = 0;
4024 
4025 	nfp_for_each_insn_walk2(nfp_prog, meta1, meta2) {
4026 		struct bpf_insn *ld = &meta1->insn;
4027 		struct bpf_insn *st = &meta2->insn;
4028 
4029 		/* Reset the record status if any of the following is true:
4030 		 *   - The current insn pair is not a load/store pair.
4031 		 *   - The load/store pair doesn't chain with the previous one.
4032 		 *   - The chained load/store pair crosses the previous pair.
4033 		 *   - The chained load/store pair has accumulated a memory
4034 		 *     copy beyond 128 bytes, which is the maximum length a
4035 		 *     single NFP CPP command can transfer.
4036 		 */
4037 		if (!curr_pair_is_memcpy(meta1, meta2) ||
4038 		    !curr_pair_chain_with_previous(meta1, meta2, prev_ld,
4039 						   prev_st) ||
4040 		    (head_ld_meta && (cross_mem_access(ld, head_ld_meta,
4041 						       head_st_meta) ||
4042 				      head_ld_meta->ldst_gather_len >= 128))) {
4043 			if (!count)
4044 				continue;
4045 
4046 			if (count > 1) {
4047 				s16 prev_ld_off = prev_ld->off;
4048 				s16 prev_st_off = prev_st->off;
4049 				s16 head_ld_off = head_ld_meta->insn.off;
4050 
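				/* If the offsets decreased along the chain,
				 * rebase the head load/store on the lowest
				 * offsets and record the gather length as
				 * negative to mark the descending direction.
				 */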
4051 				if (prev_ld_off < head_ld_off) {
4052 					head_ld_meta->insn.off = prev_ld_off;
4053 					head_st_meta->insn.off = prev_st_off;
4054 					head_ld_meta->ldst_gather_len =
4055 						-head_ld_meta->ldst_gather_len;
4056 				}
4057 
4058 				head_ld_meta->paired_st = &head_st_meta->insn;
4059 				head_st_meta->skip = true;
4060 			} else {
4061 				head_ld_meta->ldst_gather_len = 0;
4062 			}
4063 
4064 			/* If the chain is ended by a load/store pair then this
4065 			 * pair could serve as the new head of the next chain.
4066 			 */
4067 			if (curr_pair_is_memcpy(meta1, meta2)) {
4068 				head_ld_meta = meta1;
4069 				head_st_meta = meta2;
4070 				head_ld_meta->ldst_gather_len =
4071 					BPF_LDST_BYTES(ld);
4072 				meta1 = nfp_meta_next(meta1);
4073 				meta2 = nfp_meta_next(meta2);
4074 				prev_ld = ld;
4075 				prev_st = st;
4076 				count = 1;
4077 			} else {
4078 				head_ld_meta = NULL;
4079 				head_st_meta = NULL;
4080 				prev_ld = NULL;
4081 				prev_st = NULL;
4082 				count = 0;
4083 			}
4084 
4085 			continue;
4086 		}
4087 
4088 		if (!head_ld_meta) {
4089 			head_ld_meta = meta1;
4090 			head_st_meta = meta2;
4091 		} else {
4092 			meta1->skip = true;
4093 			meta2->skip = true;
4094 		}
4095 
4096 		head_ld_meta->ldst_gather_len += BPF_LDST_BYTES(ld);
4097 		meta1 = nfp_meta_next(meta1);
4098 		meta2 = nfp_meta_next(meta2);
4099 		prev_ld = ld;
4100 		prev_st = st;
4101 		count++;
4102 	}
4103 }
4104 
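/* Group packet loads which share the same pointer (same ID and constant
 * offset) into ranges of at most 64 bytes.  The first load of each range
 * is marked with do_init so it populates the packet cache; the other loads
 * in the range are tagged with the same range so they can read from the
 * cache instead of issuing separate memory accesses.
 */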
4105 static void nfp_bpf_opt_pkt_cache(struct nfp_prog *nfp_prog)
4106 {
4107 	struct nfp_insn_meta *meta, *range_node = NULL;
4108 	s16 range_start = 0, range_end = 0;
4109 	bool cache_avail = false;
4110 	struct bpf_insn *insn;
4111 	s32 range_ptr_off = 0;
4112 	u32 range_ptr_id = 0;
4113 
4114 	list_for_each_entry(meta, &nfp_prog->insns, l) {
4115 		if (meta->flags & FLAG_INSN_IS_JUMP_DST)
4116 			cache_avail = false;
4117 
4118 		if (meta->skip)
4119 			continue;
4120 
4121 		insn = &meta->insn;
4122 
4123 		if (is_mbpf_store_pkt(meta) ||
4124 		    insn->code == (BPF_JMP | BPF_CALL) ||
4125 		    is_mbpf_classic_store_pkt(meta) ||
4126 		    is_mbpf_classic_load(meta)) {
4127 			cache_avail = false;
4128 			continue;
4129 		}
4130 
4131 		if (!is_mbpf_load(meta))
4132 			continue;
4133 
4134 		if (meta->ptr.type != PTR_TO_PACKET || meta->ldst_gather_len) {
4135 			cache_avail = false;
4136 			continue;
4137 		}
4138 
4139 		if (!cache_avail) {
4140 			cache_avail = true;
4141 			if (range_node)
4142 				goto end_current_then_start_new;
4143 			goto start_new;
4144 		}
4145 
4146 		/* Check ID to make sure two reads share the same
4147 		 * variable offset against PTR_TO_PACKET, and check OFF
4148 		 * to make sure they also share the same constant
4149 		 * offset.
4150 		 *
4151 		 * Strictly, the OFFs don't need to be the same: since they
4152 		 * are constant offsets against PTR_TO_PACKET, different
4153 		 * OFFs could be canonicalized into offsets against the
4154 		 * original packet pointer.  We don't support that,
4155 		 * however.
4156 		 */
4157 		if (meta->ptr.id == range_ptr_id &&
4158 		    meta->ptr.off == range_ptr_off) {
4159 			s16 new_start = range_start;
4160 			s16 end, off = insn->off;
4161 			s16 new_end = range_end;
4162 			bool changed = false;
4163 
4164 			if (off < range_start) {
4165 				new_start = off;
4166 				changed = true;
4167 			}
4168 
4169 			end = off + BPF_LDST_BYTES(insn);
4170 			if (end > range_end) {
4171 				new_end = end;
4172 				changed = true;
4173 			}
4174 
4175 			if (!changed)
4176 				continue;
4177 
4178 			if (new_end - new_start <= 64) {
4179 				/* Install new range. */
4180 				range_start = new_start;
4181 				range_end = new_end;
4182 				continue;
4183 			}
4184 		}
4185 
4186 end_current_then_start_new:
4187 		range_node->pkt_cache.range_start = range_start;
4188 		range_node->pkt_cache.range_end = range_end;
4189 start_new:
4190 		range_node = meta;
4191 		range_node->pkt_cache.do_init = true;
4192 		range_ptr_id = range_node->ptr.id;
4193 		range_ptr_off = range_node->ptr.off;
4194 		range_start = insn->off;
4195 		range_end = insn->off + BPF_LDST_BYTES(insn);
4196 	}
4197 
4198 	if (range_node) {
4199 		range_node->pkt_cache.range_start = range_start;
4200 		range_node->pkt_cache.range_end = range_end;
4201 	}
4202 
4203 	list_for_each_entry(meta, &nfp_prog->insns, l) {
4204 		if (meta->skip)
4205 			continue;
4206 
4207 		if (is_mbpf_load_pkt(meta) && !meta->ldst_gather_len) {
4208 			if (meta->pkt_cache.do_init) {
4209 				range_start = meta->pkt_cache.range_start;
4210 				range_end = meta->pkt_cache.range_end;
4211 			} else {
4212 				meta->pkt_cache.range_start = range_start;
4213 				meta->pkt_cache.range_end = range_end;
4214 			}
4215 		}
4216 	}
4217 }
4218 
4219 static int nfp_bpf_optimize(struct nfp_prog *nfp_prog)
4220 {
4221 	nfp_bpf_opt_reg_init(nfp_prog);
4222 
4223 	nfp_bpf_opt_neg_add_sub(nfp_prog);
4224 	nfp_bpf_opt_ld_mask(nfp_prog);
4225 	nfp_bpf_opt_ld_shift(nfp_prog);
4226 	nfp_bpf_opt_ldst_gather(nfp_prog);
4227 	nfp_bpf_opt_pkt_cache(nfp_prog);
4228 
4229 	return 0;
4230 }
4231 
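/* Rewrite 64-bit map pointer loads (BPF_LD | BPF_IMM | BPF_DW with
 * BPF_PSEUDO_MAP_FD) so the immediate carries the id the firmware expects:
 * the map's own id for offload-neutral maps, otherwise the id of the
 * device-side table backing the offloaded map.
 */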
4232 static int nfp_bpf_replace_map_ptrs(struct nfp_prog *nfp_prog)
4233 {
4234 	struct nfp_insn_meta *meta1, *meta2;
4235 	struct nfp_bpf_map *nfp_map;
4236 	struct bpf_map *map;
4237 	u32 id;
4238 
4239 	nfp_for_each_insn_walk2(nfp_prog, meta1, meta2) {
4240 		if (meta1->skip || meta2->skip)
4241 			continue;
4242 
4243 		if (meta1->insn.code != (BPF_LD | BPF_IMM | BPF_DW) ||
4244 		    meta1->insn.src_reg != BPF_PSEUDO_MAP_FD)
4245 			continue;
4246 
4247 		map = (void *)(unsigned long)((u32)meta1->insn.imm |
4248 					      (u64)meta2->insn.imm << 32);
4249 		if (bpf_map_offload_neutral(map)) {
4250 			id = map->id;
4251 		} else {
4252 			nfp_map = map_to_offmap(map)->dev_priv;
4253 			id = nfp_map->tid;
4254 		}
4255 
4256 		meta1->insn.imm = id;
4257 		meta2->insn.imm = 0;
4258 	}
4259 
4260 	return 0;
4261 }
4262 
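/* Validate every instruction and convert it to the little-endian,
 * ECC-annotated form expected by the instruction store (ustore).
 */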
4263 static int nfp_bpf_ustore_calc(u64 *prog, unsigned int len)
4264 {
4265 	__le64 *ustore = (__force __le64 *)prog;
4266 	int i;
4267 
4268 	for (i = 0; i < len; i++) {
4269 		int err;
4270 
4271 		err = nfp_ustore_check_valid_no_ecc(prog[i]);
4272 		if (err)
4273 			return err;
4274 
4275 		ustore[i] = cpu_to_le64(nfp_ustore_calc_ecc_insn(prog[i]));
4276 	}
4277 
4278 	return 0;
4279 }
4280 
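/* Shrink the instruction buffer to the final program length.  If the
 * smaller allocation fails, simply keep using the original buffer.
 */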
4281 static void nfp_bpf_prog_trim(struct nfp_prog *nfp_prog)
4282 {
4283 	void *prog;
4284 
4285 	prog = kvmalloc_array(nfp_prog->prog_len, sizeof(u64), GFP_KERNEL);
4286 	if (!prog)
4287 		return;
4288 
4289 	nfp_prog->__prog_alloc_len = nfp_prog->prog_len * sizeof(u64);
4290 	memcpy(prog, nfp_prog->prog, nfp_prog->__prog_alloc_len);
4291 	kvfree(nfp_prog->prog);
4292 	nfp_prog->prog = prog;
4293 }
4294 
4295 int nfp_bpf_jit(struct nfp_prog *nfp_prog)
4296 {
4297 	int ret;
4298 
4299 	ret = nfp_bpf_replace_map_ptrs(nfp_prog);
4300 	if (ret)
4301 		return ret;
4302 
4303 	ret = nfp_bpf_optimize(nfp_prog);
4304 	if (ret)
4305 		return ret;
4306 
4307 	ret = nfp_translate(nfp_prog);
4308 	if (ret) {
4309 		pr_err("Translation failed with error %d (translated: %u)\n",
4310 		       ret, nfp_prog->n_translated);
4311 		return -EINVAL;
4312 	}
4313 
4314 	nfp_bpf_prog_trim(nfp_prog);
4315 
4316 	return ret;
4317 }
4318 
4319 void nfp_bpf_jit_prepare(struct nfp_prog *nfp_prog, unsigned int cnt)
4320 {
4321 	struct nfp_insn_meta *meta;
4322 
4323 	/* Another pass to record jump information. */
4324 	list_for_each_entry(meta, &nfp_prog->insns, l) {
4325 		struct nfp_insn_meta *dst_meta;
4326 		u64 code = meta->insn.code;
4327 		unsigned int dst_idx;
4328 		bool pseudo_call;
4329 
4330 		if (BPF_CLASS(code) != BPF_JMP)
4331 			continue;
4332 		if (BPF_OP(code) == BPF_EXIT)
4333 			continue;
4334 		if (is_mbpf_helper_call(meta))
4335 			continue;
4336 
4337 		/* If the opcode is BPF_CALL at this point, this can only be a
4338 		 * BPF-to-BPF call (a.k.a. pseudo call).
4339 		 */
4340 		pseudo_call = BPF_OP(code) == BPF_CALL;
4341 
4342 		if (pseudo_call)
4343 			dst_idx = meta->n + 1 + meta->insn.imm;
4344 		else
4345 			dst_idx = meta->n + 1 + meta->insn.off;
4346 
4347 		dst_meta = nfp_bpf_goto_meta(nfp_prog, meta, dst_idx, cnt);
4348 
4349 		if (pseudo_call)
4350 			dst_meta->flags |= FLAG_INSN_IS_SUBPROG_START;
4351 
4352 		dst_meta->flags |= FLAG_INSN_IS_JUMP_DST;
4353 		meta->jmp_dst = dst_meta;
4354 	}
4355 }
4356 
4357 bool nfp_bpf_supported_opcode(u8 code)
4358 {
4359 	return !!instr_cb[code];
4360 }
4361 
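/* Make a vNIC-specific copy of the program with all relocations resolved:
 * relative branches are rebased on the vNIC's start offset, exit/abort and
 * subroutine branches are pointed at the generated targets, and helper
 * calls are patched with the firmware helper addresses.
 */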
4362 void *nfp_bpf_relo_for_vnic(struct nfp_prog *nfp_prog, struct nfp_bpf_vnic *bv)
4363 {
4364 	unsigned int i;
4365 	u64 *prog;
4366 	int err;
4367 
4368 	prog = kmemdup(nfp_prog->prog, nfp_prog->prog_len * sizeof(u64),
4369 		       GFP_KERNEL);
4370 	if (!prog)
4371 		return ERR_PTR(-ENOMEM);
4372 
4373 	for (i = 0; i < nfp_prog->prog_len; i++) {
4374 		enum nfp_relo_type special;
4375 		u32 val;
4376 		u16 off;
4377 
4378 		special = FIELD_GET(OP_RELO_TYPE, prog[i]);
4379 		switch (special) {
4380 		case RELO_NONE:
4381 			continue;
4382 		case RELO_BR_REL:
4383 			br_add_offset(&prog[i], bv->start_off);
4384 			break;
4385 		case RELO_BR_GO_OUT:
4386 			br_set_offset(&prog[i],
4387 				      nfp_prog->tgt_out + bv->start_off);
4388 			break;
4389 		case RELO_BR_GO_ABORT:
4390 			br_set_offset(&prog[i],
4391 				      nfp_prog->tgt_abort + bv->start_off);
4392 			break;
4393 		case RELO_BR_GO_CALL_PUSH_REGS:
4394 			if (!nfp_prog->tgt_call_push_regs) {
4395 				pr_err("BUG: failed to detect subprogram registers needs\n");
4396 				err = -EINVAL;
4397 				goto err_free_prog;
4398 			}
4399 			off = nfp_prog->tgt_call_push_regs + bv->start_off;
4400 			br_set_offset(&prog[i], off);
4401 			break;
4402 		case RELO_BR_GO_CALL_POP_REGS:
4403 			if (!nfp_prog->tgt_call_pop_regs) {
4404 				pr_err("BUG: failed to detect subprogram registers needs\n");
4405 				err = -EINVAL;
4406 				goto err_free_prog;
4407 			}
4408 			off = nfp_prog->tgt_call_pop_regs + bv->start_off;
4409 			br_set_offset(&prog[i], off);
4410 			break;
4411 		case RELO_BR_NEXT_PKT:
4412 			br_set_offset(&prog[i], bv->tgt_done);
4413 			break;
4414 		case RELO_BR_HELPER:
4415 			val = br_get_offset(prog[i]);
4416 			val -= BR_OFF_RELO;
4417 			switch (val) {
4418 			case BPF_FUNC_map_lookup_elem:
4419 				val = nfp_prog->bpf->helpers.map_lookup;
4420 				break;
4421 			case BPF_FUNC_map_update_elem:
4422 				val = nfp_prog->bpf->helpers.map_update;
4423 				break;
4424 			case BPF_FUNC_map_delete_elem:
4425 				val = nfp_prog->bpf->helpers.map_delete;
4426 				break;
4427 			case BPF_FUNC_perf_event_output:
4428 				val = nfp_prog->bpf->helpers.perf_event_output;
4429 				break;
4430 			default:
4431 				pr_err("relocation of unknown helper %d\n",
4432 				       val);
4433 				err = -EINVAL;
4434 				goto err_free_prog;
4435 			}
4436 			br_set_offset(&prog[i], val);
4437 			break;
4438 		case RELO_IMMED_REL:
4439 			immed_add_value(&prog[i], bv->start_off);
4440 			break;
4441 		}
4442 
4443 		prog[i] &= ~OP_RELO_TYPE;
4444 	}
4445 
4446 	err = nfp_bpf_ustore_calc(prog, nfp_prog->prog_len);
4447 	if (err)
4448 		goto err_free_prog;
4449 
4450 	return prog;
4451 
4452 err_free_prog:
4453 	kfree(prog);
4454 	return ERR_PTR(err);
4455 }
4456