1 // SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
2 /* Copyright (C) 2016-2018 Netronome Systems, Inc. */
3 
4 #define pr_fmt(fmt)	"NFP net bpf: " fmt
5 
6 #include <linux/bug.h>
7 #include <linux/bpf.h>
8 #include <linux/filter.h>
9 #include <linux/kernel.h>
10 #include <linux/pkt_cls.h>
11 #include <linux/reciprocal_div.h>
12 #include <linux/unistd.h>
13 
14 #include "main.h"
15 #include "../nfp_asm.h"
16 #include "../nfp_net_ctrl.h"
17 
18 /* --- NFP prog --- */
/* Foreach "multiple" entries macros provide pos and next<n> pointers.
 * It's safe to modify the next pointers (but not pos).
 *
 * nfp_for_each_insn_walk2() visits the instruction list of @nfp_prog with a
 * sliding window of two entries: @pos starts at the first list entry and
 * @next at the entry following it.  The walk stops as soon as either cursor
 * lands on the list head.  Every step advances @pos with nfp_meta_next() and
 * then recomputes @next from the new @pos.
 */
#define nfp_for_each_insn_walk2(nfp_prog, pos, next)			\
	for (pos = list_first_entry(&(nfp_prog)->insns, typeof(*pos), l), \
	     next = list_next_entry(pos, l);			\
	     &(nfp_prog)->insns != &pos->l &&			\
	     &(nfp_prog)->insns != &next->l;			\
	     pos = nfp_meta_next(pos),				\
	     next = nfp_meta_next(pos))
29 
/* nfp_for_each_insn_walk3() - walk the instruction list three entries at a time
 *
 * @pos starts at the first entry of @nfp_prog->insns, @next at its successor
 * and @next2 at the successor of @next.  The walk stops as soon as any of the
 * three cursors lands on the list head.  Every step advances @pos with
 * nfp_meta_next(), then recomputes @next from the new @pos and @next2 from the
 * new @next.
 */
#define nfp_for_each_insn_walk3(nfp_prog, pos, next, next2)		\
	for (pos = list_first_entry(&(nfp_prog)->insns, typeof(*pos), l), \
	     next = list_next_entry(pos, l),			\
	     next2 = list_next_entry(next, l);			\
	     &(nfp_prog)->insns != &pos->l &&			\
	     &(nfp_prog)->insns != &next->l &&			\
	     &(nfp_prog)->insns != &next2->l;			\
	     pos = nfp_meta_next(pos),				\
	     next = nfp_meta_next(pos),				\
	     next2 = nfp_meta_next(next))
40 
41 static bool
42 nfp_meta_has_prev(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
43 {
44 	return meta->l.prev != &nfp_prog->insns;
45 }
46 
47 static void nfp_prog_push(struct nfp_prog *nfp_prog, u64 insn)
48 {
49 	if (nfp_prog->__prog_alloc_len / sizeof(u64) == nfp_prog->prog_len) {
50 		pr_warn("instruction limit reached (%u NFP instructions)\n",
51 			nfp_prog->prog_len);
52 		nfp_prog->error = -ENOSPC;
53 		return;
54 	}
55 
56 	nfp_prog->prog[nfp_prog->prog_len] = insn;
57 	nfp_prog->prog_len++;
58 }
59 
60 static unsigned int nfp_prog_current_offset(struct nfp_prog *nfp_prog)
61 {
62 	return nfp_prog->prog_len;
63 }
64 
65 static bool
66 nfp_prog_confirm_current_offset(struct nfp_prog *nfp_prog, unsigned int off)
67 {
68 	/* If there is a recorded error we may have dropped instructions;
69 	 * that doesn't have to be due to translator bug, and the translation
70 	 * will fail anyway, so just return OK.
71 	 */
72 	if (nfp_prog->error)
73 		return true;
74 	return !WARN_ON_ONCE(nfp_prog_current_offset(nfp_prog) != off);
75 }
76 
77 /* --- Emitters --- */
78 static void
79 __emit_cmd(struct nfp_prog *nfp_prog, enum cmd_tgt_map op,
80 	   u8 mode, u8 xfer, u8 areg, u8 breg, u8 size, enum cmd_ctx_swap ctx,
81 	   bool indir)
82 {
83 	u64 insn;
84 
85 	insn =	FIELD_PREP(OP_CMD_A_SRC, areg) |
86 		FIELD_PREP(OP_CMD_CTX, ctx) |
87 		FIELD_PREP(OP_CMD_B_SRC, breg) |
88 		FIELD_PREP(OP_CMD_TOKEN, cmd_tgt_act[op].token) |
89 		FIELD_PREP(OP_CMD_XFER, xfer) |
90 		FIELD_PREP(OP_CMD_CNT, size) |
91 		FIELD_PREP(OP_CMD_SIG, ctx != CMD_CTX_NO_SWAP) |
92 		FIELD_PREP(OP_CMD_TGT_CMD, cmd_tgt_act[op].tgt_cmd) |
93 		FIELD_PREP(OP_CMD_INDIR, indir) |
94 		FIELD_PREP(OP_CMD_MODE, mode);
95 
96 	nfp_prog_push(nfp_prog, insn);
97 }
98 
99 static void
100 emit_cmd_any(struct nfp_prog *nfp_prog, enum cmd_tgt_map op, u8 mode, u8 xfer,
101 	     swreg lreg, swreg rreg, u8 size, enum cmd_ctx_swap ctx, bool indir)
102 {
103 	struct nfp_insn_re_regs reg;
104 	int err;
105 
106 	err = swreg_to_restricted(reg_none(), lreg, rreg, &reg, false);
107 	if (err) {
108 		nfp_prog->error = err;
109 		return;
110 	}
111 	if (reg.swap) {
112 		pr_err("cmd can't swap arguments\n");
113 		nfp_prog->error = -EFAULT;
114 		return;
115 	}
116 	if (reg.dst_lmextn || reg.src_lmextn) {
117 		pr_err("cmd can't use LMextn\n");
118 		nfp_prog->error = -EFAULT;
119 		return;
120 	}
121 
122 	__emit_cmd(nfp_prog, op, mode, xfer, reg.areg, reg.breg, size, ctx,
123 		   indir);
124 }
125 
126 static void
127 emit_cmd(struct nfp_prog *nfp_prog, enum cmd_tgt_map op, u8 mode, u8 xfer,
128 	 swreg lreg, swreg rreg, u8 size, enum cmd_ctx_swap ctx)
129 {
130 	emit_cmd_any(nfp_prog, op, mode, xfer, lreg, rreg, size, ctx, false);
131 }
132 
133 static void
134 emit_cmd_indir(struct nfp_prog *nfp_prog, enum cmd_tgt_map op, u8 mode, u8 xfer,
135 	       swreg lreg, swreg rreg, u8 size, enum cmd_ctx_swap ctx)
136 {
137 	emit_cmd_any(nfp_prog, op, mode, xfer, lreg, rreg, size, ctx, true);
138 }
139 
140 static void
141 __emit_br(struct nfp_prog *nfp_prog, enum br_mask mask, enum br_ev_pip ev_pip,
142 	  enum br_ctx_signal_state css, u16 addr, u8 defer)
143 {
144 	u16 addr_lo, addr_hi;
145 	u64 insn;
146 
147 	addr_lo = addr & (OP_BR_ADDR_LO >> __bf_shf(OP_BR_ADDR_LO));
148 	addr_hi = addr != addr_lo;
149 
150 	insn = OP_BR_BASE |
151 		FIELD_PREP(OP_BR_MASK, mask) |
152 		FIELD_PREP(OP_BR_EV_PIP, ev_pip) |
153 		FIELD_PREP(OP_BR_CSS, css) |
154 		FIELD_PREP(OP_BR_DEFBR, defer) |
155 		FIELD_PREP(OP_BR_ADDR_LO, addr_lo) |
156 		FIELD_PREP(OP_BR_ADDR_HI, addr_hi);
157 
158 	nfp_prog_push(nfp_prog, insn);
159 }
160 
161 static void
162 emit_br_relo(struct nfp_prog *nfp_prog, enum br_mask mask, u16 addr, u8 defer,
163 	     enum nfp_relo_type relo)
164 {
165 	if (mask == BR_UNC && defer > 2) {
166 		pr_err("BUG: branch defer out of bounds %d\n", defer);
167 		nfp_prog->error = -EFAULT;
168 		return;
169 	}
170 
171 	__emit_br(nfp_prog, mask,
172 		  mask != BR_UNC ? BR_EV_PIP_COND : BR_EV_PIP_UNCOND,
173 		  BR_CSS_NONE, addr, defer);
174 
175 	nfp_prog->prog[nfp_prog->prog_len - 1] |=
176 		FIELD_PREP(OP_RELO_TYPE, relo);
177 }
178 
179 static void
180 emit_br(struct nfp_prog *nfp_prog, enum br_mask mask, u16 addr, u8 defer)
181 {
182 	emit_br_relo(nfp_prog, mask, addr, defer, RELO_BR_REL);
183 }
184 
185 static void
186 __emit_br_bit(struct nfp_prog *nfp_prog, u16 areg, u16 breg, u16 addr, u8 defer,
187 	      bool set, bool src_lmextn)
188 {
189 	u16 addr_lo, addr_hi;
190 	u64 insn;
191 
192 	addr_lo = addr & (OP_BR_BIT_ADDR_LO >> __bf_shf(OP_BR_BIT_ADDR_LO));
193 	addr_hi = addr != addr_lo;
194 
195 	insn = OP_BR_BIT_BASE |
196 		FIELD_PREP(OP_BR_BIT_A_SRC, areg) |
197 		FIELD_PREP(OP_BR_BIT_B_SRC, breg) |
198 		FIELD_PREP(OP_BR_BIT_BV, set) |
199 		FIELD_PREP(OP_BR_BIT_DEFBR, defer) |
200 		FIELD_PREP(OP_BR_BIT_ADDR_LO, addr_lo) |
201 		FIELD_PREP(OP_BR_BIT_ADDR_HI, addr_hi) |
202 		FIELD_PREP(OP_BR_BIT_SRC_LMEXTN, src_lmextn);
203 
204 	nfp_prog_push(nfp_prog, insn);
205 }
206 
207 static void
208 emit_br_bit_relo(struct nfp_prog *nfp_prog, swreg src, u8 bit, u16 addr,
209 		 u8 defer, bool set, enum nfp_relo_type relo)
210 {
211 	struct nfp_insn_re_regs reg;
212 	int err;
213 
214 	/* NOTE: The bit to test is specified as an rotation amount, such that
215 	 *	 the bit to test will be placed on the MSB of the result when
216 	 *	 doing a rotate right. For bit X, we need right rotate X + 1.
217 	 */
218 	bit += 1;
219 
220 	err = swreg_to_restricted(reg_none(), src, reg_imm(bit), &reg, false);
221 	if (err) {
222 		nfp_prog->error = err;
223 		return;
224 	}
225 
226 	__emit_br_bit(nfp_prog, reg.areg, reg.breg, addr, defer, set,
227 		      reg.src_lmextn);
228 
229 	nfp_prog->prog[nfp_prog->prog_len - 1] |=
230 		FIELD_PREP(OP_RELO_TYPE, relo);
231 }
232 
233 static void
234 emit_br_bset(struct nfp_prog *nfp_prog, swreg src, u8 bit, u16 addr, u8 defer)
235 {
236 	emit_br_bit_relo(nfp_prog, src, bit, addr, defer, true, RELO_BR_REL);
237 }
238 
239 static void
240 __emit_br_alu(struct nfp_prog *nfp_prog, u16 areg, u16 breg, u16 imm_hi,
241 	      u8 defer, bool dst_lmextn, bool src_lmextn)
242 {
243 	u64 insn;
244 
245 	insn = OP_BR_ALU_BASE |
246 		FIELD_PREP(OP_BR_ALU_A_SRC, areg) |
247 		FIELD_PREP(OP_BR_ALU_B_SRC, breg) |
248 		FIELD_PREP(OP_BR_ALU_DEFBR, defer) |
249 		FIELD_PREP(OP_BR_ALU_IMM_HI, imm_hi) |
250 		FIELD_PREP(OP_BR_ALU_SRC_LMEXTN, src_lmextn) |
251 		FIELD_PREP(OP_BR_ALU_DST_LMEXTN, dst_lmextn);
252 
253 	nfp_prog_push(nfp_prog, insn);
254 }
255 
256 static void emit_rtn(struct nfp_prog *nfp_prog, swreg base, u8 defer)
257 {
258 	struct nfp_insn_ur_regs reg;
259 	int err;
260 
261 	err = swreg_to_unrestricted(reg_none(), base, reg_imm(0), &reg);
262 	if (err) {
263 		nfp_prog->error = err;
264 		return;
265 	}
266 
267 	__emit_br_alu(nfp_prog, reg.areg, reg.breg, 0, defer, reg.dst_lmextn,
268 		      reg.src_lmextn);
269 }
270 
271 static void
272 __emit_immed(struct nfp_prog *nfp_prog, u16 areg, u16 breg, u16 imm_hi,
273 	     enum immed_width width, bool invert,
274 	     enum immed_shift shift, bool wr_both,
275 	     bool dst_lmextn, bool src_lmextn)
276 {
277 	u64 insn;
278 
279 	insn = OP_IMMED_BASE |
280 		FIELD_PREP(OP_IMMED_A_SRC, areg) |
281 		FIELD_PREP(OP_IMMED_B_SRC, breg) |
282 		FIELD_PREP(OP_IMMED_IMM, imm_hi) |
283 		FIELD_PREP(OP_IMMED_WIDTH, width) |
284 		FIELD_PREP(OP_IMMED_INV, invert) |
285 		FIELD_PREP(OP_IMMED_SHIFT, shift) |
286 		FIELD_PREP(OP_IMMED_WR_AB, wr_both) |
287 		FIELD_PREP(OP_IMMED_SRC_LMEXTN, src_lmextn) |
288 		FIELD_PREP(OP_IMMED_DST_LMEXTN, dst_lmextn);
289 
290 	nfp_prog_push(nfp_prog, insn);
291 }
292 
293 static void
294 emit_immed(struct nfp_prog *nfp_prog, swreg dst, u16 imm,
295 	   enum immed_width width, bool invert, enum immed_shift shift)
296 {
297 	struct nfp_insn_ur_regs reg;
298 	int err;
299 
300 	if (swreg_type(dst) == NN_REG_IMM) {
301 		nfp_prog->error = -EFAULT;
302 		return;
303 	}
304 
305 	err = swreg_to_unrestricted(dst, dst, reg_imm(imm & 0xff), &reg);
306 	if (err) {
307 		nfp_prog->error = err;
308 		return;
309 	}
310 
311 	/* Use reg.dst when destination is No-Dest. */
312 	__emit_immed(nfp_prog,
313 		     swreg_type(dst) == NN_REG_NONE ? reg.dst : reg.areg,
314 		     reg.breg, imm >> 8, width, invert, shift,
315 		     reg.wr_both, reg.dst_lmextn, reg.src_lmextn);
316 }
317 
318 static void
319 __emit_shf(struct nfp_prog *nfp_prog, u16 dst, enum alu_dst_ab dst_ab,
320 	   enum shf_sc sc, u8 shift,
321 	   u16 areg, enum shf_op op, u16 breg, bool i8, bool sw, bool wr_both,
322 	   bool dst_lmextn, bool src_lmextn)
323 {
324 	u64 insn;
325 
326 	if (!FIELD_FIT(OP_SHF_SHIFT, shift)) {
327 		nfp_prog->error = -EFAULT;
328 		return;
329 	}
330 
331 	/* NFP shift instruction has something special. If shift direction is
332 	 * left then shift amount of 1 to 31 is specified as 32 minus the amount
333 	 * to shift.
334 	 *
335 	 * But no need to do this for indirect shift which has shift amount be
336 	 * 0. Even after we do this subtraction, shift amount 0 will be turned
337 	 * into 32 which will eventually be encoded the same as 0 because only
338 	 * low 5 bits are encoded, but shift amount be 32 will fail the
339 	 * FIELD_PREP check done later on shift mask (0x1f), due to 32 is out of
340 	 * mask range.
341 	 */
342 	if (sc == SHF_SC_L_SHF && shift)
343 		shift = 32 - shift;
344 
345 	insn = OP_SHF_BASE |
346 		FIELD_PREP(OP_SHF_A_SRC, areg) |
347 		FIELD_PREP(OP_SHF_SC, sc) |
348 		FIELD_PREP(OP_SHF_B_SRC, breg) |
349 		FIELD_PREP(OP_SHF_I8, i8) |
350 		FIELD_PREP(OP_SHF_SW, sw) |
351 		FIELD_PREP(OP_SHF_DST, dst) |
352 		FIELD_PREP(OP_SHF_SHIFT, shift) |
353 		FIELD_PREP(OP_SHF_OP, op) |
354 		FIELD_PREP(OP_SHF_DST_AB, dst_ab) |
355 		FIELD_PREP(OP_SHF_WR_AB, wr_both) |
356 		FIELD_PREP(OP_SHF_SRC_LMEXTN, src_lmextn) |
357 		FIELD_PREP(OP_SHF_DST_LMEXTN, dst_lmextn);
358 
359 	nfp_prog_push(nfp_prog, insn);
360 }
361 
362 static void
363 emit_shf(struct nfp_prog *nfp_prog, swreg dst,
364 	 swreg lreg, enum shf_op op, swreg rreg, enum shf_sc sc, u8 shift)
365 {
366 	struct nfp_insn_re_regs reg;
367 	int err;
368 
369 	err = swreg_to_restricted(dst, lreg, rreg, &reg, true);
370 	if (err) {
371 		nfp_prog->error = err;
372 		return;
373 	}
374 
375 	__emit_shf(nfp_prog, reg.dst, reg.dst_ab, sc, shift,
376 		   reg.areg, op, reg.breg, reg.i8, reg.swap, reg.wr_both,
377 		   reg.dst_lmextn, reg.src_lmextn);
378 }
379 
380 static void
381 emit_shf_indir(struct nfp_prog *nfp_prog, swreg dst,
382 	       swreg lreg, enum shf_op op, swreg rreg, enum shf_sc sc)
383 {
384 	if (sc == SHF_SC_R_ROT) {
385 		pr_err("indirect shift is not allowed on rotation\n");
386 		nfp_prog->error = -EFAULT;
387 		return;
388 	}
389 
390 	emit_shf(nfp_prog, dst, lreg, op, rreg, sc, 0);
391 }
392 
393 static void
394 __emit_alu(struct nfp_prog *nfp_prog, u16 dst, enum alu_dst_ab dst_ab,
395 	   u16 areg, enum alu_op op, u16 breg, bool swap, bool wr_both,
396 	   bool dst_lmextn, bool src_lmextn)
397 {
398 	u64 insn;
399 
400 	insn = OP_ALU_BASE |
401 		FIELD_PREP(OP_ALU_A_SRC, areg) |
402 		FIELD_PREP(OP_ALU_B_SRC, breg) |
403 		FIELD_PREP(OP_ALU_DST, dst) |
404 		FIELD_PREP(OP_ALU_SW, swap) |
405 		FIELD_PREP(OP_ALU_OP, op) |
406 		FIELD_PREP(OP_ALU_DST_AB, dst_ab) |
407 		FIELD_PREP(OP_ALU_WR_AB, wr_both) |
408 		FIELD_PREP(OP_ALU_SRC_LMEXTN, src_lmextn) |
409 		FIELD_PREP(OP_ALU_DST_LMEXTN, dst_lmextn);
410 
411 	nfp_prog_push(nfp_prog, insn);
412 }
413 
414 static void
415 emit_alu(struct nfp_prog *nfp_prog, swreg dst,
416 	 swreg lreg, enum alu_op op, swreg rreg)
417 {
418 	struct nfp_insn_ur_regs reg;
419 	int err;
420 
421 	err = swreg_to_unrestricted(dst, lreg, rreg, &reg);
422 	if (err) {
423 		nfp_prog->error = err;
424 		return;
425 	}
426 
427 	__emit_alu(nfp_prog, reg.dst, reg.dst_ab,
428 		   reg.areg, op, reg.breg, reg.swap, reg.wr_both,
429 		   reg.dst_lmextn, reg.src_lmextn);
430 }
431 
432 static void
433 __emit_mul(struct nfp_prog *nfp_prog, enum alu_dst_ab dst_ab, u16 areg,
434 	   enum mul_type type, enum mul_step step, u16 breg, bool swap,
435 	   bool wr_both, bool dst_lmextn, bool src_lmextn)
436 {
437 	u64 insn;
438 
439 	insn = OP_MUL_BASE |
440 		FIELD_PREP(OP_MUL_A_SRC, areg) |
441 		FIELD_PREP(OP_MUL_B_SRC, breg) |
442 		FIELD_PREP(OP_MUL_STEP, step) |
443 		FIELD_PREP(OP_MUL_DST_AB, dst_ab) |
444 		FIELD_PREP(OP_MUL_SW, swap) |
445 		FIELD_PREP(OP_MUL_TYPE, type) |
446 		FIELD_PREP(OP_MUL_WR_AB, wr_both) |
447 		FIELD_PREP(OP_MUL_SRC_LMEXTN, src_lmextn) |
448 		FIELD_PREP(OP_MUL_DST_LMEXTN, dst_lmextn);
449 
450 	nfp_prog_push(nfp_prog, insn);
451 }
452 
453 static void
454 emit_mul(struct nfp_prog *nfp_prog, swreg lreg, enum mul_type type,
455 	 enum mul_step step, swreg rreg)
456 {
457 	struct nfp_insn_ur_regs reg;
458 	u16 areg;
459 	int err;
460 
461 	if (type == MUL_TYPE_START && step != MUL_STEP_NONE) {
462 		nfp_prog->error = -EINVAL;
463 		return;
464 	}
465 
466 	if (step == MUL_LAST || step == MUL_LAST_2) {
467 		/* When type is step and step Number is LAST or LAST2, left
468 		 * source is used as destination.
469 		 */
470 		err = swreg_to_unrestricted(lreg, reg_none(), rreg, &reg);
471 		areg = reg.dst;
472 	} else {
473 		err = swreg_to_unrestricted(reg_none(), lreg, rreg, &reg);
474 		areg = reg.areg;
475 	}
476 
477 	if (err) {
478 		nfp_prog->error = err;
479 		return;
480 	}
481 
482 	__emit_mul(nfp_prog, reg.dst_ab, areg, type, step, reg.breg, reg.swap,
483 		   reg.wr_both, reg.dst_lmextn, reg.src_lmextn);
484 }
485 
486 static void
487 __emit_ld_field(struct nfp_prog *nfp_prog, enum shf_sc sc,
488 		u8 areg, u8 bmask, u8 breg, u8 shift, bool imm8,
489 		bool zero, bool swap, bool wr_both,
490 		bool dst_lmextn, bool src_lmextn)
491 {
492 	u64 insn;
493 
494 	insn = OP_LDF_BASE |
495 		FIELD_PREP(OP_LDF_A_SRC, areg) |
496 		FIELD_PREP(OP_LDF_SC, sc) |
497 		FIELD_PREP(OP_LDF_B_SRC, breg) |
498 		FIELD_PREP(OP_LDF_I8, imm8) |
499 		FIELD_PREP(OP_LDF_SW, swap) |
500 		FIELD_PREP(OP_LDF_ZF, zero) |
501 		FIELD_PREP(OP_LDF_BMASK, bmask) |
502 		FIELD_PREP(OP_LDF_SHF, shift) |
503 		FIELD_PREP(OP_LDF_WR_AB, wr_both) |
504 		FIELD_PREP(OP_LDF_SRC_LMEXTN, src_lmextn) |
505 		FIELD_PREP(OP_LDF_DST_LMEXTN, dst_lmextn);
506 
507 	nfp_prog_push(nfp_prog, insn);
508 }
509 
510 static void
511 emit_ld_field_any(struct nfp_prog *nfp_prog, swreg dst, u8 bmask, swreg src,
512 		  enum shf_sc sc, u8 shift, bool zero)
513 {
514 	struct nfp_insn_re_regs reg;
515 	int err;
516 
517 	/* Note: ld_field is special as it uses one of the src regs as dst */
518 	err = swreg_to_restricted(dst, dst, src, &reg, true);
519 	if (err) {
520 		nfp_prog->error = err;
521 		return;
522 	}
523 
524 	__emit_ld_field(nfp_prog, sc, reg.areg, bmask, reg.breg, shift,
525 			reg.i8, zero, reg.swap, reg.wr_both,
526 			reg.dst_lmextn, reg.src_lmextn);
527 }
528 
529 static void
530 emit_ld_field(struct nfp_prog *nfp_prog, swreg dst, u8 bmask, swreg src,
531 	      enum shf_sc sc, u8 shift)
532 {
533 	emit_ld_field_any(nfp_prog, dst, bmask, src, sc, shift, false);
534 }
535 
536 static void
537 __emit_lcsr(struct nfp_prog *nfp_prog, u16 areg, u16 breg, bool wr, u16 addr,
538 	    bool dst_lmextn, bool src_lmextn)
539 {
540 	u64 insn;
541 
542 	insn = OP_LCSR_BASE |
543 		FIELD_PREP(OP_LCSR_A_SRC, areg) |
544 		FIELD_PREP(OP_LCSR_B_SRC, breg) |
545 		FIELD_PREP(OP_LCSR_WRITE, wr) |
546 		FIELD_PREP(OP_LCSR_ADDR, addr / 4) |
547 		FIELD_PREP(OP_LCSR_SRC_LMEXTN, src_lmextn) |
548 		FIELD_PREP(OP_LCSR_DST_LMEXTN, dst_lmextn);
549 
550 	nfp_prog_push(nfp_prog, insn);
551 }
552 
553 static void emit_csr_wr(struct nfp_prog *nfp_prog, swreg src, u16 addr)
554 {
555 	struct nfp_insn_ur_regs reg;
556 	int err;
557 
558 	/* This instruction takes immeds instead of reg_none() for the ignored
559 	 * operand, but we can't encode 2 immeds in one instr with our normal
560 	 * swreg infra so if param is an immed, we encode as reg_none() and
561 	 * copy the immed to both operands.
562 	 */
563 	if (swreg_type(src) == NN_REG_IMM) {
564 		err = swreg_to_unrestricted(reg_none(), src, reg_none(), &reg);
565 		reg.breg = reg.areg;
566 	} else {
567 		err = swreg_to_unrestricted(reg_none(), src, reg_imm(0), &reg);
568 	}
569 	if (err) {
570 		nfp_prog->error = err;
571 		return;
572 	}
573 
574 	__emit_lcsr(nfp_prog, reg.areg, reg.breg, true, addr,
575 		    false, reg.src_lmextn);
576 }
577 
578 /* CSR value is read in following immed[gpr, 0] */
579 static void __emit_csr_rd(struct nfp_prog *nfp_prog, u16 addr)
580 {
581 	__emit_lcsr(nfp_prog, 0, 0, false, addr, false, false);
582 }
583 
584 static void emit_nop(struct nfp_prog *nfp_prog)
585 {
586 	__emit_immed(nfp_prog, UR_REG_IMM, UR_REG_IMM, 0, 0, 0, 0, 0, 0, 0);
587 }
588 
589 /* --- Wrappers --- */
590 static bool pack_immed(u32 imm, u16 *val, enum immed_shift *shift)
591 {
592 	if (!(imm & 0xffff0000)) {
593 		*val = imm;
594 		*shift = IMMED_SHIFT_0B;
595 	} else if (!(imm & 0xff0000ff)) {
596 		*val = imm >> 8;
597 		*shift = IMMED_SHIFT_1B;
598 	} else if (!(imm & 0x0000ffff)) {
599 		*val = imm >> 16;
600 		*shift = IMMED_SHIFT_2B;
601 	} else {
602 		return false;
603 	}
604 
605 	return true;
606 }
607 
608 static void wrp_immed(struct nfp_prog *nfp_prog, swreg dst, u32 imm)
609 {
610 	enum immed_shift shift;
611 	u16 val;
612 
613 	if (pack_immed(imm, &val, &shift)) {
614 		emit_immed(nfp_prog, dst, val, IMMED_WIDTH_ALL, false, shift);
615 	} else if (pack_immed(~imm, &val, &shift)) {
616 		emit_immed(nfp_prog, dst, val, IMMED_WIDTH_ALL, true, shift);
617 	} else {
618 		emit_immed(nfp_prog, dst, imm & 0xffff, IMMED_WIDTH_ALL,
619 			   false, IMMED_SHIFT_0B);
620 		emit_immed(nfp_prog, dst, imm >> 16, IMMED_WIDTH_WORD,
621 			   false, IMMED_SHIFT_2B);
622 	}
623 }
624 
625 static void
626 wrp_zext(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, u8 dst)
627 {
628 	if (meta->flags & FLAG_INSN_DO_ZEXT)
629 		wrp_immed(nfp_prog, reg_both(dst + 1), 0);
630 }
631 
632 static void
633 wrp_immed_relo(struct nfp_prog *nfp_prog, swreg dst, u32 imm,
634 	       enum nfp_relo_type relo)
635 {
636 	if (imm > 0xffff) {
637 		pr_err("relocation of a large immediate!\n");
638 		nfp_prog->error = -EFAULT;
639 		return;
640 	}
641 	emit_immed(nfp_prog, dst, imm, IMMED_WIDTH_ALL, false, IMMED_SHIFT_0B);
642 
643 	nfp_prog->prog[nfp_prog->prog_len - 1] |=
644 		FIELD_PREP(OP_RELO_TYPE, relo);
645 }
646 
647 /* ur_load_imm_any() - encode immediate or use tmp register (unrestricted)
648  * If the @imm is small enough encode it directly in operand and return
649  * otherwise load @imm to a spare register and return its encoding.
650  */
651 static swreg ur_load_imm_any(struct nfp_prog *nfp_prog, u32 imm, swreg tmp_reg)
652 {
653 	if (FIELD_FIT(UR_REG_IMM_MAX, imm))
654 		return reg_imm(imm);
655 
656 	wrp_immed(nfp_prog, tmp_reg, imm);
657 	return tmp_reg;
658 }
659 
660 /* re_load_imm_any() - encode immediate or use tmp register (restricted)
661  * If the @imm is small enough encode it directly in operand and return
662  * otherwise load @imm to a spare register and return its encoding.
663  */
664 static swreg re_load_imm_any(struct nfp_prog *nfp_prog, u32 imm, swreg tmp_reg)
665 {
666 	if (FIELD_FIT(RE_REG_IMM_MAX, imm))
667 		return reg_imm(imm);
668 
669 	wrp_immed(nfp_prog, tmp_reg, imm);
670 	return tmp_reg;
671 }
672 
/* Emit @count no-op instructions. */
static void wrp_nops(struct nfp_prog *nfp_prog, unsigned int count)
{
	unsigned int i;

	for (i = 0; i < count; i++)
		emit_nop(nfp_prog);
}
678 
679 static void wrp_mov(struct nfp_prog *nfp_prog, swreg dst, swreg src)
680 {
681 	emit_alu(nfp_prog, dst, reg_none(), ALU_OP_NONE, src);
682 }
683 
684 static void wrp_reg_mov(struct nfp_prog *nfp_prog, u16 dst, u16 src)
685 {
686 	wrp_mov(nfp_prog, reg_both(dst), reg_b(src));
687 }
688 
689 /* wrp_reg_subpart() - load @field_len bytes from @offset of @src, write the
690  * result to @dst from low end.
691  */
692 static void
693 wrp_reg_subpart(struct nfp_prog *nfp_prog, swreg dst, swreg src, u8 field_len,
694 		u8 offset)
695 {
696 	enum shf_sc sc = offset ? SHF_SC_R_SHF : SHF_SC_NONE;
697 	u8 mask = (1 << field_len) - 1;
698 
699 	emit_ld_field_any(nfp_prog, dst, mask, src, sc, offset * 8, true);
700 }
701 
702 /* wrp_reg_or_subpart() - load @field_len bytes from low end of @src, or the
703  * result to @dst from offset, there is no change on the other bits of @dst.
704  */
705 static void
706 wrp_reg_or_subpart(struct nfp_prog *nfp_prog, swreg dst, swreg src,
707 		   u8 field_len, u8 offset)
708 {
709 	enum shf_sc sc = offset ? SHF_SC_L_SHF : SHF_SC_NONE;
710 	u8 mask = ((1 << field_len) - 1) << offset;
711 
712 	emit_ld_field(nfp_prog, dst, mask, src, sc, 32 - offset * 8);
713 }
714 
715 static void
716 addr40_offset(struct nfp_prog *nfp_prog, u8 src_gpr, swreg offset,
717 	      swreg *rega, swreg *regb)
718 {
719 	if (offset == reg_imm(0)) {
720 		*rega = reg_a(src_gpr);
721 		*regb = reg_b(src_gpr + 1);
722 		return;
723 	}
724 
725 	emit_alu(nfp_prog, imm_a(nfp_prog), reg_a(src_gpr), ALU_OP_ADD, offset);
726 	emit_alu(nfp_prog, imm_b(nfp_prog), reg_b(src_gpr + 1), ALU_OP_ADD_C,
727 		 reg_imm(0));
728 	*rega = imm_a(nfp_prog);
729 	*regb = imm_b(nfp_prog);
730 }
731 
732 /* NFP has Command Push Pull bus which supports bluk memory operations. */
733 static int nfp_cpp_memcpy(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
734 {
735 	bool descending_seq = meta->ldst_gather_len < 0;
736 	s16 len = abs(meta->ldst_gather_len);
737 	swreg src_base, off;
738 	bool src_40bit_addr;
739 	unsigned int i;
740 	u8 xfer_num;
741 
742 	off = re_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog));
743 	src_40bit_addr = meta->ptr.type == PTR_TO_MAP_VALUE;
744 	src_base = reg_a(meta->insn.src_reg * 2);
745 	xfer_num = round_up(len, 4) / 4;
746 
747 	if (src_40bit_addr)
748 		addr40_offset(nfp_prog, meta->insn.src_reg * 2, off, &src_base,
749 			      &off);
750 
751 	/* Setup PREV_ALU fields to override memory read length. */
752 	if (len > 32)
753 		wrp_immed(nfp_prog, reg_none(),
754 			  CMD_OVE_LEN | FIELD_PREP(CMD_OV_LEN, xfer_num - 1));
755 
756 	/* Memory read from source addr into transfer-in registers. */
757 	emit_cmd_any(nfp_prog, CMD_TGT_READ32_SWAP,
758 		     src_40bit_addr ? CMD_MODE_40b_BA : CMD_MODE_32b, 0,
759 		     src_base, off, xfer_num - 1, CMD_CTX_SWAP, len > 32);
760 
761 	/* Move from transfer-in to transfer-out. */
762 	for (i = 0; i < xfer_num; i++)
763 		wrp_mov(nfp_prog, reg_xfer(i), reg_xfer(i));
764 
765 	off = re_load_imm_any(nfp_prog, meta->paired_st->off, imm_b(nfp_prog));
766 
767 	if (len <= 8) {
768 		/* Use single direct_ref write8. */
769 		emit_cmd(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 0,
770 			 reg_a(meta->paired_st->dst_reg * 2), off, len - 1,
771 			 CMD_CTX_SWAP);
772 	} else if (len <= 32 && IS_ALIGNED(len, 4)) {
773 		/* Use single direct_ref write32. */
774 		emit_cmd(nfp_prog, CMD_TGT_WRITE32_SWAP, CMD_MODE_32b, 0,
775 			 reg_a(meta->paired_st->dst_reg * 2), off, xfer_num - 1,
776 			 CMD_CTX_SWAP);
777 	} else if (len <= 32) {
778 		/* Use single indirect_ref write8. */
779 		wrp_immed(nfp_prog, reg_none(),
780 			  CMD_OVE_LEN | FIELD_PREP(CMD_OV_LEN, len - 1));
781 		emit_cmd_indir(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 0,
782 			       reg_a(meta->paired_st->dst_reg * 2), off,
783 			       len - 1, CMD_CTX_SWAP);
784 	} else if (IS_ALIGNED(len, 4)) {
785 		/* Use single indirect_ref write32. */
786 		wrp_immed(nfp_prog, reg_none(),
787 			  CMD_OVE_LEN | FIELD_PREP(CMD_OV_LEN, xfer_num - 1));
788 		emit_cmd_indir(nfp_prog, CMD_TGT_WRITE32_SWAP, CMD_MODE_32b, 0,
789 			       reg_a(meta->paired_st->dst_reg * 2), off,
790 			       xfer_num - 1, CMD_CTX_SWAP);
791 	} else if (len <= 40) {
792 		/* Use one direct_ref write32 to write the first 32-bytes, then
793 		 * another direct_ref write8 to write the remaining bytes.
794 		 */
795 		emit_cmd(nfp_prog, CMD_TGT_WRITE32_SWAP, CMD_MODE_32b, 0,
796 			 reg_a(meta->paired_st->dst_reg * 2), off, 7,
797 			 CMD_CTX_SWAP);
798 
799 		off = re_load_imm_any(nfp_prog, meta->paired_st->off + 32,
800 				      imm_b(nfp_prog));
801 		emit_cmd(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 8,
802 			 reg_a(meta->paired_st->dst_reg * 2), off, len - 33,
803 			 CMD_CTX_SWAP);
804 	} else {
805 		/* Use one indirect_ref write32 to write 4-bytes aligned length,
806 		 * then another direct_ref write8 to write the remaining bytes.
807 		 */
808 		u8 new_off;
809 
810 		wrp_immed(nfp_prog, reg_none(),
811 			  CMD_OVE_LEN | FIELD_PREP(CMD_OV_LEN, xfer_num - 2));
812 		emit_cmd_indir(nfp_prog, CMD_TGT_WRITE32_SWAP, CMD_MODE_32b, 0,
813 			       reg_a(meta->paired_st->dst_reg * 2), off,
814 			       xfer_num - 2, CMD_CTX_SWAP);
815 		new_off = meta->paired_st->off + (xfer_num - 1) * 4;
816 		off = re_load_imm_any(nfp_prog, new_off, imm_b(nfp_prog));
817 		emit_cmd(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b,
818 			 xfer_num - 1, reg_a(meta->paired_st->dst_reg * 2), off,
819 			 (len & 0x3) - 1, CMD_CTX_SWAP);
820 	}
821 
822 	/* TODO: The following extra load is to make sure data flow be identical
823 	 *  before and after we do memory copy optimization.
824 	 *
825 	 *  The load destination register is not guaranteed to be dead, so we
826 	 *  need to make sure it is loaded with the value the same as before
827 	 *  this transformation.
828 	 *
829 	 *  These extra loads could be removed once we have accurate register
830 	 *  usage information.
831 	 */
832 	if (descending_seq)
833 		xfer_num = 0;
834 	else if (BPF_SIZE(meta->insn.code) != BPF_DW)
835 		xfer_num = xfer_num - 1;
836 	else
837 		xfer_num = xfer_num - 2;
838 
839 	switch (BPF_SIZE(meta->insn.code)) {
840 	case BPF_B:
841 		wrp_reg_subpart(nfp_prog, reg_both(meta->insn.dst_reg * 2),
842 				reg_xfer(xfer_num), 1,
843 				IS_ALIGNED(len, 4) ? 3 : (len & 3) - 1);
844 		break;
845 	case BPF_H:
846 		wrp_reg_subpart(nfp_prog, reg_both(meta->insn.dst_reg * 2),
847 				reg_xfer(xfer_num), 2, (len & 3) ^ 2);
848 		break;
849 	case BPF_W:
850 		wrp_mov(nfp_prog, reg_both(meta->insn.dst_reg * 2),
851 			reg_xfer(0));
852 		break;
853 	case BPF_DW:
854 		wrp_mov(nfp_prog, reg_both(meta->insn.dst_reg * 2),
855 			reg_xfer(xfer_num));
856 		wrp_mov(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1),
857 			reg_xfer(xfer_num + 1));
858 		break;
859 	}
860 
861 	if (BPF_SIZE(meta->insn.code) != BPF_DW)
862 		wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1), 0);
863 
864 	return 0;
865 }
866 
867 static int
868 data_ld(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, swreg offset,
869 	u8 dst_gpr, int size)
870 {
871 	unsigned int i;
872 	u16 shift, sz;
873 
874 	/* We load the value from the address indicated in @offset and then
875 	 * shift out the data we don't need.  Note: this is big endian!
876 	 */
877 	sz = max(size, 4);
878 	shift = size < 4 ? 4 - size : 0;
879 
880 	emit_cmd(nfp_prog, CMD_TGT_READ8, CMD_MODE_32b, 0,
881 		 pptr_reg(nfp_prog), offset, sz - 1, CMD_CTX_SWAP);
882 
883 	i = 0;
884 	if (shift)
885 		emit_shf(nfp_prog, reg_both(dst_gpr), reg_none(), SHF_OP_NONE,
886 			 reg_xfer(0), SHF_SC_R_SHF, shift * 8);
887 	else
888 		for (; i * 4 < size; i++)
889 			wrp_mov(nfp_prog, reg_both(dst_gpr + i), reg_xfer(i));
890 
891 	if (i < 2)
892 		wrp_zext(nfp_prog, meta, dst_gpr);
893 
894 	return 0;
895 }
896 
897 static int
898 data_ld_host_order(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
899 		   u8 dst_gpr, swreg lreg, swreg rreg, int size,
900 		   enum cmd_mode mode)
901 {
902 	unsigned int i;
903 	u8 mask, sz;
904 
905 	/* We load the value from the address indicated in rreg + lreg and then
906 	 * mask out the data we don't need.  Note: this is little endian!
907 	 */
908 	sz = max(size, 4);
909 	mask = size < 4 ? GENMASK(size - 1, 0) : 0;
910 
911 	emit_cmd(nfp_prog, CMD_TGT_READ32_SWAP, mode, 0,
912 		 lreg, rreg, sz / 4 - 1, CMD_CTX_SWAP);
913 
914 	i = 0;
915 	if (mask)
916 		emit_ld_field_any(nfp_prog, reg_both(dst_gpr), mask,
917 				  reg_xfer(0), SHF_SC_NONE, 0, true);
918 	else
919 		for (; i * 4 < size; i++)
920 			wrp_mov(nfp_prog, reg_both(dst_gpr + i), reg_xfer(i));
921 
922 	if (i < 2)
923 		wrp_zext(nfp_prog, meta, dst_gpr);
924 
925 	return 0;
926 }
927 
928 static int
929 data_ld_host_order_addr32(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
930 			  u8 src_gpr, swreg offset, u8 dst_gpr, u8 size)
931 {
932 	return data_ld_host_order(nfp_prog, meta, dst_gpr, reg_a(src_gpr),
933 				  offset, size, CMD_MODE_32b);
934 }
935 
936 static int
937 data_ld_host_order_addr40(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
938 			  u8 src_gpr, swreg offset, u8 dst_gpr, u8 size)
939 {
940 	swreg rega, regb;
941 
942 	addr40_offset(nfp_prog, src_gpr, offset, &rega, &regb);
943 
944 	return data_ld_host_order(nfp_prog, meta, dst_gpr, rega, regb,
945 				  size, CMD_MODE_40b_BA);
946 }
947 
948 static int
949 construct_data_ind_ld(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
950 		      u16 offset, u16 src, u8 size)
951 {
952 	swreg tmp_reg;
953 
954 	/* Calculate the true offset (src_reg + imm) */
955 	tmp_reg = ur_load_imm_any(nfp_prog, offset, imm_b(nfp_prog));
956 	emit_alu(nfp_prog, imm_both(nfp_prog), reg_a(src), ALU_OP_ADD, tmp_reg);
957 
958 	/* Check packet length (size guaranteed to fit b/c it's u8) */
959 	emit_alu(nfp_prog, imm_a(nfp_prog),
960 		 imm_a(nfp_prog), ALU_OP_ADD, reg_imm(size));
961 	emit_alu(nfp_prog, reg_none(),
962 		 plen_reg(nfp_prog), ALU_OP_SUB, imm_a(nfp_prog));
963 	emit_br_relo(nfp_prog, BR_BLO, BR_OFF_RELO, 0, RELO_BR_GO_ABORT);
964 
965 	/* Load data */
966 	return data_ld(nfp_prog, meta, imm_b(nfp_prog), 0, size);
967 }
968 
969 static int
970 construct_data_ld(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
971 		  u16 offset, u8 size)
972 {
973 	swreg tmp_reg;
974 
975 	/* Check packet length */
976 	tmp_reg = ur_load_imm_any(nfp_prog, offset + size, imm_a(nfp_prog));
977 	emit_alu(nfp_prog, reg_none(), plen_reg(nfp_prog), ALU_OP_SUB, tmp_reg);
978 	emit_br_relo(nfp_prog, BR_BLO, BR_OFF_RELO, 0, RELO_BR_GO_ABORT);
979 
980 	/* Load data */
981 	tmp_reg = re_load_imm_any(nfp_prog, offset, imm_b(nfp_prog));
982 	return data_ld(nfp_prog, meta, tmp_reg, 0, size);
983 }
984 
985 static int
986 data_stx_host_order(struct nfp_prog *nfp_prog, u8 dst_gpr, swreg offset,
987 		    u8 src_gpr, u8 size)
988 {
989 	unsigned int i;
990 
991 	for (i = 0; i * 4 < size; i++)
992 		wrp_mov(nfp_prog, reg_xfer(i), reg_a(src_gpr + i));
993 
994 	emit_cmd(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 0,
995 		 reg_a(dst_gpr), offset, size - 1, CMD_CTX_SWAP);
996 
997 	return 0;
998 }
999 
1000 static int
1001 data_st_host_order(struct nfp_prog *nfp_prog, u8 dst_gpr, swreg offset,
1002 		   u64 imm, u8 size)
1003 {
1004 	wrp_immed(nfp_prog, reg_xfer(0), imm);
1005 	if (size == 8)
1006 		wrp_immed(nfp_prog, reg_xfer(1), imm >> 32);
1007 
1008 	emit_cmd(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 0,
1009 		 reg_a(dst_gpr), offset, size - 1, CMD_CTX_SWAP);
1010 
1011 	return 0;
1012 }
1013 
/* Callback invoked by mem_op_stack() once for every slice of a stack access.
 * @gpr:	GPR the slice is moved to/from
 * @gpr_byte:	first byte of the slice within @gpr
 * @off:	byte offset of the slice as computed by mem_op_stack()
 * @size:	slice length in bytes
 * @first:	this is the first slice of the access
 * @new_gpr:	@gpr differs from the GPR used by the previous slice
 * @last:	this is the last slice of the access
 * @lm3:	mem_op_stack() has programmed NFP_CSR_ACT_LM_ADDR3 for the access
 * @needs_inc:	mem_op_stack() decided the LM pointer has to be incremented
 *
 * A non-zero return value aborts the access and is propagated by
 * mem_op_stack().
 */
typedef int
(*lmem_step)(struct nfp_prog *nfp_prog, u8 gpr, u8 gpr_byte, s32 off,
	     unsigned int size, bool first, bool new_gpr, bool last, bool lm3,
	     bool needs_inc);
1018 
1019 static int
1020 wrp_lmem_load(struct nfp_prog *nfp_prog, u8 dst, u8 dst_byte, s32 off,
1021 	      unsigned int size, bool first, bool new_gpr, bool last, bool lm3,
1022 	      bool needs_inc)
1023 {
1024 	bool should_inc = needs_inc && new_gpr && !last;
1025 	u32 idx, src_byte;
1026 	enum shf_sc sc;
1027 	swreg reg;
1028 	int shf;
1029 	u8 mask;
1030 
1031 	if (WARN_ON_ONCE(dst_byte + size > 4 || off % 4 + size > 4))
1032 		return -EOPNOTSUPP;
1033 
1034 	idx = off / 4;
1035 
1036 	/* Move the entire word */
1037 	if (size == 4) {
1038 		wrp_mov(nfp_prog, reg_both(dst),
1039 			should_inc ? reg_lm_inc(3) : reg_lm(lm3 ? 3 : 0, idx));
1040 		return 0;
1041 	}
1042 
1043 	if (WARN_ON_ONCE(lm3 && idx > RE_REG_LM_IDX_MAX))
1044 		return -EOPNOTSUPP;
1045 
1046 	src_byte = off % 4;
1047 
1048 	mask = (1 << size) - 1;
1049 	mask <<= dst_byte;
1050 
1051 	if (WARN_ON_ONCE(mask > 0xf))
1052 		return -EOPNOTSUPP;
1053 
1054 	shf = abs(src_byte - dst_byte) * 8;
1055 	if (src_byte == dst_byte) {
1056 		sc = SHF_SC_NONE;
1057 	} else if (src_byte < dst_byte) {
1058 		shf = 32 - shf;
1059 		sc = SHF_SC_L_SHF;
1060 	} else {
1061 		sc = SHF_SC_R_SHF;
1062 	}
1063 
1064 	/* ld_field can address fewer indexes, if offset too large do RMW.
1065 	 * Because we RMV twice we waste 2 cycles on unaligned 8 byte writes.
1066 	 */
1067 	if (idx <= RE_REG_LM_IDX_MAX) {
1068 		reg = reg_lm(lm3 ? 3 : 0, idx);
1069 	} else {
1070 		reg = imm_a(nfp_prog);
1071 		/* If it's not the first part of the load and we start a new GPR
1072 		 * that means we are loading a second part of the LMEM word into
1073 		 * a new GPR.  IOW we've already looked that LMEM word and
1074 		 * therefore it has been loaded into imm_a().
1075 		 */
1076 		if (first || !new_gpr)
1077 			wrp_mov(nfp_prog, reg, reg_lm(0, idx));
1078 	}
1079 
1080 	emit_ld_field_any(nfp_prog, reg_both(dst), mask, reg, sc, shf, new_gpr);
1081 
1082 	if (should_inc)
1083 		wrp_mov(nfp_prog, reg_none(), reg_lm_inc(3));
1084 
1085 	return 0;
1086 }
1087 
1088 static int
1089 wrp_lmem_store(struct nfp_prog *nfp_prog, u8 src, u8 src_byte, s32 off,
1090 	       unsigned int size, bool first, bool new_gpr, bool last, bool lm3,
1091 	       bool needs_inc)
1092 {
1093 	bool should_inc = needs_inc && new_gpr && !last;
1094 	u32 idx, dst_byte;
1095 	enum shf_sc sc;
1096 	swreg reg;
1097 	int shf;
1098 	u8 mask;
1099 
1100 	if (WARN_ON_ONCE(src_byte + size > 4 || off % 4 + size > 4))
1101 		return -EOPNOTSUPP;
1102 
1103 	idx = off / 4;
1104 
1105 	/* Move the entire word */
1106 	if (size == 4) {
1107 		wrp_mov(nfp_prog,
1108 			should_inc ? reg_lm_inc(3) : reg_lm(lm3 ? 3 : 0, idx),
1109 			reg_b(src));
1110 		return 0;
1111 	}
1112 
1113 	if (WARN_ON_ONCE(lm3 && idx > RE_REG_LM_IDX_MAX))
1114 		return -EOPNOTSUPP;
1115 
1116 	dst_byte = off % 4;
1117 
1118 	mask = (1 << size) - 1;
1119 	mask <<= dst_byte;
1120 
1121 	if (WARN_ON_ONCE(mask > 0xf))
1122 		return -EOPNOTSUPP;
1123 
1124 	shf = abs(src_byte - dst_byte) * 8;
1125 	if (src_byte == dst_byte) {
1126 		sc = SHF_SC_NONE;
1127 	} else if (src_byte < dst_byte) {
1128 		shf = 32 - shf;
1129 		sc = SHF_SC_L_SHF;
1130 	} else {
1131 		sc = SHF_SC_R_SHF;
1132 	}
1133 
1134 	/* ld_field can address fewer indexes, if offset too large do RMW.
1135 	 * Because we RMV twice we waste 2 cycles on unaligned 8 byte writes.
1136 	 */
1137 	if (idx <= RE_REG_LM_IDX_MAX) {
1138 		reg = reg_lm(lm3 ? 3 : 0, idx);
1139 	} else {
1140 		reg = imm_a(nfp_prog);
1141 		/* Only first and last LMEM locations are going to need RMW,
1142 		 * the middle location will be overwritten fully.
1143 		 */
1144 		if (first || last)
1145 			wrp_mov(nfp_prog, reg, reg_lm(0, idx));
1146 	}
1147 
1148 	emit_ld_field(nfp_prog, reg, mask, reg_b(src), sc, shf);
1149 
1150 	if (new_gpr || last) {
1151 		if (idx > RE_REG_LM_IDX_MAX)
1152 			wrp_mov(nfp_prog, reg_lm(0, idx), reg);
1153 		if (should_inc)
1154 			wrp_mov(nfp_prog, reg_none(), reg_lm_inc(3));
1155 	}
1156 
1157 	return 0;
1158 }
1159 
1160 static int
1161 mem_op_stack(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
1162 	     unsigned int size, unsigned int ptr_off, u8 gpr, u8 ptr_gpr,
1163 	     bool clr_gpr, lmem_step step)
1164 {
1165 	s32 off = nfp_prog->stack_frame_depth + meta->insn.off + ptr_off;
1166 	bool first = true, last;
1167 	bool needs_inc = false;
1168 	swreg stack_off_reg;
1169 	u8 prev_gpr = 255;
1170 	u32 gpr_byte = 0;
1171 	bool lm3 = true;
1172 	int ret;
1173 
1174 	if (meta->ptr_not_const ||
1175 	    meta->flags & FLAG_INSN_PTR_CALLER_STACK_FRAME) {
1176 		/* Use of the last encountered ptr_off is OK, they all have
1177 		 * the same alignment.  Depend on low bits of value being
1178 		 * discarded when written to LMaddr register.
1179 		 */
1180 		stack_off_reg = ur_load_imm_any(nfp_prog, meta->insn.off,
1181 						stack_imm(nfp_prog));
1182 
1183 		emit_alu(nfp_prog, imm_b(nfp_prog),
1184 			 reg_a(ptr_gpr), ALU_OP_ADD, stack_off_reg);
1185 
1186 		needs_inc = true;
1187 	} else if (off + size <= 64) {
1188 		/* We can reach bottom 64B with LMaddr0 */
1189 		lm3 = false;
1190 	} else if (round_down(off, 32) == round_down(off + size - 1, 32)) {
1191 		/* We have to set up a new pointer.  If we know the offset
1192 		 * and the entire access falls into a single 32 byte aligned
1193 		 * window we won't have to increment the LM pointer.
1194 		 * The 32 byte alignment is imporant because offset is ORed in
1195 		 * not added when doing *l$indexN[off].
1196 		 */
1197 		stack_off_reg = ur_load_imm_any(nfp_prog, round_down(off, 32),
1198 						stack_imm(nfp_prog));
1199 		emit_alu(nfp_prog, imm_b(nfp_prog),
1200 			 stack_reg(nfp_prog), ALU_OP_ADD, stack_off_reg);
1201 
1202 		off %= 32;
1203 	} else {
1204 		stack_off_reg = ur_load_imm_any(nfp_prog, round_down(off, 4),
1205 						stack_imm(nfp_prog));
1206 
1207 		emit_alu(nfp_prog, imm_b(nfp_prog),
1208 			 stack_reg(nfp_prog), ALU_OP_ADD, stack_off_reg);
1209 
1210 		needs_inc = true;
1211 	}
1212 	if (lm3) {
1213 		emit_csr_wr(nfp_prog, imm_b(nfp_prog), NFP_CSR_ACT_LM_ADDR3);
1214 		/* For size < 4 one slot will be filled by zeroing of upper. */
1215 		wrp_nops(nfp_prog, clr_gpr && size < 8 ? 2 : 3);
1216 	}
1217 
1218 	if (clr_gpr && size < 8)
1219 		wrp_zext(nfp_prog, meta, gpr);
1220 
1221 	while (size) {
1222 		u32 slice_end;
1223 		u8 slice_size;
1224 
1225 		slice_size = min(size, 4 - gpr_byte);
1226 		slice_end = min(off + slice_size, round_up(off + 1, 4));
1227 		slice_size = slice_end - off;
1228 
1229 		last = slice_size == size;
1230 
1231 		if (needs_inc)
1232 			off %= 4;
1233 
1234 		ret = step(nfp_prog, gpr, gpr_byte, off, slice_size,
1235 			   first, gpr != prev_gpr, last, lm3, needs_inc);
1236 		if (ret)
1237 			return ret;
1238 
1239 		prev_gpr = gpr;
1240 		first = false;
1241 
1242 		gpr_byte += slice_size;
1243 		if (gpr_byte >= 4) {
1244 			gpr_byte -= 4;
1245 			gpr++;
1246 		}
1247 
1248 		size -= slice_size;
1249 		off += slice_size;
1250 	}
1251 
1252 	return 0;
1253 }
1254 
1255 static void
1256 wrp_alu_imm(struct nfp_prog *nfp_prog, u8 dst, enum alu_op alu_op, u32 imm)
1257 {
1258 	swreg tmp_reg;
1259 
1260 	if (alu_op == ALU_OP_AND) {
1261 		if (!imm)
1262 			wrp_immed(nfp_prog, reg_both(dst), 0);
1263 		if (!imm || !~imm)
1264 			return;
1265 	}
1266 	if (alu_op == ALU_OP_OR) {
1267 		if (!~imm)
1268 			wrp_immed(nfp_prog, reg_both(dst), ~0U);
1269 		if (!imm || !~imm)
1270 			return;
1271 	}
1272 	if (alu_op == ALU_OP_XOR) {
1273 		if (!~imm)
1274 			emit_alu(nfp_prog, reg_both(dst), reg_none(),
1275 				 ALU_OP_NOT, reg_b(dst));
1276 		if (!imm || !~imm)
1277 			return;
1278 	}
1279 
1280 	tmp_reg = ur_load_imm_any(nfp_prog, imm, imm_b(nfp_prog));
1281 	emit_alu(nfp_prog, reg_both(dst), reg_a(dst), alu_op, tmp_reg);
1282 }
1283 
1284 static int
1285 wrp_alu64_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
1286 	      enum alu_op alu_op, bool skip)
1287 {
1288 	const struct bpf_insn *insn = &meta->insn;
1289 	u64 imm = insn->imm; /* sign extend */
1290 
1291 	if (skip) {
1292 		meta->flags |= FLAG_INSN_SKIP_NOOP;
1293 		return 0;
1294 	}
1295 
1296 	wrp_alu_imm(nfp_prog, insn->dst_reg * 2, alu_op, imm & ~0U);
1297 	wrp_alu_imm(nfp_prog, insn->dst_reg * 2 + 1, alu_op, imm >> 32);
1298 
1299 	return 0;
1300 }
1301 
1302 static int
1303 wrp_alu64_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
1304 	      enum alu_op alu_op)
1305 {
1306 	u8 dst = meta->insn.dst_reg * 2, src = meta->insn.src_reg * 2;
1307 
1308 	emit_alu(nfp_prog, reg_both(dst), reg_a(dst), alu_op, reg_b(src));
1309 	emit_alu(nfp_prog, reg_both(dst + 1),
1310 		 reg_a(dst + 1), alu_op, reg_b(src + 1));
1311 
1312 	return 0;
1313 }
1314 
1315 static int
1316 wrp_alu32_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
1317 	      enum alu_op alu_op)
1318 {
1319 	const struct bpf_insn *insn = &meta->insn;
1320 	u8 dst = insn->dst_reg * 2;
1321 
1322 	wrp_alu_imm(nfp_prog, dst, alu_op, insn->imm);
1323 	wrp_zext(nfp_prog, meta, dst);
1324 
1325 	return 0;
1326 }
1327 
1328 static int
1329 wrp_alu32_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
1330 	      enum alu_op alu_op)
1331 {
1332 	u8 dst = meta->insn.dst_reg * 2, src = meta->insn.src_reg * 2;
1333 
1334 	emit_alu(nfp_prog, reg_both(dst), reg_a(dst), alu_op, reg_b(src));
1335 	wrp_zext(nfp_prog, meta, dst);
1336 
1337 	return 0;
1338 }
1339 
1340 static void
1341 wrp_test_reg_one(struct nfp_prog *nfp_prog, u8 dst, enum alu_op alu_op, u8 src,
1342 		 enum br_mask br_mask, u16 off)
1343 {
1344 	emit_alu(nfp_prog, reg_none(), reg_a(dst), alu_op, reg_b(src));
1345 	emit_br(nfp_prog, br_mask, off, 0);
1346 }
1347 
1348 static int
1349 wrp_test_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
1350 	     enum alu_op alu_op, enum br_mask br_mask)
1351 {
1352 	const struct bpf_insn *insn = &meta->insn;
1353 
1354 	wrp_test_reg_one(nfp_prog, insn->dst_reg * 2, alu_op,
1355 			 insn->src_reg * 2, br_mask, insn->off);
1356 	if (is_mbpf_jmp64(meta))
1357 		wrp_test_reg_one(nfp_prog, insn->dst_reg * 2 + 1, alu_op,
1358 				 insn->src_reg * 2 + 1, br_mask, insn->off);
1359 
1360 	return 0;
1361 }
1362 
1363 static const struct jmp_code_map {
1364 	enum br_mask br_mask;
1365 	bool swap;
1366 } jmp_code_map[] = {
1367 	[BPF_JGT >> 4]	= { BR_BLO, true },
1368 	[BPF_JGE >> 4]	= { BR_BHS, false },
1369 	[BPF_JLT >> 4]	= { BR_BLO, false },
1370 	[BPF_JLE >> 4]	= { BR_BHS, true },
1371 	[BPF_JSGT >> 4]	= { BR_BLT, true },
1372 	[BPF_JSGE >> 4]	= { BR_BGE, false },
1373 	[BPF_JSLT >> 4]	= { BR_BLT, false },
1374 	[BPF_JSLE >> 4]	= { BR_BGE, true },
1375 };
1376 
1377 static const struct jmp_code_map *nfp_jmp_code_get(struct nfp_insn_meta *meta)
1378 {
1379 	unsigned int op;
1380 
1381 	op = BPF_OP(meta->insn.code) >> 4;
1382 	/* br_mask of 0 is BR_BEQ which we don't use in jump code table */
1383 	if (WARN_ONCE(op >= ARRAY_SIZE(jmp_code_map) ||
1384 		      !jmp_code_map[op].br_mask,
1385 		      "no code found for jump instruction"))
1386 		return NULL;
1387 
1388 	return &jmp_code_map[op];
1389 }
1390 
1391 static int cmp_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1392 {
1393 	const struct bpf_insn *insn = &meta->insn;
1394 	u64 imm = insn->imm; /* sign extend */
1395 	const struct jmp_code_map *code;
1396 	enum alu_op alu_op, carry_op;
1397 	u8 reg = insn->dst_reg * 2;
1398 	swreg tmp_reg;
1399 
1400 	code = nfp_jmp_code_get(meta);
1401 	if (!code)
1402 		return -EINVAL;
1403 
1404 	alu_op = meta->jump_neg_op ? ALU_OP_ADD : ALU_OP_SUB;
1405 	carry_op = meta->jump_neg_op ? ALU_OP_ADD_C : ALU_OP_SUB_C;
1406 
1407 	tmp_reg = ur_load_imm_any(nfp_prog, imm & ~0U, imm_b(nfp_prog));
1408 	if (!code->swap)
1409 		emit_alu(nfp_prog, reg_none(), reg_a(reg), alu_op, tmp_reg);
1410 	else
1411 		emit_alu(nfp_prog, reg_none(), tmp_reg, alu_op, reg_a(reg));
1412 
1413 	if (is_mbpf_jmp64(meta)) {
1414 		tmp_reg = ur_load_imm_any(nfp_prog, imm >> 32, imm_b(nfp_prog));
1415 		if (!code->swap)
1416 			emit_alu(nfp_prog, reg_none(),
1417 				 reg_a(reg + 1), carry_op, tmp_reg);
1418 		else
1419 			emit_alu(nfp_prog, reg_none(),
1420 				 tmp_reg, carry_op, reg_a(reg + 1));
1421 	}
1422 
1423 	emit_br(nfp_prog, code->br_mask, insn->off, 0);
1424 
1425 	return 0;
1426 }
1427 
1428 static int cmp_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1429 {
1430 	const struct bpf_insn *insn = &meta->insn;
1431 	const struct jmp_code_map *code;
1432 	u8 areg, breg;
1433 
1434 	code = nfp_jmp_code_get(meta);
1435 	if (!code)
1436 		return -EINVAL;
1437 
1438 	areg = insn->dst_reg * 2;
1439 	breg = insn->src_reg * 2;
1440 
1441 	if (code->swap) {
1442 		areg ^= breg;
1443 		breg ^= areg;
1444 		areg ^= breg;
1445 	}
1446 
1447 	emit_alu(nfp_prog, reg_none(), reg_a(areg), ALU_OP_SUB, reg_b(breg));
1448 	if (is_mbpf_jmp64(meta))
1449 		emit_alu(nfp_prog, reg_none(),
1450 			 reg_a(areg + 1), ALU_OP_SUB_C, reg_b(breg + 1));
1451 	emit_br(nfp_prog, code->br_mask, insn->off, 0);
1452 
1453 	return 0;
1454 }
1455 
1456 static void wrp_end32(struct nfp_prog *nfp_prog, swreg reg_in, u8 gpr_out)
1457 {
1458 	emit_ld_field(nfp_prog, reg_both(gpr_out), 0xf, reg_in,
1459 		      SHF_SC_R_ROT, 8);
1460 	emit_ld_field(nfp_prog, reg_both(gpr_out), 0x5, reg_a(gpr_out),
1461 		      SHF_SC_R_ROT, 16);
1462 }
1463 
1464 static void
1465 wrp_mul_u32(struct nfp_prog *nfp_prog, swreg dst_hi, swreg dst_lo, swreg lreg,
1466 	    swreg rreg, bool gen_high_half)
1467 {
1468 	emit_mul(nfp_prog, lreg, MUL_TYPE_START, MUL_STEP_NONE, rreg);
1469 	emit_mul(nfp_prog, lreg, MUL_TYPE_STEP_32x32, MUL_STEP_1, rreg);
1470 	emit_mul(nfp_prog, lreg, MUL_TYPE_STEP_32x32, MUL_STEP_2, rreg);
1471 	emit_mul(nfp_prog, lreg, MUL_TYPE_STEP_32x32, MUL_STEP_3, rreg);
1472 	emit_mul(nfp_prog, lreg, MUL_TYPE_STEP_32x32, MUL_STEP_4, rreg);
1473 	emit_mul(nfp_prog, dst_lo, MUL_TYPE_STEP_32x32, MUL_LAST, reg_none());
1474 	if (gen_high_half)
1475 		emit_mul(nfp_prog, dst_hi, MUL_TYPE_STEP_32x32, MUL_LAST_2,
1476 			 reg_none());
1477 	else
1478 		wrp_immed(nfp_prog, dst_hi, 0);
1479 }
1480 
1481 static void
1482 wrp_mul_u16(struct nfp_prog *nfp_prog, swreg dst_hi, swreg dst_lo, swreg lreg,
1483 	    swreg rreg)
1484 {
1485 	emit_mul(nfp_prog, lreg, MUL_TYPE_START, MUL_STEP_NONE, rreg);
1486 	emit_mul(nfp_prog, lreg, MUL_TYPE_STEP_16x16, MUL_STEP_1, rreg);
1487 	emit_mul(nfp_prog, lreg, MUL_TYPE_STEP_16x16, MUL_STEP_2, rreg);
1488 	emit_mul(nfp_prog, dst_lo, MUL_TYPE_STEP_16x16, MUL_LAST, reg_none());
1489 }
1490 
1491 static int
1492 wrp_mul(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
1493 	bool gen_high_half, bool ropnd_from_reg)
1494 {
1495 	swreg multiplier, multiplicand, dst_hi, dst_lo;
1496 	const struct bpf_insn *insn = &meta->insn;
1497 	u32 lopnd_max, ropnd_max;
1498 	u8 dst_reg;
1499 
1500 	dst_reg = insn->dst_reg;
1501 	multiplicand = reg_a(dst_reg * 2);
1502 	dst_hi = reg_both(dst_reg * 2 + 1);
1503 	dst_lo = reg_both(dst_reg * 2);
1504 	lopnd_max = meta->umax_dst;
1505 	if (ropnd_from_reg) {
1506 		multiplier = reg_b(insn->src_reg * 2);
1507 		ropnd_max = meta->umax_src;
1508 	} else {
1509 		u32 imm = insn->imm;
1510 
1511 		multiplier = ur_load_imm_any(nfp_prog, imm, imm_b(nfp_prog));
1512 		ropnd_max = imm;
1513 	}
1514 	if (lopnd_max > U16_MAX || ropnd_max > U16_MAX)
1515 		wrp_mul_u32(nfp_prog, dst_hi, dst_lo, multiplicand, multiplier,
1516 			    gen_high_half);
1517 	else
1518 		wrp_mul_u16(nfp_prog, dst_hi, dst_lo, multiplicand, multiplier);
1519 
1520 	return 0;
1521 }
1522 
/* Emit code dividing the 32-bit GPR @dst by the constant @imm.  Depending
 * on @imm the result is a constant zero, the outcome of an unsigned
 * comparison, a plain right shift, or a multiplication by a reciprocal
 * obtained from reciprocal_value_adv().  Always returns 0.
 */
static int wrp_div_imm(struct nfp_prog *nfp_prog, u8 dst, u64 imm)
{
	/* NOTE(review): dst_b is built with reg_a(), not reg_b() - confirm
	 * this is intentional.
	 */
	swreg dst_both = reg_both(dst), dst_a = reg_a(dst), dst_b = reg_a(dst);
	struct reciprocal_value_adv rvalue;
	u8 pre_shift, exp;
	swreg magic;

	/* Divisor does not fit in 32 bits - the result is set to 0 */
	if (imm > U32_MAX) {
		wrp_immed(nfp_prog, dst_both, 0);
		return 0;
	}

	/* NOTE: because we are using "reciprocal_value_adv" which doesn't
	 * support "divisor > (1u << 31)", we need to JIT separate NFP sequence
	 * to handle such case which actually equals to the result of unsigned
	 * comparison "dst >= imm" which could be calculated using the following
	 * NFP sequence:
	 *
	 *  alu[--, dst, -, imm]
	 *  immed[imm, 0]
	 *  alu[dst, imm, +carry, 0]
	 *
	 */
	if (imm > 1U << 31) {
		swreg tmp_b = ur_load_imm_any(nfp_prog, imm, imm_b(nfp_prog));

		emit_alu(nfp_prog, reg_none(), dst_a, ALU_OP_SUB, tmp_b);
		wrp_immed(nfp_prog, imm_a(nfp_prog), 0);
		emit_alu(nfp_prog, dst_both, imm_a(nfp_prog), ALU_OP_ADD_C,
			 reg_imm(0));
		return 0;
	}

	rvalue = reciprocal_value_adv(imm, 32);
	/* exp is taken from the first computation, before any pre-shift */
	exp = rvalue.exp;
	if (rvalue.is_wide_m && !(imm & 1)) {
		/* Even divisor with a wide multiplier: shift out the trailing
		 * zero bits and recompute the reciprocal for the reduced
		 * divisor and precision.
		 */
		pre_shift = fls(imm & -imm) - 1;
		rvalue = reciprocal_value_adv(imm >> pre_shift, 32 - pre_shift);
	} else {
		pre_shift = 0;
	}
	magic = ur_load_imm_any(nfp_prog, rvalue.m, imm_b(nfp_prog));
	if (imm == 1U << exp) {
		/* Power of two divisor - a right shift is all it takes */
		emit_shf(nfp_prog, dst_both, reg_none(), SHF_OP_NONE, dst_b,
			 SHF_SC_R_SHF, exp);
	} else if (rvalue.is_wide_m) {
		/* Wide multiplier: t = mul_hi(dst, magic);
		 * dst = (((dst - t) >> 1) + t) >> (sh - 1)
		 */
		wrp_mul_u32(nfp_prog, imm_both(nfp_prog), reg_none(), dst_a,
			    magic, true);
		emit_alu(nfp_prog, dst_both, dst_a, ALU_OP_SUB,
			 imm_b(nfp_prog));
		emit_shf(nfp_prog, dst_both, reg_none(), SHF_OP_NONE, dst_b,
			 SHF_SC_R_SHF, 1);
		emit_alu(nfp_prog, dst_both, dst_a, ALU_OP_ADD,
			 imm_b(nfp_prog));
		emit_shf(nfp_prog, dst_both, reg_none(), SHF_OP_NONE, dst_b,
			 SHF_SC_R_SHF, rvalue.sh - 1);
	} else {
		/* dst = mul_hi(dst >> pre_shift, magic) >> sh */
		if (pre_shift)
			emit_shf(nfp_prog, dst_both, reg_none(), SHF_OP_NONE,
				 dst_b, SHF_SC_R_SHF, pre_shift);
		wrp_mul_u32(nfp_prog, dst_both, reg_none(), dst_a, magic, true);
		emit_shf(nfp_prog, dst_both, reg_none(), SHF_OP_NONE,
			 dst_b, SHF_SC_R_SHF, rvalue.sh);
	}

	return 0;
}
1590 
/* Emit code for the adjust head helper.  The delta is read from
 * reg_a(2 * 2), which under the "BPF register * 2" GPR numbering used in
 * this file is the low word of BPF R2.  The return code is placed in GPRs
 * 0 and 1: zero on success, -22 (-EINVAL) sign extended to 64 bits when
 * validation fails.
 *
 * Returns 0, or -EINVAL when the emitted code does not end up at the
 * offsets precomputed for the branch targets.
 */
static int adjust_head(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	swreg tmp = imm_a(nfp_prog), tmp_len = imm_b(nfp_prog);
	struct nfp_bpf_cap_adjust_head *adjust_head;
	u32 ret_einval, end;

	adjust_head = &nfp_prog->bpf->adjust_head;

	/* Optimized version - 5 vs 14 cycles */
	if (nfp_prog->adjust_head_location != UINT_MAX) {
		if (WARN_ON_ONCE(nfp_prog->adjust_head_location != meta->n))
			return -EINVAL;

		/* No runtime validation here, just apply the delta */
		emit_alu(nfp_prog, pptr_reg(nfp_prog),
			 reg_a(2 * 2), ALU_OP_ADD, pptr_reg(nfp_prog));
		emit_alu(nfp_prog, plen_reg(nfp_prog),
			 plen_reg(nfp_prog), ALU_OP_SUB, reg_a(2 * 2));
		emit_alu(nfp_prog, pv_len(nfp_prog),
			 pv_len(nfp_prog), ALU_OP_SUB, reg_a(2 * 2));

		wrp_immed(nfp_prog, reg_both(0), 0);
		wrp_immed(nfp_prog, reg_both(1), 0);

		/* TODO: when adjust head is guaranteed to succeed we can
		 * also eliminate the following if (r0 == 0) branch.
		 */

		return 0;
	}

	/* Branch targets are computed up front from the number of
	 * instructions emitted below: 14 up to the -EINVAL block, which
	 * itself is 2 instructions long.  Both values are verified with
	 * nfp_prog_confirm_current_offset() once the code has been emitted.
	 */
	ret_einval = nfp_prog_current_offset(nfp_prog) + 14;
	end = ret_einval + 2;

	/* We need to use a temp because offset is just a part of the pkt ptr */
	emit_alu(nfp_prog, tmp,
		 reg_a(2 * 2), ALU_OP_ADD_2B, pptr_reg(nfp_prog));

	/* Validate result will fit within FW datapath constraints */
	emit_alu(nfp_prog, reg_none(),
		 tmp, ALU_OP_SUB, reg_imm(adjust_head->off_min));
	emit_br(nfp_prog, BR_BLO, ret_einval, 0);
	emit_alu(nfp_prog, reg_none(),
		 reg_imm(adjust_head->off_max), ALU_OP_SUB, tmp);
	emit_br(nfp_prog, BR_BLO, ret_einval, 0);

	/* Validate the length is at least ETH_HLEN */
	emit_alu(nfp_prog, tmp_len,
		 plen_reg(nfp_prog), ALU_OP_SUB, reg_a(2 * 2));
	emit_alu(nfp_prog, reg_none(),
		 tmp_len, ALU_OP_SUB, reg_imm(ETH_HLEN));
	emit_br(nfp_prog, BR_BMI, ret_einval, 0);

	/* Load the ret code */
	wrp_immed(nfp_prog, reg_both(0), 0);
	wrp_immed(nfp_prog, reg_both(1), 0);

	/* Modify the packet metadata */
	emit_ld_field(nfp_prog, pptr_reg(nfp_prog), 0x3, tmp, SHF_SC_NONE, 0);

	/* Skip over the -EINVAL ret code (defer 2) */
	emit_br(nfp_prog, BR_UNC, end, 2);

	/* The two length updates below fill the defer slots of the branch */
	emit_alu(nfp_prog, plen_reg(nfp_prog),
		 plen_reg(nfp_prog), ALU_OP_SUB, reg_a(2 * 2));
	emit_alu(nfp_prog, pv_len(nfp_prog),
		 pv_len(nfp_prog), ALU_OP_SUB, reg_a(2 * 2));

	/* return -EINVAL target */
	if (!nfp_prog_confirm_current_offset(nfp_prog, ret_einval))
		return -EINVAL;

	wrp_immed(nfp_prog, reg_both(0), -22);
	wrp_immed(nfp_prog, reg_both(1), ~0);

	if (!nfp_prog_confirm_current_offset(nfp_prog, end))
		return -EINVAL;

	return 0;
}
1670 
/* Emit code for the adjust tail helper.  The delta is read from
 * reg_a(2 * 2), which under the "BPF register * 2" GPR numbering used in
 * this file is the low word of BPF R2.  The return code is placed in GPRs
 * 0 and 1: zero on success, -22 (-EINVAL) sign extended to 64 bits when
 * validation fails.
 *
 * Returns 0, or -EINVAL when the emitted code does not end up at the
 * offsets precomputed for the branch targets.
 */
static int adjust_tail(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	u32 ret_einval, end;
	swreg plen, delta;

	BUILD_BUG_ON(plen_reg(nfp_prog) != reg_b(STATIC_REG_PKT_LEN));

	plen = imm_a(nfp_prog);
	delta = reg_a(2 * 2);

	/* Branch targets are computed up front from the number of
	 * instructions emitted below: 9 up to the -EINVAL block, 11 to the
	 * end of the sequence.  Both values are verified with
	 * nfp_prog_confirm_current_offset() once the code has been emitted.
	 */
	ret_einval = nfp_prog_current_offset(nfp_prog) + 9;
	end = nfp_prog_current_offset(nfp_prog) + 11;

	/* Calculate resulting length */
	emit_alu(nfp_prog, plen, plen_reg(nfp_prog), ALU_OP_ADD, delta);
	/* delta == 0 is not allowed by the kernel, add must overflow to make
	 * length smaller.
	 */
	emit_br(nfp_prog, BR_BCC, ret_einval, 0);

	/* if (new_len < 14) then -EINVAL */
	emit_alu(nfp_prog, reg_none(), plen, ALU_OP_SUB, reg_imm(ETH_HLEN));
	emit_br(nfp_prog, BR_BMI, ret_einval, 0);

	/* Checks passed, apply the delta to both length registers */
	emit_alu(nfp_prog, plen_reg(nfp_prog),
		 plen_reg(nfp_prog), ALU_OP_ADD, delta);
	emit_alu(nfp_prog, pv_len(nfp_prog),
		 pv_len(nfp_prog), ALU_OP_ADD, delta);

	/* Skip the -EINVAL block; the branch is emitted with 2 defer slots
	 * and the two instructions loading the success return code follow it.
	 */
	emit_br(nfp_prog, BR_UNC, end, 2);
	wrp_immed(nfp_prog, reg_both(0), 0);
	wrp_immed(nfp_prog, reg_both(1), 0);

	if (!nfp_prog_confirm_current_offset(nfp_prog, ret_einval))
		return -EINVAL;

	wrp_immed(nfp_prog, reg_both(0), -22);
	wrp_immed(nfp_prog, reg_both(1), ~0);

	if (!nfp_prog_confirm_current_offset(nfp_prog, end))
		return -EINVAL;

	return 0;
}
1715 
1716 static int
1717 map_call_stack_common(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1718 {
1719 	bool load_lm_ptr;
1720 	u32 ret_tgt;
1721 	s64 lm_off;
1722 
1723 	/* We only have to reload LM0 if the key is not at start of stack */
1724 	lm_off = nfp_prog->stack_frame_depth;
1725 	lm_off += meta->arg2.reg.var_off.value + meta->arg2.reg.off;
1726 	load_lm_ptr = meta->arg2.var_off || lm_off;
1727 
1728 	/* Set LM0 to start of key */
1729 	if (load_lm_ptr)
1730 		emit_csr_wr(nfp_prog, reg_b(2 * 2), NFP_CSR_ACT_LM_ADDR0);
1731 	if (meta->func_id == BPF_FUNC_map_update_elem)
1732 		emit_csr_wr(nfp_prog, reg_b(3 * 2), NFP_CSR_ACT_LM_ADDR2);
1733 
1734 	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO + meta->func_id,
1735 		     2, RELO_BR_HELPER);
1736 	ret_tgt = nfp_prog_current_offset(nfp_prog) + 2;
1737 
1738 	/* Load map ID into A0 */
1739 	wrp_mov(nfp_prog, reg_a(0), reg_a(2));
1740 
1741 	/* Load the return address into B0 */
1742 	wrp_immed_relo(nfp_prog, reg_b(0), ret_tgt, RELO_IMMED_REL);
1743 
1744 	if (!nfp_prog_confirm_current_offset(nfp_prog, ret_tgt))
1745 		return -EINVAL;
1746 
1747 	/* Reset the LM0 pointer */
1748 	if (!load_lm_ptr)
1749 		return 0;
1750 
1751 	emit_csr_wr(nfp_prog, stack_reg(nfp_prog), NFP_CSR_ACT_LM_ADDR0);
1752 	wrp_nops(nfp_prog, 3);
1753 
1754 	return 0;
1755 }
1756 
1757 static int
1758 nfp_get_prandom_u32(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1759 {
1760 	__emit_csr_rd(nfp_prog, NFP_CSR_PSEUDO_RND_NUM);
1761 	/* CSR value is read in following immed[gpr, 0] */
1762 	emit_immed(nfp_prog, reg_both(0), 0,
1763 		   IMMED_WIDTH_ALL, false, IMMED_SHIFT_0B);
1764 	emit_immed(nfp_prog, reg_both(1), 0,
1765 		   IMMED_WIDTH_ALL, false, IMMED_SHIFT_0B);
1766 	return 0;
1767 }
1768 
1769 static int
1770 nfp_perf_event_output(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1771 {
1772 	swreg ptr_type;
1773 	u32 ret_tgt;
1774 
1775 	ptr_type = ur_load_imm_any(nfp_prog, meta->arg1.type, imm_a(nfp_prog));
1776 
1777 	ret_tgt = nfp_prog_current_offset(nfp_prog) + 3;
1778 
1779 	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO + meta->func_id,
1780 		     2, RELO_BR_HELPER);
1781 
1782 	/* Load ptr type into A1 */
1783 	wrp_mov(nfp_prog, reg_a(1), ptr_type);
1784 
1785 	/* Load the return address into B0 */
1786 	wrp_immed_relo(nfp_prog, reg_b(0), ret_tgt, RELO_IMMED_REL);
1787 
1788 	if (!nfp_prog_confirm_current_offset(nfp_prog, ret_tgt))
1789 		return -EINVAL;
1790 
1791 	return 0;
1792 }
1793 
1794 static int
1795 nfp_queue_select(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1796 {
1797 	u32 jmp_tgt;
1798 
1799 	jmp_tgt = nfp_prog_current_offset(nfp_prog) + 5;
1800 
1801 	/* Make sure the queue id fits into FW field */
1802 	emit_alu(nfp_prog, reg_none(), reg_a(meta->insn.src_reg * 2),
1803 		 ALU_OP_AND_NOT_B, reg_imm(0xff));
1804 	emit_br(nfp_prog, BR_BEQ, jmp_tgt, 2);
1805 
1806 	/* Set the 'queue selected' bit and the queue value */
1807 	emit_shf(nfp_prog, pv_qsel_set(nfp_prog),
1808 		 pv_qsel_set(nfp_prog), SHF_OP_OR, reg_imm(1),
1809 		 SHF_SC_L_SHF, PKT_VEL_QSEL_SET_BIT);
1810 	emit_ld_field(nfp_prog,
1811 		      pv_qsel_val(nfp_prog), 0x1, reg_b(meta->insn.src_reg * 2),
1812 		      SHF_SC_NONE, 0);
1813 	/* Delay slots end here, we will jump over next instruction if queue
1814 	 * value fits into the field.
1815 	 */
1816 	emit_ld_field(nfp_prog,
1817 		      pv_qsel_val(nfp_prog), 0x1, reg_imm(NFP_NET_RXR_MAX),
1818 		      SHF_SC_NONE, 0);
1819 
1820 	if (!nfp_prog_confirm_current_offset(nfp_prog, jmp_tgt))
1821 		return -EINVAL;
1822 
1823 	return 0;
1824 }
1825 
1826 /* --- Callbacks --- */
1827 static int mov_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1828 {
1829 	const struct bpf_insn *insn = &meta->insn;
1830 	u8 dst = insn->dst_reg * 2;
1831 	u8 src = insn->src_reg * 2;
1832 
1833 	if (insn->src_reg == BPF_REG_10) {
1834 		swreg stack_depth_reg;
1835 
1836 		stack_depth_reg = ur_load_imm_any(nfp_prog,
1837 						  nfp_prog->stack_frame_depth,
1838 						  stack_imm(nfp_prog));
1839 		emit_alu(nfp_prog, reg_both(dst), stack_reg(nfp_prog),
1840 			 ALU_OP_ADD, stack_depth_reg);
1841 		wrp_immed(nfp_prog, reg_both(dst + 1), 0);
1842 	} else {
1843 		wrp_reg_mov(nfp_prog, dst, src);
1844 		wrp_reg_mov(nfp_prog, dst + 1, src + 1);
1845 	}
1846 
1847 	return 0;
1848 }
1849 
1850 static int mov_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1851 {
1852 	u64 imm = meta->insn.imm; /* sign extend */
1853 
1854 	wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2), imm & ~0U);
1855 	wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1), imm >> 32);
1856 
1857 	return 0;
1858 }
1859 
1860 static int xor_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1861 {
1862 	return wrp_alu64_reg(nfp_prog, meta, ALU_OP_XOR);
1863 }
1864 
1865 static int xor_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1866 {
1867 	return wrp_alu64_imm(nfp_prog, meta, ALU_OP_XOR, !meta->insn.imm);
1868 }
1869 
1870 static int and_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1871 {
1872 	return wrp_alu64_reg(nfp_prog, meta, ALU_OP_AND);
1873 }
1874 
1875 static int and_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1876 {
1877 	return wrp_alu64_imm(nfp_prog, meta, ALU_OP_AND, !~meta->insn.imm);
1878 }
1879 
1880 static int or_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1881 {
1882 	return wrp_alu64_reg(nfp_prog, meta, ALU_OP_OR);
1883 }
1884 
1885 static int or_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1886 {
1887 	return wrp_alu64_imm(nfp_prog, meta, ALU_OP_OR, !meta->insn.imm);
1888 }
1889 
1890 static int add_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1891 {
1892 	const struct bpf_insn *insn = &meta->insn;
1893 
1894 	emit_alu(nfp_prog, reg_both(insn->dst_reg * 2),
1895 		 reg_a(insn->dst_reg * 2), ALU_OP_ADD,
1896 		 reg_b(insn->src_reg * 2));
1897 	emit_alu(nfp_prog, reg_both(insn->dst_reg * 2 + 1),
1898 		 reg_a(insn->dst_reg * 2 + 1), ALU_OP_ADD_C,
1899 		 reg_b(insn->src_reg * 2 + 1));
1900 
1901 	return 0;
1902 }
1903 
1904 static int add_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1905 {
1906 	const struct bpf_insn *insn = &meta->insn;
1907 	u64 imm = insn->imm; /* sign extend */
1908 
1909 	wrp_alu_imm(nfp_prog, insn->dst_reg * 2, ALU_OP_ADD, imm & ~0U);
1910 	wrp_alu_imm(nfp_prog, insn->dst_reg * 2 + 1, ALU_OP_ADD_C, imm >> 32);
1911 
1912 	return 0;
1913 }
1914 
1915 static int sub_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1916 {
1917 	const struct bpf_insn *insn = &meta->insn;
1918 
1919 	emit_alu(nfp_prog, reg_both(insn->dst_reg * 2),
1920 		 reg_a(insn->dst_reg * 2), ALU_OP_SUB,
1921 		 reg_b(insn->src_reg * 2));
1922 	emit_alu(nfp_prog, reg_both(insn->dst_reg * 2 + 1),
1923 		 reg_a(insn->dst_reg * 2 + 1), ALU_OP_SUB_C,
1924 		 reg_b(insn->src_reg * 2 + 1));
1925 
1926 	return 0;
1927 }
1928 
1929 static int sub_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1930 {
1931 	const struct bpf_insn *insn = &meta->insn;
1932 	u64 imm = insn->imm; /* sign extend */
1933 
1934 	wrp_alu_imm(nfp_prog, insn->dst_reg * 2, ALU_OP_SUB, imm & ~0U);
1935 	wrp_alu_imm(nfp_prog, insn->dst_reg * 2 + 1, ALU_OP_SUB_C, imm >> 32);
1936 
1937 	return 0;
1938 }
1939 
1940 static int mul_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1941 {
1942 	return wrp_mul(nfp_prog, meta, true, true);
1943 }
1944 
1945 static int mul_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1946 {
1947 	return wrp_mul(nfp_prog, meta, true, false);
1948 }
1949 
1950 static int div_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1951 {
1952 	const struct bpf_insn *insn = &meta->insn;
1953 
1954 	return wrp_div_imm(nfp_prog, insn->dst_reg * 2, insn->imm);
1955 }
1956 
1957 static int div_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1958 {
1959 	/* NOTE: verifier hook has rejected cases for which verifier doesn't
1960 	 * know whether the source operand is constant or not.
1961 	 */
1962 	return wrp_div_imm(nfp_prog, meta->insn.dst_reg * 2, meta->umin_src);
1963 }
1964 
1965 static int neg_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1966 {
1967 	const struct bpf_insn *insn = &meta->insn;
1968 
1969 	emit_alu(nfp_prog, reg_both(insn->dst_reg * 2), reg_imm(0),
1970 		 ALU_OP_SUB, reg_b(insn->dst_reg * 2));
1971 	emit_alu(nfp_prog, reg_both(insn->dst_reg * 2 + 1), reg_imm(0),
1972 		 ALU_OP_SUB_C, reg_b(insn->dst_reg * 2 + 1));
1973 
1974 	return 0;
1975 }
1976 
1977 /* Pseudo code:
1978  *   if shift_amt >= 32
1979  *     dst_high = dst_low << shift_amt[4:0]
1980  *     dst_low = 0;
1981  *   else
1982  *     dst_high = (dst_high, dst_low) >> (32 - shift_amt)
1983  *     dst_low = dst_low << shift_amt
1984  *
1985  * The indirect shift will use the same logic at runtime.
1986  */
1987 static int __shl_imm64(struct nfp_prog *nfp_prog, u8 dst, u8 shift_amt)
1988 {
1989 	if (!shift_amt)
1990 		return 0;
1991 
1992 	if (shift_amt < 32) {
1993 		emit_shf(nfp_prog, reg_both(dst + 1), reg_a(dst + 1),
1994 			 SHF_OP_NONE, reg_b(dst), SHF_SC_R_DSHF,
1995 			 32 - shift_amt);
1996 		emit_shf(nfp_prog, reg_both(dst), reg_none(), SHF_OP_NONE,
1997 			 reg_b(dst), SHF_SC_L_SHF, shift_amt);
1998 	} else if (shift_amt == 32) {
1999 		wrp_reg_mov(nfp_prog, dst + 1, dst);
2000 		wrp_immed(nfp_prog, reg_both(dst), 0);
2001 	} else if (shift_amt > 32) {
2002 		emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_NONE,
2003 			 reg_b(dst), SHF_SC_L_SHF, shift_amt - 32);
2004 		wrp_immed(nfp_prog, reg_both(dst), 0);
2005 	}
2006 
2007 	return 0;
2008 }
2009 
2010 static int shl_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2011 {
2012 	const struct bpf_insn *insn = &meta->insn;
2013 	u8 dst = insn->dst_reg * 2;
2014 
2015 	return __shl_imm64(nfp_prog, dst, insn->imm);
2016 }
2017 
/* High word of a 64-bit left shift by a register amount below 32.
 *
 * Emits, in order:
 *   1. imm = 32 - src
 *   2. an OR of imm with 0 that writes no register; it is issued right
 *      before the indirect shift, which takes its shift amount from this
 *      preceding ALU operation
 *   3. dst_high = (dst_high, dst_low) >> (32 - src), as an indirect double
 *      shift
 *
 * @dst and @src are the low GPRs of the destination and source pairs.
 */
static void shl_reg64_lt32_high(struct nfp_prog *nfp_prog, u8 dst, u8 src)
{
	emit_alu(nfp_prog, imm_both(nfp_prog), reg_imm(32), ALU_OP_SUB,
		 reg_b(src));
	emit_alu(nfp_prog, reg_none(), imm_a(nfp_prog), ALU_OP_OR, reg_imm(0));
	emit_shf_indir(nfp_prog, reg_both(dst + 1), reg_a(dst + 1), SHF_OP_NONE,
		       reg_b(dst), SHF_SC_R_DSHF);
}
2026 
/* Low word of a left shift by a register amount below 32:
 * dst_low = dst_low << src.
 *
 * The first instruction (OR of src with 0, no destination) precedes the
 * indirect shift to supply its shift amount.
 *
 * NOTE: for indirect left shift, HIGH part should be calculated first.
 */
static void shl_reg64_lt32_low(struct nfp_prog *nfp_prog, u8 dst, u8 src)
{
	emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_imm(0));
	emit_shf_indir(nfp_prog, reg_both(dst), reg_none(), SHF_OP_NONE,
		       reg_b(dst), SHF_SC_L_SHF);
}
2034 
/* Full 64-bit left shift by a register amount below 32.  The high word is
 * emitted before the low word because the high word computation still reads
 * the unshifted low word.
 */
static void shl_reg64_lt32(struct nfp_prog *nfp_prog, u8 dst, u8 src)
{
	shl_reg64_lt32_high(nfp_prog, dst, src);
	shl_reg64_lt32_low(nfp_prog, dst, src);
}
2040 
/* 64-bit left shift by a register amount of 32 or more: the high word gets
 * the low word shifted left indirectly by src (per the pseudo code above
 * __shl_imm64 only shift_amt[4:0] matters here), then the low word is loaded
 * with 0.
 */
static void shl_reg64_ge32(struct nfp_prog *nfp_prog, u8 dst, u8 src)
{
	/* Issued before the indirect shift to supply its shift amount. */
	emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_imm(0));
	emit_shf_indir(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_NONE,
		       reg_b(dst), SHF_SC_L_SHF);
	wrp_immed(nfp_prog, reg_both(dst), 0);
}
2048 
2049 static int shl_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2050 {
2051 	const struct bpf_insn *insn = &meta->insn;
2052 	u64 umin, umax;
2053 	u8 dst, src;
2054 
2055 	dst = insn->dst_reg * 2;
2056 	umin = meta->umin_src;
2057 	umax = meta->umax_src;
2058 	if (umin == umax)
2059 		return __shl_imm64(nfp_prog, dst, umin);
2060 
2061 	src = insn->src_reg * 2;
2062 	if (umax < 32) {
2063 		shl_reg64_lt32(nfp_prog, dst, src);
2064 	} else if (umin >= 32) {
2065 		shl_reg64_ge32(nfp_prog, dst, src);
2066 	} else {
2067 		/* Generate different instruction sequences depending on runtime
2068 		 * value of shift amount.
2069 		 */
2070 		u16 label_ge32, label_end;
2071 
2072 		label_ge32 = nfp_prog_current_offset(nfp_prog) + 7;
2073 		emit_br_bset(nfp_prog, reg_a(src), 5, label_ge32, 0);
2074 
2075 		shl_reg64_lt32_high(nfp_prog, dst, src);
2076 		label_end = nfp_prog_current_offset(nfp_prog) + 6;
2077 		emit_br(nfp_prog, BR_UNC, label_end, 2);
2078 		/* shl_reg64_lt32_low packed in delay slot. */
2079 		shl_reg64_lt32_low(nfp_prog, dst, src);
2080 
2081 		if (!nfp_prog_confirm_current_offset(nfp_prog, label_ge32))
2082 			return -EINVAL;
2083 		shl_reg64_ge32(nfp_prog, dst, src);
2084 
2085 		if (!nfp_prog_confirm_current_offset(nfp_prog, label_end))
2086 			return -EINVAL;
2087 	}
2088 
2089 	return 0;
2090 }
2091 
2092 /* Pseudo code:
2093  *   if shift_amt >= 32
2094  *     dst_high = 0;
2095  *     dst_low = dst_high >> shift_amt[4:0]
2096  *   else
2097  *     dst_high = dst_high >> shift_amt
2098  *     dst_low = (dst_high, dst_low) >> shift_amt
2099  *
2100  * The indirect shift will use the same logic at runtime.
2101  */
2102 static int __shr_imm64(struct nfp_prog *nfp_prog, u8 dst, u8 shift_amt)
2103 {
2104 	if (!shift_amt)
2105 		return 0;
2106 
2107 	if (shift_amt < 32) {
2108 		emit_shf(nfp_prog, reg_both(dst), reg_a(dst + 1), SHF_OP_NONE,
2109 			 reg_b(dst), SHF_SC_R_DSHF, shift_amt);
2110 		emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_NONE,
2111 			 reg_b(dst + 1), SHF_SC_R_SHF, shift_amt);
2112 	} else if (shift_amt == 32) {
2113 		wrp_reg_mov(nfp_prog, dst, dst + 1);
2114 		wrp_immed(nfp_prog, reg_both(dst + 1), 0);
2115 	} else if (shift_amt > 32) {
2116 		emit_shf(nfp_prog, reg_both(dst), reg_none(), SHF_OP_NONE,
2117 			 reg_b(dst + 1), SHF_SC_R_SHF, shift_amt - 32);
2118 		wrp_immed(nfp_prog, reg_both(dst + 1), 0);
2119 	}
2120 
2121 	return 0;
2122 }
2123 
2124 static int shr_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2125 {
2126 	const struct bpf_insn *insn = &meta->insn;
2127 	u8 dst = insn->dst_reg * 2;
2128 
2129 	return __shr_imm64(nfp_prog, dst, insn->imm);
2130 }
2131 
/* High word of a logical right shift by a register amount below 32:
 * dst_high = dst_high >> src.
 *
 * The first instruction (OR of src with 0, no destination) precedes the
 * indirect shift to supply its shift amount.
 *
 * NOTE: for indirect right shift, LOW part should be calculated first.
 */
static void shr_reg64_lt32_high(struct nfp_prog *nfp_prog, u8 dst, u8 src)
{
	emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_imm(0));
	emit_shf_indir(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_NONE,
		       reg_b(dst + 1), SHF_SC_R_SHF);
}
2139 
/* Low word of a logical right shift by a register amount below 32:
 * dst_low = (dst_high, dst_low) >> src, emitted as an indirect double shift.
 * The leading OR of src with 0 (no destination) supplies the shift amount to
 * the indirect shift that follows it.
 */
static void shr_reg64_lt32_low(struct nfp_prog *nfp_prog, u8 dst, u8 src)
{
	emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_imm(0));
	emit_shf_indir(nfp_prog, reg_both(dst), reg_a(dst + 1), SHF_OP_NONE,
		       reg_b(dst), SHF_SC_R_DSHF);
}
2146 
/* Full 64-bit logical right shift by a register amount below 32.  The low
 * word is emitted before the high word because the low word computation still
 * reads the unshifted high word.
 */
static void shr_reg64_lt32(struct nfp_prog *nfp_prog, u8 dst, u8 src)
{
	shr_reg64_lt32_low(nfp_prog, dst, src);
	shr_reg64_lt32_high(nfp_prog, dst, src);
}
2152 
/* 64-bit logical right shift by a register amount of 32 or more: the low
 * word gets the high word shifted right indirectly by src (per the pseudo
 * code above __shr_imm64 only shift_amt[4:0] matters here), then the high
 * word is loaded with 0.
 */
static void shr_reg64_ge32(struct nfp_prog *nfp_prog, u8 dst, u8 src)
{
	/* Issued before the indirect shift to supply its shift amount. */
	emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_imm(0));
	emit_shf_indir(nfp_prog, reg_both(dst), reg_none(), SHF_OP_NONE,
		       reg_b(dst + 1), SHF_SC_R_SHF);
	wrp_immed(nfp_prog, reg_both(dst + 1), 0);
}
2160 
2161 static int shr_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2162 {
2163 	const struct bpf_insn *insn = &meta->insn;
2164 	u64 umin, umax;
2165 	u8 dst, src;
2166 
2167 	dst = insn->dst_reg * 2;
2168 	umin = meta->umin_src;
2169 	umax = meta->umax_src;
2170 	if (umin == umax)
2171 		return __shr_imm64(nfp_prog, dst, umin);
2172 
2173 	src = insn->src_reg * 2;
2174 	if (umax < 32) {
2175 		shr_reg64_lt32(nfp_prog, dst, src);
2176 	} else if (umin >= 32) {
2177 		shr_reg64_ge32(nfp_prog, dst, src);
2178 	} else {
2179 		/* Generate different instruction sequences depending on runtime
2180 		 * value of shift amount.
2181 		 */
2182 		u16 label_ge32, label_end;
2183 
2184 		label_ge32 = nfp_prog_current_offset(nfp_prog) + 6;
2185 		emit_br_bset(nfp_prog, reg_a(src), 5, label_ge32, 0);
2186 		shr_reg64_lt32_low(nfp_prog, dst, src);
2187 		label_end = nfp_prog_current_offset(nfp_prog) + 6;
2188 		emit_br(nfp_prog, BR_UNC, label_end, 2);
2189 		/* shr_reg64_lt32_high packed in delay slot. */
2190 		shr_reg64_lt32_high(nfp_prog, dst, src);
2191 
2192 		if (!nfp_prog_confirm_current_offset(nfp_prog, label_ge32))
2193 			return -EINVAL;
2194 		shr_reg64_ge32(nfp_prog, dst, src);
2195 
2196 		if (!nfp_prog_confirm_current_offset(nfp_prog, label_end))
2197 			return -EINVAL;
2198 	}
2199 
2200 	return 0;
2201 }
2202 
2203 /* Code logic is the same as __shr_imm64 except ashr requires signedness bit
2204  * told through PREV_ALU result.
2205  */
2206 static int __ashr_imm64(struct nfp_prog *nfp_prog, u8 dst, u8 shift_amt)
2207 {
2208 	if (!shift_amt)
2209 		return 0;
2210 
2211 	if (shift_amt < 32) {
2212 		emit_shf(nfp_prog, reg_both(dst), reg_a(dst + 1), SHF_OP_NONE,
2213 			 reg_b(dst), SHF_SC_R_DSHF, shift_amt);
2214 		/* Set signedness bit. */
2215 		emit_alu(nfp_prog, reg_none(), reg_a(dst + 1), ALU_OP_OR,
2216 			 reg_imm(0));
2217 		emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR,
2218 			 reg_b(dst + 1), SHF_SC_R_SHF, shift_amt);
2219 	} else if (shift_amt == 32) {
2220 		/* NOTE: this also helps setting signedness bit. */
2221 		wrp_reg_mov(nfp_prog, dst, dst + 1);
2222 		emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR,
2223 			 reg_b(dst + 1), SHF_SC_R_SHF, 31);
2224 	} else if (shift_amt > 32) {
2225 		emit_alu(nfp_prog, reg_none(), reg_a(dst + 1), ALU_OP_OR,
2226 			 reg_imm(0));
2227 		emit_shf(nfp_prog, reg_both(dst), reg_none(), SHF_OP_ASHR,
2228 			 reg_b(dst + 1), SHF_SC_R_SHF, shift_amt - 32);
2229 		emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR,
2230 			 reg_b(dst + 1), SHF_SC_R_SHF, 31);
2231 	}
2232 
2233 	return 0;
2234 }
2235 
2236 static int ashr_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2237 {
2238 	const struct bpf_insn *insn = &meta->insn;
2239 	u8 dst = insn->dst_reg * 2;
2240 
2241 	return __ashr_imm64(nfp_prog, dst, insn->imm);
2242 }
2243 
/* High word of an arithmetic right shift by a register amount below 32:
 * dst_high = dst_high >> src with sign fill (SHF_OP_ASHR, indirect amount).
 */
static void ashr_reg64_lt32_high(struct nfp_prog *nfp_prog, u8 dst, u8 src)
{
	/* NOTE: the first insn will set both indirect shift amount (source A)
	 * and signedness bit (MSB of result).
	 */
	emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_b(dst + 1));
	emit_shf_indir(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR,
		       reg_b(dst + 1), SHF_SC_R_SHF);
}
2253 
2254 static void ashr_reg64_lt32_low(struct nfp_prog *nfp_prog, u8 dst, u8 src)
2255 {
2256 	/* NOTE: it is the same as logic shift because we don't need to shift in
2257 	 * signedness bit when the shift amount is less than 32.
2258 	 */
2259 	return shr_reg64_lt32_low(nfp_prog, dst, src);
2260 }
2261 
/* Full 64-bit arithmetic right shift by a register amount below 32.  The low
 * word is emitted first; its computation (shared with the logical shift)
 * still reads the unshifted high word.
 */
static void ashr_reg64_lt32(struct nfp_prog *nfp_prog, u8 dst, u8 src)
{
	ashr_reg64_lt32_low(nfp_prog, dst, src);
	ashr_reg64_lt32_high(nfp_prog, dst, src);
}
2267 
/* 64-bit arithmetic right shift by a register amount of 32 or more: the low
 * word gets the high word shifted right (SHF_OP_ASHR) indirectly by src, then
 * the high word is shifted right by 31 with SHF_OP_ASHR so it holds only the
 * sign.
 */
static void ashr_reg64_ge32(struct nfp_prog *nfp_prog, u8 dst, u8 src)
{
	/* One ALU op provides both the indirect shift amount (operand A) and
	 * the signedness bit (the OR result includes dst_high).
	 */
	emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_b(dst + 1));
	emit_shf_indir(nfp_prog, reg_both(dst), reg_none(), SHF_OP_ASHR,
		       reg_b(dst + 1), SHF_SC_R_SHF);
	emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR,
		 reg_b(dst + 1), SHF_SC_R_SHF, 31);
}
2276 
2277 /* Like ashr_imm64, but need to use indirect shift. */
2278 static int ashr_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2279 {
2280 	const struct bpf_insn *insn = &meta->insn;
2281 	u64 umin, umax;
2282 	u8 dst, src;
2283 
2284 	dst = insn->dst_reg * 2;
2285 	umin = meta->umin_src;
2286 	umax = meta->umax_src;
2287 	if (umin == umax)
2288 		return __ashr_imm64(nfp_prog, dst, umin);
2289 
2290 	src = insn->src_reg * 2;
2291 	if (umax < 32) {
2292 		ashr_reg64_lt32(nfp_prog, dst, src);
2293 	} else if (umin >= 32) {
2294 		ashr_reg64_ge32(nfp_prog, dst, src);
2295 	} else {
2296 		u16 label_ge32, label_end;
2297 
2298 		label_ge32 = nfp_prog_current_offset(nfp_prog) + 6;
2299 		emit_br_bset(nfp_prog, reg_a(src), 5, label_ge32, 0);
2300 		ashr_reg64_lt32_low(nfp_prog, dst, src);
2301 		label_end = nfp_prog_current_offset(nfp_prog) + 6;
2302 		emit_br(nfp_prog, BR_UNC, label_end, 2);
2303 		/* ashr_reg64_lt32_high packed in delay slot. */
2304 		ashr_reg64_lt32_high(nfp_prog, dst, src);
2305 
2306 		if (!nfp_prog_confirm_current_offset(nfp_prog, label_ge32))
2307 			return -EINVAL;
2308 		ashr_reg64_ge32(nfp_prog, dst, src);
2309 
2310 		if (!nfp_prog_confirm_current_offset(nfp_prog, label_end))
2311 			return -EINVAL;
2312 	}
2313 
2314 	return 0;
2315 }
2316 
2317 static int mov_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2318 {
2319 	const struct bpf_insn *insn = &meta->insn;
2320 
2321 	wrp_reg_mov(nfp_prog, insn->dst_reg * 2,  insn->src_reg * 2);
2322 	wrp_immed(nfp_prog, reg_both(insn->dst_reg * 2 + 1), 0);
2323 
2324 	return 0;
2325 }
2326 
2327 static int mov_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2328 {
2329 	const struct bpf_insn *insn = &meta->insn;
2330 
2331 	wrp_immed(nfp_prog, reg_both(insn->dst_reg * 2), insn->imm);
2332 	wrp_immed(nfp_prog, reg_both(insn->dst_reg * 2 + 1), 0);
2333 
2334 	return 0;
2335 }
2336 
/* 32-bit XOR, register-operand variant: wrp_alu32_reg() with ALU_OP_XOR. */
static int xor_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	return wrp_alu32_reg(nfp_prog, meta, ALU_OP_XOR);
}
2341 
/* 32-bit XOR, immediate-operand variant: wrp_alu32_imm() with ALU_OP_XOR. */
static int xor_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	return wrp_alu32_imm(nfp_prog, meta, ALU_OP_XOR);
}
2346 
/* 32-bit AND, register-operand variant: wrp_alu32_reg() with ALU_OP_AND. */
static int and_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	return wrp_alu32_reg(nfp_prog, meta, ALU_OP_AND);
}
2351 
/* 32-bit AND, immediate-operand variant: wrp_alu32_imm() with ALU_OP_AND. */
static int and_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	return wrp_alu32_imm(nfp_prog, meta, ALU_OP_AND);
}
2356 
/* 32-bit OR, register-operand variant: wrp_alu32_reg() with ALU_OP_OR. */
static int or_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	return wrp_alu32_reg(nfp_prog, meta, ALU_OP_OR);
}
2361 
/* 32-bit OR, immediate-operand variant: wrp_alu32_imm() with ALU_OP_OR. */
static int or_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	return wrp_alu32_imm(nfp_prog, meta, ALU_OP_OR);
}
2366 
/* 32-bit ADD, register-operand variant: wrp_alu32_reg() with ALU_OP_ADD. */
static int add_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	return wrp_alu32_reg(nfp_prog, meta, ALU_OP_ADD);
}
2371 
/* 32-bit ADD, immediate-operand variant: wrp_alu32_imm() with ALU_OP_ADD. */
static int add_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	return wrp_alu32_imm(nfp_prog, meta, ALU_OP_ADD);
}
2376 
/* 32-bit SUB, register-operand variant: wrp_alu32_reg() with ALU_OP_SUB. */
static int sub_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	return wrp_alu32_reg(nfp_prog, meta, ALU_OP_SUB);
}
2381 
/* 32-bit SUB, immediate-operand variant: wrp_alu32_imm() with ALU_OP_SUB. */
static int sub_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	return wrp_alu32_imm(nfp_prog, meta, ALU_OP_SUB);
}
2386 
/* 32-bit multiply, register-operand variant: delegates to wrp_mul() with the
 * first flag cleared and the second set.
 * NOTE(review): the flags presumably mean "64-bit operation" and "operand
 * comes from a register" -- confirm against wrp_mul().
 */
static int mul_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	return wrp_mul(nfp_prog, meta, false, true);
}
2391 
/* 32-bit multiply, immediate-operand variant: delegates to wrp_mul() with
 * both flags cleared.
 * NOTE(review): the flags presumably mean "64-bit operation" and "operand
 * comes from a register" -- confirm against wrp_mul().
 */
static int mul_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	return wrp_mul(nfp_prog, meta, false, false);
}
2396 
/* 32-bit divide by register: reuses the 64-bit handler div_reg64() unchanged
 * and returns its result.
 */
static int div_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	return div_reg64(nfp_prog, meta);
}
2401 
/* 32-bit divide by immediate: reuses the 64-bit handler div_imm64() unchanged
 * and returns its result.
 */
static int div_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	return div_imm64(nfp_prog, meta);
}
2406 
2407 static int neg_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2408 {
2409 	u8 dst = meta->insn.dst_reg * 2;
2410 
2411 	emit_alu(nfp_prog, reg_both(dst), reg_imm(0), ALU_OP_SUB, reg_b(dst));
2412 	wrp_zext(nfp_prog, meta, dst);
2413 
2414 	return 0;
2415 }
2416 
2417 static int
2418 __ashr_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, u8 dst,
2419 	   u8 shift_amt)
2420 {
2421 	if (shift_amt) {
2422 		/* Set signedness bit (MSB of result). */
2423 		emit_alu(nfp_prog, reg_none(), reg_a(dst), ALU_OP_OR,
2424 			 reg_imm(0));
2425 		emit_shf(nfp_prog, reg_both(dst), reg_none(), SHF_OP_ASHR,
2426 			 reg_b(dst), SHF_SC_R_SHF, shift_amt);
2427 	}
2428 	wrp_zext(nfp_prog, meta, dst);
2429 
2430 	return 0;
2431 }
2432 
2433 static int ashr_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2434 {
2435 	const struct bpf_insn *insn = &meta->insn;
2436 	u64 umin, umax;
2437 	u8 dst, src;
2438 
2439 	dst = insn->dst_reg * 2;
2440 	umin = meta->umin_src;
2441 	umax = meta->umax_src;
2442 	if (umin == umax)
2443 		return __ashr_imm(nfp_prog, meta, dst, umin);
2444 
2445 	src = insn->src_reg * 2;
2446 	/* NOTE: the first insn will set both indirect shift amount (source A)
2447 	 * and signedness bit (MSB of result).
2448 	 */
2449 	emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_b(dst));
2450 	emit_shf_indir(nfp_prog, reg_both(dst), reg_none(), SHF_OP_ASHR,
2451 		       reg_b(dst), SHF_SC_R_SHF);
2452 	wrp_zext(nfp_prog, meta, dst);
2453 
2454 	return 0;
2455 }
2456 
2457 static int ashr_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2458 {
2459 	const struct bpf_insn *insn = &meta->insn;
2460 	u8 dst = insn->dst_reg * 2;
2461 
2462 	return __ashr_imm(nfp_prog, meta, dst, insn->imm);
2463 }
2464 
2465 static int
2466 __shr_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, u8 dst,
2467 	  u8 shift_amt)
2468 {
2469 	if (shift_amt)
2470 		emit_shf(nfp_prog, reg_both(dst), reg_none(), SHF_OP_NONE,
2471 			 reg_b(dst), SHF_SC_R_SHF, shift_amt);
2472 	wrp_zext(nfp_prog, meta, dst);
2473 	return 0;
2474 }
2475 
2476 static int shr_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2477 {
2478 	const struct bpf_insn *insn = &meta->insn;
2479 	u8 dst = insn->dst_reg * 2;
2480 
2481 	return __shr_imm(nfp_prog, meta, dst, insn->imm);
2482 }
2483 
2484 static int shr_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2485 {
2486 	const struct bpf_insn *insn = &meta->insn;
2487 	u64 umin, umax;
2488 	u8 dst, src;
2489 
2490 	dst = insn->dst_reg * 2;
2491 	umin = meta->umin_src;
2492 	umax = meta->umax_src;
2493 	if (umin == umax)
2494 		return __shr_imm(nfp_prog, meta, dst, umin);
2495 
2496 	src = insn->src_reg * 2;
2497 	emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_imm(0));
2498 	emit_shf_indir(nfp_prog, reg_both(dst), reg_none(), SHF_OP_NONE,
2499 		       reg_b(dst), SHF_SC_R_SHF);
2500 	wrp_zext(nfp_prog, meta, dst);
2501 	return 0;
2502 }
2503 
2504 static int
2505 __shl_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, u8 dst,
2506 	  u8 shift_amt)
2507 {
2508 	if (shift_amt)
2509 		emit_shf(nfp_prog, reg_both(dst), reg_none(), SHF_OP_NONE,
2510 			 reg_b(dst), SHF_SC_L_SHF, shift_amt);
2511 	wrp_zext(nfp_prog, meta, dst);
2512 	return 0;
2513 }
2514 
2515 static int shl_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2516 {
2517 	const struct bpf_insn *insn = &meta->insn;
2518 	u8 dst = insn->dst_reg * 2;
2519 
2520 	return __shl_imm(nfp_prog, meta, dst, insn->imm);
2521 }
2522 
2523 static int shl_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2524 {
2525 	const struct bpf_insn *insn = &meta->insn;
2526 	u64 umin, umax;
2527 	u8 dst, src;
2528 
2529 	dst = insn->dst_reg * 2;
2530 	umin = meta->umin_src;
2531 	umax = meta->umax_src;
2532 	if (umin == umax)
2533 		return __shl_imm(nfp_prog, meta, dst, umin);
2534 
2535 	src = insn->src_reg * 2;
2536 	shl_reg64_lt32_low(nfp_prog, dst, src);
2537 	wrp_zext(nfp_prog, meta, dst);
2538 	return 0;
2539 }
2540 
2541 static int end_reg32(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2542 {
2543 	const struct bpf_insn *insn = &meta->insn;
2544 	u8 gpr = insn->dst_reg * 2;
2545 
2546 	switch (insn->imm) {
2547 	case 16:
2548 		emit_ld_field(nfp_prog, reg_both(gpr), 0x9, reg_b(gpr),
2549 			      SHF_SC_R_ROT, 8);
2550 		emit_ld_field(nfp_prog, reg_both(gpr), 0xe, reg_a(gpr),
2551 			      SHF_SC_R_SHF, 16);
2552 
2553 		wrp_immed(nfp_prog, reg_both(gpr + 1), 0);
2554 		break;
2555 	case 32:
2556 		wrp_end32(nfp_prog, reg_a(gpr), gpr);
2557 		wrp_immed(nfp_prog, reg_both(gpr + 1), 0);
2558 		break;
2559 	case 64:
2560 		wrp_mov(nfp_prog, imm_a(nfp_prog), reg_b(gpr + 1));
2561 
2562 		wrp_end32(nfp_prog, reg_a(gpr), gpr + 1);
2563 		wrp_end32(nfp_prog, imm_a(nfp_prog), gpr);
2564 		break;
2565 	}
2566 
2567 	return 0;
2568 }
2569 
2570 static int imm_ld8_part2(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2571 {
2572 	struct nfp_insn_meta *prev = nfp_meta_prev(meta);
2573 	u32 imm_lo, imm_hi;
2574 	u8 dst;
2575 
2576 	dst = prev->insn.dst_reg * 2;
2577 	imm_lo = prev->insn.imm;
2578 	imm_hi = meta->insn.imm;
2579 
2580 	wrp_immed(nfp_prog, reg_both(dst), imm_lo);
2581 
2582 	/* mov is always 1 insn, load imm may be two, so try to use mov */
2583 	if (imm_hi == imm_lo)
2584 		wrp_mov(nfp_prog, reg_both(dst + 1), reg_a(dst));
2585 	else
2586 		wrp_immed(nfp_prog, reg_both(dst + 1), imm_hi);
2587 
2588 	return 0;
2589 }
2590 
/* First half of a 64-bit immediate load.  Emits nothing itself; it only
 * registers imm_ld8_part2() as this meta's double_cb, and that callback does
 * the actual load using this instruction and the following one.
 *
 * Always returns 0.
 */
static int imm_ld8(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	meta->double_cb = imm_ld8_part2;
	return 0;
}
2596 
/* 1-byte data load at the offset given by the instruction immediate, via
 * construct_data_ld().
 */
static int data_ld1(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	return construct_data_ld(nfp_prog, meta, meta->insn.imm, 1);
}
2601 
/* 2-byte data load at the offset given by the instruction immediate, via
 * construct_data_ld().
 */
static int data_ld2(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	return construct_data_ld(nfp_prog, meta, meta->insn.imm, 2);
}
2606 
/* 4-byte data load at the offset given by the instruction immediate, via
 * construct_data_ld().
 */
static int data_ld4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	return construct_data_ld(nfp_prog, meta, meta->insn.imm, 4);
}
2611 
/* 1-byte indirect data load via construct_data_ind_ld(): the instruction
 * immediate and the low GPR of the source register (src_reg * 2) are passed
 * as the addressing inputs.
 */
static int data_ind_ld1(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	return construct_data_ind_ld(nfp_prog, meta, meta->insn.imm,
				     meta->insn.src_reg * 2, 1);
}
2617 
/* 2-byte indirect data load via construct_data_ind_ld(): the instruction
 * immediate and the low GPR of the source register (src_reg * 2) are passed
 * as the addressing inputs.
 */
static int data_ind_ld2(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	return construct_data_ind_ld(nfp_prog, meta, meta->insn.imm,
				     meta->insn.src_reg * 2, 2);
}
2623 
/* 4-byte indirect data load via construct_data_ind_ld(): the instruction
 * immediate and the low GPR of the source register (src_reg * 2) are passed
 * as the addressing inputs.
 */
static int data_ind_ld4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	return construct_data_ind_ld(nfp_prog, meta, meta->insn.imm,
				     meta->insn.src_reg * 2, 4);
}
2629 
/* Load of @size bytes from the stack at pointer offset @ptr_off.
 *
 * Delegates to mem_op_stack() with the destination pair (dst_reg * 2) as the
 * data register, the source pair (src_reg * 2) as the pointer register and
 * wrp_lmem_load as the per-access callback.
 * NOTE(review): the boolean "true" presumably asks for the destination to be
 * cleared before loading -- confirm against mem_op_stack().
 */
static int
mem_ldx_stack(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
	      unsigned int size, unsigned int ptr_off)
{
	return mem_op_stack(nfp_prog, meta, size, ptr_off,
			    meta->insn.dst_reg * 2, meta->insn.src_reg * 2,
			    true, wrp_lmem_load);
}
2638 
2639 static int mem_ldx_skb(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2640 		       u8 size)
2641 {
2642 	swreg dst = reg_both(meta->insn.dst_reg * 2);
2643 
2644 	switch (meta->insn.off) {
2645 	case offsetof(struct __sk_buff, len):
2646 		if (size != FIELD_SIZEOF(struct __sk_buff, len))
2647 			return -EOPNOTSUPP;
2648 		wrp_mov(nfp_prog, dst, plen_reg(nfp_prog));
2649 		break;
2650 	case offsetof(struct __sk_buff, data):
2651 		if (size != FIELD_SIZEOF(struct __sk_buff, data))
2652 			return -EOPNOTSUPP;
2653 		wrp_mov(nfp_prog, dst, pptr_reg(nfp_prog));
2654 		break;
2655 	case offsetof(struct __sk_buff, data_end):
2656 		if (size != FIELD_SIZEOF(struct __sk_buff, data_end))
2657 			return -EOPNOTSUPP;
2658 		emit_alu(nfp_prog, dst,
2659 			 plen_reg(nfp_prog), ALU_OP_ADD, pptr_reg(nfp_prog));
2660 		break;
2661 	default:
2662 		return -EOPNOTSUPP;
2663 	}
2664 
2665 	wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1), 0);
2666 
2667 	return 0;
2668 }
2669 
2670 static int mem_ldx_xdp(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2671 		       u8 size)
2672 {
2673 	swreg dst = reg_both(meta->insn.dst_reg * 2);
2674 
2675 	switch (meta->insn.off) {
2676 	case offsetof(struct xdp_md, data):
2677 		if (size != FIELD_SIZEOF(struct xdp_md, data))
2678 			return -EOPNOTSUPP;
2679 		wrp_mov(nfp_prog, dst, pptr_reg(nfp_prog));
2680 		break;
2681 	case offsetof(struct xdp_md, data_end):
2682 		if (size != FIELD_SIZEOF(struct xdp_md, data_end))
2683 			return -EOPNOTSUPP;
2684 		emit_alu(nfp_prog, dst,
2685 			 plen_reg(nfp_prog), ALU_OP_ADD, pptr_reg(nfp_prog));
2686 		break;
2687 	default:
2688 		return -EOPNOTSUPP;
2689 	}
2690 
2691 	wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1), 0);
2692 
2693 	return 0;
2694 }
2695 
2696 static int
2697 mem_ldx_data(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2698 	     unsigned int size)
2699 {
2700 	swreg tmp_reg;
2701 
2702 	tmp_reg = re_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog));
2703 
2704 	return data_ld_host_order_addr32(nfp_prog, meta, meta->insn.src_reg * 2,
2705 					 tmp_reg, meta->insn.dst_reg * 2, size);
2706 }
2707 
2708 static int
2709 mem_ldx_emem(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2710 	     unsigned int size)
2711 {
2712 	swreg tmp_reg;
2713 
2714 	tmp_reg = re_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog));
2715 
2716 	return data_ld_host_order_addr40(nfp_prog, meta, meta->insn.src_reg * 2,
2717 					 tmp_reg, meta->insn.dst_reg * 2, size);
2718 }
2719 
2720 static void
2721 mem_ldx_data_init_pktcache(struct nfp_prog *nfp_prog,
2722 			   struct nfp_insn_meta *meta)
2723 {
2724 	s16 range_start = meta->pkt_cache.range_start;
2725 	s16 range_end = meta->pkt_cache.range_end;
2726 	swreg src_base, off;
2727 	u8 xfer_num, len;
2728 	bool indir;
2729 
2730 	off = re_load_imm_any(nfp_prog, range_start, imm_b(nfp_prog));
2731 	src_base = reg_a(meta->insn.src_reg * 2);
2732 	len = range_end - range_start;
2733 	xfer_num = round_up(len, REG_WIDTH) / REG_WIDTH;
2734 
2735 	indir = len > 8 * REG_WIDTH;
2736 	/* Setup PREV_ALU for indirect mode. */
2737 	if (indir)
2738 		wrp_immed(nfp_prog, reg_none(),
2739 			  CMD_OVE_LEN | FIELD_PREP(CMD_OV_LEN, xfer_num - 1));
2740 
2741 	/* Cache memory into transfer-in registers. */
2742 	emit_cmd_any(nfp_prog, CMD_TGT_READ32_SWAP, CMD_MODE_32b, 0, src_base,
2743 		     off, xfer_num - 1, CMD_CTX_SWAP, indir);
2744 }
2745 
/* Serve a packet load of @size bytes from the packet cache when the load does
 * not start on a REG_WIDTH boundary within the cached range.
 *
 * The requested bytes start @off bytes into transfer register @idx and may
 * spill into the next one or two transfer registers; the pieces are combined
 * into the destination register pair.
 *
 * Always returns 0.
 */
static int
mem_ldx_data_from_pktcache_unaligned(struct nfp_prog *nfp_prog,
				     struct nfp_insn_meta *meta,
				     unsigned int size)
{
	s16 range_start = meta->pkt_cache.range_start;
	/* Offset of this load relative to the start of the cached range. */
	s16 insn_off = meta->insn.off - range_start;
	swreg dst_lo, dst_hi, src_lo, src_mid;
	u8 dst_gpr = meta->insn.dst_reg * 2;
	/* Bytes taken from the first and from the second transfer register. */
	u8 len_lo = size, len_mid = 0;
	/* First transfer register involved and byte offset inside it. */
	u8 idx = insn_off / REG_WIDTH;
	u8 off = insn_off % REG_WIDTH;

	dst_hi = reg_both(dst_gpr + 1);
	dst_lo = reg_both(dst_gpr);
	src_lo = reg_xfer(idx);

	/* The read length could involve as many as three registers. */
	if (size > REG_WIDTH - off) {
		/* Calculate the part in the second register. */
		len_lo = REG_WIDTH - off;
		len_mid = size - len_lo;

		/* Calculate the part in the third register. */
		if (size > 2 * REG_WIDTH - off)
			len_mid = REG_WIDTH;
	}

	/* First piece: len_lo bytes starting at byte off of the first xfer. */
	wrp_reg_subpart(nfp_prog, dst_lo, src_lo, len_lo, off);

	/* Everything fit in the first transfer register. */
	if (!len_mid) {
		wrp_zext(nfp_prog, meta, dst_gpr);
		return 0;
	}

	src_mid = reg_xfer(idx + 1);

	if (size <= REG_WIDTH) {
		/* Result fits in the low word: OR in the remaining bytes. */
		wrp_reg_or_subpart(nfp_prog, dst_lo, src_mid, len_mid, len_lo);
		wrp_zext(nfp_prog, meta, dst_gpr);
	} else {
		swreg src_hi = reg_xfer(idx + 2);

		/* Complete the low word from the second transfer register,
		 * then build the high word from the rest of the second and
		 * the start of the third.
		 */
		wrp_reg_or_subpart(nfp_prog, dst_lo, src_mid,
				   REG_WIDTH - len_lo, len_lo);
		wrp_reg_subpart(nfp_prog, dst_hi, src_mid, len_lo,
				REG_WIDTH - len_lo);
		wrp_reg_or_subpart(nfp_prog, dst_hi, src_hi, REG_WIDTH - len_lo,
				   len_lo);
	}

	return 0;
}
2799 
2800 static int
2801 mem_ldx_data_from_pktcache_aligned(struct nfp_prog *nfp_prog,
2802 				   struct nfp_insn_meta *meta,
2803 				   unsigned int size)
2804 {
2805 	swreg dst_lo, dst_hi, src_lo;
2806 	u8 dst_gpr, idx;
2807 
2808 	idx = (meta->insn.off - meta->pkt_cache.range_start) / REG_WIDTH;
2809 	dst_gpr = meta->insn.dst_reg * 2;
2810 	dst_hi = reg_both(dst_gpr + 1);
2811 	dst_lo = reg_both(dst_gpr);
2812 	src_lo = reg_xfer(idx);
2813 
2814 	if (size < REG_WIDTH) {
2815 		wrp_reg_subpart(nfp_prog, dst_lo, src_lo, size, 0);
2816 		wrp_zext(nfp_prog, meta, dst_gpr);
2817 	} else if (size == REG_WIDTH) {
2818 		wrp_mov(nfp_prog, dst_lo, src_lo);
2819 		wrp_zext(nfp_prog, meta, dst_gpr);
2820 	} else {
2821 		swreg src_hi = reg_xfer(idx + 1);
2822 
2823 		wrp_mov(nfp_prog, dst_lo, src_lo);
2824 		wrp_mov(nfp_prog, dst_hi, src_hi);
2825 	}
2826 
2827 	return 0;
2828 }
2829 
2830 static int
2831 mem_ldx_data_from_pktcache(struct nfp_prog *nfp_prog,
2832 			   struct nfp_insn_meta *meta, unsigned int size)
2833 {
2834 	u8 off = meta->insn.off - meta->pkt_cache.range_start;
2835 
2836 	if (IS_ALIGNED(off, REG_WIDTH))
2837 		return mem_ldx_data_from_pktcache_aligned(nfp_prog, meta, size);
2838 
2839 	return mem_ldx_data_from_pktcache_unaligned(nfp_prog, meta, size);
2840 }
2841 
2842 static int
2843 mem_ldx(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2844 	unsigned int size)
2845 {
2846 	if (meta->ldst_gather_len)
2847 		return nfp_cpp_memcpy(nfp_prog, meta);
2848 
2849 	if (meta->ptr.type == PTR_TO_CTX) {
2850 		if (nfp_prog->type == BPF_PROG_TYPE_XDP)
2851 			return mem_ldx_xdp(nfp_prog, meta, size);
2852 		else
2853 			return mem_ldx_skb(nfp_prog, meta, size);
2854 	}
2855 
2856 	if (meta->ptr.type == PTR_TO_PACKET) {
2857 		if (meta->pkt_cache.range_end) {
2858 			if (meta->pkt_cache.do_init)
2859 				mem_ldx_data_init_pktcache(nfp_prog, meta);
2860 
2861 			return mem_ldx_data_from_pktcache(nfp_prog, meta, size);
2862 		} else {
2863 			return mem_ldx_data(nfp_prog, meta, size);
2864 		}
2865 	}
2866 
2867 	if (meta->ptr.type == PTR_TO_STACK)
2868 		return mem_ldx_stack(nfp_prog, meta, size,
2869 				     meta->ptr.off + meta->ptr.var_off.value);
2870 
2871 	if (meta->ptr.type == PTR_TO_MAP_VALUE)
2872 		return mem_ldx_emem(nfp_prog, meta, size);
2873 
2874 	return -EOPNOTSUPP;
2875 }
2876 
/* 1-byte register load: mem_ldx() with size 1. */
static int mem_ldx1(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	return mem_ldx(nfp_prog, meta, 1);
}
2881 
/* 2-byte register load: mem_ldx() with size 2. */
static int mem_ldx2(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	return mem_ldx(nfp_prog, meta, 2);
}
2886 
/* 4-byte register load: mem_ldx() with size 4. */
static int mem_ldx4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	return mem_ldx(nfp_prog, meta, 4);
}
2891 
/* 8-byte register load: mem_ldx() with size 8. */
static int mem_ldx8(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	return mem_ldx(nfp_prog, meta, 8);
}
2896 
2897 static int
2898 mem_st_data(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2899 	    unsigned int size)
2900 {
2901 	u64 imm = meta->insn.imm; /* sign extend */
2902 	swreg off_reg;
2903 
2904 	off_reg = re_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog));
2905 
2906 	return data_st_host_order(nfp_prog, meta->insn.dst_reg * 2, off_reg,
2907 				  imm, size);
2908 }
2909 
2910 static int mem_st(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2911 		  unsigned int size)
2912 {
2913 	if (meta->ptr.type == PTR_TO_PACKET)
2914 		return mem_st_data(nfp_prog, meta, size);
2915 
2916 	return -EOPNOTSUPP;
2917 }
2918 
/* 1-byte immediate store: mem_st() with size 1. */
static int mem_st1(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	return mem_st(nfp_prog, meta, 1);
}
2923 
/* 2-byte immediate store: mem_st() with size 2. */
static int mem_st2(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	return mem_st(nfp_prog, meta, 2);
}
2928 
/* 4-byte immediate store: mem_st() with size 4. */
static int mem_st4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	return mem_st(nfp_prog, meta, 4);
}
2933 
/* 8-byte immediate store: mem_st() with size 8. */
static int mem_st8(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	return mem_st(nfp_prog, meta, 8);
}
2938 
2939 static int
2940 mem_stx_data(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2941 	     unsigned int size)
2942 {
2943 	swreg off_reg;
2944 
2945 	off_reg = re_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog));
2946 
2947 	return data_stx_host_order(nfp_prog, meta->insn.dst_reg * 2, off_reg,
2948 				   meta->insn.src_reg * 2, size);
2949 }
2950 
/* Store of @size bytes from a register to the stack at pointer offset
 * @ptr_off.
 *
 * Delegates to mem_op_stack() with the source pair (src_reg * 2) as the data
 * register, the destination pair (dst_reg * 2) as the pointer register and
 * wrp_lmem_store as the per-access callback.
 * NOTE(review): the boolean "false" presumably disables clearing of the data
 * register, which is only wanted for loads -- confirm against mem_op_stack().
 */
static int
mem_stx_stack(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
	      unsigned int size, unsigned int ptr_off)
{
	return mem_op_stack(nfp_prog, meta, size, ptr_off,
			    meta->insn.src_reg * 2, meta->insn.dst_reg * 2,
			    false, wrp_lmem_store);
}
2959 
2960 static int mem_stx_xdp(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2961 {
2962 	switch (meta->insn.off) {
2963 	case offsetof(struct xdp_md, rx_queue_index):
2964 		return nfp_queue_select(nfp_prog, meta);
2965 	}
2966 
2967 	WARN_ON_ONCE(1); /* verifier should have rejected bad accesses */
2968 	return -EOPNOTSUPP;
2969 }
2970 
2971 static int
2972 mem_stx(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2973 	unsigned int size)
2974 {
2975 	if (meta->ptr.type == PTR_TO_PACKET)
2976 		return mem_stx_data(nfp_prog, meta, size);
2977 
2978 	if (meta->ptr.type == PTR_TO_STACK)
2979 		return mem_stx_stack(nfp_prog, meta, size,
2980 				     meta->ptr.off + meta->ptr.var_off.value);
2981 
2982 	return -EOPNOTSUPP;
2983 }
2984 
/* Translation callback for BPF_STX | BPF_MEM | BPF_B: 1-byte mem_stx(). */
static int mem_stx1(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	const unsigned int size = 1;

	return mem_stx(nfp_prog, meta, size);
}
2989 
/* Translation callback for BPF_STX | BPF_MEM | BPF_H: 2-byte mem_stx(). */
static int mem_stx2(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	const unsigned int size = 2;

	return mem_stx(nfp_prog, meta, size);
}
2994 
2995 static int mem_stx4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2996 {
2997 	if (meta->ptr.type == PTR_TO_CTX)
2998 		if (nfp_prog->type == BPF_PROG_TYPE_XDP)
2999 			return mem_stx_xdp(nfp_prog, meta);
3000 	return mem_stx(nfp_prog, meta, 4);
3001 }
3002 
/* Translation callback for BPF_STX | BPF_MEM | BPF_DW: 8-byte mem_stx(). */
static int mem_stx8(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	const unsigned int size = 8;

	return mem_stx(nfp_prog, meta, size);
}
3007 
/* Emit NFP code for a BPF atomic add (BPF_STX | BPF_XADD) of the source
 * register to the memory addressed by the destination register plus the
 * instruction offset.
 *
 * @is64 selects the 8-byte variant (mem_xadd8()), otherwise the 4-byte one
 * (mem_xadd4()) is generated.
 *
 * Depending on meta->xadd_maybe_16bit and meta->xadd_over_16bit either the
 * add_imm command, the full add command, or both (with a run-time branch
 * choosing between them) are emitted.
 *
 * Returns 0 on success, or -EINVAL when the number of instructions emitted
 * does not match the offsets predicted up front.
 */
static int
mem_xadd(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, bool is64)
{
	u8 dst_gpr = meta->insn.dst_reg * 2;
	u8 src_gpr = meta->insn.src_reg * 2;
	unsigned int full_add, out;
	swreg addra, addrb, off;

	off = ur_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog));

	/* We can fit 16 bits into command immediate, if we know the immediate
	 * is guaranteed to either always or never fit into 16 bit we only
	 * generate code to handle that particular case, otherwise generate
	 * code for both.
	 */
	/* Predict where the full add sequence starts (full_add) and where the
	 * whole translation ends (out).  The increments below must mirror the
	 * instructions emitted further down; both predictions are verified
	 * with nfp_prog_confirm_current_offset().
	 */
	out = nfp_prog_current_offset(nfp_prog);
	full_add = nfp_prog_current_offset(nfp_prog);

	/* Address + offset computation */
	if (meta->insn.off) {
		out += 2;
		full_add += 2;
	}
	/* add_imm sequence */
	if (meta->xadd_maybe_16bit) {
		out += 3;
		full_add += 3;
	}
	/* Full add command plus the transfer register moves */
	if (meta->xadd_over_16bit)
		out += 2 + is64;
	/* Both variants needed: selection branch sequence and the jump over
	 * the full add after add_imm.
	 */
	if (meta->xadd_maybe_16bit && meta->xadd_over_16bit) {
		out += 5;
		full_add += 5;
	}

	/* Generate the branch for choosing add_imm vs add */
	if (meta->xadd_maybe_16bit && meta->xadd_over_16bit) {
		swreg max_imm = imm_a(nfp_prog);

		wrp_immed(nfp_prog, max_imm, 0xffff);
		emit_alu(nfp_prog, reg_none(),
			 max_imm, ALU_OP_SUB, reg_b(src_gpr));
		emit_alu(nfp_prog, reg_none(),
			 reg_imm(0), ALU_OP_SUB_C, reg_b(src_gpr + 1));
		emit_br(nfp_prog, BR_BLO, full_add, meta->insn.off ? 2 : 0);
		/* defer for add */
	}

	/* If insn has an offset add to the address */
	if (!meta->insn.off) {
		addra = reg_a(dst_gpr);
		addrb = reg_b(dst_gpr + 1);
	} else {
		emit_alu(nfp_prog, imma_a(nfp_prog),
			 reg_a(dst_gpr), ALU_OP_ADD, off);
		emit_alu(nfp_prog, imma_b(nfp_prog),
			 reg_a(dst_gpr + 1), ALU_OP_ADD_C, reg_imm(0));
		addra = imma_a(nfp_prog);
		addrb = imma_b(nfp_prog);
	}

	/* Generate the add_imm if 16 bits are possible */
	if (meta->xadd_maybe_16bit) {
		swreg prev_alu = imm_a(nfp_prog);

		wrp_immed(nfp_prog, prev_alu,
			  FIELD_PREP(CMD_OVE_DATA, 2) |
			  CMD_OVE_LEN |
			  FIELD_PREP(CMD_OV_LEN, 0x8 | is64 << 2));
		wrp_reg_or_subpart(nfp_prog, prev_alu, reg_b(src_gpr), 2, 2);
		emit_cmd_indir(nfp_prog, CMD_TGT_ADD_IMM, CMD_MODE_40b_BA, 0,
			       addra, addrb, 0, CMD_CTX_NO_SWAP);

		/* Skip the full add which follows when both were generated */
		if (meta->xadd_over_16bit)
			emit_br(nfp_prog, BR_UNC, out, 0);
	}

	if (!nfp_prog_confirm_current_offset(nfp_prog, full_add))
		return -EINVAL;

	/* Generate the add if 16 bits are not guaranteed */
	if (meta->xadd_over_16bit) {
		emit_cmd(nfp_prog, CMD_TGT_ADD, CMD_MODE_40b_BA, 0,
			 addra, addrb, is64 << 2,
			 is64 ? CMD_CTX_SWAP_DEFER2 : CMD_CTX_SWAP_DEFER1);

		wrp_mov(nfp_prog, reg_xfer(0), reg_a(src_gpr));
		if (is64)
			wrp_mov(nfp_prog, reg_xfer(1), reg_a(src_gpr + 1));
	}

	if (!nfp_prog_confirm_current_offset(nfp_prog, out))
		return -EINVAL;

	return 0;
}
3102 
/* Translation callback for BPF_STX | BPF_XADD | BPF_W: 32-bit mem_xadd(). */
static int mem_xadd4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	const bool is64 = false;

	return mem_xadd(nfp_prog, meta, is64);
}
3107 
/* Translation callback for BPF_STX | BPF_XADD | BPF_DW: 64-bit mem_xadd(). */
static int mem_xadd8(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	const bool is64 = true;

	return mem_xadd(nfp_prog, meta, is64);
}
3112 
3113 static int jump(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3114 {
3115 	emit_br(nfp_prog, BR_UNC, meta->insn.off, 0);
3116 
3117 	return 0;
3118 }
3119 
3120 static int jeq_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3121 {
3122 	const struct bpf_insn *insn = &meta->insn;
3123 	u64 imm = insn->imm; /* sign extend */
3124 	swreg or1, or2, tmp_reg;
3125 
3126 	or1 = reg_a(insn->dst_reg * 2);
3127 	or2 = reg_b(insn->dst_reg * 2 + 1);
3128 
3129 	if (imm & ~0U) {
3130 		tmp_reg = ur_load_imm_any(nfp_prog, imm & ~0U, imm_b(nfp_prog));
3131 		emit_alu(nfp_prog, imm_a(nfp_prog),
3132 			 reg_a(insn->dst_reg * 2), ALU_OP_XOR, tmp_reg);
3133 		or1 = imm_a(nfp_prog);
3134 	}
3135 
3136 	if (imm >> 32) {
3137 		tmp_reg = ur_load_imm_any(nfp_prog, imm >> 32, imm_b(nfp_prog));
3138 		emit_alu(nfp_prog, imm_b(nfp_prog),
3139 			 reg_a(insn->dst_reg * 2 + 1), ALU_OP_XOR, tmp_reg);
3140 		or2 = imm_b(nfp_prog);
3141 	}
3142 
3143 	emit_alu(nfp_prog, reg_none(), or1, ALU_OP_OR, or2);
3144 	emit_br(nfp_prog, BR_BEQ, insn->off, 0);
3145 
3146 	return 0;
3147 }
3148 
3149 static int jeq32_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3150 {
3151 	const struct bpf_insn *insn = &meta->insn;
3152 	swreg tmp_reg;
3153 
3154 	tmp_reg = ur_load_imm_any(nfp_prog, insn->imm, imm_b(nfp_prog));
3155 	emit_alu(nfp_prog, reg_none(),
3156 		 reg_a(insn->dst_reg * 2), ALU_OP_XOR, tmp_reg);
3157 	emit_br(nfp_prog, BR_BEQ, insn->off, 0);
3158 
3159 	return 0;
3160 }
3161 
3162 static int jset_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3163 {
3164 	const struct bpf_insn *insn = &meta->insn;
3165 	u64 imm = insn->imm; /* sign extend */
3166 	u8 dst_gpr = insn->dst_reg * 2;
3167 	swreg tmp_reg;
3168 
3169 	tmp_reg = ur_load_imm_any(nfp_prog, imm & ~0U, imm_b(nfp_prog));
3170 	emit_alu(nfp_prog, imm_b(nfp_prog),
3171 		 reg_a(dst_gpr), ALU_OP_AND, tmp_reg);
3172 	/* Upper word of the mask can only be 0 or ~0 from sign extension,
3173 	 * so either ignore it or OR the whole thing in.
3174 	 */
3175 	if (is_mbpf_jmp64(meta) && imm >> 32) {
3176 		emit_alu(nfp_prog, reg_none(),
3177 			 reg_a(dst_gpr + 1), ALU_OP_OR, imm_b(nfp_prog));
3178 	}
3179 	emit_br(nfp_prog, BR_BNE, insn->off, 0);
3180 
3181 	return 0;
3182 }
3183 
3184 static int jne_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3185 {
3186 	const struct bpf_insn *insn = &meta->insn;
3187 	u64 imm = insn->imm; /* sign extend */
3188 	bool is_jmp32 = is_mbpf_jmp32(meta);
3189 	swreg tmp_reg;
3190 
3191 	if (!imm) {
3192 		if (is_jmp32)
3193 			emit_alu(nfp_prog, reg_none(), reg_none(), ALU_OP_NONE,
3194 				 reg_b(insn->dst_reg * 2));
3195 		else
3196 			emit_alu(nfp_prog, reg_none(), reg_a(insn->dst_reg * 2),
3197 				 ALU_OP_OR, reg_b(insn->dst_reg * 2 + 1));
3198 		emit_br(nfp_prog, BR_BNE, insn->off, 0);
3199 		return 0;
3200 	}
3201 
3202 	tmp_reg = ur_load_imm_any(nfp_prog, imm & ~0U, imm_b(nfp_prog));
3203 	emit_alu(nfp_prog, reg_none(),
3204 		 reg_a(insn->dst_reg * 2), ALU_OP_XOR, tmp_reg);
3205 	emit_br(nfp_prog, BR_BNE, insn->off, 0);
3206 
3207 	if (is_jmp32)
3208 		return 0;
3209 
3210 	tmp_reg = ur_load_imm_any(nfp_prog, imm >> 32, imm_b(nfp_prog));
3211 	emit_alu(nfp_prog, reg_none(),
3212 		 reg_a(insn->dst_reg * 2 + 1), ALU_OP_XOR, tmp_reg);
3213 	emit_br(nfp_prog, BR_BNE, insn->off, 0);
3214 
3215 	return 0;
3216 }
3217 
3218 static int jeq_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3219 {
3220 	const struct bpf_insn *insn = &meta->insn;
3221 
3222 	emit_alu(nfp_prog, imm_a(nfp_prog), reg_a(insn->dst_reg * 2),
3223 		 ALU_OP_XOR, reg_b(insn->src_reg * 2));
3224 	if (is_mbpf_jmp64(meta)) {
3225 		emit_alu(nfp_prog, imm_b(nfp_prog),
3226 			 reg_a(insn->dst_reg * 2 + 1), ALU_OP_XOR,
3227 			 reg_b(insn->src_reg * 2 + 1));
3228 		emit_alu(nfp_prog, reg_none(), imm_a(nfp_prog), ALU_OP_OR,
3229 			 imm_b(nfp_prog));
3230 	}
3231 	emit_br(nfp_prog, BR_BEQ, insn->off, 0);
3232 
3233 	return 0;
3234 }
3235 
3236 static int jset_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3237 {
3238 	return wrp_test_reg(nfp_prog, meta, ALU_OP_AND, BR_BNE);
3239 }
3240 
3241 static int jne_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3242 {
3243 	return wrp_test_reg(nfp_prog, meta, ALU_OP_XOR, BR_BNE);
3244 }
3245 
/* Emit the call sequence for a BPF-to-BPF (pseudo) call.
 *
 * The sequence is: move the stack pointer past the caller's frame (when the
 * frame is not empty), branch to the callee (directly, or through the
 * register pushing subroutine) while loading the return address into
 * ret_reg(), and finally - at the return target - restore the stack pointer.
 *
 * meta->num_insns_after_br is set to the number of instructions emitted after
 * the branch so that nfp_fixup_branches() can locate the branch again.
 *
 * Returns 0 on success, -ELOOP if the call has no destination recorded, or
 * -EINVAL if the emitted code does not end up at the predicted return target.
 */
static int
bpf_to_bpf_call(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	u32 ret_tgt, stack_depth, offset_br;
	swreg tmp_reg;

	stack_depth = round_up(nfp_prog->stack_frame_depth, STACK_FRAME_ALIGN);
	/* Space for saving the return address is accounted for by the callee,
	 * so stack_depth can be zero for the main function.
	 */
	if (stack_depth) {
		tmp_reg = ur_load_imm_any(nfp_prog, stack_depth,
					  stack_imm(nfp_prog));
		emit_alu(nfp_prog, stack_reg(nfp_prog),
			 stack_reg(nfp_prog), ALU_OP_ADD, tmp_reg);
		emit_csr_wr(nfp_prog, stack_reg(nfp_prog),
			    NFP_CSR_ACT_LM_ADDR0);
	}

	/* Two cases for jumping to the callee:
	 *
	 * - If callee uses and needs to save R6~R9 then:
	 *     1. Put the start offset of the callee into imm_b(). This will
	 *        require a fixup step, as we do not necessarily know this
	 *        address yet.
	 *     2. Put the return address from the callee to the caller into
	 *        register ret_reg().
	 *     3. (After defer slots are consumed) Jump to the subroutine that
	 *        pushes the registers to the stack.
	 *   The subroutine acts as a trampoline, and returns to the address in
	 *   imm_b(), i.e. jumps to the callee.
	 *
	 * - If callee does not need to save R6~R9 then just load return
	 *   address to the caller in ret_reg(), and jump to the callee
	 *   directly.
	 *
	 * Using ret_reg() to pass the return address to the callee is set here
	 * as a convention. The callee can then push this address onto its
	 * stack frame in its prologue. The advantages of passing the return
	 * address through ret_reg(), instead of pushing it to the stack right
	 * here, are the following:
	 * - It looks cleaner.
	 * - If the called function is called multiple time, we get a lower
	 *   program size.
	 * - We save two no-op instructions that should be added just before
	 *   the emit_br() when stack depth is not null otherwise.
	 * - If we ever find a register to hold the return address during whole
	 *   execution of the callee, we will not have to push the return
	 *   address to the stack for leaf functions.
	 */
	if (!meta->jmp_dst) {
		pr_err("BUG: BPF-to-BPF call has no destination recorded\n");
		return -ELOOP;
	}
	if (nfp_prog->subprog[meta->jmp_dst->subprog_idx].needs_reg_push) {
		/* Return target follows the branch and the two immediate
		 * loads (imm_b() and ret_reg()) placed after it.
		 */
		ret_tgt = nfp_prog_current_offset(nfp_prog) + 3;
		emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 2,
			     RELO_BR_GO_CALL_PUSH_REGS);
		offset_br = nfp_prog_current_offset(nfp_prog);
		/* Zero placeholder, patched by nfp_fixup_immed_relo() */
		wrp_immed_relo(nfp_prog, imm_b(nfp_prog), 0, RELO_IMMED_REL);
	} else {
		/* Return target follows the branch and the ret_reg() load */
		ret_tgt = nfp_prog_current_offset(nfp_prog) + 2;
		emit_br(nfp_prog, BR_UNC, meta->insn.imm, 1);
		offset_br = nfp_prog_current_offset(nfp_prog);
	}
	wrp_immed_relo(nfp_prog, ret_reg(nfp_prog), ret_tgt, RELO_IMMED_REL);

	if (!nfp_prog_confirm_current_offset(nfp_prog, ret_tgt))
		return -EINVAL;

	/* Back from the callee: undo the stack pointer adjustment */
	if (stack_depth) {
		tmp_reg = ur_load_imm_any(nfp_prog, stack_depth,
					  stack_imm(nfp_prog));
		emit_alu(nfp_prog, stack_reg(nfp_prog),
			 stack_reg(nfp_prog), ALU_OP_SUB, tmp_reg);
		emit_csr_wr(nfp_prog, stack_reg(nfp_prog),
			    NFP_CSR_ACT_LM_ADDR0);
		wrp_nops(nfp_prog, 3);
	}

	/* Count of instructions emitted after the branch instruction */
	meta->num_insns_after_br = nfp_prog_current_offset(nfp_prog);
	meta->num_insns_after_br -= offset_br;

	return 0;
}
3331 
3332 static int helper_call(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3333 {
3334 	switch (meta->insn.imm) {
3335 	case BPF_FUNC_xdp_adjust_head:
3336 		return adjust_head(nfp_prog, meta);
3337 	case BPF_FUNC_xdp_adjust_tail:
3338 		return adjust_tail(nfp_prog, meta);
3339 	case BPF_FUNC_map_lookup_elem:
3340 	case BPF_FUNC_map_update_elem:
3341 	case BPF_FUNC_map_delete_elem:
3342 		return map_call_stack_common(nfp_prog, meta);
3343 	case BPF_FUNC_get_prandom_u32:
3344 		return nfp_get_prandom_u32(nfp_prog, meta);
3345 	case BPF_FUNC_perf_event_output:
3346 		return nfp_perf_event_output(nfp_prog, meta);
3347 	default:
3348 		WARN_ONCE(1, "verifier allowed unsupported function\n");
3349 		return -EOPNOTSUPP;
3350 	}
3351 }
3352 
/* Translation callback for BPF_JMP | BPF_CALL: BPF-to-BPF (pseudo) calls and
 * helper calls are translated by separate routines.
 */
static int call(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	if (!is_mbpf_pseudo_call(meta))
		return helper_call(nfp_prog, meta);

	return bpf_to_bpf_call(nfp_prog, meta);
}
3360 
3361 static bool nfp_is_main_function(struct nfp_insn_meta *meta)
3362 {
3363 	return meta->subprog_idx == 0;
3364 }
3365 
3366 static int goto_out(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3367 {
3368 	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 0, RELO_BR_GO_OUT);
3369 
3370 	return 0;
3371 }
3372 
3373 static int
3374 nfp_subprog_epilogue(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3375 {
3376 	if (nfp_prog->subprog[meta->subprog_idx].needs_reg_push) {
3377 		/* Pop R6~R9 to the stack via related subroutine.
3378 		 * We loaded the return address to the caller into ret_reg().
3379 		 * This means that the subroutine does not come back here, we
3380 		 * make it jump back to the subprogram caller directly!
3381 		 */
3382 		emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 1,
3383 			     RELO_BR_GO_CALL_POP_REGS);
3384 		/* Pop return address from the stack. */
3385 		wrp_mov(nfp_prog, ret_reg(nfp_prog), reg_lm(0, 0));
3386 	} else {
3387 		/* Pop return address from the stack. */
3388 		wrp_mov(nfp_prog, ret_reg(nfp_prog), reg_lm(0, 0));
3389 		/* Jump back to caller if no callee-saved registers were used
3390 		 * by the subprogram.
3391 		 */
3392 		emit_rtn(nfp_prog, ret_reg(nfp_prog), 0);
3393 	}
3394 
3395 	return 0;
3396 }
3397 
/* Translation callback for BPF_JMP | BPF_EXIT: the main function jumps to
 * the program outro, subprograms return to their caller.
 */
static int jmp_exit(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	if (!nfp_is_main_function(meta))
		return nfp_subprog_epilogue(nfp_prog, meta);

	return goto_out(nfp_prog, meta);
}
3405 
/* Translation callbacks indexed by BPF opcode (insn.code).  Opcodes without
 * an entry are left NULL; nfp_translate() fails with -ENOENT when it meets
 * one of those.
 */
static const instr_cb_t instr_cb[256] = {
	/* 64-bit ALU */
	[BPF_ALU64 | BPF_MOV | BPF_X] =	mov_reg64,
	[BPF_ALU64 | BPF_MOV | BPF_K] =	mov_imm64,
	[BPF_ALU64 | BPF_XOR | BPF_X] =	xor_reg64,
	[BPF_ALU64 | BPF_XOR | BPF_K] =	xor_imm64,
	[BPF_ALU64 | BPF_AND | BPF_X] =	and_reg64,
	[BPF_ALU64 | BPF_AND | BPF_K] =	and_imm64,
	[BPF_ALU64 | BPF_OR | BPF_X] =	or_reg64,
	[BPF_ALU64 | BPF_OR | BPF_K] =	or_imm64,
	[BPF_ALU64 | BPF_ADD | BPF_X] =	add_reg64,
	[BPF_ALU64 | BPF_ADD | BPF_K] =	add_imm64,
	[BPF_ALU64 | BPF_SUB | BPF_X] =	sub_reg64,
	[BPF_ALU64 | BPF_SUB | BPF_K] =	sub_imm64,
	[BPF_ALU64 | BPF_MUL | BPF_X] =	mul_reg64,
	[BPF_ALU64 | BPF_MUL | BPF_K] =	mul_imm64,
	[BPF_ALU64 | BPF_DIV | BPF_X] =	div_reg64,
	[BPF_ALU64 | BPF_DIV | BPF_K] =	div_imm64,
	[BPF_ALU64 | BPF_NEG] =		neg_reg64,
	[BPF_ALU64 | BPF_LSH | BPF_X] =	shl_reg64,
	[BPF_ALU64 | BPF_LSH | BPF_K] =	shl_imm64,
	[BPF_ALU64 | BPF_RSH | BPF_X] =	shr_reg64,
	[BPF_ALU64 | BPF_RSH | BPF_K] =	shr_imm64,
	[BPF_ALU64 | BPF_ARSH | BPF_X] = ashr_reg64,
	[BPF_ALU64 | BPF_ARSH | BPF_K] = ashr_imm64,
	/* 32-bit ALU */
	[BPF_ALU | BPF_MOV | BPF_X] =	mov_reg,
	[BPF_ALU | BPF_MOV | BPF_K] =	mov_imm,
	[BPF_ALU | BPF_XOR | BPF_X] =	xor_reg,
	[BPF_ALU | BPF_XOR | BPF_K] =	xor_imm,
	[BPF_ALU | BPF_AND | BPF_X] =	and_reg,
	[BPF_ALU | BPF_AND | BPF_K] =	and_imm,
	[BPF_ALU | BPF_OR | BPF_X] =	or_reg,
	[BPF_ALU | BPF_OR | BPF_K] =	or_imm,
	[BPF_ALU | BPF_ADD | BPF_X] =	add_reg,
	[BPF_ALU | BPF_ADD | BPF_K] =	add_imm,
	[BPF_ALU | BPF_SUB | BPF_X] =	sub_reg,
	[BPF_ALU | BPF_SUB | BPF_K] =	sub_imm,
	[BPF_ALU | BPF_MUL | BPF_X] =	mul_reg,
	[BPF_ALU | BPF_MUL | BPF_K] =	mul_imm,
	[BPF_ALU | BPF_DIV | BPF_X] =	div_reg,
	[BPF_ALU | BPF_DIV | BPF_K] =	div_imm,
	[BPF_ALU | BPF_NEG] =		neg_reg,
	[BPF_ALU | BPF_LSH | BPF_X] =	shl_reg,
	[BPF_ALU | BPF_LSH | BPF_K] =	shl_imm,
	[BPF_ALU | BPF_RSH | BPF_X] =	shr_reg,
	[BPF_ALU | BPF_RSH | BPF_K] =	shr_imm,
	[BPF_ALU | BPF_ARSH | BPF_X] =	ashr_reg,
	[BPF_ALU | BPF_ARSH | BPF_K] =	ashr_imm,
	[BPF_ALU | BPF_END | BPF_X] =	end_reg32,
	/* Loads */
	[BPF_LD | BPF_IMM | BPF_DW] =	imm_ld8,
	[BPF_LD | BPF_ABS | BPF_B] =	data_ld1,
	[BPF_LD | BPF_ABS | BPF_H] =	data_ld2,
	[BPF_LD | BPF_ABS | BPF_W] =	data_ld4,
	[BPF_LD | BPF_IND | BPF_B] =	data_ind_ld1,
	[BPF_LD | BPF_IND | BPF_H] =	data_ind_ld2,
	[BPF_LD | BPF_IND | BPF_W] =	data_ind_ld4,
	[BPF_LDX | BPF_MEM | BPF_B] =	mem_ldx1,
	[BPF_LDX | BPF_MEM | BPF_H] =	mem_ldx2,
	[BPF_LDX | BPF_MEM | BPF_W] =	mem_ldx4,
	[BPF_LDX | BPF_MEM | BPF_DW] =	mem_ldx8,
	/* Stores and atomic adds */
	[BPF_STX | BPF_MEM | BPF_B] =	mem_stx1,
	[BPF_STX | BPF_MEM | BPF_H] =	mem_stx2,
	[BPF_STX | BPF_MEM | BPF_W] =	mem_stx4,
	[BPF_STX | BPF_MEM | BPF_DW] =	mem_stx8,
	[BPF_STX | BPF_XADD | BPF_W] =	mem_xadd4,
	[BPF_STX | BPF_XADD | BPF_DW] =	mem_xadd8,
	[BPF_ST | BPF_MEM | BPF_B] =	mem_st1,
	[BPF_ST | BPF_MEM | BPF_H] =	mem_st2,
	[BPF_ST | BPF_MEM | BPF_W] =	mem_st4,
	[BPF_ST | BPF_MEM | BPF_DW] =	mem_st8,
	/* 64-bit jumps */
	[BPF_JMP | BPF_JA | BPF_K] =	jump,
	[BPF_JMP | BPF_JEQ | BPF_K] =	jeq_imm,
	[BPF_JMP | BPF_JGT | BPF_K] =	cmp_imm,
	[BPF_JMP | BPF_JGE | BPF_K] =	cmp_imm,
	[BPF_JMP | BPF_JLT | BPF_K] =	cmp_imm,
	[BPF_JMP | BPF_JLE | BPF_K] =	cmp_imm,
	[BPF_JMP | BPF_JSGT | BPF_K] =  cmp_imm,
	[BPF_JMP | BPF_JSGE | BPF_K] =  cmp_imm,
	[BPF_JMP | BPF_JSLT | BPF_K] =  cmp_imm,
	[BPF_JMP | BPF_JSLE | BPF_K] =  cmp_imm,
	[BPF_JMP | BPF_JSET | BPF_K] =	jset_imm,
	[BPF_JMP | BPF_JNE | BPF_K] =	jne_imm,
	[BPF_JMP | BPF_JEQ | BPF_X] =	jeq_reg,
	[BPF_JMP | BPF_JGT | BPF_X] =	cmp_reg,
	[BPF_JMP | BPF_JGE | BPF_X] =	cmp_reg,
	[BPF_JMP | BPF_JLT | BPF_X] =	cmp_reg,
	[BPF_JMP | BPF_JLE | BPF_X] =	cmp_reg,
	[BPF_JMP | BPF_JSGT | BPF_X] =  cmp_reg,
	[BPF_JMP | BPF_JSGE | BPF_X] =  cmp_reg,
	[BPF_JMP | BPF_JSLT | BPF_X] =  cmp_reg,
	[BPF_JMP | BPF_JSLE | BPF_X] =  cmp_reg,
	[BPF_JMP | BPF_JSET | BPF_X] =	jset_reg,
	[BPF_JMP | BPF_JNE | BPF_X] =	jne_reg,
	/* 32-bit jumps; most callbacks are shared with the 64-bit class */
	[BPF_JMP32 | BPF_JEQ | BPF_K] =	jeq32_imm,
	[BPF_JMP32 | BPF_JGT | BPF_K] =	cmp_imm,
	[BPF_JMP32 | BPF_JGE | BPF_K] =	cmp_imm,
	[BPF_JMP32 | BPF_JLT | BPF_K] =	cmp_imm,
	[BPF_JMP32 | BPF_JLE | BPF_K] =	cmp_imm,
	[BPF_JMP32 | BPF_JSGT | BPF_K] =cmp_imm,
	[BPF_JMP32 | BPF_JSGE | BPF_K] =cmp_imm,
	[BPF_JMP32 | BPF_JSLT | BPF_K] =cmp_imm,
	[BPF_JMP32 | BPF_JSLE | BPF_K] =cmp_imm,
	[BPF_JMP32 | BPF_JSET | BPF_K] =jset_imm,
	[BPF_JMP32 | BPF_JNE | BPF_K] =	jne_imm,
	[BPF_JMP32 | BPF_JEQ | BPF_X] =	jeq_reg,
	[BPF_JMP32 | BPF_JGT | BPF_X] =	cmp_reg,
	[BPF_JMP32 | BPF_JGE | BPF_X] =	cmp_reg,
	[BPF_JMP32 | BPF_JLT | BPF_X] =	cmp_reg,
	[BPF_JMP32 | BPF_JLE | BPF_X] =	cmp_reg,
	[BPF_JMP32 | BPF_JSGT | BPF_X] =cmp_reg,
	[BPF_JMP32 | BPF_JSGE | BPF_X] =cmp_reg,
	[BPF_JMP32 | BPF_JSLT | BPF_X] =cmp_reg,
	[BPF_JMP32 | BPF_JSLE | BPF_X] =cmp_reg,
	[BPF_JMP32 | BPF_JSET | BPF_X] =jset_reg,
	[BPF_JMP32 | BPF_JNE | BPF_X] =	jne_reg,
	/* Calls and exit */
	[BPF_JMP | BPF_CALL] =		call,
	[BPF_JMP | BPF_EXIT] =		jmp_exit,
};
3523 
3524 /* --- Assembler logic --- */
3525 static int
3526 nfp_fixup_immed_relo(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
3527 		     struct nfp_insn_meta *jmp_dst, u32 br_idx)
3528 {
3529 	if (immed_get_value(nfp_prog->prog[br_idx + 1])) {
3530 		pr_err("BUG: failed to fix up callee register saving\n");
3531 		return -EINVAL;
3532 	}
3533 
3534 	immed_set_value(&nfp_prog->prog[br_idx + 1], jmp_dst->off);
3535 
3536 	return 0;
3537 }
3538 
3539 static int nfp_fixup_branches(struct nfp_prog *nfp_prog)
3540 {
3541 	struct nfp_insn_meta *meta, *jmp_dst;
3542 	u32 idx, br_idx;
3543 	int err;
3544 
3545 	list_for_each_entry(meta, &nfp_prog->insns, l) {
3546 		if (meta->flags & FLAG_INSN_SKIP_MASK)
3547 			continue;
3548 		if (!is_mbpf_jmp(meta))
3549 			continue;
3550 		if (meta->insn.code == (BPF_JMP | BPF_EXIT) &&
3551 		    !nfp_is_main_function(meta))
3552 			continue;
3553 		if (is_mbpf_helper_call(meta))
3554 			continue;
3555 
3556 		if (list_is_last(&meta->l, &nfp_prog->insns))
3557 			br_idx = nfp_prog->last_bpf_off;
3558 		else
3559 			br_idx = list_next_entry(meta, l)->off - 1;
3560 
3561 		/* For BPF-to-BPF function call, a stack adjustment sequence is
3562 		 * generated after the return instruction. Therefore, we must
3563 		 * withdraw the length of this sequence to have br_idx pointing
3564 		 * to where the "branch" NFP instruction is expected to be.
3565 		 */
3566 		if (is_mbpf_pseudo_call(meta))
3567 			br_idx -= meta->num_insns_after_br;
3568 
3569 		if (!nfp_is_br(nfp_prog->prog[br_idx])) {
3570 			pr_err("Fixup found block not ending in branch %d %02x %016llx!!\n",
3571 			       br_idx, meta->insn.code, nfp_prog->prog[br_idx]);
3572 			return -ELOOP;
3573 		}
3574 
3575 		if (meta->insn.code == (BPF_JMP | BPF_EXIT))
3576 			continue;
3577 
3578 		/* Leave special branches for later */
3579 		if (FIELD_GET(OP_RELO_TYPE, nfp_prog->prog[br_idx]) !=
3580 		    RELO_BR_REL && !is_mbpf_pseudo_call(meta))
3581 			continue;
3582 
3583 		if (!meta->jmp_dst) {
3584 			pr_err("Non-exit jump doesn't have destination info recorded!!\n");
3585 			return -ELOOP;
3586 		}
3587 
3588 		jmp_dst = meta->jmp_dst;
3589 
3590 		if (jmp_dst->flags & FLAG_INSN_SKIP_PREC_DEPENDENT) {
3591 			pr_err("Branch landing on removed instruction!!\n");
3592 			return -ELOOP;
3593 		}
3594 
3595 		if (is_mbpf_pseudo_call(meta) &&
3596 		    nfp_prog->subprog[jmp_dst->subprog_idx].needs_reg_push) {
3597 			err = nfp_fixup_immed_relo(nfp_prog, meta,
3598 						   jmp_dst, br_idx);
3599 			if (err)
3600 				return err;
3601 		}
3602 
3603 		if (FIELD_GET(OP_RELO_TYPE, nfp_prog->prog[br_idx]) !=
3604 		    RELO_BR_REL)
3605 			continue;
3606 
3607 		for (idx = meta->off; idx <= br_idx; idx++) {
3608 			if (!nfp_is_br(nfp_prog->prog[idx]))
3609 				continue;
3610 			br_set_offset(&nfp_prog->prog[idx], jmp_dst->off);
3611 		}
3612 	}
3613 
3614 	return 0;
3615 }
3616 
3617 static void nfp_intro(struct nfp_prog *nfp_prog)
3618 {
3619 	wrp_immed(nfp_prog, plen_reg(nfp_prog), GENMASK(13, 0));
3620 	emit_alu(nfp_prog, plen_reg(nfp_prog),
3621 		 plen_reg(nfp_prog), ALU_OP_AND, pv_len(nfp_prog));
3622 }
3623 
/* Subprogram prologue: store the return address, passed by the caller in
 * ret_reg(), in the first slot of the stack frame.
 */
static void
nfp_subprog_prologue(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	wrp_mov(nfp_prog, reg_lm(0, 0), ret_reg(nfp_prog));
}
3630 
3631 static void
3632 nfp_start_subprog(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3633 {
3634 	unsigned int depth = nfp_prog->subprog[meta->subprog_idx].stack_depth;
3635 
3636 	nfp_prog->stack_frame_depth = round_up(depth, 4);
3637 	nfp_subprog_prologue(nfp_prog, meta);
3638 }
3639 
3640 bool nfp_is_subprog_start(struct nfp_insn_meta *meta)
3641 {
3642 	return meta->flags & FLAG_INSN_IS_SUBPROG_START;
3643 }
3644 
/* Emit the program epilogue for TC direct-action programs.
 *
 * Records the abort target (nfp_prog->tgt_abort) and the normal exit target
 * (nfp_prog->tgt_out) and generates the code translating the BPF return
 * value in R0 into the result flags, as summarised in the table below.
 */
static void nfp_outro_tc_da(struct nfp_prog *nfp_prog)
{
	/* TC direct-action mode:
	 *   0,1   ok        NOT SUPPORTED[1]
	 *   2   drop  0x22 -> drop,  count as stat1
	 *   4,5 nuke  0x02 -> drop
	 *   7  redir  0x44 -> redir, count as stat2
	 *   * unspec  0x11 -> pass,  count as stat0
	 *
	 * [1] We can't support OK and RECLASSIFY because we can't tell TC
	 *     the exact decision made.  We are forced to support UNSPEC
	 *     to handle aborts so that's the only one we handle for passing
	 *     packets up the stack.
	 */
	/* Target for aborts */
	nfp_prog->tgt_abort = nfp_prog_current_offset(nfp_prog);

	/* Branch to next packet; the two instructions below are emitted
	 * after a branch requesting 2 defer slots.
	 */
	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 2, RELO_BR_NEXT_PKT);

	wrp_mov(nfp_prog, reg_a(0), NFP_BPF_ABI_FLAGS);
	emit_ld_field(nfp_prog, reg_a(0), 0xc, reg_imm(0x11), SHF_SC_L_SHF, 16);

	/* Target for normal exits */
	nfp_prog->tgt_out = nfp_prog_current_offset(nfp_prog);

	/* if R0 > 7 jump to abort */
	emit_alu(nfp_prog, reg_none(), reg_imm(7), ALU_OP_SUB, reg_b(0));
	emit_br(nfp_prog, BR_BLO, nfp_prog->tgt_abort, 0);
	wrp_mov(nfp_prog, reg_a(0), NFP_BPF_ABI_FLAGS);

	/* Two tables of 4-bit entries, one entry per return code; they appear
	 * to hold the two nibbles of the result codes listed above.
	 */
	wrp_immed(nfp_prog, reg_b(2), 0x41221211);
	wrp_immed(nfp_prog, reg_b(3), 0x41001211);

	/* R0 * 4, used as the shift amount to select a table entry */
	emit_shf(nfp_prog, reg_a(1),
		 reg_none(), SHF_OP_NONE, reg_b(0), SHF_SC_L_SHF, 2);

	emit_alu(nfp_prog, reg_none(), reg_a(1), ALU_OP_OR, reg_imm(0));
	emit_shf(nfp_prog, reg_a(2),
		 reg_imm(0xf), SHF_OP_AND, reg_b(2), SHF_SC_R_SHF, 0);

	emit_alu(nfp_prog, reg_none(), reg_a(1), ALU_OP_OR, reg_imm(0));
	emit_shf(nfp_prog, reg_b(2),
		 reg_imm(0xf), SHF_OP_AND, reg_b(3), SHF_SC_R_SHF, 0);

	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 2, RELO_BR_NEXT_PKT);

	/* Combine both nibbles and store the result into the flags */
	emit_shf(nfp_prog, reg_b(2),
		 reg_a(2), SHF_OP_OR, reg_b(2), SHF_SC_L_SHF, 4);
	emit_ld_field(nfp_prog, reg_a(0), 0xc, reg_b(2), SHF_SC_L_SHF, 16);
}
3695 
/* Emit the program epilogue for XDP programs.
 *
 * Records the abort target (nfp_prog->tgt_abort) and the normal exit target
 * (nfp_prog->tgt_out) and generates the code translating the XDP return code
 * in R0 into the result flags, as summarised in the table below.
 */
static void nfp_outro_xdp(struct nfp_prog *nfp_prog)
{
	/* XDP return codes:
	 *   0 aborted  0x82 -> drop,  count as stat3
	 *   1    drop  0x22 -> drop,  count as stat1
	 *   2    pass  0x11 -> pass,  count as stat0
	 *   3      tx  0x44 -> redir, count as stat2
	 *   * unknown  0x82 -> drop,  count as stat3
	 */
	/* Target for aborts */
	nfp_prog->tgt_abort = nfp_prog_current_offset(nfp_prog);

	/* Branch to next packet; the two instructions below are emitted
	 * after a branch requesting 2 defer slots.
	 */
	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 2, RELO_BR_NEXT_PKT);

	wrp_mov(nfp_prog, reg_a(0), NFP_BPF_ABI_FLAGS);
	emit_ld_field(nfp_prog, reg_a(0), 0xc, reg_imm(0x82), SHF_SC_L_SHF, 16);

	/* Target for normal exits */
	nfp_prog->tgt_out = nfp_prog_current_offset(nfp_prog);

	/* if R0 > 3 jump to abort */
	emit_alu(nfp_prog, reg_none(), reg_imm(3), ALU_OP_SUB, reg_b(0));
	emit_br(nfp_prog, BR_BLO, nfp_prog->tgt_abort, 0);

	/* Table of result codes, one byte per XDP return code (see above) */
	wrp_immed(nfp_prog, reg_b(2), 0x44112282);

	/* R0 * 8, used as the shift amount to select a table entry */
	emit_shf(nfp_prog, reg_a(1),
		 reg_none(), SHF_OP_NONE, reg_b(0), SHF_SC_L_SHF, 3);

	emit_alu(nfp_prog, reg_none(), reg_a(1), ALU_OP_OR, reg_imm(0));
	emit_shf(nfp_prog, reg_b(2),
		 reg_imm(0xff), SHF_OP_AND, reg_b(2), SHF_SC_R_SHF, 0);

	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 2, RELO_BR_NEXT_PKT);

	/* Store the selected result code into the flags */
	wrp_mov(nfp_prog, reg_a(0), NFP_BPF_ABI_FLAGS);
	emit_ld_field(nfp_prog, reg_a(0), 0xc, reg_b(2), SHF_SC_L_SHF, 16);
}
3734 
3735 static bool nfp_prog_needs_callee_reg_save(struct nfp_prog *nfp_prog)
3736 {
3737 	unsigned int idx;
3738 
3739 	for (idx = 1; idx < nfp_prog->subprog_cnt; idx++)
3740 		if (nfp_prog->subprog[idx].needs_reg_push)
3741 			return true;
3742 
3743 	return false;
3744 }
3745 
3746 static void nfp_push_callee_registers(struct nfp_prog *nfp_prog)
3747 {
3748 	u8 reg;
3749 
3750 	/* Subroutine: Save all callee saved registers (R6 ~ R9).
3751 	 * imm_b() holds the return address.
3752 	 */
3753 	nfp_prog->tgt_call_push_regs = nfp_prog_current_offset(nfp_prog);
3754 	for (reg = BPF_REG_6; reg <= BPF_REG_9; reg++) {
3755 		u8 adj = (reg - BPF_REG_0) * 2;
3756 		u8 idx = (reg - BPF_REG_6) * 2;
3757 
3758 		/* The first slot in the stack frame is used to push the return
3759 		 * address in bpf_to_bpf_call(), start just after.
3760 		 */
3761 		wrp_mov(nfp_prog, reg_lm(0, 1 + idx), reg_b(adj));
3762 
3763 		if (reg == BPF_REG_8)
3764 			/* Prepare to jump back, last 3 insns use defer slots */
3765 			emit_rtn(nfp_prog, imm_b(nfp_prog), 3);
3766 
3767 		wrp_mov(nfp_prog, reg_lm(0, 1 + idx + 1), reg_b(adj + 1));
3768 	}
3769 }
3770 
3771 static void nfp_pop_callee_registers(struct nfp_prog *nfp_prog)
3772 {
3773 	u8 reg;
3774 
3775 	/* Subroutine: Restore all callee saved registers (R6 ~ R9).
3776 	 * ret_reg() holds the return address.
3777 	 */
3778 	nfp_prog->tgt_call_pop_regs = nfp_prog_current_offset(nfp_prog);
3779 	for (reg = BPF_REG_6; reg <= BPF_REG_9; reg++) {
3780 		u8 adj = (reg - BPF_REG_0) * 2;
3781 		u8 idx = (reg - BPF_REG_6) * 2;
3782 
3783 		/* The first slot in the stack frame holds the return address,
3784 		 * start popping just after that.
3785 		 */
3786 		wrp_mov(nfp_prog, reg_both(adj), reg_lm(0, 1 + idx));
3787 
3788 		if (reg == BPF_REG_8)
3789 			/* Prepare to jump back, last 3 insns use defer slots */
3790 			emit_rtn(nfp_prog, ret_reg(nfp_prog), 3);
3791 
3792 		wrp_mov(nfp_prog, reg_both(adj + 1), reg_lm(0, 1 + idx + 1));
3793 	}
3794 }
3795 
3796 static void nfp_outro(struct nfp_prog *nfp_prog)
3797 {
3798 	switch (nfp_prog->type) {
3799 	case BPF_PROG_TYPE_SCHED_CLS:
3800 		nfp_outro_tc_da(nfp_prog);
3801 		break;
3802 	case BPF_PROG_TYPE_XDP:
3803 		nfp_outro_xdp(nfp_prog);
3804 		break;
3805 	default:
3806 		WARN_ON(1);
3807 	}
3808 
3809 	if (!nfp_prog_needs_callee_reg_save(nfp_prog))
3810 		return;
3811 
3812 	nfp_push_callee_registers(nfp_prog);
3813 	nfp_pop_callee_registers(nfp_prog);
3814 }
3815 
3816 static int nfp_translate(struct nfp_prog *nfp_prog)
3817 {
3818 	struct nfp_insn_meta *meta;
3819 	unsigned int depth;
3820 	int err;
3821 
3822 	depth = nfp_prog->subprog[0].stack_depth;
3823 	nfp_prog->stack_frame_depth = round_up(depth, 4);
3824 
3825 	nfp_intro(nfp_prog);
3826 	if (nfp_prog->error)
3827 		return nfp_prog->error;
3828 
3829 	list_for_each_entry(meta, &nfp_prog->insns, l) {
3830 		instr_cb_t cb = instr_cb[meta->insn.code];
3831 
3832 		meta->off = nfp_prog_current_offset(nfp_prog);
3833 
3834 		if (nfp_is_subprog_start(meta)) {
3835 			nfp_start_subprog(nfp_prog, meta);
3836 			if (nfp_prog->error)
3837 				return nfp_prog->error;
3838 		}
3839 
3840 		if (meta->flags & FLAG_INSN_SKIP_MASK) {
3841 			nfp_prog->n_translated++;
3842 			continue;
3843 		}
3844 
3845 		if (nfp_meta_has_prev(nfp_prog, meta) &&
3846 		    nfp_meta_prev(meta)->double_cb)
3847 			cb = nfp_meta_prev(meta)->double_cb;
3848 		if (!cb)
3849 			return -ENOENT;
3850 		err = cb(nfp_prog, meta);
3851 		if (err)
3852 			return err;
3853 		if (nfp_prog->error)
3854 			return nfp_prog->error;
3855 
3856 		nfp_prog->n_translated++;
3857 	}
3858 
3859 	nfp_prog->last_bpf_off = nfp_prog_current_offset(nfp_prog) - 1;
3860 
3861 	nfp_outro(nfp_prog);
3862 	if (nfp_prog->error)
3863 		return nfp_prog->error;
3864 
3865 	wrp_nops(nfp_prog, NFP_USTORE_PREFETCH_WINDOW);
3866 	if (nfp_prog->error)
3867 		return nfp_prog->error;
3868 
3869 	return nfp_fixup_branches(nfp_prog);
3870 }
3871 
3872 /* --- Optimizations --- */
3873 static void nfp_bpf_opt_reg_init(struct nfp_prog *nfp_prog)
3874 {
3875 	struct nfp_insn_meta *meta;
3876 
3877 	list_for_each_entry(meta, &nfp_prog->insns, l) {
3878 		struct bpf_insn insn = meta->insn;
3879 
3880 		/* Programs converted from cBPF start with register xoring */
3881 		if (insn.code == (BPF_ALU64 | BPF_XOR | BPF_X) &&
3882 		    insn.src_reg == insn.dst_reg)
3883 			continue;
3884 
3885 		/* Programs start with R6 = R1 but we ignore the skb pointer */
3886 		if (insn.code == (BPF_ALU64 | BPF_MOV | BPF_X) &&
3887 		    insn.src_reg == 1 && insn.dst_reg == 6)
3888 			meta->flags |= FLAG_INSN_SKIP_PREC_DEPENDENT;
3889 
3890 		/* Return as soon as something doesn't match */
3891 		if (!(meta->flags & FLAG_INSN_SKIP_MASK))
3892 			return;
3893 	}
3894 }
3895 
3896 /* abs(insn.imm) will fit better into unrestricted reg immediate -
3897  * convert add/sub of a negative number into a sub/add of a positive one.
3898  */
3899 static void nfp_bpf_opt_neg_add_sub(struct nfp_prog *nfp_prog)
3900 {
3901 	struct nfp_insn_meta *meta;
3902 
3903 	list_for_each_entry(meta, &nfp_prog->insns, l) {
3904 		struct bpf_insn insn = meta->insn;
3905 
3906 		if (meta->flags & FLAG_INSN_SKIP_MASK)
3907 			continue;
3908 
3909 		if (!is_mbpf_alu(meta) && !is_mbpf_jmp(meta))
3910 			continue;
3911 		if (BPF_SRC(insn.code) != BPF_K)
3912 			continue;
3913 		if (insn.imm >= 0)
3914 			continue;
3915 
3916 		if (is_mbpf_jmp(meta)) {
3917 			switch (BPF_OP(insn.code)) {
3918 			case BPF_JGE:
3919 			case BPF_JSGE:
3920 			case BPF_JLT:
3921 			case BPF_JSLT:
3922 				meta->jump_neg_op = true;
3923 				break;
3924 			default:
3925 				continue;
3926 			}
3927 		} else {
3928 			if (BPF_OP(insn.code) == BPF_ADD)
3929 				insn.code = BPF_CLASS(insn.code) | BPF_SUB;
3930 			else if (BPF_OP(insn.code) == BPF_SUB)
3931 				insn.code = BPF_CLASS(insn.code) | BPF_ADD;
3932 			else
3933 				continue;
3934 
3935 			meta->insn.code = insn.code | BPF_K;
3936 		}
3937 
3938 		meta->insn.imm = -insn.imm;
3939 	}
3940 }
3941 
3942 /* Remove masking after load since our load guarantees this is not needed */
3943 static void nfp_bpf_opt_ld_mask(struct nfp_prog *nfp_prog)
3944 {
3945 	struct nfp_insn_meta *meta1, *meta2;
3946 	const s32 exp_mask[] = {
3947 		[BPF_B] = 0x000000ffU,
3948 		[BPF_H] = 0x0000ffffU,
3949 		[BPF_W] = 0xffffffffU,
3950 	};
3951 
3952 	nfp_for_each_insn_walk2(nfp_prog, meta1, meta2) {
3953 		struct bpf_insn insn, next;
3954 
3955 		insn = meta1->insn;
3956 		next = meta2->insn;
3957 
3958 		if (BPF_CLASS(insn.code) != BPF_LD)
3959 			continue;
3960 		if (BPF_MODE(insn.code) != BPF_ABS &&
3961 		    BPF_MODE(insn.code) != BPF_IND)
3962 			continue;
3963 
3964 		if (next.code != (BPF_ALU64 | BPF_AND | BPF_K))
3965 			continue;
3966 
3967 		if (!exp_mask[BPF_SIZE(insn.code)])
3968 			continue;
3969 		if (exp_mask[BPF_SIZE(insn.code)] != next.imm)
3970 			continue;
3971 
3972 		if (next.src_reg || next.dst_reg)
3973 			continue;
3974 
3975 		if (meta2->flags & FLAG_INSN_IS_JUMP_DST)
3976 			continue;
3977 
3978 		meta2->flags |= FLAG_INSN_SKIP_PREC_DEPENDENT;
3979 	}
3980 }
3981 
3982 static void nfp_bpf_opt_ld_shift(struct nfp_prog *nfp_prog)
3983 {
3984 	struct nfp_insn_meta *meta1, *meta2, *meta3;
3985 
3986 	nfp_for_each_insn_walk3(nfp_prog, meta1, meta2, meta3) {
3987 		struct bpf_insn insn, next1, next2;
3988 
3989 		insn = meta1->insn;
3990 		next1 = meta2->insn;
3991 		next2 = meta3->insn;
3992 
3993 		if (BPF_CLASS(insn.code) != BPF_LD)
3994 			continue;
3995 		if (BPF_MODE(insn.code) != BPF_ABS &&
3996 		    BPF_MODE(insn.code) != BPF_IND)
3997 			continue;
3998 		if (BPF_SIZE(insn.code) != BPF_W)
3999 			continue;
4000 
4001 		if (!(next1.code == (BPF_LSH | BPF_K | BPF_ALU64) &&
4002 		      next2.code == (BPF_RSH | BPF_K | BPF_ALU64)) &&
4003 		    !(next1.code == (BPF_RSH | BPF_K | BPF_ALU64) &&
4004 		      next2.code == (BPF_LSH | BPF_K | BPF_ALU64)))
4005 			continue;
4006 
4007 		if (next1.src_reg || next1.dst_reg ||
4008 		    next2.src_reg || next2.dst_reg)
4009 			continue;
4010 
4011 		if (next1.imm != 0x20 || next2.imm != 0x20)
4012 			continue;
4013 
4014 		if (meta2->flags & FLAG_INSN_IS_JUMP_DST ||
4015 		    meta3->flags & FLAG_INSN_IS_JUMP_DST)
4016 			continue;
4017 
4018 		meta2->flags |= FLAG_INSN_SKIP_PREC_DEPENDENT;
4019 		meta3->flags |= FLAG_INSN_SKIP_PREC_DEPENDENT;
4020 	}
4021 }
4022 
4023 /* load/store pair that forms memory copy sould look like the following:
4024  *
4025  *   ld_width R, [addr_src + offset_src]
4026  *   st_width [addr_dest + offset_dest], R
4027  *
4028  * The destination register of load and source register of store should
4029  * be the same, load and store should also perform at the same width.
4030  * If either of addr_src or addr_dest is stack pointer, we don't do the
4031  * CPP optimization as stack is modelled by registers on NFP.
4032  */
4033 static bool
4034 curr_pair_is_memcpy(struct nfp_insn_meta *ld_meta,
4035 		    struct nfp_insn_meta *st_meta)
4036 {
4037 	struct bpf_insn *ld = &ld_meta->insn;
4038 	struct bpf_insn *st = &st_meta->insn;
4039 
4040 	if (!is_mbpf_load(ld_meta) || !is_mbpf_store(st_meta))
4041 		return false;
4042 
4043 	if (ld_meta->ptr.type != PTR_TO_PACKET &&
4044 	    ld_meta->ptr.type != PTR_TO_MAP_VALUE)
4045 		return false;
4046 
4047 	if (st_meta->ptr.type != PTR_TO_PACKET)
4048 		return false;
4049 
4050 	if (BPF_SIZE(ld->code) != BPF_SIZE(st->code))
4051 		return false;
4052 
4053 	if (ld->dst_reg != st->src_reg)
4054 		return false;
4055 
4056 	/* There is jump to the store insn in this pair. */
4057 	if (st_meta->flags & FLAG_INSN_IS_JUMP_DST)
4058 		return false;
4059 
4060 	return true;
4061 }
4062 
4063 /* Currently, we only support chaining load/store pairs if:
4064  *
4065  *  - Their address base registers are the same.
4066  *  - Their address offsets are in the same order.
4067  *  - They operate at the same memory width.
4068  *  - There is no jump into the middle of them.
4069  */
4070 static bool
4071 curr_pair_chain_with_previous(struct nfp_insn_meta *ld_meta,
4072 			      struct nfp_insn_meta *st_meta,
4073 			      struct bpf_insn *prev_ld,
4074 			      struct bpf_insn *prev_st)
4075 {
4076 	u8 prev_size, curr_size, prev_ld_base, prev_st_base, prev_ld_dst;
4077 	struct bpf_insn *ld = &ld_meta->insn;
4078 	struct bpf_insn *st = &st_meta->insn;
4079 	s16 prev_ld_off, prev_st_off;
4080 
4081 	/* This pair is the start pair. */
4082 	if (!prev_ld)
4083 		return true;
4084 
4085 	prev_size = BPF_LDST_BYTES(prev_ld);
4086 	curr_size = BPF_LDST_BYTES(ld);
4087 	prev_ld_base = prev_ld->src_reg;
4088 	prev_st_base = prev_st->dst_reg;
4089 	prev_ld_dst = prev_ld->dst_reg;
4090 	prev_ld_off = prev_ld->off;
4091 	prev_st_off = prev_st->off;
4092 
4093 	if (ld->dst_reg != prev_ld_dst)
4094 		return false;
4095 
4096 	if (ld->src_reg != prev_ld_base || st->dst_reg != prev_st_base)
4097 		return false;
4098 
4099 	if (curr_size != prev_size)
4100 		return false;
4101 
4102 	/* There is jump to the head of this pair. */
4103 	if (ld_meta->flags & FLAG_INSN_IS_JUMP_DST)
4104 		return false;
4105 
4106 	/* Both in ascending order. */
4107 	if (prev_ld_off + prev_size == ld->off &&
4108 	    prev_st_off + prev_size == st->off)
4109 		return true;
4110 
4111 	/* Both in descending order. */
4112 	if (ld->off + curr_size == prev_ld_off &&
4113 	    st->off + curr_size == prev_st_off)
4114 		return true;
4115 
4116 	return false;
4117 }
4118 
4119 /* Return TRUE if cross memory access happens. Cross memory access means
4120  * store area is overlapping with load area that a later load might load
4121  * the value from previous store, for this case we can't treat the sequence
4122  * as an memory copy.
4123  */
4124 static bool
4125 cross_mem_access(struct bpf_insn *ld, struct nfp_insn_meta *head_ld_meta,
4126 		 struct nfp_insn_meta *head_st_meta)
4127 {
4128 	s16 head_ld_off, head_st_off, ld_off;
4129 
4130 	/* Different pointer types does not overlap. */
4131 	if (head_ld_meta->ptr.type != head_st_meta->ptr.type)
4132 		return false;
4133 
4134 	/* load and store are both PTR_TO_PACKET, check ID info.  */
4135 	if (head_ld_meta->ptr.id != head_st_meta->ptr.id)
4136 		return true;
4137 
4138 	/* Canonicalize the offsets. Turn all of them against the original
4139 	 * base register.
4140 	 */
4141 	head_ld_off = head_ld_meta->insn.off + head_ld_meta->ptr.off;
4142 	head_st_off = head_st_meta->insn.off + head_st_meta->ptr.off;
4143 	ld_off = ld->off + head_ld_meta->ptr.off;
4144 
4145 	/* Ascending order cross. */
4146 	if (ld_off > head_ld_off &&
4147 	    head_ld_off < head_st_off && ld_off >= head_st_off)
4148 		return true;
4149 
4150 	/* Descending order cross. */
4151 	if (ld_off < head_ld_off &&
4152 	    head_ld_off > head_st_off && ld_off <= head_st_off)
4153 		return true;
4154 
4155 	return false;
4156 }
4157 
/* This pass tries to identify the following instruction sequences.
 *
 *   load R, [regA + offA]
 *   store [regB + offB], R
 *   load R, [regA + offA + const_imm_A]
 *   store [regB + offB + const_imm_A], R
 *   load R, [regA + offA + 2 * const_imm_A]
 *   store [regB + offB + 2 * const_imm_A], R
 *   ...
 *
 * The above sequence is typically generated by the compiler when lowering
 * memcpy. NFP prefers using CPP instructions to accelerate it.
 *
 * For a chain of more than one pair the head load records the total length
 * in ldst_gather_len and points at the head store via paired_st; the head
 * store and every later pair of the chain are flagged to be skipped.
 */
static void nfp_bpf_opt_ldst_gather(struct nfp_prog *nfp_prog)
{
	/* First pair of the chain currently being collected */
	struct nfp_insn_meta *head_ld_meta = NULL;
	struct nfp_insn_meta *head_st_meta = NULL;
	struct nfp_insn_meta *meta1, *meta2;
	/* Most recent pair added to the chain */
	struct bpf_insn *prev_ld = NULL;
	struct bpf_insn *prev_st = NULL;
	/* Number of pairs in the chain so far */
	u8 count = 0;

	nfp_for_each_insn_walk2(nfp_prog, meta1, meta2) {
		struct bpf_insn *ld = &meta1->insn;
		struct bpf_insn *st = &meta2->insn;

		/* Reset record status if any of the following is true:
		 *   - The current insn pair is not load/store.
		 *   - The load/store pair doesn't chain with previous one.
		 *   - The chained load/store pair crossed with previous pair.
		 *   - The chained load/store pair has a total size of memory
		 *     copy beyond 128 bytes which is the maximum length a
		 *     single NFP CPP command can transfer.
		 */
		if (!curr_pair_is_memcpy(meta1, meta2) ||
		    !curr_pair_chain_with_previous(meta1, meta2, prev_ld,
						   prev_st) ||
		    (head_ld_meta && (cross_mem_access(ld, head_ld_meta,
						       head_st_meta) ||
				      head_ld_meta->ldst_gather_len >= 128))) {
			/* Nothing recorded yet, nothing to finalize */
			if (!count)
				continue;

			if (count > 1) {
				s16 prev_ld_off = prev_ld->off;
				s16 prev_st_off = prev_st->off;
				s16 head_ld_off = head_ld_meta->insn.off;

				/* The chain was collected in descending order:
				 * move the head offsets to those of the last
				 * pair and flip the sign of the length to
				 * record the direction.
				 */
				if (prev_ld_off < head_ld_off) {
					head_ld_meta->insn.off = prev_ld_off;
					head_st_meta->insn.off = prev_st_off;
					head_ld_meta->ldst_gather_len =
						-head_ld_meta->ldst_gather_len;
				}

				head_ld_meta->paired_st = &head_st_meta->insn;
				head_st_meta->flags |=
					FLAG_INSN_SKIP_PREC_DEPENDENT;
			} else {
				/* A single pair is not worth gathering, clear
				 * the length recorded for it.
				 */
				head_ld_meta->ldst_gather_len = 0;
			}

			/* If the chain is ended by a load/store pair then this
			 * could serve as the new head of the next chain.
			 */
			if (curr_pair_is_memcpy(meta1, meta2)) {
				head_ld_meta = meta1;
				head_st_meta = meta2;
				head_ld_meta->ldst_gather_len =
					BPF_LDST_BYTES(ld);
				/* Together with the iterator's own step this
				 * moves the walk past the whole pair.
				 */
				meta1 = nfp_meta_next(meta1);
				meta2 = nfp_meta_next(meta2);
				prev_ld = ld;
				prev_st = st;
				count = 1;
			} else {
				head_ld_meta = NULL;
				head_st_meta = NULL;
				prev_ld = NULL;
				prev_st = NULL;
				count = 0;
			}

			continue;
		}

		/* The pair either starts a chain or extends the current one;
		 * members other than the head pair are skipped right away.
		 */
		if (!head_ld_meta) {
			head_ld_meta = meta1;
			head_st_meta = meta2;
		} else {
			meta1->flags |= FLAG_INSN_SKIP_PREC_DEPENDENT;
			meta2->flags |= FLAG_INSN_SKIP_PREC_DEPENDENT;
		}

		head_ld_meta->ldst_gather_len += BPF_LDST_BYTES(ld);
		/* Together with the iterator's own step this moves the walk
		 * past the whole pair.
		 */
		meta1 = nfp_meta_next(meta1);
		meta2 = nfp_meta_next(meta2);
		prev_ld = ld;
		prev_st = st;
		count++;
	}
}
4260 
/* Group packet loads into ranges which can share one cached packet area.
 *
 * The first walk collects consecutive packet loads using the same pointer
 * ID and constant pointer offset into a range of at most 64 bytes.  The
 * load which opens a range (range_node) gets pkt_cache.do_init set and,
 * once the range is closed, the final range_start/range_end.  The second
 * walk copies the range of the preceding initializing load into every
 * other packet load.
 *
 * Loads which are part of a load/store gather (ldst_gather_len != 0) are
 * not considered.
 */
static void nfp_bpf_opt_pkt_cache(struct nfp_prog *nfp_prog)
{
	struct nfp_insn_meta *meta, *range_node = NULL;
	s16 range_start = 0, range_end = 0;
	bool cache_avail = false;
	struct bpf_insn *insn;
	s32 range_ptr_off = 0;
	u32 range_ptr_id = 0;

	list_for_each_entry(meta, &nfp_prog->insns, l) {
		/* Do not carry a range across a jump destination */
		if (meta->flags & FLAG_INSN_IS_JUMP_DST)
			cache_avail = false;

		if (meta->flags & FLAG_INSN_SKIP_MASK)
			continue;

		insn = &meta->insn;

		/* Packet stores, calls and classic loads/stores end the
		 * current range.
		 */
		if (is_mbpf_store_pkt(meta) ||
		    insn->code == (BPF_JMP | BPF_CALL) ||
		    is_mbpf_classic_store_pkt(meta) ||
		    is_mbpf_classic_load(meta)) {
			cache_avail = false;
			continue;
		}

		if (!is_mbpf_load(meta))
			continue;

		/* Non-packet loads and gathered loads end the range too */
		if (meta->ptr.type != PTR_TO_PACKET || meta->ldst_gather_len) {
			cache_avail = false;
			continue;
		}

		/* No usable range: this load opens a new one, after closing
		 * the previous range if there was any.
		 */
		if (!cache_avail) {
			cache_avail = true;
			if (range_node)
				goto end_current_then_start_new;
			goto start_new;
		}

		/* Check ID to make sure two reads share the same
		 * variable offset against PTR_TO_PACKET, and check OFF
		 * to make sure they also share the same constant
		 * offset.
		 *
		 * OFFs don't really need to be the same, because they
		 * are the constant offsets against PTR_TO_PACKET, so
		 * for different OFFs, we could canonicalize them to
		 * offsets against original packet pointer. We don't
		 * support this.
		 */
		if (meta->ptr.id == range_ptr_id &&
		    meta->ptr.off == range_ptr_off) {
			s16 new_start = range_start;
			s16 end, off = insn->off;
			s16 new_end = range_end;
			bool changed = false;

			if (off < range_start) {
				new_start = off;
				changed = true;
			}

			end = off + BPF_LDST_BYTES(insn);
			if (end > range_end) {
				new_end = end;
				changed = true;
			}

			/* Load is already covered by the current range */
			if (!changed)
				continue;

			if (new_end - new_start <= 64) {
				/* Install new range. */
				range_start = new_start;
				range_end = new_end;
				continue;
			}
		}

		/* Reached by fall-through when the load uses a different
		 * pointer or would grow the range beyond 64 bytes.
		 */
end_current_then_start_new:
		range_node->pkt_cache.range_start = range_start;
		range_node->pkt_cache.range_end = range_end;
start_new:
		range_node = meta;
		range_node->pkt_cache.do_init = true;
		range_ptr_id = range_node->ptr.id;
		range_ptr_off = range_node->ptr.off;
		range_start = insn->off;
		range_end = insn->off + BPF_LDST_BYTES(insn);
	}

	/* Close the range which was still open at the end of the program */
	if (range_node) {
		range_node->pkt_cache.range_start = range_start;
		range_node->pkt_cache.range_end = range_end;
	}

	/* Propagate each range from its initializing load to the packet
	 * loads which follow it.
	 */
	list_for_each_entry(meta, &nfp_prog->insns, l) {
		if (meta->flags & FLAG_INSN_SKIP_MASK)
			continue;

		if (is_mbpf_load_pkt(meta) && !meta->ldst_gather_len) {
			if (meta->pkt_cache.do_init) {
				range_start = meta->pkt_cache.range_start;
				range_end = meta->pkt_cache.range_end;
			} else {
				meta->pkt_cache.range_start = range_start;
				meta->pkt_cache.range_end = range_end;
			}
		}
	}
}
4374 
4375 static int nfp_bpf_optimize(struct nfp_prog *nfp_prog)
4376 {
4377 	nfp_bpf_opt_reg_init(nfp_prog);
4378 
4379 	nfp_bpf_opt_neg_add_sub(nfp_prog);
4380 	nfp_bpf_opt_ld_mask(nfp_prog);
4381 	nfp_bpf_opt_ld_shift(nfp_prog);
4382 	nfp_bpf_opt_ldst_gather(nfp_prog);
4383 	nfp_bpf_opt_pkt_cache(nfp_prog);
4384 
4385 	return 0;
4386 }
4387 
4388 static int nfp_bpf_replace_map_ptrs(struct nfp_prog *nfp_prog)
4389 {
4390 	struct nfp_insn_meta *meta1, *meta2;
4391 	struct nfp_bpf_map *nfp_map;
4392 	struct bpf_map *map;
4393 	u32 id;
4394 
4395 	nfp_for_each_insn_walk2(nfp_prog, meta1, meta2) {
4396 		if (meta1->flags & FLAG_INSN_SKIP_MASK ||
4397 		    meta2->flags & FLAG_INSN_SKIP_MASK)
4398 			continue;
4399 
4400 		if (meta1->insn.code != (BPF_LD | BPF_IMM | BPF_DW) ||
4401 		    meta1->insn.src_reg != BPF_PSEUDO_MAP_FD)
4402 			continue;
4403 
4404 		map = (void *)(unsigned long)((u32)meta1->insn.imm |
4405 					      (u64)meta2->insn.imm << 32);
4406 		if (bpf_map_offload_neutral(map)) {
4407 			id = map->id;
4408 		} else {
4409 			nfp_map = map_to_offmap(map)->dev_priv;
4410 			id = nfp_map->tid;
4411 		}
4412 
4413 		meta1->insn.imm = id;
4414 		meta2->insn.imm = 0;
4415 	}
4416 
4417 	return 0;
4418 }
4419 
4420 static int nfp_bpf_ustore_calc(u64 *prog, unsigned int len)
4421 {
4422 	__le64 *ustore = (__force __le64 *)prog;
4423 	int i;
4424 
4425 	for (i = 0; i < len; i++) {
4426 		int err;
4427 
4428 		err = nfp_ustore_check_valid_no_ecc(prog[i]);
4429 		if (err)
4430 			return err;
4431 
4432 		ustore[i] = cpu_to_le64(nfp_ustore_calc_ecc_insn(prog[i]));
4433 	}
4434 
4435 	return 0;
4436 }
4437 
4438 static void nfp_bpf_prog_trim(struct nfp_prog *nfp_prog)
4439 {
4440 	void *prog;
4441 
4442 	prog = kvmalloc_array(nfp_prog->prog_len, sizeof(u64), GFP_KERNEL);
4443 	if (!prog)
4444 		return;
4445 
4446 	nfp_prog->__prog_alloc_len = nfp_prog->prog_len * sizeof(u64);
4447 	memcpy(prog, nfp_prog->prog, nfp_prog->__prog_alloc_len);
4448 	kvfree(nfp_prog->prog);
4449 	nfp_prog->prog = prog;
4450 }
4451 
4452 int nfp_bpf_jit(struct nfp_prog *nfp_prog)
4453 {
4454 	int ret;
4455 
4456 	ret = nfp_bpf_replace_map_ptrs(nfp_prog);
4457 	if (ret)
4458 		return ret;
4459 
4460 	ret = nfp_bpf_optimize(nfp_prog);
4461 	if (ret)
4462 		return ret;
4463 
4464 	ret = nfp_translate(nfp_prog);
4465 	if (ret) {
4466 		pr_err("Translation failed with error %d (translated: %u)\n",
4467 		       ret, nfp_prog->n_translated);
4468 		return -EINVAL;
4469 	}
4470 
4471 	nfp_bpf_prog_trim(nfp_prog);
4472 
4473 	return ret;
4474 }
4475 
4476 void nfp_bpf_jit_prepare(struct nfp_prog *nfp_prog)
4477 {
4478 	struct nfp_insn_meta *meta;
4479 
4480 	/* Another pass to record jump information. */
4481 	list_for_each_entry(meta, &nfp_prog->insns, l) {
4482 		struct nfp_insn_meta *dst_meta;
4483 		u64 code = meta->insn.code;
4484 		unsigned int dst_idx;
4485 		bool pseudo_call;
4486 
4487 		if (!is_mbpf_jmp(meta))
4488 			continue;
4489 		if (BPF_OP(code) == BPF_EXIT)
4490 			continue;
4491 		if (is_mbpf_helper_call(meta))
4492 			continue;
4493 
4494 		/* If opcode is BPF_CALL at this point, this can only be a
4495 		 * BPF-to-BPF call (a.k.a pseudo call).
4496 		 */
4497 		pseudo_call = BPF_OP(code) == BPF_CALL;
4498 
4499 		if (pseudo_call)
4500 			dst_idx = meta->n + 1 + meta->insn.imm;
4501 		else
4502 			dst_idx = meta->n + 1 + meta->insn.off;
4503 
4504 		dst_meta = nfp_bpf_goto_meta(nfp_prog, meta, dst_idx);
4505 
4506 		if (pseudo_call)
4507 			dst_meta->flags |= FLAG_INSN_IS_SUBPROG_START;
4508 
4509 		dst_meta->flags |= FLAG_INSN_IS_JUMP_DST;
4510 		meta->jmp_dst = dst_meta;
4511 	}
4512 }
4513 
4514 bool nfp_bpf_supported_opcode(u8 code)
4515 {
4516 	return !!instr_cb[code];
4517 }
4518 
4519 void *nfp_bpf_relo_for_vnic(struct nfp_prog *nfp_prog, struct nfp_bpf_vnic *bv)
4520 {
4521 	unsigned int i;
4522 	u64 *prog;
4523 	int err;
4524 
4525 	prog = kmemdup(nfp_prog->prog, nfp_prog->prog_len * sizeof(u64),
4526 		       GFP_KERNEL);
4527 	if (!prog)
4528 		return ERR_PTR(-ENOMEM);
4529 
4530 	for (i = 0; i < nfp_prog->prog_len; i++) {
4531 		enum nfp_relo_type special;
4532 		u32 val;
4533 		u16 off;
4534 
4535 		special = FIELD_GET(OP_RELO_TYPE, prog[i]);
4536 		switch (special) {
4537 		case RELO_NONE:
4538 			continue;
4539 		case RELO_BR_REL:
4540 			br_add_offset(&prog[i], bv->start_off);
4541 			break;
4542 		case RELO_BR_GO_OUT:
4543 			br_set_offset(&prog[i],
4544 				      nfp_prog->tgt_out + bv->start_off);
4545 			break;
4546 		case RELO_BR_GO_ABORT:
4547 			br_set_offset(&prog[i],
4548 				      nfp_prog->tgt_abort + bv->start_off);
4549 			break;
4550 		case RELO_BR_GO_CALL_PUSH_REGS:
4551 			if (!nfp_prog->tgt_call_push_regs) {
4552 				pr_err("BUG: failed to detect subprogram registers needs\n");
4553 				err = -EINVAL;
4554 				goto err_free_prog;
4555 			}
4556 			off = nfp_prog->tgt_call_push_regs + bv->start_off;
4557 			br_set_offset(&prog[i], off);
4558 			break;
4559 		case RELO_BR_GO_CALL_POP_REGS:
4560 			if (!nfp_prog->tgt_call_pop_regs) {
4561 				pr_err("BUG: failed to detect subprogram registers needs\n");
4562 				err = -EINVAL;
4563 				goto err_free_prog;
4564 			}
4565 			off = nfp_prog->tgt_call_pop_regs + bv->start_off;
4566 			br_set_offset(&prog[i], off);
4567 			break;
4568 		case RELO_BR_NEXT_PKT:
4569 			br_set_offset(&prog[i], bv->tgt_done);
4570 			break;
4571 		case RELO_BR_HELPER:
4572 			val = br_get_offset(prog[i]);
4573 			val -= BR_OFF_RELO;
4574 			switch (val) {
4575 			case BPF_FUNC_map_lookup_elem:
4576 				val = nfp_prog->bpf->helpers.map_lookup;
4577 				break;
4578 			case BPF_FUNC_map_update_elem:
4579 				val = nfp_prog->bpf->helpers.map_update;
4580 				break;
4581 			case BPF_FUNC_map_delete_elem:
4582 				val = nfp_prog->bpf->helpers.map_delete;
4583 				break;
4584 			case BPF_FUNC_perf_event_output:
4585 				val = nfp_prog->bpf->helpers.perf_event_output;
4586 				break;
4587 			default:
4588 				pr_err("relocation of unknown helper %d\n",
4589 				       val);
4590 				err = -EINVAL;
4591 				goto err_free_prog;
4592 			}
4593 			br_set_offset(&prog[i], val);
4594 			break;
4595 		case RELO_IMMED_REL:
4596 			immed_add_value(&prog[i], bv->start_off);
4597 			break;
4598 		}
4599 
4600 		prog[i] &= ~OP_RELO_TYPE;
4601 	}
4602 
4603 	err = nfp_bpf_ustore_calc(prog, nfp_prog->prog_len);
4604 	if (err)
4605 		goto err_free_prog;
4606 
4607 	return prog;
4608 
4609 err_free_prog:
4610 	kfree(prog);
4611 	return ERR_PTR(err);
4612 }
4613