xref: /openbmc/linux/arch/mips/net/bpf_jit_comp32.c (revision 173940b3)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Just-In-Time compiler for eBPF bytecode on MIPS.
4  * Implementation of JIT functions for 32-bit CPUs.
5  *
6  * Copyright (c) 2021 Anyfi Networks AB.
7  * Author: Johan Almbladh <johan.almbladh@gmail.com>
8  *
9  * Based on code and ideas from
10  * Copyright (c) 2017 Cavium, Inc.
11  * Copyright (c) 2017 Shubham Bansal <illusionist.neo@gmail.com>
12  * Copyright (c) 2011 Mircea Gherzan <mgherzan@gmail.com>
13  */
14 
15 #include <linux/math64.h>
16 #include <linux/errno.h>
17 #include <linux/filter.h>
18 #include <linux/bpf.h>
19 #include <asm/cpu-features.h>
20 #include <asm/isa-rev.h>
21 #include <asm/uasm.h>
22 
23 #include "bpf_jit_comp.h"
24 
/* The o32 ABI has no a4-a7 registers, so their names are removed here */
#undef MIPS_R_A4
#undef MIPS_R_A5
#undef MIPS_R_A6
#undef MIPS_R_A7

/* The o32 ABI keeps the stack 8-byte aligned */
#define MIPS_STACK_ALIGNMENT 8

/*
 * In the O32 ABI the top 16 bytes of a stack frame are reserved for the
 * callee. This is the stack space that corresponds to the register
 * arguments a0-a3.
 */
#define JIT_RESERVED_STACK 16

/* 64-bit temporary register for JIT-internal use */
#define JIT_REG_TMP MAX_BPF_JIT_REG

/*
 * Number of prologue bytes a tail call has to skip: the tail call count
 * (TCC) initialization (8 bytes) in all cases, and additionally the
 * R0-to-v0 assignment (4 bytes) on big endian.
 */
#if defined(__BIG_ENDIAN)
#define JIT_TCALL_SKIP 12
#else
#define JIT_TCALL_SKIP 8
#endif
53 
/* CPU registers in which a callee delivers its return value */
#define JIT_RETURN_REGS \
	(BIT(MIPS_R_V0) | BIT(MIPS_R_V1))

/* CPU registers whose contents are passed to a callee directly */
#define JIT_ARG_REGS \
	(BIT(MIPS_R_A0) | BIT(MIPS_R_A1) | \
	 BIT(MIPS_R_A2) | BIT(MIPS_R_A3))

/* CPU registers whose contents are passed to a callee on the stack */
#define JIT_STACK_REGS \
	(BIT(MIPS_R_T0) | BIT(MIPS_R_T1) | \
	 BIT(MIPS_R_T2) | BIT(MIPS_R_T3) | \
	 BIT(MIPS_R_T4) | BIT(MIPS_R_T5))

/* All caller-saved CPU registers: return, argument and stack registers */
#define JIT_CALLER_REGS \
	(JIT_RETURN_REGS | JIT_ARG_REGS | JIT_STACK_REGS)

/* All callee-saved CPU registers */
#define JIT_CALLEE_REGS \
	(BIT(MIPS_R_S0) | BIT(MIPS_R_S1) | \
	 BIT(MIPS_R_S2) | BIT(MIPS_R_S3) | \
	 BIT(MIPS_R_S4) | BIT(MIPS_R_S5) | \
	 BIT(MIPS_R_S6) | BIT(MIPS_R_S7) | \
	 BIT(MIPS_R_GP) | BIT(MIPS_R_FP) | \
	 BIT(MIPS_R_RA))
94 
95 /*
96  * Mapping of 64-bit eBPF registers to 32-bit native MIPS registers.
97  *
98  * 1) Native register pairs are ordered according to CPU endiannes, following
99  *    the MIPS convention for passing 64-bit arguments and return values.
100  * 2) The eBPF return value, arguments and callee-saved registers are mapped
101  *    to their native MIPS equivalents.
102  * 3) Since the 32 highest bits in the eBPF FP register are always zero,
103  *    only one general-purpose register is actually needed for the mapping.
104  *    We use the fp register for this purpose, and map the highest bits to
105  *    the MIPS register r0 (zero).
106  * 4) We use the MIPS gp and at registers as internal temporary registers
107  *    for constant blinding. The gp register is callee-saved.
108  * 5) One 64-bit temporary register is mapped for use when sign-extending
109  *    immediate operands. MIPS registers t6-t9 are available to the JIT
110  *    for as temporaries when implementing complex 64-bit operations.
111  *
112  * With this scheme all eBPF registers are being mapped to native MIPS
113  * registers without having to use any stack scratch space. The direct
114  * register mapping (2) simplifies the handling of function calls.
115  */
116 static const u8 bpf2mips32[][2] = {
117 	/* Return value from in-kernel function, and exit value from eBPF */
118 	[BPF_REG_0] = {MIPS_R_V1, MIPS_R_V0},
119 	/* Arguments from eBPF program to in-kernel function */
120 	[BPF_REG_1] = {MIPS_R_A1, MIPS_R_A0},
121 	[BPF_REG_2] = {MIPS_R_A3, MIPS_R_A2},
122 	/* Remaining arguments, to be passed on the stack per O32 ABI */
123 	[BPF_REG_3] = {MIPS_R_T1, MIPS_R_T0},
124 	[BPF_REG_4] = {MIPS_R_T3, MIPS_R_T2},
125 	[BPF_REG_5] = {MIPS_R_T5, MIPS_R_T4},
126 	/* Callee-saved registers that in-kernel function will preserve */
127 	[BPF_REG_6] = {MIPS_R_S1, MIPS_R_S0},
128 	[BPF_REG_7] = {MIPS_R_S3, MIPS_R_S2},
129 	[BPF_REG_8] = {MIPS_R_S5, MIPS_R_S4},
130 	[BPF_REG_9] = {MIPS_R_S7, MIPS_R_S6},
131 	/* Read-only frame pointer to access the eBPF stack */
132 #ifdef __BIG_ENDIAN
133 	[BPF_REG_FP] = {MIPS_R_FP, MIPS_R_ZERO},
134 #else
135 	[BPF_REG_FP] = {MIPS_R_ZERO, MIPS_R_FP},
136 #endif
137 	/* Temporary register for blinding constants */
138 	[BPF_REG_AX] = {MIPS_R_GP, MIPS_R_AT},
139 	/* Temporary register for internal JIT use */
140 	[JIT_REG_TMP] = {MIPS_R_T7, MIPS_R_T6},
141 };
142 
/*
 * Return the CPU register that holds the low 32 bits of a 64-bit eBPF
 * register mapping: element 0 of the pair on big endian, element 1
 * otherwise.
 */
static inline u8 lo(const u8 reg[])
{
#ifndef __BIG_ENDIAN
	return reg[1];
#else
	return reg[0];
#endif
}
152 
/*
 * Return the CPU register that holds the high 32 bits of a 64-bit eBPF
 * register mapping: element 1 of the pair on big endian, element 0
 * otherwise.
 */
static inline u8 hi(const u8 reg[])
{
#ifndef __BIG_ENDIAN
	return reg[0];
#else
	return reg[1];
#endif
}
162 
163 /*
164  * Mark a 64-bit CPU register pair as clobbered, it needs to be
165  * saved/restored by the program if callee-saved.
166  */
167 static void clobber_reg64(struct jit_context *ctx, const u8 reg[])
168 {
169 	clobber_reg(ctx, reg[0]);
170 	clobber_reg(ctx, reg[1]);
171 }
172 
173 /* dst = imm (sign-extended) */
174 static void emit_mov_se_i64(struct jit_context *ctx, const u8 dst[], s32 imm)
175 {
176 	emit_mov_i(ctx, lo(dst), imm);
177 	if (imm < 0)
178 		emit(ctx, addiu, hi(dst), MIPS_R_ZERO, -1);
179 	else
180 		emit(ctx, move, hi(dst), MIPS_R_ZERO);
181 	clobber_reg64(ctx, dst);
182 }
183 
184 /* Zero extension, if verifier does not do it for us  */
185 static void emit_zext_ver(struct jit_context *ctx, const u8 dst[])
186 {
187 	if (!ctx->program->aux->verifier_zext) {
188 		emit(ctx, move, hi(dst), MIPS_R_ZERO);
189 		clobber_reg(ctx, hi(dst));
190 	}
191 }
192 
193 /* Load delay slot, if ISA mandates it */
194 static void emit_load_delay(struct jit_context *ctx)
195 {
196 	if (!cpu_has_mips_2_3_4_5_r)
197 		emit(ctx, nop);
198 }
199 
200 /* ALU immediate operation (64-bit) */
201 static void emit_alu_i64(struct jit_context *ctx,
202 			 const u8 dst[], s32 imm, u8 op)
203 {
204 	u8 src = MIPS_R_T6;
205 
206 	/*
207 	 * ADD/SUB with all but the max negative imm can be handled by
208 	 * inverting the operation and the imm value, saving one insn.
209 	 */
210 	if (imm > S32_MIN && imm < 0)
211 		switch (op) {
212 		case BPF_ADD:
213 			op = BPF_SUB;
214 			imm = -imm;
215 			break;
216 		case BPF_SUB:
217 			op = BPF_ADD;
218 			imm = -imm;
219 			break;
220 		}
221 
222 	/* Move immediate to temporary register */
223 	emit_mov_i(ctx, src, imm);
224 
225 	switch (op) {
226 	/* dst = dst + imm */
227 	case BPF_ADD:
228 		emit(ctx, addu, lo(dst), lo(dst), src);
229 		emit(ctx, sltu, MIPS_R_T9, lo(dst), src);
230 		emit(ctx, addu, hi(dst), hi(dst), MIPS_R_T9);
231 		if (imm < 0)
232 			emit(ctx, addiu, hi(dst), hi(dst), -1);
233 		break;
234 	/* dst = dst - imm */
235 	case BPF_SUB:
236 		emit(ctx, sltu, MIPS_R_T9, lo(dst), src);
237 		emit(ctx, subu, lo(dst), lo(dst), src);
238 		emit(ctx, subu, hi(dst), hi(dst), MIPS_R_T9);
239 		if (imm < 0)
240 			emit(ctx, addiu, hi(dst), hi(dst), 1);
241 		break;
242 	/* dst = dst | imm */
243 	case BPF_OR:
244 		emit(ctx, or, lo(dst), lo(dst), src);
245 		if (imm < 0)
246 			emit(ctx, addiu, hi(dst), MIPS_R_ZERO, -1);
247 		break;
248 	/* dst = dst & imm */
249 	case BPF_AND:
250 		emit(ctx, and, lo(dst), lo(dst), src);
251 		if (imm >= 0)
252 			emit(ctx, move, hi(dst), MIPS_R_ZERO);
253 		break;
254 	/* dst = dst ^ imm */
255 	case BPF_XOR:
256 		emit(ctx, xor, lo(dst), lo(dst), src);
257 		if (imm < 0) {
258 			emit(ctx, subu, hi(dst), MIPS_R_ZERO, hi(dst));
259 			emit(ctx, addiu, hi(dst), hi(dst), -1);
260 		}
261 		break;
262 	}
263 	clobber_reg64(ctx, dst);
264 }
265 
266 /* ALU register operation (64-bit) */
267 static void emit_alu_r64(struct jit_context *ctx,
268 			 const u8 dst[], const u8 src[], u8 op)
269 {
270 	switch (BPF_OP(op)) {
271 	/* dst = dst + src */
272 	case BPF_ADD:
273 		if (src == dst) {
274 			emit(ctx, srl, MIPS_R_T9, lo(dst), 31);
275 			emit(ctx, addu, lo(dst), lo(dst), lo(dst));
276 		} else {
277 			emit(ctx, addu, lo(dst), lo(dst), lo(src));
278 			emit(ctx, sltu, MIPS_R_T9, lo(dst), lo(src));
279 		}
280 		emit(ctx, addu, hi(dst), hi(dst), hi(src));
281 		emit(ctx, addu, hi(dst), hi(dst), MIPS_R_T9);
282 		break;
283 	/* dst = dst - src */
284 	case BPF_SUB:
285 		emit(ctx, sltu, MIPS_R_T9, lo(dst), lo(src));
286 		emit(ctx, subu, lo(dst), lo(dst), lo(src));
287 		emit(ctx, subu, hi(dst), hi(dst), hi(src));
288 		emit(ctx, subu, hi(dst), hi(dst), MIPS_R_T9);
289 		break;
290 	/* dst = dst | src */
291 	case BPF_OR:
292 		emit(ctx, or, lo(dst), lo(dst), lo(src));
293 		emit(ctx, or, hi(dst), hi(dst), hi(src));
294 		break;
295 	/* dst = dst & src */
296 	case BPF_AND:
297 		emit(ctx, and, lo(dst), lo(dst), lo(src));
298 		emit(ctx, and, hi(dst), hi(dst), hi(src));
299 		break;
300 	/* dst = dst ^ src */
301 	case BPF_XOR:
302 		emit(ctx, xor, lo(dst), lo(dst), lo(src));
303 		emit(ctx, xor, hi(dst), hi(dst), hi(src));
304 		break;
305 	}
306 	clobber_reg64(ctx, dst);
307 }
308 
309 /* ALU invert (64-bit) */
310 static void emit_neg_i64(struct jit_context *ctx, const u8 dst[])
311 {
312 	emit(ctx, sltu, MIPS_R_T9, MIPS_R_ZERO, lo(dst));
313 	emit(ctx, subu, lo(dst), MIPS_R_ZERO, lo(dst));
314 	emit(ctx, subu, hi(dst), MIPS_R_ZERO, hi(dst));
315 	emit(ctx, subu, hi(dst), hi(dst), MIPS_R_T9);
316 
317 	clobber_reg64(ctx, dst);
318 }
319 
320 /* ALU shift immediate (64-bit) */
321 static void emit_shift_i64(struct jit_context *ctx,
322 			   const u8 dst[], u32 imm, u8 op)
323 {
324 	switch (BPF_OP(op)) {
325 	/* dst = dst << imm */
326 	case BPF_LSH:
327 		if (imm < 32) {
328 			emit(ctx, srl, MIPS_R_T9, lo(dst), 32 - imm);
329 			emit(ctx, sll, lo(dst), lo(dst), imm);
330 			emit(ctx, sll, hi(dst), hi(dst), imm);
331 			emit(ctx, or, hi(dst), hi(dst), MIPS_R_T9);
332 		} else {
333 			emit(ctx, sll, hi(dst), lo(dst), imm - 32);
334 			emit(ctx, move, lo(dst), MIPS_R_ZERO);
335 		}
336 		break;
337 	/* dst = dst >> imm */
338 	case BPF_RSH:
339 		if (imm < 32) {
340 			emit(ctx, sll, MIPS_R_T9, hi(dst), 32 - imm);
341 			emit(ctx, srl, lo(dst), lo(dst), imm);
342 			emit(ctx, srl, hi(dst), hi(dst), imm);
343 			emit(ctx, or, lo(dst), lo(dst), MIPS_R_T9);
344 		} else {
345 			emit(ctx, srl, lo(dst), hi(dst), imm - 32);
346 			emit(ctx, move, hi(dst), MIPS_R_ZERO);
347 		}
348 		break;
349 	/* dst = dst >> imm (arithmetic) */
350 	case BPF_ARSH:
351 		if (imm < 32) {
352 			emit(ctx, sll, MIPS_R_T9, hi(dst), 32 - imm);
353 			emit(ctx, srl, lo(dst), lo(dst), imm);
354 			emit(ctx, sra, hi(dst), hi(dst), imm);
355 			emit(ctx, or, lo(dst), lo(dst), MIPS_R_T9);
356 		} else {
357 			emit(ctx, sra, lo(dst), hi(dst), imm - 32);
358 			emit(ctx, sra, hi(dst), hi(dst), 31);
359 		}
360 		break;
361 	}
362 	clobber_reg64(ctx, dst);
363 }
364 
365 /* ALU shift register (64-bit) */
366 static void emit_shift_r64(struct jit_context *ctx,
367 			   const u8 dst[], u8 src, u8 op)
368 {
369 	u8 t1 = MIPS_R_T8;
370 	u8 t2 = MIPS_R_T9;
371 
372 	emit(ctx, andi, t1, src, 32);              /* t1 = src & 32          */
373 	emit(ctx, beqz, t1, 16);                   /* PC += 16 if t1 == 0    */
374 	emit(ctx, nor, t2, src, MIPS_R_ZERO);      /* t2 = ~src (delay slot) */
375 
376 	switch (BPF_OP(op)) {
377 	/* dst = dst << src */
378 	case BPF_LSH:
379 		/* Next: shift >= 32 */
380 		emit(ctx, sllv, hi(dst), lo(dst), src);    /* dh = dl << src */
381 		emit(ctx, move, lo(dst), MIPS_R_ZERO);     /* dl = 0         */
382 		emit(ctx, b, 20);                          /* PC += 20       */
383 		/* +16: shift < 32 */
384 		emit(ctx, srl, t1, lo(dst), 1);            /* t1 = dl >> 1   */
385 		emit(ctx, srlv, t1, t1, t2);               /* t1 = t1 >> t2  */
386 		emit(ctx, sllv, lo(dst), lo(dst), src);    /* dl = dl << src */
387 		emit(ctx, sllv, hi(dst), hi(dst), src);    /* dh = dh << src */
388 		emit(ctx, or, hi(dst), hi(dst), t1);       /* dh = dh | t1   */
389 		break;
390 	/* dst = dst >> src */
391 	case BPF_RSH:
392 		/* Next: shift >= 32 */
393 		emit(ctx, srlv, lo(dst), hi(dst), src);    /* dl = dh >> src */
394 		emit(ctx, move, hi(dst), MIPS_R_ZERO);     /* dh = 0         */
395 		emit(ctx, b, 20);                          /* PC += 20       */
396 		/* +16: shift < 32 */
397 		emit(ctx, sll, t1, hi(dst), 1);            /* t1 = dl << 1   */
398 		emit(ctx, sllv, t1, t1, t2);               /* t1 = t1 << t2  */
399 		emit(ctx, srlv, lo(dst), lo(dst), src);    /* dl = dl >> src */
400 		emit(ctx, srlv, hi(dst), hi(dst), src);    /* dh = dh >> src */
401 		emit(ctx, or, lo(dst), lo(dst), t1);       /* dl = dl | t1   */
402 		break;
403 	/* dst = dst >> src (arithmetic) */
404 	case BPF_ARSH:
405 		/* Next: shift >= 32 */
406 		emit(ctx, srav, lo(dst), hi(dst), src);   /* dl = dh >>a src */
407 		emit(ctx, sra, hi(dst), hi(dst), 31);     /* dh = dh >>a 31  */
408 		emit(ctx, b, 20);                         /* PC += 20        */
409 		/* +16: shift < 32 */
410 		emit(ctx, sll, t1, hi(dst), 1);           /* t1 = dl << 1    */
411 		emit(ctx, sllv, t1, t1, t2);              /* t1 = t1 << t2   */
412 		emit(ctx, srlv, lo(dst), lo(dst), src);   /* dl = dl >>a src */
413 		emit(ctx, srav, hi(dst), hi(dst), src);   /* dh = dh >> src  */
414 		emit(ctx, or, lo(dst), lo(dst), t1);      /* dl = dl | t1    */
415 		break;
416 	}
417 
418 	/* +20: Done */
419 	clobber_reg64(ctx, dst);
420 }
421 
422 /* ALU mul immediate (64x32-bit) */
423 static void emit_mul_i64(struct jit_context *ctx, const u8 dst[], s32 imm)
424 {
425 	u8 src = MIPS_R_T6;
426 	u8 tmp = MIPS_R_T9;
427 
428 	switch (imm) {
429 	/* dst = dst * 1 is a no-op */
430 	case 1:
431 		break;
432 	/* dst = dst * -1 */
433 	case -1:
434 		emit_neg_i64(ctx, dst);
435 		break;
436 	case 0:
437 		emit_mov_r(ctx, lo(dst), MIPS_R_ZERO);
438 		emit_mov_r(ctx, hi(dst), MIPS_R_ZERO);
439 		break;
440 	/* Full 64x32 multiply */
441 	default:
442 		/* hi(dst) = hi(dst) * src(imm) */
443 		emit_mov_i(ctx, src, imm);
444 		if (cpu_has_mips32r1 || cpu_has_mips32r6) {
445 			emit(ctx, mul, hi(dst), hi(dst), src);
446 		} else {
447 			emit(ctx, multu, hi(dst), src);
448 			emit(ctx, mflo, hi(dst));
449 		}
450 
451 		/* hi(dst) = hi(dst) - lo(dst) */
452 		if (imm < 0)
453 			emit(ctx, subu, hi(dst), hi(dst), lo(dst));
454 
455 		/* tmp = lo(dst) * src(imm) >> 32 */
456 		/* lo(dst) = lo(dst) * src(imm) */
457 		if (cpu_has_mips32r6) {
458 			emit(ctx, muhu, tmp, lo(dst), src);
459 			emit(ctx, mulu, lo(dst), lo(dst), src);
460 		} else {
461 			emit(ctx, multu, lo(dst), src);
462 			emit(ctx, mflo, lo(dst));
463 			emit(ctx, mfhi, tmp);
464 		}
465 
466 		/* hi(dst) += tmp */
467 		emit(ctx, addu, hi(dst), hi(dst), tmp);
468 		clobber_reg64(ctx, dst);
469 		break;
470 	}
471 }
472 
473 /* ALU mul register (64x64-bit) */
474 static void emit_mul_r64(struct jit_context *ctx,
475 			 const u8 dst[], const u8 src[])
476 {
477 	u8 acc = MIPS_R_T8;
478 	u8 tmp = MIPS_R_T9;
479 
480 	/* acc = hi(dst) * lo(src) */
481 	if (cpu_has_mips32r1 || cpu_has_mips32r6) {
482 		emit(ctx, mul, acc, hi(dst), lo(src));
483 	} else {
484 		emit(ctx, multu, hi(dst), lo(src));
485 		emit(ctx, mflo, acc);
486 	}
487 
488 	/* tmp = lo(dst) * hi(src) */
489 	if (cpu_has_mips32r1 || cpu_has_mips32r6) {
490 		emit(ctx, mul, tmp, lo(dst), hi(src));
491 	} else {
492 		emit(ctx, multu, lo(dst), hi(src));
493 		emit(ctx, mflo, tmp);
494 	}
495 
496 	/* acc += tmp */
497 	emit(ctx, addu, acc, acc, tmp);
498 
499 	/* tmp = lo(dst) * lo(src) >> 32 */
500 	/* lo(dst) = lo(dst) * lo(src) */
501 	if (cpu_has_mips32r6) {
502 		emit(ctx, muhu, tmp, lo(dst), lo(src));
503 		emit(ctx, mulu, lo(dst), lo(dst), lo(src));
504 	} else {
505 		emit(ctx, multu, lo(dst), lo(src));
506 		emit(ctx, mflo, lo(dst));
507 		emit(ctx, mfhi, tmp);
508 	}
509 
510 	/* hi(dst) = acc + tmp */
511 	emit(ctx, addu, hi(dst), acc, tmp);
512 	clobber_reg64(ctx, dst);
513 }
514 
515 /* Helper function for 64-bit modulo */
516 static u64 jit_mod64(u64 a, u64 b)
517 {
518 	u64 rem;
519 
520 	div64_u64_rem(a, b, &rem);
521 	return rem;
522 }
523 
524 /* ALU div/mod register (64-bit) */
525 static void emit_divmod_r64(struct jit_context *ctx,
526 			    const u8 dst[], const u8 src[], u8 op)
527 {
528 	const u8 *r0 = bpf2mips32[BPF_REG_0]; /* Mapped to v0-v1 */
529 	const u8 *r1 = bpf2mips32[BPF_REG_1]; /* Mapped to a0-a1 */
530 	const u8 *r2 = bpf2mips32[BPF_REG_2]; /* Mapped to a2-a3 */
531 	int exclude, k;
532 	u32 addr = 0;
533 
534 	/* Push caller-saved registers on stack */
535 	push_regs(ctx, ctx->clobbered & JIT_CALLER_REGS,
536 		  0, JIT_RESERVED_STACK);
537 
538 	/* Put 64-bit arguments 1 and 2 in registers a0-a3 */
539 	for (k = 0; k < 2; k++) {
540 		emit(ctx, move, MIPS_R_T9, src[k]);
541 		emit(ctx, move, r1[k], dst[k]);
542 		emit(ctx, move, r2[k], MIPS_R_T9);
543 	}
544 
545 	/* Emit function call */
546 	switch (BPF_OP(op)) {
547 	/* dst = dst / src */
548 	case BPF_DIV:
549 		addr = (u32)&div64_u64;
550 		break;
551 	/* dst = dst % src */
552 	case BPF_MOD:
553 		addr = (u32)&jit_mod64;
554 		break;
555 	}
556 	emit_mov_i(ctx, MIPS_R_T9, addr);
557 	emit(ctx, jalr, MIPS_R_RA, MIPS_R_T9);
558 	emit(ctx, nop); /* Delay slot */
559 
560 	/* Store the 64-bit result in dst */
561 	emit(ctx, move, dst[0], r0[0]);
562 	emit(ctx, move, dst[1], r0[1]);
563 
564 	/* Restore caller-saved registers, excluding the computed result */
565 	exclude = BIT(lo(dst)) | BIT(hi(dst));
566 	pop_regs(ctx, ctx->clobbered & JIT_CALLER_REGS,
567 		 exclude, JIT_RESERVED_STACK);
568 	emit_load_delay(ctx);
569 
570 	clobber_reg64(ctx, dst);
571 	clobber_reg(ctx, MIPS_R_V0);
572 	clobber_reg(ctx, MIPS_R_V1);
573 	clobber_reg(ctx, MIPS_R_RA);
574 }
575 
576 /* Swap bytes in a register word */
577 static void emit_swap8_r(struct jit_context *ctx, u8 dst, u8 src, u8 mask)
578 {
579 	u8 tmp = MIPS_R_T9;
580 
581 	emit(ctx, and, tmp, src, mask); /* tmp = src & 0x00ff00ff */
582 	emit(ctx, sll, tmp, tmp, 8);    /* tmp = tmp << 8         */
583 	emit(ctx, srl, dst, src, 8);    /* dst = src >> 8         */
584 	emit(ctx, and, dst, dst, mask); /* dst = dst & 0x00ff00ff */
585 	emit(ctx, or,  dst, dst, tmp);  /* dst = dst | tmp        */
586 }
587 
588 /* Swap half words in a register word */
589 static void emit_swap16_r(struct jit_context *ctx, u8 dst, u8 src)
590 {
591 	u8 tmp = MIPS_R_T9;
592 
593 	emit(ctx, sll, tmp, src, 16);  /* tmp = src << 16 */
594 	emit(ctx, srl, dst, src, 16);  /* dst = src >> 16 */
595 	emit(ctx, or,  dst, dst, tmp); /* dst = dst | tmp */
596 }
597 
598 /* Swap bytes and truncate a register double word, word or half word */
599 static void emit_bswap_r64(struct jit_context *ctx, const u8 dst[], u32 width)
600 {
601 	u8 tmp = MIPS_R_T8;
602 
603 	switch (width) {
604 	/* Swap bytes in a double word */
605 	case 64:
606 		if (cpu_has_mips32r2 || cpu_has_mips32r6) {
607 			emit(ctx, rotr, tmp, hi(dst), 16);
608 			emit(ctx, rotr, hi(dst), lo(dst), 16);
609 			emit(ctx, wsbh, lo(dst), tmp);
610 			emit(ctx, wsbh, hi(dst), hi(dst));
611 		} else {
612 			emit_swap16_r(ctx, tmp, lo(dst));
613 			emit_swap16_r(ctx, lo(dst), hi(dst));
614 			emit(ctx, move, hi(dst), tmp);
615 
616 			emit(ctx, lui, tmp, 0xff);      /* tmp = 0x00ff0000 */
617 			emit(ctx, ori, tmp, tmp, 0xff); /* tmp = 0x00ff00ff */
618 			emit_swap8_r(ctx, lo(dst), lo(dst), tmp);
619 			emit_swap8_r(ctx, hi(dst), hi(dst), tmp);
620 		}
621 		break;
622 	/* Swap bytes in a word */
623 	/* Swap bytes in a half word */
624 	case 32:
625 	case 16:
626 		emit_bswap_r(ctx, lo(dst), width);
627 		emit(ctx, move, hi(dst), MIPS_R_ZERO);
628 		break;
629 	}
630 	clobber_reg64(ctx, dst);
631 }
632 
633 /* Truncate a register double word, word or half word */
634 static void emit_trunc_r64(struct jit_context *ctx, const u8 dst[], u32 width)
635 {
636 	switch (width) {
637 	case 64:
638 		break;
639 	/* Zero-extend a word */
640 	case 32:
641 		emit(ctx, move, hi(dst), MIPS_R_ZERO);
642 		clobber_reg(ctx, hi(dst));
643 		break;
644 	/* Zero-extend a half word */
645 	case 16:
646 		emit(ctx, move, hi(dst), MIPS_R_ZERO);
647 		emit(ctx, andi, lo(dst), lo(dst), 0xffff);
648 		clobber_reg64(ctx, dst);
649 		break;
650 	}
651 }
652 
653 /* Load operation: dst = *(size*)(src + off) */
654 static void emit_ldx(struct jit_context *ctx,
655 		     const u8 dst[], u8 src, s16 off, u8 size)
656 {
657 	switch (size) {
658 	/* Load a byte */
659 	case BPF_B:
660 		emit(ctx, lbu, lo(dst), off, src);
661 		emit(ctx, move, hi(dst), MIPS_R_ZERO);
662 		break;
663 	/* Load a half word */
664 	case BPF_H:
665 		emit(ctx, lhu, lo(dst), off, src);
666 		emit(ctx, move, hi(dst), MIPS_R_ZERO);
667 		break;
668 	/* Load a word */
669 	case BPF_W:
670 		emit(ctx, lw, lo(dst), off, src);
671 		emit(ctx, move, hi(dst), MIPS_R_ZERO);
672 		break;
673 	/* Load a double word */
674 	case BPF_DW:
675 		if (dst[1] == src) {
676 			emit(ctx, lw, dst[0], off + 4, src);
677 			emit(ctx, lw, dst[1], off, src);
678 		} else {
679 			emit(ctx, lw, dst[1], off, src);
680 			emit(ctx, lw, dst[0], off + 4, src);
681 		}
682 		emit_load_delay(ctx);
683 		break;
684 	}
685 	clobber_reg64(ctx, dst);
686 }
687 
688 /* Store operation: *(size *)(dst + off) = src */
689 static void emit_stx(struct jit_context *ctx,
690 		     const u8 dst, const u8 src[], s16 off, u8 size)
691 {
692 	switch (size) {
693 	/* Store a byte */
694 	case BPF_B:
695 		emit(ctx, sb, lo(src), off, dst);
696 		break;
697 	/* Store a half word */
698 	case BPF_H:
699 		emit(ctx, sh, lo(src), off, dst);
700 		break;
701 	/* Store a word */
702 	case BPF_W:
703 		emit(ctx, sw, lo(src), off, dst);
704 		break;
705 	/* Store a double word */
706 	case BPF_DW:
707 		emit(ctx, sw, src[1], off, dst);
708 		emit(ctx, sw, src[0], off + 4, dst);
709 		break;
710 	}
711 }
712 
713 /* Atomic read-modify-write (32-bit, non-ll/sc fallback) */
714 static void emit_atomic_r32(struct jit_context *ctx,
715 			    u8 dst, u8 src, s16 off, u8 code)
716 {
717 	u32 exclude = 0;
718 	u32 addr = 0;
719 
720 	/* Push caller-saved registers on stack */
721 	push_regs(ctx, ctx->clobbered & JIT_CALLER_REGS,
722 		  0, JIT_RESERVED_STACK);
723 	/*
724 	 * Argument 1: dst+off if xchg, otherwise src, passed in register a0
725 	 * Argument 2: src if xchg, otherwise dst+off, passed in register a1
726 	 */
727 	emit(ctx, move, MIPS_R_T9, dst);
728 	if (code == BPF_XCHG) {
729 		emit(ctx, move, MIPS_R_A1, src);
730 		emit(ctx, addiu, MIPS_R_A0, MIPS_R_T9, off);
731 	} else {
732 		emit(ctx, move, MIPS_R_A0, src);
733 		emit(ctx, addiu, MIPS_R_A1, MIPS_R_T9, off);
734 	}
735 
736 	/* Emit function call */
737 	switch (code) {
738 	case BPF_ADD:
739 		addr = (u32)&atomic_add;
740 		break;
741 	case BPF_ADD | BPF_FETCH:
742 		addr = (u32)&atomic_fetch_add;
743 		break;
744 	case BPF_SUB:
745 		addr = (u32)&atomic_sub;
746 		break;
747 	case BPF_SUB | BPF_FETCH:
748 		addr = (u32)&atomic_fetch_sub;
749 		break;
750 	case BPF_OR:
751 		addr = (u32)&atomic_or;
752 		break;
753 	case BPF_OR | BPF_FETCH:
754 		addr = (u32)&atomic_fetch_or;
755 		break;
756 	case BPF_AND:
757 		addr = (u32)&atomic_and;
758 		break;
759 	case BPF_AND | BPF_FETCH:
760 		addr = (u32)&atomic_fetch_and;
761 		break;
762 	case BPF_XOR:
763 		addr = (u32)&atomic_xor;
764 		break;
765 	case BPF_XOR | BPF_FETCH:
766 		addr = (u32)&atomic_fetch_xor;
767 		break;
768 	case BPF_XCHG:
769 		addr = (u32)&atomic_xchg;
770 		break;
771 	}
772 	emit_mov_i(ctx, MIPS_R_T9, addr);
773 	emit(ctx, jalr, MIPS_R_RA, MIPS_R_T9);
774 	emit(ctx, nop); /* Delay slot */
775 
776 	/* Update src register with old value, if specified */
777 	if (code & BPF_FETCH) {
778 		emit(ctx, move, src, MIPS_R_V0);
779 		exclude = BIT(src);
780 		clobber_reg(ctx, src);
781 	}
782 
783 	/* Restore caller-saved registers, except any fetched value */
784 	pop_regs(ctx, ctx->clobbered & JIT_CALLER_REGS,
785 		 exclude, JIT_RESERVED_STACK);
786 	emit_load_delay(ctx);
787 	clobber_reg(ctx, MIPS_R_RA);
788 }
789 
790 /* Helper function for 64-bit atomic exchange */
791 static s64 jit_xchg64(s64 a, atomic64_t *v)
792 {
793 	return atomic64_xchg(v, a);
794 }
795 
796 /* Atomic read-modify-write (64-bit) */
797 static void emit_atomic_r64(struct jit_context *ctx,
798 			    u8 dst, const u8 src[], s16 off, u8 code)
799 {
800 	const u8 *r0 = bpf2mips32[BPF_REG_0]; /* Mapped to v0-v1 */
801 	const u8 *r1 = bpf2mips32[BPF_REG_1]; /* Mapped to a0-a1 */
802 	u32 exclude = 0;
803 	u32 addr = 0;
804 
805 	/* Push caller-saved registers on stack */
806 	push_regs(ctx, ctx->clobbered & JIT_CALLER_REGS,
807 		  0, JIT_RESERVED_STACK);
808 	/*
809 	 * Argument 1: 64-bit src, passed in registers a0-a1
810 	 * Argument 2: 32-bit dst+off, passed in register a2
811 	 */
812 	emit(ctx, move, MIPS_R_T9, dst);
813 	emit(ctx, move, r1[0], src[0]);
814 	emit(ctx, move, r1[1], src[1]);
815 	emit(ctx, addiu, MIPS_R_A2, MIPS_R_T9, off);
816 
817 	/* Emit function call */
818 	switch (code) {
819 	case BPF_ADD:
820 		addr = (u32)&atomic64_add;
821 		break;
822 	case BPF_ADD | BPF_FETCH:
823 		addr = (u32)&atomic64_fetch_add;
824 		break;
825 	case BPF_SUB:
826 		addr = (u32)&atomic64_sub;
827 		break;
828 	case BPF_SUB | BPF_FETCH:
829 		addr = (u32)&atomic64_fetch_sub;
830 		break;
831 	case BPF_OR:
832 		addr = (u32)&atomic64_or;
833 		break;
834 	case BPF_OR | BPF_FETCH:
835 		addr = (u32)&atomic64_fetch_or;
836 		break;
837 	case BPF_AND:
838 		addr = (u32)&atomic64_and;
839 		break;
840 	case BPF_AND | BPF_FETCH:
841 		addr = (u32)&atomic64_fetch_and;
842 		break;
843 	case BPF_XOR:
844 		addr = (u32)&atomic64_xor;
845 		break;
846 	case BPF_XOR | BPF_FETCH:
847 		addr = (u32)&atomic64_fetch_xor;
848 		break;
849 	case BPF_XCHG:
850 		addr = (u32)&jit_xchg64;
851 		break;
852 	}
853 	emit_mov_i(ctx, MIPS_R_T9, addr);
854 	emit(ctx, jalr, MIPS_R_RA, MIPS_R_T9);
855 	emit(ctx, nop); /* Delay slot */
856 
857 	/* Update src register with old value, if specified */
858 	if (code & BPF_FETCH) {
859 		emit(ctx, move, lo(src), lo(r0));
860 		emit(ctx, move, hi(src), hi(r0));
861 		exclude = BIT(src[0]) | BIT(src[1]);
862 		clobber_reg64(ctx, src);
863 	}
864 
865 	/* Restore caller-saved registers, except any fetched value */
866 	pop_regs(ctx, ctx->clobbered & JIT_CALLER_REGS,
867 		 exclude, JIT_RESERVED_STACK);
868 	emit_load_delay(ctx);
869 	clobber_reg(ctx, MIPS_R_RA);
870 }
871 
872 /* Atomic compare-and-exchange (32-bit, non-ll/sc fallback) */
873 static void emit_cmpxchg_r32(struct jit_context *ctx, u8 dst, u8 src, s16 off)
874 {
875 	const u8 *r0 = bpf2mips32[BPF_REG_0];
876 
877 	/* Push caller-saved registers on stack */
878 	push_regs(ctx, ctx->clobbered & JIT_CALLER_REGS,
879 		  JIT_RETURN_REGS, JIT_RESERVED_STACK + 2 * sizeof(u32));
880 	/*
881 	 * Argument 1: 32-bit dst+off, passed in register a0
882 	 * Argument 2: 32-bit r0, passed in register a1
883 	 * Argument 3: 32-bit src, passed in register a2
884 	 */
885 	emit(ctx, addiu, MIPS_R_T9, dst, off);
886 	emit(ctx, move, MIPS_R_T8, src);
887 	emit(ctx, move, MIPS_R_A1, lo(r0));
888 	emit(ctx, move, MIPS_R_A0, MIPS_R_T9);
889 	emit(ctx, move, MIPS_R_A2, MIPS_R_T8);
890 
891 	/* Emit function call */
892 	emit_mov_i(ctx, MIPS_R_T9, (u32)&atomic_cmpxchg);
893 	emit(ctx, jalr, MIPS_R_RA, MIPS_R_T9);
894 	emit(ctx, nop); /* Delay slot */
895 
896 #ifdef __BIG_ENDIAN
897 	emit(ctx, move, lo(r0), MIPS_R_V0);
898 #endif
899 	/* Restore caller-saved registers, except the return value */
900 	pop_regs(ctx, ctx->clobbered & JIT_CALLER_REGS,
901 		 JIT_RETURN_REGS, JIT_RESERVED_STACK + 2 * sizeof(u32));
902 	emit_load_delay(ctx);
903 	clobber_reg(ctx, MIPS_R_V0);
904 	clobber_reg(ctx, MIPS_R_V1);
905 	clobber_reg(ctx, MIPS_R_RA);
906 }
907 
908 /* Atomic compare-and-exchange (64-bit) */
909 static void emit_cmpxchg_r64(struct jit_context *ctx,
910 			     u8 dst, const u8 src[], s16 off)
911 {
912 	const u8 *r0 = bpf2mips32[BPF_REG_0];
913 	const u8 *r2 = bpf2mips32[BPF_REG_2];
914 
915 	/* Push caller-saved registers on stack */
916 	push_regs(ctx, ctx->clobbered & JIT_CALLER_REGS,
917 		  JIT_RETURN_REGS, JIT_RESERVED_STACK + 2 * sizeof(u32));
918 	/*
919 	 * Argument 1: 32-bit dst+off, passed in register a0 (a1 unused)
920 	 * Argument 2: 64-bit r0, passed in registers a2-a3
921 	 * Argument 3: 64-bit src, passed on stack
922 	 */
923 	push_regs(ctx, BIT(src[0]) | BIT(src[1]), 0, JIT_RESERVED_STACK);
924 	emit(ctx, addiu, MIPS_R_T9, dst, off);
925 	emit(ctx, move, r2[0], r0[0]);
926 	emit(ctx, move, r2[1], r0[1]);
927 	emit(ctx, move, MIPS_R_A0, MIPS_R_T9);
928 
929 	/* Emit function call */
930 	emit_mov_i(ctx, MIPS_R_T9, (u32)&atomic64_cmpxchg);
931 	emit(ctx, jalr, MIPS_R_RA, MIPS_R_T9);
932 	emit(ctx, nop); /* Delay slot */
933 
934 	/* Restore caller-saved registers, except the return value */
935 	pop_regs(ctx, ctx->clobbered & JIT_CALLER_REGS,
936 		 JIT_RETURN_REGS, JIT_RESERVED_STACK + 2 * sizeof(u32));
937 	emit_load_delay(ctx);
938 	clobber_reg(ctx, MIPS_R_V0);
939 	clobber_reg(ctx, MIPS_R_V1);
940 	clobber_reg(ctx, MIPS_R_RA);
941 }
942 
943 /*
944  * Conditional movz or an emulated equivalent.
945  * Note that the rs register may be modified.
946  */
947 static void emit_movz_r(struct jit_context *ctx, u8 rd, u8 rs, u8 rt)
948 {
949 	if (cpu_has_mips_2) {
950 		emit(ctx, movz, rd, rs, rt);           /* rd = rt ? rd : rs  */
951 	} else if (cpu_has_mips32r6) {
952 		if (rs != MIPS_R_ZERO)
953 			emit(ctx, seleqz, rs, rs, rt); /* rs = 0 if rt == 0  */
954 		emit(ctx, selnez, rd, rd, rt);         /* rd = 0 if rt != 0  */
955 		if (rs != MIPS_R_ZERO)
956 			emit(ctx, or, rd, rd, rs);     /* rd = rd | rs       */
957 	} else {
958 		emit(ctx, bnez, rt, 8);                /* PC += 8 if rd != 0 */
959 		emit(ctx, nop);                        /* +0: delay slot     */
960 		emit(ctx, or, rd, rs, MIPS_R_ZERO);    /* +4: rd = rs        */
961 	}
962 	clobber_reg(ctx, rd);
963 	clobber_reg(ctx, rs);
964 }
965 
966 /*
967  * Conditional movn or an emulated equivalent.
968  * Note that the rs register may be modified.
969  */
970 static void emit_movn_r(struct jit_context *ctx, u8 rd, u8 rs, u8 rt)
971 {
972 	if (cpu_has_mips_2) {
973 		emit(ctx, movn, rd, rs, rt);           /* rd = rt ? rs : rd  */
974 	} else if (cpu_has_mips32r6) {
975 		if (rs != MIPS_R_ZERO)
976 			emit(ctx, selnez, rs, rs, rt); /* rs = 0 if rt == 0  */
977 		emit(ctx, seleqz, rd, rd, rt);         /* rd = 0 if rt != 0  */
978 		if (rs != MIPS_R_ZERO)
979 			emit(ctx, or, rd, rd, rs);     /* rd = rd | rs       */
980 	} else {
981 		emit(ctx, beqz, rt, 8);                /* PC += 8 if rd == 0 */
982 		emit(ctx, nop);                        /* +0: delay slot     */
983 		emit(ctx, or, rd, rs, MIPS_R_ZERO);    /* +4: rd = rs        */
984 	}
985 	clobber_reg(ctx, rd);
986 	clobber_reg(ctx, rs);
987 }
988 
989 /* Emulation of 64-bit sltiu rd, rs, imm, where imm may be S32_MAX + 1 */
990 static void emit_sltiu_r64(struct jit_context *ctx, u8 rd,
991 			   const u8 rs[], s64 imm)
992 {
993 	u8 tmp = MIPS_R_T9;
994 
995 	if (imm < 0) {
996 		emit_mov_i(ctx, rd, imm);                 /* rd = imm        */
997 		emit(ctx, sltu, rd, lo(rs), rd);          /* rd = rsl < rd   */
998 		emit(ctx, sltiu, tmp, hi(rs), -1);        /* tmp = rsh < ~0U */
999 		emit(ctx, or, rd, rd, tmp);               /* rd = rd | tmp   */
1000 	} else { /* imm >= 0 */
1001 		if (imm > 0x7fff) {
1002 			emit_mov_i(ctx, rd, (s32)imm);     /* rd = imm       */
1003 			emit(ctx, sltu, rd, lo(rs), rd);   /* rd = rsl < rd  */
1004 		} else {
1005 			emit(ctx, sltiu, rd, lo(rs), imm); /* rd = rsl < imm */
1006 		}
1007 		emit_movn_r(ctx, rd, MIPS_R_ZERO, hi(rs)); /* rd = 0 if rsh  */
1008 	}
1009 }
1010 
1011 /* Emulation of 64-bit sltu rd, rs, rt */
1012 static void emit_sltu_r64(struct jit_context *ctx, u8 rd,
1013 			  const u8 rs[], const u8 rt[])
1014 {
1015 	u8 tmp = MIPS_R_T9;
1016 
1017 	emit(ctx, sltu, rd, lo(rs), lo(rt));           /* rd = rsl < rtl     */
1018 	emit(ctx, subu, tmp, hi(rs), hi(rt));          /* tmp = rsh - rth    */
1019 	emit_movn_r(ctx, rd, MIPS_R_ZERO, tmp);        /* rd = 0 if tmp != 0 */
1020 	emit(ctx, sltu, tmp, hi(rs), hi(rt));          /* tmp = rsh < rth    */
1021 	emit(ctx, or, rd, rd, tmp);                    /* rd = rd | tmp      */
1022 }
1023 
1024 /* Emulation of 64-bit slti rd, rs, imm, where imm may be S32_MAX + 1 */
1025 static void emit_slti_r64(struct jit_context *ctx, u8 rd,
1026 			  const u8 rs[], s64 imm)
1027 {
1028 	u8 t1 = MIPS_R_T8;
1029 	u8 t2 = MIPS_R_T9;
1030 	u8 cmp;
1031 
1032 	/*
1033 	 * if ((rs < 0) ^ (imm < 0)) t1 = imm >u rsl
1034 	 * else                      t1 = rsl <u imm
1035 	 */
1036 	emit_mov_i(ctx, rd, (s32)imm);
1037 	emit(ctx, sltu, t1, lo(rs), rd);               /* t1 = rsl <u imm   */
1038 	emit(ctx, sltu, t2, rd, lo(rs));               /* t2 = imm <u rsl   */
1039 	emit(ctx, srl, rd, hi(rs), 31);                /* rd = rsh >> 31    */
1040 	if (imm < 0)
1041 		emit_movz_r(ctx, t1, t2, rd);          /* t1 = rd ? t1 : t2 */
1042 	else
1043 		emit_movn_r(ctx, t1, t2, rd);          /* t1 = rd ? t2 : t1 */
1044 	/*
1045 	 * if ((imm < 0 && rsh != 0xffffffff) ||
1046 	 *     (imm >= 0 && rsh != 0))
1047 	 *      t1 = 0
1048 	 */
1049 	if (imm < 0) {
1050 		emit(ctx, addiu, rd, hi(rs), 1);       /* rd = rsh + 1 */
1051 		cmp = rd;
1052 	} else { /* imm >= 0 */
1053 		cmp = hi(rs);
1054 	}
1055 	emit_movn_r(ctx, t1, MIPS_R_ZERO, cmp);        /* t1 = 0 if cmp != 0 */
1056 
1057 	/*
1058 	 * if (imm < 0) rd = rsh < -1
1059 	 * else         rd = rsh != 0
1060 	 * rd = rd | t1
1061 	 */
1062 	emit(ctx, slti, rd, hi(rs), imm < 0 ? -1 : 0); /* rd = rsh < hi(imm) */
1063 	emit(ctx, or, rd, rd, t1);                     /* rd = rd | t1       */
1064 }
1065 
1066 /* Emulation of 64-bit(slt rd, rs, rt) */
1067 static void emit_slt_r64(struct jit_context *ctx, u8 rd,
1068 			 const u8 rs[], const u8 rt[])
1069 {
1070 	u8 t1 = MIPS_R_T7;
1071 	u8 t2 = MIPS_R_T8;
1072 	u8 t3 = MIPS_R_T9;
1073 
1074 	/*
1075 	 * if ((rs < 0) ^ (rt < 0)) t1 = rtl <u rsl
1076 	 * else                     t1 = rsl <u rtl
1077 	 * if (rsh == rth)          t1 = 0
1078 	 */
1079 	emit(ctx, sltu, t1, lo(rs), lo(rt));           /* t1 = rsl <u rtl   */
1080 	emit(ctx, sltu, t2, lo(rt), lo(rs));           /* t2 = rtl <u rsl   */
1081 	emit(ctx, xor, t3, hi(rs), hi(rt));            /* t3 = rlh ^ rth    */
1082 	emit(ctx, srl, rd, t3, 31);                    /* rd = t3 >> 31     */
1083 	emit_movn_r(ctx, t1, t2, rd);                  /* t1 = rd ? t2 : t1 */
1084 	emit_movn_r(ctx, t1, MIPS_R_ZERO, t3);         /* t1 = 0 if t3 != 0 */
1085 
1086 	/* rd = (rsh < rth) | t1 */
1087 	emit(ctx, slt, rd, hi(rs), hi(rt));            /* rd = rsh <s rth   */
1088 	emit(ctx, or, rd, rd, t1);                     /* rd = rd | t1      */
1089 }
1090 
1091 /* Jump immediate (64-bit) */
1092 static void emit_jmp_i64(struct jit_context *ctx,
1093 			 const u8 dst[], s32 imm, s32 off, u8 op)
1094 {
1095 	u8 tmp = MIPS_R_T6;
1096 
1097 	switch (op) {
1098 	/* No-op, used internally for branch optimization */
1099 	case JIT_JNOP:
1100 		break;
1101 	/* PC += off if dst == imm */
1102 	/* PC += off if dst != imm */
1103 	case BPF_JEQ:
1104 	case BPF_JNE:
1105 		if (imm >= -0x7fff && imm <= 0x8000) {
1106 			emit(ctx, addiu, tmp, lo(dst), -imm);
1107 		} else if ((u32)imm <= 0xffff) {
1108 			emit(ctx, xori, tmp, lo(dst), imm);
1109 		} else {       /* Register fallback */
1110 			emit_mov_i(ctx, tmp, imm);
1111 			emit(ctx, xor, tmp, lo(dst), tmp);
1112 		}
1113 		if (imm < 0) { /* Compare sign extension */
1114 			emit(ctx, addu, MIPS_R_T9, hi(dst), 1);
1115 			emit(ctx, or, tmp, tmp, MIPS_R_T9);
1116 		} else {       /* Compare zero extension */
1117 			emit(ctx, or, tmp, tmp, hi(dst));
1118 		}
1119 		if (op == BPF_JEQ)
1120 			emit(ctx, beqz, tmp, off);
1121 		else   /* BPF_JNE */
1122 			emit(ctx, bnez, tmp, off);
1123 		break;
1124 	/* PC += off if dst & imm */
1125 	/* PC += off if (dst & imm) == 0 (not in BPF, used for long jumps) */
1126 	case BPF_JSET:
1127 	case JIT_JNSET:
1128 		if ((u32)imm <= 0xffff) {
1129 			emit(ctx, andi, tmp, lo(dst), imm);
1130 		} else {     /* Register fallback */
1131 			emit_mov_i(ctx, tmp, imm);
1132 			emit(ctx, and, tmp, lo(dst), tmp);
1133 		}
1134 		if (imm < 0) /* Sign-extension pulls in high word */
1135 			emit(ctx, or, tmp, tmp, hi(dst));
1136 		if (op == BPF_JSET)
1137 			emit(ctx, bnez, tmp, off);
1138 		else   /* JIT_JNSET */
1139 			emit(ctx, beqz, tmp, off);
1140 		break;
1141 	/* PC += off if dst > imm */
1142 	case BPF_JGT:
1143 		emit_sltiu_r64(ctx, tmp, dst, (s64)imm + 1);
1144 		emit(ctx, beqz, tmp, off);
1145 		break;
1146 	/* PC += off if dst >= imm */
1147 	case BPF_JGE:
1148 		emit_sltiu_r64(ctx, tmp, dst, imm);
1149 		emit(ctx, beqz, tmp, off);
1150 		break;
1151 	/* PC += off if dst < imm */
1152 	case BPF_JLT:
1153 		emit_sltiu_r64(ctx, tmp, dst, imm);
1154 		emit(ctx, bnez, tmp, off);
1155 		break;
1156 	/* PC += off if dst <= imm */
1157 	case BPF_JLE:
1158 		emit_sltiu_r64(ctx, tmp, dst, (s64)imm + 1);
1159 		emit(ctx, bnez, tmp, off);
1160 		break;
1161 	/* PC += off if dst > imm (signed) */
1162 	case BPF_JSGT:
1163 		emit_slti_r64(ctx, tmp, dst, (s64)imm + 1);
1164 		emit(ctx, beqz, tmp, off);
1165 		break;
1166 	/* PC += off if dst >= imm (signed) */
1167 	case BPF_JSGE:
1168 		emit_slti_r64(ctx, tmp, dst, imm);
1169 		emit(ctx, beqz, tmp, off);
1170 		break;
1171 	/* PC += off if dst < imm (signed) */
1172 	case BPF_JSLT:
1173 		emit_slti_r64(ctx, tmp, dst, imm);
1174 		emit(ctx, bnez, tmp, off);
1175 		break;
1176 	/* PC += off if dst <= imm (signed) */
1177 	case BPF_JSLE:
1178 		emit_slti_r64(ctx, tmp, dst, (s64)imm + 1);
1179 		emit(ctx, bnez, tmp, off);
1180 		break;
1181 	}
1182 }
1183 
1184 /* Jump register (64-bit) */
1185 static void emit_jmp_r64(struct jit_context *ctx,
1186 			 const u8 dst[], const u8 src[], s32 off, u8 op)
1187 {
1188 	u8 t1 = MIPS_R_T6;
1189 	u8 t2 = MIPS_R_T7;
1190 
1191 	switch (op) {
1192 	/* No-op, used internally for branch optimization */
1193 	case JIT_JNOP:
1194 		break;
1195 	/* PC += off if dst == src */
1196 	/* PC += off if dst != src */
1197 	case BPF_JEQ:
1198 	case BPF_JNE:
1199 		emit(ctx, subu, t1, lo(dst), lo(src));
1200 		emit(ctx, subu, t2, hi(dst), hi(src));
1201 		emit(ctx, or, t1, t1, t2);
1202 		if (op == BPF_JEQ)
1203 			emit(ctx, beqz, t1, off);
1204 		else   /* BPF_JNE */
1205 			emit(ctx, bnez, t1, off);
1206 		break;
1207 	/* PC += off if dst & src */
1208 	/* PC += off if (dst & imm) == 0 (not in BPF, used for long jumps) */
1209 	case BPF_JSET:
1210 	case JIT_JNSET:
1211 		emit(ctx, and, t1, lo(dst), lo(src));
1212 		emit(ctx, and, t2, hi(dst), hi(src));
1213 		emit(ctx, or, t1, t1, t2);
1214 		if (op == BPF_JSET)
1215 			emit(ctx, bnez, t1, off);
1216 		else   /* JIT_JNSET */
1217 			emit(ctx, beqz, t1, off);
1218 		break;
1219 	/* PC += off if dst > src */
1220 	case BPF_JGT:
1221 		emit_sltu_r64(ctx, t1, src, dst);
1222 		emit(ctx, bnez, t1, off);
1223 		break;
1224 	/* PC += off if dst >= src */
1225 	case BPF_JGE:
1226 		emit_sltu_r64(ctx, t1, dst, src);
1227 		emit(ctx, beqz, t1, off);
1228 		break;
1229 	/* PC += off if dst < src */
1230 	case BPF_JLT:
1231 		emit_sltu_r64(ctx, t1, dst, src);
1232 		emit(ctx, bnez, t1, off);
1233 		break;
1234 	/* PC += off if dst <= src */
1235 	case BPF_JLE:
1236 		emit_sltu_r64(ctx, t1, src, dst);
1237 		emit(ctx, beqz, t1, off);
1238 		break;
1239 	/* PC += off if dst > src (signed) */
1240 	case BPF_JSGT:
1241 		emit_slt_r64(ctx, t1, src, dst);
1242 		emit(ctx, bnez, t1, off);
1243 		break;
1244 	/* PC += off if dst >= src (signed) */
1245 	case BPF_JSGE:
1246 		emit_slt_r64(ctx, t1, dst, src);
1247 		emit(ctx, beqz, t1, off);
1248 		break;
1249 	/* PC += off if dst < src (signed) */
1250 	case BPF_JSLT:
1251 		emit_slt_r64(ctx, t1, dst, src);
1252 		emit(ctx, bnez, t1, off);
1253 		break;
1254 	/* PC += off if dst <= src (signed) */
1255 	case BPF_JSLE:
1256 		emit_slt_r64(ctx, t1, src, dst);
1257 		emit(ctx, beqz, t1, off);
1258 		break;
1259 	}
1260 }
1261 
1262 /* Function call */
1263 static int emit_call(struct jit_context *ctx, const struct bpf_insn *insn)
1264 {
1265 	bool fixed;
1266 	u64 addr;
1267 
1268 	/* Decode the call address */
1269 	if (bpf_jit_get_func_addr(ctx->program, insn, false,
1270 				  &addr, &fixed) < 0)
1271 		return -1;
1272 	if (!fixed)
1273 		return -1;
1274 
1275 	/* Push stack arguments */
1276 	push_regs(ctx, JIT_STACK_REGS, 0, JIT_RESERVED_STACK);
1277 
1278 	/* Emit function call */
1279 	emit_mov_i(ctx, MIPS_R_T9, addr);
1280 	emit(ctx, jalr, MIPS_R_RA, MIPS_R_T9);
1281 	emit(ctx, nop); /* Delay slot */
1282 
1283 	clobber_reg(ctx, MIPS_R_RA);
1284 	clobber_reg(ctx, MIPS_R_V0);
1285 	clobber_reg(ctx, MIPS_R_V1);
1286 	return 0;
1287 }
1288 
1289 /* Function tail call */
1290 static int emit_tail_call(struct jit_context *ctx)
1291 {
1292 	u8 ary = lo(bpf2mips32[BPF_REG_2]);
1293 	u8 ind = lo(bpf2mips32[BPF_REG_3]);
1294 	u8 t1 = MIPS_R_T8;
1295 	u8 t2 = MIPS_R_T9;
1296 	int off;
1297 
1298 	/*
1299 	 * Tail call:
1300 	 * eBPF R1   - function argument (context ptr), passed in a0-a1
1301 	 * eBPF R2   - ptr to object with array of function entry points
1302 	 * eBPF R3   - array index of function to be called
1303 	 * stack[sz] - remaining tail call count, initialized in prologue
1304 	 */
1305 
1306 	/* if (ind >= ary->map.max_entries) goto out */
1307 	off = offsetof(struct bpf_array, map.max_entries);
1308 	if (off > 0x7fff)
1309 		return -1;
1310 	emit(ctx, lw, t1, off, ary);             /* t1 = ary->map.max_entries*/
1311 	emit_load_delay(ctx);                    /* Load delay slot          */
1312 	emit(ctx, sltu, t1, ind, t1);            /* t1 = ind < t1            */
1313 	emit(ctx, beqz, t1, get_offset(ctx, 1)); /* PC += off(1) if t1 == 0  */
1314 						 /* (next insn delay slot)   */
1315 	/* if (TCC-- <= 0) goto out */
1316 	emit(ctx, lw, t2, ctx->stack_size, MIPS_R_SP);  /* t2 = *(SP + size) */
1317 	emit_load_delay(ctx);                     /* Load delay slot         */
1318 	emit(ctx, blez, t2, get_offset(ctx, 1));  /* PC += off(1) if t2 <= 0 */
1319 	emit(ctx, addiu, t2, t2, -1);             /* t2-- (delay slot)       */
1320 	emit(ctx, sw, t2, ctx->stack_size, MIPS_R_SP);  /* *(SP + size) = t2 */
1321 
1322 	/* prog = ary->ptrs[ind] */
1323 	off = offsetof(struct bpf_array, ptrs);
1324 	if (off > 0x7fff)
1325 		return -1;
1326 	emit(ctx, sll, t1, ind, 2);               /* t1 = ind << 2           */
1327 	emit(ctx, addu, t1, t1, ary);             /* t1 += ary               */
1328 	emit(ctx, lw, t2, off, t1);               /* t2 = *(t1 + off)        */
1329 	emit_load_delay(ctx);                     /* Load delay slot         */
1330 
1331 	/* if (prog == 0) goto out */
1332 	emit(ctx, beqz, t2, get_offset(ctx, 1));  /* PC += off(1) if t2 == 0 */
1333 	emit(ctx, nop);                           /* Delay slot              */
1334 
1335 	/* func = prog->bpf_func + 8 (prologue skip offset) */
1336 	off = offsetof(struct bpf_prog, bpf_func);
1337 	if (off > 0x7fff)
1338 		return -1;
1339 	emit(ctx, lw, t1, off, t2);                /* t1 = *(t2 + off)       */
1340 	emit_load_delay(ctx);                      /* Load delay slot        */
1341 	emit(ctx, addiu, t1, t1, JIT_TCALL_SKIP);  /* t1 += skip (8 or 12)   */
1342 
1343 	/* goto func */
1344 	build_epilogue(ctx, t1);
1345 	return 0;
1346 }
1347 
1348 /*
1349  * Stack frame layout for a JITed program (stack grows down).
1350  *
1351  * Higher address  : Caller's stack frame       :
1352  *                 :----------------------------:
1353  *                 : 64-bit eBPF args r3-r5     :
1354  *                 :----------------------------:
1355  *                 : Reserved / tail call count :
1356  *                 +============================+  <--- MIPS sp before call
1357  *                 | Callee-saved registers,    |
1358  *                 | including RA and FP        |
1359  *                 +----------------------------+  <--- eBPF FP (MIPS zero,fp)
1360  *                 | Local eBPF variables       |
1361  *                 | allocated by program       |
1362  *                 +----------------------------+
1363  *                 | Reserved for caller-saved  |
1364  *                 | registers                  |
1365  *                 +----------------------------+
1366  *                 | Reserved for 64-bit eBPF   |
1367  *                 | args r3-r5 & args passed   |
1368  *                 | on stack in kernel calls   |
1369  * Lower address   +============================+  <--- MIPS sp
1370  */
1371 
1372 /* Build program prologue to set up the stack and registers */
1373 void build_prologue(struct jit_context *ctx)
1374 {
1375 	const u8 *r1 = bpf2mips32[BPF_REG_1];
1376 	const u8 *fp = bpf2mips32[BPF_REG_FP];
1377 	int stack, saved, locals, reserved;
1378 
1379 	/*
1380 	 * The first two instructions initialize TCC in the reserved (for us)
1381 	 * 16-byte area in the parent's stack frame. On a tail call, the
1382 	 * calling function jumps into the prologue after these instructions.
1383 	 */
1384 	emit(ctx, ori, MIPS_R_T9, MIPS_R_ZERO, min(MAX_TAIL_CALL_CNT, 0xffff));
1385 	emit(ctx, sw, MIPS_R_T9, 0, MIPS_R_SP);
1386 
1387 	/*
1388 	 * Register eBPF R1 contains the 32-bit context pointer argument.
1389 	 * A 32-bit argument is always passed in MIPS register a0, regardless
1390 	 * of CPU endianness. Initialize R1 accordingly and zero-extend.
1391 	 */
1392 #ifdef __BIG_ENDIAN
1393 	emit(ctx, move, lo(r1), MIPS_R_A0);
1394 #endif
1395 
1396 	/* === Entry-point for tail calls === */
1397 
1398 	/* Zero-extend the 32-bit argument */
1399 	emit(ctx, move, hi(r1), MIPS_R_ZERO);
1400 
1401 	/* If the eBPF frame pointer was accessed it must be saved */
1402 	if (ctx->accessed & BIT(BPF_REG_FP))
1403 		clobber_reg64(ctx, fp);
1404 
1405 	/* Compute the stack space needed for callee-saved registers */
1406 	saved = hweight32(ctx->clobbered & JIT_CALLEE_REGS) * sizeof(u32);
1407 	saved = ALIGN(saved, MIPS_STACK_ALIGNMENT);
1408 
1409 	/* Stack space used by eBPF program local data */
1410 	locals = ALIGN(ctx->program->aux->stack_depth, MIPS_STACK_ALIGNMENT);
1411 
1412 	/*
1413 	 * If we are emitting function calls, reserve extra stack space for
1414 	 * caller-saved registers and function arguments passed on the stack.
1415 	 * The required space is computed automatically during resource
1416 	 * usage discovery (pass 1).
1417 	 */
1418 	reserved = ctx->stack_used;
1419 
1420 	/* Allocate the stack frame */
1421 	stack = ALIGN(saved + locals + reserved, MIPS_STACK_ALIGNMENT);
1422 	emit(ctx, addiu, MIPS_R_SP, MIPS_R_SP, -stack);
1423 
1424 	/* Store callee-saved registers on stack */
1425 	push_regs(ctx, ctx->clobbered & JIT_CALLEE_REGS, 0, stack - saved);
1426 
1427 	/* Initialize the eBPF frame pointer if accessed */
1428 	if (ctx->accessed & BIT(BPF_REG_FP))
1429 		emit(ctx, addiu, lo(fp), MIPS_R_SP, stack - saved);
1430 
1431 	ctx->saved_size = saved;
1432 	ctx->stack_size = stack;
1433 }
1434 
1435 /* Build the program epilogue to restore the stack and registers */
1436 void build_epilogue(struct jit_context *ctx, int dest_reg)
1437 {
1438 	/* Restore callee-saved registers from stack */
1439 	pop_regs(ctx, ctx->clobbered & JIT_CALLEE_REGS, 0,
1440 		 ctx->stack_size - ctx->saved_size);
1441 	/*
1442 	 * A 32-bit return value is always passed in MIPS register v0,
1443 	 * but on big-endian targets the low part of R0 is mapped to v1.
1444 	 */
1445 #ifdef __BIG_ENDIAN
1446 	emit(ctx, move, MIPS_R_V0, MIPS_R_V1);
1447 #endif
1448 
1449 	/* Jump to the return address and adjust the stack pointer */
1450 	emit(ctx, jr, dest_reg);
1451 	emit(ctx, addiu, MIPS_R_SP, MIPS_R_SP, ctx->stack_size);
1452 }
1453 
1454 /* Build one eBPF instruction */
1455 int build_insn(const struct bpf_insn *insn, struct jit_context *ctx)
1456 {
1457 	const u8 *dst = bpf2mips32[insn->dst_reg];
1458 	const u8 *src = bpf2mips32[insn->src_reg];
1459 	const u8 *res = bpf2mips32[BPF_REG_0];
1460 	const u8 *tmp = bpf2mips32[JIT_REG_TMP];
1461 	u8 code = insn->code;
1462 	s16 off = insn->off;
1463 	s32 imm = insn->imm;
1464 	s32 val, rel;
1465 	u8 alu, jmp;
1466 
1467 	switch (code) {
1468 	/* ALU operations */
1469 	/* dst = imm */
1470 	case BPF_ALU | BPF_MOV | BPF_K:
1471 		emit_mov_i(ctx, lo(dst), imm);
1472 		emit_zext_ver(ctx, dst);
1473 		break;
1474 	/* dst = src */
1475 	case BPF_ALU | BPF_MOV | BPF_X:
1476 		if (imm == 1) {
1477 			/* Special mov32 for zext */
1478 			emit_mov_i(ctx, hi(dst), 0);
1479 		} else {
1480 			emit_mov_r(ctx, lo(dst), lo(src));
1481 			emit_zext_ver(ctx, dst);
1482 		}
1483 		break;
1484 	/* dst = -dst */
1485 	case BPF_ALU | BPF_NEG:
1486 		emit_alu_i(ctx, lo(dst), 0, BPF_NEG);
1487 		emit_zext_ver(ctx, dst);
1488 		break;
1489 	/* dst = dst & imm */
1490 	/* dst = dst | imm */
1491 	/* dst = dst ^ imm */
1492 	/* dst = dst << imm */
1493 	/* dst = dst >> imm */
1494 	/* dst = dst >> imm (arithmetic) */
1495 	/* dst = dst + imm */
1496 	/* dst = dst - imm */
1497 	/* dst = dst * imm */
1498 	/* dst = dst / imm */
1499 	/* dst = dst % imm */
1500 	case BPF_ALU | BPF_OR | BPF_K:
1501 	case BPF_ALU | BPF_AND | BPF_K:
1502 	case BPF_ALU | BPF_XOR | BPF_K:
1503 	case BPF_ALU | BPF_LSH | BPF_K:
1504 	case BPF_ALU | BPF_RSH | BPF_K:
1505 	case BPF_ALU | BPF_ARSH | BPF_K:
1506 	case BPF_ALU | BPF_ADD | BPF_K:
1507 	case BPF_ALU | BPF_SUB | BPF_K:
1508 	case BPF_ALU | BPF_MUL | BPF_K:
1509 	case BPF_ALU | BPF_DIV | BPF_K:
1510 	case BPF_ALU | BPF_MOD | BPF_K:
1511 		if (!valid_alu_i(BPF_OP(code), imm)) {
1512 			emit_mov_i(ctx, MIPS_R_T6, imm);
1513 			emit_alu_r(ctx, lo(dst), MIPS_R_T6, BPF_OP(code));
1514 		} else if (rewrite_alu_i(BPF_OP(code), imm, &alu, &val)) {
1515 			emit_alu_i(ctx, lo(dst), val, alu);
1516 		}
1517 		emit_zext_ver(ctx, dst);
1518 		break;
1519 	/* dst = dst & src */
1520 	/* dst = dst | src */
1521 	/* dst = dst ^ src */
1522 	/* dst = dst << src */
1523 	/* dst = dst >> src */
1524 	/* dst = dst >> src (arithmetic) */
1525 	/* dst = dst + src */
1526 	/* dst = dst - src */
1527 	/* dst = dst * src */
1528 	/* dst = dst / src */
1529 	/* dst = dst % src */
1530 	case BPF_ALU | BPF_AND | BPF_X:
1531 	case BPF_ALU | BPF_OR | BPF_X:
1532 	case BPF_ALU | BPF_XOR | BPF_X:
1533 	case BPF_ALU | BPF_LSH | BPF_X:
1534 	case BPF_ALU | BPF_RSH | BPF_X:
1535 	case BPF_ALU | BPF_ARSH | BPF_X:
1536 	case BPF_ALU | BPF_ADD | BPF_X:
1537 	case BPF_ALU | BPF_SUB | BPF_X:
1538 	case BPF_ALU | BPF_MUL | BPF_X:
1539 	case BPF_ALU | BPF_DIV | BPF_X:
1540 	case BPF_ALU | BPF_MOD | BPF_X:
1541 		emit_alu_r(ctx, lo(dst), lo(src), BPF_OP(code));
1542 		emit_zext_ver(ctx, dst);
1543 		break;
1544 	/* dst = imm (64-bit) */
1545 	case BPF_ALU64 | BPF_MOV | BPF_K:
1546 		emit_mov_se_i64(ctx, dst, imm);
1547 		break;
1548 	/* dst = src (64-bit) */
1549 	case BPF_ALU64 | BPF_MOV | BPF_X:
1550 		emit_mov_r(ctx, lo(dst), lo(src));
1551 		emit_mov_r(ctx, hi(dst), hi(src));
1552 		break;
1553 	/* dst = -dst (64-bit) */
1554 	case BPF_ALU64 | BPF_NEG:
1555 		emit_neg_i64(ctx, dst);
1556 		break;
1557 	/* dst = dst & imm (64-bit) */
1558 	case BPF_ALU64 | BPF_AND | BPF_K:
1559 		emit_alu_i64(ctx, dst, imm, BPF_OP(code));
1560 		break;
1561 	/* dst = dst | imm (64-bit) */
1562 	/* dst = dst ^ imm (64-bit) */
1563 	/* dst = dst + imm (64-bit) */
1564 	/* dst = dst - imm (64-bit) */
1565 	case BPF_ALU64 | BPF_OR | BPF_K:
1566 	case BPF_ALU64 | BPF_XOR | BPF_K:
1567 	case BPF_ALU64 | BPF_ADD | BPF_K:
1568 	case BPF_ALU64 | BPF_SUB | BPF_K:
1569 		if (imm)
1570 			emit_alu_i64(ctx, dst, imm, BPF_OP(code));
1571 		break;
1572 	/* dst = dst << imm (64-bit) */
1573 	/* dst = dst >> imm (64-bit) */
1574 	/* dst = dst >> imm (64-bit, arithmetic) */
1575 	case BPF_ALU64 | BPF_LSH | BPF_K:
1576 	case BPF_ALU64 | BPF_RSH | BPF_K:
1577 	case BPF_ALU64 | BPF_ARSH | BPF_K:
1578 		if (imm)
1579 			emit_shift_i64(ctx, dst, imm, BPF_OP(code));
1580 		break;
1581 	/* dst = dst * imm (64-bit) */
1582 	case BPF_ALU64 | BPF_MUL | BPF_K:
1583 		emit_mul_i64(ctx, dst, imm);
1584 		break;
1585 	/* dst = dst / imm (64-bit) */
1586 	/* dst = dst % imm (64-bit) */
1587 	case BPF_ALU64 | BPF_DIV | BPF_K:
1588 	case BPF_ALU64 | BPF_MOD | BPF_K:
1589 		/*
1590 		 * Sign-extend the immediate value into a temporary register,
1591 		 * and then do the operation on this register.
1592 		 */
1593 		emit_mov_se_i64(ctx, tmp, imm);
1594 		emit_divmod_r64(ctx, dst, tmp, BPF_OP(code));
1595 		break;
1596 	/* dst = dst & src (64-bit) */
1597 	/* dst = dst | src (64-bit) */
1598 	/* dst = dst ^ src (64-bit) */
1599 	/* dst = dst + src (64-bit) */
1600 	/* dst = dst - src (64-bit) */
1601 	case BPF_ALU64 | BPF_AND | BPF_X:
1602 	case BPF_ALU64 | BPF_OR | BPF_X:
1603 	case BPF_ALU64 | BPF_XOR | BPF_X:
1604 	case BPF_ALU64 | BPF_ADD | BPF_X:
1605 	case BPF_ALU64 | BPF_SUB | BPF_X:
1606 		emit_alu_r64(ctx, dst, src, BPF_OP(code));
1607 		break;
1608 	/* dst = dst << src (64-bit) */
1609 	/* dst = dst >> src (64-bit) */
1610 	/* dst = dst >> src (64-bit, arithmetic) */
1611 	case BPF_ALU64 | BPF_LSH | BPF_X:
1612 	case BPF_ALU64 | BPF_RSH | BPF_X:
1613 	case BPF_ALU64 | BPF_ARSH | BPF_X:
1614 		emit_shift_r64(ctx, dst, lo(src), BPF_OP(code));
1615 		break;
1616 	/* dst = dst * src (64-bit) */
1617 	case BPF_ALU64 | BPF_MUL | BPF_X:
1618 		emit_mul_r64(ctx, dst, src);
1619 		break;
1620 	/* dst = dst / src (64-bit) */
1621 	/* dst = dst % src (64-bit) */
1622 	case BPF_ALU64 | BPF_DIV | BPF_X:
1623 	case BPF_ALU64 | BPF_MOD | BPF_X:
1624 		emit_divmod_r64(ctx, dst, src, BPF_OP(code));
1625 		break;
1626 	/* dst = htole(dst) */
1627 	/* dst = htobe(dst) */
1628 	case BPF_ALU | BPF_END | BPF_FROM_LE:
1629 	case BPF_ALU | BPF_END | BPF_FROM_BE:
1630 		if (BPF_SRC(code) ==
1631 #ifdef __BIG_ENDIAN
1632 		    BPF_FROM_LE
1633 #else
1634 		    BPF_FROM_BE
1635 #endif
1636 		    )
1637 			emit_bswap_r64(ctx, dst, imm);
1638 		else
1639 			emit_trunc_r64(ctx, dst, imm);
1640 		break;
1641 	/* dst = imm64 */
1642 	case BPF_LD | BPF_IMM | BPF_DW:
1643 		emit_mov_i(ctx, lo(dst), imm);
1644 		emit_mov_i(ctx, hi(dst), insn[1].imm);
1645 		return 1;
1646 	/* LDX: dst = *(size *)(src + off) */
1647 	case BPF_LDX | BPF_MEM | BPF_W:
1648 	case BPF_LDX | BPF_MEM | BPF_H:
1649 	case BPF_LDX | BPF_MEM | BPF_B:
1650 	case BPF_LDX | BPF_MEM | BPF_DW:
1651 		emit_ldx(ctx, dst, lo(src), off, BPF_SIZE(code));
1652 		break;
1653 	/* ST: *(size *)(dst + off) = imm */
1654 	case BPF_ST | BPF_MEM | BPF_W:
1655 	case BPF_ST | BPF_MEM | BPF_H:
1656 	case BPF_ST | BPF_MEM | BPF_B:
1657 	case BPF_ST | BPF_MEM | BPF_DW:
1658 		switch (BPF_SIZE(code)) {
1659 		case BPF_DW:
1660 			/* Sign-extend immediate value into temporary reg */
1661 			emit_mov_se_i64(ctx, tmp, imm);
1662 			break;
1663 		case BPF_W:
1664 		case BPF_H:
1665 		case BPF_B:
1666 			emit_mov_i(ctx, lo(tmp), imm);
1667 			break;
1668 		}
1669 		emit_stx(ctx, lo(dst), tmp, off, BPF_SIZE(code));
1670 		break;
1671 	/* STX: *(size *)(dst + off) = src */
1672 	case BPF_STX | BPF_MEM | BPF_W:
1673 	case BPF_STX | BPF_MEM | BPF_H:
1674 	case BPF_STX | BPF_MEM | BPF_B:
1675 	case BPF_STX | BPF_MEM | BPF_DW:
1676 		emit_stx(ctx, lo(dst), src, off, BPF_SIZE(code));
1677 		break;
1678 	/* Speculation barrier */
1679 	case BPF_ST | BPF_NOSPEC:
1680 		break;
1681 	/* Atomics */
1682 	case BPF_STX | BPF_ATOMIC | BPF_W:
1683 		switch (imm) {
1684 		case BPF_ADD:
1685 		case BPF_ADD | BPF_FETCH:
1686 		case BPF_AND:
1687 		case BPF_AND | BPF_FETCH:
1688 		case BPF_OR:
1689 		case BPF_OR | BPF_FETCH:
1690 		case BPF_XOR:
1691 		case BPF_XOR | BPF_FETCH:
1692 		case BPF_XCHG:
1693 			if (cpu_has_llsc)
1694 				emit_atomic_r(ctx, lo(dst), lo(src), off, imm);
1695 			else /* Non-ll/sc fallback */
1696 				emit_atomic_r32(ctx, lo(dst), lo(src),
1697 						off, imm);
1698 			if (imm & BPF_FETCH)
1699 				emit_zext_ver(ctx, src);
1700 			break;
1701 		case BPF_CMPXCHG:
1702 			if (cpu_has_llsc)
1703 				emit_cmpxchg_r(ctx, lo(dst), lo(src),
1704 					       lo(res), off);
1705 			else /* Non-ll/sc fallback */
1706 				emit_cmpxchg_r32(ctx, lo(dst), lo(src), off);
1707 			/* Result zero-extension inserted by verifier */
1708 			break;
1709 		default:
1710 			goto notyet;
1711 		}
1712 		break;
1713 	/* Atomics (64-bit) */
1714 	case BPF_STX | BPF_ATOMIC | BPF_DW:
1715 		switch (imm) {
1716 		case BPF_ADD:
1717 		case BPF_ADD | BPF_FETCH:
1718 		case BPF_AND:
1719 		case BPF_AND | BPF_FETCH:
1720 		case BPF_OR:
1721 		case BPF_OR | BPF_FETCH:
1722 		case BPF_XOR:
1723 		case BPF_XOR | BPF_FETCH:
1724 		case BPF_XCHG:
1725 			emit_atomic_r64(ctx, lo(dst), src, off, imm);
1726 			break;
1727 		case BPF_CMPXCHG:
1728 			emit_cmpxchg_r64(ctx, lo(dst), src, off);
1729 			break;
1730 		default:
1731 			goto notyet;
1732 		}
1733 		break;
1734 	/* PC += off if dst == src */
1735 	/* PC += off if dst != src */
1736 	/* PC += off if dst & src */
1737 	/* PC += off if dst > src */
1738 	/* PC += off if dst >= src */
1739 	/* PC += off if dst < src */
1740 	/* PC += off if dst <= src */
1741 	/* PC += off if dst > src (signed) */
1742 	/* PC += off if dst >= src (signed) */
1743 	/* PC += off if dst < src (signed) */
1744 	/* PC += off if dst <= src (signed) */
1745 	case BPF_JMP32 | BPF_JEQ | BPF_X:
1746 	case BPF_JMP32 | BPF_JNE | BPF_X:
1747 	case BPF_JMP32 | BPF_JSET | BPF_X:
1748 	case BPF_JMP32 | BPF_JGT | BPF_X:
1749 	case BPF_JMP32 | BPF_JGE | BPF_X:
1750 	case BPF_JMP32 | BPF_JLT | BPF_X:
1751 	case BPF_JMP32 | BPF_JLE | BPF_X:
1752 	case BPF_JMP32 | BPF_JSGT | BPF_X:
1753 	case BPF_JMP32 | BPF_JSGE | BPF_X:
1754 	case BPF_JMP32 | BPF_JSLT | BPF_X:
1755 	case BPF_JMP32 | BPF_JSLE | BPF_X:
1756 		if (off == 0)
1757 			break;
1758 		setup_jmp_r(ctx, dst == src, BPF_OP(code), off, &jmp, &rel);
1759 		emit_jmp_r(ctx, lo(dst), lo(src), rel, jmp);
1760 		if (finish_jmp(ctx, jmp, off) < 0)
1761 			goto toofar;
1762 		break;
1763 	/* PC += off if dst == imm */
1764 	/* PC += off if dst != imm */
1765 	/* PC += off if dst & imm */
1766 	/* PC += off if dst > imm */
1767 	/* PC += off if dst >= imm */
1768 	/* PC += off if dst < imm */
1769 	/* PC += off if dst <= imm */
1770 	/* PC += off if dst > imm (signed) */
1771 	/* PC += off if dst >= imm (signed) */
1772 	/* PC += off if dst < imm (signed) */
1773 	/* PC += off if dst <= imm (signed) */
1774 	case BPF_JMP32 | BPF_JEQ | BPF_K:
1775 	case BPF_JMP32 | BPF_JNE | BPF_K:
1776 	case BPF_JMP32 | BPF_JSET | BPF_K:
1777 	case BPF_JMP32 | BPF_JGT | BPF_K:
1778 	case BPF_JMP32 | BPF_JGE | BPF_K:
1779 	case BPF_JMP32 | BPF_JLT | BPF_K:
1780 	case BPF_JMP32 | BPF_JLE | BPF_K:
1781 	case BPF_JMP32 | BPF_JSGT | BPF_K:
1782 	case BPF_JMP32 | BPF_JSGE | BPF_K:
1783 	case BPF_JMP32 | BPF_JSLT | BPF_K:
1784 	case BPF_JMP32 | BPF_JSLE | BPF_K:
1785 		if (off == 0)
1786 			break;
1787 		setup_jmp_i(ctx, imm, 32, BPF_OP(code), off, &jmp, &rel);
1788 		if (valid_jmp_i(jmp, imm)) {
1789 			emit_jmp_i(ctx, lo(dst), imm, rel, jmp);
1790 		} else {
1791 			/* Move large immediate to register */
1792 			emit_mov_i(ctx, MIPS_R_T6, imm);
1793 			emit_jmp_r(ctx, lo(dst), MIPS_R_T6, rel, jmp);
1794 		}
1795 		if (finish_jmp(ctx, jmp, off) < 0)
1796 			goto toofar;
1797 		break;
1798 	/* PC += off if dst == src */
1799 	/* PC += off if dst != src */
1800 	/* PC += off if dst & src */
1801 	/* PC += off if dst > src */
1802 	/* PC += off if dst >= src */
1803 	/* PC += off if dst < src */
1804 	/* PC += off if dst <= src */
1805 	/* PC += off if dst > src (signed) */
1806 	/* PC += off if dst >= src (signed) */
1807 	/* PC += off if dst < src (signed) */
1808 	/* PC += off if dst <= src (signed) */
1809 	case BPF_JMP | BPF_JEQ | BPF_X:
1810 	case BPF_JMP | BPF_JNE | BPF_X:
1811 	case BPF_JMP | BPF_JSET | BPF_X:
1812 	case BPF_JMP | BPF_JGT | BPF_X:
1813 	case BPF_JMP | BPF_JGE | BPF_X:
1814 	case BPF_JMP | BPF_JLT | BPF_X:
1815 	case BPF_JMP | BPF_JLE | BPF_X:
1816 	case BPF_JMP | BPF_JSGT | BPF_X:
1817 	case BPF_JMP | BPF_JSGE | BPF_X:
1818 	case BPF_JMP | BPF_JSLT | BPF_X:
1819 	case BPF_JMP | BPF_JSLE | BPF_X:
1820 		if (off == 0)
1821 			break;
1822 		setup_jmp_r(ctx, dst == src, BPF_OP(code), off, &jmp, &rel);
1823 		emit_jmp_r64(ctx, dst, src, rel, jmp);
1824 		if (finish_jmp(ctx, jmp, off) < 0)
1825 			goto toofar;
1826 		break;
1827 	/* PC += off if dst == imm */
1828 	/* PC += off if dst != imm */
1829 	/* PC += off if dst & imm */
1830 	/* PC += off if dst > imm */
1831 	/* PC += off if dst >= imm */
1832 	/* PC += off if dst < imm */
1833 	/* PC += off if dst <= imm */
1834 	/* PC += off if dst > imm (signed) */
1835 	/* PC += off if dst >= imm (signed) */
1836 	/* PC += off if dst < imm (signed) */
1837 	/* PC += off if dst <= imm (signed) */
1838 	case BPF_JMP | BPF_JEQ | BPF_K:
1839 	case BPF_JMP | BPF_JNE | BPF_K:
1840 	case BPF_JMP | BPF_JSET | BPF_K:
1841 	case BPF_JMP | BPF_JGT | BPF_K:
1842 	case BPF_JMP | BPF_JGE | BPF_K:
1843 	case BPF_JMP | BPF_JLT | BPF_K:
1844 	case BPF_JMP | BPF_JLE | BPF_K:
1845 	case BPF_JMP | BPF_JSGT | BPF_K:
1846 	case BPF_JMP | BPF_JSGE | BPF_K:
1847 	case BPF_JMP | BPF_JSLT | BPF_K:
1848 	case BPF_JMP | BPF_JSLE | BPF_K:
1849 		if (off == 0)
1850 			break;
1851 		setup_jmp_i(ctx, imm, 64, BPF_OP(code), off, &jmp, &rel);
1852 		emit_jmp_i64(ctx, dst, imm, rel, jmp);
1853 		if (finish_jmp(ctx, jmp, off) < 0)
1854 			goto toofar;
1855 		break;
1856 	/* PC += off */
1857 	case BPF_JMP | BPF_JA:
1858 		if (off == 0)
1859 			break;
1860 		if (emit_ja(ctx, off) < 0)
1861 			goto toofar;
1862 		break;
1863 	/* Tail call */
1864 	case BPF_JMP | BPF_TAIL_CALL:
1865 		if (emit_tail_call(ctx) < 0)
1866 			goto invalid;
1867 		break;
1868 	/* Function call */
1869 	case BPF_JMP | BPF_CALL:
1870 		if (emit_call(ctx, insn) < 0)
1871 			goto invalid;
1872 		break;
1873 	/* Function return */
1874 	case BPF_JMP | BPF_EXIT:
1875 		/*
1876 		 * Optimization: when last instruction is EXIT
1877 		 * simply continue to epilogue.
1878 		 */
1879 		if (ctx->bpf_index == ctx->program->len - 1)
1880 			break;
1881 		if (emit_exit(ctx) < 0)
1882 			goto toofar;
1883 		break;
1884 
1885 	default:
1886 invalid:
1887 		pr_err_once("unknown opcode %02x\n", code);
1888 		return -EINVAL;
1889 notyet:
1890 		pr_info_once("*** NOT YET: opcode %02x ***\n", code);
1891 		return -EFAULT;
1892 toofar:
1893 		pr_info_once("*** TOO FAR: jump at %u opcode %02x ***\n",
1894 			     ctx->bpf_index, code);
1895 		return -E2BIG;
1896 	}
1897 	return 0;
1898 }
1899