1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Just-In-Time compiler for eBPF bytecode on MIPS.
4  * Implementation of JIT functions for 32-bit CPUs.
5  *
6  * Copyright (c) 2021 Anyfi Networks AB.
7  * Author: Johan Almbladh <johan.almbladh@gmail.com>
8  *
9  * Based on code and ideas from
10  * Copyright (c) 2017 Cavium, Inc.
11  * Copyright (c) 2017 Shubham Bansal <illusionist.neo@gmail.com>
12  * Copyright (c) 2011 Mircea Gherzan <mgherzan@gmail.com>
13  */
14 
15 #include <linux/math64.h>
16 #include <linux/errno.h>
17 #include <linux/filter.h>
18 #include <linux/bpf.h>
19 #include <asm/cpu-features.h>
20 #include <asm/isa-rev.h>
21 #include <asm/uasm.h>
22 
23 #include "bpf_jit_comp.h"
24 
25 /* MIPS a4-a7 are not available in the o32 ABI */
26 #undef MIPS_R_A4
27 #undef MIPS_R_A5
28 #undef MIPS_R_A6
29 #undef MIPS_R_A7
30 
31 /* Stack is 8-byte aligned in o32 ABI */
32 #define MIPS_STACK_ALIGNMENT 8
33 
34 /*
35  * The top 16 bytes of a stack frame are reserved for the callee in the O32 ABI.
36  * This corresponds to stack space for register arguments a0-a3.
37  */
38 #define JIT_RESERVED_STACK 16
39 
40 /* Temporary 64-bit register used by JIT */
41 #define JIT_REG_TMP MAX_BPF_JIT_REG
42 
43 /*
44  * Number of prologue bytes to skip when doing a tail call.
45  * Tail call count (TCC) initialization (8 bytes) always, plus the
46  * R1 context argument move (4 bytes) if big endian.
47  */
48 #ifdef __BIG_ENDIAN
49 #define JIT_TCALL_SKIP 12
50 #else
51 #define JIT_TCALL_SKIP 8
52 #endif
53 
54 /* CPU registers holding the callee return value */
55 #define JIT_RETURN_REGS	  \
56 	(BIT(MIPS_R_V0) | \
57 	 BIT(MIPS_R_V1))
58 
59 /* CPU register arguments passed to callee directly */
60 #define JIT_ARG_REGS      \
61 	(BIT(MIPS_R_A0) | \
62 	 BIT(MIPS_R_A1) | \
63 	 BIT(MIPS_R_A2) | \
64 	 BIT(MIPS_R_A3))
65 
66 /* CPU register arguments passed to callee on stack */
67 #define JIT_STACK_REGS    \
68 	(BIT(MIPS_R_T0) | \
69 	 BIT(MIPS_R_T1) | \
70 	 BIT(MIPS_R_T2) | \
71 	 BIT(MIPS_R_T3) | \
72 	 BIT(MIPS_R_T4) | \
73 	 BIT(MIPS_R_T5))
74 
75 /* Caller-saved CPU registers */
76 #define JIT_CALLER_REGS    \
77 	(JIT_RETURN_REGS | \
78 	 JIT_ARG_REGS    | \
79 	 JIT_STACK_REGS)
80 
81 /* Callee-saved CPU registers */
82 #define JIT_CALLEE_REGS   \
83 	(BIT(MIPS_R_S0) | \
84 	 BIT(MIPS_R_S1) | \
85 	 BIT(MIPS_R_S2) | \
86 	 BIT(MIPS_R_S3) | \
87 	 BIT(MIPS_R_S4) | \
88 	 BIT(MIPS_R_S5) | \
89 	 BIT(MIPS_R_S6) | \
90 	 BIT(MIPS_R_S7) | \
91 	 BIT(MIPS_R_GP) | \
92 	 BIT(MIPS_R_FP) | \
93 	 BIT(MIPS_R_RA))
94 
95 /*
96  * Mapping of 64-bit eBPF registers to 32-bit native MIPS registers.
97  *
98  * 1) Native register pairs are ordered according to CPU endianness, following
99  *    the MIPS convention for passing 64-bit arguments and return values.
100  * 2) The eBPF return value, arguments and callee-saved registers are mapped
101  *    to their native MIPS equivalents.
102  * 3) Since the 32 highest bits in the eBPF FP register are always zero,
103  *    only one general-purpose register is actually needed for the mapping.
104  *    We use the fp register for this purpose, and map the highest bits to
105  *    the MIPS register r0 (zero).
106  * 4) We use the MIPS gp and at registers as internal temporary registers
107  *    for constant blinding. The gp register is callee-saved.
108  * 5) One 64-bit temporary register is mapped for use when sign-extending
109  *    immediate operands. MIPS registers t6-t9 are available to the JIT
110  *    as temporaries when implementing complex 64-bit operations.
111  *
112  * With this scheme all eBPF registers are mapped to native MIPS
113  * registers without having to use any stack scratch space. The direct
114  * register mapping (2) simplifies the handling of function calls.
115  */
116 static const u8 bpf2mips32[][2] = {
117 	/* Return value from in-kernel function, and exit value from eBPF */
118 	[BPF_REG_0] = {MIPS_R_V1, MIPS_R_V0},
119 	/* Arguments from eBPF program to in-kernel function */
120 	[BPF_REG_1] = {MIPS_R_A1, MIPS_R_A0},
121 	[BPF_REG_2] = {MIPS_R_A3, MIPS_R_A2},
122 	/* Remaining arguments, to be passed on the stack per O32 ABI */
123 	[BPF_REG_3] = {MIPS_R_T1, MIPS_R_T0},
124 	[BPF_REG_4] = {MIPS_R_T3, MIPS_R_T2},
125 	[BPF_REG_5] = {MIPS_R_T5, MIPS_R_T4},
126 	/* Callee-saved registers that in-kernel function will preserve */
127 	[BPF_REG_6] = {MIPS_R_S1, MIPS_R_S0},
128 	[BPF_REG_7] = {MIPS_R_S3, MIPS_R_S2},
129 	[BPF_REG_8] = {MIPS_R_S5, MIPS_R_S4},
130 	[BPF_REG_9] = {MIPS_R_S7, MIPS_R_S6},
131 	/* Read-only frame pointer to access the eBPF stack */
132 #ifdef __BIG_ENDIAN
133 	[BPF_REG_FP] = {MIPS_R_FP, MIPS_R_ZERO},
134 #else
135 	[BPF_REG_FP] = {MIPS_R_ZERO, MIPS_R_FP},
136 #endif
137 	/* Temporary register for blinding constants */
138 	[BPF_REG_AX] = {MIPS_R_GP, MIPS_R_AT},
139 	/* Temporary register for internal JIT use */
140 	[JIT_REG_TMP] = {MIPS_R_T7, MIPS_R_T6},
141 };
142 
143 /* Get low CPU register for a 64-bit eBPF register mapping */
144 static inline u8 lo(const u8 reg[])
145 {
146 #ifdef __BIG_ENDIAN
147 	return reg[0];
148 #else
149 	return reg[1];
150 #endif
151 }
152 
153 /* Get high CPU register for a 64-bit eBPF register mapping */
154 static inline u8 hi(const u8 reg[])
155 {
156 #ifdef __BIG_ENDIAN
157 	return reg[1];
158 #else
159 	return reg[0];
160 #endif
161 }
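
/*
 * Editorial illustration (hypothetical helper, not used by the JIT):
 * how a 64-bit eBPF value is split across the native register pair
 * selected by lo()/hi() above, independent of CPU endianness.
 */
static inline void ref_split64(u64 val, u32 *lo_word, u32 *hi_word)
{
	*lo_word = (u32)val;		/* bits 31:0  -> lo() register */
	*hi_word = (u32)(val >> 32);	/* bits 63:32 -> hi() register */
}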
162 
163 /*
164  * Mark a 64-bit CPU register pair as clobbered, it needs to be
165  * saved/restored by the program if callee-saved.
166  */
167 static void clobber_reg64(struct jit_context *ctx, const u8 reg[])
168 {
169 	clobber_reg(ctx, reg[0]);
170 	clobber_reg(ctx, reg[1]);
171 }
172 
173 /* dst = imm (sign-extended) */
174 static void emit_mov_se_i64(struct jit_context *ctx, const u8 dst[], s32 imm)
175 {
176 	emit_mov_i(ctx, lo(dst), imm);
177 	if (imm < 0)
178 		emit(ctx, addiu, hi(dst), MIPS_R_ZERO, -1);
179 	else
180 		emit(ctx, move, hi(dst), MIPS_R_ZERO);
181 	clobber_reg64(ctx, dst);
182 }
183 
184 /* Zero extension, if the verifier does not do it for us */
185 static void emit_zext_ver(struct jit_context *ctx, const u8 dst[])
186 {
187 	if (!ctx->program->aux->verifier_zext) {
188 		emit(ctx, move, hi(dst), MIPS_R_ZERO);
189 		clobber_reg(ctx, hi(dst));
190 	}
191 }
192 
193 /* Load delay slot, if ISA mandates it */
194 static void emit_load_delay(struct jit_context *ctx)
195 {
196 	if (!cpu_has_mips_2_3_4_5_r)
197 		emit(ctx, nop);
198 }
199 
200 /* ALU immediate operation (64-bit) */
201 static void emit_alu_i64(struct jit_context *ctx,
202 			 const u8 dst[], s32 imm, u8 op)
203 {
204 	u8 src = MIPS_R_T6;
205 
206 	/*
207 	 * ADD/SUB with all but the max negative imm can be handled by
208 	 * inverting the operation and the imm value, saving one insn.
209 	 */
210 	if (imm > S32_MIN && imm < 0)
211 		switch (op) {
212 		case BPF_ADD:
213 			op = BPF_SUB;
214 			imm = -imm;
215 			break;
216 		case BPF_SUB:
217 			op = BPF_ADD;
218 			imm = -imm;
219 			break;
220 		}
221 
222 	/* Move immediate to temporary register */
223 	emit_mov_i(ctx, src, imm);
224 
225 	switch (op) {
226 	/* dst = dst + imm */
227 	case BPF_ADD:
228 		emit(ctx, addu, lo(dst), lo(dst), src);
229 		emit(ctx, sltu, MIPS_R_T9, lo(dst), src);
230 		emit(ctx, addu, hi(dst), hi(dst), MIPS_R_T9);
231 		if (imm < 0)
232 			emit(ctx, addiu, hi(dst), hi(dst), -1);
233 		break;
234 	/* dst = dst - imm */
235 	case BPF_SUB:
236 		emit(ctx, sltu, MIPS_R_T9, lo(dst), src);
237 		emit(ctx, subu, lo(dst), lo(dst), src);
238 		emit(ctx, subu, hi(dst), hi(dst), MIPS_R_T9);
239 		if (imm < 0)
240 			emit(ctx, addiu, hi(dst), hi(dst), 1);
241 		break;
242 	/* dst = dst | imm */
243 	case BPF_OR:
244 		emit(ctx, or, lo(dst), lo(dst), src);
245 		if (imm < 0)
246 			emit(ctx, addiu, hi(dst), MIPS_R_ZERO, -1);
247 		break;
248 	/* dst = dst & imm */
249 	case BPF_AND:
250 		emit(ctx, and, lo(dst), lo(dst), src);
251 		if (imm >= 0)
252 			emit(ctx, move, hi(dst), MIPS_R_ZERO);
253 		break;
254 	/* dst = dst ^ imm */
255 	case BPF_XOR:
256 		emit(ctx, xor, lo(dst), lo(dst), src);
257 		if (imm < 0) {
258 			emit(ctx, subu, hi(dst), MIPS_R_ZERO, hi(dst));
259 			emit(ctx, addiu, hi(dst), hi(dst), -1);
260 		}
261 		break;
262 	}
263 	clobber_reg64(ctx, dst);
264 }
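
/*
 * Editorial sketch (hypothetical helper, not used by the JIT): the
 * 32-bit decomposition of the 64-bit add emitted above. The carry out
 * of the low word is recovered the same way the sltu instruction does
 * it, and a negative immediate additionally contributes ~0, the high
 * word of its sign extension.
 */
static inline u64 ref_add64_i(u32 dl, u32 dh, s32 imm)
{
	u32 sl = (u32)imm;
	u32 rl = dl + sl;
	u32 carry = rl < sl;		/* sltu T9, lo(dst), src */
	u32 rh = dh + carry;

	if (imm < 0)
		rh += 0xffffffff;	/* addiu hi(dst), hi(dst), -1 */
	return ((u64)rh << 32) | rl;
}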
265 
266 /* ALU register operation (64-bit) */
267 static void emit_alu_r64(struct jit_context *ctx,
268 			 const u8 dst[], const u8 src[], u8 op)
269 {
270 	switch (BPF_OP(op)) {
271 	/* dst = dst + src */
272 	case BPF_ADD:
273 		if (src == dst) {
274 			emit(ctx, srl, MIPS_R_T9, lo(dst), 31);
275 			emit(ctx, addu, lo(dst), lo(dst), lo(dst));
276 		} else {
277 			emit(ctx, addu, lo(dst), lo(dst), lo(src));
278 			emit(ctx, sltu, MIPS_R_T9, lo(dst), lo(src));
279 		}
280 		emit(ctx, addu, hi(dst), hi(dst), hi(src));
281 		emit(ctx, addu, hi(dst), hi(dst), MIPS_R_T9);
282 		break;
283 	/* dst = dst - src */
284 	case BPF_SUB:
285 		emit(ctx, sltu, MIPS_R_T9, lo(dst), lo(src));
286 		emit(ctx, subu, lo(dst), lo(dst), lo(src));
287 		emit(ctx, subu, hi(dst), hi(dst), hi(src));
288 		emit(ctx, subu, hi(dst), hi(dst), MIPS_R_T9);
289 		break;
290 	/* dst = dst | src */
291 	case BPF_OR:
292 		emit(ctx, or, lo(dst), lo(dst), lo(src));
293 		emit(ctx, or, hi(dst), hi(dst), hi(src));
294 		break;
295 	/* dst = dst & src */
296 	case BPF_AND:
297 		emit(ctx, and, lo(dst), lo(dst), lo(src));
298 		emit(ctx, and, hi(dst), hi(dst), hi(src));
299 		break;
300 	/* dst = dst ^ src */
301 	case BPF_XOR:
302 		emit(ctx, xor, lo(dst), lo(dst), lo(src));
303 		emit(ctx, xor, hi(dst), hi(dst), hi(src));
304 		break;
305 	}
306 	clobber_reg64(ctx, dst);
307 }
308 
309 /* ALU invert (64-bit) */
310 static void emit_neg_i64(struct jit_context *ctx, const u8 dst[])
311 {
312 	emit(ctx, sltu, MIPS_R_T9, MIPS_R_ZERO, lo(dst));
313 	emit(ctx, subu, lo(dst), MIPS_R_ZERO, lo(dst));
314 	emit(ctx, subu, hi(dst), MIPS_R_ZERO, hi(dst));
315 	emit(ctx, subu, hi(dst), hi(dst), MIPS_R_T9);
316 
317 	clobber_reg64(ctx, dst);
318 }
319 
320 /* ALU shift immediate (64-bit) */
321 static void emit_shift_i64(struct jit_context *ctx,
322 			   const u8 dst[], u32 imm, u8 op)
323 {
324 	switch (BPF_OP(op)) {
325 	/* dst = dst << imm */
326 	case BPF_LSH:
327 		if (imm < 32) {
328 			emit(ctx, srl, MIPS_R_T9, lo(dst), 32 - imm);
329 			emit(ctx, sll, lo(dst), lo(dst), imm);
330 			emit(ctx, sll, hi(dst), hi(dst), imm);
331 			emit(ctx, or, hi(dst), hi(dst), MIPS_R_T9);
332 		} else {
333 			emit(ctx, sll, hi(dst), lo(dst), imm - 32);
334 			emit(ctx, move, lo(dst), MIPS_R_ZERO);
335 		}
336 		break;
337 	/* dst = dst >> imm */
338 	case BPF_RSH:
339 		if (imm < 32) {
340 			emit(ctx, sll, MIPS_R_T9, hi(dst), 32 - imm);
341 			emit(ctx, srl, lo(dst), lo(dst), imm);
342 			emit(ctx, srl, hi(dst), hi(dst), imm);
343 			emit(ctx, or, lo(dst), lo(dst), MIPS_R_T9);
344 		} else {
345 			emit(ctx, srl, lo(dst), hi(dst), imm - 32);
346 			emit(ctx, move, hi(dst), MIPS_R_ZERO);
347 		}
348 		break;
349 	/* dst = dst >> imm (arithmetic) */
350 	case BPF_ARSH:
351 		if (imm < 32) {
352 			emit(ctx, sll, MIPS_R_T9, hi(dst), 32 - imm);
353 			emit(ctx, srl, lo(dst), lo(dst), imm);
354 			emit(ctx, sra, hi(dst), hi(dst), imm);
355 			emit(ctx, or, lo(dst), lo(dst), MIPS_R_T9);
356 		} else {
357 			emit(ctx, sra, lo(dst), hi(dst), imm - 32);
358 			emit(ctx, sra, hi(dst), hi(dst), 31);
359 		}
360 		break;
361 	}
362 	clobber_reg64(ctx, dst);
363 }
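
/*
 * Editorial sketch (hypothetical helper, not used by the JIT): the two
 * cases of a constant 64-bit left shift as emitted above. The callers
 * only pass imm != 0, so the 32 - imm shift amount stays in range.
 */
static inline u64 ref_lsh64_i(u32 dl, u32 dh, u32 imm)	/* 1..63 */
{
	if (imm < 32) {
		u32 spill = dl >> (32 - imm);	/* low bits moving up */

		return ((u64)((dh << imm) | spill) << 32) | (dl << imm);
	}
	return (u64)(dl << (imm - 32)) << 32;	/* low word becomes 0 */
}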
364 
365 /* ALU shift register (64-bit) */
366 static void emit_shift_r64(struct jit_context *ctx,
367 			   const u8 dst[], u8 src, u8 op)
368 {
369 	u8 t1 = MIPS_R_T8;
370 	u8 t2 = MIPS_R_T9;
371 
372 	emit(ctx, andi, t1, src, 32);              /* t1 = src & 32          */
373 	emit(ctx, beqz, t1, 16);                   /* PC += 16 if t1 == 0    */
374 	emit(ctx, nor, t2, src, MIPS_R_ZERO);      /* t2 = ~src (delay slot) */
375 
376 	switch (BPF_OP(op)) {
377 	/* dst = dst << src */
378 	case BPF_LSH:
379 		/* Next: shift >= 32 */
380 		emit(ctx, sllv, hi(dst), lo(dst), src);    /* dh = dl << src */
381 		emit(ctx, move, lo(dst), MIPS_R_ZERO);     /* dl = 0         */
382 		emit(ctx, b, 20);                          /* PC += 20       */
383 		/* +16: shift < 32 */
384 		emit(ctx, srl, t1, lo(dst), 1);            /* t1 = dl >> 1   */
385 		emit(ctx, srlv, t1, t1, t2);               /* t1 = t1 >> t2  */
386 		emit(ctx, sllv, lo(dst), lo(dst), src);    /* dl = dl << src */
387 		emit(ctx, sllv, hi(dst), hi(dst), src);    /* dh = dh << src */
388 		emit(ctx, or, hi(dst), hi(dst), t1);       /* dh = dh | t1   */
389 		break;
390 	/* dst = dst >> src */
391 	case BPF_RSH:
392 		/* Next: shift >= 32 */
393 		emit(ctx, srlv, lo(dst), hi(dst), src);    /* dl = dh >> src */
394 		emit(ctx, move, hi(dst), MIPS_R_ZERO);     /* dh = 0         */
395 		emit(ctx, b, 20);                          /* PC += 20       */
396 		/* +16: shift < 32 */
397 			emit(ctx, sll, t1, hi(dst), 1);            /* t1 = dh << 1   */
398 		emit(ctx, sllv, t1, t1, t2);               /* t1 = t1 << t2  */
399 		emit(ctx, srlv, lo(dst), lo(dst), src);    /* dl = dl >> src */
400 		emit(ctx, srlv, hi(dst), hi(dst), src);    /* dh = dh >> src */
401 		emit(ctx, or, lo(dst), lo(dst), t1);       /* dl = dl | t1   */
402 		break;
403 	/* dst = dst >> src (arithmetic) */
404 	case BPF_ARSH:
405 		/* Next: shift >= 32 */
406 		emit(ctx, srav, lo(dst), hi(dst), src);   /* dl = dh >>a src */
407 		emit(ctx, sra, hi(dst), hi(dst), 31);     /* dh = dh >>a 31  */
408 		emit(ctx, b, 20);                         /* PC += 20        */
409 		/* +16: shift < 32 */
410 			emit(ctx, sll, t1, hi(dst), 1);           /* t1 = dh << 1    */
411 		emit(ctx, sllv, t1, t1, t2);              /* t1 = t1 << t2   */
412 			emit(ctx, srlv, lo(dst), lo(dst), src);   /* dl = dl >> src  */
413 			emit(ctx, srav, hi(dst), hi(dst), src);   /* dh = dh >>a src */
414 		emit(ctx, or, lo(dst), lo(dst), t1);      /* dl = dl | t1    */
415 		break;
416 	}
417 
418 	/* +20: Done */
419 	clobber_reg64(ctx, dst);
420 }
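
/*
 * Editorial sketch (hypothetical helper, not used by the JIT): the
 * variable 64-bit left shift above, including the ~src trick. MIPS
 * shifts use only the low 5 bits of the amount, so dl >> (32 - src) is
 * emitted as a one-bit shift followed by a shift by ~src, which is
 * well defined even when src == 0.
 */
static inline u64 ref_lsh64_r(u32 dl, u32 dh, u32 src)
{
	u32 low5 = src & 31;
	u32 spill;

	if (src & 32)				/* shift >= 32 */
		return (u64)(dl << low5) << 32;
	spill = (dl >> 1) >> (31 - low5);	/* == dl >> (32 - src) */
	return ((u64)((dh << low5) | spill) << 32) | (dl << low5);
}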
421 
422 /* ALU mul immediate (64x32-bit) */
423 static void emit_mul_i64(struct jit_context *ctx, const u8 dst[], s32 imm)
424 {
425 	u8 src = MIPS_R_T6;
426 	u8 tmp = MIPS_R_T9;
427 
428 	switch (imm) {
429 	/* dst = dst * 1 is a no-op */
430 	case 1:
431 		break;
432 	/* dst = dst * -1 */
433 	case -1:
434 		emit_neg_i64(ctx, dst);
435 		break;
436 	case 0:
437 		emit_mov_r(ctx, lo(dst), MIPS_R_ZERO);
438 		emit_mov_r(ctx, hi(dst), MIPS_R_ZERO);
439 		break;
440 	/* Full 64x32 multiply */
441 	default:
442 		/* hi(dst) = hi(dst) * src(imm) */
443 		emit_mov_i(ctx, src, imm);
444 		if (cpu_has_mips32r1 || cpu_has_mips32r6) {
445 			emit(ctx, mul, hi(dst), hi(dst), src);
446 		} else {
447 			emit(ctx, multu, hi(dst), src);
448 			emit(ctx, mflo, hi(dst));
449 		}
450 
451 		/* hi(dst) = hi(dst) - lo(dst) */
452 		if (imm < 0)
453 			emit(ctx, subu, hi(dst), hi(dst), lo(dst));
454 
455 		/* tmp = lo(dst) * src(imm) >> 32 */
456 		/* lo(dst) = lo(dst) * src(imm) */
457 		if (cpu_has_mips32r6) {
458 			emit(ctx, muhu, tmp, lo(dst), src);
459 			emit(ctx, mulu, lo(dst), lo(dst), src);
460 		} else {
461 			emit(ctx, multu, lo(dst), src);
462 			emit(ctx, mflo, lo(dst));
463 			emit(ctx, mfhi, tmp);
464 		}
465 
466 		/* hi(dst) += tmp */
467 		emit(ctx, addu, hi(dst), hi(dst), tmp);
468 		clobber_reg64(ctx, dst);
469 		break;
470 	}
471 }
472 
473 /* ALU mul register (64x64-bit) */
474 static void emit_mul_r64(struct jit_context *ctx,
475 			 const u8 dst[], const u8 src[])
476 {
477 	u8 acc = MIPS_R_T8;
478 	u8 tmp = MIPS_R_T9;
479 
480 	/* acc = hi(dst) * lo(src) */
481 	if (cpu_has_mips32r1 || cpu_has_mips32r6) {
482 		emit(ctx, mul, acc, hi(dst), lo(src));
483 	} else {
484 		emit(ctx, multu, hi(dst), lo(src));
485 		emit(ctx, mflo, acc);
486 	}
487 
488 	/* tmp = lo(dst) * hi(src) */
489 	if (cpu_has_mips32r1 || cpu_has_mips32r6) {
490 		emit(ctx, mul, tmp, lo(dst), hi(src));
491 	} else {
492 		emit(ctx, multu, lo(dst), hi(src));
493 		emit(ctx, mflo, tmp);
494 	}
495 
496 	/* acc += tmp */
497 	emit(ctx, addu, acc, acc, tmp);
498 
499 	/* tmp = lo(dst) * lo(src) >> 32 */
500 	/* lo(dst) = lo(dst) * lo(src) */
501 	if (cpu_has_mips32r6) {
502 		emit(ctx, muhu, tmp, lo(dst), lo(src));
503 		emit(ctx, mulu, lo(dst), lo(dst), lo(src));
504 	} else {
505 		emit(ctx, multu, lo(dst), lo(src));
506 		emit(ctx, mflo, lo(dst));
507 		emit(ctx, mfhi, tmp);
508 	}
509 
510 	/* hi(dst) = acc + tmp */
511 	emit(ctx, addu, hi(dst), acc, tmp);
512 	clobber_reg64(ctx, dst);
513 }
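
/*
 * Editorial sketch (hypothetical helper, not used by the JIT): the low
 * 64 bits of the 64x64 product computed above. The cross products
 * dh * sl and dl * sh can only affect the high word, so only dl * sl
 * needs its upper half (muhu or mfhi).
 */
static inline u64 ref_mul64(u32 dl, u32 dh, u32 sl, u32 sh)
{
	u64 low = (u64)dl * sl;
	u32 acc = dh * sl + dl * sh + (u32)(low >> 32);

	return ((u64)acc << 32) | (u32)low;
}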
514 
515 /* Helper function for 64-bit modulo */
516 static u64 jit_mod64(u64 a, u64 b)
517 {
518 	u64 rem;
519 
520 	div64_u64_rem(a, b, &rem);
521 	return rem;
522 }
523 
524 /* ALU div/mod register (64-bit) */
525 static void emit_divmod_r64(struct jit_context *ctx,
526 			    const u8 dst[], const u8 src[], u8 op)
527 {
528 	const u8 *r0 = bpf2mips32[BPF_REG_0]; /* Mapped to v0-v1 */
529 	const u8 *r1 = bpf2mips32[BPF_REG_1]; /* Mapped to a0-a1 */
530 	const u8 *r2 = bpf2mips32[BPF_REG_2]; /* Mapped to a2-a3 */
531 	int exclude, k;
532 	u32 addr = 0;
533 
534 	/* Push caller-saved registers on stack */
535 	push_regs(ctx, ctx->clobbered & JIT_CALLER_REGS,
536 		  0, JIT_RESERVED_STACK);
537 
538 	/* Put 64-bit arguments 1 and 2 in registers a0-a3 */
539 	for (k = 0; k < 2; k++) {
540 		emit(ctx, move, MIPS_R_T9, src[k]);
541 		emit(ctx, move, r1[k], dst[k]);
542 		emit(ctx, move, r2[k], MIPS_R_T9);
543 	}
544 
545 	/* Emit function call */
546 	switch (BPF_OP(op)) {
547 	/* dst = dst / src */
548 	case BPF_DIV:
549 		addr = (u32)&div64_u64;
550 		break;
551 	/* dst = dst % src */
552 	case BPF_MOD:
553 		addr = (u32)&jit_mod64;
554 		break;
555 	}
556 	emit_mov_i(ctx, MIPS_R_T9, addr);
557 	emit(ctx, jalr, MIPS_R_RA, MIPS_R_T9);
558 	emit(ctx, nop); /* Delay slot */
559 
560 	/* Store the 64-bit result in dst */
561 	emit(ctx, move, dst[0], r0[0]);
562 	emit(ctx, move, dst[1], r0[1]);
563 
564 	/* Restore caller-saved registers, excluding the computed result */
565 	exclude = BIT(lo(dst)) | BIT(hi(dst));
566 	pop_regs(ctx, ctx->clobbered & JIT_CALLER_REGS,
567 		 exclude, JIT_RESERVED_STACK);
568 	emit_load_delay(ctx);
569 
570 	clobber_reg64(ctx, dst);
571 	clobber_reg(ctx, MIPS_R_V0);
572 	clobber_reg(ctx, MIPS_R_V1);
573 	clobber_reg(ctx, MIPS_R_RA);
574 }
575 
576 /* Swap bytes in a register word */
577 static void emit_swap8_r(struct jit_context *ctx, u8 dst, u8 src, u8 mask)
578 {
579 	u8 tmp = MIPS_R_T9;
580 
581 	emit(ctx, and, tmp, src, mask); /* tmp = src & 0x00ff00ff */
582 	emit(ctx, sll, tmp, tmp, 8);    /* tmp = tmp << 8         */
583 	emit(ctx, srl, dst, src, 8);    /* dst = src >> 8         */
584 	emit(ctx, and, dst, dst, mask); /* dst = dst & 0x00ff00ff */
585 	emit(ctx, or,  dst, dst, tmp);  /* dst = dst | tmp        */
586 }
587 
588 /* Swap half words in a register word */
589 static void emit_swap16_r(struct jit_context *ctx, u8 dst, u8 src)
590 {
591 	u8 tmp = MIPS_R_T9;
592 
593 	emit(ctx, sll, tmp, src, 16);  /* tmp = src << 16 */
594 	emit(ctx, srl, dst, src, 16);  /* dst = src >> 16 */
595 	emit(ctx, or,  dst, dst, tmp); /* dst = dst | tmp */
596 }
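
/*
 * Editorial sketch (hypothetical helpers, not used by the JIT): the two
 * primitives above compose into a full byte swap. Swapping bytes within
 * half words and then swapping the half words reverses all four bytes
 * of a word; for a double word the two words are also exchanged, as in
 * emit_bswap_r64() below.
 */
static inline u32 ref_bswap32(u32 x)
{
	u32 t = ((x & 0x00ff00ff) << 8) | ((x >> 8) & 0x00ff00ff);

	return (t << 16) | (t >> 16);
}

static inline u64 ref_bswap64(u64 x)
{
	return ((u64)ref_bswap32((u32)x) << 32) | ref_bswap32((u32)(x >> 32));
}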
597 
598 /* Swap bytes and truncate a register double word, word or half word */
599 static void emit_bswap_r64(struct jit_context *ctx, const u8 dst[], u32 width)
600 {
601 	u8 tmp = MIPS_R_T8;
602 
603 	switch (width) {
604 	/* Swap bytes in a double word */
605 	case 64:
606 		if (cpu_has_mips32r2 || cpu_has_mips32r6) {
607 			emit(ctx, rotr, tmp, hi(dst), 16);
608 			emit(ctx, rotr, hi(dst), lo(dst), 16);
609 			emit(ctx, wsbh, lo(dst), tmp);
610 			emit(ctx, wsbh, hi(dst), hi(dst));
611 		} else {
612 			emit_swap16_r(ctx, tmp, lo(dst));
613 			emit_swap16_r(ctx, lo(dst), hi(dst));
614 			emit(ctx, move, hi(dst), tmp);
615 
616 			emit(ctx, lui, tmp, 0xff);      /* tmp = 0x00ff0000 */
617 			emit(ctx, ori, tmp, tmp, 0xff); /* tmp = 0x00ff00ff */
618 			emit_swap8_r(ctx, lo(dst), lo(dst), tmp);
619 			emit_swap8_r(ctx, hi(dst), hi(dst), tmp);
620 		}
621 		break;
622 	/* Swap bytes in a word */
623 	/* Swap bytes in a half word */
624 	case 32:
625 	case 16:
626 		emit_bswap_r(ctx, lo(dst), width);
627 		emit(ctx, move, hi(dst), MIPS_R_ZERO);
628 		break;
629 	}
630 	clobber_reg64(ctx, dst);
631 }
632 
633 /* Truncate a register double word, word or half word */
634 static void emit_trunc_r64(struct jit_context *ctx, const u8 dst[], u32 width)
635 {
636 	switch (width) {
637 	case 64:
638 		break;
639 	/* Zero-extend a word */
640 	case 32:
641 		emit(ctx, move, hi(dst), MIPS_R_ZERO);
642 		clobber_reg(ctx, hi(dst));
643 		break;
644 	/* Zero-extend a half word */
645 	case 16:
646 		emit(ctx, move, hi(dst), MIPS_R_ZERO);
647 		emit(ctx, andi, lo(dst), lo(dst), 0xffff);
648 		clobber_reg64(ctx, dst);
649 		break;
650 	}
651 }
652 
653 /* Load operation: dst = *(size*)(src + off) */
654 static void emit_ldx(struct jit_context *ctx,
655 		     const u8 dst[], u8 src, s16 off, u8 size)
656 {
657 	switch (size) {
658 	/* Load a byte */
659 	case BPF_B:
660 		emit(ctx, lbu, lo(dst), off, src);
661 		emit(ctx, move, hi(dst), MIPS_R_ZERO);
662 		break;
663 	/* Load a half word */
664 	case BPF_H:
665 		emit(ctx, lhu, lo(dst), off, src);
666 		emit(ctx, move, hi(dst), MIPS_R_ZERO);
667 		break;
668 	/* Load a word */
669 	case BPF_W:
670 		emit(ctx, lw, lo(dst), off, src);
671 		emit(ctx, move, hi(dst), MIPS_R_ZERO);
672 		break;
673 	/* Load a double word */
674 	case BPF_DW:
675 		if (dst[1] == src) {
676 			emit(ctx, lw, dst[0], off + 4, src);
677 			emit(ctx, lw, dst[1], off, src);
678 		} else {
679 			emit(ctx, lw, dst[1], off, src);
680 			emit(ctx, lw, dst[0], off + 4, src);
681 		}
682 		emit_load_delay(ctx);
683 		break;
684 	}
685 	clobber_reg64(ctx, dst);
686 }
687 
688 /* Store operation: *(size *)(dst + off) = src */
689 static void emit_stx(struct jit_context *ctx,
690 		     const u8 dst, const u8 src[], s16 off, u8 size)
691 {
692 	switch (size) {
693 	/* Store a byte */
694 	case BPF_B:
695 		emit(ctx, sb, lo(src), off, dst);
696 		break;
697 	/* Store a half word */
698 	case BPF_H:
699 		emit(ctx, sh, lo(src), off, dst);
700 		break;
701 	/* Store a word */
702 	case BPF_W:
703 		emit(ctx, sw, lo(src), off, dst);
704 		break;
705 	/* Store a double word */
706 	case BPF_DW:
707 		emit(ctx, sw, src[1], off, dst);
708 		emit(ctx, sw, src[0], off + 4, dst);
709 		break;
710 	}
711 }
712 
713 /* Atomic read-modify-write (32-bit, non-ll/sc fallback) */
714 static void emit_atomic_r32(struct jit_context *ctx,
715 			    u8 dst, u8 src, s16 off, u8 code)
716 {
717 	u32 exclude = 0;
718 	u32 addr = 0;
719 
720 	/* Push caller-saved registers on stack */
721 	push_regs(ctx, ctx->clobbered & JIT_CALLER_REGS,
722 		  0, JIT_RESERVED_STACK);
723 	/*
724 	 * Argument 1: dst+off if xchg, otherwise src, passed in register a0
725 	 * Argument 2: src if xchg, otherwise dst+off, passed in register a1
726 	 */
727 	emit(ctx, move, MIPS_R_T9, dst);
728 	if (code == BPF_XCHG) {
729 		emit(ctx, move, MIPS_R_A1, src);
730 		emit(ctx, addiu, MIPS_R_A0, MIPS_R_T9, off);
731 	} else {
732 		emit(ctx, move, MIPS_R_A0, src);
733 		emit(ctx, addiu, MIPS_R_A1, MIPS_R_T9, off);
734 	}
735 
736 	/* Emit function call */
737 	switch (code) {
738 	case BPF_ADD:
739 		addr = (u32)&atomic_add;
740 		break;
741 	case BPF_ADD | BPF_FETCH:
742 		addr = (u32)&atomic_fetch_add;
743 		break;
744 	case BPF_SUB:
745 		addr = (u32)&atomic_sub;
746 		break;
747 	case BPF_SUB | BPF_FETCH:
748 		addr = (u32)&atomic_fetch_sub;
749 		break;
750 	case BPF_OR:
751 		addr = (u32)&atomic_or;
752 		break;
753 	case BPF_OR | BPF_FETCH:
754 		addr = (u32)&atomic_fetch_or;
755 		break;
756 	case BPF_AND:
757 		addr = (u32)&atomic_and;
758 		break;
759 	case BPF_AND | BPF_FETCH:
760 		addr = (u32)&atomic_fetch_and;
761 		break;
762 	case BPF_XOR:
763 		addr = (u32)&atomic_xor;
764 		break;
765 	case BPF_XOR | BPF_FETCH:
766 		addr = (u32)&atomic_fetch_xor;
767 		break;
768 	case BPF_XCHG:
769 		addr = (u32)&atomic_xchg;
770 		break;
771 	}
772 	emit_mov_i(ctx, MIPS_R_T9, addr);
773 	emit(ctx, jalr, MIPS_R_RA, MIPS_R_T9);
774 	emit(ctx, nop); /* Delay slot */
775 
776 	/* Update src register with old value, if specified */
777 	if (code & BPF_FETCH) {
778 		emit(ctx, move, src, MIPS_R_V0);
779 		exclude = BIT(src);
780 		clobber_reg(ctx, src);
781 	}
782 
783 	/* Restore caller-saved registers, except any fetched value */
784 	pop_regs(ctx, ctx->clobbered & JIT_CALLER_REGS,
785 		 exclude, JIT_RESERVED_STACK);
786 	emit_load_delay(ctx);
787 	clobber_reg(ctx, MIPS_R_RA);
788 }
789 
790 /* Helper function for 64-bit atomic exchange */
791 static s64 jit_xchg64(s64 a, atomic64_t *v)
792 {
793 	return atomic64_xchg(v, a);
794 }
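
/*
 * Editorial note: the argument order of jit_xchg64() deliberately
 * reverses atomic64_xchg(). With the 64-bit value first, the O32 ABI
 * places it in a0-a1 and the pointer in a2, which matches how
 * emit_atomic_r64() below marshals its operands.
 */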
795 
796 /* Atomic read-modify-write (64-bit) */
797 static void emit_atomic_r64(struct jit_context *ctx,
798 			    u8 dst, const u8 src[], s16 off, u8 code)
799 {
800 	const u8 *r0 = bpf2mips32[BPF_REG_0]; /* Mapped to v0-v1 */
801 	const u8 *r1 = bpf2mips32[BPF_REG_1]; /* Mapped to a0-a1 */
802 	u32 exclude = 0;
803 	u32 addr = 0;
804 
805 	/* Push caller-saved registers on stack */
806 	push_regs(ctx, ctx->clobbered & JIT_CALLER_REGS,
807 		  0, JIT_RESERVED_STACK);
808 	/*
809 	 * Argument 1: 64-bit src, passed in registers a0-a1
810 	 * Argument 2: 32-bit dst+off, passed in register a2
811 	 */
812 	emit(ctx, move, MIPS_R_T9, dst);
813 	emit(ctx, move, r1[0], src[0]);
814 	emit(ctx, move, r1[1], src[1]);
815 	emit(ctx, addiu, MIPS_R_A2, MIPS_R_T9, off);
816 
817 	/* Emit function call */
818 	switch (code) {
819 	case BPF_ADD:
820 		addr = (u32)&atomic64_add;
821 		break;
822 	case BPF_ADD | BPF_FETCH:
823 		addr = (u32)&atomic64_fetch_add;
824 		break;
825 	case BPF_SUB:
826 		addr = (u32)&atomic64_sub;
827 		break;
828 	case BPF_SUB | BPF_FETCH:
829 		addr = (u32)&atomic64_fetch_sub;
830 		break;
831 	case BPF_OR:
832 		addr = (u32)&atomic64_or;
833 		break;
834 	case BPF_OR | BPF_FETCH:
835 		addr = (u32)&atomic64_fetch_or;
836 		break;
837 	case BPF_AND:
838 		addr = (u32)&atomic64_and;
839 		break;
840 	case BPF_AND | BPF_FETCH:
841 		addr = (u32)&atomic64_fetch_and;
842 		break;
843 	case BPF_XOR:
844 		addr = (u32)&atomic64_xor;
845 		break;
846 	case BPF_XOR | BPF_FETCH:
847 		addr = (u32)&atomic64_fetch_xor;
848 		break;
849 	case BPF_XCHG:
850 		addr = (u32)&jit_xchg64;
851 		break;
852 	}
853 	emit_mov_i(ctx, MIPS_R_T9, addr);
854 	emit(ctx, jalr, MIPS_R_RA, MIPS_R_T9);
855 	emit(ctx, nop); /* Delay slot */
856 
857 	/* Update src register with old value, if specified */
858 	if (code & BPF_FETCH) {
859 		emit(ctx, move, lo(src), lo(r0));
860 		emit(ctx, move, hi(src), hi(r0));
861 		exclude = BIT(src[0]) | BIT(src[1]);
862 		clobber_reg64(ctx, src);
863 	}
864 
865 	/* Restore caller-saved registers, except any fetched value */
866 	pop_regs(ctx, ctx->clobbered & JIT_CALLER_REGS,
867 		 exclude, JIT_RESERVED_STACK);
868 	emit_load_delay(ctx);
869 	clobber_reg(ctx, MIPS_R_RA);
870 }
871 
872 /* Atomic compare-and-exchange (32-bit, non-ll/sc fallback) */
873 static void emit_cmpxchg_r32(struct jit_context *ctx, u8 dst, u8 src, s16 off)
874 {
875 	const u8 *r0 = bpf2mips32[BPF_REG_0];
876 
877 	/* Push caller-saved registers on stack */
878 	push_regs(ctx, ctx->clobbered & JIT_CALLER_REGS,
879 		  JIT_RETURN_REGS, JIT_RESERVED_STACK + 2 * sizeof(u32));
880 	/*
881 	 * Argument 1: 32-bit dst+off, passed in register a0
882 	 * Argument 2: 32-bit r0, passed in register a1
883 	 * Argument 3: 32-bit src, passed in register a2
884 	 */
885 	emit(ctx, addiu, MIPS_R_T9, dst, off);
886 	emit(ctx, move, MIPS_R_T8, src);
887 	emit(ctx, move, MIPS_R_A1, lo(r0));
888 	emit(ctx, move, MIPS_R_A0, MIPS_R_T9);
889 	emit(ctx, move, MIPS_R_A2, MIPS_R_T8);
890 
891 	/* Emit function call */
892 	emit_mov_i(ctx, MIPS_R_T9, (u32)&atomic_cmpxchg);
893 	emit(ctx, jalr, MIPS_R_RA, MIPS_R_T9);
894 	emit(ctx, nop); /* Delay slot */
895 
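	/* On big-endian, the C return value is in v0, but lo(r0) is v1 */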
896 #ifdef __BIG_ENDIAN
897 	emit(ctx, move, lo(r0), MIPS_R_V0);
898 #endif
899 	/* Restore caller-saved registers, except the return value */
900 	pop_regs(ctx, ctx->clobbered & JIT_CALLER_REGS,
901 		 JIT_RETURN_REGS, JIT_RESERVED_STACK + 2 * sizeof(u32));
902 	emit_load_delay(ctx);
903 	clobber_reg(ctx, MIPS_R_V0);
904 	clobber_reg(ctx, MIPS_R_V1);
905 	clobber_reg(ctx, MIPS_R_RA);
906 }
907 
908 /* Atomic compare-and-exchange (64-bit) */
909 static void emit_cmpxchg_r64(struct jit_context *ctx,
910 			     u8 dst, const u8 src[], s16 off)
911 {
912 	const u8 *r0 = bpf2mips32[BPF_REG_0];
913 	const u8 *r2 = bpf2mips32[BPF_REG_2];
914 
915 	/* Push caller-saved registers on stack */
916 	push_regs(ctx, ctx->clobbered & JIT_CALLER_REGS,
917 		  JIT_RETURN_REGS, JIT_RESERVED_STACK + 2 * sizeof(u32));
918 	/*
919 	 * Argument 1: 32-bit dst+off, passed in register a0 (a1 unused)
920 	 * Argument 2: 64-bit r0, passed in registers a2-a3
921 	 * Argument 3: 64-bit src, passed on stack
922 	 */
923 	push_regs(ctx, BIT(src[0]) | BIT(src[1]), 0, JIT_RESERVED_STACK);
924 	emit(ctx, addiu, MIPS_R_T9, dst, off);
925 	emit(ctx, move, r2[0], r0[0]);
926 	emit(ctx, move, r2[1], r0[1]);
927 	emit(ctx, move, MIPS_R_A0, MIPS_R_T9);
928 
929 	/* Emit function call */
930 	emit_mov_i(ctx, MIPS_R_T9, (u32)&atomic64_cmpxchg);
931 	emit(ctx, jalr, MIPS_R_RA, MIPS_R_T9);
932 	emit(ctx, nop); /* Delay slot */
933 
934 	/* Restore caller-saved registers, except the return value */
935 	pop_regs(ctx, ctx->clobbered & JIT_CALLER_REGS,
936 		 JIT_RETURN_REGS, JIT_RESERVED_STACK + 2 * sizeof(u32));
937 	emit_load_delay(ctx);
938 	clobber_reg(ctx, MIPS_R_V0);
939 	clobber_reg(ctx, MIPS_R_V1);
940 	clobber_reg(ctx, MIPS_R_RA);
941 }
942 
943 /*
944  * Conditional movz or an emulated equivalent.
945  * Note that the rs register may be modified.
946  */
947 static void emit_movz_r(struct jit_context *ctx, u8 rd, u8 rs, u8 rt)
948 {
949 	if (cpu_has_mips_2) {
950 		emit(ctx, movz, rd, rs, rt);           /* rd = rt ? rd : rs  */
951 	} else if (cpu_has_mips32r6) {
952 		if (rs != MIPS_R_ZERO)
953 			emit(ctx, seleqz, rs, rs, rt); /* rs = 0 if rt != 0  */
954 		emit(ctx, selnez, rd, rd, rt);         /* rd = 0 if rt == 0  */
955 		if (rs != MIPS_R_ZERO)
956 			emit(ctx, or, rd, rd, rs);     /* rd = rd | rs       */
957 	} else {
958 		emit(ctx, bnez, rt, 8);                /* PC += 8 if rt != 0 */
959 		emit(ctx, nop);                        /* +0: delay slot     */
960 		emit(ctx, or, rd, rs, MIPS_R_ZERO);    /* +4: rd = rs        */
961 	}
962 	clobber_reg(ctx, rd);
963 	clobber_reg(ctx, rs);
964 }
965 
966 /*
967  * Conditional movn or an emulated equivalent.
968  * Note that the rs register may be modified.
969  */
970 static void emit_movn_r(struct jit_context *ctx, u8 rd, u8 rs, u8 rt)
971 {
972 	if (cpu_has_mips_2) {
973 		emit(ctx, movn, rd, rs, rt);           /* rd = rt ? rs : rd  */
974 	} else if (cpu_has_mips32r6) {
975 		if (rs != MIPS_R_ZERO)
976 			emit(ctx, selnez, rs, rs, rt); /* rs = 0 if rt == 0  */
977 		emit(ctx, seleqz, rd, rd, rt);         /* rd = 0 if rt != 0  */
978 		if (rs != MIPS_R_ZERO)
979 			emit(ctx, or, rd, rd, rs);     /* rd = rd | rs       */
980 	} else {
981 		emit(ctx, beqz, rt, 8);                /* PC += 8 if rt == 0 */
982 		emit(ctx, nop);                        /* +0: delay slot     */
983 		emit(ctx, or, rd, rs, MIPS_R_ZERO);    /* +4: rd = rs        */
984 	}
985 	clobber_reg(ctx, rd);
986 	clobber_reg(ctx, rs);
987 }
988 
989 /* Emulation of 64-bit sltiu rd, rs, imm, where imm may be S32_MAX + 1 */
990 static void emit_sltiu_r64(struct jit_context *ctx, u8 rd,
991 			   const u8 rs[], s64 imm)
992 {
993 	u8 tmp = MIPS_R_T9;
994 
995 	if (imm < 0) {
996 		emit_mov_i(ctx, rd, imm);                 /* rd = imm        */
997 		emit(ctx, sltu, rd, lo(rs), rd);          /* rd = rsl < rd   */
998 		emit(ctx, sltiu, tmp, hi(rs), -1);        /* tmp = rsh < ~0U */
999 		emit(ctx, or, rd, rd, tmp);               /* rd = rd | tmp   */
1000 	} else { /* imm >= 0 */
1001 		if (imm > 0x7fff) {
1002 			emit_mov_i(ctx, rd, (s32)imm);     /* rd = imm       */
1003 			emit(ctx, sltu, rd, lo(rs), rd);   /* rd = rsl < rd  */
1004 		} else {
1005 			emit(ctx, sltiu, rd, lo(rs), imm); /* rd = rsl < imm */
1006 		}
1007 		emit_movn_r(ctx, rd, MIPS_R_ZERO, hi(rs)); /* rd = 0 if rsh  */
1008 	}
1009 }
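
/*
 * Editorial sketch (hypothetical helper, not used by the JIT): the
 * unsigned comparison computed above. A negative imm sign-extends to
 * ~0 in the high word, so rs < imm whenever rsh != ~0, or when the
 * high words tie and the low words decide; the sltiu against -1 above
 * is exactly that rsh != ~0 test.
 */
static inline u32 ref_sltiu64(u32 rsl, u32 rsh, s64 imm)
{
	u32 il = (u32)imm;

	if (imm < 0)
		return (rsh != 0xffffffff) || (rsl < il);
	return (rsh == 0) && (rsl < il);
}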
1010 
1011 /* Emulation of 64-bit sltu rd, rs, rt */
1012 static void emit_sltu_r64(struct jit_context *ctx, u8 rd,
1013 			  const u8 rs[], const u8 rt[])
1014 {
1015 	u8 tmp = MIPS_R_T9;
1016 
1017 	emit(ctx, sltu, rd, lo(rs), lo(rt));           /* rd = rsl < rtl     */
1018 	emit(ctx, subu, tmp, hi(rs), hi(rt));          /* tmp = rsh - rth    */
1019 	emit_movn_r(ctx, rd, MIPS_R_ZERO, tmp);        /* rd = 0 if tmp != 0 */
1020 	emit(ctx, sltu, tmp, hi(rs), hi(rt));          /* tmp = rsh < rth    */
1021 	emit(ctx, or, rd, rd, tmp);                    /* rd = rd | tmp      */
1022 }
1023 
1024 /* Emulation of 64-bit slti rd, rs, imm, where imm may be S32_MAX + 1 */
1025 static void emit_slti_r64(struct jit_context *ctx, u8 rd,
1026 			  const u8 rs[], s64 imm)
1027 {
1028 	u8 t1 = MIPS_R_T8;
1029 	u8 t2 = MIPS_R_T9;
1030 	u8 cmp;
1031 
1032 	/*
1033 	 * if ((rs < 0) ^ (imm < 0)) t1 = imm >u rsl
1034 	 * else                      t1 = rsl <u imm
1035 	 */
1036 	emit_mov_i(ctx, rd, (s32)imm);
1037 	emit(ctx, sltu, t1, lo(rs), rd);               /* t1 = rsl <u imm   */
1038 	emit(ctx, sltu, t2, rd, lo(rs));               /* t2 = imm <u rsl   */
1039 	emit(ctx, srl, rd, hi(rs), 31);                /* rd = rsh >> 31    */
1040 	if (imm < 0)
1041 		emit_movz_r(ctx, t1, t2, rd);          /* t1 = rd ? t1 : t2 */
1042 	else
1043 		emit_movn_r(ctx, t1, t2, rd);          /* t1 = rd ? t2 : t1 */
1044 	/*
1045 	 * if ((imm < 0 && rsh != 0xffffffff) ||
1046 	 *     (imm >= 0 && rsh != 0))
1047 	 *      t1 = 0
1048 	 */
1049 	if (imm < 0) {
1050 		emit(ctx, addiu, rd, hi(rs), 1);       /* rd = rsh + 1 */
1051 		cmp = rd;
1052 	} else { /* imm >= 0 */
1053 		cmp = hi(rs);
1054 	}
1055 	emit_movn_r(ctx, t1, MIPS_R_ZERO, cmp);        /* t1 = 0 if cmp != 0 */
1056 
1057 	/*
1058 	 * if (imm < 0) rd = rsh < -1
1059 	 * else         rd = rsh != 0
1060 	 * rd = rd | t1
1061 	 */
1062 	emit(ctx, slti, rd, hi(rs), imm < 0 ? -1 : 0); /* rd = rsh < hi(imm) */
1063 	emit(ctx, or, rd, rd, t1);                     /* rd = rd | t1       */
1064 }
1065 
1066 /* Emulation of 64-bit slt rd, rs, rt */
1067 static void emit_slt_r64(struct jit_context *ctx, u8 rd,
1068 			 const u8 rs[], const u8 rt[])
1069 {
1070 	u8 t1 = MIPS_R_T7;
1071 	u8 t2 = MIPS_R_T8;
1072 	u8 t3 = MIPS_R_T9;
1073 
1074 	/*
1075 	 * if ((rs < 0) ^ (rt < 0)) t1 = rtl <u rsl
1076 	 * else                     t1 = rsl <u rtl
1077 	 * if (rsh == rth)          t1 = 0
1078 	 */
1079 	emit(ctx, sltu, t1, lo(rs), lo(rt));           /* t1 = rsl <u rtl   */
1080 	emit(ctx, sltu, t2, lo(rt), lo(rs));           /* t2 = rtl <u rsl   */
1081 	emit(ctx, xor, t3, hi(rs), hi(rt));            /* t3 = rsh ^ rth    */
1082 	emit(ctx, srl, rd, t3, 31);                    /* rd = t3 >> 31     */
1083 	emit_movn_r(ctx, t1, t2, rd);                  /* t1 = rd ? t2 : t1 */
1084 	emit_movn_r(ctx, t1, MIPS_R_ZERO, t3);         /* t1 = 0 if t3 != 0 */
1085 
1086 	/* rd = (rsh < rth) | t1 */
1087 	emit(ctx, slt, rd, hi(rs), hi(rt));            /* rd = rsh <s rth   */
1088 	emit(ctx, or, rd, rd, t1);                     /* rd = rd | t1      */
1089 }
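
/*
 * Editorial sketch (hypothetical helper, not used by the JIT): the
 * signed 64-bit comparison built above from 32-bit operations. When
 * the high words differ, their signed compare decides; when they are
 * equal, the unsigned low-word compare decides.
 */
static inline u32 ref_slt64(u32 rsl, u32 rsh, u32 rtl, u32 rth)
{
	if (rsh != rth)
		return (s32)rsh < (s32)rth;
	return rsl < rtl;
}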
1090 
1091 /* Jump immediate (64-bit) */
1092 static void emit_jmp_i64(struct jit_context *ctx,
1093 			 const u8 dst[], s32 imm, s32 off, u8 op)
1094 {
1095 	u8 tmp = MIPS_R_T6;
1096 
1097 	switch (op) {
1098 	/* No-op, used internally for branch optimization */
1099 	case JIT_JNOP:
1100 		break;
1101 	/* PC += off if dst == imm */
1102 	/* PC += off if dst != imm */
1103 	case BPF_JEQ:
1104 	case BPF_JNE:
1105 		if (imm >= -0x7fff && imm <= 0x8000) {
1106 			emit(ctx, addiu, tmp, lo(dst), -imm);
1107 		} else if ((u32)imm <= 0xffff) {
1108 			emit(ctx, xori, tmp, lo(dst), imm);
1109 		} else {       /* Register fallback */
1110 			emit_mov_i(ctx, tmp, imm);
1111 			emit(ctx, xor, tmp, lo(dst), tmp);
1112 		}
1113 		if (imm < 0) { /* Compare sign extension */
1114 			emit(ctx, addiu, MIPS_R_T9, hi(dst), 1);
1115 			emit(ctx, or, tmp, tmp, MIPS_R_T9);
1116 		} else {       /* Compare zero extension */
1117 			emit(ctx, or, tmp, tmp, hi(dst));
1118 		}
1119 		if (op == BPF_JEQ)
1120 			emit(ctx, beqz, tmp, off);
1121 		else   /* BPF_JNE */
1122 			emit(ctx, bnez, tmp, off);
1123 		break;
1124 	/* PC += off if dst & imm */
1125 	/* PC += off if (dst & imm) == 0 (not in BPF, used for long jumps) */
1126 	case BPF_JSET:
1127 	case JIT_JNSET:
1128 		if ((u32)imm <= 0xffff) {
1129 			emit(ctx, andi, tmp, lo(dst), imm);
1130 		} else {     /* Register fallback */
1131 			emit_mov_i(ctx, tmp, imm);
1132 			emit(ctx, and, tmp, lo(dst), tmp);
1133 		}
1134 		if (imm < 0) /* Sign-extension pulls in high word */
1135 			emit(ctx, or, tmp, tmp, hi(dst));
1136 		if (op == BPF_JSET)
1137 			emit(ctx, bnez, tmp, off);
1138 		else   /* JIT_JNSET */
1139 			emit(ctx, beqz, tmp, off);
1140 		break;
1141 	/* PC += off if dst > imm */
1142 	case BPF_JGT:
1143 		emit_sltiu_r64(ctx, tmp, dst, (s64)imm + 1);
1144 		emit(ctx, beqz, tmp, off);
1145 		break;
1146 	/* PC += off if dst >= imm */
1147 	case BPF_JGE:
1148 		emit_sltiu_r64(ctx, tmp, dst, imm);
1149 		emit(ctx, beqz, tmp, off);
1150 		break;
1151 	/* PC += off if dst < imm */
1152 	case BPF_JLT:
1153 		emit_sltiu_r64(ctx, tmp, dst, imm);
1154 		emit(ctx, bnez, tmp, off);
1155 		break;
1156 	/* PC += off if dst <= imm */
1157 	case BPF_JLE:
1158 		emit_sltiu_r64(ctx, tmp, dst, (s64)imm + 1);
1159 		emit(ctx, bnez, tmp, off);
1160 		break;
1161 	/* PC += off if dst > imm (signed) */
1162 	case BPF_JSGT:
1163 		emit_slti_r64(ctx, tmp, dst, (s64)imm + 1);
1164 		emit(ctx, beqz, tmp, off);
1165 		break;
1166 	/* PC += off if dst >= imm (signed) */
1167 	case BPF_JSGE:
1168 		emit_slti_r64(ctx, tmp, dst, imm);
1169 		emit(ctx, beqz, tmp, off);
1170 		break;
1171 	/* PC += off if dst < imm (signed) */
1172 	case BPF_JSLT:
1173 		emit_slti_r64(ctx, tmp, dst, imm);
1174 		emit(ctx, bnez, tmp, off);
1175 		break;
1176 	/* PC += off if dst <= imm (signed) */
1177 	case BPF_JSLE:
1178 		emit_slti_r64(ctx, tmp, dst, (s64)imm + 1);
1179 		emit(ctx, bnez, tmp, off);
1180 		break;
1181 	}
1182 }
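
/*
 * Editorial sketch (hypothetical helper, not used by the JIT): the
 * 64-bit equality test emitted above for BPF_JEQ/BPF_JNE. The low word
 * is checked with a subtract or xor, the high word must match the
 * immediate's sign extension (dh + 1 == 0 for a negative immediate,
 * dh == 0 otherwise), and both checks fold into one branch condition.
 */
static inline u32 ref_jeq64_i(u32 dl, u32 dh, s32 imm)
{
	u32 t = dl ^ (u32)imm;			/* 0 iff low words match   */
	u32 h = imm < 0 ? dh + 1 : dh;		/* 0 iff high word matches */

	return (t | h) == 0;
}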
1183 
1184 /* Jump register (64-bit) */
1185 static void emit_jmp_r64(struct jit_context *ctx,
1186 			 const u8 dst[], const u8 src[], s32 off, u8 op)
1187 {
1188 	u8 t1 = MIPS_R_T6;
1189 	u8 t2 = MIPS_R_T7;
1190 
1191 	switch (op) {
1192 	/* No-op, used internally for branch optimization */
1193 	case JIT_JNOP:
1194 		break;
1195 	/* PC += off if dst == src */
1196 	/* PC += off if dst != src */
1197 	case BPF_JEQ:
1198 	case BPF_JNE:
1199 		emit(ctx, subu, t1, lo(dst), lo(src));
1200 		emit(ctx, subu, t2, hi(dst), hi(src));
1201 		emit(ctx, or, t1, t1, t2);
1202 		if (op == BPF_JEQ)
1203 			emit(ctx, beqz, t1, off);
1204 		else   /* BPF_JNE */
1205 			emit(ctx, bnez, t1, off);
1206 		break;
1207 	/* PC += off if dst & src */
1208 	/* PC += off if (dst & src) == 0 (not in BPF, used for long jumps) */
1209 	case BPF_JSET:
1210 	case JIT_JNSET:
1211 		emit(ctx, and, t1, lo(dst), lo(src));
1212 		emit(ctx, and, t2, hi(dst), hi(src));
1213 		emit(ctx, or, t1, t1, t2);
1214 		if (op == BPF_JSET)
1215 			emit(ctx, bnez, t1, off);
1216 		else   /* JIT_JNSET */
1217 			emit(ctx, beqz, t1, off);
1218 		break;
1219 	/* PC += off if dst > src */
1220 	case BPF_JGT:
1221 		emit_sltu_r64(ctx, t1, src, dst);
1222 		emit(ctx, bnez, t1, off);
1223 		break;
1224 	/* PC += off if dst >= src */
1225 	case BPF_JGE:
1226 		emit_sltu_r64(ctx, t1, dst, src);
1227 		emit(ctx, beqz, t1, off);
1228 		break;
1229 	/* PC += off if dst < src */
1230 	case BPF_JLT:
1231 		emit_sltu_r64(ctx, t1, dst, src);
1232 		emit(ctx, bnez, t1, off);
1233 		break;
1234 	/* PC += off if dst <= src */
1235 	case BPF_JLE:
1236 		emit_sltu_r64(ctx, t1, src, dst);
1237 		emit(ctx, beqz, t1, off);
1238 		break;
1239 	/* PC += off if dst > src (signed) */
1240 	case BPF_JSGT:
1241 		emit_slt_r64(ctx, t1, src, dst);
1242 		emit(ctx, bnez, t1, off);
1243 		break;
1244 	/* PC += off if dst >= src (signed) */
1245 	case BPF_JSGE:
1246 		emit_slt_r64(ctx, t1, dst, src);
1247 		emit(ctx, beqz, t1, off);
1248 		break;
1249 	/* PC += off if dst < src (signed) */
1250 	case BPF_JSLT:
1251 		emit_slt_r64(ctx, t1, dst, src);
1252 		emit(ctx, bnez, t1, off);
1253 		break;
1254 	/* PC += off if dst <= src (signed) */
1255 	case BPF_JSLE:
1256 		emit_slt_r64(ctx, t1, src, dst);
1257 		emit(ctx, beqz, t1, off);
1258 		break;
1259 	}
1260 }
1261 
1262 /* Function call */
1263 static int emit_call(struct jit_context *ctx, const struct bpf_insn *insn)
1264 {
1265 	bool fixed;
1266 	u64 addr;
1267 
1268 	/* Decode the call address */
1269 	if (bpf_jit_get_func_addr(ctx->program, insn, false,
1270 				  &addr, &fixed) < 0)
1271 		return -1;
1272 	if (!fixed)
1273 		return -1;
1274 
1275 	/* Push stack arguments */
1276 	push_regs(ctx, JIT_STACK_REGS, 0, JIT_RESERVED_STACK);
1277 
1278 	/* Emit function call */
1279 	emit_mov_i(ctx, MIPS_R_T9, addr);
1280 	emit(ctx, jalr, MIPS_R_RA, MIPS_R_T9);
1281 	emit(ctx, nop); /* Delay slot */
1282 
1283 	clobber_reg(ctx, MIPS_R_RA);
1284 	clobber_reg(ctx, MIPS_R_V0);
1285 	clobber_reg(ctx, MIPS_R_V1);
1286 	return 0;
1287 }
1288 
1289 /* Function tail call */
1290 static int emit_tail_call(struct jit_context *ctx)
1291 {
1292 	u8 ary = lo(bpf2mips32[BPF_REG_2]);
1293 	u8 ind = lo(bpf2mips32[BPF_REG_3]);
1294 	u8 t1 = MIPS_R_T8;
1295 	u8 t2 = MIPS_R_T9;
1296 	int off;
1297 
1298 	/*
1299 	 * Tail call:
1300 	 * eBPF R1   - function argument (context ptr), passed in a0-a1
1301 	 * eBPF R2   - ptr to object with array of function entry points
1302 	 * eBPF R3   - array index of function to be called
1303 	 * stack[sz] - remaining tail call count, initialized in prologue
1304 	 */
1305 
1306 	/* if (ind >= ary->map.max_entries) goto out */
1307 	off = offsetof(struct bpf_array, map.max_entries);
1308 	if (off > 0x7fff)
1309 		return -1;
1310 	emit(ctx, lw, t1, off, ary);             /* t1 = ary->map.max_entries */
1311 	emit_load_delay(ctx);                    /* Load delay slot          */
1312 	emit(ctx, sltu, t1, ind, t1);            /* t1 = ind < t1            */
1313 	emit(ctx, beqz, t1, get_offset(ctx, 1)); /* PC += off(1) if t1 == 0  */
1314 						 /* (next insn delay slot)   */
1315 	/* if (TCC-- <= 0) goto out */
1316 	emit(ctx, lw, t2, ctx->stack_size, MIPS_R_SP);  /* t2 = *(SP + size) */
1317 	emit_load_delay(ctx);                     /* Load delay slot         */
1318 	emit(ctx, blez, t2, get_offset(ctx, 1));  /* PC += off(1) if t2 <= 0 */
1319 	emit(ctx, addiu, t2, t2, -1);             /* t2-- (delay slot)       */
1320 	emit(ctx, sw, t2, ctx->stack_size, MIPS_R_SP);  /* *(SP + size) = t2 */
1321 
1322 	/* prog = ary->ptrs[ind] */
1323 	off = offsetof(struct bpf_array, ptrs);
1324 	if (off > 0x7fff)
1325 		return -1;
1326 	emit(ctx, sll, t1, ind, 2);               /* t1 = ind << 2           */
1327 	emit(ctx, addu, t1, t1, ary);             /* t1 += ary               */
1328 	emit(ctx, lw, t2, off, t1);               /* t2 = *(t1 + off)        */
1329 	emit_load_delay(ctx);                     /* Load delay slot         */
1330 
1331 	/* if (prog == 0) goto out */
1332 	emit(ctx, beqz, t2, get_offset(ctx, 1));  /* PC += off(1) if t2 == 0 */
1333 	emit(ctx, nop);                           /* Delay slot              */
1334 
1335 	/* func = prog->bpf_func + JIT_TCALL_SKIP (prologue skip offset) */
1336 	off = offsetof(struct bpf_prog, bpf_func);
1337 	if (off > 0x7fff)
1338 		return -1;
1339 	emit(ctx, lw, t1, off, t2);                /* t1 = *(t2 + off)       */
1340 	emit_load_delay(ctx);                      /* Load delay slot        */
1341 	emit(ctx, addiu, t1, t1, JIT_TCALL_SKIP);  /* t1 += skip (8 or 12)   */
1342 
1343 	/* goto func */
1344 	build_epilogue(ctx, t1);
1345 	return 0;
1346 }
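
/*
 * Editorial sketch (hypothetical helper, not used by the JIT): the
 * tail call sequence above expressed in C. The return value stands in
 * for the address the generated code jumps to; zero represents the
 * fall-through "out" path.
 */
static inline u32 ref_tail_call(const struct bpf_array *ary, u32 ind, s32 *tcc)
{
	const struct bpf_prog *prog;

	if (ind >= ary->map.max_entries)
		return 0;
	if (*tcc <= 0)
		return 0;
	(*tcc)--;
	prog = ary->ptrs[ind];
	if (!prog)
		return 0;
	return (u32)prog->bpf_func + JIT_TCALL_SKIP;
}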
1347 
1348 /*
1349  * Stack frame layout for a JITed program (stack grows down).
1350  *
1351  * Higher address  : Caller's stack frame       :
1352  *                 :----------------------------:
1353  *                 : 64-bit eBPF args r3-r5     :
1354  *                 :----------------------------:
1355  *                 : Reserved / tail call count :
1356  *                 +============================+  <--- MIPS sp before call
1357  *                 | Callee-saved registers,    |
1358  *                 | including RA and FP        |
1359  *                 +----------------------------+  <--- eBPF FP (MIPS zero,fp)
1360  *                 | Local eBPF variables       |
1361  *                 | allocated by program       |
1362  *                 +----------------------------+
1363  *                 | Reserved for caller-saved  |
1364  *                 | registers                  |
1365  *                 +----------------------------+
1366  *                 | Reserved for 64-bit eBPF   |
1367  *                 | args r3-r5 & args passed   |
1368  *                 | on stack in kernel calls   |
1369  * Lower address   +============================+  <--- MIPS sp
1370  */
1371 
1372 /* Build program prologue to set up the stack and registers */
1373 void build_prologue(struct jit_context *ctx)
1374 {
1375 	const u8 *r1 = bpf2mips32[BPF_REG_1];
1376 	const u8 *fp = bpf2mips32[BPF_REG_FP];
1377 	int stack, saved, locals, reserved;
1378 
1379 	/*
1380 	 * In the unlikely event that the TCC limit is raised to more
1381 	 * than 16 bits, it would be clamped to the maximum value allowed
1382 	 * for the generated code (0xffff). It is better to fail to compile
1383 	 * than to degrade gracefully.
1384 	 */
1385 	BUILD_BUG_ON(MAX_TAIL_CALL_CNT > 0xffff);
1386 
1387 	/*
1388 	 * The first two instructions initialize TCC in the reserved (for us)
1389 	 * 16-byte area in the parent's stack frame. On a tail call, the
1390 	 * calling function jumps into the prologue after these instructions.
1391 	 */
1392 	emit(ctx, ori, MIPS_R_T9, MIPS_R_ZERO, MAX_TAIL_CALL_CNT);
1393 	emit(ctx, sw, MIPS_R_T9, 0, MIPS_R_SP);
1394 
1395 	/*
1396 	 * Register eBPF R1 contains the 32-bit context pointer argument.
1397 	 * A 32-bit argument is always passed in MIPS register a0, regardless
1398 	 * of CPU endianness. Initialize R1 accordingly and zero-extend.
1399 	 */
1400 #ifdef __BIG_ENDIAN
1401 	emit(ctx, move, lo(r1), MIPS_R_A0);
1402 #endif
1403 
1404 	/* === Entry-point for tail calls === */
1405 
1406 	/* Zero-extend the 32-bit argument */
1407 	emit(ctx, move, hi(r1), MIPS_R_ZERO);
1408 
1409 	/* If the eBPF frame pointer was accessed it must be saved */
1410 	if (ctx->accessed & BIT(BPF_REG_FP))
1411 		clobber_reg64(ctx, fp);
1412 
1413 	/* Compute the stack space needed for callee-saved registers */
1414 	saved = hweight32(ctx->clobbered & JIT_CALLEE_REGS) * sizeof(u32);
1415 	saved = ALIGN(saved, MIPS_STACK_ALIGNMENT);
1416 
1417 	/* Stack space used by eBPF program local data */
1418 	locals = ALIGN(ctx->program->aux->stack_depth, MIPS_STACK_ALIGNMENT);
1419 
1420 	/*
1421 	 * If we are emitting function calls, reserve extra stack space for
1422 	 * caller-saved registers and function arguments passed on the stack.
1423 	 * The required space is computed automatically during resource
1424 	 * usage discovery (pass 1).
1425 	 */
1426 	reserved = ctx->stack_used;
1427 
1428 	/* Allocate the stack frame */
1429 	stack = ALIGN(saved + locals + reserved, MIPS_STACK_ALIGNMENT);
1430 	emit(ctx, addiu, MIPS_R_SP, MIPS_R_SP, -stack);
1431 
1432 	/* Store callee-saved registers on stack */
1433 	push_regs(ctx, ctx->clobbered & JIT_CALLEE_REGS, 0, stack - saved);
1434 
1435 	/* Initialize the eBPF frame pointer if accessed */
1436 	if (ctx->accessed & BIT(BPF_REG_FP))
1437 		emit(ctx, addiu, lo(fp), MIPS_R_SP, stack - saved);
1438 
1439 	ctx->saved_size = saved;
1440 	ctx->stack_size = stack;
1441 }
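
/*
 * Editorial example: a hypothetical program that clobbers s0-s3 and ra
 * (5 words, padded to 24 bytes), declares 16 bytes of BPF stack and
 * needs 24 bytes of call scratch space gets stack = 24 + 16 + 24 = 64
 * bytes, already a multiple of MIPS_STACK_ALIGNMENT.
 */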
1442 
1443 /* Build the program epilogue to restore the stack and registers */
1444 void build_epilogue(struct jit_context *ctx, int dest_reg)
1445 {
1446 	/* Restore callee-saved registers from stack */
1447 	pop_regs(ctx, ctx->clobbered & JIT_CALLEE_REGS, 0,
1448 		 ctx->stack_size - ctx->saved_size);
1449 	/*
1450 	 * A 32-bit return value is always passed in MIPS register v0,
1451 	 * but on big-endian targets the low part of R0 is mapped to v1.
1452 	 */
1453 #ifdef __BIG_ENDIAN
1454 	emit(ctx, move, MIPS_R_V0, MIPS_R_V1);
1455 #endif
1456 
1457 	/* Jump to the return address and adjust the stack pointer */
1458 	emit(ctx, jr, dest_reg);
1459 	emit(ctx, addiu, MIPS_R_SP, MIPS_R_SP, ctx->stack_size);
1460 }
1461 
1462 /* Build one eBPF instruction */
1463 int build_insn(const struct bpf_insn *insn, struct jit_context *ctx)
1464 {
1465 	const u8 *dst = bpf2mips32[insn->dst_reg];
1466 	const u8 *src = bpf2mips32[insn->src_reg];
1467 	const u8 *res = bpf2mips32[BPF_REG_0];
1468 	const u8 *tmp = bpf2mips32[JIT_REG_TMP];
1469 	u8 code = insn->code;
1470 	s16 off = insn->off;
1471 	s32 imm = insn->imm;
1472 	s32 val, rel;
1473 	u8 alu, jmp;
1474 
1475 	switch (code) {
1476 	/* ALU operations */
1477 	/* dst = imm */
1478 	case BPF_ALU | BPF_MOV | BPF_K:
1479 		emit_mov_i(ctx, lo(dst), imm);
1480 		emit_zext_ver(ctx, dst);
1481 		break;
1482 	/* dst = src */
1483 	case BPF_ALU | BPF_MOV | BPF_X:
1484 		if (imm == 1) {
1485 			/* Special mov32 for zext */
1486 			emit_mov_i(ctx, hi(dst), 0);
1487 		} else {
1488 			emit_mov_r(ctx, lo(dst), lo(src));
1489 			emit_zext_ver(ctx, dst);
1490 		}
1491 		break;
1492 	/* dst = -dst */
1493 	case BPF_ALU | BPF_NEG:
1494 		emit_alu_i(ctx, lo(dst), 0, BPF_NEG);
1495 		emit_zext_ver(ctx, dst);
1496 		break;
1497 	/* dst = dst & imm */
1498 	/* dst = dst | imm */
1499 	/* dst = dst ^ imm */
1500 	/* dst = dst << imm */
1501 	/* dst = dst >> imm */
1502 	/* dst = dst >> imm (arithmetic) */
1503 	/* dst = dst + imm */
1504 	/* dst = dst - imm */
1505 	/* dst = dst * imm */
1506 	/* dst = dst / imm */
1507 	/* dst = dst % imm */
1508 	case BPF_ALU | BPF_OR | BPF_K:
1509 	case BPF_ALU | BPF_AND | BPF_K:
1510 	case BPF_ALU | BPF_XOR | BPF_K:
1511 	case BPF_ALU | BPF_LSH | BPF_K:
1512 	case BPF_ALU | BPF_RSH | BPF_K:
1513 	case BPF_ALU | BPF_ARSH | BPF_K:
1514 	case BPF_ALU | BPF_ADD | BPF_K:
1515 	case BPF_ALU | BPF_SUB | BPF_K:
1516 	case BPF_ALU | BPF_MUL | BPF_K:
1517 	case BPF_ALU | BPF_DIV | BPF_K:
1518 	case BPF_ALU | BPF_MOD | BPF_K:
1519 		if (!valid_alu_i(BPF_OP(code), imm)) {
1520 			emit_mov_i(ctx, MIPS_R_T6, imm);
1521 			emit_alu_r(ctx, lo(dst), MIPS_R_T6, BPF_OP(code));
1522 		} else if (rewrite_alu_i(BPF_OP(code), imm, &alu, &val)) {
1523 			emit_alu_i(ctx, lo(dst), val, alu);
1524 		}
1525 		emit_zext_ver(ctx, dst);
1526 		break;
1527 	/* dst = dst & src */
1528 	/* dst = dst | src */
1529 	/* dst = dst ^ src */
1530 	/* dst = dst << src */
1531 	/* dst = dst >> src */
1532 	/* dst = dst >> src (arithmetic) */
1533 	/* dst = dst + src */
1534 	/* dst = dst - src */
1535 	/* dst = dst * src */
1536 	/* dst = dst / src */
1537 	/* dst = dst % src */
1538 	case BPF_ALU | BPF_AND | BPF_X:
1539 	case BPF_ALU | BPF_OR | BPF_X:
1540 	case BPF_ALU | BPF_XOR | BPF_X:
1541 	case BPF_ALU | BPF_LSH | BPF_X:
1542 	case BPF_ALU | BPF_RSH | BPF_X:
1543 	case BPF_ALU | BPF_ARSH | BPF_X:
1544 	case BPF_ALU | BPF_ADD | BPF_X:
1545 	case BPF_ALU | BPF_SUB | BPF_X:
1546 	case BPF_ALU | BPF_MUL | BPF_X:
1547 	case BPF_ALU | BPF_DIV | BPF_X:
1548 	case BPF_ALU | BPF_MOD | BPF_X:
1549 		emit_alu_r(ctx, lo(dst), lo(src), BPF_OP(code));
1550 		emit_zext_ver(ctx, dst);
1551 		break;
1552 	/* dst = imm (64-bit) */
1553 	case BPF_ALU64 | BPF_MOV | BPF_K:
1554 		emit_mov_se_i64(ctx, dst, imm);
1555 		break;
1556 	/* dst = src (64-bit) */
1557 	case BPF_ALU64 | BPF_MOV | BPF_X:
1558 		emit_mov_r(ctx, lo(dst), lo(src));
1559 		emit_mov_r(ctx, hi(dst), hi(src));
1560 		break;
1561 	/* dst = -dst (64-bit) */
1562 	case BPF_ALU64 | BPF_NEG:
1563 		emit_neg_i64(ctx, dst);
1564 		break;
1565 	/* dst = dst & imm (64-bit) */
1566 	case BPF_ALU64 | BPF_AND | BPF_K:
1567 		emit_alu_i64(ctx, dst, imm, BPF_OP(code));
1568 		break;
1569 	/* dst = dst | imm (64-bit) */
1570 	/* dst = dst ^ imm (64-bit) */
1571 	/* dst = dst + imm (64-bit) */
1572 	/* dst = dst - imm (64-bit) */
1573 	case BPF_ALU64 | BPF_OR | BPF_K:
1574 	case BPF_ALU64 | BPF_XOR | BPF_K:
1575 	case BPF_ALU64 | BPF_ADD | BPF_K:
1576 	case BPF_ALU64 | BPF_SUB | BPF_K:
1577 		if (imm)
1578 			emit_alu_i64(ctx, dst, imm, BPF_OP(code));
1579 		break;
1580 	/* dst = dst << imm (64-bit) */
1581 	/* dst = dst >> imm (64-bit) */
1582 	/* dst = dst >> imm (64-bit, arithmetic) */
1583 	case BPF_ALU64 | BPF_LSH | BPF_K:
1584 	case BPF_ALU64 | BPF_RSH | BPF_K:
1585 	case BPF_ALU64 | BPF_ARSH | BPF_K:
1586 		if (imm)
1587 			emit_shift_i64(ctx, dst, imm, BPF_OP(code));
1588 		break;
1589 	/* dst = dst * imm (64-bit) */
1590 	case BPF_ALU64 | BPF_MUL | BPF_K:
1591 		emit_mul_i64(ctx, dst, imm);
1592 		break;
1593 	/* dst = dst / imm (64-bit) */
1594 	/* dst = dst % imm (64-bit) */
1595 	case BPF_ALU64 | BPF_DIV | BPF_K:
1596 	case BPF_ALU64 | BPF_MOD | BPF_K:
1597 		/*
1598 		 * Sign-extend the immediate value into a temporary register,
1599 		 * and then do the operation on this register.
1600 		 */
1601 		emit_mov_se_i64(ctx, tmp, imm);
1602 		emit_divmod_r64(ctx, dst, tmp, BPF_OP(code));
1603 		break;
1604 	/* dst = dst & src (64-bit) */
1605 	/* dst = dst | src (64-bit) */
1606 	/* dst = dst ^ src (64-bit) */
1607 	/* dst = dst + src (64-bit) */
1608 	/* dst = dst - src (64-bit) */
1609 	case BPF_ALU64 | BPF_AND | BPF_X:
1610 	case BPF_ALU64 | BPF_OR | BPF_X:
1611 	case BPF_ALU64 | BPF_XOR | BPF_X:
1612 	case BPF_ALU64 | BPF_ADD | BPF_X:
1613 	case BPF_ALU64 | BPF_SUB | BPF_X:
1614 		emit_alu_r64(ctx, dst, src, BPF_OP(code));
1615 		break;
1616 	/* dst = dst << src (64-bit) */
1617 	/* dst = dst >> src (64-bit) */
1618 	/* dst = dst >> src (64-bit, arithmetic) */
1619 	case BPF_ALU64 | BPF_LSH | BPF_X:
1620 	case BPF_ALU64 | BPF_RSH | BPF_X:
1621 	case BPF_ALU64 | BPF_ARSH | BPF_X:
1622 		emit_shift_r64(ctx, dst, lo(src), BPF_OP(code));
1623 		break;
1624 	/* dst = dst * src (64-bit) */
1625 	case BPF_ALU64 | BPF_MUL | BPF_X:
1626 		emit_mul_r64(ctx, dst, src);
1627 		break;
1628 	/* dst = dst / src (64-bit) */
1629 	/* dst = dst % src (64-bit) */
1630 	case BPF_ALU64 | BPF_DIV | BPF_X:
1631 	case BPF_ALU64 | BPF_MOD | BPF_X:
1632 		emit_divmod_r64(ctx, dst, src, BPF_OP(code));
1633 		break;
1634 	/* dst = htole(dst) */
1635 	/* dst = htobe(dst) */
1636 	case BPF_ALU | BPF_END | BPF_FROM_LE:
1637 	case BPF_ALU | BPF_END | BPF_FROM_BE:
1638 		if (BPF_SRC(code) ==
1639 #ifdef __BIG_ENDIAN
1640 		    BPF_FROM_LE
1641 #else
1642 		    BPF_FROM_BE
1643 #endif
1644 		    )
1645 			emit_bswap_r64(ctx, dst, imm);
1646 		else
1647 			emit_trunc_r64(ctx, dst, imm);
1648 		break;
1649 	/* dst = imm64 */
1650 	case BPF_LD | BPF_IMM | BPF_DW:
1651 		emit_mov_i(ctx, lo(dst), imm);
1652 		emit_mov_i(ctx, hi(dst), insn[1].imm);
1653 		return 1;
1654 	/* LDX: dst = *(size *)(src + off) */
1655 	case BPF_LDX | BPF_MEM | BPF_W:
1656 	case BPF_LDX | BPF_MEM | BPF_H:
1657 	case BPF_LDX | BPF_MEM | BPF_B:
1658 	case BPF_LDX | BPF_MEM | BPF_DW:
1659 		emit_ldx(ctx, dst, lo(src), off, BPF_SIZE(code));
1660 		break;
1661 	/* ST: *(size *)(dst + off) = imm */
1662 	case BPF_ST | BPF_MEM | BPF_W:
1663 	case BPF_ST | BPF_MEM | BPF_H:
1664 	case BPF_ST | BPF_MEM | BPF_B:
1665 	case BPF_ST | BPF_MEM | BPF_DW:
1666 		switch (BPF_SIZE(code)) {
1667 		case BPF_DW:
1668 			/* Sign-extend immediate value into temporary reg */
1669 			emit_mov_se_i64(ctx, tmp, imm);
1670 			break;
1671 		case BPF_W:
1672 		case BPF_H:
1673 		case BPF_B:
1674 			emit_mov_i(ctx, lo(tmp), imm);
1675 			break;
1676 		}
1677 		emit_stx(ctx, lo(dst), tmp, off, BPF_SIZE(code));
1678 		break;
1679 	/* STX: *(size *)(dst + off) = src */
1680 	case BPF_STX | BPF_MEM | BPF_W:
1681 	case BPF_STX | BPF_MEM | BPF_H:
1682 	case BPF_STX | BPF_MEM | BPF_B:
1683 	case BPF_STX | BPF_MEM | BPF_DW:
1684 		emit_stx(ctx, lo(dst), src, off, BPF_SIZE(code));
1685 		break;
1686 	/* Speculation barrier */
1687 	case BPF_ST | BPF_NOSPEC:
1688 		break;
1689 	/* Atomics */
1690 	case BPF_STX | BPF_ATOMIC | BPF_W:
1691 		switch (imm) {
1692 		case BPF_ADD:
1693 		case BPF_ADD | BPF_FETCH:
1694 		case BPF_AND:
1695 		case BPF_AND | BPF_FETCH:
1696 		case BPF_OR:
1697 		case BPF_OR | BPF_FETCH:
1698 		case BPF_XOR:
1699 		case BPF_XOR | BPF_FETCH:
1700 		case BPF_XCHG:
1701 			if (cpu_has_llsc)
1702 				emit_atomic_r(ctx, lo(dst), lo(src), off, imm);
1703 			else /* Non-ll/sc fallback */
1704 				emit_atomic_r32(ctx, lo(dst), lo(src),
1705 						off, imm);
1706 			if (imm & BPF_FETCH)
1707 				emit_zext_ver(ctx, src);
1708 			break;
1709 		case BPF_CMPXCHG:
1710 			if (cpu_has_llsc)
1711 				emit_cmpxchg_r(ctx, lo(dst), lo(src),
1712 					       lo(res), off);
1713 			else /* Non-ll/sc fallback */
1714 				emit_cmpxchg_r32(ctx, lo(dst), lo(src), off);
1715 			/* Result zero-extension inserted by verifier */
1716 			break;
1717 		default:
1718 			goto notyet;
1719 		}
1720 		break;
1721 	/* Atomics (64-bit) */
1722 	case BPF_STX | BPF_ATOMIC | BPF_DW:
1723 		switch (imm) {
1724 		case BPF_ADD:
1725 		case BPF_ADD | BPF_FETCH:
1726 		case BPF_AND:
1727 		case BPF_AND | BPF_FETCH:
1728 		case BPF_OR:
1729 		case BPF_OR | BPF_FETCH:
1730 		case BPF_XOR:
1731 		case BPF_XOR | BPF_FETCH:
1732 		case BPF_XCHG:
1733 			emit_atomic_r64(ctx, lo(dst), src, off, imm);
1734 			break;
1735 		case BPF_CMPXCHG:
1736 			emit_cmpxchg_r64(ctx, lo(dst), src, off);
1737 			break;
1738 		default:
1739 			goto notyet;
1740 		}
1741 		break;
1742 	/* PC += off if dst == src */
1743 	/* PC += off if dst != src */
1744 	/* PC += off if dst & src */
1745 	/* PC += off if dst > src */
1746 	/* PC += off if dst >= src */
1747 	/* PC += off if dst < src */
1748 	/* PC += off if dst <= src */
1749 	/* PC += off if dst > src (signed) */
1750 	/* PC += off if dst >= src (signed) */
1751 	/* PC += off if dst < src (signed) */
1752 	/* PC += off if dst <= src (signed) */
1753 	case BPF_JMP32 | BPF_JEQ | BPF_X:
1754 	case BPF_JMP32 | BPF_JNE | BPF_X:
1755 	case BPF_JMP32 | BPF_JSET | BPF_X:
1756 	case BPF_JMP32 | BPF_JGT | BPF_X:
1757 	case BPF_JMP32 | BPF_JGE | BPF_X:
1758 	case BPF_JMP32 | BPF_JLT | BPF_X:
1759 	case BPF_JMP32 | BPF_JLE | BPF_X:
1760 	case BPF_JMP32 | BPF_JSGT | BPF_X:
1761 	case BPF_JMP32 | BPF_JSGE | BPF_X:
1762 	case BPF_JMP32 | BPF_JSLT | BPF_X:
1763 	case BPF_JMP32 | BPF_JSLE | BPF_X:
1764 		if (off == 0)
1765 			break;
1766 		setup_jmp_r(ctx, dst == src, BPF_OP(code), off, &jmp, &rel);
1767 		emit_jmp_r(ctx, lo(dst), lo(src), rel, jmp);
1768 		if (finish_jmp(ctx, jmp, off) < 0)
1769 			goto toofar;
1770 		break;
1771 	/* PC += off if dst == imm */
1772 	/* PC += off if dst != imm */
1773 	/* PC += off if dst & imm */
1774 	/* PC += off if dst > imm */
1775 	/* PC += off if dst >= imm */
1776 	/* PC += off if dst < imm */
1777 	/* PC += off if dst <= imm */
1778 	/* PC += off if dst > imm (signed) */
1779 	/* PC += off if dst >= imm (signed) */
1780 	/* PC += off if dst < imm (signed) */
1781 	/* PC += off if dst <= imm (signed) */
1782 	case BPF_JMP32 | BPF_JEQ | BPF_K:
1783 	case BPF_JMP32 | BPF_JNE | BPF_K:
1784 	case BPF_JMP32 | BPF_JSET | BPF_K:
1785 	case BPF_JMP32 | BPF_JGT | BPF_K:
1786 	case BPF_JMP32 | BPF_JGE | BPF_K:
1787 	case BPF_JMP32 | BPF_JLT | BPF_K:
1788 	case BPF_JMP32 | BPF_JLE | BPF_K:
1789 	case BPF_JMP32 | BPF_JSGT | BPF_K:
1790 	case BPF_JMP32 | BPF_JSGE | BPF_K:
1791 	case BPF_JMP32 | BPF_JSLT | BPF_K:
1792 	case BPF_JMP32 | BPF_JSLE | BPF_K:
1793 		if (off == 0)
1794 			break;
1795 		setup_jmp_i(ctx, imm, 32, BPF_OP(code), off, &jmp, &rel);
1796 		if (valid_jmp_i(jmp, imm)) {
1797 			emit_jmp_i(ctx, lo(dst), imm, rel, jmp);
1798 		} else {
1799 			/* Move large immediate to register */
1800 			emit_mov_i(ctx, MIPS_R_T6, imm);
1801 			emit_jmp_r(ctx, lo(dst), MIPS_R_T6, rel, jmp);
1802 		}
1803 		if (finish_jmp(ctx, jmp, off) < 0)
1804 			goto toofar;
1805 		break;
1806 	/* PC += off if dst == src */
1807 	/* PC += off if dst != src */
1808 	/* PC += off if dst & src */
1809 	/* PC += off if dst > src */
1810 	/* PC += off if dst >= src */
1811 	/* PC += off if dst < src */
1812 	/* PC += off if dst <= src */
1813 	/* PC += off if dst > src (signed) */
1814 	/* PC += off if dst >= src (signed) */
1815 	/* PC += off if dst < src (signed) */
1816 	/* PC += off if dst <= src (signed) */
1817 	case BPF_JMP | BPF_JEQ | BPF_X:
1818 	case BPF_JMP | BPF_JNE | BPF_X:
1819 	case BPF_JMP | BPF_JSET | BPF_X:
1820 	case BPF_JMP | BPF_JGT | BPF_X:
1821 	case BPF_JMP | BPF_JGE | BPF_X:
1822 	case BPF_JMP | BPF_JLT | BPF_X:
1823 	case BPF_JMP | BPF_JLE | BPF_X:
1824 	case BPF_JMP | BPF_JSGT | BPF_X:
1825 	case BPF_JMP | BPF_JSGE | BPF_X:
1826 	case BPF_JMP | BPF_JSLT | BPF_X:
1827 	case BPF_JMP | BPF_JSLE | BPF_X:
1828 		if (off == 0)
1829 			break;
1830 		setup_jmp_r(ctx, dst == src, BPF_OP(code), off, &jmp, &rel);
1831 		emit_jmp_r64(ctx, dst, src, rel, jmp);
1832 		if (finish_jmp(ctx, jmp, off) < 0)
1833 			goto toofar;
1834 		break;
1835 	/* PC += off if dst == imm */
1836 	/* PC += off if dst != imm */
1837 	/* PC += off if dst & imm */
1838 	/* PC += off if dst > imm */
1839 	/* PC += off if dst >= imm */
1840 	/* PC += off if dst < imm */
1841 	/* PC += off if dst <= imm */
1842 	/* PC += off if dst > imm (signed) */
1843 	/* PC += off if dst >= imm (signed) */
1844 	/* PC += off if dst < imm (signed) */
1845 	/* PC += off if dst <= imm (signed) */
1846 	case BPF_JMP | BPF_JEQ | BPF_K:
1847 	case BPF_JMP | BPF_JNE | BPF_K:
1848 	case BPF_JMP | BPF_JSET | BPF_K:
1849 	case BPF_JMP | BPF_JGT | BPF_K:
1850 	case BPF_JMP | BPF_JGE | BPF_K:
1851 	case BPF_JMP | BPF_JLT | BPF_K:
1852 	case BPF_JMP | BPF_JLE | BPF_K:
1853 	case BPF_JMP | BPF_JSGT | BPF_K:
1854 	case BPF_JMP | BPF_JSGE | BPF_K:
1855 	case BPF_JMP | BPF_JSLT | BPF_K:
1856 	case BPF_JMP | BPF_JSLE | BPF_K:
1857 		if (off == 0)
1858 			break;
1859 		setup_jmp_i(ctx, imm, 64, BPF_OP(code), off, &jmp, &rel);
1860 		emit_jmp_i64(ctx, dst, imm, rel, jmp);
1861 		if (finish_jmp(ctx, jmp, off) < 0)
1862 			goto toofar;
1863 		break;
1864 	/* PC += off */
1865 	case BPF_JMP | BPF_JA:
1866 		if (off == 0)
1867 			break;
1868 		if (emit_ja(ctx, off) < 0)
1869 			goto toofar;
1870 		break;
1871 	/* Tail call */
1872 	case BPF_JMP | BPF_TAIL_CALL:
1873 		if (emit_tail_call(ctx) < 0)
1874 			goto invalid;
1875 		break;
1876 	/* Function call */
1877 	case BPF_JMP | BPF_CALL:
1878 		if (emit_call(ctx, insn) < 0)
1879 			goto invalid;
1880 		break;
1881 	/* Function return */
1882 	case BPF_JMP | BPF_EXIT:
1883 		/*
1884 		 * Optimization: when last instruction is EXIT
1885 		 * simply continue to epilogue.
1886 		 */
1887 		if (ctx->bpf_index == ctx->program->len - 1)
1888 			break;
1889 		if (emit_exit(ctx) < 0)
1890 			goto toofar;
1891 		break;
1892 
1893 	default:
1894 invalid:
1895 		pr_err_once("unknown opcode %02x\n", code);
1896 		return -EINVAL;
1897 notyet:
1898 		pr_info_once("*** NOT YET: opcode %02x ***\n", code);
1899 		return -EFAULT;
1900 toofar:
1901 		pr_info_once("*** TOO FAR: jump at %u opcode %02x ***\n",
1902 			     ctx->bpf_index, code);
1903 		return -E2BIG;
1904 	}
1905 	return 0;
1906 }
1907