xref: /openbmc/linux/arch/x86/kernel/alternative.c (revision dbd171df)
1 // SPDX-License-Identifier: GPL-2.0-only
2 #define pr_fmt(fmt) "SMP alternatives: " fmt
3 
4 #include <linux/module.h>
5 #include <linux/sched.h>
6 #include <linux/perf_event.h>
7 #include <linux/mutex.h>
8 #include <linux/list.h>
9 #include <linux/stringify.h>
10 #include <linux/highmem.h>
11 #include <linux/mm.h>
12 #include <linux/vmalloc.h>
13 #include <linux/memory.h>
14 #include <linux/stop_machine.h>
15 #include <linux/slab.h>
16 #include <linux/kdebug.h>
17 #include <linux/kprobes.h>
18 #include <linux/mmu_context.h>
19 #include <linux/bsearch.h>
20 #include <linux/sync_core.h>
21 #include <asm/text-patching.h>
22 #include <asm/alternative.h>
23 #include <asm/sections.h>
24 #include <asm/mce.h>
25 #include <asm/nmi.h>
26 #include <asm/cacheflush.h>
27 #include <asm/tlbflush.h>
28 #include <asm/insn.h>
29 #include <asm/io.h>
30 #include <asm/fixmap.h>
31 #include <asm/paravirt.h>
32 #include <asm/asm-prototypes.h>
33 
/* Set to 1 at the end of alternative_instructions(), once patching is done. */
int __read_mostly alternatives_patched;

EXPORT_SYMBOL_GPL(alternatives_patched);

/*
 * Size in bytes of the on-stack scratch buffers in which apply_alternatives()
 * and apply_paravirt() assemble a patch before writing it out.
 */
#define MAX_PATCH_LEN (255-1)
39 
/* Non-zero enables the DPRINTK()/DUMP_BYTES() debug output in this file. */
static int __initdata_or_module debug_alternative;

/*
 * Handler for the "debug-alternative" boot parameter: switch the debug output
 * on. The parameter value in @str is not looked at.
 */
static int __init debug_alt(char *str)
{
	debug_alternative = 1;
	return 1;
}
__setup("debug-alternative", debug_alt);
48 
/*
 * Non-zero keeps alternative_instructions() from patching the core kernel to
 * its uniprocessor variant.
 */
static int noreplace_smp;

/*
 * Handler for the "noreplace-smp" boot parameter: set noreplace_smp. The
 * parameter value in @str is not looked at.
 */
static int __init setup_noreplace_smp(char *str)
{
	noreplace_smp = 1;
	return 1;
}
__setup("noreplace-smp", setup_noreplace_smp);
57 
/*
 * Debug printk, emitted at KERN_DEBUG level only when debug_alternative is
 * set. The format is run through pr_fmt() and a trailing newline is appended,
 * so callers should not supply their own "\n".
 */
#define DPRINTK(fmt, args...)						\
do {									\
	if (debug_alternative)						\
		printk(KERN_DEBUG pr_fmt(fmt) "\n", ##args);		\
} while (0)
63 
/*
 * Hex-dump @len bytes of @buf on a single KERN_DEBUG line, preceded by the
 * printf-style prefix @fmt/@args. Does nothing when debug_alternative is
 * clear or when @len is zero (the "break" leaves the do/while).
 *
 * @buf is parenthesized on every use so that arbitrary pointer expressions
 * (casts, "ptr + off", ...) are indexed as a whole.
 */
#define DUMP_BYTES(buf, len, fmt, args...)				\
do {									\
	if (unlikely(debug_alternative)) {				\
		int j;							\
									\
		if (!(len))						\
			break;						\
									\
		printk(KERN_DEBUG pr_fmt(fmt), ##args);			\
		for (j = 0; j < (len) - 1; j++)				\
			printk(KERN_CONT "%02hhx ", (buf)[j]);		\
		printk(KERN_CONT "%02hhx\n", (buf)[j]);			\
	}								\
} while (0)
78 
/*
 * Back-to-back encodings of the NOPs BYTES_NOP1 .. BYTES_NOP8. The offsets
 * used by x86_nops[] assume that BYTES_NOPn expands to exactly n bytes.
 */
static const unsigned char x86nops[] =
{
	BYTES_NOP1,
	BYTES_NOP2,
	BYTES_NOP3,
	BYTES_NOP4,
	BYTES_NOP5,
	BYTES_NOP6,
	BYTES_NOP7,
	BYTES_NOP8,
};
90 
/*
 * x86_nops[n] points at the n-byte NOP inside x86nops[]; entry 0 is NULL.
 * Each offset is the running sum of the lengths of the preceding NOPs.
 * NOTE(review): the nine initializers presumably match ASM_NOP_MAX == 8 --
 * confirm against the header that defines it.
 */
const unsigned char * const x86_nops[ASM_NOP_MAX+1] =
{
	NULL,
	x86nops,
	x86nops + 1,
	x86nops + 1 + 2,
	x86nops + 1 + 2 + 3,
	x86nops + 1 + 2 + 3 + 4,
	x86nops + 1 + 2 + 3 + 4 + 5,
	x86nops + 1 + 2 + 3 + 4 + 5 + 6,
	x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
};
103 
104 /* Use this to add nops to a buffer, then text_poke the whole buffer. */
105 static void __init_or_module add_nops(void *insns, unsigned int len)
106 {
107 	while (len > 0) {
108 		unsigned int noplen = len;
109 		if (noplen > ASM_NOP_MAX)
110 			noplen = ASM_NOP_MAX;
111 		memcpy(insns, x86_nops[noplen], noplen);
112 		insns += noplen;
113 		len -= noplen;
114 	}
115 }
116 
/*
 * Start/end markers of the tables that alternative_instructions() hands to
 * apply_retpolines(), apply_alternatives() and the SMP lock-prefix code.
 * Presumably provided by the linker script -- not visible from this file.
 */
extern s32 __retpoline_sites[], __retpoline_sites_end[];
extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
extern s32 __smp_locks[], __smp_locks_end[];
/* Forward declaration; the definition is further down in this file. */
void text_poke_early(void *addr, const void *opcode, size_t len);
121 
122 /*
123  * Are we looking at a near JMP with a 1 or 4-byte displacement.
124  */
125 static inline bool is_jmp(const u8 opcode)
126 {
127 	return opcode == 0xeb || opcode == 0xe9;
128 }
129 
130 static void __init_or_module
131 recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insn_buff)
132 {
133 	u8 *next_rip, *tgt_rip;
134 	s32 n_dspl, o_dspl;
135 	int repl_len;
136 
137 	if (a->replacementlen != 5)
138 		return;
139 
140 	o_dspl = *(s32 *)(insn_buff + 1);
141 
142 	/* next_rip of the replacement JMP */
143 	next_rip = repl_insn + a->replacementlen;
144 	/* target rip of the replacement JMP */
145 	tgt_rip  = next_rip + o_dspl;
146 	n_dspl = tgt_rip - orig_insn;
147 
148 	DPRINTK("target RIP: %px, new_displ: 0x%x", tgt_rip, n_dspl);
149 
150 	if (tgt_rip - orig_insn >= 0) {
151 		if (n_dspl - 2 <= 127)
152 			goto two_byte_jmp;
153 		else
154 			goto five_byte_jmp;
155 	/* negative offset */
156 	} else {
157 		if (((n_dspl - 2) & 0xff) == (n_dspl - 2))
158 			goto two_byte_jmp;
159 		else
160 			goto five_byte_jmp;
161 	}
162 
163 two_byte_jmp:
164 	n_dspl -= 2;
165 
166 	insn_buff[0] = 0xeb;
167 	insn_buff[1] = (s8)n_dspl;
168 	add_nops(insn_buff + 2, 3);
169 
170 	repl_len = 2;
171 	goto done;
172 
173 five_byte_jmp:
174 	n_dspl -= 5;
175 
176 	insn_buff[0] = 0xe9;
177 	*(s32 *)&insn_buff[1] = n_dspl;
178 
179 	repl_len = 5;
180 
181 done:
182 
183 	DPRINTK("final displ: 0x%08x, JMP 0x%lx",
184 		n_dspl, (unsigned long)orig_insn + n_dspl + repl_len);
185 }
186 
187 /*
188  * optimize_nops_range() - Optimize a sequence of single byte NOPs (0x90)
189  *
190  * @instr: instruction byte stream
191  * @instrlen: length of the above
192  * @off: offset within @instr where the first NOP has been detected
193  *
194  * Return: number of NOPs found (and replaced).
195  */
196 static __always_inline int optimize_nops_range(u8 *instr, u8 instrlen, int off)
197 {
198 	unsigned long flags;
199 	int i = off, nnops;
200 
201 	while (i < instrlen) {
202 		if (instr[i] != 0x90)
203 			break;
204 
205 		i++;
206 	}
207 
208 	nnops = i - off;
209 
210 	if (nnops <= 1)
211 		return nnops;
212 
213 	local_irq_save(flags);
214 	add_nops(instr + off, nnops);
215 	local_irq_restore(flags);
216 
217 	DUMP_BYTES(instr, instrlen, "%px: [%d:%d) optimized NOPs: ", instr, off, i);
218 
219 	return nnops;
220 }
221 
222 /*
223  * "noinline" to cause control flow change and thus invalidate I$ and
224  * cause refetch after modification.
225  */
226 static void __init_or_module noinline optimize_nops(u8 *instr, size_t len)
227 {
228 	struct insn insn;
229 	int i = 0;
230 
231 	/*
232 	 * Jump over the non-NOP insns and optimize single-byte NOPs into bigger
233 	 * ones.
234 	 */
235 	for (;;) {
236 		if (insn_decode_kernel(&insn, &instr[i]))
237 			return;
238 
239 		/*
240 		 * See if this and any potentially following NOPs can be
241 		 * optimized.
242 		 */
243 		if (insn.length == 1 && insn.opcode.bytes[0] == 0x90)
244 			i += optimize_nops_range(instr, len, i);
245 		else
246 			i += insn.length;
247 
248 		if (i >= len)
249 			return;
250 	}
251 }
252 
/*
 * Replace instructions with better alternatives for this CPU type. This runs
 * before SMP is initialized to avoid SMP problems with self modifying code.
 * This implies that asymmetric systems where APs have less capabilities than
 * the boot processor are not handled. Tough. Make sure you disable such
 * features by hand.
 *
 * @start: first entry of the alternatives table to process
 * @end:   one past the last entry
 *
 * Marked "noinline" to cause control flow change and thus insn cache
 * to refetch changed I$ lines.
 */
void __init_or_module noinline apply_alternatives(struct alt_instr *start,
						  struct alt_instr *end)
{
	struct alt_instr *a;
	u8 *instr, *replacement;
	u8 insn_buff[MAX_PATCH_LEN];

	DPRINTK("alt table %px, -> %px", start, end);
	/*
	 * The scan order should be from start to end. A later scanned
	 * alternative code can overwrite previously scanned alternative code.
	 * Some kernel functions (e.g. memcpy, memset, etc) use this order to
	 * patch code.
	 *
	 * So be careful if you want to change the scan order to any other
	 * order.
	 */
	for (a = start; a < end; a++) {
		int insn_buff_sz = 0;
		/* Mask away "NOT" flag bit for feature to test. */
		u16 feature = a->cpuid & ~ALTINSTR_FLAG_INV;

		/* Both offsets are relative to the address of their own field. */
		instr = (u8 *)&a->instr_offset + a->instr_offset;
		replacement = (u8 *)&a->repl_offset + a->repl_offset;
		/*
		 * NOTE(review): only instrlen is checked against the buffer;
		 * the memcpy() below relies on replacementlen fitting as well,
		 * presumably guaranteed where the entries are generated --
		 * confirm.
		 */
		BUG_ON(a->instrlen > sizeof(insn_buff));
		BUG_ON(feature >= (NCAPINTS + NBUGINTS) * 32);

		/*
		 * Patch if either:
		 * - feature is present
		 * - feature not present but ALTINSTR_FLAG_INV is set to mean,
		 *   patch if feature is *NOT* present.
		 */
		if (!boot_cpu_has(feature) == !(a->cpuid & ALTINSTR_FLAG_INV))
			goto next;

		DPRINTK("feat: %s%d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d)",
			(a->cpuid & ALTINSTR_FLAG_INV) ? "!" : "",
			feature >> 5,
			feature & 0x1f,
			instr, instr, a->instrlen,
			replacement, a->replacementlen);

		DUMP_BYTES(instr, a->instrlen, "%px:   old_insn: ", instr);
		DUMP_BYTES(replacement, a->replacementlen, "%px:   rpl_insn: ", replacement);

		memcpy(insn_buff, replacement, a->replacementlen);
		insn_buff_sz = a->replacementlen;

		/*
		 * 0xe8 is a relative CALL; fix the offset.
		 *
		 * Instruction length is checked before the opcode to avoid
		 * accessing uninitialized bytes for zero-length replacements.
		 */
		if (a->replacementlen == 5 && *insn_buff == 0xe8) {
			*(s32 *)(insn_buff + 1) += replacement - instr;
			DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx",
				*(s32 *)(insn_buff + 1),
				(unsigned long)instr + *(s32 *)(insn_buff + 1) + 5);
		}

		/* Same fix-up for a replacement that starts with a near JMP. */
		if (a->replacementlen && is_jmp(replacement[0]))
			recompute_jump(a, instr, replacement, insn_buff);

		/* Pad a shorter replacement with 1-byte NOPs up to instrlen. */
		for (; insn_buff_sz < a->instrlen; insn_buff_sz++)
			insn_buff[insn_buff_sz] = 0x90;

		DUMP_BYTES(insn_buff, insn_buff_sz, "%px: final_insn: ", instr);

		text_poke_early(instr, insn_buff, insn_buff_sz);

next:
		/* Runs for skipped entries too, on whatever is at the site now. */
		optimize_nops(instr, a->instrlen);
	}
}
339 
340 #if defined(CONFIG_RETPOLINE) && defined(CONFIG_STACK_VALIDATION)
341 
342 /*
343  * CALL/JMP *%\reg
344  */
345 static int emit_indirect(int op, int reg, u8 *bytes)
346 {
347 	int i = 0;
348 	u8 modrm;
349 
350 	switch (op) {
351 	case CALL_INSN_OPCODE:
352 		modrm = 0x10; /* Reg = 2; CALL r/m */
353 		break;
354 
355 	case JMP32_INSN_OPCODE:
356 		modrm = 0x20; /* Reg = 4; JMP r/m */
357 		break;
358 
359 	default:
360 		WARN_ON_ONCE(1);
361 		return -1;
362 	}
363 
364 	if (reg >= 8) {
365 		bytes[i++] = 0x41; /* REX.B prefix */
366 		reg -= 8;
367 	}
368 
369 	modrm |= 0xc0; /* Mod = 3 */
370 	modrm += reg;
371 
372 	bytes[i++] = 0xff; /* opcode */
373 	bytes[i++] = modrm;
374 
375 	return i;
376 }
377 
/*
 * Rewrite the compiler generated retpoline thunk calls.
 *
 * For spectre_v2=off (!X86_FEATURE_RETPOLINE), rewrite them into immediate
 * indirect instructions, avoiding the extra indirection.
 *
 * For example, convert:
 *
 *   CALL __x86_indirect_thunk_\reg
 *
 * into:
 *
 *   CALL *%\reg
 *
 * It also tries to inline spectre_v2=retpoline,amd when size permits.
 *
 * @addr:  address of the instruction to rewrite
 * @insn:  decoded form of that instruction
 * @bytes: output buffer receiving the replacement encoding
 *
 * Return: number of bytes placed in @bytes (padded with 1-byte NOPs up to
 * insn->length when the new sequence is shorter), or -1 when the site is not
 * to be rewritten.
 */
static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes)
{
	retpoline_thunk_t *target;
	int reg, ret, i = 0;
	u8 op, cc;

	/* The thunk's index within the thunk array is the register number. */
	target = addr + insn->length + insn->immediate.value;
	reg = target - __x86_indirect_thunk_array;

	if (WARN_ON_ONCE(reg & ~0xf))
		return -1;

	/* If anyone ever does: CALL/JMP *%rsp, we're in deep trouble. */
	BUG_ON(reg == 4);

	/* Retpolines enabled and not the AMD flavour: leave the site alone. */
	if (cpu_feature_enabled(X86_FEATURE_RETPOLINE) &&
	    !cpu_feature_enabled(X86_FEATURE_RETPOLINE_AMD))
		return -1;

	op = insn->opcode.bytes[0];

	/*
	 * Convert:
	 *
	 *   Jcc.d32 __x86_indirect_thunk_\reg
	 *
	 * into:
	 *
	 *   Jncc.d8 1f
	 *   [ LFENCE ]
	 *   JMP *%\reg
	 *   [ NOP ]
	 * 1:
	 */
	/* Jcc.d32 second opcode byte is in the range: 0x80-0x8f */
	if (op == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80) {
		cc = insn->opcode.bytes[1] & 0xf;
		cc ^= 1; /* invert condition */

		bytes[i++] = 0x70 + cc;        /* Jcc.d8 */
		bytes[i++] = insn->length - 2; /* sizeof(Jcc.d8) == 2 */

		/* Continue as if: JMP.d32 __x86_indirect_thunk_\reg */
		op = JMP32_INSN_OPCODE;
	}

	/*
	 * For RETPOLINE_AMD: prepend the indirect CALL/JMP with an LFENCE.
	 */
	if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_AMD)) {
		bytes[i++] = 0x0f;
		bytes[i++] = 0xae;
		bytes[i++] = 0xe8; /* LFENCE */
	}

	ret = emit_indirect(op, reg, bytes + i);
	if (ret < 0)
		return ret;
	i += ret;

	/* Pad with 1-byte NOPs up to the length of the original instruction. */
	for (; i < insn->length;)
		bytes[i++] = BYTES_NOP1;

	return i;
}
459 
460 /*
461  * Generated by 'objtool --retpoline'.
462  */
463 void __init_or_module noinline apply_retpolines(s32 *start, s32 *end)
464 {
465 	s32 *s;
466 
467 	for (s = start; s < end; s++) {
468 		void *addr = (void *)s + *s;
469 		struct insn insn;
470 		int len, ret;
471 		u8 bytes[16];
472 		u8 op1, op2;
473 
474 		ret = insn_decode_kernel(&insn, addr);
475 		if (WARN_ON_ONCE(ret < 0))
476 			continue;
477 
478 		op1 = insn.opcode.bytes[0];
479 		op2 = insn.opcode.bytes[1];
480 
481 		switch (op1) {
482 		case CALL_INSN_OPCODE:
483 		case JMP32_INSN_OPCODE:
484 			break;
485 
486 		case 0x0f: /* escape */
487 			if (op2 >= 0x80 && op2 <= 0x8f)
488 				break;
489 			fallthrough;
490 		default:
491 			WARN_ON_ONCE(1);
492 			continue;
493 		}
494 
495 		DPRINTK("retpoline at: %pS (%px) len: %d to: %pS",
496 			addr, addr, insn.length,
497 			addr + insn.length + insn.immediate.value);
498 
499 		len = patch_retpoline(addr, &insn, bytes);
500 		if (len == insn.length) {
501 			optimize_nops(bytes, len);
502 			DUMP_BYTES(((u8*)addr),  len, "%px: orig: ", addr);
503 			DUMP_BYTES(((u8*)bytes), len, "%px: repl: ", addr);
504 			text_poke_early(addr, bytes, len);
505 		}
506 	}
507 }
508 
509 #else /* !RETPOLINES || !CONFIG_STACK_VALIDATION */
510 
/* Nothing to rewrite in this configuration: empty stub. */
void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { }
512 
513 #endif /* CONFIG_RETPOLINE && CONFIG_STACK_VALIDATION */
514 
515 #ifdef CONFIG_SMP
516 static void alternatives_smp_lock(const s32 *start, const s32 *end,
517 				  u8 *text, u8 *text_end)
518 {
519 	const s32 *poff;
520 
521 	for (poff = start; poff < end; poff++) {
522 		u8 *ptr = (u8 *)poff + *poff;
523 
524 		if (!*poff || ptr < text || ptr >= text_end)
525 			continue;
526 		/* turn DS segment override prefix into lock prefix */
527 		if (*ptr == 0x3e)
528 			text_poke(ptr, ((unsigned char []){0xf0}), 1);
529 	}
530 }
531 
532 static void alternatives_smp_unlock(const s32 *start, const s32 *end,
533 				    u8 *text, u8 *text_end)
534 {
535 	const s32 *poff;
536 
537 	for (poff = start; poff < end; poff++) {
538 		u8 *ptr = (u8 *)poff + *poff;
539 
540 		if (!*poff || ptr < text || ptr >= text_end)
541 			continue;
542 		/* turn lock prefix into DS segment override prefix */
543 		if (*ptr == 0xf0)
544 			text_poke(ptr, ((unsigned char []){0x3E}), 1);
545 	}
546 }
547 
/*
 * One text range whose lock prefixes were patched to the UP form, remembered
 * so that alternatives_enable_smp() can switch it back to the SMP form.
 */
struct smp_alt_module {
	/* owner (NULL for the "core kernel" entry) and a name for debug output */
	struct module	*mod;
	char		*name;

	/* ptrs to lock prefixes */
	const s32	*locks;
	const s32	*locks_end;

	/* .text segment, needed to avoid patching init code ;) */
	u8		*text;
	u8		*text_end;

	/* link in smp_alt_modules */
	struct list_head next;
};
/* All remembered ranges; walked and modified with text_mutex held. */
static LIST_HEAD(smp_alt_modules);
static bool uniproc_patched = false;	/* protected by text_mutex */
565 
566 void __init_or_module alternatives_smp_module_add(struct module *mod,
567 						  char *name,
568 						  void *locks, void *locks_end,
569 						  void *text,  void *text_end)
570 {
571 	struct smp_alt_module *smp;
572 
573 	mutex_lock(&text_mutex);
574 	if (!uniproc_patched)
575 		goto unlock;
576 
577 	if (num_possible_cpus() == 1)
578 		/* Don't bother remembering, we'll never have to undo it. */
579 		goto smp_unlock;
580 
581 	smp = kzalloc(sizeof(*smp), GFP_KERNEL);
582 	if (NULL == smp)
583 		/* we'll run the (safe but slow) SMP code then ... */
584 		goto unlock;
585 
586 	smp->mod	= mod;
587 	smp->name	= name;
588 	smp->locks	= locks;
589 	smp->locks_end	= locks_end;
590 	smp->text	= text;
591 	smp->text_end	= text_end;
592 	DPRINTK("locks %p -> %p, text %p -> %p, name %s\n",
593 		smp->locks, smp->locks_end,
594 		smp->text, smp->text_end, smp->name);
595 
596 	list_add_tail(&smp->next, &smp_alt_modules);
597 smp_unlock:
598 	alternatives_smp_unlock(locks, locks_end, text, text_end);
599 unlock:
600 	mutex_unlock(&text_mutex);
601 }
602 
603 void __init_or_module alternatives_smp_module_del(struct module *mod)
604 {
605 	struct smp_alt_module *item;
606 
607 	mutex_lock(&text_mutex);
608 	list_for_each_entry(item, &smp_alt_modules, next) {
609 		if (mod != item->mod)
610 			continue;
611 		list_del(&item->next);
612 		kfree(item);
613 		break;
614 	}
615 	mutex_unlock(&text_mutex);
616 }
617 
618 void alternatives_enable_smp(void)
619 {
620 	struct smp_alt_module *mod;
621 
622 	/* Why bother if there are no other CPUs? */
623 	BUG_ON(num_possible_cpus() == 1);
624 
625 	mutex_lock(&text_mutex);
626 
627 	if (uniproc_patched) {
628 		pr_info("switching to SMP code\n");
629 		BUG_ON(num_online_cpus() != 1);
630 		clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
631 		clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
632 		list_for_each_entry(mod, &smp_alt_modules, next)
633 			alternatives_smp_lock(mod->locks, mod->locks_end,
634 					      mod->text, mod->text_end);
635 		uniproc_patched = false;
636 	}
637 	mutex_unlock(&text_mutex);
638 }
639 
640 /*
641  * Return 1 if the address range is reserved for SMP-alternatives.
642  * Must hold text_mutex.
643  */
644 int alternatives_text_reserved(void *start, void *end)
645 {
646 	struct smp_alt_module *mod;
647 	const s32 *poff;
648 	u8 *text_start = start;
649 	u8 *text_end = end;
650 
651 	lockdep_assert_held(&text_mutex);
652 
653 	list_for_each_entry(mod, &smp_alt_modules, next) {
654 		if (mod->text > text_end || mod->text_end < text_start)
655 			continue;
656 		for (poff = mod->locks; poff < mod->locks_end; poff++) {
657 			const u8 *ptr = (const u8 *)poff + *poff;
658 
659 			if (text_start <= ptr && text_end > ptr)
660 				return 1;
661 		}
662 	}
663 
664 	return 0;
665 }
666 #endif /* CONFIG_SMP */
667 
668 #ifdef CONFIG_PARAVIRT
669 void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
670 				     struct paravirt_patch_site *end)
671 {
672 	struct paravirt_patch_site *p;
673 	char insn_buff[MAX_PATCH_LEN];
674 
675 	for (p = start; p < end; p++) {
676 		unsigned int used;
677 
678 		BUG_ON(p->len > MAX_PATCH_LEN);
679 		/* prep the buffer with the original instructions */
680 		memcpy(insn_buff, p->instr, p->len);
681 		used = paravirt_patch(p->type, insn_buff, (unsigned long)p->instr, p->len);
682 
683 		BUG_ON(used > p->len);
684 
685 		/* Pad the rest with nops */
686 		add_nops(insn_buff + used, p->len - used);
687 		text_poke_early(p->instr, insn_buff, p->len);
688 	}
689 }
/*
 * NOTE(review): nothing visible in this file uses these two names;
 * alternative_instructions() passes __parainstructions and
 * __parainstructions_end instead. Possibly stale -- confirm before removing.
 */
extern struct paravirt_patch_site __start_parainstructions[],
	__stop_parainstructions[];
692 #endif	/* CONFIG_PARAVIRT */
693 
694 /*
695  * Self-test for the INT3 based CALL emulation code.
696  *
697  * This exercises int3_emulate_call() to make sure INT3 pt_regs are set up
698  * properly and that there is a stack gap between the INT3 frame and the
699  * previous context. Without this gap doing a virtual PUSH on the interrupted
700  * stack would corrupt the INT3 IRET frame.
701  *
702  * See entry_{32,64}.S for more details.
703  */
704 
705 /*
706  * We define the int3_magic() function in assembly to control the calling
707  * convention such that we can 'call' it from assembly.
708  */
709 
710 extern void int3_magic(unsigned int *ptr); /* defined in asm */
711 
712 asm (
713 "	.pushsection	.init.text, \"ax\", @progbits\n"
714 "	.type		int3_magic, @function\n"
715 "int3_magic:\n"
716 "	movl	$1, (%" _ASM_ARG1 ")\n"
717 	ASM_RET
718 "	.size		int3_magic, .-int3_magic\n"
719 "	.popsection\n"
720 );
721 
722 extern __initdata unsigned long int3_selftest_ip; /* defined in asm below */
723 
724 static int __init
725 int3_exception_notify(struct notifier_block *self, unsigned long val, void *data)
726 {
727 	struct die_args *args = data;
728 	struct pt_regs *regs = args->regs;
729 
730 	if (!regs || user_mode(regs))
731 		return NOTIFY_DONE;
732 
733 	if (val != DIE_INT3)
734 		return NOTIFY_DONE;
735 
736 	if (regs->ip - INT3_INSN_SIZE != int3_selftest_ip)
737 		return NOTIFY_DONE;
738 
739 	int3_emulate_call(regs, (unsigned long)&int3_magic);
740 	return NOTIFY_STOP;
741 }
742 
/*
 * Run the INT3 CALL emulation selftest: register int3_exception_notify(),
 * execute an INT3 with the address of a local variable in the first argument
 * register, and BUG() unless the emulated call to int3_magic() has set that
 * variable to 1.
 */
static void __init int3_selftest(void)
{
	static __initdata struct notifier_block int3_exception_nb = {
		.notifier_call	= int3_exception_notify,
		.priority	= INT_MAX-1, /* last */
	};
	unsigned int val = 0;

	BUG_ON(register_die_notifier(&int3_exception_nb));

	/*
	 * Basically: int3_magic(&val); but really complicated :-)
	 *
	 * Stick the address of the INT3 instruction into int3_selftest_ip,
	 * then trigger the INT3, padded with NOPs to match a CALL instruction
	 * length.
	 */
	asm volatile ("1: int3; nop; nop; nop; nop\n\t"
		      ".pushsection .init.data,\"aw\"\n\t"
		      ".align " __ASM_SEL(4, 8) "\n\t"
		      ".type int3_selftest_ip, @object\n\t"
		      ".size int3_selftest_ip, " __ASM_SEL(4, 8) "\n\t"
		      "int3_selftest_ip:\n\t"
		      __ASM_SEL(.long, .quad) " 1b\n\t"
		      ".popsection\n\t"
		      : ASM_CALL_CONSTRAINT
		      : __ASM_SEL_RAW(a, D) (&val)
		      : "memory");

	/* int3_magic() stores 1 through its argument. */
	BUG_ON(val != 1);

	unregister_die_notifier(&int3_exception_nb);
}
776 
/*
 * Boot-time entry point for all code patching in this file. After the INT3
 * selftest, and with NMIs stopped, it applies paravirt patching, retpoline
 * rewriting and alternatives in that order, optionally patches the core
 * kernel to its UP form, and finally sets alternatives_patched.
 */
void __init alternative_instructions(void)
{
	int3_selftest();

	/*
	 * The patching is not fully atomic, so try to avoid local
	 * interruptions that might execute the to be patched code.
	 * Other CPUs are not running.
	 */
	stop_nmi();

	/*
	 * Don't stop machine check exceptions while patching.
	 * MCEs only happen when something got corrupted and in this
	 * case we must do something about the corruption.
	 * Ignoring it is worse than an unlikely patching race.
	 * Also machine checks tend to be broadcast and if one CPU
	 * goes into machine check the others follow quickly, so we don't
	 * expect a machine check to cause undue problems during code
	 * patching.
	 */

	/*
	 * Paravirt patching and alternative patching can be combined to
	 * replace a function call with a short direct code sequence (e.g.
	 * by setting a constant return value instead of doing that in an
	 * external function).
	 * In order to make this work the following sequence is required:
	 * 1. set (artificial) features depending on used paravirt
	 *    functions which can later influence alternative patching
	 * 2. apply paravirt patching (generally replacing an indirect
	 *    function call with a direct one)
	 * 3. apply alternative patching (e.g. replacing a direct function
	 *    call with a custom code sequence)
	 * Doing paravirt patching after alternative patching would clobber
	 * the optimization of the custom code with a function call again.
	 */
	paravirt_set_cap();

	/*
	 * First patch paravirt functions, such that we overwrite the indirect
	 * call with the direct call.
	 */
	apply_paravirt(__parainstructions, __parainstructions_end);

	/*
	 * Rewrite the retpolines, must be done before alternatives since
	 * those can rewrite the retpoline thunks.
	 */
	apply_retpolines(__retpoline_sites, __retpoline_sites_end);

	/*
	 * Then patch alternatives, such that those paravirt calls that are in
	 * alternatives can be overwritten by their immediate fragments.
	 */
	apply_alternatives(__alt_instructions, __alt_instructions_end);

#ifdef CONFIG_SMP
	/* Patch to UP if other cpus not imminent. */
	if (!noreplace_smp && (num_present_cpus() == 1 || setup_max_cpus <= 1)) {
		uniproc_patched = true;
		alternatives_smp_module_add(NULL, "core kernel",
					    __smp_locks, __smp_locks_end,
					    _text, _etext);
	}

	/*
	 * The lock prefix table is only needed later when the kernel was
	 * UP-patched and may have to be switched back; otherwise free it.
	 */
	if (!uniproc_patched || num_possible_cpus() == 1) {
		free_init_pages("SMP alternatives",
				(unsigned long)__smp_locks,
				(unsigned long)__smp_locks_end);
	}
#endif

	restart_nmi();
	alternatives_patched = 1;
}
853 
854 /**
855  * text_poke_early - Update instructions on a live kernel at boot time
856  * @addr: address to modify
857  * @opcode: source of the copy
858  * @len: length to copy
859  *
860  * When you use this code to patch more than one byte of an instruction
861  * you need to make sure that other CPUs cannot execute this code in parallel.
862  * Also no thread must be currently preempted in the middle of these
863  * instructions. And on the local CPU you need to be protected against NMI or
864  * MCE handlers seeing an inconsistent instruction while you patch.
865  */
866 void __init_or_module text_poke_early(void *addr, const void *opcode,
867 				      size_t len)
868 {
869 	unsigned long flags;
870 
871 	if (boot_cpu_has(X86_FEATURE_NX) &&
872 	    is_module_text_address((unsigned long)addr)) {
873 		/*
874 		 * Modules text is marked initially as non-executable, so the
875 		 * code cannot be running and speculative code-fetches are
876 		 * prevented. Just change the code.
877 		 */
878 		memcpy(addr, opcode, len);
879 	} else {
880 		local_irq_save(flags);
881 		memcpy(addr, opcode, len);
882 		local_irq_restore(flags);
883 		sync_core();
884 
885 		/*
886 		 * Could also do a CLFLUSH here to speed up CPU recovery; but
887 		 * that causes hangs on some VIA CPUs.
888 		 */
889 	}
890 }
891 
/*
 * Token returned by use_temporary_mm() and consumed by unuse_temporary_mm():
 * holds the mm that was loaded before the temporary one was switched in.
 */
typedef struct {
	struct mm_struct *mm;
} temp_mm_state_t;
895 
/*
 * Using a temporary mm allows to set temporary mappings that are not accessible
 * by other CPUs. Such mappings are needed to perform sensitive memory writes
 * that override the kernel memory protections (e.g., W^X), without exposing the
 * temporary page-table mappings that are required for these write operations to
 * other CPUs. Using a temporary mm also allows to avoid TLB shootdowns when the
 * mapping is torn down.
 *
 * Context: The temporary mm needs to be used exclusively by a single core. To
 *          harden security IRQs must be disabled while the temporary mm is
 *          loaded, thereby preventing interrupt handler bugs from overriding
 *          the kernel memory protection.
 *
 * Return: the previously loaded mm, to be handed to unuse_temporary_mm().
 */
static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm)
{
	temp_mm_state_t temp_state;

	lockdep_assert_irqs_disabled();

	/*
	 * Make sure not to be in TLB lazy mode, as otherwise we'll end up
	 * with a stale address space WITHOUT being in lazy mode after
	 * restoring the previous mm.
	 */
	if (this_cpu_read(cpu_tlbstate_shared.is_lazy))
		leave_mm(smp_processor_id());

	/* Remember what was loaded, then switch to the temporary mm. */
	temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm);
	switch_mm_irqs_off(NULL, mm, current);

	/*
	 * If breakpoints are enabled, disable them while the temporary mm is
	 * used. Userspace might set up watchpoints on addresses that are used
	 * in the temporary mm, which would lead to wrong signals being sent or
	 * crashes.
	 *
	 * Note that breakpoints are not disabled selectively, which also causes
	 * kernel breakpoints (e.g., perf's) to be disabled. This might be
	 * undesirable, but still seems reasonable as the code that runs in the
	 * temporary mm should be short.
	 */
	if (hw_breakpoint_active())
		hw_breakpoint_disable();

	return temp_state;
}
942 
/*
 * Counterpart of use_temporary_mm(): switch back to the mm recorded in
 * @prev_state. Must be called with IRQs disabled.
 */
static inline void unuse_temporary_mm(temp_mm_state_t prev_state)
{
	lockdep_assert_irqs_disabled();
	switch_mm_irqs_off(NULL, prev_state.mm, current);

	/*
	 * Restore the breakpoints if they were disabled before the temporary mm
	 * was loaded.
	 */
	if (hw_breakpoint_active())
		hw_breakpoint_restore();
}
955 
/*
 * The temporary mm, and the address within it, at which __text_poke() maps
 * the page(s) being patched.
 */
__ro_after_init struct mm_struct *poking_mm;
__ro_after_init unsigned long poking_addr;
958 
/*
 * Common implementation behind text_poke() and text_poke_kgdb(): write @len
 * bytes from @opcode to @addr through a temporary writable mapping of the
 * target page(s) at poking_addr in poking_mm. Handles a write that spills
 * over into the following page.
 *
 * Any failure along the way ends in a BUG(); there is no error return.
 *
 * Return: @addr.
 */
static void *__text_poke(void *addr, const void *opcode, size_t len)
{
	bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE;
	struct page *pages[2] = {NULL};
	temp_mm_state_t prev;
	unsigned long flags;
	pte_t pte, *ptep;
	spinlock_t *ptl;
	pgprot_t pgprot;

	/*
	 * While boot memory allocator is running we cannot use struct pages as
	 * they are not yet initialized. There is no way to recover.
	 */
	BUG_ON(!after_bootmem);

	/* Look up the struct page(s) backing the text to be patched. */
	if (!core_kernel_text((unsigned long)addr)) {
		pages[0] = vmalloc_to_page(addr);
		if (cross_page_boundary)
			pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
	} else {
		pages[0] = virt_to_page(addr);
		WARN_ON(!PageReserved(pages[0]));
		if (cross_page_boundary)
			pages[1] = virt_to_page(addr + PAGE_SIZE);
	}
	/*
	 * If something went wrong, crash and burn since recovery paths are not
	 * implemented.
	 */
	BUG_ON(!pages[0] || (cross_page_boundary && !pages[1]));

	/*
	 * Map the page without the global bit, as TLB flushing is done with
	 * flush_tlb_mm_range(), which is intended for non-global PTEs.
	 */
	pgprot = __pgprot(pgprot_val(PAGE_KERNEL) & ~_PAGE_GLOBAL);

	/*
	 * The lock is not really needed, but this allows to avoid open-coding.
	 */
	ptep = get_locked_pte(poking_mm, poking_addr, &ptl);

	/*
	 * This must not fail; preallocated in poking_init().
	 */
	VM_BUG_ON(!ptep);

	local_irq_save(flags);

	pte = mk_pte(pages[0], pgprot);
	set_pte_at(poking_mm, poking_addr, ptep, pte);

	if (cross_page_boundary) {
		pte = mk_pte(pages[1], pgprot);
		set_pte_at(poking_mm, poking_addr + PAGE_SIZE, ptep + 1, pte);
	}

	/*
	 * Loading the temporary mm behaves as a compiler barrier, which
	 * guarantees that the PTE will be set at the time memcpy() is done.
	 */
	prev = use_temporary_mm(poking_mm);

	/* The actual write, through the alias at the same in-page offset. */
	kasan_disable_current();
	memcpy((u8 *)poking_addr + offset_in_page(addr), opcode, len);
	kasan_enable_current();

	/*
	 * Ensure that the PTE is only cleared after the instructions of memcpy
	 * were issued by using a compiler barrier.
	 */
	barrier();

	pte_clear(poking_mm, poking_addr, ptep);
	if (cross_page_boundary)
		pte_clear(poking_mm, poking_addr + PAGE_SIZE, ptep + 1);

	/*
	 * Loading the previous page-table hierarchy requires a serializing
	 * instruction that already allows the core to see the updated version.
	 * Xen-PV is assumed to serialize execution in a similar manner.
	 */
	unuse_temporary_mm(prev);

	/*
	 * Flushing the TLB might involve IPIs, which would require enabled
	 * IRQs, but not if the mm is not used, as it is in this point.
	 */
	flush_tlb_mm_range(poking_mm, poking_addr, poking_addr +
			   (cross_page_boundary ? 2 : 1) * PAGE_SIZE,
			   PAGE_SHIFT, false);

	/*
	 * If the text does not match what we just wrote then something is
	 * fundamentally screwy; there's nothing we can really do about that.
	 */
	BUG_ON(memcmp(addr, opcode, len));

	local_irq_restore(flags);
	pte_unmap_unlock(ptep, ptl);
	return addr;
}
1062 
/**
 * text_poke - Update instructions on a live kernel
 * @addr: address to modify
 * @opcode: source of the copy
 * @len: length to copy
 *
 * Only atomic text poke/set should be allowed when not doing early patching.
 * It means the size must be writable atomically and the address must be aligned
 * in a way that permits an atomic write. It also makes sure we fit on a single
 * page.
 *
 * Note that the caller must ensure that if the modified code is part of a
 * module, the module would not be removed during poking. This can be achieved
 * by registering a module notifier, and ordering module removal and patching
 * through a mutex.
 *
 * Context: the caller must hold text_mutex.
 *
 * Return: @addr.
 */
void *text_poke(void *addr, const void *opcode, size_t len)
{
	lockdep_assert_held(&text_mutex);

	return __text_poke(addr, opcode, len);
}
1085 
/**
 * text_poke_kgdb - Update instructions on a live kernel by kgdb
 * @addr: address to modify
 * @opcode: source of the copy
 * @len: length to copy
 *
 * Only atomic text poke/set should be allowed when not doing early patching.
 * It means the size must be writable atomically and the address must be aligned
 * in a way that permits an atomic write. It also makes sure we fit on a single
 * page.
 *
 * Unlike text_poke(), this variant performs no lockdep assertion on
 * text_mutex before delegating to __text_poke().
 *
 * Context: should only be used by kgdb, which ensures no other core is running,
 *	    despite the fact it does not hold the text_mutex.
 *
 * Return: the value returned by __text_poke() for the same arguments.
 */
void *text_poke_kgdb(void *addr, const void *opcode, size_t len)
{
	return __text_poke(addr, opcode, len);
}
1104 
/*
 * Callback handed to on_each_cpu() by text_poke_sync(): runs sync_core() on
 * the CPU it is invoked on. @info is unused (the caller passes NULL).
 */
static void do_sync_core(void *info)
{
	sync_core();
}
1109 
/*
 * Run do_sync_core(), i.e. sync_core(), through on_each_cpu().
 *
 * NOTE(review): the final argument (1) is on_each_cpu()'s wait flag;
 * presumably this makes the call return only once every CPU has run the
 * callback -- confirm against on_each_cpu().
 */
void text_poke_sync(void)
{
	on_each_cpu(do_sync_core, NULL, 1);
}
1114 
/*
 * One patch site: filled in by text_poke_loc_init(), written to kernel text
 * by text_poke_bp_batch() and looked up by address in poke_int3_handler().
 */
struct text_poke_loc {
	/* addr := _stext + rel_addr */
	s32 rel_addr;
	/* added to the address just past the patch site when emulating CALL/JMP */
	s32 disp;
	/* number of bytes patched at addr */
	u8 len;
	/* selects how poke_int3_handler() emulates the instruction */
	u8 opcode;
	/* replacement bytes; only the first @len are meaningful */
	const u8 text[POKE_MAX_OPCODE_SIZE];
	/* see text_poke_bp_batch() */
	u8 old;
};
1125 
/*
 * Describes the batch that text_poke_bp_batch() is currently patching. It is
 * published through bp_desc so that poke_int3_handler() can find it.
 */
struct bp_patching_desc {
	/* entries being patched; binary-searched by address when nr_entries > 1 */
	struct text_poke_loc *vec;
	/* number of entries in @vec */
	int nr_entries;
	/*
	 * Initialized to 1 by text_poke_bp_batch(); try_get_desc() and
	 * put_desc() take and drop additional references.
	 */
	atomic_t refs;
};
1131 
/*
 * Descriptor of the batch in flight. text_poke_bp_batch() points this at its
 * descriptor before writing any INT3 and resets it to NULL once patching is
 * done; poke_int3_handler() reads it through try_get_desc().
 */
static struct bp_patching_desc *bp_desc;
1133 
1134 static __always_inline
1135 struct bp_patching_desc *try_get_desc(struct bp_patching_desc **descp)
1136 {
1137 	/* rcu_dereference */
1138 	struct bp_patching_desc *desc = __READ_ONCE(*descp);
1139 
1140 	if (!desc || !arch_atomic_inc_not_zero(&desc->refs))
1141 		return NULL;
1142 
1143 	return desc;
1144 }
1145 
/*
 * Drop a reference taken with try_get_desc().
 *
 * text_poke_bp_batch() waits for @desc->refs to reach zero before returning,
 * and the descriptor is a local variable of that function, so @desc must not
 * be touched after this call.
 */
static __always_inline void put_desc(struct bp_patching_desc *desc)
{
	/* issue the barrier first, then the decrement; keep this order */
	smp_mb__before_atomic();
	arch_atomic_dec(&desc->refs);
}
1151 
/*
 * Turn the _stext-relative offset stored in @tp->rel_addr back into the
 * address of the patch site.
 */
static __always_inline void *text_poke_addr(struct text_poke_loc *tp)
{
	return _stext + tp->rel_addr;
}
1156 
1157 static __always_inline int patch_cmp(const void *key, const void *elt)
1158 {
1159 	struct text_poke_loc *tp = (struct text_poke_loc *) elt;
1160 
1161 	if (key < text_poke_addr(tp))
1162 		return -1;
1163 	if (key > text_poke_addr(tp))
1164 		return 1;
1165 	return 0;
1166 }
1167 
/*
 * poke_int3_handler() - handle an INT3 that may belong to a text poke
 * @regs: register state at the breakpoint
 *
 * Looks the breakpoint address up in the descriptor currently published
 * through bp_desc. If it matches a patch site, the instruction recorded in
 * the entry's opcode is emulated on @regs (RET, CALL or JMP).
 *
 * Returns 1 when the breakpoint was consumed, i.e. an instruction was
 * emulated. Returns 0 when the trap came from user mode, no descriptor could
 * be obtained, the address is not in the vector, or the entry's opcode is
 * INT3 itself.
 */
noinstr int poke_int3_handler(struct pt_regs *regs)
{
	struct bp_patching_desc *desc;
	struct text_poke_loc *tp;
	int ret = 0;
	void *ip;

	if (user_mode(regs))
		return 0;

	/*
	 * Having observed our INT3 instruction, we now must observe
	 * bp_desc:
	 *
	 *	bp_desc = desc			INT3
	 *	WMB				RMB
	 *	write INT3			if (desc)
	 */
	smp_rmb();

	desc = try_get_desc(&bp_desc);
	if (!desc)
		return 0;

	/*
	 * Discount the INT3. See text_poke_bp_batch().
	 */
	ip = (void *) regs->ip - INT3_INSN_SIZE;

	/*
	 * Skip the binary search if there is a single member in the vector.
	 */
	if (unlikely(desc->nr_entries > 1)) {
		tp = __inline_bsearch(ip, desc->vec, desc->nr_entries,
				      sizeof(struct text_poke_loc),
				      patch_cmp);
		if (!tp)
			goto out_put;
	} else {
		tp = desc->vec;
		if (text_poke_addr(tp) != ip)
			goto out_put;
	}

	/*
	 * ip now points just past the patched range; the CALL/JMP targets
	 * below are computed relative to it.
	 */
	ip += tp->len;

	switch (tp->opcode) {
	case INT3_INSN_OPCODE:
		/*
		 * Someone poked an explicit INT3, they'll want to handle it,
		 * do not consume.
		 */
		goto out_put;

	case RET_INSN_OPCODE:
		int3_emulate_ret(regs);
		break;

	case CALL_INSN_OPCODE:
		int3_emulate_call(regs, (long)ip + tp->disp);
		break;

	case JMP32_INSN_OPCODE:
	case JMP8_INSN_OPCODE:
		int3_emulate_jmp(regs, (long)ip + tp->disp);
		break;

	default:
		/* text_poke_loc_init() only stores the opcodes handled above */
		BUG();
	}

	ret = 1;

out_put:
	/* every path that obtained a reference drops it here */
	put_desc(desc);
	return ret;
}
1245 
/*
 * Queue filled by text_poke_queue() and drained by text_poke_flush().
 * TP_VEC_MAX is the number of text_poke_loc entries that fit in PAGE_SIZE
 * bytes; tp_vec_nr is the number of entries currently queued in tp_vec.
 */
#define TP_VEC_MAX (PAGE_SIZE / sizeof(struct text_poke_loc))
static struct text_poke_loc tp_vec[TP_VEC_MAX];
static int tp_vec_nr;
1249 
/**
 * text_poke_bp_batch() -- update instructions on live kernel on SMP
 * @tp:			vector of instructions to patch
 * @nr_entries:		number of entries in the vector
 *
 * Modify multi-byte instruction by using int3 breakpoint on SMP.
 * We completely avoid stop_machine() here, and achieve the
 * synchronization using int3 breakpoint.
 *
 * The way it is done:
 *	- For each entry in the vector:
 *		- add an int3 trap to the address that will be patched
 *	- sync cores
 *	- For each entry in the vector:
 *		- update all but the first byte of the patched range
 *	- sync cores
 *	- For each entry in the vector:
 *		- replace the first byte (int3) by the first byte of
 *		  replacing opcode
 *	- sync cores
 *
 * The descriptor handed to poke_int3_handler() lives on this function's
 * stack, so the function does not return until its reference count has
 * dropped to zero.
 *
 * Context: the caller must hold text_mutex (asserted via lockdep).
 */
static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries)
{
	struct bp_patching_desc desc = {
		.vec = tp,
		.nr_entries = nr_entries,
		.refs = ATOMIC_INIT(1),
	};
	unsigned char int3 = INT3_INSN_OPCODE;
	unsigned int i;
	int do_sync;

	lockdep_assert_held(&text_mutex);

	smp_store_release(&bp_desc, &desc); /* rcu_assign_pointer */

	/*
	 * Corresponding read barrier in int3 notifier for making sure the
	 * nr_entries and handler are correctly ordered wrt. patching.
	 */
	smp_wmb();

	/*
	 * First step: add an int3 trap to the address that will be patched.
	 */
	for (i = 0; i < nr_entries; i++) {
		/* remember the original first byte before overwriting it */
		tp[i].old = *(u8 *)text_poke_addr(&tp[i]);
		text_poke(text_poke_addr(&tp[i]), &int3, INT3_INSN_SIZE);
	}

	text_poke_sync();

	/*
	 * Second step: update all but the first byte of the patched range.
	 */
	for (do_sync = 0, i = 0; i < nr_entries; i++) {
		/* old[0] is the saved first byte; the tail is read below */
		u8 old[POKE_MAX_OPCODE_SIZE] = { tp[i].old, };
		int len = tp[i].len;

		if (len - INT3_INSN_SIZE > 0) {
			memcpy(old + INT3_INSN_SIZE,
			       text_poke_addr(&tp[i]) + INT3_INSN_SIZE,
			       len - INT3_INSN_SIZE);
			text_poke(text_poke_addr(&tp[i]) + INT3_INSN_SIZE,
				  (const char *)tp[i].text + INT3_INSN_SIZE,
				  len - INT3_INSN_SIZE);
			do_sync++;
		}

		/*
		 * Emit a perf event to record the text poke, primarily to
		 * support Intel PT decoding which must walk the executable code
		 * to reconstruct the trace. The flow up to here is:
		 *   - write INT3 byte
		 *   - IPI-SYNC
		 *   - write instruction tail
		 * At this point the actual control flow will be through the
		 * INT3 and handler and not hit the old or new instruction.
		 * Intel PT outputs FUP/TIP packets for the INT3, so the flow
		 * can still be decoded. Subsequently:
		 *   - emit RECORD_TEXT_POKE with the new instruction
		 *   - IPI-SYNC
		 *   - write first byte
		 *   - IPI-SYNC
		 * So before the text poke event timestamp, the decoder will see
		 * either the old instruction flow or FUP/TIP of INT3. After the
		 * text poke event timestamp, the decoder will see either the
		 * new instruction flow or FUP/TIP of INT3. Thus decoders can
		 * use the timestamp as the point at which to modify the
		 * executable code.
		 * The old instruction is recorded so that the event can be
		 * processed forwards or backwards.
		 */
		perf_event_text_poke(text_poke_addr(&tp[i]), old, len,
				     tp[i].text, len);
	}

	/* only sync if at least one entry had a tail to write */
	if (do_sync) {
		/*
		 * According to Intel, this core syncing is very likely
		 * not necessary and we'd be safe even without it. But
		 * better safe than sorry (plus there's not only Intel).
		 */
		text_poke_sync();
	}

	/*
	 * Third step: replace the first byte (int3) by the first byte of
	 * replacing opcode.
	 */
	for (do_sync = 0, i = 0; i < nr_entries; i++) {
		/* the new first byte is INT3 as well: nothing to rewrite */
		if (tp[i].text[0] == INT3_INSN_OPCODE)
			continue;

		text_poke(text_poke_addr(&tp[i]), tp[i].text, INT3_INSN_SIZE);
		do_sync++;
	}

	if (do_sync)
		text_poke_sync();

	/*
	 * Remove and synchronize_rcu(), except we have a very primitive
	 * refcount based completion.
	 */
	WRITE_ONCE(bp_desc, NULL); /* RCU_INIT_POINTER */
	/* drop our initial reference, then wait for any remaining holders */
	if (!atomic_dec_and_test(&desc.refs))
		atomic_cond_read_acquire(&desc.refs, !VAL);
}
1379 
1380 static void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
1381 			       const void *opcode, size_t len, const void *emulate)
1382 {
1383 	struct insn insn;
1384 	int ret, i;
1385 
1386 	memcpy((void *)tp->text, opcode, len);
1387 	if (!emulate)
1388 		emulate = opcode;
1389 
1390 	ret = insn_decode_kernel(&insn, emulate);
1391 	BUG_ON(ret < 0);
1392 
1393 	tp->rel_addr = addr - (void *)_stext;
1394 	tp->len = len;
1395 	tp->opcode = insn.opcode.bytes[0];
1396 
1397 	switch (tp->opcode) {
1398 	case RET_INSN_OPCODE:
1399 	case JMP32_INSN_OPCODE:
1400 	case JMP8_INSN_OPCODE:
1401 		/*
1402 		 * Control flow instructions without implied execution of the
1403 		 * next instruction can be padded with INT3.
1404 		 */
1405 		for (i = insn.length; i < len; i++)
1406 			BUG_ON(tp->text[i] != INT3_INSN_OPCODE);
1407 		break;
1408 
1409 	default:
1410 		BUG_ON(len != insn.length);
1411 	};
1412 
1413 
1414 	switch (tp->opcode) {
1415 	case INT3_INSN_OPCODE:
1416 	case RET_INSN_OPCODE:
1417 		break;
1418 
1419 	case CALL_INSN_OPCODE:
1420 	case JMP32_INSN_OPCODE:
1421 	case JMP8_INSN_OPCODE:
1422 		tp->disp = insn.immediate.value;
1423 		break;
1424 
1425 	default: /* assume NOP */
1426 		switch (len) {
1427 		case 2: /* NOP2 -- emulate as JMP8+0 */
1428 			BUG_ON(memcmp(emulate, x86_nops[len], len));
1429 			tp->opcode = JMP8_INSN_OPCODE;
1430 			tp->disp = 0;
1431 			break;
1432 
1433 		case 5: /* NOP5 -- emulate as JMP32+0 */
1434 			BUG_ON(memcmp(emulate, x86_nops[len], len));
1435 			tp->opcode = JMP32_INSN_OPCODE;
1436 			tp->disp = 0;
1437 			break;
1438 
1439 		default: /* unknown instruction */
1440 			BUG();
1441 		}
1442 		break;
1443 	}
1444 }
1445 
1446 /*
1447  * We hard rely on the tp_vec being ordered; ensure this is so by flushing
1448  * early if needed.
1449  */
1450 static bool tp_order_fail(void *addr)
1451 {
1452 	struct text_poke_loc *tp;
1453 
1454 	if (!tp_vec_nr)
1455 		return false;
1456 
1457 	if (!addr) /* force */
1458 		return true;
1459 
1460 	tp = &tp_vec[tp_vec_nr - 1];
1461 	if ((unsigned long)text_poke_addr(tp) > (unsigned long)addr)
1462 		return true;
1463 
1464 	return false;
1465 }
1466 
1467 static void text_poke_flush(void *addr)
1468 {
1469 	if (tp_vec_nr == TP_VEC_MAX || tp_order_fail(addr)) {
1470 		text_poke_bp_batch(tp_vec, tp_vec_nr);
1471 		tp_vec_nr = 0;
1472 	}
1473 }
1474 
/*
 * Flush whatever text_poke_queue() has queued so far. Passing NULL makes
 * tp_order_fail() report a failure whenever the queue is non-empty, so
 * text_poke_flush() patches all pending entries; with an empty queue this
 * does nothing.
 */
void text_poke_finish(void)
{
	text_poke_flush(NULL);
}
1479 
1480 void __ref text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate)
1481 {
1482 	struct text_poke_loc *tp;
1483 
1484 	if (unlikely(system_state == SYSTEM_BOOTING)) {
1485 		text_poke_early(addr, opcode, len);
1486 		return;
1487 	}
1488 
1489 	text_poke_flush(addr);
1490 
1491 	tp = &tp_vec[tp_vec_nr++];
1492 	text_poke_loc_init(tp, addr, opcode, len, emulate);
1493 }
1494 
1495 /**
1496  * text_poke_bp() -- update instructions on live kernel on SMP
1497  * @addr:	address to patch
1498  * @opcode:	opcode of new instruction
1499  * @len:	length to copy
1500  * @emulate:	instruction to be emulated
1501  *
1502  * Update a single instruction with the vector in the stack, avoiding
1503  * dynamically allocated memory. This function should be used when it is
1504  * not possible to allocate memory.
1505  */
1506 void __ref text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate)
1507 {
1508 	struct text_poke_loc tp;
1509 
1510 	if (unlikely(system_state == SYSTEM_BOOTING)) {
1511 		text_poke_early(addr, opcode, len);
1512 		return;
1513 	}
1514 
1515 	text_poke_loc_init(&tp, addr, opcode, len, emulate);
1516 	text_poke_bp_batch(&tp, 1);
1517 }
1518