xref: /openbmc/linux/arch/x86/kernel/alternative.c (revision 2f0f2441b4a10948e2ec042b48fef13680387f7c)
1 // SPDX-License-Identifier: GPL-2.0-only
2 #define pr_fmt(fmt) "SMP alternatives: " fmt
3 
4 #include <linux/module.h>
5 #include <linux/sched.h>
6 #include <linux/mutex.h>
7 #include <linux/list.h>
8 #include <linux/stringify.h>
9 #include <linux/mm.h>
10 #include <linux/vmalloc.h>
11 #include <linux/memory.h>
12 #include <linux/stop_machine.h>
13 #include <linux/slab.h>
14 #include <linux/kdebug.h>
15 #include <linux/kprobes.h>
16 #include <linux/mmu_context.h>
17 #include <asm/text-patching.h>
18 #include <asm/alternative.h>
19 #include <asm/sections.h>
20 #include <asm/pgtable.h>
21 #include <asm/mce.h>
22 #include <asm/nmi.h>
23 #include <asm/cacheflush.h>
24 #include <asm/tlbflush.h>
25 #include <asm/io.h>
26 #include <asm/fixmap.h>
27 
/*
 * Set to 1 at the very end of alternative_instructions(), i.e. once all of
 * the boot-time patching passes in that function have run.
 */
int __read_mostly alternatives_patched;

EXPORT_SYMBOL_GPL(alternatives_patched);

/* Size, in bytes, of the on-stack buffers in which a patched site is built. */
#define MAX_PATCH_LEN (255-1)
33 
/* Non-zero enables the DPRINTK()/DUMP_BYTES() debug output of this file. */
static int __initdata_or_module debug_alternative;

/*
 * Handler registered for the "debug-alternative" boot parameter: turns the
 * debug output on.  @str is not looked at.
 */
static int __init debug_alt(char *str)
{
	debug_alternative = 1;
	return 1;
}
__setup("debug-alternative", debug_alt);
42 
/*
 * When non-zero, alternative_instructions() does not patch the kernel to
 * its UP variant even if only one CPU is expected.
 */
static int noreplace_smp;

/*
 * Handler registered for the "noreplace-smp" boot parameter: sets
 * noreplace_smp.  @str is not looked at.
 */
static int __init setup_noreplace_smp(char *str)
{
	noreplace_smp = 1;
	return 1;
}
__setup("noreplace-smp", setup_noreplace_smp);
51 
/*
 * Debug print, active only when debug_alternative is set.  The message is
 * prefixed with the calling function's name and a newline is appended, so
 * callers should not terminate @fmt with "\n" themselves.
 */
#define DPRINTK(fmt, args...)						\
do {									\
	if (debug_alternative)						\
		printk(KERN_DEBUG "%s: " fmt "\n", __func__, ##args);	\
} while (0)
57 
/*
 * Hex-dump @len bytes of @buf on one line, preceded by the message built
 * from @fmt/@args.  Does nothing unless debug_alternative is set, and
 * nothing at all for a zero @len (the "break" leaves the do/while).
 *
 * @buf is parenthesized before indexing so that an expression argument
 * (e.g. "p + off") is indexed as a whole.
 */
#define DUMP_BYTES(buf, len, fmt, args...)				\
do {									\
	if (unlikely(debug_alternative)) {				\
		int j;							\
									\
		if (!(len))						\
			break;						\
									\
		printk(KERN_DEBUG fmt, ##args);				\
		for (j = 0; j < (len) - 1; j++)				\
			printk(KERN_CONT "%02hhx ", (buf)[j]);		\
		printk(KERN_CONT "%02hhx\n", (buf)[j]);			\
	}								\
} while (0)
72 
73 /*
74  * Each GENERIC_NOPX is of X bytes, and defined as an array of bytes
75  * that correspond to that nop. Getting from one nop to the next, we
76  * add to the array the offset that is equal to the sum of all sizes of
77  * nops preceding the one we are after.
78  *
79  * Note: The GENERIC_NOP5_ATOMIC is at the end, as it breaks the
80  * nice symmetry of sizes of the previous nops.
81  */
#if defined(GENERIC_NOP1) && !defined(CONFIG_X86_64)
/* The GENERIC_NOP* encodings stored back to back, atomic 5-byte NOP last. */
static const unsigned char intelnops[] =
{
	GENERIC_NOP1,
	GENERIC_NOP2,
	GENERIC_NOP3,
	GENERIC_NOP4,
	GENERIC_NOP5,
	GENERIC_NOP6,
	GENERIC_NOP7,
	GENERIC_NOP8,
	GENERIC_NOP5_ATOMIC
};

/*
 * intel_nops[n] points at the n-byte NOP inside intelnops[]; its offset is
 * the summed size of all shorter NOPs.  Slot 0 is unused and slot 9 is the
 * entry following the 8-byte NOP (GENERIC_NOP5_ATOMIC).
 */
static const unsigned char * const intel_nops[ASM_NOP_MAX+2] =
{
	[0] = NULL,
	[1] = intelnops,
	[2] = intelnops + 1,
	[3] = intelnops + 1 + 2,
	[4] = intelnops + 1 + 2 + 3,
	[5] = intelnops + 1 + 2 + 3 + 4,
	[6] = intelnops + 1 + 2 + 3 + 4 + 5,
	[7] = intelnops + 1 + 2 + 3 + 4 + 5 + 6,
	[8] = intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
	[9] = intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
};
#endif
109 
#ifdef K8_NOP1
/* The K8_NOP* encodings stored back to back, atomic 5-byte NOP last. */
static const unsigned char k8nops[] =
{
	K8_NOP1,
	K8_NOP2,
	K8_NOP3,
	K8_NOP4,
	K8_NOP5,
	K8_NOP6,
	K8_NOP7,
	K8_NOP8,
	K8_NOP5_ATOMIC
};

/*
 * k8_nops[n] points at the n-byte NOP inside k8nops[]; its offset is the
 * summed size of all shorter NOPs.  Slot 0 is unused and slot 9 is the
 * entry following the 8-byte NOP (K8_NOP5_ATOMIC).
 */
static const unsigned char * const k8_nops[ASM_NOP_MAX+2] =
{
	[0] = NULL,
	[1] = k8nops,
	[2] = k8nops + 1,
	[3] = k8nops + 1 + 2,
	[4] = k8nops + 1 + 2 + 3,
	[5] = k8nops + 1 + 2 + 3 + 4,
	[6] = k8nops + 1 + 2 + 3 + 4 + 5,
	[7] = k8nops + 1 + 2 + 3 + 4 + 5 + 6,
	[8] = k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
	[9] = k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
};
#endif
137 
#if defined(K7_NOP1) && !defined(CONFIG_X86_64)
/* The K7_NOP* encodings stored back to back, atomic 5-byte NOP last. */
static const unsigned char k7nops[] =
{
	K7_NOP1,
	K7_NOP2,
	K7_NOP3,
	K7_NOP4,
	K7_NOP5,
	K7_NOP6,
	K7_NOP7,
	K7_NOP8,
	K7_NOP5_ATOMIC
};

/*
 * k7_nops[n] points at the n-byte NOP inside k7nops[]; its offset is the
 * summed size of all shorter NOPs.  Slot 0 is unused and slot 9 is the
 * entry following the 8-byte NOP (K7_NOP5_ATOMIC).
 */
static const unsigned char * const k7_nops[ASM_NOP_MAX+2] =
{
	[0] = NULL,
	[1] = k7nops,
	[2] = k7nops + 1,
	[3] = k7nops + 1 + 2,
	[4] = k7nops + 1 + 2 + 3,
	[5] = k7nops + 1 + 2 + 3 + 4,
	[6] = k7nops + 1 + 2 + 3 + 4 + 5,
	[7] = k7nops + 1 + 2 + 3 + 4 + 5 + 6,
	[8] = k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
	[9] = k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
};
#endif
165 
#ifdef P6_NOP1
/* The P6_NOP* encodings stored back to back, atomic 5-byte NOP last. */
static const unsigned char p6nops[] =
{
	P6_NOP1,
	P6_NOP2,
	P6_NOP3,
	P6_NOP4,
	P6_NOP5,
	P6_NOP6,
	P6_NOP7,
	P6_NOP8,
	P6_NOP5_ATOMIC
};

/*
 * p6_nops[n] points at the n-byte NOP inside p6nops[]; its offset is the
 * summed size of all shorter NOPs.  Slot 0 is unused and slot 9 is the
 * entry following the 8-byte NOP (P6_NOP5_ATOMIC).
 */
static const unsigned char * const p6_nops[ASM_NOP_MAX+2] =
{
	[0] = NULL,
	[1] = p6nops,
	[2] = p6nops + 1,
	[3] = p6nops + 1 + 2,
	[4] = p6nops + 1 + 2 + 3,
	[5] = p6nops + 1 + 2 + 3 + 4,
	[6] = p6nops + 1 + 2 + 3 + 4 + 5,
	[7] = p6nops + 1 + 2 + 3 + 4 + 5 + 6,
	[8] = p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
	[9] = p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
};
#endif
193 
/*
 * Initialize these to a safe default.
 *
 * ideal_nops is the NOP table add_nops() indexes by NOP length;
 * arch_init_ideal_nops() may point it at a different table.
 */
#ifdef CONFIG_X86_64
const unsigned char * const *ideal_nops = p6_nops;
#else
const unsigned char * const *ideal_nops = intel_nops;
#endif
200 
201 void __init arch_init_ideal_nops(void)
202 {
203 	switch (boot_cpu_data.x86_vendor) {
204 	case X86_VENDOR_INTEL:
205 		/*
206 		 * Due to a decoder implementation quirk, some
207 		 * specific Intel CPUs actually perform better with
208 		 * the "k8_nops" than with the SDM-recommended NOPs.
209 		 */
210 		if (boot_cpu_data.x86 == 6 &&
211 		    boot_cpu_data.x86_model >= 0x0f &&
212 		    boot_cpu_data.x86_model != 0x1c &&
213 		    boot_cpu_data.x86_model != 0x26 &&
214 		    boot_cpu_data.x86_model != 0x27 &&
215 		    boot_cpu_data.x86_model < 0x30) {
216 			ideal_nops = k8_nops;
217 		} else if (boot_cpu_has(X86_FEATURE_NOPL)) {
218 			   ideal_nops = p6_nops;
219 		} else {
220 #ifdef CONFIG_X86_64
221 			ideal_nops = k8_nops;
222 #else
223 			ideal_nops = intel_nops;
224 #endif
225 		}
226 		break;
227 
228 	case X86_VENDOR_HYGON:
229 		ideal_nops = p6_nops;
230 		return;
231 
232 	case X86_VENDOR_AMD:
233 		if (boot_cpu_data.x86 > 0xf) {
234 			ideal_nops = p6_nops;
235 			return;
236 		}
237 
238 		/* fall through */
239 
240 	default:
241 #ifdef CONFIG_X86_64
242 		ideal_nops = k8_nops;
243 #else
244 		if (boot_cpu_has(X86_FEATURE_K8))
245 			ideal_nops = k8_nops;
246 		else if (boot_cpu_has(X86_FEATURE_K7))
247 			ideal_nops = k7_nops;
248 		else
249 			ideal_nops = intel_nops;
250 #endif
251 	}
252 }
253 
254 /* Use this to add nops to a buffer, then text_poke the whole buffer. */
255 static void __init_or_module add_nops(void *insns, unsigned int len)
256 {
257 	while (len > 0) {
258 		unsigned int noplen = len;
259 		if (noplen > ASM_NOP_MAX)
260 			noplen = ASM_NOP_MAX;
261 		memcpy(insns, ideal_nops[noplen], noplen);
262 		insns += noplen;
263 		len -= noplen;
264 	}
265 }
266 
/*
 * Bounds of the core kernel's alternatives table and of its table of
 * lock-prefix offsets; both are consumed by alternative_instructions().
 */
extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
extern s32 __smp_locks[], __smp_locks_end[];
/* Forward declaration; the definition is further down in this file. */
void text_poke_early(void *addr, const void *opcode, size_t len);
270 
271 /*
272  * Are we looking at a near JMP with a 1 or 4-byte displacement.
273  */
274 static inline bool is_jmp(const u8 opcode)
275 {
276 	return opcode == 0xeb || opcode == 0xe9;
277 }
278 
/*
 * Re-encode a 5-byte replacement JMP so that it still reaches its target
 * once it has been copied over the original instruction.
 *
 * @a:         alternative entry; ignored unless its replacement is 5 bytes
 * @orig_insn: address of the instruction that is going to be patched
 * @repl_insn: address the replacement currently lives at
 * @insnbuf:   copy of the replacement; rewritten in place
 *
 * On return the first five bytes of @insnbuf hold either a 2-byte JMP (0xeb)
 * followed by three bytes of NOPs, or a 5-byte JMP (0xe9).
 */
static void __init_or_module
recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insnbuf)
{
	u8 *next_rip, *tgt_rip;
	s32 n_dspl, o_dspl;
	int repl_len;

	if (a->replacementlen != 5)
		return;

	/* displacement as encoded in the replacement, after the opcode byte */
	o_dspl = *(s32 *)(insnbuf + 1);

	/* next_rip of the replacement JMP */
	next_rip = repl_insn + a->replacementlen;
	/* target rip of the replacement JMP */
	tgt_rip  = next_rip + o_dspl;
	/* distance from the start of the original instruction to the target */
	n_dspl = tgt_rip - orig_insn;

	DPRINTK("target RIP: %px, new_displ: 0x%x", tgt_rip, n_dspl);

	if (tgt_rip - orig_insn >= 0) {
		/* the "- 2" accounts for the length of the short JMP itself */
		if (n_dspl - 2 <= 127)
			goto two_byte_jmp;
		else
			goto five_byte_jmp;
	/* negative offset */
	} else {
		/*
		 * NOTE(review): n_dspl is negative here, so "n_dspl - 2" is
		 * negative while the masked value lies in 0..255; the two can
		 * never compare equal and backward jumps therefore always use
		 * the 5-byte form.
		 */
		if (((n_dspl - 2) & 0xff) == (n_dspl - 2))
			goto two_byte_jmp;
		else
			goto five_byte_jmp;
	}

two_byte_jmp:
	/* displacements are relative to the end of the 2-byte instruction */
	n_dspl -= 2;

	insnbuf[0] = 0xeb;
	insnbuf[1] = (s8)n_dspl;
	/* the remaining 3 of the 5 replacement bytes become NOPs */
	add_nops(insnbuf + 2, 3);

	repl_len = 2;
	goto done;

five_byte_jmp:
	/* displacements are relative to the end of the 5-byte instruction */
	n_dspl -= 5;

	insnbuf[0] = 0xe9;
	*(s32 *)&insnbuf[1] = n_dspl;

	repl_len = 5;

done:

	/* repl_len is only needed to print the resulting jump target */
	DPRINTK("final displ: 0x%08x, JMP 0x%lx",
		n_dspl, (unsigned long)orig_insn + n_dspl + repl_len);
}
335 
336 /*
337  * "noinline" to cause control flow change and thus invalidate I$ and
338  * cause refetch after modification.
339  */
340 static void __init_or_module noinline optimize_nops(struct alt_instr *a, u8 *instr)
341 {
342 	unsigned long flags;
343 	int i;
344 
345 	for (i = 0; i < a->padlen; i++) {
346 		if (instr[i] != 0x90)
347 			return;
348 	}
349 
350 	local_irq_save(flags);
351 	add_nops(instr + (a->instrlen - a->padlen), a->padlen);
352 	local_irq_restore(flags);
353 
354 	DUMP_BYTES(instr, a->instrlen, "%px: [%d:%d) optimized NOPs: ",
355 		   instr, a->instrlen - a->padlen, a->padlen);
356 }
357 
/*
 * Replace instructions with better alternatives for this CPU type. This runs
 * before SMP is initialized to avoid SMP problems with self modifying code.
 * This implies that asymmetric systems where APs have less capabilities than
 * the boot processor are not handled. Tough. Make sure you disable such
 * features by hand.
 *
 * @start: first entry of the alternatives table to process
 * @end:   one past the last entry of that table
 *
 * Marked "noinline" to cause control flow change and thus insn cache
 * to refetch changed I$ lines.
 */
void __init_or_module noinline apply_alternatives(struct alt_instr *start,
						  struct alt_instr *end)
{
	struct alt_instr *a;
	u8 *instr, *replacement;
	u8 insnbuf[MAX_PATCH_LEN];

	DPRINTK("alt table %px, -> %px", start, end);
	/*
	 * The scan order should be from start to end. A later scanned
	 * alternative code can overwrite previously scanned alternative code.
	 * Some kernel functions (e.g. memcpy, memset, etc) use this order to
	 * patch code.
	 *
	 * So be careful if you want to change the scan order to any other
	 * order.
	 */
	for (a = start; a < end; a++) {
		int insnbuf_sz = 0;

		/* both offsets are stored relative to their own field */
		instr = (u8 *)&a->instr_offset + a->instr_offset;
		replacement = (u8 *)&a->repl_offset + a->repl_offset;
		BUG_ON(a->instrlen > sizeof(insnbuf));
		BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32);
		if (!boot_cpu_has(a->cpuid)) {
			/* feature absent: keep the original, tidy its padding */
			if (a->padlen > 1)
				optimize_nops(a, instr);

			continue;
		}

		DPRINTK("feat: %d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d), pad: %d",
			a->cpuid >> 5,
			a->cpuid & 0x1f,
			instr, instr, a->instrlen,
			replacement, a->replacementlen, a->padlen);

		DUMP_BYTES(instr, a->instrlen, "%px: old_insn: ", instr);
		DUMP_BYTES(replacement, a->replacementlen, "%px: rpl_insn: ", replacement);

		/* build the final bytes in insnbuf, then poke them in one go */
		memcpy(insnbuf, replacement, a->replacementlen);
		insnbuf_sz = a->replacementlen;

		/*
		 * 0xe8 is a relative CALL; fix the offset, which was computed
		 * for the replacement's location, for the patch site.
		 *
		 * Instruction length is checked before the opcode to avoid
		 * accessing uninitialized bytes for zero-length replacements.
		 */
		if (a->replacementlen == 5 && *insnbuf == 0xe8) {
			*(s32 *)(insnbuf + 1) += replacement - instr;
			DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx",
				*(s32 *)(insnbuf + 1),
				(unsigned long)instr + *(s32 *)(insnbuf + 1) + 5);
		}

		if (a->replacementlen && is_jmp(replacement[0]))
			recompute_jump(a, instr, replacement, insnbuf);

		/* replacement shorter than the original: NOP out the rest */
		if (a->instrlen > a->replacementlen) {
			add_nops(insnbuf + a->replacementlen,
				 a->instrlen - a->replacementlen);
			insnbuf_sz += a->instrlen - a->replacementlen;
		}
		DUMP_BYTES(insnbuf, insnbuf_sz, "%px: final_insn: ", instr);

		text_poke_early(instr, insnbuf, insnbuf_sz);
	}
}
437 
438 #ifdef CONFIG_SMP
439 static void alternatives_smp_lock(const s32 *start, const s32 *end,
440 				  u8 *text, u8 *text_end)
441 {
442 	const s32 *poff;
443 
444 	for (poff = start; poff < end; poff++) {
445 		u8 *ptr = (u8 *)poff + *poff;
446 
447 		if (!*poff || ptr < text || ptr >= text_end)
448 			continue;
449 		/* turn DS segment override prefix into lock prefix */
450 		if (*ptr == 0x3e)
451 			text_poke(ptr, ((unsigned char []){0xf0}), 1);
452 	}
453 }
454 
455 static void alternatives_smp_unlock(const s32 *start, const s32 *end,
456 				    u8 *text, u8 *text_end)
457 {
458 	const s32 *poff;
459 
460 	for (poff = start; poff < end; poff++) {
461 		u8 *ptr = (u8 *)poff + *poff;
462 
463 		if (!*poff || ptr < text || ptr >= text_end)
464 			continue;
465 		/* turn lock prefix into DS segment override prefix */
466 		if (*ptr == 0xf0)
467 			text_poke(ptr, ((unsigned char []){0x3E}), 1);
468 	}
469 }
470 
/*
 * One registered set of lock-prefix sites, linked into smp_alt_modules by
 * alternatives_smp_module_add().
 */
struct smp_alt_module {
	/*
	 * Owner of the entry: @mod is the key alternatives_smp_module_del()
	 * searches for (NULL is used for the core kernel), @name is only
	 * used in debug output.
	 */
	struct module	*mod;
	char		*name;

	/* ptrs to lock prefixes */
	const s32	*locks;
	const s32	*locks_end;

	/* .text segment, needed to avoid patching init code ;) */
	u8		*text;
	u8		*text_end;

	/* link in smp_alt_modules */
	struct list_head next;
};
static LIST_HEAD(smp_alt_modules);
static bool uniproc_patched = false;	/* protected by text_mutex */
488 
489 void __init_or_module alternatives_smp_module_add(struct module *mod,
490 						  char *name,
491 						  void *locks, void *locks_end,
492 						  void *text,  void *text_end)
493 {
494 	struct smp_alt_module *smp;
495 
496 	mutex_lock(&text_mutex);
497 	if (!uniproc_patched)
498 		goto unlock;
499 
500 	if (num_possible_cpus() == 1)
501 		/* Don't bother remembering, we'll never have to undo it. */
502 		goto smp_unlock;
503 
504 	smp = kzalloc(sizeof(*smp), GFP_KERNEL);
505 	if (NULL == smp)
506 		/* we'll run the (safe but slow) SMP code then ... */
507 		goto unlock;
508 
509 	smp->mod	= mod;
510 	smp->name	= name;
511 	smp->locks	= locks;
512 	smp->locks_end	= locks_end;
513 	smp->text	= text;
514 	smp->text_end	= text_end;
515 	DPRINTK("locks %p -> %p, text %p -> %p, name %s\n",
516 		smp->locks, smp->locks_end,
517 		smp->text, smp->text_end, smp->name);
518 
519 	list_add_tail(&smp->next, &smp_alt_modules);
520 smp_unlock:
521 	alternatives_smp_unlock(locks, locks_end, text, text_end);
522 unlock:
523 	mutex_unlock(&text_mutex);
524 }
525 
526 void __init_or_module alternatives_smp_module_del(struct module *mod)
527 {
528 	struct smp_alt_module *item;
529 
530 	mutex_lock(&text_mutex);
531 	list_for_each_entry(item, &smp_alt_modules, next) {
532 		if (mod != item->mod)
533 			continue;
534 		list_del(&item->next);
535 		kfree(item);
536 		break;
537 	}
538 	mutex_unlock(&text_mutex);
539 }
540 
541 void alternatives_enable_smp(void)
542 {
543 	struct smp_alt_module *mod;
544 
545 	/* Why bother if there are no other CPUs? */
546 	BUG_ON(num_possible_cpus() == 1);
547 
548 	mutex_lock(&text_mutex);
549 
550 	if (uniproc_patched) {
551 		pr_info("switching to SMP code\n");
552 		BUG_ON(num_online_cpus() != 1);
553 		clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
554 		clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
555 		list_for_each_entry(mod, &smp_alt_modules, next)
556 			alternatives_smp_lock(mod->locks, mod->locks_end,
557 					      mod->text, mod->text_end);
558 		uniproc_patched = false;
559 	}
560 	mutex_unlock(&text_mutex);
561 }
562 
563 /*
564  * Return 1 if the address range is reserved for SMP-alternatives.
565  * Must hold text_mutex.
566  */
567 int alternatives_text_reserved(void *start, void *end)
568 {
569 	struct smp_alt_module *mod;
570 	const s32 *poff;
571 	u8 *text_start = start;
572 	u8 *text_end = end;
573 
574 	lockdep_assert_held(&text_mutex);
575 
576 	list_for_each_entry(mod, &smp_alt_modules, next) {
577 		if (mod->text > text_end || mod->text_end < text_start)
578 			continue;
579 		for (poff = mod->locks; poff < mod->locks_end; poff++) {
580 			const u8 *ptr = (const u8 *)poff + *poff;
581 
582 			if (text_start <= ptr && text_end > ptr)
583 				return 1;
584 		}
585 	}
586 
587 	return 0;
588 }
589 #endif /* CONFIG_SMP */
590 
591 #ifdef CONFIG_PARAVIRT
592 void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
593 				     struct paravirt_patch_site *end)
594 {
595 	struct paravirt_patch_site *p;
596 	char insnbuf[MAX_PATCH_LEN];
597 
598 	for (p = start; p < end; p++) {
599 		unsigned int used;
600 
601 		BUG_ON(p->len > MAX_PATCH_LEN);
602 		/* prep the buffer with the original instructions */
603 		memcpy(insnbuf, p->instr, p->len);
604 		used = pv_ops.init.patch(p->instrtype, insnbuf,
605 					 (unsigned long)p->instr, p->len);
606 
607 		BUG_ON(used > p->len);
608 
609 		/* Pad the rest with nops */
610 		add_nops(insnbuf + used, p->len - used);
611 		text_poke_early(p->instr, insnbuf, p->len);
612 	}
613 }
614 extern struct paravirt_patch_site __start_parainstructions[],
615 	__stop_parainstructions[];
616 #endif	/* CONFIG_PARAVIRT */
617 
/*
 * Boot-time entry point: apply the core kernel's alternatives, optionally
 * patch the kernel to its UP variant, apply the paravirt patches and finally
 * set alternatives_patched.  NMIs are stopped for the duration.
 */
void __init alternative_instructions(void)
{
	/* The patching is not fully atomic, so try to avoid local interruptions
	   that might execute the to be patched code.
	   Other CPUs are not running. */
	stop_nmi();

	/*
	 * Don't stop machine check exceptions while patching.
	 * MCEs only happen when something got corrupted and in this
	 * case we must do something about the corruption.
	 * Ignoring it is worse than an unlikely patching race.
	 * Also machine checks tend to be broadcast and if one CPU
	 * goes into machine check the others follow quickly, so we don't
	 * expect a machine check to cause undue problems during code
	 * patching.
	 */

	apply_alternatives(__alt_instructions, __alt_instructions_end);

#ifdef CONFIG_SMP
	/* Patch to UP if other cpus not imminent. */
	if (!noreplace_smp && (num_present_cpus() == 1 || setup_max_cpus <= 1)) {
		/* must be set first: alternatives_smp_module_add() checks it */
		uniproc_patched = true;
		alternatives_smp_module_add(NULL, "core kernel",
					    __smp_locks, __smp_locks_end,
					    _text, _etext);
	}

	/* the lock table is only needed if we may switch back to SMP later */
	if (!uniproc_patched || num_possible_cpus() == 1)
		free_init_pages("SMP alternatives",
				(unsigned long)__smp_locks,
				(unsigned long)__smp_locks_end);
#endif

	apply_paravirt(__parainstructions, __parainstructions_end);

	restart_nmi();
	alternatives_patched = 1;
}
658 
659 /**
660  * text_poke_early - Update instructions on a live kernel at boot time
661  * @addr: address to modify
662  * @opcode: source of the copy
663  * @len: length to copy
664  *
665  * When you use this code to patch more than one byte of an instruction
666  * you need to make sure that other CPUs cannot execute this code in parallel.
667  * Also no thread must be currently preempted in the middle of these
668  * instructions. And on the local CPU you need to be protected again NMI or MCE
669  * handlers seeing an inconsistent instruction while you patch.
670  */
671 void __init_or_module text_poke_early(void *addr, const void *opcode,
672 				      size_t len)
673 {
674 	unsigned long flags;
675 
676 	if (boot_cpu_has(X86_FEATURE_NX) &&
677 	    is_module_text_address((unsigned long)addr)) {
678 		/*
679 		 * Modules text is marked initially as non-executable, so the
680 		 * code cannot be running and speculative code-fetches are
681 		 * prevented. Just change the code.
682 		 */
683 		memcpy(addr, opcode, len);
684 	} else {
685 		local_irq_save(flags);
686 		memcpy(addr, opcode, len);
687 		local_irq_restore(flags);
688 		sync_core();
689 
690 		/*
691 		 * Could also do a CLFLUSH here to speed up CPU recovery; but
692 		 * that causes hangs on some VIA CPUs.
693 		 */
694 	}
695 }
696 
/*
 * The mm and the virtual address inside it at which __text_poke() installs
 * its temporary mapping of the page(s) being patched.
 */
__ro_after_init struct mm_struct *poking_mm;
__ro_after_init unsigned long poking_addr;
699 
/*
 * Copy @len bytes from @opcode over the text at @addr by writing through a
 * temporary mapping of the backing page(s), installed at poking_addr in
 * poking_mm.  A write that crosses a page boundary maps two pages.
 *
 * Local interrupts are disabled for the duration of the update.  BUG()s when
 * called before after_bootmem is set, when the backing pages cannot be
 * resolved, or when the text does not compare equal to @opcode afterwards.
 *
 * Returns @addr.
 */
static void *__text_poke(void *addr, const void *opcode, size_t len)
{
	bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE;
	struct page *pages[2] = {NULL};
	temp_mm_state_t prev;
	unsigned long flags;
	pte_t pte, *ptep;
	spinlock_t *ptl;
	pgprot_t pgprot;

	/*
	 * While boot memory allocator is running we cannot use struct pages as
	 * they are not yet initialized. There is no way to recover.
	 */
	BUG_ON(!after_bootmem);

	/* text outside the core kernel is looked up via vmalloc_to_page() */
	if (!core_kernel_text((unsigned long)addr)) {
		pages[0] = vmalloc_to_page(addr);
		if (cross_page_boundary)
			pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
	} else {
		pages[0] = virt_to_page(addr);
		WARN_ON(!PageReserved(pages[0]));
		if (cross_page_boundary)
			pages[1] = virt_to_page(addr + PAGE_SIZE);
	}
	/*
	 * If something went wrong, crash and burn since recovery paths are not
	 * implemented.
	 */
	BUG_ON(!pages[0] || (cross_page_boundary && !pages[1]));

	local_irq_save(flags);

	/*
	 * Map the page without the global bit, as TLB flushing is done with
	 * flush_tlb_mm_range(), which is intended for non-global PTEs.
	 */
	pgprot = __pgprot(pgprot_val(PAGE_KERNEL) & ~_PAGE_GLOBAL);

	/*
	 * The lock is not really needed, but this allows to avoid open-coding.
	 */
	ptep = get_locked_pte(poking_mm, poking_addr, &ptl);

	/*
	 * This must not fail; preallocated in poking_init().
	 */
	VM_BUG_ON(!ptep);

	pte = mk_pte(pages[0], pgprot);
	set_pte_at(poking_mm, poking_addr, ptep, pte);

	if (cross_page_boundary) {
		pte = mk_pte(pages[1], pgprot);
		set_pte_at(poking_mm, poking_addr + PAGE_SIZE, ptep + 1, pte);
	}

	/*
	 * Loading the temporary mm behaves as a compiler barrier, which
	 * guarantees that the PTE will be set at the time memcpy() is done.
	 */
	prev = use_temporary_mm(poking_mm);

	/* the write goes through the temporary alias, at the same page offset */
	kasan_disable_current();
	memcpy((u8 *)poking_addr + offset_in_page(addr), opcode, len);
	kasan_enable_current();

	/*
	 * Ensure that the PTE is only cleared after the instructions of memcpy
	 * were issued by using a compiler barrier.
	 */
	barrier();

	pte_clear(poking_mm, poking_addr, ptep);
	if (cross_page_boundary)
		pte_clear(poking_mm, poking_addr + PAGE_SIZE, ptep + 1);

	/*
	 * Loading the previous page-table hierarchy requires a serializing
	 * instruction that already allows the core to see the updated version.
	 * Xen-PV is assumed to serialize execution in a similar manner.
	 */
	unuse_temporary_mm(prev);

	/*
	 * Flushing the TLB might involve IPIs, which would require enabled
	 * IRQs, but not if the mm is not used, as it is in this point.
	 */
	flush_tlb_mm_range(poking_mm, poking_addr, poking_addr +
			   (cross_page_boundary ? 2 : 1) * PAGE_SIZE,
			   PAGE_SHIFT, false);

	/*
	 * If the text does not match what we just wrote then something is
	 * fundamentally screwy; there's nothing we can really do about that.
	 */
	BUG_ON(memcmp(addr, opcode, len));

	pte_unmap_unlock(ptep, ptl);
	local_irq_restore(flags);
	return addr;
}
803 
/**
 * text_poke - Update instructions on a live kernel
 * @addr: address to modify
 * @opcode: source of the copy
 * @len: length to copy
 *
 * Only atomic text poke/set should be allowed when not doing early patching.
 * It means the size must be writable atomically and the address must be aligned
 * in a way that permits an atomic write. It also makes sure we fit on a single
 * page.
 *
 * Note that the caller must ensure that if the modified code is part of a
 * module, the module would not be removed during poking. This can be achieved
 * by registering a module notifier, and ordering module removal and patching
 * through a mutex.
 *
 * Context: the caller must hold text_mutex.
 *
 * Return: @addr.
 */
void *text_poke(void *addr, const void *opcode, size_t len)
{
	lockdep_assert_held(&text_mutex);

	return __text_poke(addr, opcode, len);
}
826 
/**
 * text_poke_kgdb - Update instructions on a live kernel by kgdb
 * @addr: address to modify
 * @opcode: source of the copy
 * @len: length to copy
 *
 * Only atomic text poke/set should be allowed when not doing early patching.
 * It means the size must be writable atomically and the address must be aligned
 * in a way that permits an atomic write. It also makes sure we fit on a single
 * page.
 *
 * Context: should only be used by kgdb, which ensures no other core is running,
 *	    despite the fact it does not hold the text_mutex.
 *
 * Return: @addr.
 */
void *text_poke_kgdb(void *addr, const void *opcode, size_t len)
{
	/* same as text_poke(), minus the text_mutex assertion */
	return __text_poke(addr, opcode, len);
}
845 
/*
 * on_each_cpu() callback used by text_poke_bp(): run sync_core() on the CPU
 * it is invoked on.  @info is unused.
 */
static void do_sync_core(void *info)
{
	sync_core();
}
850 
851 static bool bp_patching_in_progress;
852 static void *bp_int3_handler, *bp_int3_addr;
853 
854 int poke_int3_handler(struct pt_regs *regs)
855 {
856 	/*
857 	 * Having observed our INT3 instruction, we now must observe
858 	 * bp_patching_in_progress.
859 	 *
860 	 * 	in_progress = TRUE		INT3
861 	 * 	WMB				RMB
862 	 * 	write INT3			if (in_progress)
863 	 *
864 	 * Idem for bp_int3_handler.
865 	 */
866 	smp_rmb();
867 
868 	if (likely(!bp_patching_in_progress))
869 		return 0;
870 
871 	if (user_mode(regs) || regs->ip != (unsigned long)bp_int3_addr)
872 		return 0;
873 
874 	/* set up the specified breakpoint handler */
875 	regs->ip = (unsigned long) bp_int3_handler;
876 
877 	return 1;
878 }
879 NOKPROBE_SYMBOL(poke_int3_handler);
880 
881 /**
882  * text_poke_bp() -- update instructions on live kernel on SMP
883  * @addr:	address to patch
884  * @opcode:	opcode of new instruction
885  * @len:	length to copy
886  * @handler:	address to jump to when the temporary breakpoint is hit
887  *
888  * Modify multi-byte instruction by using int3 breakpoint on SMP.
889  * We completely avoid stop_machine() here, and achieve the
890  * synchronization using int3 breakpoint.
891  *
892  * The way it is done:
893  *	- add a int3 trap to the address that will be patched
894  *	- sync cores
895  *	- update all but the first byte of the patched range
896  *	- sync cores
897  *	- replace the first byte (int3) by the first byte of
898  *	  replacing opcode
899  *	- sync cores
900  */
901 void text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
902 {
903 	unsigned char int3 = 0xcc;
904 
905 	bp_int3_handler = handler;
906 	bp_int3_addr = (u8 *)addr + sizeof(int3);
907 	bp_patching_in_progress = true;
908 
909 	lockdep_assert_held(&text_mutex);
910 
911 	/*
912 	 * Corresponding read barrier in int3 notifier for making sure the
913 	 * in_progress and handler are correctly ordered wrt. patching.
914 	 */
915 	smp_wmb();
916 
917 	text_poke(addr, &int3, sizeof(int3));
918 
919 	on_each_cpu(do_sync_core, NULL, 1);
920 
921 	if (len - sizeof(int3) > 0) {
922 		/* patch all but the first byte */
923 		text_poke((char *)addr + sizeof(int3),
924 			  (const char *) opcode + sizeof(int3),
925 			  len - sizeof(int3));
926 		/*
927 		 * According to Intel, this core syncing is very likely
928 		 * not necessary and we'd be safe even without it. But
929 		 * better safe than sorry (plus there's not only Intel).
930 		 */
931 		on_each_cpu(do_sync_core, NULL, 1);
932 	}
933 
934 	/* patch the first byte */
935 	text_poke(addr, opcode, sizeof(int3));
936 
937 	on_each_cpu(do_sync_core, NULL, 1);
938 	/*
939 	 * sync_core() implies an smp_mb() and orders this store against
940 	 * the writing of the new instruction.
941 	 */
942 	bp_patching_in_progress = false;
943 }
944 
945