xref: /openbmc/linux/arch/x86/kernel/alternative.c (revision e3d786a3)
/* Prefix put in front of the format string of pr_*() messages in this file. */
#define pr_fmt(fmt) "SMP alternatives: " fmt
2 
3 #include <linux/module.h>
4 #include <linux/sched.h>
5 #include <linux/mutex.h>
6 #include <linux/list.h>
7 #include <linux/stringify.h>
8 #include <linux/mm.h>
9 #include <linux/vmalloc.h>
10 #include <linux/memory.h>
11 #include <linux/stop_machine.h>
12 #include <linux/slab.h>
13 #include <linux/kdebug.h>
14 #include <asm/text-patching.h>
15 #include <asm/alternative.h>
16 #include <asm/sections.h>
17 #include <asm/pgtable.h>
18 #include <asm/mce.h>
19 #include <asm/nmi.h>
20 #include <asm/cacheflush.h>
21 #include <asm/tlbflush.h>
22 #include <asm/io.h>
23 #include <asm/fixmap.h>
24 
/*
 * Set to 1 as the last step of alternative_instructions(), i.e. after the
 * boot-time patching performed by that function has completed.
 */
int __read_mostly alternatives_patched;

EXPORT_SYMBOL_GPL(alternatives_patched);

/* Size in bytes of the on-stack scratch buffers used while patching. */
#define MAX_PATCH_LEN (255-1)
30 
/* Non-zero enables the DPRINTK() and DUMP_BYTES() debug output. */
static int __initdata_or_module debug_alternative;

/*
 * Handler for the "debug-alternative" boot parameter: switch the debug
 * output on. The parameter value in @str is not looked at.
 */
static int __init debug_alt(char *str)
{
	debug_alternative = 1;
	return 1;
}
__setup("debug-alternative", debug_alt);
39 
/*
 * When non-zero, alternative_instructions() does not switch the kernel to
 * the uniprocessor variant of the lock prefixes.
 */
static int noreplace_smp;

/*
 * Handler for the "noreplace-smp" boot parameter: set noreplace_smp.
 * The parameter value in @str is not looked at.
 */
static int __init setup_noreplace_smp(char *str)
{
	noreplace_smp = 1;
	return 1;
}
__setup("noreplace-smp", setup_noreplace_smp);
48 
/*
 * Debug print: when debug_alternative is set, emit the message at KERN_DEBUG
 * level, prefixed with the name of the calling function. A newline is
 * appended by the macro, so callers do not need to supply one.
 */
#define DPRINTK(fmt, args...)						\
do {									\
	if (debug_alternative)						\
		printk(KERN_DEBUG "%s: " fmt "\n", __func__, ##args);	\
} while (0)
54 
/*
 * Hex-dump @len bytes of @buf on one KERN_DEBUG line, preceded by the
 * printf-style header built from @fmt and @args. Nothing is printed unless
 * debug_alternative is set, and nothing is printed when @len is zero.
 * The last byte is printed separately so that it carries the newline.
 */
#define DUMP_BYTES(buf, len, fmt, args...)				\
do {									\
	if (unlikely(debug_alternative)) {				\
		int j;							\
									\
		if (!(len))						\
			break;						\
									\
		printk(KERN_DEBUG fmt, ##args);				\
		for (j = 0; j < (len) - 1; j++)				\
			printk(KERN_CONT "%02hhx ", buf[j]);		\
		printk(KERN_CONT "%02hhx\n", buf[j]);			\
	}								\
} while (0)
69 
70 /*
71  * Each GENERIC_NOPX is of X bytes, and defined as an array of bytes
72  * that correspond to that nop. Getting from one nop to the next, we
73  * add to the array the offset that is equal to the sum of all sizes of
74  * nops preceding the one we are after.
75  *
76  * Note: The GENERIC_NOP5_ATOMIC is at the end, as it breaks the
77  * nice symmetry of sizes of the previous nops.
78  */
#if defined(GENERIC_NOP1) && !defined(CONFIG_X86_64)
/*
 * GENERIC_NOP1 .. GENERIC_NOP8 laid out back to back, followed by
 * GENERIC_NOP5_ATOMIC. Only built when CONFIG_X86_64 is not defined.
 */
static const unsigned char intelnops[] =
{
	GENERIC_NOP1,
	GENERIC_NOP2,
	GENERIC_NOP3,
	GENERIC_NOP4,
	GENERIC_NOP5,
	GENERIC_NOP6,
	GENERIC_NOP7,
	GENERIC_NOP8,
	GENERIC_NOP5_ATOMIC
};
/*
 * Index into intelnops[]: entry N (1..8) points at the start of the N-byte
 * NOP, the final entry points at GENERIC_NOP5_ATOMIC. Entry 0 is NULL.
 */
static const unsigned char * const intel_nops[ASM_NOP_MAX+2] =
{
	NULL,
	intelnops,
	intelnops + 1,
	intelnops + 1 + 2,
	intelnops + 1 + 2 + 3,
	intelnops + 1 + 2 + 3 + 4,
	intelnops + 1 + 2 + 3 + 4 + 5,
	intelnops + 1 + 2 + 3 + 4 + 5 + 6,
	intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
	intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
};
#endif
106 
#ifdef K8_NOP1
/* K8_NOP1 .. K8_NOP8 laid out back to back, followed by K8_NOP5_ATOMIC. */
static const unsigned char k8nops[] =
{
	K8_NOP1,
	K8_NOP2,
	K8_NOP3,
	K8_NOP4,
	K8_NOP5,
	K8_NOP6,
	K8_NOP7,
	K8_NOP8,
	K8_NOP5_ATOMIC
};
/*
 * Index into k8nops[]: entry N (1..8) points at the start of the N-byte
 * NOP, the final entry points at K8_NOP5_ATOMIC. Entry 0 is NULL.
 */
static const unsigned char * const k8_nops[ASM_NOP_MAX+2] =
{
	NULL,
	k8nops,
	k8nops + 1,
	k8nops + 1 + 2,
	k8nops + 1 + 2 + 3,
	k8nops + 1 + 2 + 3 + 4,
	k8nops + 1 + 2 + 3 + 4 + 5,
	k8nops + 1 + 2 + 3 + 4 + 5 + 6,
	k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
	k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
};
#endif
134 
#if defined(K7_NOP1) && !defined(CONFIG_X86_64)
/*
 * K7_NOP1 .. K7_NOP8 laid out back to back, followed by K7_NOP5_ATOMIC.
 * Only built when CONFIG_X86_64 is not defined.
 */
static const unsigned char k7nops[] =
{
	K7_NOP1,
	K7_NOP2,
	K7_NOP3,
	K7_NOP4,
	K7_NOP5,
	K7_NOP6,
	K7_NOP7,
	K7_NOP8,
	K7_NOP5_ATOMIC
};
/*
 * Index into k7nops[]: entry N (1..8) points at the start of the N-byte
 * NOP, the final entry points at K7_NOP5_ATOMIC. Entry 0 is NULL.
 */
static const unsigned char * const k7_nops[ASM_NOP_MAX+2] =
{
	NULL,
	k7nops,
	k7nops + 1,
	k7nops + 1 + 2,
	k7nops + 1 + 2 + 3,
	k7nops + 1 + 2 + 3 + 4,
	k7nops + 1 + 2 + 3 + 4 + 5,
	k7nops + 1 + 2 + 3 + 4 + 5 + 6,
	k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
	k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
};
#endif
162 
#ifdef P6_NOP1
/* P6_NOP1 .. P6_NOP8 laid out back to back, followed by P6_NOP5_ATOMIC. */
static const unsigned char p6nops[] =
{
	P6_NOP1,
	P6_NOP2,
	P6_NOP3,
	P6_NOP4,
	P6_NOP5,
	P6_NOP6,
	P6_NOP7,
	P6_NOP8,
	P6_NOP5_ATOMIC
};
/*
 * Index into p6nops[]: entry N (1..8) points at the start of the N-byte
 * NOP, the final entry points at P6_NOP5_ATOMIC. Entry 0 is NULL.
 */
static const unsigned char * const p6_nops[ASM_NOP_MAX+2] =
{
	NULL,
	p6nops,
	p6nops + 1,
	p6nops + 1 + 2,
	p6nops + 1 + 2 + 3,
	p6nops + 1 + 2 + 3 + 4,
	p6nops + 1 + 2 + 3 + 4 + 5,
	p6nops + 1 + 2 + 3 + 4 + 5 + 6,
	p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
	p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
};
#endif
190 
/*
 * NOP table consulted by add_nops(). Initialize these to a safe default;
 * arch_init_ideal_nops() may later select a different table.
 */
#ifdef CONFIG_X86_64
const unsigned char * const *ideal_nops = p6_nops;
#else
const unsigned char * const *ideal_nops = intel_nops;
#endif
197 
198 void __init arch_init_ideal_nops(void)
199 {
200 	switch (boot_cpu_data.x86_vendor) {
201 	case X86_VENDOR_INTEL:
202 		/*
203 		 * Due to a decoder implementation quirk, some
204 		 * specific Intel CPUs actually perform better with
205 		 * the "k8_nops" than with the SDM-recommended NOPs.
206 		 */
207 		if (boot_cpu_data.x86 == 6 &&
208 		    boot_cpu_data.x86_model >= 0x0f &&
209 		    boot_cpu_data.x86_model != 0x1c &&
210 		    boot_cpu_data.x86_model != 0x26 &&
211 		    boot_cpu_data.x86_model != 0x27 &&
212 		    boot_cpu_data.x86_model < 0x30) {
213 			ideal_nops = k8_nops;
214 		} else if (boot_cpu_has(X86_FEATURE_NOPL)) {
215 			   ideal_nops = p6_nops;
216 		} else {
217 #ifdef CONFIG_X86_64
218 			ideal_nops = k8_nops;
219 #else
220 			ideal_nops = intel_nops;
221 #endif
222 		}
223 		break;
224 
225 	case X86_VENDOR_HYGON:
226 		ideal_nops = p6_nops;
227 		return;
228 
229 	case X86_VENDOR_AMD:
230 		if (boot_cpu_data.x86 > 0xf) {
231 			ideal_nops = p6_nops;
232 			return;
233 		}
234 
235 		/* fall through */
236 
237 	default:
238 #ifdef CONFIG_X86_64
239 		ideal_nops = k8_nops;
240 #else
241 		if (boot_cpu_has(X86_FEATURE_K8))
242 			ideal_nops = k8_nops;
243 		else if (boot_cpu_has(X86_FEATURE_K7))
244 			ideal_nops = k7_nops;
245 		else
246 			ideal_nops = intel_nops;
247 #endif
248 	}
249 }
250 
251 /* Use this to add nops to a buffer, then text_poke the whole buffer. */
252 static void __init_or_module add_nops(void *insns, unsigned int len)
253 {
254 	while (len > 0) {
255 		unsigned int noplen = len;
256 		if (noplen > ASM_NOP_MAX)
257 			noplen = ASM_NOP_MAX;
258 		memcpy(insns, ideal_nops[noplen], noplen);
259 		insns += noplen;
260 		len -= noplen;
261 	}
262 }
263 
/* Bounds of the alt_instr table that alternative_instructions() applies. */
extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
/* Bounds of the table of self-relative offsets to lock-prefix bytes. */
extern s32 __smp_locks[], __smp_locks_end[];
/* Forward declaration; the definition is further down in this file. */
void *text_poke_early(void *addr, const void *opcode, size_t len);
267 
/*
 * Tell whether @opcode is a near JMP with a 1-byte (0xeb) or a 4-byte (0xe9)
 * displacement.
 */
static inline bool is_jmp(const u8 opcode)
{
	switch (opcode) {
	case 0xeb:
	case 0xe9:
		return true;
	default:
		return false;
	}
}
275 
276 static void __init_or_module
277 recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insnbuf)
278 {
279 	u8 *next_rip, *tgt_rip;
280 	s32 n_dspl, o_dspl;
281 	int repl_len;
282 
283 	if (a->replacementlen != 5)
284 		return;
285 
286 	o_dspl = *(s32 *)(insnbuf + 1);
287 
288 	/* next_rip of the replacement JMP */
289 	next_rip = repl_insn + a->replacementlen;
290 	/* target rip of the replacement JMP */
291 	tgt_rip  = next_rip + o_dspl;
292 	n_dspl = tgt_rip - orig_insn;
293 
294 	DPRINTK("target RIP: %px, new_displ: 0x%x", tgt_rip, n_dspl);
295 
296 	if (tgt_rip - orig_insn >= 0) {
297 		if (n_dspl - 2 <= 127)
298 			goto two_byte_jmp;
299 		else
300 			goto five_byte_jmp;
301 	/* negative offset */
302 	} else {
303 		if (((n_dspl - 2) & 0xff) == (n_dspl - 2))
304 			goto two_byte_jmp;
305 		else
306 			goto five_byte_jmp;
307 	}
308 
309 two_byte_jmp:
310 	n_dspl -= 2;
311 
312 	insnbuf[0] = 0xeb;
313 	insnbuf[1] = (s8)n_dspl;
314 	add_nops(insnbuf + 2, 3);
315 
316 	repl_len = 2;
317 	goto done;
318 
319 five_byte_jmp:
320 	n_dspl -= 5;
321 
322 	insnbuf[0] = 0xe9;
323 	*(s32 *)&insnbuf[1] = n_dspl;
324 
325 	repl_len = 5;
326 
327 done:
328 
329 	DPRINTK("final displ: 0x%08x, JMP 0x%lx",
330 		n_dspl, (unsigned long)orig_insn + n_dspl + repl_len);
331 }
332 
/*
 * If the first a->padlen bytes at @instr are all single-byte NOPs (0x90),
 * rewrite the last a->padlen bytes of the instruction with NOPs taken from
 * ideal_nops. Otherwise leave the instruction alone.
 *
 * "noinline" to cause control flow change and thus invalidate I$ and
 * cause refetch after modification.
 */
static void __init_or_module noinline optimize_nops(struct alt_instr *a, u8 *instr)
{
	unsigned long flags;
	int i;

	/*
	 * NOTE(review): this scans bytes [0, padlen) while the rewrite below
	 * targets bytes [instrlen - padlen, instrlen). The two ranges only
	 * coincide when instrlen == padlen - confirm that this is intended.
	 */
	for (i = 0; i < a->padlen; i++) {
		if (instr[i] != 0x90)
			return;
	}

	/* keep local interrupts off while the bytes are being rewritten */
	local_irq_save(flags);
	add_nops(instr + (a->instrlen - a->padlen), a->padlen);
	local_irq_restore(flags);

	DUMP_BYTES(instr, a->instrlen, "%px: [%d:%d) optimized NOPs: ",
		   instr, a->instrlen - a->padlen, a->padlen);
}
354 
/*
 * Replace instructions with better alternatives for this CPU type. This runs
 * before SMP is initialized to avoid SMP problems with self modifying code.
 * This implies that asymmetric systems where APs have less capabilities than
 * the boot processor are not handled. Tough. Make sure you disable such
 * features by hand.
 *
 * Marked "noinline" to cause control flow change and thus insn cache
 * to refetch changed I$ lines.
 *
 * @start: first entry of the alt_instr table to process
 * @end:   one past the last entry to process
 */
void __init_or_module noinline apply_alternatives(struct alt_instr *start,
						  struct alt_instr *end)
{
	struct alt_instr *a;
	u8 *instr, *replacement;
	u8 insnbuf[MAX_PATCH_LEN];

	DPRINTK("alt table %px, -> %px", start, end);
	/*
	 * The scan order should be from start to end. A later scanned
	 * alternative code can overwrite previously scanned alternative code.
	 * Some kernel functions (e.g. memcpy, memset, etc) use this order to
	 * patch code.
	 *
	 * So be careful if you want to change the scan order to any other
	 * order.
	 */
	for (a = start; a < end; a++) {
		int insnbuf_sz = 0;

		/* both offsets are relative to the address of their own field */
		instr = (u8 *)&a->instr_offset + a->instr_offset;
		replacement = (u8 *)&a->repl_offset + a->repl_offset;
		BUG_ON(a->instrlen > sizeof(insnbuf));
		BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32);
		/*
		 * Feature not present: keep the original instruction and only
		 * try to tidy up its NOP padding when there is more than one
		 * byte of it.
		 */
		if (!boot_cpu_has(a->cpuid)) {
			if (a->padlen > 1)
				optimize_nops(a, instr);

			continue;
		}

		DPRINTK("feat: %d*32+%d, old: (%px len: %d), repl: (%px, len: %d), pad: %d",
			a->cpuid >> 5,
			a->cpuid & 0x1f,
			instr, a->instrlen,
			replacement, a->replacementlen, a->padlen);

		DUMP_BYTES(instr, a->instrlen, "%px: old_insn: ", instr);
		DUMP_BYTES(replacement, a->replacementlen, "%px: rpl_insn: ", replacement);

		/* build the final bytes in a scratch buffer first */
		memcpy(insnbuf, replacement, a->replacementlen);
		insnbuf_sz = a->replacementlen;

		/*
		 * 0xe8 is a relative CALL; fix the offset so that it still
		 * reaches the same target from the patch site.
		 *
		 * Instruction length is checked before the opcode to avoid
		 * accessing uninitialized bytes for zero-length replacements.
		 */
		if (a->replacementlen == 5 && *insnbuf == 0xe8) {
			*(s32 *)(insnbuf + 1) += replacement - instr;
			DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx",
				*(s32 *)(insnbuf + 1),
				(unsigned long)instr + *(s32 *)(insnbuf + 1) + 5);
		}

		/* JMP replacements get their displacement recomputed */
		if (a->replacementlen && is_jmp(replacement[0]))
			recompute_jump(a, instr, replacement, insnbuf);

		/* pad a shorter replacement with NOPs up to the original length */
		if (a->instrlen > a->replacementlen) {
			add_nops(insnbuf + a->replacementlen,
				 a->instrlen - a->replacementlen);
			insnbuf_sz += a->instrlen - a->replacementlen;
		}
		DUMP_BYTES(insnbuf, insnbuf_sz, "%px: final_insn: ", instr);

		text_poke_early(instr, insnbuf, insnbuf_sz);
	}
}
434 
435 #ifdef CONFIG_SMP
436 static void alternatives_smp_lock(const s32 *start, const s32 *end,
437 				  u8 *text, u8 *text_end)
438 {
439 	const s32 *poff;
440 
441 	for (poff = start; poff < end; poff++) {
442 		u8 *ptr = (u8 *)poff + *poff;
443 
444 		if (!*poff || ptr < text || ptr >= text_end)
445 			continue;
446 		/* turn DS segment override prefix into lock prefix */
447 		if (*ptr == 0x3e)
448 			text_poke(ptr, ((unsigned char []){0xf0}), 1);
449 	}
450 }
451 
452 static void alternatives_smp_unlock(const s32 *start, const s32 *end,
453 				    u8 *text, u8 *text_end)
454 {
455 	const s32 *poff;
456 
457 	for (poff = start; poff < end; poff++) {
458 		u8 *ptr = (u8 *)poff + *poff;
459 
460 		if (!*poff || ptr < text || ptr >= text_end)
461 			continue;
462 		/* turn lock prefix into DS segment override prefix */
463 		if (*ptr == 0xf0)
464 			text_poke(ptr, ((unsigned char []){0x3E}), 1);
465 	}
466 }
467 
/*
 * One registered set of lock prefixes, filled in by
 * alternatives_smp_module_add().
 */
struct smp_alt_module {
	/* owner; alternative_instructions() passes NULL and "core kernel" */
	struct module	*mod;
	char		*name;

	/* ptrs to lock prefixes */
	const s32	*locks;
	const s32	*locks_end;

	/* .text segment, needed to avoid patching init code ;) */
	u8		*text;
	u8		*text_end;

	/* link in smp_alt_modules */
	struct list_head next;
};
/* All registered smp_alt_module entries. */
static LIST_HEAD(smp_alt_modules);
/* True while the lock prefixes are patched to their UP variant. */
static bool uniproc_patched = false;	/* protected by text_mutex */
485 
486 void __init_or_module alternatives_smp_module_add(struct module *mod,
487 						  char *name,
488 						  void *locks, void *locks_end,
489 						  void *text,  void *text_end)
490 {
491 	struct smp_alt_module *smp;
492 
493 	mutex_lock(&text_mutex);
494 	if (!uniproc_patched)
495 		goto unlock;
496 
497 	if (num_possible_cpus() == 1)
498 		/* Don't bother remembering, we'll never have to undo it. */
499 		goto smp_unlock;
500 
501 	smp = kzalloc(sizeof(*smp), GFP_KERNEL);
502 	if (NULL == smp)
503 		/* we'll run the (safe but slow) SMP code then ... */
504 		goto unlock;
505 
506 	smp->mod	= mod;
507 	smp->name	= name;
508 	smp->locks	= locks;
509 	smp->locks_end	= locks_end;
510 	smp->text	= text;
511 	smp->text_end	= text_end;
512 	DPRINTK("locks %p -> %p, text %p -> %p, name %s\n",
513 		smp->locks, smp->locks_end,
514 		smp->text, smp->text_end, smp->name);
515 
516 	list_add_tail(&smp->next, &smp_alt_modules);
517 smp_unlock:
518 	alternatives_smp_unlock(locks, locks_end, text, text_end);
519 unlock:
520 	mutex_unlock(&text_mutex);
521 }
522 
523 void __init_or_module alternatives_smp_module_del(struct module *mod)
524 {
525 	struct smp_alt_module *item;
526 
527 	mutex_lock(&text_mutex);
528 	list_for_each_entry(item, &smp_alt_modules, next) {
529 		if (mod != item->mod)
530 			continue;
531 		list_del(&item->next);
532 		kfree(item);
533 		break;
534 	}
535 	mutex_unlock(&text_mutex);
536 }
537 
538 void alternatives_enable_smp(void)
539 {
540 	struct smp_alt_module *mod;
541 
542 	/* Why bother if there are no other CPUs? */
543 	BUG_ON(num_possible_cpus() == 1);
544 
545 	mutex_lock(&text_mutex);
546 
547 	if (uniproc_patched) {
548 		pr_info("switching to SMP code\n");
549 		BUG_ON(num_online_cpus() != 1);
550 		clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
551 		clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
552 		list_for_each_entry(mod, &smp_alt_modules, next)
553 			alternatives_smp_lock(mod->locks, mod->locks_end,
554 					      mod->text, mod->text_end);
555 		uniproc_patched = false;
556 	}
557 	mutex_unlock(&text_mutex);
558 }
559 
560 /*
561  * Return 1 if the address range is reserved for SMP-alternatives.
562  * Must hold text_mutex.
563  */
564 int alternatives_text_reserved(void *start, void *end)
565 {
566 	struct smp_alt_module *mod;
567 	const s32 *poff;
568 	u8 *text_start = start;
569 	u8 *text_end = end;
570 
571 	lockdep_assert_held(&text_mutex);
572 
573 	list_for_each_entry(mod, &smp_alt_modules, next) {
574 		if (mod->text > text_end || mod->text_end < text_start)
575 			continue;
576 		for (poff = mod->locks; poff < mod->locks_end; poff++) {
577 			const u8 *ptr = (const u8 *)poff + *poff;
578 
579 			if (text_start <= ptr && text_end > ptr)
580 				return 1;
581 		}
582 	}
583 
584 	return 0;
585 }
586 #endif /* CONFIG_SMP */
587 
588 #ifdef CONFIG_PARAVIRT
589 void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
590 				     struct paravirt_patch_site *end)
591 {
592 	struct paravirt_patch_site *p;
593 	char insnbuf[MAX_PATCH_LEN];
594 
595 	for (p = start; p < end; p++) {
596 		unsigned int used;
597 
598 		BUG_ON(p->len > MAX_PATCH_LEN);
599 		/* prep the buffer with the original instructions */
600 		memcpy(insnbuf, p->instr, p->len);
601 		used = pv_ops.init.patch(p->instrtype, insnbuf,
602 					 (unsigned long)p->instr, p->len);
603 
604 		BUG_ON(used > p->len);
605 
606 		/* Pad the rest with nops */
607 		add_nops(insnbuf + used, p->len - used);
608 		text_poke_early(p->instr, insnbuf, p->len);
609 	}
610 }
/*
 * Bounds of a paravirt_patch_site table.
 * NOTE(review): alternative_instructions() passes __parainstructions and
 * __parainstructions_end to apply_paravirt(); these two names are not
 * referenced in the visible part of this file - confirm they are needed.
 */
extern struct paravirt_patch_site __start_parainstructions[],
	__stop_parainstructions[];
613 #endif	/* CONFIG_PARAVIRT */
614 
/*
 * Boot-time entry point: with NMIs stopped, apply the alternatives table,
 * optionally patch the lock prefixes to their UP variant, apply the paravirt
 * patches, then restart NMIs and set alternatives_patched.
 */
void __init alternative_instructions(void)
{
	/* The patching is not fully atomic, so try to avoid local interruptions
	   that might execute the to be patched code.
	   Other CPUs are not running. */
	stop_nmi();

	/*
	 * Don't stop machine check exceptions while patching.
	 * MCEs only happen when something got corrupted and in this
	 * case we must do something about the corruption.
	 * Ignoring it is worse than an unlikely patching race.
	 * Also machine checks tend to be broadcast and if one CPU
	 * goes into machine check the others follow quickly, so we don't
	 * expect a machine check to cause undue problems during code
	 * patching.
	 */

	apply_alternatives(__alt_instructions, __alt_instructions_end);

#ifdef CONFIG_SMP
	/* Patch to UP if other cpus not imminent. */
	if (!noreplace_smp && (num_present_cpus() == 1 || setup_max_cpus <= 1)) {
		uniproc_patched = true;
		alternatives_smp_module_add(NULL, "core kernel",
					    __smp_locks, __smp_locks_end,
					    _text, _etext);
	}

	/*
	 * The lock table is released when the kernel was not UP-patched, or
	 * when only one CPU is possible (nothing was remembered for undoing).
	 */
	if (!uniproc_patched || num_possible_cpus() == 1)
		free_init_pages("SMP alternatives",
				(unsigned long)__smp_locks,
				(unsigned long)__smp_locks_end);
#endif

	apply_paravirt(__parainstructions, __parainstructions_end);

	restart_nmi();
	alternatives_patched = 1;
}
655 
/**
 * text_poke_early - Update instructions on a live kernel at boot time
 * @addr: address to modify
 * @opcode: source of the copy
 * @len: length to copy
 *
 * When you use this code to patch more than one byte of an instruction
 * you need to make sure that other CPUs cannot execute this code in parallel.
 * Also no thread must be currently preempted in the middle of these
 * instructions. And on the local CPU you need to be protected against NMI or
 * MCE handlers seeing an inconsistent instruction while you patch.
 *
 * The bytes are written with a plain memcpy() to @addr while local
 * interrupts are disabled, followed by sync_core().
 *
 * Return: @addr
 */
void *__init_or_module text_poke_early(void *addr, const void *opcode,
					      size_t len)
{
	unsigned long flags;
	local_irq_save(flags);
	memcpy(addr, opcode, len);
	local_irq_restore(flags);
	sync_core();
	/* Could also do a CLFLUSH here to speed up CPU recovery; but
	   that causes hangs on some VIA CPUs. */
	return addr;
}
680 
/**
 * text_poke - Update instructions on a live kernel
 * @addr: address to modify
 * @opcode: source of the copy
 * @len: length to copy
 *
 * Only atomic text poke/set should be allowed when not doing early patching.
 * It means the size must be writable atomically and the address must be aligned
 * in a way that permits an atomic write. It also makes sure we fit on a single
 * page.
 *
 * The write goes through a temporary fixmap mapping of the target page(s)
 * rather than through @addr itself. The caller must hold text_mutex.
 *
 * Return: @addr
 */
void *text_poke(void *addr, const void *opcode, size_t len)
{
	unsigned long flags;
	char *vaddr;
	struct page *pages[2];
	int i;

	/*
	 * While boot memory allocator is running we cannot use struct
	 * pages as they are not yet initialized.
	 */
	BUG_ON(!after_bootmem);

	lockdep_assert_held(&text_mutex);

	/* look up the page holding @addr and the page following it */
	if (!core_kernel_text((unsigned long)addr)) {
		pages[0] = vmalloc_to_page(addr);
		pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
	} else {
		pages[0] = virt_to_page(addr);
		WARN_ON(!PageReserved(pages[0]));
		pages[1] = virt_to_page(addr + PAGE_SIZE);
	}
	BUG_ON(!pages[0]);
	local_irq_save(flags);
	set_fixmap(FIX_TEXT_POKE0, page_to_phys(pages[0]));
	if (pages[1])
		set_fixmap(FIX_TEXT_POKE1, page_to_phys(pages[1]));
	vaddr = (char *)fix_to_virt(FIX_TEXT_POKE0);
	/* write at the same in-page offset inside the fixmap alias */
	memcpy(&vaddr[(unsigned long)addr & ~PAGE_MASK], opcode, len);
	clear_fixmap(FIX_TEXT_POKE0);
	if (pages[1])
		clear_fixmap(FIX_TEXT_POKE1);
	local_flush_tlb();
	sync_core();
	/* Could also do a CLFLUSH here to speed up CPU recovery; but
	   that causes hangs on some VIA CPUs. */
	/* read back through @addr to check that the new bytes are in place */
	for (i = 0; i < len; i++)
		BUG_ON(((char *)addr)[i] != ((char *)opcode)[i]);
	local_irq_restore(flags);
	return addr;
}
734 
/*
 * Callback handed to on_each_cpu() by text_poke_bp(): run sync_core().
 * @info is unused.
 */
static void do_sync_core(void *info)
{
	sync_core();
}
739 
/*
 * State shared between text_poke_bp() (writer) and poke_int3_handler()
 * (reader): whether a breakpoint-based patch is in flight, the address
 * execution is redirected to, and the instruction pointer value identifying
 * the temporary breakpoint.
 */
static bool bp_patching_in_progress;
static void *bp_int3_handler, *bp_int3_addr;
742 
/*
 * Check whether a breakpoint trap described by @regs was raised by the
 * temporary INT3 that text_poke_bp() places.
 *
 * Return: 1 after redirecting regs->ip to bp_int3_handler when patching is
 * in progress, the trap is not from user mode and regs->ip equals
 * bp_int3_addr; 0 otherwise, with @regs left untouched.
 */
int poke_int3_handler(struct pt_regs *regs)
{
	/*
	 * Having observed our INT3 instruction, we now must observe
	 * bp_patching_in_progress.
	 *
	 * 	in_progress = TRUE		INT3
	 * 	WMB				RMB
	 * 	write INT3			if (in_progress)
	 *
	 * Idem for bp_int3_handler.
	 */
	smp_rmb();

	if (likely(!bp_patching_in_progress))
		return 0;

	if (user_mode(regs) || regs->ip != (unsigned long)bp_int3_addr)
		return 0;

	/* set up the specified breakpoint handler */
	regs->ip = (unsigned long) bp_int3_handler;

	return 1;

}
769 
770 /**
771  * text_poke_bp() -- update instructions on live kernel on SMP
772  * @addr:	address to patch
773  * @opcode:	opcode of new instruction
774  * @len:	length to copy
775  * @handler:	address to jump to when the temporary breakpoint is hit
776  *
777  * Modify multi-byte instruction by using int3 breakpoint on SMP.
778  * We completely avoid stop_machine() here, and achieve the
779  * synchronization using int3 breakpoint.
780  *
781  * The way it is done:
782  *	- add a int3 trap to the address that will be patched
783  *	- sync cores
784  *	- update all but the first byte of the patched range
785  *	- sync cores
786  *	- replace the first byte (int3) by the first byte of
787  *	  replacing opcode
788  *	- sync cores
789  */
790 void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
791 {
792 	unsigned char int3 = 0xcc;
793 
794 	bp_int3_handler = handler;
795 	bp_int3_addr = (u8 *)addr + sizeof(int3);
796 	bp_patching_in_progress = true;
797 
798 	lockdep_assert_held(&text_mutex);
799 
800 	/*
801 	 * Corresponding read barrier in int3 notifier for making sure the
802 	 * in_progress and handler are correctly ordered wrt. patching.
803 	 */
804 	smp_wmb();
805 
806 	text_poke(addr, &int3, sizeof(int3));
807 
808 	on_each_cpu(do_sync_core, NULL, 1);
809 
810 	if (len - sizeof(int3) > 0) {
811 		/* patch all but the first byte */
812 		text_poke((char *)addr + sizeof(int3),
813 			  (const char *) opcode + sizeof(int3),
814 			  len - sizeof(int3));
815 		/*
816 		 * According to Intel, this core syncing is very likely
817 		 * not necessary and we'd be safe even without it. But
818 		 * better safe than sorry (plus there's not only Intel).
819 		 */
820 		on_each_cpu(do_sync_core, NULL, 1);
821 	}
822 
823 	/* patch the first byte */
824 	text_poke(addr, opcode, sizeof(int3));
825 
826 	on_each_cpu(do_sync_core, NULL, 1);
827 	/*
828 	 * sync_core() implies an smp_mb() and orders this store against
829 	 * the writing of the new instruction.
830 	 */
831 	bp_patching_in_progress = false;
832 
833 	return addr;
834 }
835 
836