1 // SPDX-License-Identifier: GPL-2.0-only 2 #define pr_fmt(fmt) "SMP alternatives: " fmt 3 4 #include <linux/module.h> 5 #include <linux/sched.h> 6 #include <linux/perf_event.h> 7 #include <linux/mutex.h> 8 #include <linux/list.h> 9 #include <linux/stringify.h> 10 #include <linux/highmem.h> 11 #include <linux/mm.h> 12 #include <linux/vmalloc.h> 13 #include <linux/memory.h> 14 #include <linux/stop_machine.h> 15 #include <linux/slab.h> 16 #include <linux/kdebug.h> 17 #include <linux/kprobes.h> 18 #include <linux/mmu_context.h> 19 #include <linux/bsearch.h> 20 #include <linux/sync_core.h> 21 #include <asm/text-patching.h> 22 #include <asm/alternative.h> 23 #include <asm/sections.h> 24 #include <asm/mce.h> 25 #include <asm/nmi.h> 26 #include <asm/cacheflush.h> 27 #include <asm/tlbflush.h> 28 #include <asm/insn.h> 29 #include <asm/io.h> 30 #include <asm/fixmap.h> 31 #include <asm/paravirt.h> 32 #include <asm/asm-prototypes.h> 33 34 int __read_mostly alternatives_patched; 35 36 EXPORT_SYMBOL_GPL(alternatives_patched); 37 38 #define MAX_PATCH_LEN (255-1) 39 40 static int __initdata_or_module debug_alternative; 41 42 static int __init debug_alt(char *str) 43 { 44 debug_alternative = 1; 45 return 1; 46 } 47 __setup("debug-alternative", debug_alt); 48 49 static int noreplace_smp; 50 51 static int __init setup_noreplace_smp(char *str) 52 { 53 noreplace_smp = 1; 54 return 1; 55 } 56 __setup("noreplace-smp", setup_noreplace_smp); 57 58 #define DPRINTK(fmt, args...) \ 59 do { \ 60 if (debug_alternative) \ 61 printk(KERN_DEBUG pr_fmt(fmt) "\n", ##args); \ 62 } while (0) 63 64 #define DUMP_BYTES(buf, len, fmt, args...) \ 65 do { \ 66 if (unlikely(debug_alternative)) { \ 67 int j; \ 68 \ 69 if (!(len)) \ 70 break; \ 71 \ 72 printk(KERN_DEBUG pr_fmt(fmt), ##args); \ 73 for (j = 0; j < (len) - 1; j++) \ 74 printk(KERN_CONT "%02hhx ", buf[j]); \ 75 printk(KERN_CONT "%02hhx\n", buf[j]); \ 76 } \ 77 } while (0) 78 79 static const unsigned char x86nops[] = 80 { 81 BYTES_NOP1, 82 BYTES_NOP2, 83 BYTES_NOP3, 84 BYTES_NOP4, 85 BYTES_NOP5, 86 BYTES_NOP6, 87 BYTES_NOP7, 88 BYTES_NOP8, 89 }; 90 91 const unsigned char * const x86_nops[ASM_NOP_MAX+1] = 92 { 93 NULL, 94 x86nops, 95 x86nops + 1, 96 x86nops + 1 + 2, 97 x86nops + 1 + 2 + 3, 98 x86nops + 1 + 2 + 3 + 4, 99 x86nops + 1 + 2 + 3 + 4 + 5, 100 x86nops + 1 + 2 + 3 + 4 + 5 + 6, 101 x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, 102 }; 103 104 /* Use this to add nops to a buffer, then text_poke the whole buffer. */ 105 static void __init_or_module add_nops(void *insns, unsigned int len) 106 { 107 while (len > 0) { 108 unsigned int noplen = len; 109 if (noplen > ASM_NOP_MAX) 110 noplen = ASM_NOP_MAX; 111 memcpy(insns, x86_nops[noplen], noplen); 112 insns += noplen; 113 len -= noplen; 114 } 115 } 116 117 extern s32 __retpoline_sites[], __retpoline_sites_end[]; 118 extern s32 __return_sites[], __return_sites_end[]; 119 extern s32 __ibt_endbr_seal[], __ibt_endbr_seal_end[]; 120 extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; 121 extern s32 __smp_locks[], __smp_locks_end[]; 122 void text_poke_early(void *addr, const void *opcode, size_t len); 123 124 /* 125 * Are we looking at a near JMP with a 1 or 4-byte displacement. 
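 * (A sketch of the encodings this helper matches, per the x86 opcode map:
 *  0xEB is JMP rel8 and 0xE9 is JMP rel32.)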
126 */ 127 static inline bool is_jmp(const u8 opcode) 128 { 129 return opcode == 0xeb || opcode == 0xe9; 130 } 131 132 static void __init_or_module 133 recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insn_buff) 134 { 135 u8 *next_rip, *tgt_rip; 136 s32 n_dspl, o_dspl; 137 int repl_len; 138 139 if (a->replacementlen != 5) 140 return; 141 142 o_dspl = *(s32 *)(insn_buff + 1); 143 144 /* next_rip of the replacement JMP */ 145 next_rip = repl_insn + a->replacementlen; 146 /* target rip of the replacement JMP */ 147 tgt_rip = next_rip + o_dspl; 148 n_dspl = tgt_rip - orig_insn; 149 150 DPRINTK("target RIP: %px, new_displ: 0x%x", tgt_rip, n_dspl); 151 152 if (tgt_rip - orig_insn >= 0) { 153 if (n_dspl - 2 <= 127) 154 goto two_byte_jmp; 155 else 156 goto five_byte_jmp; 157 /* negative offset */ 158 } else { 159 if (((n_dspl - 2) & 0xff) == (n_dspl - 2)) 160 goto two_byte_jmp; 161 else 162 goto five_byte_jmp; 163 } 164 165 two_byte_jmp: 166 n_dspl -= 2; 167 168 insn_buff[0] = 0xeb; 169 insn_buff[1] = (s8)n_dspl; 170 add_nops(insn_buff + 2, 3); 171 172 repl_len = 2; 173 goto done; 174 175 five_byte_jmp: 176 n_dspl -= 5; 177 178 insn_buff[0] = 0xe9; 179 *(s32 *)&insn_buff[1] = n_dspl; 180 181 repl_len = 5; 182 183 done: 184 185 DPRINTK("final displ: 0x%08x, JMP 0x%lx", 186 n_dspl, (unsigned long)orig_insn + n_dspl + repl_len); 187 } 188 189 /* 190 * optimize_nops_range() - Optimize a sequence of single byte NOPs (0x90) 191 * 192 * @instr: instruction byte stream 193 * @instrlen: length of the above 194 * @off: offset within @instr where the first NOP has been detected 195 * 196 * Return: number of NOPs found (and replaced). 197 */ 198 static __always_inline int optimize_nops_range(u8 *instr, u8 instrlen, int off) 199 { 200 unsigned long flags; 201 int i = off, nnops; 202 203 while (i < instrlen) { 204 if (instr[i] != 0x90) 205 break; 206 207 i++; 208 } 209 210 nnops = i - off; 211 212 if (nnops <= 1) 213 return nnops; 214 215 local_irq_save(flags); 216 add_nops(instr + off, nnops); 217 local_irq_restore(flags); 218 219 DUMP_BYTES(instr, instrlen, "%px: [%d:%d) optimized NOPs: ", instr, off, i); 220 221 return nnops; 222 } 223 224 /* 225 * "noinline" to cause control flow change and thus invalidate I$ and 226 * cause refetch after modification. 227 */ 228 static void __init_or_module noinline optimize_nops(u8 *instr, size_t len) 229 { 230 struct insn insn; 231 int i = 0; 232 233 /* 234 * Jump over the non-NOP insns and optimize single-byte NOPs into bigger 235 * ones. 236 */ 237 for (;;) { 238 if (insn_decode_kernel(&insn, &instr[i])) 239 return; 240 241 /* 242 * See if this and any potentially following NOPs can be 243 * optimized. 244 */ 245 if (insn.length == 1 && insn.opcode.bytes[0] == 0x90) 246 i += optimize_nops_range(instr, len, i); 247 else 248 i += insn.length; 249 250 if (i >= len) 251 return; 252 } 253 } 254 255 /* 256 * Replace instructions with better alternatives for this CPU type. This runs 257 * before SMP is initialized to avoid SMP problems with self modifying code. 258 * This implies that asymmetric systems where APs have less capabilities than 259 * the boot processor are not handled. Tough. Make sure you disable such 260 * features by hand. 261 * 262 * Marked "noinline" to cause control flow change and thus insn cache 263 * to refetch changed I$ lines. 
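 *
 * For illustration only -- a sketch, not code from this file; the function
 * names and X86_FEATURE_FOO are hypothetical -- a typical site created via
 * the macros in <asm/alternative.h> looks like:
 *
 *	alternative("call default_impl", "call optimized_impl", X86_FEATURE_FOO);
 *
 * which places the old instruction in .text, the replacement in
 * .altinstr_replacement and a struct alt_instr entry in .altinstructions;
 * apply_alternatives() below rewrites the old bytes with the replacement
 * (fixing up the CALL displacement) once X86_FEATURE_FOO is known to be set.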
264 */ 265 void __init_or_module noinline apply_alternatives(struct alt_instr *start, 266 struct alt_instr *end) 267 { 268 struct alt_instr *a; 269 u8 *instr, *replacement; 270 u8 insn_buff[MAX_PATCH_LEN]; 271 272 DPRINTK("alt table %px, -> %px", start, end); 273 /* 274 * The scan order should be from start to end. A later scanned 275 * alternative code can overwrite previously scanned alternative code. 276 * Some kernel functions (e.g. memcpy, memset, etc) use this order to 277 * patch code. 278 * 279 * So be careful if you want to change the scan order to any other 280 * order. 281 */ 282 for (a = start; a < end; a++) { 283 int insn_buff_sz = 0; 284 /* Mask away "NOT" flag bit for feature to test. */ 285 u16 feature = a->cpuid & ~ALTINSTR_FLAG_INV; 286 287 instr = (u8 *)&a->instr_offset + a->instr_offset; 288 replacement = (u8 *)&a->repl_offset + a->repl_offset; 289 BUG_ON(a->instrlen > sizeof(insn_buff)); 290 BUG_ON(feature >= (NCAPINTS + NBUGINTS) * 32); 291 292 /* 293 * Patch if either: 294 * - feature is present 295 * - feature not present but ALTINSTR_FLAG_INV is set to mean, 296 * patch if feature is *NOT* present. 297 */ 298 if (!boot_cpu_has(feature) == !(a->cpuid & ALTINSTR_FLAG_INV)) 299 goto next; 300 301 DPRINTK("feat: %s%d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d)", 302 (a->cpuid & ALTINSTR_FLAG_INV) ? "!" : "", 303 feature >> 5, 304 feature & 0x1f, 305 instr, instr, a->instrlen, 306 replacement, a->replacementlen); 307 308 DUMP_BYTES(instr, a->instrlen, "%px: old_insn: ", instr); 309 DUMP_BYTES(replacement, a->replacementlen, "%px: rpl_insn: ", replacement); 310 311 memcpy(insn_buff, replacement, a->replacementlen); 312 insn_buff_sz = a->replacementlen; 313 314 /* 315 * 0xe8 is a relative jump; fix the offset. 316 * 317 * Instruction length is checked before the opcode to avoid 318 * accessing uninitialized bytes for zero-length replacements. 319 */ 320 if (a->replacementlen == 5 && *insn_buff == 0xe8) { 321 *(s32 *)(insn_buff + 1) += replacement - instr; 322 DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx", 323 *(s32 *)(insn_buff + 1), 324 (unsigned long)instr + *(s32 *)(insn_buff + 1) + 5); 325 } 326 327 if (a->replacementlen && is_jmp(replacement[0])) 328 recompute_jump(a, instr, replacement, insn_buff); 329 330 for (; insn_buff_sz < a->instrlen; insn_buff_sz++) 331 insn_buff[insn_buff_sz] = 0x90; 332 333 DUMP_BYTES(insn_buff, insn_buff_sz, "%px: final_insn: ", instr); 334 335 text_poke_early(instr, insn_buff, insn_buff_sz); 336 337 next: 338 optimize_nops(instr, a->instrlen); 339 } 340 } 341 342 #if defined(CONFIG_RETPOLINE) && defined(CONFIG_OBJTOOL) 343 344 /* 345 * CALL/JMP *%\reg 346 */ 347 static int emit_indirect(int op, int reg, u8 *bytes) 348 { 349 int i = 0; 350 u8 modrm; 351 352 switch (op) { 353 case CALL_INSN_OPCODE: 354 modrm = 0x10; /* Reg = 2; CALL r/m */ 355 break; 356 357 case JMP32_INSN_OPCODE: 358 modrm = 0x20; /* Reg = 4; JMP r/m */ 359 break; 360 361 default: 362 WARN_ON_ONCE(1); 363 return -1; 364 } 365 366 if (reg >= 8) { 367 bytes[i++] = 0x41; /* REX.B prefix */ 368 reg -= 8; 369 } 370 371 modrm |= 0xc0; /* Mod = 3 */ 372 modrm += reg; 373 374 bytes[i++] = 0xff; /* opcode */ 375 bytes[i++] = modrm; 376 377 return i; 378 } 379 380 /* 381 * Rewrite the compiler generated retpoline thunk calls. 382 * 383 * For spectre_v2=off (!X86_FEATURE_RETPOLINE), rewrite them into immediate 384 * indirect instructions, avoiding the extra indirection. 
385 * 386 * For example, convert: 387 * 388 * CALL __x86_indirect_thunk_\reg 389 * 390 * into: 391 * 392 * CALL *%\reg 393 * 394 * It also tries to inline spectre_v2=retpoline,lfence when size permits. 395 */ 396 static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes) 397 { 398 retpoline_thunk_t *target; 399 int reg, ret, i = 0; 400 u8 op, cc; 401 402 target = addr + insn->length + insn->immediate.value; 403 reg = target - __x86_indirect_thunk_array; 404 405 if (WARN_ON_ONCE(reg & ~0xf)) 406 return -1; 407 408 /* If anyone ever does: CALL/JMP *%rsp, we're in deep trouble. */ 409 BUG_ON(reg == 4); 410 411 if (cpu_feature_enabled(X86_FEATURE_RETPOLINE) && 412 !cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) 413 return -1; 414 415 op = insn->opcode.bytes[0]; 416 417 /* 418 * Convert: 419 * 420 * Jcc.d32 __x86_indirect_thunk_\reg 421 * 422 * into: 423 * 424 * Jncc.d8 1f 425 * [ LFENCE ] 426 * JMP *%\reg 427 * [ NOP ] 428 * 1: 429 */ 430 /* Jcc.d32 second opcode byte is in the range: 0x80-0x8f */ 431 if (op == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80) { 432 cc = insn->opcode.bytes[1] & 0xf; 433 cc ^= 1; /* invert condition */ 434 435 bytes[i++] = 0x70 + cc; /* Jcc.d8 */ 436 bytes[i++] = insn->length - 2; /* sizeof(Jcc.d8) == 2 */ 437 438 /* Continue as if: JMP.d32 __x86_indirect_thunk_\reg */ 439 op = JMP32_INSN_OPCODE; 440 } 441 442 /* 443 * For RETPOLINE_LFENCE: prepend the indirect CALL/JMP with an LFENCE. 444 */ 445 if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) { 446 bytes[i++] = 0x0f; 447 bytes[i++] = 0xae; 448 bytes[i++] = 0xe8; /* LFENCE */ 449 } 450 451 ret = emit_indirect(op, reg, bytes + i); 452 if (ret < 0) 453 return ret; 454 i += ret; 455 456 /* 457 * The compiler is supposed to EMIT an INT3 after every unconditional 458 * JMP instruction due to AMD BTC. However, if the compiler is too old 459 * or SLS isn't enabled, we still need an INT3 after indirect JMPs 460 * even on Intel. 461 */ 462 if (op == JMP32_INSN_OPCODE && i < insn->length) 463 bytes[i++] = INT3_INSN_OPCODE; 464 465 for (; i < insn->length;) 466 bytes[i++] = BYTES_NOP1; 467 468 return i; 469 } 470 471 /* 472 * Generated by 'objtool --retpoline'. 473 */ 474 void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) 475 { 476 s32 *s; 477 478 for (s = start; s < end; s++) { 479 void *addr = (void *)s + *s; 480 struct insn insn; 481 int len, ret; 482 u8 bytes[16]; 483 u8 op1, op2; 484 485 ret = insn_decode_kernel(&insn, addr); 486 if (WARN_ON_ONCE(ret < 0)) 487 continue; 488 489 op1 = insn.opcode.bytes[0]; 490 op2 = insn.opcode.bytes[1]; 491 492 switch (op1) { 493 case CALL_INSN_OPCODE: 494 case JMP32_INSN_OPCODE: 495 break; 496 497 case 0x0f: /* escape */ 498 if (op2 >= 0x80 && op2 <= 0x8f) 499 break; 500 fallthrough; 501 default: 502 WARN_ON_ONCE(1); 503 continue; 504 } 505 506 DPRINTK("retpoline at: %pS (%px) len: %d to: %pS", 507 addr, addr, insn.length, 508 addr + insn.length + insn.immediate.value); 509 510 len = patch_retpoline(addr, &insn, bytes); 511 if (len == insn.length) { 512 optimize_nops(bytes, len); 513 DUMP_BYTES(((u8*)addr), len, "%px: orig: ", addr); 514 DUMP_BYTES(((u8*)bytes), len, "%px: repl: ", addr); 515 text_poke_early(addr, bytes, len); 516 } 517 } 518 } 519 520 #ifdef CONFIG_RETHUNK 521 /* 522 * Rewrite the compiler generated return thunk tail-calls. 
523 * 524 * For example, convert: 525 * 526 * JMP __x86_return_thunk 527 * 528 * into: 529 * 530 * RET 531 */ 532 static int patch_return(void *addr, struct insn *insn, u8 *bytes) 533 { 534 int i = 0; 535 536 if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) 537 return -1; 538 539 bytes[i++] = RET_INSN_OPCODE; 540 541 for (; i < insn->length;) 542 bytes[i++] = INT3_INSN_OPCODE; 543 544 return i; 545 } 546 547 void __init_or_module noinline apply_returns(s32 *start, s32 *end) 548 { 549 s32 *s; 550 551 for (s = start; s < end; s++) { 552 void *dest = NULL, *addr = (void *)s + *s; 553 struct insn insn; 554 int len, ret; 555 u8 bytes[16]; 556 u8 op; 557 558 ret = insn_decode_kernel(&insn, addr); 559 if (WARN_ON_ONCE(ret < 0)) 560 continue; 561 562 op = insn.opcode.bytes[0]; 563 if (op == JMP32_INSN_OPCODE) 564 dest = addr + insn.length + insn.immediate.value; 565 566 if (__static_call_fixup(addr, op, dest) || 567 WARN_ONCE(dest != &__x86_return_thunk, 568 "missing return thunk: %pS-%pS: %*ph", 569 addr, dest, 5, addr)) 570 continue; 571 572 DPRINTK("return thunk at: %pS (%px) len: %d to: %pS", 573 addr, addr, insn.length, 574 addr + insn.length + insn.immediate.value); 575 576 len = patch_return(addr, &insn, bytes); 577 if (len == insn.length) { 578 DUMP_BYTES(((u8*)addr), len, "%px: orig: ", addr); 579 DUMP_BYTES(((u8*)bytes), len, "%px: repl: ", addr); 580 text_poke_early(addr, bytes, len); 581 } 582 } 583 } 584 #else 585 void __init_or_module noinline apply_returns(s32 *start, s32 *end) { } 586 #endif /* CONFIG_RETHUNK */ 587 588 #else /* !CONFIG_RETPOLINE || !CONFIG_OBJTOOL */ 589 590 void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { } 591 void __init_or_module noinline apply_returns(s32 *start, s32 *end) { } 592 593 #endif /* CONFIG_RETPOLINE && CONFIG_OBJTOOL */ 594 595 #ifdef CONFIG_X86_KERNEL_IBT 596 597 /* 598 * Generated by: objtool --ibt 599 */ 600 void __init_or_module noinline apply_ibt_endbr(s32 *start, s32 *end) 601 { 602 s32 *s; 603 604 for (s = start; s < end; s++) { 605 u32 endbr, poison = gen_endbr_poison(); 606 void *addr = (void *)s + *s; 607 608 if (WARN_ON_ONCE(get_kernel_nofault(endbr, addr))) 609 continue; 610 611 if (WARN_ON_ONCE(!is_endbr(endbr))) 612 continue; 613 614 DPRINTK("ENDBR at: %pS (%px)", addr, addr); 615 616 /* 617 * When we have IBT, the lack of ENDBR will trigger #CP 618 */ 619 DUMP_BYTES(((u8*)addr), 4, "%px: orig: ", addr); 620 DUMP_BYTES(((u8*)&poison), 4, "%px: repl: ", addr); 621 text_poke_early(addr, &poison, 4); 622 } 623 } 624 625 #else 626 627 void __init_or_module noinline apply_ibt_endbr(s32 *start, s32 *end) { } 628 629 #endif /* CONFIG_X86_KERNEL_IBT */ 630 631 #ifdef CONFIG_SMP 632 static void alternatives_smp_lock(const s32 *start, const s32 *end, 633 u8 *text, u8 *text_end) 634 { 635 const s32 *poff; 636 637 for (poff = start; poff < end; poff++) { 638 u8 *ptr = (u8 *)poff + *poff; 639 640 if (!*poff || ptr < text || ptr >= text_end) 641 continue; 642 /* turn DS segment override prefix into lock prefix */ 643 if (*ptr == 0x3e) 644 text_poke(ptr, ((unsigned char []){0xf0}), 1); 645 } 646 } 647 648 static void alternatives_smp_unlock(const s32 *start, const s32 *end, 649 u8 *text, u8 *text_end) 650 { 651 const s32 *poff; 652 653 for (poff = start; poff < end; poff++) { 654 u8 *ptr = (u8 *)poff + *poff; 655 656 if (!*poff || ptr < text || ptr >= text_end) 657 continue; 658 /* turn lock prefix into DS segment override prefix */ 659 if (*ptr == 0xf0) 660 text_poke(ptr, ((unsigned char []){0x3E}), 1); 661 } 662 } 663 664 
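/*
 * Where the __smp_locks offsets walked above come from: LOCK_PREFIX in
 * <asm/alternative.h> records each emitted lock prefix, roughly (a sketch;
 * see the header for the authoritative definition):
 *
 *	.pushsection .smp_locks, "a"
 *	.balign 4
 *	.long 671f - .		# PC-relative offset of the prefix byte
 *	.popsection
 *	671:	lock; <insn>
 *
 * which is why alternatives_smp_lock()/unlock() can flip the byte at
 * "ptr" between the LOCK (0xf0) and DS (0x3e) prefixes.
 */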
struct smp_alt_module { 665 /* what is this ??? */ 666 struct module *mod; 667 char *name; 668 669 /* ptrs to lock prefixes */ 670 const s32 *locks; 671 const s32 *locks_end; 672 673 /* .text segment, needed to avoid patching init code ;) */ 674 u8 *text; 675 u8 *text_end; 676 677 struct list_head next; 678 }; 679 static LIST_HEAD(smp_alt_modules); 680 static bool uniproc_patched = false; /* protected by text_mutex */ 681 682 void __init_or_module alternatives_smp_module_add(struct module *mod, 683 char *name, 684 void *locks, void *locks_end, 685 void *text, void *text_end) 686 { 687 struct smp_alt_module *smp; 688 689 mutex_lock(&text_mutex); 690 if (!uniproc_patched) 691 goto unlock; 692 693 if (num_possible_cpus() == 1) 694 /* Don't bother remembering, we'll never have to undo it. */ 695 goto smp_unlock; 696 697 smp = kzalloc(sizeof(*smp), GFP_KERNEL); 698 if (NULL == smp) 699 /* we'll run the (safe but slow) SMP code then ... */ 700 goto unlock; 701 702 smp->mod = mod; 703 smp->name = name; 704 smp->locks = locks; 705 smp->locks_end = locks_end; 706 smp->text = text; 707 smp->text_end = text_end; 708 DPRINTK("locks %p -> %p, text %p -> %p, name %s\n", 709 smp->locks, smp->locks_end, 710 smp->text, smp->text_end, smp->name); 711 712 list_add_tail(&smp->next, &smp_alt_modules); 713 smp_unlock: 714 alternatives_smp_unlock(locks, locks_end, text, text_end); 715 unlock: 716 mutex_unlock(&text_mutex); 717 } 718 719 void __init_or_module alternatives_smp_module_del(struct module *mod) 720 { 721 struct smp_alt_module *item; 722 723 mutex_lock(&text_mutex); 724 list_for_each_entry(item, &smp_alt_modules, next) { 725 if (mod != item->mod) 726 continue; 727 list_del(&item->next); 728 kfree(item); 729 break; 730 } 731 mutex_unlock(&text_mutex); 732 } 733 734 void alternatives_enable_smp(void) 735 { 736 struct smp_alt_module *mod; 737 738 /* Why bother if there are no other CPUs? */ 739 BUG_ON(num_possible_cpus() == 1); 740 741 mutex_lock(&text_mutex); 742 743 if (uniproc_patched) { 744 pr_info("switching to SMP code\n"); 745 BUG_ON(num_online_cpus() != 1); 746 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP); 747 clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP); 748 list_for_each_entry(mod, &smp_alt_modules, next) 749 alternatives_smp_lock(mod->locks, mod->locks_end, 750 mod->text, mod->text_end); 751 uniproc_patched = false; 752 } 753 mutex_unlock(&text_mutex); 754 } 755 756 /* 757 * Return 1 if the address range is reserved for SMP-alternatives. 758 * Must hold text_mutex. 
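 * (Other text patchers are expected to call this so they do not modify a
 *  byte which alternatives_smp_lock()/unlock() may later rewrite.)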
759 */ 760 int alternatives_text_reserved(void *start, void *end) 761 { 762 struct smp_alt_module *mod; 763 const s32 *poff; 764 u8 *text_start = start; 765 u8 *text_end = end; 766 767 lockdep_assert_held(&text_mutex); 768 769 list_for_each_entry(mod, &smp_alt_modules, next) { 770 if (mod->text > text_end || mod->text_end < text_start) 771 continue; 772 for (poff = mod->locks; poff < mod->locks_end; poff++) { 773 const u8 *ptr = (const u8 *)poff + *poff; 774 775 if (text_start <= ptr && text_end > ptr) 776 return 1; 777 } 778 } 779 780 return 0; 781 } 782 #endif /* CONFIG_SMP */ 783 784 #ifdef CONFIG_PARAVIRT 785 void __init_or_module apply_paravirt(struct paravirt_patch_site *start, 786 struct paravirt_patch_site *end) 787 { 788 struct paravirt_patch_site *p; 789 char insn_buff[MAX_PATCH_LEN]; 790 791 for (p = start; p < end; p++) { 792 unsigned int used; 793 794 BUG_ON(p->len > MAX_PATCH_LEN); 795 /* prep the buffer with the original instructions */ 796 memcpy(insn_buff, p->instr, p->len); 797 used = paravirt_patch(p->type, insn_buff, (unsigned long)p->instr, p->len); 798 799 BUG_ON(used > p->len); 800 801 /* Pad the rest with nops */ 802 add_nops(insn_buff + used, p->len - used); 803 text_poke_early(p->instr, insn_buff, p->len); 804 } 805 } 806 extern struct paravirt_patch_site __start_parainstructions[], 807 __stop_parainstructions[]; 808 #endif /* CONFIG_PARAVIRT */ 809 810 /* 811 * Self-test for the INT3 based CALL emulation code. 812 * 813 * This exercises int3_emulate_call() to make sure INT3 pt_regs are set up 814 * properly and that there is a stack gap between the INT3 frame and the 815 * previous context. Without this gap doing a virtual PUSH on the interrupted 816 * stack would corrupt the INT3 IRET frame. 817 * 818 * See entry_{32,64}.S for more details. 819 */ 820 821 /* 822 * We define the int3_magic() function in assembly to control the calling 823 * convention such that we can 'call' it from assembly. 824 */ 825 826 extern void int3_magic(unsigned int *ptr); /* defined in asm */ 827 828 asm ( 829 " .pushsection .init.text, \"ax\", @progbits\n" 830 " .type int3_magic, @function\n" 831 "int3_magic:\n" 832 ANNOTATE_NOENDBR 833 " movl $1, (%" _ASM_ARG1 ")\n" 834 ASM_RET 835 " .size int3_magic, .-int3_magic\n" 836 " .popsection\n" 837 ); 838 839 extern void int3_selftest_ip(void); /* defined in asm below */ 840 841 static int __init 842 int3_exception_notify(struct notifier_block *self, unsigned long val, void *data) 843 { 844 unsigned long selftest = (unsigned long)&int3_selftest_ip; 845 struct die_args *args = data; 846 struct pt_regs *regs = args->regs; 847 848 OPTIMIZER_HIDE_VAR(selftest); 849 850 if (!regs || user_mode(regs)) 851 return NOTIFY_DONE; 852 853 if (val != DIE_INT3) 854 return NOTIFY_DONE; 855 856 if (regs->ip - INT3_INSN_SIZE != selftest) 857 return NOTIFY_DONE; 858 859 int3_emulate_call(regs, (unsigned long)&int3_magic); 860 return NOTIFY_STOP; 861 } 862 863 /* Must be noinline to ensure uniqueness of int3_selftest_ip. */ 864 static noinline void __init int3_selftest(void) 865 { 866 static __initdata struct notifier_block int3_exception_nb = { 867 .notifier_call = int3_exception_notify, 868 .priority = INT_MAX-1, /* last */ 869 }; 870 unsigned int val = 0; 871 872 BUG_ON(register_die_notifier(&int3_exception_nb)); 873 874 /* 875 * Basically: int3_magic(&val); but really complicated :-) 876 * 877 * INT3 padded with NOP to CALL_INSN_SIZE. The int3_exception_nb 878 * notifier above will emulate CALL for us. 
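 *
 * The five bytes emitted below are 0xCC 0x90 0x90 0x90 0x90: an INT3
 * padded with NOPs up to CALL_INSN_SIZE, so the emulated CALL returns to
 * the byte following the padding, exactly as if a real CALL sat there.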
879 */ 880 asm volatile ("int3_selftest_ip:\n\t" 881 ANNOTATE_NOENDBR 882 " int3; nop; nop; nop; nop\n\t" 883 : ASM_CALL_CONSTRAINT 884 : __ASM_SEL_RAW(a, D) (&val) 885 : "memory"); 886 887 BUG_ON(val != 1); 888 889 unregister_die_notifier(&int3_exception_nb); 890 } 891 892 void __init alternative_instructions(void) 893 { 894 int3_selftest(); 895 896 /* 897 * The patching is not fully atomic, so try to avoid local 898 * interruptions that might execute the to be patched code. 899 * Other CPUs are not running. 900 */ 901 stop_nmi(); 902 903 /* 904 * Don't stop machine check exceptions while patching. 905 * MCEs only happen when something got corrupted and in this 906 * case we must do something about the corruption. 907 * Ignoring it is worse than an unlikely patching race. 908 * Also machine checks tend to be broadcast and if one CPU 909 * goes into machine check the others follow quickly, so we don't 910 * expect a machine check to cause undue problems during to code 911 * patching. 912 */ 913 914 /* 915 * Paravirt patching and alternative patching can be combined to 916 * replace a function call with a short direct code sequence (e.g. 917 * by setting a constant return value instead of doing that in an 918 * external function). 919 * In order to make this work the following sequence is required: 920 * 1. set (artificial) features depending on used paravirt 921 * functions which can later influence alternative patching 922 * 2. apply paravirt patching (generally replacing an indirect 923 * function call with a direct one) 924 * 3. apply alternative patching (e.g. replacing a direct function 925 * call with a custom code sequence) 926 * Doing paravirt patching after alternative patching would clobber 927 * the optimization of the custom code with a function call again. 928 */ 929 paravirt_set_cap(); 930 931 /* 932 * First patch paravirt functions, such that we overwrite the indirect 933 * call with the direct call. 934 */ 935 apply_paravirt(__parainstructions, __parainstructions_end); 936 937 /* 938 * Rewrite the retpolines, must be done before alternatives since 939 * those can rewrite the retpoline thunks. 940 */ 941 apply_retpolines(__retpoline_sites, __retpoline_sites_end); 942 apply_returns(__return_sites, __return_sites_end); 943 944 /* 945 * Then patch alternatives, such that those paravirt calls that are in 946 * alternatives can be overwritten by their immediate fragments. 947 */ 948 apply_alternatives(__alt_instructions, __alt_instructions_end); 949 950 apply_ibt_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end); 951 952 #ifdef CONFIG_SMP 953 /* Patch to UP if other cpus not imminent. */ 954 if (!noreplace_smp && (num_present_cpus() == 1 || setup_max_cpus <= 1)) { 955 uniproc_patched = true; 956 alternatives_smp_module_add(NULL, "core kernel", 957 __smp_locks, __smp_locks_end, 958 _text, _etext); 959 } 960 961 if (!uniproc_patched || num_possible_cpus() == 1) { 962 free_init_pages("SMP alternatives", 963 (unsigned long)__smp_locks, 964 (unsigned long)__smp_locks_end); 965 } 966 #endif 967 968 restart_nmi(); 969 alternatives_patched = 1; 970 } 971 972 /** 973 * text_poke_early - Update instructions on a live kernel at boot time 974 * @addr: address to modify 975 * @opcode: source of the copy 976 * @len: length to copy 977 * 978 * When you use this code to patch more than one byte of an instruction 979 * you need to make sure that other CPUs cannot execute this code in parallel. 980 * Also no thread must be currently preempted in the middle of these 981 * instructions. 
And on the local CPU you need to be protected against NMI or 982 * MCE handlers seeing an inconsistent instruction while you patch. 983 */ 984 void __init_or_module text_poke_early(void *addr, const void *opcode, 985 size_t len) 986 { 987 unsigned long flags; 988 989 if (boot_cpu_has(X86_FEATURE_NX) && 990 is_module_text_address((unsigned long)addr)) { 991 /* 992 * Modules text is marked initially as non-executable, so the 993 * code cannot be running and speculative code-fetches are 994 * prevented. Just change the code. 995 */ 996 memcpy(addr, opcode, len); 997 } else { 998 local_irq_save(flags); 999 memcpy(addr, opcode, len); 1000 local_irq_restore(flags); 1001 sync_core(); 1002 1003 /* 1004 * Could also do a CLFLUSH here to speed up CPU recovery; but 1005 * that causes hangs on some VIA CPUs. 1006 */ 1007 } 1008 } 1009 1010 typedef struct { 1011 struct mm_struct *mm; 1012 } temp_mm_state_t; 1013 1014 /* 1015 * Using a temporary mm allows to set temporary mappings that are not accessible 1016 * by other CPUs. Such mappings are needed to perform sensitive memory writes 1017 * that override the kernel memory protections (e.g., W^X), without exposing the 1018 * temporary page-table mappings that are required for these write operations to 1019 * other CPUs. Using a temporary mm also allows to avoid TLB shootdowns when the 1020 * mapping is torn down. 1021 * 1022 * Context: The temporary mm needs to be used exclusively by a single core. To 1023 * harden security IRQs must be disabled while the temporary mm is 1024 * loaded, thereby preventing interrupt handler bugs from overriding 1025 * the kernel memory protection. 1026 */ 1027 static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm) 1028 { 1029 temp_mm_state_t temp_state; 1030 1031 lockdep_assert_irqs_disabled(); 1032 1033 /* 1034 * Make sure not to be in TLB lazy mode, as otherwise we'll end up 1035 * with a stale address space WITHOUT being in lazy mode after 1036 * restoring the previous mm. 1037 */ 1038 if (this_cpu_read(cpu_tlbstate_shared.is_lazy)) 1039 leave_mm(smp_processor_id()); 1040 1041 temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm); 1042 switch_mm_irqs_off(NULL, mm, current); 1043 1044 /* 1045 * If breakpoints are enabled, disable them while the temporary mm is 1046 * used. Userspace might set up watchpoints on addresses that are used 1047 * in the temporary mm, which would lead to wrong signals being sent or 1048 * crashes. 1049 * 1050 * Note that breakpoints are not disabled selectively, which also causes 1051 * kernel breakpoints (e.g., perf's) to be disabled. This might be 1052 * undesirable, but still seems reasonable as the code that runs in the 1053 * temporary mm should be short. 1054 */ 1055 if (hw_breakpoint_active()) 1056 hw_breakpoint_disable(); 1057 1058 return temp_state; 1059 } 1060 1061 static inline void unuse_temporary_mm(temp_mm_state_t prev_state) 1062 { 1063 lockdep_assert_irqs_disabled(); 1064 switch_mm_irqs_off(NULL, prev_state.mm, current); 1065 1066 /* 1067 * Restore the breakpoints if they were disabled before the temporary mm 1068 * was loaded. 
1069 */ 1070 if (hw_breakpoint_active()) 1071 hw_breakpoint_restore(); 1072 } 1073 1074 __ro_after_init struct mm_struct *poking_mm; 1075 __ro_after_init unsigned long poking_addr; 1076 1077 static void text_poke_memcpy(void *dst, const void *src, size_t len) 1078 { 1079 memcpy(dst, src, len); 1080 } 1081 1082 static void text_poke_memset(void *dst, const void *src, size_t len) 1083 { 1084 int c = *(const int *)src; 1085 1086 memset(dst, c, len); 1087 } 1088 1089 typedef void text_poke_f(void *dst, const void *src, size_t len); 1090 1091 static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t len) 1092 { 1093 bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE; 1094 struct page *pages[2] = {NULL}; 1095 temp_mm_state_t prev; 1096 unsigned long flags; 1097 pte_t pte, *ptep; 1098 spinlock_t *ptl; 1099 pgprot_t pgprot; 1100 1101 /* 1102 * While boot memory allocator is running we cannot use struct pages as 1103 * they are not yet initialized. There is no way to recover. 1104 */ 1105 BUG_ON(!after_bootmem); 1106 1107 if (!core_kernel_text((unsigned long)addr)) { 1108 pages[0] = vmalloc_to_page(addr); 1109 if (cross_page_boundary) 1110 pages[1] = vmalloc_to_page(addr + PAGE_SIZE); 1111 } else { 1112 pages[0] = virt_to_page(addr); 1113 WARN_ON(!PageReserved(pages[0])); 1114 if (cross_page_boundary) 1115 pages[1] = virt_to_page(addr + PAGE_SIZE); 1116 } 1117 /* 1118 * If something went wrong, crash and burn since recovery paths are not 1119 * implemented. 1120 */ 1121 BUG_ON(!pages[0] || (cross_page_boundary && !pages[1])); 1122 1123 /* 1124 * Map the page without the global bit, as TLB flushing is done with 1125 * flush_tlb_mm_range(), which is intended for non-global PTEs. 1126 */ 1127 pgprot = __pgprot(pgprot_val(PAGE_KERNEL) & ~_PAGE_GLOBAL); 1128 1129 /* 1130 * The lock is not really needed, but this allows to avoid open-coding. 1131 */ 1132 ptep = get_locked_pte(poking_mm, poking_addr, &ptl); 1133 1134 /* 1135 * This must not fail; preallocated in poking_init(). 1136 */ 1137 VM_BUG_ON(!ptep); 1138 1139 local_irq_save(flags); 1140 1141 pte = mk_pte(pages[0], pgprot); 1142 set_pte_at(poking_mm, poking_addr, ptep, pte); 1143 1144 if (cross_page_boundary) { 1145 pte = mk_pte(pages[1], pgprot); 1146 set_pte_at(poking_mm, poking_addr + PAGE_SIZE, ptep + 1, pte); 1147 } 1148 1149 /* 1150 * Loading the temporary mm behaves as a compiler barrier, which 1151 * guarantees that the PTE will be set at the time memcpy() is done. 1152 */ 1153 prev = use_temporary_mm(poking_mm); 1154 1155 kasan_disable_current(); 1156 func((u8 *)poking_addr + offset_in_page(addr), src, len); 1157 kasan_enable_current(); 1158 1159 /* 1160 * Ensure that the PTE is only cleared after the instructions of memcpy 1161 * were issued by using a compiler barrier. 1162 */ 1163 barrier(); 1164 1165 pte_clear(poking_mm, poking_addr, ptep); 1166 if (cross_page_boundary) 1167 pte_clear(poking_mm, poking_addr + PAGE_SIZE, ptep + 1); 1168 1169 /* 1170 * Loading the previous page-table hierarchy requires a serializing 1171 * instruction that already allows the core to see the updated version. 1172 * Xen-PV is assumed to serialize execution in a similar manner. 1173 */ 1174 unuse_temporary_mm(prev); 1175 1176 /* 1177 * Flushing the TLB might involve IPIs, which would require enabled 1178 * IRQs, but not if the mm is not used, as it is in this point. 1179 */ 1180 flush_tlb_mm_range(poking_mm, poking_addr, poking_addr + 1181 (cross_page_boundary ? 
2 : 1) * PAGE_SIZE, 1182 PAGE_SHIFT, false); 1183 1184 if (func == text_poke_memcpy) { 1185 /* 1186 * If the text does not match what we just wrote then something is 1187 * fundamentally screwy; there's nothing we can really do about that. 1188 */ 1189 BUG_ON(memcmp(addr, src, len)); 1190 } 1191 1192 local_irq_restore(flags); 1193 pte_unmap_unlock(ptep, ptl); 1194 return addr; 1195 } 1196 1197 /** 1198 * text_poke - Update instructions on a live kernel 1199 * @addr: address to modify 1200 * @opcode: source of the copy 1201 * @len: length to copy 1202 * 1203 * Only atomic text poke/set should be allowed when not doing early patching. 1204 * It means the size must be writable atomically and the address must be aligned 1205 * in a way that permits an atomic write. It also makes sure we fit on a single 1206 * page. 1207 * 1208 * Note that the caller must ensure that if the modified code is part of a 1209 * module, the module would not be removed during poking. This can be achieved 1210 * by registering a module notifier, and ordering module removal and patching 1211 * trough a mutex. 1212 */ 1213 void *text_poke(void *addr, const void *opcode, size_t len) 1214 { 1215 lockdep_assert_held(&text_mutex); 1216 1217 return __text_poke(text_poke_memcpy, addr, opcode, len); 1218 } 1219 1220 /** 1221 * text_poke_kgdb - Update instructions on a live kernel by kgdb 1222 * @addr: address to modify 1223 * @opcode: source of the copy 1224 * @len: length to copy 1225 * 1226 * Only atomic text poke/set should be allowed when not doing early patching. 1227 * It means the size must be writable atomically and the address must be aligned 1228 * in a way that permits an atomic write. It also makes sure we fit on a single 1229 * page. 1230 * 1231 * Context: should only be used by kgdb, which ensures no other core is running, 1232 * despite the fact it does not hold the text_mutex. 1233 */ 1234 void *text_poke_kgdb(void *addr, const void *opcode, size_t len) 1235 { 1236 return __text_poke(text_poke_memcpy, addr, opcode, len); 1237 } 1238 1239 /** 1240 * text_poke_copy - Copy instructions into (an unused part of) RX memory 1241 * @addr: address to modify 1242 * @opcode: source of the copy 1243 * @len: length to copy, could be more than 2x PAGE_SIZE 1244 * 1245 * Not safe against concurrent execution; useful for JITs to dump 1246 * new code blocks into unused regions of RX memory. Can be used in 1247 * conjunction with synchronize_rcu_tasks() to wait for existing 1248 * execution to quiesce after having made sure no existing functions 1249 * pointers are live. 1250 */ 1251 void *text_poke_copy(void *addr, const void *opcode, size_t len) 1252 { 1253 unsigned long start = (unsigned long)addr; 1254 size_t patched = 0; 1255 1256 if (WARN_ON_ONCE(core_kernel_text(start))) 1257 return NULL; 1258 1259 mutex_lock(&text_mutex); 1260 while (patched < len) { 1261 unsigned long ptr = start + patched; 1262 size_t s; 1263 1264 s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched); 1265 1266 __text_poke(text_poke_memcpy, (void *)ptr, opcode + patched, s); 1267 patched += s; 1268 } 1269 mutex_unlock(&text_mutex); 1270 return addr; 1271 } 1272 1273 /** 1274 * text_poke_set - memset into (an unused part of) RX memory 1275 * @addr: address to modify 1276 * @c: the byte to fill the area with 1277 * @len: length to copy, could be more than 2x PAGE_SIZE 1278 * 1279 * This is useful to overwrite unused regions of RX memory with illegal 1280 * instructions. 
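 *
 * Usage sketch (illustrative only; "image" and "image_size" are
 * hypothetical): a JIT tearing down a region could do
 *
 *	text_poke_set(image, INT3_INSN_OPCODE, image_size);
 *
 * to poison the freed range with INT3 before it is reused.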
1281 */ 1282 void *text_poke_set(void *addr, int c, size_t len) 1283 { 1284 unsigned long start = (unsigned long)addr; 1285 size_t patched = 0; 1286 1287 if (WARN_ON_ONCE(core_kernel_text(start))) 1288 return NULL; 1289 1290 mutex_lock(&text_mutex); 1291 while (patched < len) { 1292 unsigned long ptr = start + patched; 1293 size_t s; 1294 1295 s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched); 1296 1297 __text_poke(text_poke_memset, (void *)ptr, (void *)&c, s); 1298 patched += s; 1299 } 1300 mutex_unlock(&text_mutex); 1301 return addr; 1302 } 1303 1304 static void do_sync_core(void *info) 1305 { 1306 sync_core(); 1307 } 1308 1309 void text_poke_sync(void) 1310 { 1311 on_each_cpu(do_sync_core, NULL, 1); 1312 } 1313 1314 struct text_poke_loc { 1315 /* addr := _stext + rel_addr */ 1316 s32 rel_addr; 1317 s32 disp; 1318 u8 len; 1319 u8 opcode; 1320 const u8 text[POKE_MAX_OPCODE_SIZE]; 1321 /* see text_poke_bp_batch() */ 1322 u8 old; 1323 }; 1324 1325 struct bp_patching_desc { 1326 struct text_poke_loc *vec; 1327 int nr_entries; 1328 atomic_t refs; 1329 }; 1330 1331 static struct bp_patching_desc bp_desc; 1332 1333 static __always_inline 1334 struct bp_patching_desc *try_get_desc(void) 1335 { 1336 struct bp_patching_desc *desc = &bp_desc; 1337 1338 if (!arch_atomic_inc_not_zero(&desc->refs)) 1339 return NULL; 1340 1341 return desc; 1342 } 1343 1344 static __always_inline void put_desc(void) 1345 { 1346 struct bp_patching_desc *desc = &bp_desc; 1347 1348 smp_mb__before_atomic(); 1349 arch_atomic_dec(&desc->refs); 1350 } 1351 1352 static __always_inline void *text_poke_addr(struct text_poke_loc *tp) 1353 { 1354 return _stext + tp->rel_addr; 1355 } 1356 1357 static __always_inline int patch_cmp(const void *key, const void *elt) 1358 { 1359 struct text_poke_loc *tp = (struct text_poke_loc *) elt; 1360 1361 if (key < text_poke_addr(tp)) 1362 return -1; 1363 if (key > text_poke_addr(tp)) 1364 return 1; 1365 return 0; 1366 } 1367 1368 noinstr int poke_int3_handler(struct pt_regs *regs) 1369 { 1370 struct bp_patching_desc *desc; 1371 struct text_poke_loc *tp; 1372 int ret = 0; 1373 void *ip; 1374 1375 if (user_mode(regs)) 1376 return 0; 1377 1378 /* 1379 * Having observed our INT3 instruction, we now must observe 1380 * bp_desc with non-zero refcount: 1381 * 1382 * bp_desc.refs = 1 INT3 1383 * WMB RMB 1384 * write INT3 if (bp_desc.refs != 0) 1385 */ 1386 smp_rmb(); 1387 1388 desc = try_get_desc(); 1389 if (!desc) 1390 return 0; 1391 1392 /* 1393 * Discount the INT3. See text_poke_bp_batch(). 1394 */ 1395 ip = (void *) regs->ip - INT3_INSN_SIZE; 1396 1397 /* 1398 * Skip the binary search if there is a single member in the vector. 1399 */ 1400 if (unlikely(desc->nr_entries > 1)) { 1401 tp = __inline_bsearch(ip, desc->vec, desc->nr_entries, 1402 sizeof(struct text_poke_loc), 1403 patch_cmp); 1404 if (!tp) 1405 goto out_put; 1406 } else { 1407 tp = desc->vec; 1408 if (text_poke_addr(tp) != ip) 1409 goto out_put; 1410 } 1411 1412 ip += tp->len; 1413 1414 switch (tp->opcode) { 1415 case INT3_INSN_OPCODE: 1416 /* 1417 * Someone poked an explicit INT3, they'll want to handle it, 1418 * do not consume. 
1419 */ 1420 goto out_put; 1421 1422 case RET_INSN_OPCODE: 1423 int3_emulate_ret(regs); 1424 break; 1425 1426 case CALL_INSN_OPCODE: 1427 int3_emulate_call(regs, (long)ip + tp->disp); 1428 break; 1429 1430 case JMP32_INSN_OPCODE: 1431 case JMP8_INSN_OPCODE: 1432 int3_emulate_jmp(regs, (long)ip + tp->disp); 1433 break; 1434 1435 default: 1436 BUG(); 1437 } 1438 1439 ret = 1; 1440 1441 out_put: 1442 put_desc(); 1443 return ret; 1444 } 1445 1446 #define TP_VEC_MAX (PAGE_SIZE / sizeof(struct text_poke_loc)) 1447 static struct text_poke_loc tp_vec[TP_VEC_MAX]; 1448 static int tp_vec_nr; 1449 1450 /** 1451 * text_poke_bp_batch() -- update instructions on live kernel on SMP 1452 * @tp: vector of instructions to patch 1453 * @nr_entries: number of entries in the vector 1454 * 1455 * Modify multi-byte instruction by using int3 breakpoint on SMP. 1456 * We completely avoid stop_machine() here, and achieve the 1457 * synchronization using int3 breakpoint. 1458 * 1459 * The way it is done: 1460 * - For each entry in the vector: 1461 * - add a int3 trap to the address that will be patched 1462 * - sync cores 1463 * - For each entry in the vector: 1464 * - update all but the first byte of the patched range 1465 * - sync cores 1466 * - For each entry in the vector: 1467 * - replace the first byte (int3) by the first byte of 1468 * replacing opcode 1469 * - sync cores 1470 */ 1471 static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries) 1472 { 1473 unsigned char int3 = INT3_INSN_OPCODE; 1474 unsigned int i; 1475 int do_sync; 1476 1477 lockdep_assert_held(&text_mutex); 1478 1479 bp_desc.vec = tp; 1480 bp_desc.nr_entries = nr_entries; 1481 1482 /* 1483 * Corresponds to the implicit memory barrier in try_get_desc() to 1484 * ensure reading a non-zero refcount provides up to date bp_desc data. 1485 */ 1486 atomic_set_release(&bp_desc.refs, 1); 1487 1488 /* 1489 * Corresponding read barrier in int3 notifier for making sure the 1490 * nr_entries and handler are correctly ordered wrt. patching. 1491 */ 1492 smp_wmb(); 1493 1494 /* 1495 * First step: add a int3 trap to the address that will be patched. 1496 */ 1497 for (i = 0; i < nr_entries; i++) { 1498 tp[i].old = *(u8 *)text_poke_addr(&tp[i]); 1499 text_poke(text_poke_addr(&tp[i]), &int3, INT3_INSN_SIZE); 1500 } 1501 1502 text_poke_sync(); 1503 1504 /* 1505 * Second step: update all but the first byte of the patched range. 1506 */ 1507 for (do_sync = 0, i = 0; i < nr_entries; i++) { 1508 u8 old[POKE_MAX_OPCODE_SIZE] = { tp[i].old, }; 1509 int len = tp[i].len; 1510 1511 if (len - INT3_INSN_SIZE > 0) { 1512 memcpy(old + INT3_INSN_SIZE, 1513 text_poke_addr(&tp[i]) + INT3_INSN_SIZE, 1514 len - INT3_INSN_SIZE); 1515 text_poke(text_poke_addr(&tp[i]) + INT3_INSN_SIZE, 1516 (const char *)tp[i].text + INT3_INSN_SIZE, 1517 len - INT3_INSN_SIZE); 1518 do_sync++; 1519 } 1520 1521 /* 1522 * Emit a perf event to record the text poke, primarily to 1523 * support Intel PT decoding which must walk the executable code 1524 * to reconstruct the trace. The flow up to here is: 1525 * - write INT3 byte 1526 * - IPI-SYNC 1527 * - write instruction tail 1528 * At this point the actual control flow will be through the 1529 * INT3 and handler and not hit the old or new instruction. 1530 * Intel PT outputs FUP/TIP packets for the INT3, so the flow 1531 * can still be decoded. 
Subsequently: 1532 * - emit RECORD_TEXT_POKE with the new instruction 1533 * - IPI-SYNC 1534 * - write first byte 1535 * - IPI-SYNC 1536 * So before the text poke event timestamp, the decoder will see 1537 * either the old instruction flow or FUP/TIP of INT3. After the 1538 * text poke event timestamp, the decoder will see either the 1539 * new instruction flow or FUP/TIP of INT3. Thus decoders can 1540 * use the timestamp as the point at which to modify the 1541 * executable code. 1542 * The old instruction is recorded so that the event can be 1543 * processed forwards or backwards. 1544 */ 1545 perf_event_text_poke(text_poke_addr(&tp[i]), old, len, 1546 tp[i].text, len); 1547 } 1548 1549 if (do_sync) { 1550 /* 1551 * According to Intel, this core syncing is very likely 1552 * not necessary and we'd be safe even without it. But 1553 * better safe than sorry (plus there's not only Intel). 1554 */ 1555 text_poke_sync(); 1556 } 1557 1558 /* 1559 * Third step: replace the first byte (int3) by the first byte of 1560 * replacing opcode. 1561 */ 1562 for (do_sync = 0, i = 0; i < nr_entries; i++) { 1563 if (tp[i].text[0] == INT3_INSN_OPCODE) 1564 continue; 1565 1566 text_poke(text_poke_addr(&tp[i]), tp[i].text, INT3_INSN_SIZE); 1567 do_sync++; 1568 } 1569 1570 if (do_sync) 1571 text_poke_sync(); 1572 1573 /* 1574 * Remove and wait for refs to be zero. 1575 */ 1576 if (!atomic_dec_and_test(&bp_desc.refs)) 1577 atomic_cond_read_acquire(&bp_desc.refs, !VAL); 1578 } 1579 1580 static void text_poke_loc_init(struct text_poke_loc *tp, void *addr, 1581 const void *opcode, size_t len, const void *emulate) 1582 { 1583 struct insn insn; 1584 int ret, i; 1585 1586 memcpy((void *)tp->text, opcode, len); 1587 if (!emulate) 1588 emulate = opcode; 1589 1590 ret = insn_decode_kernel(&insn, emulate); 1591 BUG_ON(ret < 0); 1592 1593 tp->rel_addr = addr - (void *)_stext; 1594 tp->len = len; 1595 tp->opcode = insn.opcode.bytes[0]; 1596 1597 switch (tp->opcode) { 1598 case RET_INSN_OPCODE: 1599 case JMP32_INSN_OPCODE: 1600 case JMP8_INSN_OPCODE: 1601 /* 1602 * Control flow instructions without implied execution of the 1603 * next instruction can be padded with INT3. 1604 */ 1605 for (i = insn.length; i < len; i++) 1606 BUG_ON(tp->text[i] != INT3_INSN_OPCODE); 1607 break; 1608 1609 default: 1610 BUG_ON(len != insn.length); 1611 }; 1612 1613 1614 switch (tp->opcode) { 1615 case INT3_INSN_OPCODE: 1616 case RET_INSN_OPCODE: 1617 break; 1618 1619 case CALL_INSN_OPCODE: 1620 case JMP32_INSN_OPCODE: 1621 case JMP8_INSN_OPCODE: 1622 tp->disp = insn.immediate.value; 1623 break; 1624 1625 default: /* assume NOP */ 1626 switch (len) { 1627 case 2: /* NOP2 -- emulate as JMP8+0 */ 1628 BUG_ON(memcmp(emulate, x86_nops[len], len)); 1629 tp->opcode = JMP8_INSN_OPCODE; 1630 tp->disp = 0; 1631 break; 1632 1633 case 5: /* NOP5 -- emulate as JMP32+0 */ 1634 BUG_ON(memcmp(emulate, x86_nops[len], len)); 1635 tp->opcode = JMP32_INSN_OPCODE; 1636 tp->disp = 0; 1637 break; 1638 1639 default: /* unknown instruction */ 1640 BUG(); 1641 } 1642 break; 1643 } 1644 } 1645 1646 /* 1647 * We hard rely on the tp_vec being ordered; ensure this is so by flushing 1648 * early if needed. 
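 * (The ordering matters because poke_int3_handler() resolves the trapping
 *  address with __inline_bsearch()/patch_cmp() over this vector.)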
1649 */ 1650 static bool tp_order_fail(void *addr) 1651 { 1652 struct text_poke_loc *tp; 1653 1654 if (!tp_vec_nr) 1655 return false; 1656 1657 if (!addr) /* force */ 1658 return true; 1659 1660 tp = &tp_vec[tp_vec_nr - 1]; 1661 if ((unsigned long)text_poke_addr(tp) > (unsigned long)addr) 1662 return true; 1663 1664 return false; 1665 } 1666 1667 static void text_poke_flush(void *addr) 1668 { 1669 if (tp_vec_nr == TP_VEC_MAX || tp_order_fail(addr)) { 1670 text_poke_bp_batch(tp_vec, tp_vec_nr); 1671 tp_vec_nr = 0; 1672 } 1673 } 1674 1675 void text_poke_finish(void) 1676 { 1677 text_poke_flush(NULL); 1678 } 1679 1680 void __ref text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate) 1681 { 1682 struct text_poke_loc *tp; 1683 1684 if (unlikely(system_state == SYSTEM_BOOTING)) { 1685 text_poke_early(addr, opcode, len); 1686 return; 1687 } 1688 1689 text_poke_flush(addr); 1690 1691 tp = &tp_vec[tp_vec_nr++]; 1692 text_poke_loc_init(tp, addr, opcode, len, emulate); 1693 } 1694 1695 /** 1696 * text_poke_bp() -- update instructions on live kernel on SMP 1697 * @addr: address to patch 1698 * @opcode: opcode of new instruction 1699 * @len: length to copy 1700 * @emulate: instruction to be emulated 1701 * 1702 * Update a single instruction with the vector in the stack, avoiding 1703 * dynamically allocated memory. This function should be used when it is 1704 * not possible to allocate memory. 1705 */ 1706 void __ref text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate) 1707 { 1708 struct text_poke_loc tp; 1709 1710 if (unlikely(system_state == SYSTEM_BOOTING)) { 1711 text_poke_early(addr, opcode, len); 1712 return; 1713 } 1714 1715 text_poke_loc_init(&tp, addr, opcode, len, emulate); 1716 text_poke_bp_batch(&tp, 1); 1717 } 1718
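/*
 * Usage sketch for the batch-patching API above (illustrative only; "site"
 * and "target" are hypothetical, and real callers typically build the bytes
 * with text_gen_insn() from <asm/text-patching.h>):
 *
 *	u8 insn[CALL_INSN_SIZE];
 *
 *	insn[0] = CALL_INSN_OPCODE;
 *	*(s32 *)&insn[1] = (long)target - (long)site - CALL_INSN_SIZE;
 *
 *	mutex_lock(&text_mutex);
 *	text_poke_bp(site, insn, CALL_INSN_SIZE, NULL);
 *	mutex_unlock(&text_mutex);
 *
 * The INT3 sequence in text_poke_bp_batch() guarantees that other CPUs
 * observe either the old instruction or an emulation of the new one at
 * every intermediate step.
 */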