// SPDX-License-Identifier: GPL-2.0-only
#define pr_fmt(fmt) "SMP alternatives: " fmt

#include <linux/module.h>
#include <linux/sched.h>
#include <linux/perf_event.h>
#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/stringify.h>
#include <linux/highmem.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/memory.h>
#include <linux/stop_machine.h>
#include <linux/slab.h>
#include <linux/kdebug.h>
#include <linux/kprobes.h>
#include <linux/mmu_context.h>
#include <linux/bsearch.h>
#include <linux/sync_core.h>
#include <asm/text-patching.h>
#include <asm/alternative.h>
#include <asm/sections.h>
#include <asm/mce.h>
#include <asm/nmi.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
#include <asm/insn.h>
#include <asm/io.h>
#include <asm/fixmap.h>
#include <asm/paravirt.h>
#include <asm/asm-prototypes.h>

int __read_mostly alternatives_patched;
EXPORT_SYMBOL_GPL(alternatives_patched);

#define MAX_PATCH_LEN (255-1)

static int __initdata_or_module debug_alternative;

static int __init debug_alt(char *str)
{
	debug_alternative = 1;
	return 1;
}
__setup("debug-alternative", debug_alt);

static int noreplace_smp;

static int __init setup_noreplace_smp(char *str)
{
	noreplace_smp = 1;
	return 1;
}
__setup("noreplace-smp", setup_noreplace_smp);

#define DPRINTK(fmt, args...)						\
do {									\
	if (debug_alternative)						\
		printk(KERN_DEBUG pr_fmt(fmt) "\n", ##args);		\
} while (0)

#define DUMP_BYTES(buf, len, fmt, args...)				\
do {									\
	if (unlikely(debug_alternative)) {				\
		int j;							\
									\
		if (!(len))						\
			break;						\
									\
		printk(KERN_DEBUG pr_fmt(fmt), ##args);			\
		for (j = 0; j < (len) - 1; j++)				\
			printk(KERN_CONT "%02hhx ", buf[j]);		\
		printk(KERN_CONT "%02hhx\n", buf[j]);			\
	}								\
} while (0)

static const unsigned char x86nops[] =
{
	BYTES_NOP1,
	BYTES_NOP2,
	BYTES_NOP3,
	BYTES_NOP4,
	BYTES_NOP5,
	BYTES_NOP6,
	BYTES_NOP7,
	BYTES_NOP8,
};

const unsigned char * const x86_nops[ASM_NOP_MAX+1] =
{
	NULL,
	x86nops,
	x86nops + 1,
	x86nops + 1 + 2,
	x86nops + 1 + 2 + 3,
	x86nops + 1 + 2 + 3 + 4,
	x86nops + 1 + 2 + 3 + 4 + 5,
	x86nops + 1 + 2 + 3 + 4 + 5 + 6,
	x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
};

/* Use this to add nops to a buffer, then text_poke the whole buffer. */
static void __init_or_module add_nops(void *insns, unsigned int len)
{
	while (len > 0) {
		unsigned int noplen = len;
		if (noplen > ASM_NOP_MAX)
			noplen = ASM_NOP_MAX;
		memcpy(insns, x86_nops[noplen], noplen);
		insns += noplen;
		len -= noplen;
	}
}
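
/*
 * Worked example (illustrative; byte patterns per BYTES_NOPn in <asm/nops.h>):
 * padding an 11-byte tail calls memcpy() twice, first with x86_nops[8] and
 * then with x86_nops[3], i.e. one 8-byte NOP followed by one 3-byte NOP
 * rather than eleven single 0x90 bytes.
 */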

extern s32 __retpoline_sites[], __retpoline_sites_end[];
extern s32 __return_sites[], __return_sites_end[];
extern s32 __ibt_endbr_seal[], __ibt_endbr_seal_end[];
extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
extern s32 __smp_locks[], __smp_locks_end[];
void text_poke_early(void *addr, const void *opcode, size_t len);

/*
 * Are we looking at a near JMP with a 1 or 4-byte displacement.
 */
static inline bool is_jmp(const u8 opcode)
{
	return opcode == 0xeb || opcode == 0xe9;
}

static void __init_or_module
recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insn_buff)
{
	u8 *next_rip, *tgt_rip;
	s32 n_dspl, o_dspl;
	int repl_len;

	if (a->replacementlen != 5)
		return;

	o_dspl = *(s32 *)(insn_buff + 1);

	/* next_rip of the replacement JMP */
	next_rip = repl_insn + a->replacementlen;
	/* target rip of the replacement JMP */
	tgt_rip  = next_rip + o_dspl;
	n_dspl = tgt_rip - orig_insn;

	DPRINTK("target RIP: %px, new_displ: 0x%x", tgt_rip, n_dspl);

	if (tgt_rip - orig_insn >= 0) {
		if (n_dspl - 2 <= 127)
			goto two_byte_jmp;
		else
			goto five_byte_jmp;
	/* negative offset */
	} else {
		if (((n_dspl - 2) & 0xff) == (n_dspl - 2))
			goto two_byte_jmp;
		else
			goto five_byte_jmp;
	}

two_byte_jmp:
	n_dspl -= 2;

	insn_buff[0] = 0xeb;
	insn_buff[1] = (s8)n_dspl;
	add_nops(insn_buff + 2, 3);

	repl_len = 2;
	goto done;

five_byte_jmp:
	n_dspl -= 5;

	insn_buff[0] = 0xe9;
	*(s32 *)&insn_buff[1] = n_dspl;

	repl_len = 5;

done:

	DPRINTK("final displ: 0x%08x, JMP 0x%lx",
		n_dspl, (unsigned long)orig_insn + n_dspl + repl_len);
}

/*
 * optimize_nops_range() - Optimize a sequence of single byte NOPs (0x90)
 *
 * @instr: instruction byte stream
 * @instrlen: length of the above
 * @off: offset within @instr where the first NOP has been detected
 *
 * Return: number of NOPs found (and replaced).
 */
static __always_inline int optimize_nops_range(u8 *instr, u8 instrlen, int off)
{
	unsigned long flags;
	int i = off, nnops;

	while (i < instrlen) {
		if (instr[i] != 0x90)
			break;

		i++;
	}

	nnops = i - off;

	if (nnops <= 1)
		return nnops;

	local_irq_save(flags);
	add_nops(instr + off, nnops);
	local_irq_restore(flags);

	DUMP_BYTES(instr, instrlen, "%px: [%d:%d) optimized NOPs: ", instr, off, i);

	return nnops;
}

/*
 * "noinline" to cause control flow change and thus invalidate I$ and
 * cause refetch after modification.
 */
static void __init_or_module noinline optimize_nops(u8 *instr, size_t len)
{
	struct insn insn;
	int i = 0;

	/*
	 * Jump over the non-NOP insns and optimize single-byte NOPs into bigger
	 * ones.
	 */
	for (;;) {
		if (insn_decode_kernel(&insn, &instr[i]))
			return;

		/*
		 * See if this and any potentially following NOPs can be
		 * optimized.
		 */
		if (insn.length == 1 && insn.opcode.bytes[0] == 0x90)
			i += optimize_nops_range(instr, len, i);
		else
			i += insn.length;

		if (i >= len)
			return;
	}
}
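
/*
 * Illustrative example (byte values per <asm/nops.h>): a run of five
 * single-byte NOPs "90 90 90 90 90" found at @off is rewritten in place to
 * BYTES_NOP5, i.e. "0f 1f 44 00 00", by the add_nops() call in
 * optimize_nops_range() above.
 */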

/*
 * Replace instructions with better alternatives for this CPU type. This runs
 * before SMP is initialized to avoid SMP problems with self modifying code.
 * This implies that asymmetric systems where APs have less capabilities than
 * the boot processor are not handled. Tough. Make sure you disable such
 * features by hand.
 *
 * Marked "noinline" to cause control flow change and thus insn cache
 * to refetch changed I$ lines.
 */
void __init_or_module noinline apply_alternatives(struct alt_instr *start,
						  struct alt_instr *end)
{
	struct alt_instr *a;
	u8 *instr, *replacement;
	u8 insn_buff[MAX_PATCH_LEN];

	DPRINTK("alt table %px, -> %px", start, end);
	/*
	 * The scan order should be from start to end. A later scanned
	 * alternative code can overwrite previously scanned alternative code.
	 * Some kernel functions (e.g. memcpy, memset, etc) use this order to
	 * patch code.
	 *
	 * So be careful if you want to change the scan order to any other
	 * order.
	 */
	for (a = start; a < end; a++) {
		int insn_buff_sz = 0;
		/* Mask away "NOT" flag bit for feature to test. */
		u16 feature = a->cpuid & ~ALTINSTR_FLAG_INV;

		instr = (u8 *)&a->instr_offset + a->instr_offset;
		replacement = (u8 *)&a->repl_offset + a->repl_offset;
		BUG_ON(a->instrlen > sizeof(insn_buff));
		BUG_ON(feature >= (NCAPINTS + NBUGINTS) * 32);

		/*
		 * Patch if either:
		 * - feature is present
		 * - feature not present but ALTINSTR_FLAG_INV is set to mean,
		 *   patch if feature is *NOT* present.
		 */
		if (!boot_cpu_has(feature) == !(a->cpuid & ALTINSTR_FLAG_INV))
			goto next;

		DPRINTK("feat: %s%d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d)",
			(a->cpuid & ALTINSTR_FLAG_INV) ? "!" : "",
			feature >> 5,
			feature & 0x1f,
			instr, instr, a->instrlen,
			replacement, a->replacementlen);

		DUMP_BYTES(instr, a->instrlen, "%px: old_insn: ", instr);
		DUMP_BYTES(replacement, a->replacementlen, "%px: rpl_insn: ", replacement);

		memcpy(insn_buff, replacement, a->replacementlen);
		insn_buff_sz = a->replacementlen;

		/*
		 * 0xe8 is a relative CALL; fix the offset.
		 *
		 * Instruction length is checked before the opcode to avoid
		 * accessing uninitialized bytes for zero-length replacements.
		 */
		if (a->replacementlen == 5 && *insn_buff == 0xe8) {
			*(s32 *)(insn_buff + 1) += replacement - instr;
			DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx",
				*(s32 *)(insn_buff + 1),
				(unsigned long)instr + *(s32 *)(insn_buff + 1) + 5);
		}

		if (a->replacementlen && is_jmp(replacement[0]))
			recompute_jump(a, instr, replacement, insn_buff);

		for (; insn_buff_sz < a->instrlen; insn_buff_sz++)
			insn_buff[insn_buff_sz] = 0x90;

		DUMP_BYTES(insn_buff, insn_buff_sz, "%px: final_insn: ", instr);

		text_poke_early(instr, insn_buff, insn_buff_sz);

next:
		optimize_nops(instr, a->instrlen);
	}
}

#if defined(CONFIG_RETPOLINE) && defined(CONFIG_OBJTOOL)

/*
 * CALL/JMP *%\reg
 */
static int emit_indirect(int op, int reg, u8 *bytes)
{
	int i = 0;
	u8 modrm;

	switch (op) {
	case CALL_INSN_OPCODE:
		modrm = 0x10; /* Reg = 2; CALL r/m */
		break;

	case JMP32_INSN_OPCODE:
		modrm = 0x20; /* Reg = 4; JMP r/m */
		break;

	default:
		WARN_ON_ONCE(1);
		return -1;
	}

	if (reg >= 8) {
		bytes[i++] = 0x41; /* REX.B prefix */
		reg -= 8;
	}

	modrm |= 0xc0; /* Mod = 3 */
	modrm += reg;

	bytes[i++] = 0xff; /* opcode */
	bytes[i++] = modrm;

	return i;
}
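
/*
 * Encoding examples (illustrative): emit_indirect(CALL_INSN_OPCODE, 0, p)
 * emits "ff d0" (CALL *%rax), and emit_indirect(JMP32_INSN_OPCODE, 12, p)
 * emits "41 ff e4" (JMP *%r12), the REX.B prefix selecting the high
 * register bank.
 */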

/*
 * Rewrite the compiler generated retpoline thunk calls.
 *
 * For spectre_v2=off (!X86_FEATURE_RETPOLINE), rewrite them into immediate
 * indirect instructions, avoiding the extra indirection.
 *
 * For example, convert:
 *
 *   CALL __x86_indirect_thunk_\reg
 *
 * into:
 *
 *   CALL *%\reg
 *
 * It also tries to inline spectre_v2=retpoline,lfence when size permits.
 */
static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes)
{
	retpoline_thunk_t *target;
	int reg, ret, i = 0;
	u8 op, cc;

	target = addr + insn->length + insn->immediate.value;
	reg = target - __x86_indirect_thunk_array;

	if (WARN_ON_ONCE(reg & ~0xf))
		return -1;

	/* If anyone ever does: CALL/JMP *%rsp, we're in deep trouble. */
	BUG_ON(reg == 4);

	if (cpu_feature_enabled(X86_FEATURE_RETPOLINE) &&
	    !cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE))
		return -1;

	op = insn->opcode.bytes[0];

	/*
	 * Convert:
	 *
	 *   Jcc.d32 __x86_indirect_thunk_\reg
	 *
	 * into:
	 *
	 *   Jncc.d8 1f
	 *   [ LFENCE ]
	 *   JMP *%\reg
	 *   [ NOP ]
	 * 1:
	 */
	/* Jcc.d32 second opcode byte is in the range: 0x80-0x8f */
	if (op == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80) {
		cc = insn->opcode.bytes[1] & 0xf;
		cc ^= 1; /* invert condition */

		bytes[i++] = 0x70 + cc;        /* Jcc.d8 */
		bytes[i++] = insn->length - 2; /* sizeof(Jcc.d8) == 2 */

		/* Continue as if: JMP.d32 __x86_indirect_thunk_\reg */
		op = JMP32_INSN_OPCODE;
	}

	/*
	 * For RETPOLINE_LFENCE: prepend the indirect CALL/JMP with an LFENCE.
	 */
	if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) {
		bytes[i++] = 0x0f;
		bytes[i++] = 0xae;
		bytes[i++] = 0xe8; /* LFENCE */
	}

	ret = emit_indirect(op, reg, bytes + i);
	if (ret < 0)
		return ret;
	i += ret;

	for (; i < insn->length;)
		bytes[i++] = BYTES_NOP1;

	return i;
}

/*
 * Generated by 'objtool --retpoline'.
 */
void __init_or_module noinline apply_retpolines(s32 *start, s32 *end)
{
	s32 *s;

	for (s = start; s < end; s++) {
		void *addr = (void *)s + *s;
		struct insn insn;
		int len, ret;
		u8 bytes[16];
		u8 op1, op2;

		ret = insn_decode_kernel(&insn, addr);
		if (WARN_ON_ONCE(ret < 0))
			continue;

		op1 = insn.opcode.bytes[0];
		op2 = insn.opcode.bytes[1];

		switch (op1) {
		case CALL_INSN_OPCODE:
		case JMP32_INSN_OPCODE:
			break;

		case 0x0f: /* escape */
			if (op2 >= 0x80 && op2 <= 0x8f)
				break;
			fallthrough;
		default:
			WARN_ON_ONCE(1);
			continue;
		}

		DPRINTK("retpoline at: %pS (%px) len: %d to: %pS",
			addr, addr, insn.length,
			addr + insn.length + insn.immediate.value);

		len = patch_retpoline(addr, &insn, bytes);
		if (len == insn.length) {
			optimize_nops(bytes, len);
			DUMP_BYTES(((u8*)addr),  len, "%px: orig: ", addr);
			DUMP_BYTES(((u8*)bytes), len, "%px: repl: ", addr);
			text_poke_early(addr, bytes, len);
		}
	}
}
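
/*
 * Worked example (illustrative, spectre_v2=off): a call site compiled as
 * "e8 xx xx xx xx" (CALL __x86_indirect_thunk_rax) is rewritten by
 * patch_retpoline() to "ff d0 90 90 90" (CALL *%rax plus padding), and the
 * single-byte NOP padding is then merged into one larger NOP by the
 * optimize_nops() call above.
 */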

#ifdef CONFIG_RETHUNK
/*
 * Rewrite the compiler generated return thunk tail-calls.
 *
 * For example, convert:
 *
 *   JMP __x86_return_thunk
 *
 * into:
 *
 *   RET
 */
static int patch_return(void *addr, struct insn *insn, u8 *bytes)
{
	int i = 0;

	if (cpu_feature_enabled(X86_FEATURE_RETHUNK))
		return -1;

	bytes[i++] = RET_INSN_OPCODE;

	for (; i < insn->length;)
		bytes[i++] = INT3_INSN_OPCODE;

	return i;
}

void __init_or_module noinline apply_returns(s32 *start, s32 *end)
{
	s32 *s;

	for (s = start; s < end; s++) {
		void *dest = NULL, *addr = (void *)s + *s;
		struct insn insn;
		int len, ret;
		u8 bytes[16];
		u8 op;

		ret = insn_decode_kernel(&insn, addr);
		if (WARN_ON_ONCE(ret < 0))
			continue;

		op = insn.opcode.bytes[0];
		if (op == JMP32_INSN_OPCODE)
			dest = addr + insn.length + insn.immediate.value;

		if (__static_call_fixup(addr, op, dest) ||
		    WARN_ONCE(dest != &__x86_return_thunk,
			      "missing return thunk: %pS-%pS: %*ph",
			      addr, dest, 5, addr))
			continue;

		DPRINTK("return thunk at: %pS (%px) len: %d to: %pS",
			addr, addr, insn.length,
			addr + insn.length + insn.immediate.value);

		len = patch_return(addr, &insn, bytes);
		if (len == insn.length) {
			DUMP_BYTES(((u8*)addr),  len, "%px: orig: ", addr);
			DUMP_BYTES(((u8*)bytes), len, "%px: repl: ", addr);
			text_poke_early(addr, bytes, len);
		}
	}
}
#else
void __init_or_module noinline apply_returns(s32 *start, s32 *end) { }
#endif /* CONFIG_RETHUNK */

#else /* !CONFIG_RETPOLINE || !CONFIG_OBJTOOL */

void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { }
void __init_or_module noinline apply_returns(s32 *start, s32 *end) { }

#endif /* CONFIG_RETPOLINE && CONFIG_OBJTOOL */

#ifdef CONFIG_X86_KERNEL_IBT

/*
 * Generated by: objtool --ibt
 */
void __init_or_module noinline apply_ibt_endbr(s32 *start, s32 *end)
{
	s32 *s;

	for (s = start; s < end; s++) {
		u32 endbr, poison = gen_endbr_poison();
		void *addr = (void *)s + *s;

		if (WARN_ON_ONCE(get_kernel_nofault(endbr, addr)))
			continue;

		if (WARN_ON_ONCE(!is_endbr(endbr)))
			continue;

		DPRINTK("ENDBR at: %pS (%px)", addr, addr);

		/*
		 * When we have IBT, the lack of ENDBR will trigger #CP
		 */
		DUMP_BYTES(((u8*)addr), 4, "%px: orig: ", addr);
		DUMP_BYTES(((u8*)&poison), 4, "%px: repl: ", addr);
		text_poke_early(addr, &poison, 4);
	}
}

#else

void __init_or_module noinline apply_ibt_endbr(s32 *start, s32 *end) { }

#endif /* CONFIG_X86_KERNEL_IBT */

#ifdef CONFIG_SMP
static void alternatives_smp_lock(const s32 *start, const s32 *end,
				  u8 *text, u8 *text_end)
{
	const s32 *poff;

	for (poff = start; poff < end; poff++) {
		u8 *ptr = (u8 *)poff + *poff;

		if (!*poff || ptr < text || ptr >= text_end)
			continue;
		/* turn DS segment override prefix into lock prefix */
		if (*ptr == 0x3e)
			text_poke(ptr, ((unsigned char []){0xf0}), 1);
	}
}

static void alternatives_smp_unlock(const s32 *start, const s32 *end,
				    u8 *text, u8 *text_end)
{
	const s32 *poff;

	for (poff = start; poff < end; poff++) {
		u8 *ptr = (u8 *)poff + *poff;

		if (!*poff || ptr < text || ptr >= text_end)
			continue;
		/* turn lock prefix into DS segment override prefix */
		if (*ptr == 0xf0)
			text_poke(ptr, ((unsigned char []){0x3E}), 1);
	}
}
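
/*
 * The s32 entries walked above live in the .smp_locks section; they are
 * emitted by the LOCK_PREFIX machinery in <asm/alternative.h> and each
 * holds a relative offset to a 0xf0 LOCK prefix byte, which the UP case
 * downgrades to the harmless 0x3e DS override prefix.
 */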

struct smp_alt_module {
	/* what is this ??? */
	struct module	*mod;
	char		*name;

	/* ptrs to lock prefixes */
	const s32	*locks;
	const s32	*locks_end;

	/* .text segment, needed to avoid patching init code ;) */
	u8		*text;
	u8		*text_end;

	struct list_head next;
};
static LIST_HEAD(smp_alt_modules);
static bool uniproc_patched = false;	/* protected by text_mutex */

void __init_or_module alternatives_smp_module_add(struct module *mod,
						  char *name,
						  void *locks, void *locks_end,
						  void *text,  void *text_end)
{
	struct smp_alt_module *smp;

	mutex_lock(&text_mutex);
	if (!uniproc_patched)
		goto unlock;

	if (num_possible_cpus() == 1)
		/* Don't bother remembering, we'll never have to undo it. */
		goto smp_unlock;

	smp = kzalloc(sizeof(*smp), GFP_KERNEL);
	if (NULL == smp)
		/* we'll run the (safe but slow) SMP code then ... */
		goto unlock;

	smp->mod	= mod;
	smp->name	= name;
	smp->locks	= locks;
	smp->locks_end	= locks_end;
	smp->text	= text;
	smp->text_end	= text_end;
	DPRINTK("locks %p -> %p, text %p -> %p, name %s\n",
		smp->locks, smp->locks_end,
		smp->text, smp->text_end, smp->name);

	list_add_tail(&smp->next, &smp_alt_modules);
smp_unlock:
	alternatives_smp_unlock(locks, locks_end, text, text_end);
unlock:
	mutex_unlock(&text_mutex);
}

void __init_or_module alternatives_smp_module_del(struct module *mod)
{
	struct smp_alt_module *item;

	mutex_lock(&text_mutex);
	list_for_each_entry(item, &smp_alt_modules, next) {
		if (mod != item->mod)
			continue;
		list_del(&item->next);
		kfree(item);
		break;
	}
	mutex_unlock(&text_mutex);
}

void alternatives_enable_smp(void)
{
	struct smp_alt_module *mod;

	/* Why bother if there are no other CPUs? */
	BUG_ON(num_possible_cpus() == 1);

	mutex_lock(&text_mutex);

	if (uniproc_patched) {
		pr_info("switching to SMP code\n");
		BUG_ON(num_online_cpus() != 1);
		clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
		clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
		list_for_each_entry(mod, &smp_alt_modules, next)
			alternatives_smp_lock(mod->locks, mod->locks_end,
					      mod->text, mod->text_end);
		uniproc_patched = false;
	}
	mutex_unlock(&text_mutex);
}

/*
 * Return 1 if the address range is reserved for SMP-alternatives.
 * Must hold text_mutex.
 */
int alternatives_text_reserved(void *start, void *end)
{
	struct smp_alt_module *mod;
	const s32 *poff;
	u8 *text_start = start;
	u8 *text_end = end;

	lockdep_assert_held(&text_mutex);

	list_for_each_entry(mod, &smp_alt_modules, next) {
		if (mod->text > text_end || mod->text_end < text_start)
			continue;
		for (poff = mod->locks; poff < mod->locks_end; poff++) {
			const u8 *ptr = (const u8 *)poff + *poff;

			if (text_start <= ptr && text_end > ptr)
				return 1;
		}
	}

	return 0;
}
#endif /* CONFIG_SMP */

#ifdef CONFIG_PARAVIRT
void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
				     struct paravirt_patch_site *end)
{
	struct paravirt_patch_site *p;
	char insn_buff[MAX_PATCH_LEN];

	for (p = start; p < end; p++) {
		unsigned int used;

		BUG_ON(p->len > MAX_PATCH_LEN);
		/* prep the buffer with the original instructions */
		memcpy(insn_buff, p->instr, p->len);
		used = paravirt_patch(p->type, insn_buff, (unsigned long)p->instr, p->len);

		BUG_ON(used > p->len);

		/* Pad the rest with nops */
		add_nops(insn_buff + used, p->len - used);
		text_poke_early(p->instr, insn_buff, p->len);
	}
}
extern struct paravirt_patch_site __start_parainstructions[],
	__stop_parainstructions[];
#endif	/* CONFIG_PARAVIRT */

/*
 * Self-test for the INT3 based CALL emulation code.
 *
 * This exercises int3_emulate_call() to make sure INT3 pt_regs are set up
 * properly and that there is a stack gap between the INT3 frame and the
 * previous context. Without this gap doing a virtual PUSH on the interrupted
 * stack would corrupt the INT3 IRET frame.
 *
 * See entry_{32,64}.S for more details.
 */

/*
 * We define the int3_magic() function in assembly to control the calling
 * convention such that we can 'call' it from assembly.
 */

extern void int3_magic(unsigned int *ptr); /* defined in asm */

asm (
"	.pushsection	.init.text, \"ax\", @progbits\n"
"	.type		int3_magic, @function\n"
"int3_magic:\n"
	ANNOTATE_NOENDBR
"	movl	$1, (%" _ASM_ARG1 ")\n"
	ASM_RET
"	.size		int3_magic, .-int3_magic\n"
"	.popsection\n"
);

extern void int3_selftest_ip(void); /* defined in asm below */

static int __init
int3_exception_notify(struct notifier_block *self, unsigned long val, void *data)
{
	unsigned long selftest = (unsigned long)&int3_selftest_ip;
	struct die_args *args = data;
	struct pt_regs *regs = args->regs;

	OPTIMIZER_HIDE_VAR(selftest);

	if (!regs || user_mode(regs))
		return NOTIFY_DONE;

	if (val != DIE_INT3)
		return NOTIFY_DONE;

	if (regs->ip - INT3_INSN_SIZE != selftest)
		return NOTIFY_DONE;

	int3_emulate_call(regs, (unsigned long)&int3_magic);
	return NOTIFY_STOP;
}

/* Must be noinline to ensure uniqueness of int3_selftest_ip. */
static noinline void __init int3_selftest(void)
{
	static __initdata struct notifier_block int3_exception_nb = {
		.notifier_call	= int3_exception_notify,
		.priority	= INT_MAX-1, /* last */
	};
	unsigned int val = 0;

	BUG_ON(register_die_notifier(&int3_exception_nb));

	/*
	 * Basically: int3_magic(&val); but really complicated :-)
	 *
	 * INT3 padded with NOP to CALL_INSN_SIZE. The int3_exception_nb
	 * notifier above will emulate CALL for us.
	 */
	asm volatile ("int3_selftest_ip:\n\t"
		      ANNOTATE_NOENDBR
		      "    int3; nop; nop; nop; nop\n\t"
		      : ASM_CALL_CONSTRAINT
		      : __ASM_SEL_RAW(a, D) (&val)
		      : "memory");

	BUG_ON(val != 1);

	unregister_die_notifier(&int3_exception_nb);
}

void __init alternative_instructions(void)
{
	int3_selftest();

	/*
	 * The patching is not fully atomic, so try to avoid local
	 * interruptions that might execute the to be patched code.
	 * Other CPUs are not running.
	 */
	stop_nmi();

	/*
	 * Don't stop machine check exceptions while patching.
	 * MCEs only happen when something got corrupted and in this
	 * case we must do something about the corruption.
	 * Ignoring it is worse than an unlikely patching race.
	 * Also machine checks tend to be broadcast and if one CPU
	 * goes into machine check the others follow quickly, so we don't
	 * expect a machine check to cause undue problems during code
	 * patching.
	 */

	/*
	 * Paravirt patching and alternative patching can be combined to
	 * replace a function call with a short direct code sequence (e.g.
	 * by setting a constant return value instead of doing that in an
	 * external function).
	 * In order to make this work the following sequence is required:
	 * 1. set (artificial) features depending on used paravirt
	 *    functions which can later influence alternative patching
	 * 2. apply paravirt patching (generally replacing an indirect
	 *    function call with a direct one)
	 * 3. apply alternative patching (e.g. replacing a direct function
	 *    call with a custom code sequence)
	 * Doing paravirt patching after alternative patching would clobber
	 * the optimization of the custom code with a function call again.
	 */
	paravirt_set_cap();

	/*
	 * First patch paravirt functions, such that we overwrite the indirect
	 * call with the direct call.
	 */
	apply_paravirt(__parainstructions, __parainstructions_end);

	/*
	 * Rewrite the retpolines, must be done before alternatives since
	 * those can rewrite the retpoline thunks.
	 */
	apply_retpolines(__retpoline_sites, __retpoline_sites_end);
	apply_returns(__return_sites, __return_sites_end);

	/*
	 * Then patch alternatives, such that those paravirt calls that are in
	 * alternatives can be overwritten by their immediate fragments.
	 */
	apply_alternatives(__alt_instructions, __alt_instructions_end);

	apply_ibt_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end);

#ifdef CONFIG_SMP
	/* Patch to UP if other cpus not imminent. */
	if (!noreplace_smp && (num_present_cpus() == 1 || setup_max_cpus <= 1)) {
		uniproc_patched = true;
		alternatives_smp_module_add(NULL, "core kernel",
					    __smp_locks, __smp_locks_end,
					    _text, _etext);
	}

	if (!uniproc_patched || num_possible_cpus() == 1) {
		free_init_pages("SMP alternatives",
				(unsigned long)__smp_locks,
				(unsigned long)__smp_locks_end);
	}
#endif

	restart_nmi();
	alternatives_patched = 1;
}

/**
 * text_poke_early - Update instructions on a live kernel at boot time
 * @addr: address to modify
 * @opcode: source of the copy
 * @len: length to copy
 *
 * When you use this code to patch more than one byte of an instruction
 * you need to make sure that other CPUs cannot execute this code in parallel.
 * Also no thread must be currently preempted in the middle of these
 * instructions.
 * And on the local CPU you need to be protected against NMI or MCE handlers
 * seeing an inconsistent instruction while you patch.
 */
void __init_or_module text_poke_early(void *addr, const void *opcode,
				      size_t len)
{
	unsigned long flags;

	if (boot_cpu_has(X86_FEATURE_NX) &&
	    is_module_text_address((unsigned long)addr)) {
		/*
		 * Modules text is marked initially as non-executable, so the
		 * code cannot be running and speculative code-fetches are
		 * prevented. Just change the code.
		 */
		memcpy(addr, opcode, len);
	} else {
		local_irq_save(flags);
		memcpy(addr, opcode, len);
		local_irq_restore(flags);
		sync_core();

		/*
		 * Could also do a CLFLUSH here to speed up CPU recovery; but
		 * that causes hangs on some VIA CPUs.
		 */
	}
}

typedef struct {
	struct mm_struct *mm;
} temp_mm_state_t;

/*
 * Using a temporary mm allows setting temporary mappings that are not
 * accessible by other CPUs. Such mappings are needed to perform sensitive
 * memory writes that override the kernel memory protections (e.g., W^X),
 * without exposing the temporary page-table mappings that are required for
 * these write operations to other CPUs. Using a temporary mm also avoids TLB
 * shootdowns when the mapping is torn down.
 *
 * Context: The temporary mm needs to be used exclusively by a single core. To
 *          harden security, IRQs must be disabled while the temporary mm is
 *          loaded, thereby preventing interrupt handler bugs from overriding
 *          the kernel memory protection.
 */
static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm)
{
	temp_mm_state_t temp_state;

	lockdep_assert_irqs_disabled();

	/*
	 * Make sure not to be in TLB lazy mode, as otherwise we'll end up
	 * with a stale address space WITHOUT being in lazy mode after
	 * restoring the previous mm.
	 */
	if (this_cpu_read(cpu_tlbstate_shared.is_lazy))
		leave_mm(smp_processor_id());

	temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm);
	switch_mm_irqs_off(NULL, mm, current);

	/*
	 * If breakpoints are enabled, disable them while the temporary mm is
	 * used. Userspace might set up watchpoints on addresses that are used
	 * in the temporary mm, which would lead to wrong signals being sent or
	 * crashes.
	 *
	 * Note that breakpoints are not disabled selectively, which also causes
	 * kernel breakpoints (e.g., perf's) to be disabled. This might be
	 * undesirable, but still seems reasonable as the code that runs in the
	 * temporary mm should be short.
	 */
	if (hw_breakpoint_active())
		hw_breakpoint_disable();

	return temp_state;
}

static inline void unuse_temporary_mm(temp_mm_state_t prev_state)
{
	lockdep_assert_irqs_disabled();
	switch_mm_irqs_off(NULL, prev_state.mm, current);

	/*
	 * Restore the breakpoints if they were disabled before the temporary mm
	 * was loaded.
	 */
	if (hw_breakpoint_active())
		hw_breakpoint_restore();
}

__ro_after_init struct mm_struct *poking_mm;
__ro_after_init unsigned long poking_addr;

static void text_poke_memcpy(void *dst, const void *src, size_t len)
{
	memcpy(dst, src, len);
}

static void text_poke_memset(void *dst, const void *src, size_t len)
{
	int c = *(const int *)src;

	memset(dst, c, len);
}

typedef void text_poke_f(void *dst, const void *src, size_t len);

static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t len)
{
	bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE;
	struct page *pages[2] = {NULL};
	temp_mm_state_t prev;
	unsigned long flags;
	pte_t pte, *ptep;
	spinlock_t *ptl;
	pgprot_t pgprot;

	/*
	 * While boot memory allocator is running we cannot use struct pages as
	 * they are not yet initialized. There is no way to recover.
	 */
	BUG_ON(!after_bootmem);

	if (!core_kernel_text((unsigned long)addr)) {
		pages[0] = vmalloc_to_page(addr);
		if (cross_page_boundary)
			pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
	} else {
		pages[0] = virt_to_page(addr);
		WARN_ON(!PageReserved(pages[0]));
		if (cross_page_boundary)
			pages[1] = virt_to_page(addr + PAGE_SIZE);
	}
	/*
	 * If something went wrong, crash and burn since recovery paths are not
	 * implemented.
	 */
	BUG_ON(!pages[0] || (cross_page_boundary && !pages[1]));

	/*
	 * Map the page without the global bit, as TLB flushing is done with
	 * flush_tlb_mm_range(), which is intended for non-global PTEs.
	 */
	pgprot = __pgprot(pgprot_val(PAGE_KERNEL) & ~_PAGE_GLOBAL);

	/*
	 * The lock is not really needed, but this avoids open-coding.
	 */
	ptep = get_locked_pte(poking_mm, poking_addr, &ptl);

	/*
	 * This must not fail; preallocated in poking_init().
	 */
	VM_BUG_ON(!ptep);

	local_irq_save(flags);

	pte = mk_pte(pages[0], pgprot);
	set_pte_at(poking_mm, poking_addr, ptep, pte);

	if (cross_page_boundary) {
		pte = mk_pte(pages[1], pgprot);
		set_pte_at(poking_mm, poking_addr + PAGE_SIZE, ptep + 1, pte);
	}

	/*
	 * Loading the temporary mm behaves as a compiler barrier, which
	 * guarantees that the PTE will be set at the time memcpy() is done.
	 */
	prev = use_temporary_mm(poking_mm);

	kasan_disable_current();
	func((u8 *)poking_addr + offset_in_page(addr), src, len);
	kasan_enable_current();

	/*
	 * Ensure that the PTE is only cleared after the instructions of memcpy
	 * were issued by using a compiler barrier.
	 */
	barrier();

	pte_clear(poking_mm, poking_addr, ptep);
	if (cross_page_boundary)
		pte_clear(poking_mm, poking_addr + PAGE_SIZE, ptep + 1);

	/*
	 * Loading the previous page-table hierarchy requires a serializing
	 * instruction that already allows the core to see the updated version.
	 * Xen-PV is assumed to serialize execution in a similar manner.
	 */
	unuse_temporary_mm(prev);

	/*
	 * Flushing the TLB might involve IPIs, which would require enabled
	 * IRQs, but not if the mm is not used, as it is at this point.
	 */
	flush_tlb_mm_range(poking_mm, poking_addr, poking_addr +
			   (cross_page_boundary ? 2 : 1) * PAGE_SIZE,
			   PAGE_SHIFT, false);

	if (func == text_poke_memcpy) {
		/*
		 * If the text does not match what we just wrote then something is
		 * fundamentally screwy; there's nothing we can really do about that.
		 */
		BUG_ON(memcmp(addr, src, len));
	}

	local_irq_restore(flags);
	pte_unmap_unlock(ptep, ptl);
	return addr;
}

/**
 * text_poke - Update instructions on a live kernel
 * @addr: address to modify
 * @opcode: source of the copy
 * @len: length to copy
 *
 * Only atomic text poke/set should be allowed when not doing early patching.
 * It means the size must be writable atomically and the address must be aligned
 * in a way that permits an atomic write. It also makes sure we fit on a single
 * page.
 *
 * Note that the caller must ensure that if the modified code is part of a
 * module, the module would not be removed during poking. This can be achieved
 * by registering a module notifier, and ordering module removal and patching
 * through a mutex.
 */
void *text_poke(void *addr, const void *opcode, size_t len)
{
	lockdep_assert_held(&text_mutex);

	return __text_poke(text_poke_memcpy, addr, opcode, len);
}

/**
 * text_poke_kgdb - Update instructions on a live kernel by kgdb
 * @addr: address to modify
 * @opcode: source of the copy
 * @len: length to copy
 *
 * Only atomic text poke/set should be allowed when not doing early patching.
 * It means the size must be writable atomically and the address must be aligned
 * in a way that permits an atomic write. It also makes sure we fit on a single
 * page.
 *
 * Context: should only be used by kgdb, which ensures no other core is running,
 *          despite the fact it does not hold the text_mutex.
 */
void *text_poke_kgdb(void *addr, const void *opcode, size_t len)
{
	return __text_poke(text_poke_memcpy, addr, opcode, len);
}

/**
 * text_poke_copy - Copy instructions into (an unused part of) RX memory
 * @addr: address to modify
 * @opcode: source of the copy
 * @len: length to copy, could be more than 2x PAGE_SIZE
 *
 * Not safe against concurrent execution; useful for JITs to dump
 * new code blocks into unused regions of RX memory. Can be used in
 * conjunction with synchronize_rcu_tasks() to wait for existing
 * execution to quiesce after having made sure no existing function
 * pointers are live.
 */
void *text_poke_copy(void *addr, const void *opcode, size_t len)
{
	unsigned long start = (unsigned long)addr;
	size_t patched = 0;

	if (WARN_ON_ONCE(core_kernel_text(start)))
		return NULL;

	mutex_lock(&text_mutex);
	while (patched < len) {
		unsigned long ptr = start + patched;
		size_t s;

		s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched);

		__text_poke(text_poke_memcpy, (void *)ptr, opcode + patched, s);
		patched += s;
	}
	mutex_unlock(&text_mutex);
	return addr;
}

/**
 * text_poke_set - memset into (an unused part of) RX memory
 * @addr: address to modify
 * @c: the byte to fill the area with
 * @len: length to copy, could be more than 2x PAGE_SIZE
 *
 * This is useful to overwrite unused regions of RX memory with illegal
 * instructions.
 */
void *text_poke_set(void *addr, int c, size_t len)
{
	unsigned long start = (unsigned long)addr;
	size_t patched = 0;

	if (WARN_ON_ONCE(core_kernel_text(start)))
		return NULL;

	mutex_lock(&text_mutex);
	while (patched < len) {
		unsigned long ptr = start + patched;
		size_t s;

		s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched);

		__text_poke(text_poke_memset, (void *)ptr, (void *)&c, s);
		patched += s;
	}
	mutex_unlock(&text_mutex);
	return addr;
}

static void do_sync_core(void *info)
{
	sync_core();
}

void text_poke_sync(void)
{
	on_each_cpu(do_sync_core, NULL, 1);
}

struct text_poke_loc {
	/* addr := _stext + rel_addr */
	s32 rel_addr;
	s32 disp;
	u8 len;
	u8 opcode;
	const u8 text[POKE_MAX_OPCODE_SIZE];
	/* see text_poke_bp_batch() */
	u8 old;
};

struct bp_patching_desc {
	struct text_poke_loc *vec;
	int nr_entries;
	atomic_t refs;
};

static struct bp_patching_desc *bp_desc;

static __always_inline
struct bp_patching_desc *try_get_desc(struct bp_patching_desc **descp)
{
	/* rcu_dereference */
	struct bp_patching_desc *desc = __READ_ONCE(*descp);

	if (!desc || !arch_atomic_inc_not_zero(&desc->refs))
		return NULL;

	return desc;
}

static __always_inline void put_desc(struct bp_patching_desc *desc)
{
	smp_mb__before_atomic();
	arch_atomic_dec(&desc->refs);
}

static __always_inline void *text_poke_addr(struct text_poke_loc *tp)
{
	return _stext + tp->rel_addr;
}

static __always_inline int patch_cmp(const void *key, const void *elt)
{
	struct text_poke_loc *tp = (struct text_poke_loc *) elt;

	if (key < text_poke_addr(tp))
		return -1;
	if (key > text_poke_addr(tp))
		return 1;
	return 0;
}

noinstr int poke_int3_handler(struct pt_regs *regs)
{
	struct bp_patching_desc *desc;
	struct text_poke_loc *tp;
	int ret = 0;
	void *ip;

	if (user_mode(regs))
		return 0;

	/*
	 * Having observed our INT3 instruction, we now must observe
	 * bp_desc:
	 *
	 *	bp_desc = desc			INT3
	 *	WMB				RMB
	 *	write INT3			if (desc)
	 */
	smp_rmb();

	desc = try_get_desc(&bp_desc);
	if (!desc)
		return 0;

	/*
	 * Discount the INT3. See text_poke_bp_batch().
	 */
	ip = (void *) regs->ip - INT3_INSN_SIZE;

	/*
	 * Skip the binary search if there is a single member in the vector.
	 */
	if (unlikely(desc->nr_entries > 1)) {
		tp = __inline_bsearch(ip, desc->vec, desc->nr_entries,
				      sizeof(struct text_poke_loc),
				      patch_cmp);
		if (!tp)
			goto out_put;
	} else {
		tp = desc->vec;
		if (text_poke_addr(tp) != ip)
			goto out_put;
	}

	ip += tp->len;

	switch (tp->opcode) {
	case INT3_INSN_OPCODE:
		/*
		 * Someone poked an explicit INT3, they'll want to handle it,
		 * do not consume.
		 */
		goto out_put;

	case RET_INSN_OPCODE:
		int3_emulate_ret(regs);
		break;

	case CALL_INSN_OPCODE:
		int3_emulate_call(regs, (long)ip + tp->disp);
		break;

	case JMP32_INSN_OPCODE:
	case JMP8_INSN_OPCODE:
		int3_emulate_jmp(regs, (long)ip + tp->disp);
		break;

	default:
		BUG();
	}

	ret = 1;

out_put:
	put_desc(desc);
	return ret;
}

#define TP_VEC_MAX (PAGE_SIZE / sizeof(struct text_poke_loc))
static struct text_poke_loc tp_vec[TP_VEC_MAX];
static int tp_vec_nr;

/**
 * text_poke_bp_batch() -- update instructions on live kernel on SMP
 * @tp:			vector of instructions to patch
 * @nr_entries:		number of entries in the vector
 *
 * Modify multi-byte instruction by using int3 breakpoint on SMP.
 * We completely avoid stop_machine() here, and achieve the
 * synchronization using int3 breakpoint.
 *
 * The way it is done:
 *	- For each entry in the vector:
 *		- add an int3 trap to the address that will be patched
 *	- sync cores
 *	- For each entry in the vector:
 *		- update all but the first byte of the patched range
 *	- sync cores
 *	- For each entry in the vector:
 *		- replace the first byte (int3) by the first byte of
 *		  replacing opcode
 *	- sync cores
 */
static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries)
{
	struct bp_patching_desc desc = {
		.vec = tp,
		.nr_entries = nr_entries,
		.refs = ATOMIC_INIT(1),
	};
	unsigned char int3 = INT3_INSN_OPCODE;
	unsigned int i;
	int do_sync;

	lockdep_assert_held(&text_mutex);

	smp_store_release(&bp_desc, &desc); /* rcu_assign_pointer */

	/*
	 * Corresponding read barrier in int3 notifier for making sure the
	 * nr_entries and handler are correctly ordered wrt. patching.
	 */
	smp_wmb();

	/*
	 * First step: add an int3 trap to the address that will be patched.
	 */
	for (i = 0; i < nr_entries; i++) {
		tp[i].old = *(u8 *)text_poke_addr(&tp[i]);
		text_poke(text_poke_addr(&tp[i]), &int3, INT3_INSN_SIZE);
	}

	text_poke_sync();

	/*
	 * Second step: update all but the first byte of the patched range.
	 */
	for (do_sync = 0, i = 0; i < nr_entries; i++) {
		u8 old[POKE_MAX_OPCODE_SIZE] = { tp[i].old, };
		int len = tp[i].len;

		if (len - INT3_INSN_SIZE > 0) {
			memcpy(old + INT3_INSN_SIZE,
			       text_poke_addr(&tp[i]) + INT3_INSN_SIZE,
			       len - INT3_INSN_SIZE);
			text_poke(text_poke_addr(&tp[i]) + INT3_INSN_SIZE,
				  (const char *)tp[i].text + INT3_INSN_SIZE,
				  len - INT3_INSN_SIZE);
			do_sync++;
		}

		/*
		 * Emit a perf event to record the text poke, primarily to
		 * support Intel PT decoding which must walk the executable code
		 * to reconstruct the trace. The flow up to here is:
		 *   - write INT3 byte
		 *   - IPI-SYNC
		 *   - write instruction tail
		 * At this point the actual control flow will be through the
		 * INT3 and handler and not hit the old or new instruction.
		 * Intel PT outputs FUP/TIP packets for the INT3, so the flow
		 * can still be decoded. Subsequently:
		 *   - emit RECORD_TEXT_POKE with the new instruction
		 *   - IPI-SYNC
		 *   - write first byte
		 *   - IPI-SYNC
		 * So before the text poke event timestamp, the decoder will see
		 * either the old instruction flow or FUP/TIP of INT3.
		 * After the text poke event timestamp, the decoder will see
		 * either the new instruction flow or FUP/TIP of INT3. Thus
		 * decoders can use the timestamp as the point at which to
		 * modify the executable code.
		 * The old instruction is recorded so that the event can be
		 * processed forwards or backwards.
		 */
		perf_event_text_poke(text_poke_addr(&tp[i]), old, len,
				     tp[i].text, len);
	}

	if (do_sync) {
		/*
		 * According to Intel, this core syncing is very likely
		 * not necessary and we'd be safe even without it. But
		 * better safe than sorry (plus there's not only Intel).
		 */
		text_poke_sync();
	}

	/*
	 * Third step: replace the first byte (int3) by the first byte of
	 * replacing opcode.
	 */
	for (do_sync = 0, i = 0; i < nr_entries; i++) {
		if (tp[i].text[0] == INT3_INSN_OPCODE)
			continue;

		text_poke(text_poke_addr(&tp[i]), tp[i].text, INT3_INSN_SIZE);
		do_sync++;
	}

	if (do_sync)
		text_poke_sync();

	/*
	 * Remove and synchronize_rcu(), except we have a very primitive
	 * refcount based completion.
	 */
	WRITE_ONCE(bp_desc, NULL); /* RCU_INIT_POINTER */
	if (!atomic_dec_and_test(&desc.refs))
		atomic_cond_read_acquire(&desc.refs, !VAL);
}

static void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
			       const void *opcode, size_t len, const void *emulate)
{
	struct insn insn;
	int ret, i;

	memcpy((void *)tp->text, opcode, len);
	if (!emulate)
		emulate = opcode;

	ret = insn_decode_kernel(&insn, emulate);
	BUG_ON(ret < 0);

	tp->rel_addr = addr - (void *)_stext;
	tp->len = len;
	tp->opcode = insn.opcode.bytes[0];

	switch (tp->opcode) {
	case RET_INSN_OPCODE:
	case JMP32_INSN_OPCODE:
	case JMP8_INSN_OPCODE:
		/*
		 * Control flow instructions without implied execution of the
		 * next instruction can be padded with INT3.
		 */
		for (i = insn.length; i < len; i++)
			BUG_ON(tp->text[i] != INT3_INSN_OPCODE);
		break;

	default:
		BUG_ON(len != insn.length);
	}

	switch (tp->opcode) {
	case INT3_INSN_OPCODE:
	case RET_INSN_OPCODE:
		break;

	case CALL_INSN_OPCODE:
	case JMP32_INSN_OPCODE:
	case JMP8_INSN_OPCODE:
		tp->disp = insn.immediate.value;
		break;

	default: /* assume NOP */
		switch (len) {
		case 2: /* NOP2 -- emulate as JMP8+0 */
			BUG_ON(memcmp(emulate, x86_nops[len], len));
			tp->opcode = JMP8_INSN_OPCODE;
			tp->disp = 0;
			break;

		case 5: /* NOP5 -- emulate as JMP32+0 */
			BUG_ON(memcmp(emulate, x86_nops[len], len));
			tp->opcode = JMP32_INSN_OPCODE;
			tp->disp = 0;
			break;

		default: /* unknown instruction */
			BUG();
		}
		break;
	}
}

/*
 * We hard rely on the tp_vec being ordered; ensure this is so by flushing
 * early if needed.
 */
static bool tp_order_fail(void *addr)
{
	struct text_poke_loc *tp;

	if (!tp_vec_nr)
		return false;

	if (!addr) /* force */
		return true;

	tp = &tp_vec[tp_vec_nr - 1];
	if ((unsigned long)text_poke_addr(tp) > (unsigned long)addr)
		return true;

	return false;
}

static void text_poke_flush(void *addr)
{
	if (tp_vec_nr == TP_VEC_MAX || tp_order_fail(addr)) {
		text_poke_bp_batch(tp_vec, tp_vec_nr);
		tp_vec_nr = 0;
	}
}

void text_poke_finish(void)
{
	text_poke_flush(NULL);
}

void __ref text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate)
{
	struct text_poke_loc *tp;

	if (unlikely(system_state == SYSTEM_BOOTING)) {
		text_poke_early(addr, opcode, len);
		return;
	}

	text_poke_flush(addr);

	tp = &tp_vec[tp_vec_nr++];
	text_poke_loc_init(tp, addr, opcode, len, emulate);
}

/**
 * text_poke_bp() -- update instructions on live kernel on SMP
 * @addr:	address to patch
 * @opcode:	opcode of new instruction
 * @len:	length to copy
 * @emulate:	instruction to be emulated
 *
 * Update a single instruction with the vector on the stack, avoiding
 * dynamically allocated memory. This function should be used when it is
 * not possible to allocate memory.
 */
void __ref text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate)
{
	struct text_poke_loc tp;

	if (unlikely(system_state == SYSTEM_BOOTING)) {
		text_poke_early(addr, opcode, len);
		return;
	}

	text_poke_loc_init(&tp, addr, opcode, len, emulate);
	text_poke_bp_batch(&tp, 1);
}
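
/*
 * Usage sketch (illustrative only; 'site' and 'target' are placeholder
 * addresses, not identifiers from this file): a caller such as the jump
 * label or static call code patches a 5-byte site, while holding
 * text_mutex, roughly like so:
 *
 *	u8 insn[CALL_INSN_SIZE];
 *
 *	insn[0] = CALL_INSN_OPCODE;
 *	*(s32 *)&insn[1] = (long)target - (long)site - CALL_INSN_SIZE;
 *	text_poke_bp(site, insn, CALL_INSN_SIZE, NULL);
 *
 * The INT3 protocol in text_poke_bp_batch() then guarantees that other CPUs
 * never execute a partially written instruction.
 */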