// SPDX-License-Identifier: GPL-2.0-only
#define pr_fmt(fmt) "SMP alternatives: " fmt

#include <linux/module.h>
#include <linux/sched.h>
#include <linux/perf_event.h>
#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/stringify.h>
#include <linux/highmem.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/memory.h>
#include <linux/stop_machine.h>
#include <linux/slab.h>
#include <linux/kdebug.h>
#include <linux/kprobes.h>
#include <linux/mmu_context.h>
#include <linux/bsearch.h>
#include <linux/sync_core.h>
#include <asm/text-patching.h>
#include <asm/alternative.h>
#include <asm/sections.h>
#include <asm/mce.h>
#include <asm/nmi.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
#include <asm/insn.h>
#include <asm/io.h>
#include <asm/fixmap.h>
#include <asm/paravirt.h>
#include <asm/asm-prototypes.h>

int __read_mostly alternatives_patched;

EXPORT_SYMBOL_GPL(alternatives_patched);

#define MAX_PATCH_LEN (255-1)

static int __initdata_or_module debug_alternative;

static int __init debug_alt(char *str)
{
	debug_alternative = 1;
	return 1;
}
__setup("debug-alternative", debug_alt);

static int noreplace_smp;

static int __init setup_noreplace_smp(char *str)
{
	noreplace_smp = 1;
	return 1;
}
__setup("noreplace-smp", setup_noreplace_smp);

#define DPRINTK(fmt, args...)						\
do {									\
	if (debug_alternative)						\
		printk(KERN_DEBUG pr_fmt(fmt) "\n", ##args);		\
} while (0)

#define DUMP_BYTES(buf, len, fmt, args...)				\
do {									\
	if (unlikely(debug_alternative)) {				\
		int j;							\
									\
		if (!(len))						\
			break;						\
									\
		printk(KERN_DEBUG pr_fmt(fmt), ##args);			\
		for (j = 0; j < (len) - 1; j++)				\
			printk(KERN_CONT "%02hhx ", buf[j]);		\
		printk(KERN_CONT "%02hhx\n", buf[j]);			\
	}								\
} while (0)

static const unsigned char x86nops[] =
{
	BYTES_NOP1,
	BYTES_NOP2,
	BYTES_NOP3,
	BYTES_NOP4,
	BYTES_NOP5,
	BYTES_NOP6,
	BYTES_NOP7,
	BYTES_NOP8,
};

const unsigned char * const x86_nops[ASM_NOP_MAX+1] =
{
	NULL,
	x86nops,
	x86nops + 1,
	x86nops + 1 + 2,
	x86nops + 1 + 2 + 3,
	x86nops + 1 + 2 + 3 + 4,
	x86nops + 1 + 2 + 3 + 4 + 5,
	x86nops + 1 + 2 + 3 + 4 + 5 + 6,
	x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
};

/* Use this to add nops to a buffer, then text_poke the whole buffer. */
static void __init_or_module add_nops(void *insns, unsigned int len)
{
	while (len > 0) {
		unsigned int noplen = len;
		if (noplen > ASM_NOP_MAX)
			noplen = ASM_NOP_MAX;
		memcpy(insns, x86_nops[noplen], noplen);
		insns += noplen;
		len -= noplen;
	}
}
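
/*
 * Note that add_nops() greedily uses the longest NOPs available: an 11-byte
 * hole, for example, becomes one 8-byte NOP followed by one 3-byte NOP rather
 * than eleven single-byte 0x90s, which keeps the padded range cheap to decode.
 */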

extern s32 __retpoline_sites[], __retpoline_sites_end[];
extern s32 __return_sites[], __return_sites_end[];
extern s32 __ibt_endbr_seal[], __ibt_endbr_seal_end[];
extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
extern s32 __smp_locks[], __smp_locks_end[];
void text_poke_early(void *addr, const void *opcode, size_t len);

/*
 * Are we looking at a near JMP with a 1 or 4-byte displacement.
 */
static inline bool is_jmp(const u8 opcode)
{
	return opcode == 0xeb || opcode == 0xe9;
}

static void __init_or_module
recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insn_buff)
{
	u8 *next_rip, *tgt_rip;
	s32 n_dspl, o_dspl;
	int repl_len;

	if (a->replacementlen != 5)
		return;

	o_dspl = *(s32 *)(insn_buff + 1);

	/* next_rip of the replacement JMP */
	next_rip = repl_insn + a->replacementlen;
	/* target rip of the replacement JMP */
	tgt_rip = next_rip + o_dspl;
	n_dspl = tgt_rip - orig_insn;

	DPRINTK("target RIP: %px, new_displ: 0x%x", tgt_rip, n_dspl);

	if (tgt_rip - orig_insn >= 0) {
		if (n_dspl - 2 <= 127)
			goto two_byte_jmp;
		else
			goto five_byte_jmp;
	/* negative offset */
	} else {
		if (((n_dspl - 2) & 0xff) == (n_dspl - 2))
			goto two_byte_jmp;
		else
			goto five_byte_jmp;
	}

two_byte_jmp:
	n_dspl -= 2;

	insn_buff[0] = 0xeb;
	insn_buff[1] = (s8)n_dspl;
	add_nops(insn_buff + 2, 3);

	repl_len = 2;
	goto done;

five_byte_jmp:
	n_dspl -= 5;

	insn_buff[0] = 0xe9;
	*(s32 *)&insn_buff[1] = n_dspl;

	repl_len = 5;

done:

	DPRINTK("final displ: 0x%08x, JMP 0x%lx",
		n_dspl, (unsigned long)orig_insn + n_dspl + repl_len);
}

/*
 * optimize_nops_range() - Optimize a sequence of single byte NOPs (0x90)
 *
 * @instr: instruction byte stream
 * @instrlen: length of the above
 * @off: offset within @instr where the first NOP has been detected
 *
 * Return: number of NOPs found (and replaced).
 */
static __always_inline int optimize_nops_range(u8 *instr, u8 instrlen, int off)
{
	unsigned long flags;
	int i = off, nnops;

	while (i < instrlen) {
		if (instr[i] != 0x90)
			break;

		i++;
	}

	nnops = i - off;

	if (nnops <= 1)
		return nnops;

	local_irq_save(flags);
	add_nops(instr + off, nnops);
	local_irq_restore(flags);

	DUMP_BYTES(instr, instrlen, "%px: [%d:%d) optimized NOPs: ", instr, off, i);

	return nnops;
}

/*
 * "noinline" to cause control flow change and thus invalidate I$ and
 * cause refetch after modification.
 */
static void __init_or_module noinline optimize_nops(u8 *instr, size_t len)
{
	struct insn insn;
	int i = 0;

	/*
	 * Jump over the non-NOP insns and optimize single-byte NOPs into bigger
	 * ones.
	 */
	for (;;) {
		if (insn_decode_kernel(&insn, &instr[i]))
			return;

		/*
		 * See if this and any potentially following NOPs can be
		 * optimized.
		 */
		if (insn.length == 1 && insn.opcode.bytes[0] == 0x90)
			i += optimize_nops_range(instr, len, i);
		else
			i += insn.length;

		if (i >= len)
			return;
	}
}
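
/*
 * Each struct alt_instr entry stores the original and replacement code
 * locations as s32 offsets relative to the fields themselves (see the
 * "(u8 *)&a->instr_offset + a->instr_offset" arithmetic below), which keeps
 * the .altinstructions section position independent.
 */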

/*
 * Replace instructions with better alternatives for this CPU type. This runs
 * before SMP is initialized to avoid SMP problems with self modifying code.
 * This implies that asymmetric systems where APs have fewer capabilities than
 * the boot processor are not handled. Tough. Make sure you disable such
 * features by hand.
 *
 * Marked "noinline" to cause control flow change and thus insn cache
 * to refetch changed I$ lines.
 */
void __init_or_module noinline apply_alternatives(struct alt_instr *start,
						  struct alt_instr *end)
{
	struct alt_instr *a;
	u8 *instr, *replacement;
	u8 insn_buff[MAX_PATCH_LEN];

	DPRINTK("alt table %px, -> %px", start, end);
	/*
	 * The scan order should be from start to end. A later scanned
	 * alternative code can overwrite previously scanned alternative code.
	 * Some kernel functions (e.g. memcpy, memset, etc) use this order to
	 * patch code.
	 *
	 * So be careful if you want to change the scan order to any other
	 * order.
	 */
	for (a = start; a < end; a++) {
		int insn_buff_sz = 0;
		/* Mask away "NOT" flag bit for feature to test. */
		u16 feature = a->cpuid & ~ALTINSTR_FLAG_INV;

		instr = (u8 *)&a->instr_offset + a->instr_offset;
		replacement = (u8 *)&a->repl_offset + a->repl_offset;
		BUG_ON(a->instrlen > sizeof(insn_buff));
		BUG_ON(feature >= (NCAPINTS + NBUGINTS) * 32);

		/*
		 * Patch if either:
		 * - feature is present
		 * - feature not present but ALTINSTR_FLAG_INV is set to mean,
		 *   patch if feature is *NOT* present.
		 */
		if (!boot_cpu_has(feature) == !(a->cpuid & ALTINSTR_FLAG_INV))
			goto next;

		DPRINTK("feat: %s%d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d)",
			(a->cpuid & ALTINSTR_FLAG_INV) ? "!" : "",
			feature >> 5,
			feature & 0x1f,
			instr, instr, a->instrlen,
			replacement, a->replacementlen);

		DUMP_BYTES(instr, a->instrlen, "%px: old_insn: ", instr);
		DUMP_BYTES(replacement, a->replacementlen, "%px: rpl_insn: ", replacement);

		memcpy(insn_buff, replacement, a->replacementlen);
		insn_buff_sz = a->replacementlen;

		/*
		 * 0xe8 is a relative CALL; fix the offset.
		 *
		 * Instruction length is checked before the opcode to avoid
		 * accessing uninitialized bytes for zero-length replacements.
		 */
		if (a->replacementlen == 5 && *insn_buff == 0xe8) {
			*(s32 *)(insn_buff + 1) += replacement - instr;
			DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx",
				*(s32 *)(insn_buff + 1),
				(unsigned long)instr + *(s32 *)(insn_buff + 1) + 5);
		}

		if (a->replacementlen && is_jmp(replacement[0]))
			recompute_jump(a, instr, replacement, insn_buff);

		for (; insn_buff_sz < a->instrlen; insn_buff_sz++)
			insn_buff[insn_buff_sz] = 0x90;

		DUMP_BYTES(insn_buff, insn_buff_sz, "%px: final_insn: ", instr);

		text_poke_early(instr, insn_buff, insn_buff_sz);

next:
		optimize_nops(instr, a->instrlen);
	}
}

#if defined(CONFIG_RETPOLINE) && defined(CONFIG_OBJTOOL)

/*
 * CALL/JMP *%\reg
 */
static int emit_indirect(int op, int reg, u8 *bytes)
{
	int i = 0;
	u8 modrm;

	switch (op) {
	case CALL_INSN_OPCODE:
		modrm = 0x10; /* Reg = 2; CALL r/m */
		break;

	case JMP32_INSN_OPCODE:
		modrm = 0x20; /* Reg = 4; JMP r/m */
		break;

	default:
		WARN_ON_ONCE(1);
		return -1;
	}

	if (reg >= 8) {
		bytes[i++] = 0x41; /* REX.B prefix */
		reg -= 8;
	}

	modrm |= 0xc0; /* Mod = 3 */
	modrm += reg;

	bytes[i++] = 0xff; /* opcode */
	bytes[i++] = modrm;

	return i;
}
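
/*
 * For instance, emit_indirect(CALL_INSN_OPCODE, 11, bytes) encodes
 * "CALL *%r11" as 41 ff d3: REX.B for the high register, the 0xff opcode,
 * and ModRM 0xd3 (Mod = 3, Reg = 2 for CALL, R/M = 3 for r11 - 8).
 */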
385 * 386 * For example, convert: 387 * 388 * CALL __x86_indirect_thunk_\reg 389 * 390 * into: 391 * 392 * CALL *%\reg 393 * 394 * It also tries to inline spectre_v2=retpoline,lfence when size permits. 395 */ 396 static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes) 397 { 398 retpoline_thunk_t *target; 399 int reg, ret, i = 0; 400 u8 op, cc; 401 402 target = addr + insn->length + insn->immediate.value; 403 reg = target - __x86_indirect_thunk_array; 404 405 if (WARN_ON_ONCE(reg & ~0xf)) 406 return -1; 407 408 /* If anyone ever does: CALL/JMP *%rsp, we're in deep trouble. */ 409 BUG_ON(reg == 4); 410 411 if (cpu_feature_enabled(X86_FEATURE_RETPOLINE) && 412 !cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) 413 return -1; 414 415 op = insn->opcode.bytes[0]; 416 417 /* 418 * Convert: 419 * 420 * Jcc.d32 __x86_indirect_thunk_\reg 421 * 422 * into: 423 * 424 * Jncc.d8 1f 425 * [ LFENCE ] 426 * JMP *%\reg 427 * [ NOP ] 428 * 1: 429 */ 430 /* Jcc.d32 second opcode byte is in the range: 0x80-0x8f */ 431 if (op == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80) { 432 cc = insn->opcode.bytes[1] & 0xf; 433 cc ^= 1; /* invert condition */ 434 435 bytes[i++] = 0x70 + cc; /* Jcc.d8 */ 436 bytes[i++] = insn->length - 2; /* sizeof(Jcc.d8) == 2 */ 437 438 /* Continue as if: JMP.d32 __x86_indirect_thunk_\reg */ 439 op = JMP32_INSN_OPCODE; 440 } 441 442 /* 443 * For RETPOLINE_LFENCE: prepend the indirect CALL/JMP with an LFENCE. 444 */ 445 if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) { 446 bytes[i++] = 0x0f; 447 bytes[i++] = 0xae; 448 bytes[i++] = 0xe8; /* LFENCE */ 449 } 450 451 ret = emit_indirect(op, reg, bytes + i); 452 if (ret < 0) 453 return ret; 454 i += ret; 455 456 for (; i < insn->length;) 457 bytes[i++] = BYTES_NOP1; 458 459 return i; 460 } 461 462 /* 463 * Generated by 'objtool --retpoline'. 464 */ 465 void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) 466 { 467 s32 *s; 468 469 for (s = start; s < end; s++) { 470 void *addr = (void *)s + *s; 471 struct insn insn; 472 int len, ret; 473 u8 bytes[16]; 474 u8 op1, op2; 475 476 ret = insn_decode_kernel(&insn, addr); 477 if (WARN_ON_ONCE(ret < 0)) 478 continue; 479 480 op1 = insn.opcode.bytes[0]; 481 op2 = insn.opcode.bytes[1]; 482 483 switch (op1) { 484 case CALL_INSN_OPCODE: 485 case JMP32_INSN_OPCODE: 486 break; 487 488 case 0x0f: /* escape */ 489 if (op2 >= 0x80 && op2 <= 0x8f) 490 break; 491 fallthrough; 492 default: 493 WARN_ON_ONCE(1); 494 continue; 495 } 496 497 DPRINTK("retpoline at: %pS (%px) len: %d to: %pS", 498 addr, addr, insn.length, 499 addr + insn.length + insn.immediate.value); 500 501 len = patch_retpoline(addr, &insn, bytes); 502 if (len == insn.length) { 503 optimize_nops(bytes, len); 504 DUMP_BYTES(((u8*)addr), len, "%px: orig: ", addr); 505 DUMP_BYTES(((u8*)bytes), len, "%px: repl: ", addr); 506 text_poke_early(addr, bytes, len); 507 } 508 } 509 } 510 511 #ifdef CONFIG_RETHUNK 512 /* 513 * Rewrite the compiler generated return thunk tail-calls. 
514 * 515 * For example, convert: 516 * 517 * JMP __x86_return_thunk 518 * 519 * into: 520 * 521 * RET 522 */ 523 static int patch_return(void *addr, struct insn *insn, u8 *bytes) 524 { 525 int i = 0; 526 527 if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) 528 return -1; 529 530 bytes[i++] = RET_INSN_OPCODE; 531 532 for (; i < insn->length;) 533 bytes[i++] = INT3_INSN_OPCODE; 534 535 return i; 536 } 537 538 void __init_or_module noinline apply_returns(s32 *start, s32 *end) 539 { 540 s32 *s; 541 542 for (s = start; s < end; s++) { 543 void *dest = NULL, *addr = (void *)s + *s; 544 struct insn insn; 545 int len, ret; 546 u8 bytes[16]; 547 u8 op; 548 549 ret = insn_decode_kernel(&insn, addr); 550 if (WARN_ON_ONCE(ret < 0)) 551 continue; 552 553 op = insn.opcode.bytes[0]; 554 if (op == JMP32_INSN_OPCODE) 555 dest = addr + insn.length + insn.immediate.value; 556 557 if (__static_call_fixup(addr, op, dest) || 558 WARN_ON_ONCE(dest != &__x86_return_thunk)) 559 continue; 560 561 DPRINTK("return thunk at: %pS (%px) len: %d to: %pS", 562 addr, addr, insn.length, 563 addr + insn.length + insn.immediate.value); 564 565 len = patch_return(addr, &insn, bytes); 566 if (len == insn.length) { 567 DUMP_BYTES(((u8*)addr), len, "%px: orig: ", addr); 568 DUMP_BYTES(((u8*)bytes), len, "%px: repl: ", addr); 569 text_poke_early(addr, bytes, len); 570 } 571 } 572 } 573 #else 574 void __init_or_module noinline apply_returns(s32 *start, s32 *end) { } 575 #endif /* CONFIG_RETHUNK */ 576 577 #else /* !CONFIG_RETPOLINE || !CONFIG_OBJTOOL */ 578 579 void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { } 580 void __init_or_module noinline apply_returns(s32 *start, s32 *end) { } 581 582 #endif /* CONFIG_RETPOLINE && CONFIG_OBJTOOL */ 583 584 #ifdef CONFIG_X86_KERNEL_IBT 585 586 /* 587 * Generated by: objtool --ibt 588 */ 589 void __init_or_module noinline apply_ibt_endbr(s32 *start, s32 *end) 590 { 591 s32 *s; 592 593 for (s = start; s < end; s++) { 594 u32 endbr, poison = gen_endbr_poison(); 595 void *addr = (void *)s + *s; 596 597 if (WARN_ON_ONCE(get_kernel_nofault(endbr, addr))) 598 continue; 599 600 if (WARN_ON_ONCE(!is_endbr(endbr))) 601 continue; 602 603 DPRINTK("ENDBR at: %pS (%px)", addr, addr); 604 605 /* 606 * When we have IBT, the lack of ENDBR will trigger #CP 607 */ 608 DUMP_BYTES(((u8*)addr), 4, "%px: orig: ", addr); 609 DUMP_BYTES(((u8*)&poison), 4, "%px: repl: ", addr); 610 text_poke_early(addr, &poison, 4); 611 } 612 } 613 614 #else 615 616 void __init_or_module noinline apply_ibt_endbr(s32 *start, s32 *end) { } 617 618 #endif /* CONFIG_X86_KERNEL_IBT */ 619 620 #ifdef CONFIG_SMP 621 static void alternatives_smp_lock(const s32 *start, const s32 *end, 622 u8 *text, u8 *text_end) 623 { 624 const s32 *poff; 625 626 for (poff = start; poff < end; poff++) { 627 u8 *ptr = (u8 *)poff + *poff; 628 629 if (!*poff || ptr < text || ptr >= text_end) 630 continue; 631 /* turn DS segment override prefix into lock prefix */ 632 if (*ptr == 0x3e) 633 text_poke(ptr, ((unsigned char []){0xf0}), 1); 634 } 635 } 636 637 static void alternatives_smp_unlock(const s32 *start, const s32 *end, 638 u8 *text, u8 *text_end) 639 { 640 const s32 *poff; 641 642 for (poff = start; poff < end; poff++) { 643 u8 *ptr = (u8 *)poff + *poff; 644 645 if (!*poff || ptr < text || ptr >= text_end) 646 continue; 647 /* turn lock prefix into DS segment override prefix */ 648 if (*ptr == 0xf0) 649 text_poke(ptr, ((unsigned char []){0x3E}), 1); 650 } 651 } 652 653 struct smp_alt_module { 654 /* what is this ??? 

struct smp_alt_module {
	/* owning module, NULL for the core kernel */
	struct module	*mod;
	char		*name;

	/* ptrs to lock prefixes */
	const s32	*locks;
	const s32	*locks_end;

	/* .text segment, needed to avoid patching init code ;) */
	u8		*text;
	u8		*text_end;

	struct list_head next;
};
static LIST_HEAD(smp_alt_modules);
static bool uniproc_patched = false;	/* protected by text_mutex */

void __init_or_module alternatives_smp_module_add(struct module *mod,
						  char *name,
						  void *locks, void *locks_end,
						  void *text, void *text_end)
{
	struct smp_alt_module *smp;

	mutex_lock(&text_mutex);
	if (!uniproc_patched)
		goto unlock;

	if (num_possible_cpus() == 1)
		/* Don't bother remembering, we'll never have to undo it. */
		goto smp_unlock;

	smp = kzalloc(sizeof(*smp), GFP_KERNEL);
	if (!smp)
		/* we'll run the (safe but slow) SMP code then ... */
		goto unlock;

	smp->mod	= mod;
	smp->name	= name;
	smp->locks	= locks;
	smp->locks_end	= locks_end;
	smp->text	= text;
	smp->text_end	= text_end;
	DPRINTK("locks %p -> %p, text %p -> %p, name %s",
		smp->locks, smp->locks_end,
		smp->text, smp->text_end, smp->name);

	list_add_tail(&smp->next, &smp_alt_modules);
smp_unlock:
	alternatives_smp_unlock(locks, locks_end, text, text_end);
unlock:
	mutex_unlock(&text_mutex);
}

void __init_or_module alternatives_smp_module_del(struct module *mod)
{
	struct smp_alt_module *item;

	mutex_lock(&text_mutex);
	list_for_each_entry(item, &smp_alt_modules, next) {
		if (mod != item->mod)
			continue;
		list_del(&item->next);
		kfree(item);
		break;
	}
	mutex_unlock(&text_mutex);
}

void alternatives_enable_smp(void)
{
	struct smp_alt_module *mod;

	/* Why bother if there are no other CPUs? */
	BUG_ON(num_possible_cpus() == 1);

	mutex_lock(&text_mutex);

	if (uniproc_patched) {
		pr_info("switching to SMP code\n");
		BUG_ON(num_online_cpus() != 1);
		clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
		clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
		list_for_each_entry(mod, &smp_alt_modules, next)
			alternatives_smp_lock(mod->locks, mod->locks_end,
					      mod->text, mod->text_end);
		uniproc_patched = false;
	}
	mutex_unlock(&text_mutex);
}
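
/*
 * alternatives_enable_smp() runs while only the boot CPU is online (see the
 * BUG_ON above) but a second CPU is about to come up, so the LOCK prefixes
 * are restored before any other CPU can execute the affected code.
 */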

/*
 * Return 1 if the address range is reserved for SMP-alternatives.
 * Must hold text_mutex.
 */
int alternatives_text_reserved(void *start, void *end)
{
	struct smp_alt_module *mod;
	const s32 *poff;
	u8 *text_start = start;
	u8 *text_end = end;

	lockdep_assert_held(&text_mutex);

	list_for_each_entry(mod, &smp_alt_modules, next) {
		if (mod->text > text_end || mod->text_end < text_start)
			continue;
		for (poff = mod->locks; poff < mod->locks_end; poff++) {
			const u8 *ptr = (const u8 *)poff + *poff;

			if (text_start <= ptr && text_end > ptr)
				return 1;
		}
	}

	return 0;
}
#endif /* CONFIG_SMP */

#ifdef CONFIG_PARAVIRT
void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
				     struct paravirt_patch_site *end)
{
	struct paravirt_patch_site *p;
	char insn_buff[MAX_PATCH_LEN];

	for (p = start; p < end; p++) {
		unsigned int used;

		BUG_ON(p->len > MAX_PATCH_LEN);
		/* prep the buffer with the original instructions */
		memcpy(insn_buff, p->instr, p->len);
		used = paravirt_patch(p->type, insn_buff, (unsigned long)p->instr, p->len);

		BUG_ON(used > p->len);

		/* Pad the rest with nops */
		add_nops(insn_buff + used, p->len - used);
		text_poke_early(p->instr, insn_buff, p->len);
	}
}
extern struct paravirt_patch_site __start_parainstructions[],
	__stop_parainstructions[];
#endif	/* CONFIG_PARAVIRT */

/*
 * Self-test for the INT3 based CALL emulation code.
 *
 * This exercises int3_emulate_call() to make sure INT3 pt_regs are set up
 * properly and that there is a stack gap between the INT3 frame and the
 * previous context. Without this gap doing a virtual PUSH on the interrupted
 * stack would corrupt the INT3 IRET frame.
 *
 * See entry_{32,64}.S for more details.
 */

/*
 * We define the int3_magic() function in assembly to control the calling
 * convention such that we can 'call' it from assembly.
 */

extern void int3_magic(unsigned int *ptr); /* defined in asm */

asm (
"	.pushsection	.init.text, \"ax\", @progbits\n"
"	.type		int3_magic, @function\n"
"int3_magic:\n"
	ANNOTATE_NOENDBR
"	movl	$1, (%" _ASM_ARG1 ")\n"
	ASM_RET
"	.size		int3_magic, .-int3_magic\n"
"	.popsection\n"
);

extern void int3_selftest_ip(void); /* defined in asm below */

static int __init
int3_exception_notify(struct notifier_block *self, unsigned long val, void *data)
{
	unsigned long selftest = (unsigned long)&int3_selftest_ip;
	struct die_args *args = data;
	struct pt_regs *regs = args->regs;

	OPTIMIZER_HIDE_VAR(selftest);

	if (!regs || user_mode(regs))
		return NOTIFY_DONE;

	if (val != DIE_INT3)
		return NOTIFY_DONE;

	if (regs->ip - INT3_INSN_SIZE != selftest)
		return NOTIFY_DONE;

	int3_emulate_call(regs, (unsigned long)&int3_magic);
	return NOTIFY_STOP;
}
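
/*
 * When the notifier above matches the selftest INT3, int3_emulate_call()
 * rewrites the saved pt_regs so that the exception return "calls"
 * int3_magic(&val): it pushes a return address onto the interrupted stack
 * and points regs->ip at int3_magic, which then stores 1 through its
 * argument register.
 */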

/* Must be noinline to ensure uniqueness of int3_selftest_ip. */
static noinline void __init int3_selftest(void)
{
	static __initdata struct notifier_block int3_exception_nb = {
		.notifier_call	= int3_exception_notify,
		.priority	= INT_MAX-1, /* last */
	};
	unsigned int val = 0;

	BUG_ON(register_die_notifier(&int3_exception_nb));

	/*
	 * Basically: int3_magic(&val); but really complicated :-)
	 *
	 * INT3 padded with NOP to CALL_INSN_SIZE. The int3_exception_nb
	 * notifier above will emulate CALL for us.
	 */
	asm volatile ("int3_selftest_ip:\n\t"
		      ANNOTATE_NOENDBR
		      "    int3; nop; nop; nop; nop\n\t"
		      : ASM_CALL_CONSTRAINT
		      : __ASM_SEL_RAW(a, D) (&val)
		      : "memory");

	BUG_ON(val != 1);

	unregister_die_notifier(&int3_exception_nb);
}

void __init alternative_instructions(void)
{
	int3_selftest();

	/*
	 * The patching is not fully atomic, so try to avoid local
	 * interruptions that might execute the code being patched.
	 * Other CPUs are not running.
	 */
	stop_nmi();

	/*
	 * Don't stop machine check exceptions while patching.
	 * MCEs only happen when something got corrupted and in this
	 * case we must do something about the corruption.
	 * Ignoring it is worse than an unlikely patching race.
	 * Also machine checks tend to be broadcast and if one CPU
	 * goes into machine check the others follow quickly, so we don't
	 * expect a machine check to cause undue problems during code
	 * patching.
	 */

	/*
	 * Paravirt patching and alternative patching can be combined to
	 * replace a function call with a short direct code sequence (e.g.
	 * by setting a constant return value instead of doing that in an
	 * external function).
	 * In order to make this work the following sequence is required:
	 * 1. set (artificial) features depending on used paravirt
	 *    functions which can later influence alternative patching
	 * 2. apply paravirt patching (generally replacing an indirect
	 *    function call with a direct one)
	 * 3. apply alternative patching (e.g. replacing a direct function
	 *    call with a custom code sequence)
	 * Doing paravirt patching after alternative patching would clobber
	 * the optimization of the custom code with a function call again.
	 */
	paravirt_set_cap();

	/*
	 * First patch paravirt functions, such that we overwrite the indirect
	 * call with the direct call.
	 */
	apply_paravirt(__parainstructions, __parainstructions_end);

	/*
	 * Rewrite the retpolines, must be done before alternatives since
	 * those can rewrite the retpoline thunks.
	 */
	apply_retpolines(__retpoline_sites, __retpoline_sites_end);
	apply_returns(__return_sites, __return_sites_end);

	/*
	 * Then patch alternatives, such that those paravirt calls that are in
	 * alternatives can be overwritten by their immediate fragments.
	 */
	apply_alternatives(__alt_instructions, __alt_instructions_end);

	apply_ibt_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end);

#ifdef CONFIG_SMP
	/* Patch to UP if other cpus not imminent. */
	if (!noreplace_smp && (num_present_cpus() == 1 || setup_max_cpus <= 1)) {
		uniproc_patched = true;
		alternatives_smp_module_add(NULL, "core kernel",
					    __smp_locks, __smp_locks_end,
					    _text, _etext);
	}

	if (!uniproc_patched || num_possible_cpus() == 1) {
		free_init_pages("SMP alternatives",
				(unsigned long)__smp_locks,
				(unsigned long)__smp_locks_end);
	}
#endif

	restart_nmi();
	alternatives_patched = 1;
}
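
/*
 * The __smp_locks section is freed above once it can no longer be needed:
 * either the kernel kept the SMP LOCK prefixes (nothing to restore later) or
 * only one CPU is possible (the UP patching will never be undone).
 */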

/**
 * text_poke_early - Update instructions on a live kernel at boot time
 * @addr: address to modify
 * @opcode: source of the copy
 * @len: length to copy
 *
 * When you use this code to patch more than one byte of an instruction
 * you need to make sure that other CPUs cannot execute this code in parallel.
 * Also no thread must be currently preempted in the middle of these
 * instructions. And on the local CPU you need to be protected against NMI or
 * MCE handlers seeing an inconsistent instruction while you patch.
 */
void __init_or_module text_poke_early(void *addr, const void *opcode,
				      size_t len)
{
	unsigned long flags;

	if (boot_cpu_has(X86_FEATURE_NX) &&
	    is_module_text_address((unsigned long)addr)) {
		/*
		 * Modules text is marked initially as non-executable, so the
		 * code cannot be running and speculative code-fetches are
		 * prevented. Just change the code.
		 */
		memcpy(addr, opcode, len);
	} else {
		local_irq_save(flags);
		memcpy(addr, opcode, len);
		local_irq_restore(flags);
		sync_core();

		/*
		 * Could also do a CLFLUSH here to speed up CPU recovery; but
		 * that causes hangs on some VIA CPUs.
		 */
	}
}

typedef struct {
	struct mm_struct *mm;
} temp_mm_state_t;

/*
 * Using a temporary mm makes it possible to set temporary mappings that are
 * not accessible by other CPUs. Such mappings are needed to perform sensitive
 * memory writes that override the kernel memory protections (e.g., W^X),
 * without exposing the temporary page-table mappings that are required for
 * these write operations to other CPUs. Using a temporary mm also avoids TLB
 * shootdowns when the mapping is torn down.
 *
 * Context: The temporary mm needs to be used exclusively by a single core. To
 *          harden security, IRQs must be disabled while the temporary mm is
 *          loaded, thereby preventing interrupt handler bugs from overriding
 *          the kernel memory protection.
 */
static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm)
{
	temp_mm_state_t temp_state;

	lockdep_assert_irqs_disabled();

	/*
	 * Make sure not to be in TLB lazy mode, as otherwise we'll end up
	 * with a stale address space WITHOUT being in lazy mode after
	 * restoring the previous mm.
	 */
	if (this_cpu_read(cpu_tlbstate_shared.is_lazy))
		leave_mm(smp_processor_id());

	temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm);
	switch_mm_irqs_off(NULL, mm, current);

	/*
	 * If breakpoints are enabled, disable them while the temporary mm is
	 * used. Userspace might set up watchpoints on addresses that are used
	 * in the temporary mm, which would lead to wrong signals being sent or
	 * crashes.
	 *
	 * Note that breakpoints are not disabled selectively, which also causes
	 * kernel breakpoints (e.g., perf's) to be disabled. This might be
	 * undesirable, but still seems reasonable as the code that runs in the
	 * temporary mm should be short.
	 */
	if (hw_breakpoint_active())
		hw_breakpoint_disable();

	return temp_state;
}
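
/*
 * The pairing used by __text_poke() below is roughly:
 *
 *	local_irq_save(flags);
 *	prev = use_temporary_mm(poking_mm);
 *	... write through the poking_addr mapping ...
 *	unuse_temporary_mm(prev);
 *	local_irq_restore(flags);
 *
 * with IRQs staying disabled for the whole window.
 */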

static inline void unuse_temporary_mm(temp_mm_state_t prev_state)
{
	lockdep_assert_irqs_disabled();
	switch_mm_irqs_off(NULL, prev_state.mm, current);

	/*
	 * Restore the breakpoints if they were disabled before the temporary mm
	 * was loaded.
	 */
	if (hw_breakpoint_active())
		hw_breakpoint_restore();
}

__ro_after_init struct mm_struct *poking_mm;
__ro_after_init unsigned long poking_addr;

static void text_poke_memcpy(void *dst, const void *src, size_t len)
{
	memcpy(dst, src, len);
}

static void text_poke_memset(void *dst, const void *src, size_t len)
{
	int c = *(const int *)src;

	memset(dst, c, len);
}

typedef void text_poke_f(void *dst, const void *src, size_t len);
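
/*
 * __text_poke() below performs the actual write: it maps the target page(s)
 * writable at poking_addr inside poking_mm, switches to that mm with IRQs
 * off, runs @func through the alias mapping, then tears the PTEs down,
 * switches back and flushes the TLB of the temporary mm.
 */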

static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t len)
{
	bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE;
	struct page *pages[2] = {NULL};
	temp_mm_state_t prev;
	unsigned long flags;
	pte_t pte, *ptep;
	spinlock_t *ptl;
	pgprot_t pgprot;

	/*
	 * While boot memory allocator is running we cannot use struct pages as
	 * they are not yet initialized. There is no way to recover.
	 */
	BUG_ON(!after_bootmem);

	if (!core_kernel_text((unsigned long)addr)) {
		pages[0] = vmalloc_to_page(addr);
		if (cross_page_boundary)
			pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
	} else {
		pages[0] = virt_to_page(addr);
		WARN_ON(!PageReserved(pages[0]));
		if (cross_page_boundary)
			pages[1] = virt_to_page(addr + PAGE_SIZE);
	}
	/*
	 * If something went wrong, crash and burn since recovery paths are not
	 * implemented.
	 */
	BUG_ON(!pages[0] || (cross_page_boundary && !pages[1]));

	/*
	 * Map the page without the global bit, as TLB flushing is done with
	 * flush_tlb_mm_range(), which is intended for non-global PTEs.
	 */
	pgprot = __pgprot(pgprot_val(PAGE_KERNEL) & ~_PAGE_GLOBAL);

	/*
	 * The lock is not really needed, but using get_locked_pte() avoids
	 * open-coding the PTE lookup.
	 */
	ptep = get_locked_pte(poking_mm, poking_addr, &ptl);

	/*
	 * This must not fail; preallocated in poking_init().
	 */
	VM_BUG_ON(!ptep);

	local_irq_save(flags);

	pte = mk_pte(pages[0], pgprot);
	set_pte_at(poking_mm, poking_addr, ptep, pte);

	if (cross_page_boundary) {
		pte = mk_pte(pages[1], pgprot);
		set_pte_at(poking_mm, poking_addr + PAGE_SIZE, ptep + 1, pte);
	}

	/*
	 * Loading the temporary mm behaves as a compiler barrier, which
	 * guarantees that the PTE will be set at the time memcpy() is done.
	 */
	prev = use_temporary_mm(poking_mm);

	kasan_disable_current();
	func((u8 *)poking_addr + offset_in_page(addr), src, len);
	kasan_enable_current();

	/*
	 * Ensure that the PTE is only cleared after the instructions of memcpy
	 * were issued by using a compiler barrier.
	 */
	barrier();

	pte_clear(poking_mm, poking_addr, ptep);
	if (cross_page_boundary)
		pte_clear(poking_mm, poking_addr + PAGE_SIZE, ptep + 1);

	/*
	 * Loading the previous page-table hierarchy requires a serializing
	 * instruction that already allows the core to see the updated version.
	 * Xen-PV is assumed to serialize execution in a similar manner.
	 */
	unuse_temporary_mm(prev);

	/*
	 * Flushing the TLB might involve IPIs, which would require enabled
	 * IRQs, but not if the mm is not used, as is the case at this point.
	 */
	flush_tlb_mm_range(poking_mm, poking_addr, poking_addr +
			   (cross_page_boundary ? 2 : 1) * PAGE_SIZE,
			   PAGE_SHIFT, false);

	if (func == text_poke_memcpy) {
		/*
		 * If the text does not match what we just wrote then something is
		 * fundamentally screwy; there's nothing we can really do about that.
		 */
		BUG_ON(memcmp(addr, src, len));
	}

	local_irq_restore(flags);
	pte_unmap_unlock(ptep, ptl);
	return addr;
}

/**
 * text_poke - Update instructions on a live kernel
 * @addr: address to modify
 * @opcode: source of the copy
 * @len: length to copy
 *
 * Only atomic text poke/set should be allowed when not doing early patching.
 * It means the size must be writable atomically and the address must be aligned
 * in a way that permits an atomic write. It also makes sure we fit on a single
 * page.
 *
 * Note that the caller must ensure that if the modified code is part of a
 * module, the module would not be removed during poking. This can be achieved
 * by registering a module notifier, and ordering module removal and patching
 * through a mutex.
 */
void *text_poke(void *addr, const void *opcode, size_t len)
{
	lockdep_assert_held(&text_mutex);

	return __text_poke(text_poke_memcpy, addr, opcode, len);
}

/**
 * text_poke_kgdb - Update instructions on a live kernel by kgdb
 * @addr: address to modify
 * @opcode: source of the copy
 * @len: length to copy
 *
 * Only atomic text poke/set should be allowed when not doing early patching.
 * It means the size must be writable atomically and the address must be aligned
 * in a way that permits an atomic write. It also makes sure we fit on a single
 * page.
 *
 * Context: should only be used by kgdb, which ensures no other core is running,
 *          despite the fact it does not hold the text_mutex.
 */
void *text_poke_kgdb(void *addr, const void *opcode, size_t len)
{
	return __text_poke(text_poke_memcpy, addr, opcode, len);
}

/**
 * text_poke_copy - Copy instructions into (an unused part of) RX memory
 * @addr: address to modify
 * @opcode: source of the copy
 * @len: length to copy, could be more than 2x PAGE_SIZE
 *
 * Not safe against concurrent execution; useful for JITs to dump
 * new code blocks into unused regions of RX memory. Can be used in
 * conjunction with synchronize_rcu_tasks() to wait for existing
 * execution to quiesce after having made sure no existing function
 * pointers are live.
 */
void *text_poke_copy(void *addr, const void *opcode, size_t len)
{
	unsigned long start = (unsigned long)addr;
	size_t patched = 0;

	if (WARN_ON_ONCE(core_kernel_text(start)))
		return NULL;

	mutex_lock(&text_mutex);
	while (patched < len) {
		unsigned long ptr = start + patched;
		size_t s;

		s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched);

		__text_poke(text_poke_memcpy, (void *)ptr, opcode + patched, s);
		patched += s;
	}
	mutex_unlock(&text_mutex);
	return addr;
}
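
/*
 * Note that text_poke_copy() splits the write into chunks of at most two
 * pages minus the starting offset, because __text_poke() can only map two
 * consecutive pages at poking_addr at a time.
 */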

/**
 * text_poke_set - memset into (an unused part of) RX memory
 * @addr: address to modify
 * @c: the byte to fill the area with
 * @len: length to copy, could be more than 2x PAGE_SIZE
 *
 * This is useful to overwrite unused regions of RX memory with illegal
 * instructions.
 */
void *text_poke_set(void *addr, int c, size_t len)
{
	unsigned long start = (unsigned long)addr;
	size_t patched = 0;

	if (WARN_ON_ONCE(core_kernel_text(start)))
		return NULL;

	mutex_lock(&text_mutex);
	while (patched < len) {
		unsigned long ptr = start + patched;
		size_t s;

		s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched);

		__text_poke(text_poke_memset, (void *)ptr, (void *)&c, s);
		patched += s;
	}
	mutex_unlock(&text_mutex);
	return addr;
}

static void do_sync_core(void *info)
{
	sync_core();
}

void text_poke_sync(void)
{
	on_each_cpu(do_sync_core, NULL, 1);
}

struct text_poke_loc {
	/* addr := _stext + rel_addr */
	s32 rel_addr;
	s32 disp;
	u8 len;
	u8 opcode;
	const u8 text[POKE_MAX_OPCODE_SIZE];
	/* see text_poke_bp_batch() */
	u8 old;
};

struct bp_patching_desc {
	struct text_poke_loc *vec;
	int nr_entries;
	atomic_t refs;
};

static struct bp_patching_desc *bp_desc;

static __always_inline
struct bp_patching_desc *try_get_desc(struct bp_patching_desc **descp)
{
	/* rcu_dereference */
	struct bp_patching_desc *desc = __READ_ONCE(*descp);

	if (!desc || !arch_atomic_inc_not_zero(&desc->refs))
		return NULL;

	return desc;
}

static __always_inline void put_desc(struct bp_patching_desc *desc)
{
	smp_mb__before_atomic();
	arch_atomic_dec(&desc->refs);
}

static __always_inline void *text_poke_addr(struct text_poke_loc *tp)
{
	return _stext + tp->rel_addr;
}

static __always_inline int patch_cmp(const void *key, const void *elt)
{
	struct text_poke_loc *tp = (struct text_poke_loc *) elt;

	if (key < text_poke_addr(tp))
		return -1;
	if (key > text_poke_addr(tp))
		return 1;
	return 0;
}
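
/*
 * bp_desc points at a bp_patching_desc that lives on the stack of
 * text_poke_bp_batch(). The try_get_desc()/put_desc() refcount above keeps
 * the INT3 handler from using it after the batch function has cleared the
 * pointer and started waiting for the count to drop back to zero.
 */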

noinstr int poke_int3_handler(struct pt_regs *regs)
{
	struct bp_patching_desc *desc;
	struct text_poke_loc *tp;
	int ret = 0;
	void *ip;

	if (user_mode(regs))
		return 0;

	/*
	 * Having observed our INT3 instruction, we now must observe
	 * bp_desc:
	 *
	 *	bp_desc = desc			INT3
	 *	WMB				RMB
	 *	write INT3			if (desc)
	 */
	smp_rmb();

	desc = try_get_desc(&bp_desc);
	if (!desc)
		return 0;

	/*
	 * Discount the INT3. See text_poke_bp_batch().
	 */
	ip = (void *) regs->ip - INT3_INSN_SIZE;

	/*
	 * Skip the binary search if there is a single member in the vector.
	 */
	if (unlikely(desc->nr_entries > 1)) {
		tp = __inline_bsearch(ip, desc->vec, desc->nr_entries,
				      sizeof(struct text_poke_loc),
				      patch_cmp);
		if (!tp)
			goto out_put;
	} else {
		tp = desc->vec;
		if (text_poke_addr(tp) != ip)
			goto out_put;
	}

	ip += tp->len;

	switch (tp->opcode) {
	case INT3_INSN_OPCODE:
		/*
		 * Someone poked an explicit INT3, they'll want to handle it,
		 * do not consume.
		 */
		goto out_put;

	case RET_INSN_OPCODE:
		int3_emulate_ret(regs);
		break;

	case CALL_INSN_OPCODE:
		int3_emulate_call(regs, (long)ip + tp->disp);
		break;

	case JMP32_INSN_OPCODE:
	case JMP8_INSN_OPCODE:
		int3_emulate_jmp(regs, (long)ip + tp->disp);
		break;

	default:
		BUG();
	}

	ret = 1;

out_put:
	put_desc(desc);
	return ret;
}

#define TP_VEC_MAX (PAGE_SIZE / sizeof(struct text_poke_loc))
static struct text_poke_loc tp_vec[TP_VEC_MAX];
static int tp_vec_nr;
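
/*
 * Batching matters because every step of the INT3 dance below ends with a
 * text_poke_sync() IPI to every CPU; queueing up to TP_VEC_MAX sites first
 * amortizes those three global synchronizations over the whole vector.
 */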

/**
 * text_poke_bp_batch() -- update instructions on live kernel on SMP
 * @tp:			vector of instructions to patch
 * @nr_entries:		number of entries in the vector
 *
 * Modify multi-byte instructions by using an int3 breakpoint on SMP.
 * We completely avoid stop_machine() here, and achieve the
 * synchronization using an int3 breakpoint.
 *
 * The way it is done:
 *	- For each entry in the vector:
 *		- add an int3 trap to the address that will be patched
 *	- sync cores
 *	- For each entry in the vector:
 *		- update all but the first byte of the patched range
 *	- sync cores
 *	- For each entry in the vector:
 *		- replace the first byte (int3) by the first byte of
 *		  replacing opcode
 *	- sync cores
 */
static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries)
{
	struct bp_patching_desc desc = {
		.vec = tp,
		.nr_entries = nr_entries,
		.refs = ATOMIC_INIT(1),
	};
	unsigned char int3 = INT3_INSN_OPCODE;
	unsigned int i;
	int do_sync;

	lockdep_assert_held(&text_mutex);

	smp_store_release(&bp_desc, &desc); /* rcu_assign_pointer */

	/*
	 * Corresponding read barrier in int3 notifier for making sure the
	 * nr_entries and handler are correctly ordered wrt. patching.
	 */
	smp_wmb();

	/*
	 * First step: add an int3 trap to the address that will be patched.
	 */
	for (i = 0; i < nr_entries; i++) {
		tp[i].old = *(u8 *)text_poke_addr(&tp[i]);
		text_poke(text_poke_addr(&tp[i]), &int3, INT3_INSN_SIZE);
	}

	text_poke_sync();

	/*
	 * Second step: update all but the first byte of the patched range.
	 */
	for (do_sync = 0, i = 0; i < nr_entries; i++) {
		u8 old[POKE_MAX_OPCODE_SIZE] = { tp[i].old, };
		int len = tp[i].len;

		if (len - INT3_INSN_SIZE > 0) {
			memcpy(old + INT3_INSN_SIZE,
			       text_poke_addr(&tp[i]) + INT3_INSN_SIZE,
			       len - INT3_INSN_SIZE);
			text_poke(text_poke_addr(&tp[i]) + INT3_INSN_SIZE,
				  (const char *)tp[i].text + INT3_INSN_SIZE,
				  len - INT3_INSN_SIZE);
			do_sync++;
		}

		/*
		 * Emit a perf event to record the text poke, primarily to
		 * support Intel PT decoding which must walk the executable code
		 * to reconstruct the trace. The flow up to here is:
		 *   - write INT3 byte
		 *   - IPI-SYNC
		 *   - write instruction tail
		 * At this point the actual control flow will be through the
		 * INT3 and handler and not hit the old or new instruction.
		 * Intel PT outputs FUP/TIP packets for the INT3, so the flow
		 * can still be decoded. Subsequently:
		 *   - emit RECORD_TEXT_POKE with the new instruction
		 *   - IPI-SYNC
		 *   - write first byte
		 *   - IPI-SYNC
		 * So before the text poke event timestamp, the decoder will see
		 * either the old instruction flow or FUP/TIP of INT3. After the
		 * text poke event timestamp, the decoder will see either the
		 * new instruction flow or FUP/TIP of INT3. Thus decoders can
		 * use the timestamp as the point at which to modify the
		 * executable code.
		 * The old instruction is recorded so that the event can be
		 * processed forwards or backwards.
		 */
		perf_event_text_poke(text_poke_addr(&tp[i]), old, len,
				     tp[i].text, len);
	}

	if (do_sync) {
		/*
		 * According to Intel, this core syncing is very likely
		 * not necessary and we'd be safe even without it. But
		 * better safe than sorry (plus there's not only Intel).
		 */
		text_poke_sync();
	}

	/*
	 * Third step: replace the first byte (int3) by the first byte of
	 * replacing opcode.
	 */
	for (do_sync = 0, i = 0; i < nr_entries; i++) {
		if (tp[i].text[0] == INT3_INSN_OPCODE)
			continue;

		text_poke(text_poke_addr(&tp[i]), tp[i].text, INT3_INSN_SIZE);
		do_sync++;
	}

	if (do_sync)
		text_poke_sync();

	/*
	 * Remove the descriptor and wait for readers to drain; like
	 * synchronize_rcu(), except we have a very primitive refcount based
	 * completion.
	 */
	WRITE_ONCE(bp_desc, NULL); /* RCU_INIT_POINTER */
	if (!atomic_dec_and_test(&desc.refs))
		atomic_cond_read_acquire(&desc.refs, !VAL);
}

static void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
			       const void *opcode, size_t len, const void *emulate)
{
	struct insn insn;
	int ret, i;

	memcpy((void *)tp->text, opcode, len);
	if (!emulate)
		emulate = opcode;

	ret = insn_decode_kernel(&insn, emulate);
	BUG_ON(ret < 0);

	tp->rel_addr = addr - (void *)_stext;
	tp->len = len;
	tp->opcode = insn.opcode.bytes[0];

	switch (tp->opcode) {
	case RET_INSN_OPCODE:
	case JMP32_INSN_OPCODE:
	case JMP8_INSN_OPCODE:
		/*
		 * Control flow instructions without implied execution of the
		 * next instruction can be padded with INT3.
		 */
		for (i = insn.length; i < len; i++)
			BUG_ON(tp->text[i] != INT3_INSN_OPCODE);
		break;

	default:
		BUG_ON(len != insn.length);
	}


	switch (tp->opcode) {
	case INT3_INSN_OPCODE:
	case RET_INSN_OPCODE:
		break;

	case CALL_INSN_OPCODE:
	case JMP32_INSN_OPCODE:
	case JMP8_INSN_OPCODE:
		tp->disp = insn.immediate.value;
		break;

	default: /* assume NOP */
		switch (len) {
		case 2: /* NOP2 -- emulate as JMP8+0 */
			BUG_ON(memcmp(emulate, x86_nops[len], len));
			tp->opcode = JMP8_INSN_OPCODE;
			tp->disp = 0;
			break;

		case 5: /* NOP5 -- emulate as JMP32+0 */
			BUG_ON(memcmp(emulate, x86_nops[len], len));
			tp->opcode = JMP32_INSN_OPCODE;
			tp->disp = 0;
			break;

		default: /* unknown instruction */
			BUG();
		}
		break;
	}
}
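
/*
 * text_poke_loc_init() above records everything poke_int3_handler() needs to
 * emulate the new instruction while the site is still an INT3: the opcode,
 * the displacement and the length. Plain NOP2/NOP5 pokes are recorded as
 * jumps with zero displacement so the emulation path stays trivial.
 */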

/*
 * We rely hard on tp_vec being ordered; ensure this by flushing
 * early if needed.
 */
static bool tp_order_fail(void *addr)
{
	struct text_poke_loc *tp;

	if (!tp_vec_nr)
		return false;

	if (!addr) /* force */
		return true;

	tp = &tp_vec[tp_vec_nr - 1];
	if ((unsigned long)text_poke_addr(tp) > (unsigned long)addr)
		return true;

	return false;
}

static void text_poke_flush(void *addr)
{
	if (tp_vec_nr == TP_VEC_MAX || tp_order_fail(addr)) {
		text_poke_bp_batch(tp_vec, tp_vec_nr);
		tp_vec_nr = 0;
	}
}

void text_poke_finish(void)
{
	text_poke_flush(NULL);
}

void __ref text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate)
{
	struct text_poke_loc *tp;

	if (unlikely(system_state == SYSTEM_BOOTING)) {
		text_poke_early(addr, opcode, len);
		return;
	}

	text_poke_flush(addr);

	tp = &tp_vec[tp_vec_nr++];
	text_poke_loc_init(tp, addr, opcode, len, emulate);
}

/**
 * text_poke_bp() -- update instructions on live kernel on SMP
 * @addr:	address to patch
 * @opcode:	opcode of new instruction
 * @len:	length to copy
 * @emulate:	instruction to be emulated
 *
 * Update a single instruction with the vector in the stack, avoiding
 * dynamically allocated memory. This function should be used when it is
 * not possible to allocate memory.
 */
void __ref text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate)
{
	struct text_poke_loc tp;

	if (unlikely(system_state == SYSTEM_BOOTING)) {
		text_poke_early(addr, opcode, len);
		return;
	}

	text_poke_loc_init(&tp, addr, opcode, len, emulate);
	text_poke_bp_batch(&tp, 1);
}
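
/*
 * A minimal usage sketch (callers such as static_call or jump_label follow
 * this shape; the names here are illustrative only):
 *
 *	u8 jmp8[2] = { JMP8_INSN_OPCODE, disp8 };
 *
 *	mutex_lock(&text_mutex);
 *	text_poke_bp(site_addr, jmp8, sizeof(jmp8), NULL);
 *	mutex_unlock(&text_mutex);
 *
 * text_poke_queue()/text_poke_finish() provide the batched equivalent.
 */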