#define pr_fmt(fmt) "SMP alternatives: " fmt

#include <linux/module.h>
#include <linux/sched.h>
#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/stringify.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/memory.h>
#include <linux/stop_machine.h>
#include <linux/slab.h>
#include <linux/kdebug.h>
#include <asm/alternative.h>
#include <asm/sections.h>
#include <asm/pgtable.h>
#include <asm/mce.h>
#include <asm/nmi.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
#include <asm/io.h>
#include <asm/fixmap.h>

#define MAX_PATCH_LEN (255-1)

static int __initdata_or_module debug_alternative;

static int __init debug_alt(char *str)
{
        debug_alternative = 1;
        return 1;
}
__setup("debug-alternative", debug_alt);

static int noreplace_smp;

static int __init setup_noreplace_smp(char *str)
{
        noreplace_smp = 1;
        return 1;
}
__setup("noreplace-smp", setup_noreplace_smp);

#ifdef CONFIG_PARAVIRT
static int __initdata_or_module noreplace_paravirt = 0;

static int __init setup_noreplace_paravirt(char *str)
{
        noreplace_paravirt = 1;
        return 1;
}
__setup("noreplace-paravirt", setup_noreplace_paravirt);
#endif

#define DPRINTK(fmt, ...)                               \
do {                                                    \
        if (debug_alternative)                          \
                printk(KERN_DEBUG fmt, ##__VA_ARGS__);  \
} while (0)

/*
 * Each GENERIC_NOPX is of X bytes, and defined as an array of bytes
 * that correspond to that nop. Getting from one nop to the next, we
 * add to the array the offset that is equal to the sum of all sizes of
 * nops preceding the one we are after.
 *
 * Note: The GENERIC_NOP5_ATOMIC is at the end, as it breaks the
 * nice symmetry of sizes of the previous nops.
 */
#if defined(GENERIC_NOP1) && !defined(CONFIG_X86_64)
static const unsigned char intelnops[] =
{
        GENERIC_NOP1,
        GENERIC_NOP2,
        GENERIC_NOP3,
        GENERIC_NOP4,
        GENERIC_NOP5,
        GENERIC_NOP6,
        GENERIC_NOP7,
        GENERIC_NOP8,
        GENERIC_NOP5_ATOMIC
};
static const unsigned char * const intel_nops[ASM_NOP_MAX+2] =
{
        NULL,
        intelnops,
        intelnops + 1,
        intelnops + 1 + 2,
        intelnops + 1 + 2 + 3,
        intelnops + 1 + 2 + 3 + 4,
        intelnops + 1 + 2 + 3 + 4 + 5,
        intelnops + 1 + 2 + 3 + 4 + 5 + 6,
        intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
        intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
};
#endif

#ifdef K8_NOP1
static const unsigned char k8nops[] =
{
        K8_NOP1,
        K8_NOP2,
        K8_NOP3,
        K8_NOP4,
        K8_NOP5,
        K8_NOP6,
        K8_NOP7,
        K8_NOP8,
        K8_NOP5_ATOMIC
};
static const unsigned char * const k8_nops[ASM_NOP_MAX+2] =
{
        NULL,
        k8nops,
        k8nops + 1,
        k8nops + 1 + 2,
        k8nops + 1 + 2 + 3,
        k8nops + 1 + 2 + 3 + 4,
        k8nops + 1 + 2 + 3 + 4 + 5,
        k8nops + 1 + 2 + 3 + 4 + 5 + 6,
        k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
        k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
};
#endif

#if defined(K7_NOP1) && !defined(CONFIG_X86_64)
static const unsigned char k7nops[] =
{
        K7_NOP1,
        K7_NOP2,
        K7_NOP3,
        K7_NOP4,
        K7_NOP5,
        K7_NOP6,
        K7_NOP7,
        K7_NOP8,
        K7_NOP5_ATOMIC
};
static const unsigned char * const k7_nops[ASM_NOP_MAX+2] =
{
        NULL,
        k7nops,
        k7nops + 1,
        k7nops + 1 + 2,
        k7nops + 1 + 2 + 3,
        k7nops + 1 + 2 + 3 + 4,
        k7nops + 1 + 2 + 3 + 4 + 5,
        k7nops + 1 + 2 + 3 + 4 + 5 + 6,
        k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
        k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
};
#endif

#ifdef P6_NOP1
static const unsigned char p6nops[] =
{
        P6_NOP1,
        P6_NOP2,
        P6_NOP3,
        P6_NOP4,
        P6_NOP5,
        P6_NOP6,
        P6_NOP7,
        P6_NOP8,
        P6_NOP5_ATOMIC
};
static const unsigned char * const p6_nops[ASM_NOP_MAX+2] =
{
        NULL,
        p6nops,
        p6nops + 1,
        p6nops + 1 + 2,
        p6nops + 1 + 2 + 3,
        p6nops + 1 + 2 + 3 + 4,
        p6nops + 1 + 2 + 3 + 4 + 5,
        p6nops + 1 + 2 + 3 + 4 + 5 + 6,
        p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
        p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
};
#endif

/* Initialize these to a safe default */
#ifdef CONFIG_X86_64
const unsigned char * const *ideal_nops = p6_nops;
#else
const unsigned char * const *ideal_nops = intel_nops;
#endif

void __init arch_init_ideal_nops(void)
{
        switch (boot_cpu_data.x86_vendor) {
        case X86_VENDOR_INTEL:
                /*
                 * Due to a decoder implementation quirk, some
                 * specific Intel CPUs actually perform better with
                 * the "k8_nops" than with the SDM-recommended NOPs.
                 */
                if (boot_cpu_data.x86 == 6 &&
                    boot_cpu_data.x86_model >= 0x0f &&
                    boot_cpu_data.x86_model != 0x1c &&
                    boot_cpu_data.x86_model != 0x26 &&
                    boot_cpu_data.x86_model != 0x27 &&
                    boot_cpu_data.x86_model < 0x30) {
                        ideal_nops = k8_nops;
                } else if (boot_cpu_has(X86_FEATURE_NOPL)) {
                        ideal_nops = p6_nops;
                } else {
#ifdef CONFIG_X86_64
                        ideal_nops = k8_nops;
#else
                        ideal_nops = intel_nops;
#endif
                }
                break;
        default:
#ifdef CONFIG_X86_64
                ideal_nops = k8_nops;
#else
                if (boot_cpu_has(X86_FEATURE_K8))
                        ideal_nops = k8_nops;
                else if (boot_cpu_has(X86_FEATURE_K7))
                        ideal_nops = k7_nops;
                else
                        ideal_nops = intel_nops;
#endif
        }
}

/* Use this to add nops to a buffer, then text_poke the whole buffer. */
static void __init_or_module add_nops(void *insns, unsigned int len)
{
        while (len > 0) {
                unsigned int noplen = len;
                if (noplen > ASM_NOP_MAX)
                        noplen = ASM_NOP_MAX;
                memcpy(insns, ideal_nops[noplen], noplen);
                insns += noplen;
                len -= noplen;
        }
}

extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
extern s32 __smp_locks[], __smp_locks_end[];
void *text_poke_early(void *addr, const void *opcode, size_t len);

/* Replace instructions with better alternatives for this CPU type.
   This runs before SMP is initialized to avoid SMP problems with
   self-modifying code. This implies that asymmetric systems where
   APs have fewer capabilities than the boot processor are not handled.
   Tough. Make sure you disable such features by hand. */

void __init_or_module apply_alternatives(struct alt_instr *start,
                                         struct alt_instr *end)
{
        struct alt_instr *a;
        u8 *instr, *replacement;
        u8 insnbuf[MAX_PATCH_LEN];

        DPRINTK("%s: alt table %p -> %p\n", __func__, start, end);
        /*
         * The scan order should be from start to end. A later scanned
         * alternative can overwrite previously scanned alternative code.
         * Some kernel functions (e.g. memcpy, memset, etc) use this order to
         * patch code.
         *
         * So be careful if you want to change the scan order to any other
         * order.
         */
        for (a = start; a < end; a++) {
                instr = (u8 *)&a->instr_offset + a->instr_offset;
                replacement = (u8 *)&a->repl_offset + a->repl_offset;
                BUG_ON(a->replacementlen > a->instrlen);
                BUG_ON(a->instrlen > sizeof(insnbuf));
                BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32);
                if (!boot_cpu_has(a->cpuid))
                        continue;

                memcpy(insnbuf, replacement, a->replacementlen);

                /* 0xe8 is a relative jump; fix the offset. */
                if (*insnbuf == 0xe8 && a->replacementlen == 5)
                        *(s32 *)(insnbuf + 1) += replacement - instr;

                add_nops(insnbuf + a->replacementlen,
                         a->instrlen - a->replacementlen);

                text_poke_early(instr, insnbuf, a->instrlen);
        }
}

#ifdef CONFIG_SMP

static void alternatives_smp_lock(const s32 *start, const s32 *end,
                                  u8 *text, u8 *text_end)
{
        const s32 *poff;

        mutex_lock(&text_mutex);
        for (poff = start; poff < end; poff++) {
                u8 *ptr = (u8 *)poff + *poff;

                if (!*poff || ptr < text || ptr >= text_end)
                        continue;
                /* turn DS segment override prefix into lock prefix */
                if (*ptr == 0x3e)
                        text_poke(ptr, ((unsigned char []){0xf0}), 1);
        }
        mutex_unlock(&text_mutex);
}

static void alternatives_smp_unlock(const s32 *start, const s32 *end,
                                    u8 *text, u8 *text_end)
{
        const s32 *poff;

        mutex_lock(&text_mutex);
        for (poff = start; poff < end; poff++) {
                u8 *ptr = (u8 *)poff + *poff;

                if (!*poff || ptr < text || ptr >= text_end)
                        continue;
                /* turn lock prefix into DS segment override prefix */
                if (*ptr == 0xf0)
                        text_poke(ptr, ((unsigned char []){0x3E}), 1);
        }
        mutex_unlock(&text_mutex);
}

struct smp_alt_module {
        /* what is this ??? */
        struct module *mod;
        char *name;

        /* ptrs to lock prefixes */
        const s32 *locks;
        const s32 *locks_end;

        /* .text segment, needed to avoid patching init code ;) */
        u8 *text;
        u8 *text_end;

        struct list_head next;
};
static LIST_HEAD(smp_alt_modules);
static DEFINE_MUTEX(smp_alt);
static bool uniproc_patched = false;    /* protected by smp_alt */

void __init_or_module alternatives_smp_module_add(struct module *mod,
                                                  char *name,
                                                  void *locks, void *locks_end,
                                                  void *text, void *text_end)
{
        struct smp_alt_module *smp;

        mutex_lock(&smp_alt);
        if (!uniproc_patched)
                goto unlock;

        if (num_possible_cpus() == 1)
                /* Don't bother remembering, we'll never have to undo it. */
                goto smp_unlock;

        smp = kzalloc(sizeof(*smp), GFP_KERNEL);
        if (NULL == smp)
                /* we'll run the (safe but slow) SMP code then ... */
                goto unlock;

        smp->mod = mod;
        smp->name = name;
        smp->locks = locks;
        smp->locks_end = locks_end;
        smp->text = text;
        smp->text_end = text_end;
        DPRINTK("%s: locks %p -> %p, text %p -> %p, name %s\n",
                __func__, smp->locks, smp->locks_end,
                smp->text, smp->text_end, smp->name);

        list_add_tail(&smp->next, &smp_alt_modules);
smp_unlock:
        alternatives_smp_unlock(locks, locks_end, text, text_end);
unlock:
        mutex_unlock(&smp_alt);
}

void __init_or_module alternatives_smp_module_del(struct module *mod)
{
        struct smp_alt_module *item;

        mutex_lock(&smp_alt);
        list_for_each_entry(item, &smp_alt_modules, next) {
                if (mod != item->mod)
                        continue;
                list_del(&item->next);
                kfree(item);
                break;
        }
        mutex_unlock(&smp_alt);
}

void alternatives_enable_smp(void)
{
        struct smp_alt_module *mod;

        /* Why bother if there are no other CPUs? */
        BUG_ON(num_possible_cpus() == 1);

        mutex_lock(&smp_alt);

        if (uniproc_patched) {
                pr_info("switching to SMP code\n");
                BUG_ON(num_online_cpus() != 1);
                clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
                clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
                list_for_each_entry(mod, &smp_alt_modules, next)
                        alternatives_smp_lock(mod->locks, mod->locks_end,
                                              mod->text, mod->text_end);
                uniproc_patched = false;
        }
        mutex_unlock(&smp_alt);
}

/* Return 1 if the address range is reserved for smp-alternatives */
int alternatives_text_reserved(void *start, void *end)
{
        struct smp_alt_module *mod;
        const s32 *poff;
        u8 *text_start = start;
        u8 *text_end = end;

        list_for_each_entry(mod, &smp_alt_modules, next) {
                if (mod->text > text_end || mod->text_end < text_start)
                        continue;
                for (poff = mod->locks; poff < mod->locks_end; poff++) {
                        const u8 *ptr = (const u8 *)poff + *poff;

                        if (text_start <= ptr && text_end > ptr)
                                return 1;
                }
        }

        return 0;
}
#endif

#ifdef CONFIG_PARAVIRT
void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
                                     struct paravirt_patch_site *end)
{
        struct paravirt_patch_site *p;
        char insnbuf[MAX_PATCH_LEN];

        if (noreplace_paravirt)
                return;

        for (p = start; p < end; p++) {
                unsigned int used;

                BUG_ON(p->len > MAX_PATCH_LEN);
                /* prep the buffer with the original instructions */
                memcpy(insnbuf, p->instr, p->len);
                used = pv_init_ops.patch(p->instrtype, p->clobbers, insnbuf,
                                         (unsigned long)p->instr, p->len);

                BUG_ON(used > p->len);

                /* Pad the rest with nops */
                add_nops(insnbuf + used, p->len - used);
                text_poke_early(p->instr, insnbuf, p->len);
        }
}
extern struct paravirt_patch_site __start_parainstructions[],
        __stop_parainstructions[];
#endif  /* CONFIG_PARAVIRT */

void __init alternative_instructions(void)
{
        /* The patching is not fully atomic, so try to avoid local
           interruptions that might execute the code being patched.
           Other CPUs are not running. */
        stop_nmi();

        /*
         * Don't stop machine check exceptions while patching.
         * MCEs only happen when something got corrupted and in this
         * case we must do something about the corruption.
         * Ignoring it is worse than an unlikely patching race.
         * Also machine checks tend to be broadcast and if one CPU
         * goes into machine check the others follow quickly, so we don't
         * expect a machine check to cause undue problems during code
         * patching.
         */

        apply_alternatives(__alt_instructions, __alt_instructions_end);

#ifdef CONFIG_SMP
        /* Patch to UP if other CPUs are not imminent. */
        if (!noreplace_smp && (num_present_cpus() == 1 || setup_max_cpus <= 1)) {
                uniproc_patched = true;
                alternatives_smp_module_add(NULL, "core kernel",
                                            __smp_locks, __smp_locks_end,
                                            _text, _etext);
        }

        if (!uniproc_patched || num_possible_cpus() == 1)
                free_init_pages("SMP alternatives",
                                (unsigned long)__smp_locks,
                                (unsigned long)__smp_locks_end);
#endif

        apply_paravirt(__parainstructions, __parainstructions_end);

        restart_nmi();
}

/**
 * text_poke_early - Update instructions on a live kernel at boot time
 * @addr: address to modify
 * @opcode: source of the copy
 * @len: length to copy
 *
 * When you use this code to patch more than one byte of an instruction
 * you need to make sure that other CPUs cannot execute this code in parallel.
 * Also no thread must be currently preempted in the middle of these
 * instructions. And on the local CPU you need to be protected against NMI
 * or MCE handlers seeing an inconsistent instruction while you patch.
 */
void *__init_or_module text_poke_early(void *addr, const void *opcode,
                                       size_t len)
{
        unsigned long flags;
        local_irq_save(flags);
        memcpy(addr, opcode, len);
        sync_core();
        local_irq_restore(flags);
        /* Could also do a CLFLUSH here to speed up CPU recovery; but
           that causes hangs on some VIA CPUs. */
        return addr;
}

/**
 * text_poke - Update instructions on a live kernel
 * @addr: address to modify
 * @opcode: source of the copy
 * @len: length to copy
 *
 * Only atomic text poke/set should be allowed when not doing early patching.
 * It means the size must be writable atomically and the address must be
 * aligned in a way that permits an atomic write. It also makes sure we fit
 * on a single page.
 *
 * Note: Must be called under text_mutex.
 */
void *text_poke(void *addr, const void *opcode, size_t len)
{
        unsigned long flags;
        char *vaddr;
        struct page *pages[2];
        int i;

        if (!core_kernel_text((unsigned long)addr)) {
                pages[0] = vmalloc_to_page(addr);
                pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
        } else {
                pages[0] = virt_to_page(addr);
                WARN_ON(!PageReserved(pages[0]));
                pages[1] = virt_to_page(addr + PAGE_SIZE);
        }
        BUG_ON(!pages[0]);
        local_irq_save(flags);
        set_fixmap(FIX_TEXT_POKE0, page_to_phys(pages[0]));
        if (pages[1])
                set_fixmap(FIX_TEXT_POKE1, page_to_phys(pages[1]));
        vaddr = (char *)fix_to_virt(FIX_TEXT_POKE0);
        memcpy(&vaddr[(unsigned long)addr & ~PAGE_MASK], opcode, len);
        clear_fixmap(FIX_TEXT_POKE0);
        if (pages[1])
                clear_fixmap(FIX_TEXT_POKE1);
        local_flush_tlb();
        sync_core();
        /* Could also do a CLFLUSH here to speed up CPU recovery; but
           that causes hangs on some VIA CPUs. */
        for (i = 0; i < len; i++)
                BUG_ON(((char *)addr)[i] != ((char *)opcode)[i]);
        local_irq_restore(flags);
        return addr;
}

static void do_sync_core(void *info)
{
        sync_core();
}

static bool bp_patching_in_progress;
static void *bp_int3_handler, *bp_int3_addr;

int poke_int3_handler(struct pt_regs *regs)
{
        /* bp_patching_in_progress */
        smp_rmb();

        if (likely(!bp_patching_in_progress))
                return 0;

        if (user_mode_vm(regs) || regs->ip != (unsigned long)bp_int3_addr)
                return 0;

        /* set up the specified breakpoint handler */
        regs->ip = (unsigned long) bp_int3_handler;

        return 1;
}

/**
 * text_poke_bp() -- update instructions on live kernel on SMP
 * @addr:    address to patch
 * @opcode:  opcode of new instruction
 * @len:     length to copy
 * @handler: address to jump to when the temporary breakpoint is hit
 *
 * Modify multi-byte instructions by using an int3 breakpoint on SMP.
 * We completely avoid stop_machine() here, and achieve the
 * synchronization using the int3 breakpoint.
 *
 * The way it is done:
 *      - add an int3 trap to the address that will be patched
 *      - sync cores
 *      - update all but the first byte of the patched range
 *      - sync cores
 *      - replace the first byte (int3) by the first byte of
 *        the replacing opcode
 *      - sync cores
 *
 * Note: must be called under text_mutex.
 */
void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
{
        unsigned char int3 = 0xcc;

        bp_int3_handler = handler;
        bp_int3_addr = (u8 *)addr + sizeof(int3);
        bp_patching_in_progress = true;
        /*
         * Corresponding read barrier in the int3 notifier for
         * making sure the in_progress flag is correctly ordered wrt.
         * patching.
         */
        smp_wmb();

        text_poke(addr, &int3, sizeof(int3));

        on_each_cpu(do_sync_core, NULL, 1);

        if (len - sizeof(int3) > 0) {
                /* patch all but the first byte */
                text_poke((char *)addr + sizeof(int3),
                          (const char *) opcode + sizeof(int3),
                          len - sizeof(int3));
                /*
                 * According to Intel, this core syncing is very likely
                 * not necessary and we'd be safe even without it. But
                 * better safe than sorry (plus there's not only Intel).
                 */
                on_each_cpu(do_sync_core, NULL, 1);
        }

        /* patch the first byte */
        text_poke(addr, opcode, sizeof(int3));

        on_each_cpu(do_sync_core, NULL, 1);

        bp_patching_in_progress = false;
        smp_wmb();

        return addr;
}
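
/*
 * Illustrative sketch (not part of the original file): roughly how a caller
 * such as the jump-label or ftrace code might use text_poke_bp() above to
 * turn a 5-byte NOP at a known site into a 5-byte relative jump. The names
 * "example_patch_jump", "addr", "target" and "jmp_insn" are made up for this
 * example; only text_poke_bp() and text_mutex come from the code above.
 * Kept under #if 0 so it is never compiled into the kernel.
 */
#if 0
static void example_patch_jump(void *addr, void *target)
{
        unsigned char jmp_insn[5];
        /* rel32 of a near JMP is measured from the end of the instruction */
        s32 rel = (s32)((long)target - ((long)addr + sizeof(jmp_insn)));

        jmp_insn[0] = 0xe9;                     /* JMP rel32 opcode */
        memcpy(&jmp_insn[1], &rel, sizeof(rel));

        mutex_lock(&text_mutex);                /* text_poke_bp() requires it */
        /*
         * One safe choice of @handler: if another CPU hits the temporary
         * int3 while we patch, have it behave like the old 5-byte NOP and
         * simply continue right after the patched site.
         */
        text_poke_bp(addr, jmp_insn, sizeof(jmp_insn),
                     (u8 *)addr + sizeof(jmp_insn));
        mutex_unlock(&text_mutex);
}
#endif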