// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *
 * Common boot and setup code.
 *
 * Copyright (C) 2001 PPC64 Team, IBM Corp
 */

#include <linux/export.h>
#include <linux/string.h>
#include <linux/sched.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/reboot.h>
#include <linux/delay.h>
#include <linux/initrd.h>
#include <linux/seq_file.h>
#include <linux/ioport.h>
#include <linux/console.h>
#include <linux/utsname.h>
#include <linux/tty.h>
#include <linux/root_dev.h>
#include <linux/notifier.h>
#include <linux/cpu.h>
#include <linux/unistd.h>
#include <linux/serial.h>
#include <linux/serial_8250.h>
#include <linux/memblock.h>
#include <linux/pci.h>
#include <linux/lockdep.h>
#include <linux/memory.h>
#include <linux/nmi.h>

#include <asm/debugfs.h>
#include <asm/io.h>
#include <asm/kdump.h>
#include <asm/prom.h>
#include <asm/processor.h>
#include <asm/pgtable.h>
#include <asm/smp.h>
#include <asm/elf.h>
#include <asm/machdep.h>
#include <asm/paca.h>
#include <asm/time.h>
#include <asm/cputable.h>
#include <asm/dt_cpu_ftrs.h>
#include <asm/sections.h>
#include <asm/btext.h>
#include <asm/nvram.h>
#include <asm/setup.h>
#include <asm/rtas.h>
#include <asm/iommu.h>
#include <asm/serial.h>
#include <asm/cache.h>
#include <asm/page.h>
#include <asm/mmu.h>
#include <asm/firmware.h>
#include <asm/xmon.h>
#include <asm/udbg.h>
#include <asm/kexec.h>
#include <asm/code-patching.h>
#include <asm/livepatch.h>
#include <asm/opal.h>
#include <asm/cputhreads.h>
#include <asm/hw_irq.h>
#include <asm/feature-fixups.h>
#include <asm/kup.h>

#include "setup.h"

#ifdef DEBUG
#define DBG(fmt...) udbg_printf(fmt)
#else
#define DBG(fmt...)
#endif

int spinning_secondaries;
u64 ppc64_pft_size;

struct ppc64_caches ppc64_caches = {
	.l1d = {
		.block_size = 0x40,
		.log_block_size = 6,
	},
	.l1i = {
		.block_size = 0x40,
		.log_block_size = 6
	},
};
EXPORT_SYMBOL_GPL(ppc64_caches);

#if defined(CONFIG_PPC_BOOK3E) && defined(CONFIG_SMP)
void __init setup_tlb_core_data(void)
{
	int cpu;

	BUILD_BUG_ON(offsetof(struct tlb_core_data, lock) != 0);

	for_each_possible_cpu(cpu) {
		int first = cpu_first_thread_sibling(cpu);

		/*
		 * If we boot via kdump on a non-primary thread,
		 * make sure we point at the thread that actually
		 * set up this TLB.
		 */
		if (cpu_first_thread_sibling(boot_cpuid) == first)
			first = boot_cpuid;

		paca_ptrs[cpu]->tcd_ptr = &paca_ptrs[first]->tcd;

		/*
		 * If we have threads, we need either tlbsrx.
		 * or e6500 tablewalk mode, or else TLB handlers
		 * will be racy and could produce duplicate entries.
		 * Should we panic instead?
		 */
		WARN_ONCE(smt_enabled_at_boot >= 2 &&
			  !mmu_has_feature(MMU_FTR_USE_TLBRSRV) &&
			  book3e_htw_mode != PPC_HTW_E6500,
			  "%s: unsupported MMU configuration\n", __func__);
	}
}
#endif
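
/*
 * Note on the SMT handling below: smt_enabled_at_boot records how many
 * threads per core should be brought up at boot. It defaults to all
 * threads and can be overridden by the "smt-enabled=" command line
 * option or the "ibm,smt-enabled" OF property, and is consulted later
 * during SMP bringup when deciding whether to online secondary threads.
 */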

#ifdef CONFIG_SMP

static char *smt_enabled_cmdline;

/* Look for ibm,smt-enabled OF option */
void __init check_smt_enabled(void)
{
	struct device_node *dn;
	const char *smt_option;

	/* Default to enabling all threads */
	smt_enabled_at_boot = threads_per_core;

	/* Allow the command line to overrule the OF option */
	if (smt_enabled_cmdline) {
		if (!strcmp(smt_enabled_cmdline, "on"))
			smt_enabled_at_boot = threads_per_core;
		else if (!strcmp(smt_enabled_cmdline, "off"))
			smt_enabled_at_boot = 0;
		else {
			int smt;
			int rc;

			rc = kstrtoint(smt_enabled_cmdline, 10, &smt);
			if (!rc)
				smt_enabled_at_boot =
					min(threads_per_core, smt);
		}
	} else {
		dn = of_find_node_by_path("/options");
		if (dn) {
			smt_option = of_get_property(dn, "ibm,smt-enabled",
						     NULL);

			if (smt_option) {
				if (!strcmp(smt_option, "on"))
					smt_enabled_at_boot = threads_per_core;
				else if (!strcmp(smt_option, "off"))
					smt_enabled_at_boot = 0;
			}

			of_node_put(dn);
		}
	}
}

/* Look for smt-enabled= cmdline option */
static int __init early_smt_enabled(char *p)
{
	smt_enabled_cmdline = p;
	return 0;
}
early_param("smt-enabled", early_smt_enabled);

#endif /* CONFIG_SMP */

/** Fix up paca fields required for the boot cpu */
static void __init fixup_boot_paca(void)
{
	/* The boot cpu is started */
	get_paca()->cpu_start = 1;
	/* Allow percpu accesses to work until we setup percpu data */
	get_paca()->data_offset = 0;
	/* Mark interrupts disabled in PACA */
	irq_soft_mask_set(IRQS_DISABLED);
}

static void __init configure_exceptions(void)
{
	/*
	 * Setup the trampolines from the lowmem exception vectors
	 * to the kdump kernel when not using a relocatable kernel.
	 */
	setup_kdump_trampoline();

	/* Under a PAPR hypervisor, we need hypercalls */
	if (firmware_has_feature(FW_FEATURE_SET_MODE)) {
		/* Enable AIL if possible */
		pseries_enable_reloc_on_exc();

		/*
		 * Tell the hypervisor that we want our exceptions to
		 * be taken in little endian mode.
		 *
		 * We don't call this for big endian as our calling convention
		 * makes us always enter in BE, and the call may fail under
		 * some circumstances with kdump.
		 */
#ifdef __LITTLE_ENDIAN__
		pseries_little_endian_exceptions();
#endif
	} else {
		/* Set endian mode using OPAL */
		if (firmware_has_feature(FW_FEATURE_OPAL))
			opal_configure_cores();

		/* AIL on native is done in cpu_ready_for_interrupts() */
	}
}
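
/*
 * Background on AIL (Alternate Interrupt Location): with LPCR[AIL] = 3,
 * interrupts are taken with relocation on, so handlers run with the MMU
 * enabled rather than in real mode. Under a hypervisor this is requested
 * once for the partition in configure_exceptions() above; on bare metal
 * it is set per CPU in cpu_ready_for_interrupts() below.
 */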

static void cpu_ready_for_interrupts(void)
{
	/*
	 * Enable AIL if supported, and we are in hypervisor mode. This
	 * is called once for every processor.
	 *
	 * If we are not in hypervisor mode the job is done once for
	 * the whole partition in configure_exceptions().
	 */
	if (cpu_has_feature(CPU_FTR_HVMODE) &&
	    cpu_has_feature(CPU_FTR_ARCH_207S)) {
		unsigned long lpcr = mfspr(SPRN_LPCR);
		mtspr(SPRN_LPCR, lpcr | LPCR_AIL_3);
	}

	/*
	 * Set HFSCR:TM based on CPU features:
	 * In the special case of TM no suspend (P9N DD2.1), Linux is
	 * told TM is off via the dt-ftrs but told to (partially) use
	 * it via OPAL_REINIT_CPUS_TM_SUSPEND_DISABLED. So HFSCR[TM]
	 * will be off from dt-ftrs but we need to turn it on for the
	 * no suspend case.
	 */
	if (cpu_has_feature(CPU_FTR_HVMODE)) {
		if (cpu_has_feature(CPU_FTR_TM_COMP))
			mtspr(SPRN_HFSCR, mfspr(SPRN_HFSCR) | HFSCR_TM);
		else
			mtspr(SPRN_HFSCR, mfspr(SPRN_HFSCR) & ~HFSCR_TM);
	}

	/* Set IR and DR in PACA MSR */
	get_paca()->kernel_msr = MSR_KERNEL;
}

unsigned long spr_default_dscr = 0;

void __init record_spr_defaults(void)
{
	if (early_cpu_has_feature(CPU_FTR_DSCR))
		spr_default_dscr = mfspr(SPRN_DSCR);
}

/*
 * Early initialization entry point. This is called by head.S
 * with MMU translation disabled. We rely on the "feature" of
 * the CPU that ignores the top 2 bits of the address in real
 * mode so we can access kernel globals normally provided we
 * only toy with things in the RMO region. From here, we do
 * some early parsing of the device-tree to set up our MEMBLOCK
 * data structures, and allocate & initialize the hash table
 * and segment tables so we can start running with translation
 * enabled.
 *
 * It is this function which will call the probe() callback of
 * the various platform types and copy the matching one to the
 * global ppc_md structure. Your platform can eventually do
 * some very early initializations from the probe() routine, but
 * this is not recommended; be very careful as, for example, the
 * device-tree is not accessible via normal means at this point.
 */

void __init early_setup(unsigned long dt_ptr)
{
	static __initdata struct paca_struct boot_paca;

	/* -------- printk is _NOT_ safe to use here ! ------- */

	/* Try new device tree based feature discovery ... */
	if (!dt_cpu_ftrs_init(__va(dt_ptr)))
		/* Otherwise use the old style CPU table */
		identify_cpu(0, mfspr(SPRN_PVR));

	/* Assume we're on cpu 0 for now. Don't write to the paca yet! */
	initialise_paca(&boot_paca, 0);
	setup_paca(&boot_paca);
	fixup_boot_paca();

	/* -------- printk is now safe to use ------- */

	/* Enable early debugging if any specified (see udbg.h) */
	udbg_early_init();

	DBG(" -> early_setup(), dt_ptr: 0x%lx\n", dt_ptr);

	/*
	 * Do early initialization using the flattened device
	 * tree, such as retrieving the physical memory map or
	 * calculating/retrieving the hash table size.
	 */
	early_init_devtree(__va(dt_ptr));

	/* Now we know the logical id of our boot cpu, setup the paca. */
	if (boot_cpuid != 0) {
		/* Poison paca_ptrs[0] again if it's not the boot cpu */
		memset(&paca_ptrs[0], 0x88, sizeof(paca_ptrs[0]));
	}
	setup_paca(paca_ptrs[boot_cpuid]);
	fixup_boot_paca();
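
	/*
	 * From this point on we are running on the real paca of the boot
	 * CPU (paca_ptrs[boot_cpuid]) rather than the temporary boot_paca
	 * set up above.
	 */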

	/*
	 * Configure exception handlers. This includes setting up trampolines
	 * if needed, setting exception endian mode, etc...
	 */
	configure_exceptions();

	/*
	 * Configure Kernel Userspace Protection. This needs to happen before
	 * feature fixups for platforms that implement this using features.
	 */
	setup_kup();

	/* Apply all the dynamic patching */
	apply_feature_fixups();
	setup_feature_keys();

	/* Initialize the hash table or TLB handling */
	early_init_mmu();

	/*
	 * After firmware and early platform setup code has set things up,
	 * we note the SPR values for configurable control/performance
	 * registers, and use those as initial defaults.
	 */
	record_spr_defaults();

	/*
	 * At this point, we can let interrupts switch to virtual mode
	 * (the MMU has been set up), so adjust the MSR in the PACA to
	 * have IR and DR set and enable AIL if it exists.
	 */
	cpu_ready_for_interrupts();

	/*
	 * We enable ftrace here, but since we only support DYNAMIC_FTRACE, it
	 * will only actually get enabled on the boot cpu much later once
	 * ftrace itself has been initialized.
	 */
	this_cpu_enable_ftrace();

	DBG(" <- early_setup()\n");

#ifdef CONFIG_PPC_EARLY_DEBUG_BOOTX
	/*
	 * This needs to be done *last* (after the above DBG() even)
	 *
	 * Right after we return from this function, we turn on the MMU
	 * which means the real-mode access trick that btext does will
	 * no longer work; it needs to switch to using a real MMU
	 * mapping. This call ensures that it does.
	 */
	btext_map();
#endif /* CONFIG_PPC_EARLY_DEBUG_BOOTX */
}

#ifdef CONFIG_SMP
void early_setup_secondary(void)
{
	/* Mark interrupts disabled in PACA */
	irq_soft_mask_set(IRQS_DISABLED);

	/* Initialize the hash table or TLB handling */
	early_init_mmu_secondary();

	/* Perform any KUP setup that is per-cpu */
	setup_kup();

	/*
	 * At this point, we can let interrupts switch to virtual mode
	 * (the MMU has been set up), so adjust the MSR in the PACA to
	 * have IR and DR set.
	 */
	cpu_ready_for_interrupts();
}

#endif /* CONFIG_SMP */
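
/*
 * Arch override of the weak generic panic_smp_self_stop(): a CPU that
 * enters panic() while another CPU is already handling a panic parks
 * here in a low-priority spin loop with hard interrupts disabled.
 */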
void panic_smp_self_stop(void)
{
	hard_irq_disable();
	spin_begin();
	while (1)
		spin_cpu_relax();
}

#if defined(CONFIG_SMP) || defined(CONFIG_KEXEC_CORE)
static bool use_spinloop(void)
{
	if (IS_ENABLED(CONFIG_PPC_BOOK3S)) {
		/*
		 * See comments in head_64.S -- not all platforms insert
		 * secondaries at __secondary_hold and wait at the spin
		 * loop.
		 */
		if (firmware_has_feature(FW_FEATURE_OPAL))
			return false;
		return true;
	}

	/*
	 * When book3e boots from kexec, the ePAPR spin table does
	 * not get used.
	 */
	return of_property_read_bool(of_chosen, "linux,booted-from-kexec");
}

void smp_release_cpus(void)
{
	unsigned long *ptr;
	int i;

	if (!use_spinloop())
		return;

	DBG(" -> smp_release_cpus()\n");

	/* All secondary cpus are spinning on a common spinloop, release them
	 * all now so they can start to spin on their individual paca
	 * spinloops. For non SMP kernels, the secondary cpus never get out
	 * of the common spinloop.
	 */

	ptr = (unsigned long *)((unsigned long)&__secondary_hold_spinloop
			- PHYSICAL_START);
	*ptr = ppc_function_entry(generic_secondary_smp_init);

	/* And wait a bit for them to catch up */
	for (i = 0; i < 100000; i++) {
		mb();
		HMT_low();
		if (spinning_secondaries == 0)
			break;
		udelay(1);
	}
	DBG("spinning_secondaries = %d\n", spinning_secondaries);

	DBG(" <- smp_release_cpus()\n");
}
#endif /* CONFIG_SMP || CONFIG_KEXEC_CORE */

/*
 * Initialize some remaining members of the ppc64_caches and systemcfg
 * structures (at least until we get rid of them completely). This is
 * mostly some cache information about the CPU that will be used by
 * cache flush routines and/or provided to userland.
 */

static void init_cache_info(struct ppc_cache_info *info, u32 size, u32 lsize,
			    u32 bsize, u32 sets)
{
	info->size = size;
	info->sets = sets;
	info->line_size = lsize;
	info->block_size = bsize;
	info->log_block_size = __ilog2(bsize);
	if (bsize)
		info->blocks_per_page = PAGE_SIZE / bsize;
	else
		info->blocks_per_page = 0;

	if (sets == 0)
		info->assoc = 0xffff;
	else
		info->assoc = size / (sets * lsize);
}
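
/*
 * Read the geometry of one cache level from the device tree node 'np'.
 * The property names differ between I and D caches, hence the two name
 * tables below. A missing block size falls back to the line size, and
 * both default to cur_cpu_spec->dcache_bsize; returns false if the size,
 * block size or line size property is missing.
 */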
static bool __init parse_cache_info(struct device_node *np,
				    bool icache,
				    struct ppc_cache_info *info)
{
	static const char *ipropnames[] __initdata = {
		"i-cache-size",
		"i-cache-sets",
		"i-cache-block-size",
		"i-cache-line-size",
	};
	static const char *dpropnames[] __initdata = {
		"d-cache-size",
		"d-cache-sets",
		"d-cache-block-size",
		"d-cache-line-size",
	};
	const char **propnames = icache ? ipropnames : dpropnames;
	const __be32 *sizep, *lsizep, *bsizep, *setsp;
	u32 size, lsize, bsize, sets;
	bool success = true;

	size = 0;
	sets = -1u;
	lsize = bsize = cur_cpu_spec->dcache_bsize;
	sizep = of_get_property(np, propnames[0], NULL);
	if (sizep != NULL)
		size = be32_to_cpu(*sizep);
	setsp = of_get_property(np, propnames[1], NULL);
	if (setsp != NULL)
		sets = be32_to_cpu(*setsp);
	bsizep = of_get_property(np, propnames[2], NULL);
	lsizep = of_get_property(np, propnames[3], NULL);
	if (bsizep == NULL)
		bsizep = lsizep;
	if (lsizep != NULL)
		lsize = be32_to_cpu(*lsizep);
	if (bsizep != NULL)
		bsize = be32_to_cpu(*bsizep);
	if (sizep == NULL || bsizep == NULL || lsizep == NULL)
		success = false;

	/*
	 * OF is weird .. it represents fully associative caches
	 * as "1 way" which doesn't make much sense and doesn't
	 * leave room for direct mapped. We'll assume that 0
	 * in OF means direct mapped for that reason.
	 */
	if (sets == 1)
		sets = 0;
	else if (sets == 0)
		sets = 1;

	init_cache_info(info, size, lsize, bsize, sets);

	return success;
}

void __init initialize_cache_info(void)
{
	struct device_node *cpu = NULL, *l2, *l3 = NULL;
	u32 pvr;

	DBG(" -> initialize_cache_info()\n");

	/*
	 * All shipping POWER8 machines have a firmware bug that
	 * puts incorrect information in the device-tree. This will
	 * be (hopefully) fixed for future chips but for now hard
	 * code the values if we are running on one of these.
	 */
	pvr = PVR_VER(mfspr(SPRN_PVR));
	if (pvr == PVR_POWER8 || pvr == PVR_POWER8E ||
	    pvr == PVR_POWER8NVL) {
						/* size    lsize   blk  sets */
		init_cache_info(&ppc64_caches.l1i, 0x8000,   128,  128, 32);
		init_cache_info(&ppc64_caches.l1d, 0x10000,  128,  128, 64);
		init_cache_info(&ppc64_caches.l2,  0x80000,  128,  0,   512);
		init_cache_info(&ppc64_caches.l3,  0x800000, 128,  0,   8192);
	} else
		cpu = of_find_node_by_type(NULL, "cpu");

	/*
	 * We're assuming *all* of the CPUs have the same
	 * d-cache and i-cache sizes... -Peter
	 */
	if (cpu) {
		if (!parse_cache_info(cpu, false, &ppc64_caches.l1d))
			DBG("Argh, can't find dcache properties !\n");

		if (!parse_cache_info(cpu, true, &ppc64_caches.l1i))
			DBG("Argh, can't find icache properties !\n");

		/*
		 * Try to find the L2 and L3 if any. Assume they are
		 * unified and use the D-side properties.
		 */
		l2 = of_find_next_cache_node(cpu);
		of_node_put(cpu);
		if (l2) {
			parse_cache_info(l2, false, &ppc64_caches.l2);
			l3 = of_find_next_cache_node(l2);
			of_node_put(l2);
		}
		if (l3) {
			parse_cache_info(l3, false, &ppc64_caches.l3);
			of_node_put(l3);
		}
	}

	/* For use by binfmt_elf */
	dcache_bsize = ppc64_caches.l1d.block_size;
	icache_bsize = ppc64_caches.l1i.block_size;

	cur_cpu_spec->dcache_bsize = dcache_bsize;
	cur_cpu_spec->icache_bsize = icache_bsize;

	DBG(" <- initialize_cache_info()\n");
}

/*
 * This returns the limit below which memory accesses to the linear
 * mapping are guaranteed not to cause an architectural exception (e.g.,
 * TLB or SLB miss fault).
 *
 * This is used to allocate PACAs and various interrupt stacks that
 * are accessed early in interrupt handlers that must not cause
 * re-entrant interrupts.
 */
__init u64 ppc64_bolted_size(void)
{
#ifdef CONFIG_PPC_BOOK3E
	/* Freescale BookE bolts the entire linear mapping */
	/* XXX: BookE ppc64_rma_limit setup seems to disagree? */
	if (early_mmu_has_feature(MMU_FTR_TYPE_FSL_E))
		return linear_map_top;
	/* Other BookE, we assume the first GB is bolted */
	return 1ul << 30;
#else
	/* BookS radix, does not take faults on linear mapping */
	if (early_radix_enabled())
		return ULONG_MAX;

	/* BookS hash, the first segment is bolted */
	if (early_mmu_has_feature(MMU_FTR_1T_SEGMENT))
		return 1UL << SID_SHIFT_1T;
	return 1UL << SID_SHIFT;
#endif
}
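
/*
 * Allocate a THREAD_SIZE-aligned stack from memblock, below 'limit' and
 * preferably on the memory node of 'cpu'. There is no way to recover if
 * this fails this early in boot, so just panic.
 */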
static void *__init alloc_stack(unsigned long limit, int cpu)
{
	void *ptr;

	BUILD_BUG_ON(STACK_INT_FRAME_SIZE % 16);

	ptr = memblock_alloc_try_nid(THREAD_SIZE, THREAD_SIZE,
				     MEMBLOCK_LOW_LIMIT, limit,
				     early_cpu_to_node(cpu));
	if (!ptr)
		panic("cannot allocate stacks");

	return ptr;
}

void __init irqstack_early_init(void)
{
	u64 limit = ppc64_bolted_size();
	unsigned int i;

	/*
	 * Interrupt stacks must be in the first segment since we
	 * cannot afford to take SLB misses on them. They are not
	 * accessed in realmode.
	 */
	for_each_possible_cpu(i) {
		softirq_ctx[i] = alloc_stack(limit, i);
		hardirq_ctx[i] = alloc_stack(limit, i);
	}
}

#ifdef CONFIG_PPC_BOOK3E
void __init exc_lvl_early_init(void)
{
	unsigned int i;

	for_each_possible_cpu(i) {
		void *sp;

		sp = alloc_stack(ULONG_MAX, i);
		critirq_ctx[i] = sp;
		paca_ptrs[i]->crit_kstack = sp + THREAD_SIZE;

		sp = alloc_stack(ULONG_MAX, i);
		dbgirq_ctx[i] = sp;
		paca_ptrs[i]->dbg_kstack = sp + THREAD_SIZE;

		sp = alloc_stack(ULONG_MAX, i);
		mcheckirq_ctx[i] = sp;
		paca_ptrs[i]->mc_kstack = sp + THREAD_SIZE;
	}

	if (cpu_has_feature(CPU_FTR_DEBUG_LVL_EXC))
		patch_exception(0x040, exc_debug_debug_book3e);
}
#endif

/*
 * Stack space used when we detect a bad kernel stack pointer, and
 * early in SMP boots before relocation is enabled. Exclusive emergency
 * stack for machine checks.
 */
void __init emergency_stack_init(void)
{
	u64 limit;
	unsigned int i;

	/*
	 * Emergency stacks must be under 256MB, we cannot afford to take
	 * SLB misses on them. The ABI also requires them to be 128-byte
	 * aligned.
	 *
	 * Since we use these as temporary stacks during secondary CPU
	 * bringup, machine check, system reset, and HMI, we need to get
	 * at them in real mode. This means they must also be within the RMO
	 * region.
	 *
	 * The IRQ stacks allocated elsewhere in this file are zeroed and
	 * initialized in kernel/irq.c. These are initialized here in order
	 * to have emergency stacks available as early as possible.
	 */
	limit = min(ppc64_bolted_size(), ppc64_rma_size);

	for_each_possible_cpu(i) {
		paca_ptrs[i]->emergency_sp = alloc_stack(limit, i) + THREAD_SIZE;

#ifdef CONFIG_PPC_BOOK3S_64
		/* emergency stack for NMI exception handling. */
		paca_ptrs[i]->nmi_emergency_sp = alloc_stack(limit, i) + THREAD_SIZE;

		/* emergency stack for machine check exception handling. */
		paca_ptrs[i]->mc_emergency_sp = alloc_stack(limit, i) + THREAD_SIZE;
#endif
	}
}

#ifdef CONFIG_SMP
#define PCPU_DYN_SIZE		()

static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align)
{
	return memblock_alloc_try_nid(size, align, __pa(MAX_DMA_ADDRESS),
				      MEMBLOCK_ALLOC_ACCESSIBLE,
				      early_cpu_to_node(cpu));
}

static void __init pcpu_fc_free(void *ptr, size_t size)
{
	memblock_free(__pa(ptr), size);
}

static int pcpu_cpu_distance(unsigned int from, unsigned int to)
{
	if (early_cpu_to_node(from) == early_cpu_to_node(to))
		return LOCAL_DISTANCE;
	else
		return REMOTE_DISTANCE;
}

unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
EXPORT_SYMBOL(__per_cpu_offset);
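
/*
 * Set up the first-chunk percpu areas. Once the per-cpu offsets are
 * known, each CPU's paca->data_offset is pointed at its area so that
 * per-cpu accesses via the PACA resolve correctly (the boot paca was
 * left with data_offset = 0 by fixup_boot_paca() until now).
 */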
void __init setup_per_cpu_areas(void)
{
	const size_t dyn_size = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE;
	size_t atom_size;
	unsigned long delta;
	unsigned int cpu;
	int rc;

	/*
	 * Linear mapping is one of 4K, 1M and 16M. For 4K, no need
	 * to group units. For larger mappings, use 1M atom which
	 * should be large enough to contain a number of units.
	 */
	if (mmu_linear_psize == MMU_PAGE_4K)
		atom_size = PAGE_SIZE;
	else
		atom_size = 1 << 20;

	rc = pcpu_embed_first_chunk(0, dyn_size, atom_size, pcpu_cpu_distance,
				    pcpu_fc_alloc, pcpu_fc_free);
	if (rc < 0)
		panic("cannot initialize percpu area (err=%d)", rc);

	delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
	for_each_possible_cpu(cpu) {
		__per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu];
		paca_ptrs[cpu]->data_offset = __per_cpu_offset[cpu];
	}
}
#endif

#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
unsigned long memory_block_size_bytes(void)
{
	if (ppc_md.memory_block_size)
		return ppc_md.memory_block_size();

	return MIN_MEMORY_BLOCK_SIZE;
}
#endif

#if defined(CONFIG_PPC_INDIRECT_PIO) || defined(CONFIG_PPC_INDIRECT_MMIO)
struct ppc_pci_io ppc_pci_io;
EXPORT_SYMBOL(ppc_pci_io);
#endif

#ifdef CONFIG_HARDLOCKUP_DETECTOR_PERF
u64 hw_nmi_get_sample_period(int watchdog_thresh)
{
	return ppc_proc_freq * watchdog_thresh;
}
#endif

/*
 * The perf based hardlockup detector breaks PMU event based branches, so
 * disable it by default. Book3S has a soft-nmi hardlockup detector based
 * on the decrementer interrupt, so it does not suffer from this problem.
 *
 * It is likely to get false positives in VM guests, so disable it there
 * by default too.
 */
static int __init disable_hardlockup_detector(void)
{
#ifdef CONFIG_HARDLOCKUP_DETECTOR_PERF
	hardlockup_detector_disable();
#else
	if (firmware_has_feature(FW_FEATURE_LPAR))
		hardlockup_detector_disable();
#endif

	return 0;
}
early_initcall(disable_hardlockup_detector);

#ifdef CONFIG_PPC_BOOK3S_64
static enum l1d_flush_type enabled_flush_types;
static void *l1d_flush_fallback_area;
static bool no_rfi_flush;
bool rfi_flush;

static int __init handle_no_rfi_flush(char *p)
{
	pr_info("rfi-flush: disabled on command line.\n");
	no_rfi_flush = true;
	return 0;
}
early_param("no_rfi_flush", handle_no_rfi_flush);

/*
 * The RFI flush is not KPTI, but because users will see doco that says to use
 * nopti we hijack that option here to also disable the RFI flush.
 */
static int __init handle_no_pti(char *p)
{
	pr_info("rfi-flush: disabling due to 'nopti' on command line.\n");
	handle_no_rfi_flush(NULL);
	return 0;
}
early_param("nopti", handle_no_pti);

static void do_nothing(void *unused)
{
	/*
	 * We don't need to do the flush explicitly, just enter+exit kernel is
	 * sufficient, the RFI exit handlers will do the right thing.
	 */
}

void rfi_flush_enable(bool enable)
{
	if (enable) {
		do_rfi_flush_fixups(enabled_flush_types);
		on_each_cpu(do_nothing, NULL, 1);
	} else
		do_rfi_flush_fixups(L1D_FLUSH_NONE);

	rfi_flush = enable;
}
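
/*
 * Allocate the displacement buffer used by the L1D_FLUSH_FALLBACK
 * software flush: the exit code reads through this area to displace the
 * contents of the L1 data cache. It is sized at 2x L1D (see below) to
 * absorb hardware prefetch runoff, and is shared by all CPUs via their
 * paca->rfi_flush_fallback_area pointer.
 */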
static void __ref init_fallback_flush(void)
{
	u64 l1d_size, limit;
	int cpu;

	/* Only allocate the fallback flush area once (at boot time). */
	if (l1d_flush_fallback_area)
		return;

	l1d_size = ppc64_caches.l1d.size;

	/*
	 * If there is no d-cache-size property in the device tree, l1d_size
	 * could be zero. That leads to the loop in the asm wrapping around to
	 * 2^64-1, and then walking off the end of the fallback area and
	 * eventually causing a page fault which is fatal. Just default to
	 * something vaguely sane.
	 */
	if (!l1d_size)
		l1d_size = (64 * 1024);

	limit = min(ppc64_bolted_size(), ppc64_rma_size);

	/*
	 * Align to L1d size, and size it at 2x L1d size, to catch possible
	 * hardware prefetch runoff. We don't have a recipe for load patterns
	 * to reliably avoid the prefetcher.
	 */
	l1d_flush_fallback_area = memblock_alloc_try_nid(l1d_size * 2,
						l1d_size, MEMBLOCK_LOW_LIMIT,
						limit, NUMA_NO_NODE);
	if (!l1d_flush_fallback_area)
		panic("%s: Failed to allocate %llu bytes align=0x%llx max_addr=%pa\n",
		      __func__, l1d_size * 2, l1d_size, &limit);

	for_each_possible_cpu(cpu) {
		struct paca_struct *paca = paca_ptrs[cpu];
		paca->rfi_flush_fallback_area = l1d_flush_fallback_area;
		paca->l1d_flush_size = l1d_size;
	}
}

void setup_rfi_flush(enum l1d_flush_type types, bool enable)
{
	if (types & L1D_FLUSH_FALLBACK) {
		pr_info("rfi-flush: fallback displacement flush available\n");
		init_fallback_flush();
	}

	if (types & L1D_FLUSH_ORI)
		pr_info("rfi-flush: ori type flush available\n");

	if (types & L1D_FLUSH_MTTRIG)
		pr_info("rfi-flush: mttrig type flush available\n");

	enabled_flush_types = types;

	if (!no_rfi_flush && !cpu_mitigations_off())
		rfi_flush_enable(enable);
}

#ifdef CONFIG_DEBUG_FS
static int rfi_flush_set(void *data, u64 val)
{
	bool enable;

	if (val == 1)
		enable = true;
	else if (val == 0)
		enable = false;
	else
		return -EINVAL;

	/* Only do anything if we're changing state */
	if (enable != rfi_flush)
		rfi_flush_enable(enable);

	return 0;
}

static int rfi_flush_get(void *data, u64 *val)
{
	*val = rfi_flush ? 1 : 0;
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(fops_rfi_flush, rfi_flush_get, rfi_flush_set, "%llu\n");

static __init int rfi_flush_debugfs_init(void)
{
	debugfs_create_file("rfi_flush", 0600, powerpc_debugfs_root, NULL, &fops_rfi_flush);
	return 0;
}
device_initcall(rfi_flush_debugfs_init);
#endif
#endif /* CONFIG_PPC_BOOK3S_64 */