// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *
 * Common boot and setup code.
 *
 * Copyright (C) 2001 PPC64 Team, IBM Corp
 */

#include <linux/export.h>
#include <linux/string.h>
#include <linux/sched.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/reboot.h>
#include <linux/delay.h>
#include <linux/initrd.h>
#include <linux/seq_file.h>
#include <linux/ioport.h>
#include <linux/console.h>
#include <linux/utsname.h>
#include <linux/tty.h>
#include <linux/root_dev.h>
#include <linux/notifier.h>
#include <linux/cpu.h>
#include <linux/unistd.h>
#include <linux/serial.h>
#include <linux/serial_8250.h>
#include <linux/memblock.h>
#include <linux/pci.h>
#include <linux/lockdep.h>
#include <linux/memory.h>
#include <linux/nmi.h>
#include <linux/pgtable.h>

#include <asm/debugfs.h>
#include <asm/kvm_guest.h>
#include <asm/io.h>
#include <asm/kdump.h>
#include <asm/prom.h>
#include <asm/processor.h>
#include <asm/smp.h>
#include <asm/elf.h>
#include <asm/machdep.h>
#include <asm/paca.h>
#include <asm/time.h>
#include <asm/cputable.h>
#include <asm/dt_cpu_ftrs.h>
#include <asm/sections.h>
#include <asm/btext.h>
#include <asm/nvram.h>
#include <asm/setup.h>
#include <asm/rtas.h>
#include <asm/iommu.h>
#include <asm/serial.h>
#include <asm/cache.h>
#include <asm/page.h>
#include <asm/mmu.h>
#include <asm/firmware.h>
#include <asm/xmon.h>
#include <asm/udbg.h>
#include <asm/kexec.h>
#include <asm/code-patching.h>
#include <asm/livepatch.h>
#include <asm/opal.h>
#include <asm/cputhreads.h>
#include <asm/hw_irq.h>
#include <asm/feature-fixups.h>
#include <asm/kup.h>
#include <asm/early_ioremap.h>
#include <asm/pgalloc.h>
#include <asm/asm-prototypes.h>

#include "setup.h"

int spinning_secondaries;
u64 ppc64_pft_size;

struct ppc64_caches ppc64_caches = {
	.l1d = {
		.block_size = 0x40,
		.log_block_size = 6,
	},
	.l1i = {
		.block_size = 0x40,
		.log_block_size = 6
	},
};
EXPORT_SYMBOL_GPL(ppc64_caches);

#if defined(CONFIG_PPC_BOOK3E) && defined(CONFIG_SMP)
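/*
 * Each core has a single tlb_core_data shared by all of its threads; point
 * every CPU's paca at the copy belonging to the first thread of its core
 * (or at the boot CPU's copy for the core that kdump booted on via a
 * non-primary thread).
 */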
void __init setup_tlb_core_data(void)
{
	int cpu;

	BUILD_BUG_ON(offsetof(struct tlb_core_data, lock) != 0);

	for_each_possible_cpu(cpu) {
		int first = cpu_first_thread_sibling(cpu);

		/*
		 * If we boot via kdump on a non-primary thread,
		 * make sure we point at the thread that actually
		 * set up this TLB.
		 */
		if (cpu_first_thread_sibling(boot_cpuid) == first)
			first = boot_cpuid;

		paca_ptrs[cpu]->tcd_ptr = &paca_ptrs[first]->tcd;

		/*
		 * If we have threads, we need either tlbsrx.
		 * or e6500 tablewalk mode, or else TLB handlers
		 * will be racy and could produce duplicate entries.
		 * Should we panic instead?
		 */
		WARN_ONCE(smt_enabled_at_boot >= 2 &&
			  !mmu_has_feature(MMU_FTR_USE_TLBRSRV) &&
			  book3e_htw_mode != PPC_HTW_E6500,
			  "%s: unsupported MMU configuration\n", __func__);
	}
}
#endif

#ifdef CONFIG_SMP

static char *smt_enabled_cmdline;

/* Look for ibm,smt-enabled OF option */
void __init check_smt_enabled(void)
{
	struct device_node *dn;
	const char *smt_option;

	/* Default to enabling all threads */
	smt_enabled_at_boot = threads_per_core;

	/* Allow the command line to overrule the OF option */
	if (smt_enabled_cmdline) {
		if (!strcmp(smt_enabled_cmdline, "on"))
			smt_enabled_at_boot = threads_per_core;
		else if (!strcmp(smt_enabled_cmdline, "off"))
			smt_enabled_at_boot = 0;
		else {
			int smt;
			int rc;

			rc = kstrtoint(smt_enabled_cmdline, 10, &smt);
			if (!rc)
				smt_enabled_at_boot =
					min(threads_per_core, smt);
		}
	} else {
		dn = of_find_node_by_path("/options");
		if (dn) {
			smt_option = of_get_property(dn, "ibm,smt-enabled",
						     NULL);

			if (smt_option) {
				if (!strcmp(smt_option, "on"))
					smt_enabled_at_boot = threads_per_core;
				else if (!strcmp(smt_option, "off"))
					smt_enabled_at_boot = 0;
			}

			of_node_put(dn);
		}
	}
}

/* Look for smt-enabled= cmdline option */
static int __init early_smt_enabled(char *p)
{
	smt_enabled_cmdline = p;
	return 0;
}
early_param("smt-enabled", early_smt_enabled);

#endif /* CONFIG_SMP */

/** Fix up paca fields required for the boot cpu */
static void __init fixup_boot_paca(void)
{
	/* The boot cpu is started */
	get_paca()->cpu_start = 1;
	/* Allow percpu accesses to work until we set up percpu data */
	get_paca()->data_offset = 0;
	/* Mark interrupts disabled in PACA */
	irq_soft_mask_set(IRQS_DISABLED);
}

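/*
 * Configure where and how exceptions are delivered: kdump trampolines for
 * non-relocatable kernels, relocation-on and little-endian exception modes
 * under a PAPR hypervisor, or endian setup via OPAL on bare metal (AIL on
 * bare metal is handled per CPU in cpu_ready_for_interrupts()).
 */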
static void __init configure_exceptions(void)
{
	/*
	 * Setup the trampolines from the lowmem exception vectors
	 * to the kdump kernel when not using a relocatable kernel.
	 */
	setup_kdump_trampoline();

	/* Under a PAPR hypervisor, we need hypercalls */
	if (firmware_has_feature(FW_FEATURE_SET_MODE)) {
		/* Enable AIL if possible */
		if (!pseries_enable_reloc_on_exc()) {
			init_task.thread.fscr &= ~FSCR_SCV;
			cur_cpu_spec->cpu_user_features2 &= ~PPC_FEATURE2_SCV;
		}

		/*
		 * Tell the hypervisor that we want our exceptions to
		 * be taken in little endian mode.
		 *
		 * We don't call this for big endian as our calling convention
		 * makes us always enter in BE, and the call may fail under
		 * some circumstances with kdump.
		 */
#ifdef __LITTLE_ENDIAN__
		pseries_little_endian_exceptions();
#endif
	} else {
		/* Set endian mode using OPAL */
		if (firmware_has_feature(FW_FEATURE_OPAL))
			opal_configure_cores();

		/* AIL on native is done in cpu_ready_for_interrupts() */
	}
}

static void cpu_ready_for_interrupts(void)
{
	/*
	 * Enable AIL if supported, and we are in hypervisor mode. This
	 * is called once for every processor.
	 *
	 * If we are not in hypervisor mode the job is done once for
	 * the whole partition in configure_exceptions().
	 */
	if (cpu_has_feature(CPU_FTR_HVMODE)) {
		unsigned long lpcr = mfspr(SPRN_LPCR);
		unsigned long new_lpcr = lpcr;

		if (cpu_has_feature(CPU_FTR_ARCH_31)) {
			/* P10 DD1 does not have HAIL */
			if (pvr_version_is(PVR_POWER10) &&
			    (mfspr(SPRN_PVR) & 0xf00) == 0x100)
				new_lpcr |= LPCR_AIL_3;
			else
				new_lpcr |= LPCR_HAIL;
		} else if (cpu_has_feature(CPU_FTR_ARCH_207S)) {
			new_lpcr |= LPCR_AIL_3;
		}

		if (new_lpcr != lpcr)
			mtspr(SPRN_LPCR, new_lpcr);
	}

	/*
	 * Set HFSCR:TM based on CPU features:
	 * In the special case of TM no suspend (P9N DD2.1), Linux is
	 * told TM is off via the dt-ftrs but told to (partially) use
	 * it via OPAL_REINIT_CPUS_TM_SUSPEND_DISABLED. So HFSCR[TM]
	 * will be off from dt-ftrs but we need to turn it on for the
	 * no suspend case.
	 */
	if (cpu_has_feature(CPU_FTR_HVMODE)) {
		if (cpu_has_feature(CPU_FTR_TM_COMP))
			mtspr(SPRN_HFSCR, mfspr(SPRN_HFSCR) | HFSCR_TM);
		else
			mtspr(SPRN_HFSCR, mfspr(SPRN_HFSCR) & ~HFSCR_TM);
	}

	/* Set IR and DR in PACA MSR */
	get_paca()->kernel_msr = MSR_KERNEL;
}

unsigned long spr_default_dscr = 0;

static void __init record_spr_defaults(void)
{
	if (early_cpu_has_feature(CPU_FTR_DSCR))
		spr_default_dscr = mfspr(SPRN_DSCR);
}

/*
 * Early initialization entry point. This is called by head.S
 * with MMU translation disabled. We rely on the "feature" of
 * the CPU that ignores the top 2 bits of the address in real
 * mode so we can access kernel globals normally provided we
 * only toy with things in the RMO region. From here, we do
 * some early parsing of the device-tree to set up our MEMBLOCK
 * data structures, and allocate & initialize the hash table
 * and segment tables so we can start running with translation
 * enabled.
 *
 * It is this function which will call the probe() callback of
 * the various platform types and copy the matching one to the
 * global ppc_md structure. Your platform can eventually do
 * some very early initializations from the probe() routine, but
 * this is not recommended, be very careful as, for example, the
 * device-tree is not accessible via normal means at this point.
 */

void __init early_setup(unsigned long dt_ptr)
{
	static __initdata struct paca_struct boot_paca;

	/* -------- printk is _NOT_ safe to use here ! ------- */

	/*
	 * Assume we're on cpu 0 for now.
	 *
	 * We need to load a PACA very early for a few reasons.
	 *
	 * The stack protector canary is stored in the paca, so as soon as we
	 * call any stack protected code we need r13 pointing somewhere valid.
	 *
	 * If we are using kcov it will call in_task() in its instrumentation,
	 * which relies on the current task from the PACA.
	 *
	 * dt_cpu_ftrs_init() calls into generic OF/fdt code, as well as
	 * printk(), which can trigger both stack protector and kcov.
	 *
	 * percpu variables and spin locks also use the paca.
	 *
	 * So set up a temporary paca. It will be replaced below once we know
	 * what CPU we are on.
	 */
	initialise_paca(&boot_paca, 0);
	setup_paca(&boot_paca);
	fixup_boot_paca();

	/* -------- printk is now safe to use ------- */

	/* Try new device tree based feature discovery ... */
	if (!dt_cpu_ftrs_init(__va(dt_ptr)))
		/* Otherwise use the old style CPU table */
		identify_cpu(0, mfspr(SPRN_PVR));

	/* Enable early debugging if any specified (see udbg.h) */
	udbg_early_init();

	udbg_printf(" -> %s(), dt_ptr: 0x%lx\n", __func__, dt_ptr);

	/*
	 * Do early initialization using the flattened device
	 * tree, such as retrieving the physical memory map or
	 * calculating/retrieving the hash table size.
	 */
	early_init_devtree(__va(dt_ptr));

	/* Now we know the logical id of our boot cpu, set up the paca. */
	if (boot_cpuid != 0) {
		/* Poison paca_ptrs[0] again if it's not the boot cpu */
		memset(&paca_ptrs[0], 0x88, sizeof(paca_ptrs[0]));
	}
	setup_paca(paca_ptrs[boot_cpuid]);
	fixup_boot_paca();

	/*
	 * Configure exception handlers. This includes setting up trampolines
	 * if needed, setting exception endian mode, etc...
	 */
	configure_exceptions();

	/*
	 * Configure Kernel Userspace Protection. This needs to happen before
	 * feature fixups for platforms that implement this using features.
	 */
	setup_kup();

	/* Apply all the dynamic patching */
	apply_feature_fixups();
	setup_feature_keys();

	/* Initialize the hash table or TLB handling */
	early_init_mmu();

	early_ioremap_setup();

	/*
	 * After firmware and early platform setup code has set things up,
	 * we note the SPR values for configurable control/performance
	 * registers, and use those as initial defaults.
	 */
	record_spr_defaults();

	/*
	 * At this point, we can let interrupts switch to virtual mode
	 * (the MMU has been setup), so adjust the MSR in the PACA to
	 * have IR and DR set and enable AIL if it exists.
	 */
	cpu_ready_for_interrupts();

	/*
	 * We enable ftrace here, but since we only support DYNAMIC_FTRACE, it
	 * will only actually get enabled on the boot cpu much later once
	 * ftrace itself has been initialized.
	 */
	this_cpu_enable_ftrace();

	udbg_printf(" <- %s()\n", __func__);

#ifdef CONFIG_PPC_EARLY_DEBUG_BOOTX
	/*
	 * This needs to be done *last* (after the above udbg_printf() even)
	 *
	 * Right after we return from this function, we turn on the MMU
	 * which means the real-mode access trick that btext does will
	 * no longer work, it needs to switch to using a real MMU
	 * mapping. This call will ensure that it does.
	 */
	btext_map();
#endif /* CONFIG_PPC_EARLY_DEBUG_BOOTX */
}

#ifdef CONFIG_SMP
void early_setup_secondary(void)
{
	/* Mark interrupts disabled in PACA */
	irq_soft_mask_set(IRQS_DISABLED);

	/* Initialize the hash table or TLB handling */
	early_init_mmu_secondary();

	/* Perform any KUP setup that is per-cpu */
	setup_kup();

	/*
	 * At this point, we can let interrupts switch to virtual mode
	 * (the MMU has been setup), so adjust the MSR in the PACA to
	 * have IR and DR set.
	 */
	cpu_ready_for_interrupts();
}

#endif /* CONFIG_SMP */

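/*
 * Park this CPU when another CPU has started panicking: hard-disable
 * interrupts and spin forever at low SMT priority.
 */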
void panic_smp_self_stop(void)
{
	hard_irq_disable();
	spin_begin();
	while (1)
		spin_cpu_relax();
}

#if defined(CONFIG_SMP) || defined(CONFIG_KEXEC_CORE)
static bool use_spinloop(void)
{
	if (IS_ENABLED(CONFIG_PPC_BOOK3S)) {
		/*
		 * See comments in head_64.S -- not all platforms insert
		 * secondaries at __secondary_hold and wait at the spin
		 * loop.
		 */
		if (firmware_has_feature(FW_FEATURE_OPAL))
			return false;
		return true;
	}

	/*
	 * When book3e boots from kexec, the ePAPR spin table does
	 * not get used.
	 */
	return of_property_read_bool(of_chosen, "linux,booted-from-kexec");
}

void smp_release_cpus(void)
{
	unsigned long *ptr;
	int i;

	if (!use_spinloop())
		return;

	/*
	 * All secondary cpus are spinning on a common spinloop, release them
	 * all now so they can start to spin on their individual paca
	 * spinloops. For non-SMP kernels, the secondary cpus never get out
	 * of the common spinloop.
	 */

	ptr = (unsigned long *)((unsigned long)&__secondary_hold_spinloop
			- PHYSICAL_START);
	*ptr = ppc_function_entry(generic_secondary_smp_init);

	/* And wait a bit for them to catch up */
	for (i = 0; i < 100000; i++) {
		mb();
		HMT_low();
		if (spinning_secondaries == 0)
			break;
		udelay(1);
	}
	pr_debug("spinning_secondaries = %d\n", spinning_secondaries);
}
#endif /* CONFIG_SMP || CONFIG_KEXEC_CORE */

/*
 * Initialize some remaining members of the ppc64_caches and systemcfg
 * structures (at least until we get rid of them completely). This is
 * mostly cache information about the CPU that will be used by cache
 * flush routines and/or provided to userland.
 */

static void init_cache_info(struct ppc_cache_info *info, u32 size, u32 lsize,
			    u32 bsize, u32 sets)
{
	info->size = size;
	info->sets = sets;
	info->line_size = lsize;
	info->block_size = bsize;
	info->log_block_size = __ilog2(bsize);
	if (bsize)
		info->blocks_per_page = PAGE_SIZE / bsize;
	else
		info->blocks_per_page = 0;

	if (sets == 0)
		info->assoc = 0xffff;
	else
		info->assoc = size / (sets * lsize);
}

static bool __init parse_cache_info(struct device_node *np,
				    bool icache,
				    struct ppc_cache_info *info)
{
	static const char *ipropnames[] __initdata = {
		"i-cache-size",
		"i-cache-sets",
		"i-cache-block-size",
		"i-cache-line-size",
	};
	static const char *dpropnames[] __initdata = {
		"d-cache-size",
		"d-cache-sets",
		"d-cache-block-size",
		"d-cache-line-size",
	};
	const char **propnames = icache ? ipropnames : dpropnames;
	const __be32 *sizep, *lsizep, *bsizep, *setsp;
	u32 size, lsize, bsize, sets;
	bool success = true;

	size = 0;
	sets = -1u;
	lsize = bsize = cur_cpu_spec->dcache_bsize;
	sizep = of_get_property(np, propnames[0], NULL);
	if (sizep != NULL)
		size = be32_to_cpu(*sizep);
	setsp = of_get_property(np, propnames[1], NULL);
	if (setsp != NULL)
		sets = be32_to_cpu(*setsp);
	bsizep = of_get_property(np, propnames[2], NULL);
	lsizep = of_get_property(np, propnames[3], NULL);
	if (bsizep == NULL)
		bsizep = lsizep;
	if (lsizep == NULL)
		lsizep = bsizep;
	if (lsizep != NULL)
		lsize = be32_to_cpu(*lsizep);
	if (bsizep != NULL)
		bsize = be32_to_cpu(*bsizep);
	if (sizep == NULL || bsizep == NULL || lsizep == NULL)
		success = false;

	/*
	 * OF is weird .. it represents fully associative caches
	 * as "1 way" which doesn't make much sense and doesn't
	 * leave room for direct mapped. We'll assume that 0
	 * in OF means direct mapped for that reason.
	 */
	if (sets == 1)
		sets = 0;
	else if (sets == 0)
		sets = 1;

	init_cache_info(info, size, lsize, bsize, sets);

	return success;
}

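/*
 * Fill in ppc64_caches from the device tree (hard-coded on POWER8, where
 * firmware reports incorrect geometry) and record the L1 block sizes
 * exported to userland via dcache_bsize/icache_bsize.
 */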
void __init initialize_cache_info(void)
{
	struct device_node *cpu = NULL, *l2, *l3 = NULL;
	u32 pvr;

	/*
	 * All shipping POWER8 machines have a firmware bug that
	 * puts incorrect information in the device-tree. This will
	 * be (hopefully) fixed for future chips but for now hard
	 * code the values if we are running on one of these.
	 */
	pvr = PVR_VER(mfspr(SPRN_PVR));
	if (pvr == PVR_POWER8 || pvr == PVR_POWER8E ||
	    pvr == PVR_POWER8NVL) {
						/* size     lsize  blk  sets */
		init_cache_info(&ppc64_caches.l1i, 0x8000,   128,  128,   32);
		init_cache_info(&ppc64_caches.l1d, 0x10000,  128,  128,   64);
		init_cache_info(&ppc64_caches.l2,  0x80000,  128,    0,  512);
		init_cache_info(&ppc64_caches.l3,  0x800000, 128,    0, 8192);
	} else
		cpu = of_find_node_by_type(NULL, "cpu");

	/*
	 * We're assuming *all* of the CPUs have the same
	 * d-cache and i-cache sizes... -Peter
	 */
	if (cpu) {
		if (!parse_cache_info(cpu, false, &ppc64_caches.l1d))
			pr_warn("Argh, can't find dcache properties !\n");

		if (!parse_cache_info(cpu, true, &ppc64_caches.l1i))
			pr_warn("Argh, can't find icache properties !\n");

		/*
		 * Try to find the L2 and L3 if any. Assume they are
		 * unified and use the D-side properties.
		 */
		l2 = of_find_next_cache_node(cpu);
		of_node_put(cpu);
		if (l2) {
			parse_cache_info(l2, false, &ppc64_caches.l2);
			l3 = of_find_next_cache_node(l2);
			of_node_put(l2);
		}
		if (l3) {
			parse_cache_info(l3, false, &ppc64_caches.l3);
			of_node_put(l3);
		}
	}

	/* For use by binfmt_elf */
	dcache_bsize = ppc64_caches.l1d.block_size;
	icache_bsize = ppc64_caches.l1i.block_size;

	cur_cpu_spec->dcache_bsize = dcache_bsize;
	cur_cpu_spec->icache_bsize = icache_bsize;
}

/*
 * This returns the limit below which memory accesses to the linear
 * mapping are guaranteed not to cause an architectural exception (e.g.,
 * TLB or SLB miss fault).
 *
 * This is used to allocate PACAs and various interrupt stacks that
 * are accessed early in interrupt handlers that must not cause
 * re-entrant interrupts.
 */
__init u64 ppc64_bolted_size(void)
{
#ifdef CONFIG_PPC_BOOK3E
	/* Freescale BookE bolts the entire linear mapping */
	/* XXX: BookE ppc64_rma_limit setup seems to disagree? */
	if (early_mmu_has_feature(MMU_FTR_TYPE_FSL_E))
		return linear_map_top;
	/* Other BookE, we assume the first GB is bolted */
	return 1ul << 30;
#else
	/* BookS radix, does not take faults on linear mapping */
	if (early_radix_enabled())
		return ULONG_MAX;

	/* BookS hash, the first segment is bolted */
	if (early_mmu_has_feature(MMU_FTR_1T_SEGMENT))
		return 1UL << SID_SHIFT_1T;
	return 1UL << SID_SHIFT;
#endif
}

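/*
 * Allocate one THREAD_SIZE stack below @limit, preferring @cpu's NUMA node;
 * panic if that is not possible.
 */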
static void *__init alloc_stack(unsigned long limit, int cpu)
{
	void *ptr;

	BUILD_BUG_ON(STACK_INT_FRAME_SIZE % 16);

	ptr = memblock_alloc_try_nid(THREAD_SIZE, THREAD_ALIGN,
				     MEMBLOCK_LOW_LIMIT, limit,
				     early_cpu_to_node(cpu));
	if (!ptr)
		panic("cannot allocate stacks");

	return ptr;
}

void __init irqstack_early_init(void)
{
	u64 limit = ppc64_bolted_size();
	unsigned int i;

	/*
	 * Interrupt stacks must be in the first segment since we
	 * cannot afford to take SLB misses on them. They are not
	 * accessed in realmode.
	 */
	for_each_possible_cpu(i) {
		softirq_ctx[i] = alloc_stack(limit, i);
		hardirq_ctx[i] = alloc_stack(limit, i);
	}
}

#ifdef CONFIG_PPC_BOOK3E
void __init exc_lvl_early_init(void)
{
	unsigned int i;

	for_each_possible_cpu(i) {
		void *sp;

		sp = alloc_stack(ULONG_MAX, i);
		critirq_ctx[i] = sp;
		paca_ptrs[i]->crit_kstack = sp + THREAD_SIZE;

		sp = alloc_stack(ULONG_MAX, i);
		dbgirq_ctx[i] = sp;
		paca_ptrs[i]->dbg_kstack = sp + THREAD_SIZE;

		sp = alloc_stack(ULONG_MAX, i);
		mcheckirq_ctx[i] = sp;
		paca_ptrs[i]->mc_kstack = sp + THREAD_SIZE;
	}

	if (cpu_has_feature(CPU_FTR_DEBUG_LVL_EXC))
		patch_exception(0x040, exc_debug_debug_book3e);
}
#endif

/*
 * Stack space used when we detect a bad kernel stack pointer, and
 * early in SMP boots before relocation is enabled. Exclusive emergency
 * stack for machine checks.
 */
void __init emergency_stack_init(void)
{
	u64 limit, mce_limit;
	unsigned int i;

	/*
	 * Emergency stacks must be under 256MB, we cannot afford to take
	 * SLB misses on them. The ABI also requires them to be 128-byte
	 * aligned.
	 *
	 * Since we use these as temporary stacks during secondary CPU
	 * bringup, machine check, system reset, and HMI, we need to get
	 * at them in real mode. This means they must also be within the RMO
	 * region.
	 *
	 * The IRQ stacks allocated elsewhere in this file are zeroed and
	 * initialized in kernel/irq.c. These are initialized here in order
	 * to have emergency stacks available as early as possible.
	 */
	limit = mce_limit = min(ppc64_bolted_size(), ppc64_rma_size);

	/*
	 * Machine check on pseries calls rtas, but can't use the static
	 * rtas_args due to a machine check hitting while the lock is held.
	 * rtas args have to be under 4GB, so the machine check stack is
	 * limited to 4GB so args can be put on stack.
	 */
	if (firmware_has_feature(FW_FEATURE_LPAR) && mce_limit > SZ_4G)
		mce_limit = SZ_4G;

	for_each_possible_cpu(i) {
		paca_ptrs[i]->emergency_sp = alloc_stack(limit, i) + THREAD_SIZE;

#ifdef CONFIG_PPC_BOOK3S_64
		/* emergency stack for NMI exception handling. */
		paca_ptrs[i]->nmi_emergency_sp = alloc_stack(limit, i) + THREAD_SIZE;

		/* emergency stack for machine check exception handling. */
		paca_ptrs[i]->mc_emergency_sp = alloc_stack(mce_limit, i) + THREAD_SIZE;
#endif
	}
}

#ifdef CONFIG_SMP
/**
 * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu
 * @cpu: cpu to allocate for
 * @size: size allocation in bytes
 * @align: alignment
 *
 * Allocate @size bytes aligned at @align for cpu @cpu. This wrapper
 * does the right thing for NUMA regardless of the current
 * configuration.
 *
 * RETURNS:
 * Pointer to the allocated area on success, NULL on failure.
 */
static void * __init pcpu_alloc_bootmem(unsigned int cpu, size_t size,
					size_t align)
{
	const unsigned long goal = __pa(MAX_DMA_ADDRESS);
#ifdef CONFIG_NUMA
	int node = early_cpu_to_node(cpu);
	void *ptr;

	if (!node_online(node) || !NODE_DATA(node)) {
		ptr = memblock_alloc_from(size, align, goal);
		pr_info("cpu %d has no node %d or node-local memory\n",
			cpu, node);
		pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n",
			 cpu, size, __pa(ptr));
	} else {
		ptr = memblock_alloc_try_nid(size, align, goal,
					     MEMBLOCK_ALLOC_ACCESSIBLE, node);
		pr_debug("per cpu data for cpu%d %lu bytes on node%d at %016lx\n",
			 cpu, size, node, __pa(ptr));
	}
	return ptr;
#else
	return memblock_alloc_from(size, align, goal);
#endif
}

static void __init pcpu_free_bootmem(void *ptr, size_t size)
{
	memblock_free(__pa(ptr), size);
}

/* Tell the percpu allocator whether two CPUs share a NUMA node */
static int pcpu_cpu_distance(unsigned int from, unsigned int to)
{
	if (early_cpu_to_node(from) == early_cpu_to_node(to))
		return LOCAL_DISTANCE;
	else
		return REMOTE_DISTANCE;
}

unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
EXPORT_SYMBOL(__per_cpu_offset);

/*
 * Walk the kernel page tables for @addr and allocate any missing
 * intermediate levels, so the page-mapped percpu first chunk can
 * install its PTEs there.
 */
static void __init pcpu_populate_pte(unsigned long addr)
{
	pgd_t *pgd = pgd_offset_k(addr);
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;

	p4d = p4d_offset(pgd, addr);
	if (p4d_none(*p4d)) {
		pud_t *new;

		new = memblock_alloc(PUD_TABLE_SIZE, PUD_TABLE_SIZE);
		if (!new)
			goto err_alloc;
		p4d_populate(&init_mm, p4d, new);
	}

	pud = pud_offset(p4d, addr);
	if (pud_none(*pud)) {
		pmd_t *new;

		new = memblock_alloc(PMD_TABLE_SIZE, PMD_TABLE_SIZE);
		if (!new)
			goto err_alloc;
		pud_populate(&init_mm, pud, new);
	}

	pmd = pmd_offset(pud, addr);
	if (!pmd_present(*pmd)) {
		pte_t *new;

		new = memblock_alloc(PTE_TABLE_SIZE, PTE_TABLE_SIZE);
		if (!new)
			goto err_alloc;
		pmd_populate_kernel(&init_mm, pmd, new);
	}

	return;

err_alloc:
	panic("%s: Failed to allocate %lu bytes align=%lx from=%lx\n",
	      __func__, PAGE_SIZE, PAGE_SIZE, PAGE_SIZE);
}

void __init setup_per_cpu_areas(void)
{
	const size_t dyn_size = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE;
	size_t atom_size;
	unsigned long delta;
	unsigned int cpu;
	int rc = -EINVAL;

	/*
	 * Linear mapping is one of 4K, 1M and 16M. For 4K, no need
	 * to group units. For larger mappings, use 1M atom which
	 * should be large enough to contain a number of units.
	 */
	if (mmu_linear_psize == MMU_PAGE_4K)
		atom_size = PAGE_SIZE;
	else
		atom_size = 1 << 20;

	if (pcpu_chosen_fc != PCPU_FC_PAGE) {
		rc = pcpu_embed_first_chunk(0, dyn_size, atom_size, pcpu_cpu_distance,
					    pcpu_alloc_bootmem, pcpu_free_bootmem);
		if (rc)
			pr_warn("PERCPU: %s allocator failed (%d), falling back to page size\n",
				pcpu_fc_names[pcpu_chosen_fc], rc);
	}

	if (rc < 0)
		rc = pcpu_page_first_chunk(0, pcpu_alloc_bootmem, pcpu_free_bootmem,
					   pcpu_populate_pte);
	if (rc < 0)
		panic("cannot initialize percpu area (err=%d)", rc);

	delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
	for_each_possible_cpu(cpu) {
		__per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu];
		paca_ptrs[cpu]->data_offset = __per_cpu_offset[cpu];
	}
}
#endif

#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
unsigned long memory_block_size_bytes(void)
{
	if (ppc_md.memory_block_size)
		return ppc_md.memory_block_size();

	return MIN_MEMORY_BLOCK_SIZE;
}
#endif

#if defined(CONFIG_PPC_INDIRECT_PIO) || defined(CONFIG_PPC_INDIRECT_MMIO)
struct ppc_pci_io ppc_pci_io;
EXPORT_SYMBOL(ppc_pci_io);
#endif

#ifdef CONFIG_HARDLOCKUP_DETECTOR_PERF
/* Perf sample period for the hardlockup detector, in processor cycles */
u64 hw_nmi_get_sample_period(int watchdog_thresh)
{
	return ppc_proc_freq * watchdog_thresh;
}
#endif

/*
 * The perf based hardlockup detector breaks PMU event based branches, so
 * disable it by default. Book3S has a soft-nmi hardlockup detector based
 * on the decrementer interrupt, so it does not suffer from this problem.
 *
 * It is likely to get false positives in KVM guests, so disable it there
 * by default too. PowerVM will not stop or arbitrarily oversubscribe
 * CPUs, but gives a minimum regular allotment even with SPLPAR, so keep
 * the detector enabled for non-KVM guests, which we assume are running
 * under PowerVM.
 */
static int __init disable_hardlockup_detector(void)
{
#ifdef CONFIG_HARDLOCKUP_DETECTOR_PERF
	hardlockup_detector_disable();
#else
	if (firmware_has_feature(FW_FEATURE_LPAR)) {
		if (is_kvm_guest())
			hardlockup_detector_disable();
	}
#endif

	return 0;
}
early_initcall(disable_hardlockup_detector);