// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Page table handling routines for radix page table.
 *
 * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
 */

#define pr_fmt(fmt) "radix-mmu: " fmt

#include <linux/kernel.h>
#include <linux/sched/mm.h>
#include <linux/memblock.h>
#include <linux/of_fdt.h>
#include <linux/mm.h>
#include <linux/string_helpers.h>
#include <linux/stop_machine.h>

#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/mmu_context.h>
#include <asm/dma.h>
#include <asm/machdep.h>
#include <asm/mmu.h>
#include <asm/firmware.h>
#include <asm/powernv.h>
#include <asm/sections.h>
#include <asm/trace.h>
#include <asm/uaccess.h>

#include <trace/events/thp.h>

unsigned int mmu_pid_bits;
unsigned int mmu_base_pid;

static int native_register_process_table(unsigned long base, unsigned long pg_sz,
					  unsigned long table_size)
{
	unsigned long patb0, patb1;

	patb0 = be64_to_cpu(partition_tb[0].patb0);
	patb1 = base | table_size | PATB_GR;

	mmu_partition_table_set_entry(0, patb0, patb1);

	return 0;
}

static __ref void *early_alloc_pgtable(unsigned long size, int nid,
				       unsigned long region_start,
				       unsigned long region_end)
{
	phys_addr_t min_addr = MEMBLOCK_LOW_LIMIT;
	phys_addr_t max_addr = MEMBLOCK_ALLOC_ANYWHERE;
	void *ptr;

	if (region_start)
		min_addr = region_start;
	if (region_end)
		max_addr = region_end;

	ptr = memblock_alloc_try_nid(size, size, min_addr, max_addr, nid);

	if (!ptr)
		panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa max_addr=%pa\n",
		      __func__, size, size, nid, &min_addr, &max_addr);

	return ptr;
}

static int early_map_kernel_page(unsigned long ea, unsigned long pa,
				 pgprot_t flags,
				 unsigned int map_page_size,
				 int nid,
				 unsigned long region_start,
				 unsigned long region_end)
{
	unsigned long pfn = pa >> PAGE_SHIFT;
	pgd_t *pgdp;
	pud_t *pudp;
	pmd_t *pmdp;
	pte_t *ptep;

	pgdp = pgd_offset_k(ea);
	if (pgd_none(*pgdp)) {
		pudp = early_alloc_pgtable(PUD_TABLE_SIZE, nid,
					   region_start, region_end);
		pgd_populate(&init_mm, pgdp, pudp);
	}
	pudp = pud_offset(pgdp, ea);
	if (map_page_size == PUD_SIZE) {
		ptep = (pte_t *)pudp;
		goto set_the_pte;
	}
	if (pud_none(*pudp)) {
		pmdp = early_alloc_pgtable(PMD_TABLE_SIZE, nid,
					   region_start, region_end);
		pud_populate(&init_mm, pudp, pmdp);
	}
	pmdp = pmd_offset(pudp, ea);
	if (map_page_size == PMD_SIZE) {
		ptep = pmdp_ptep(pmdp);
		goto set_the_pte;
	}
	if (!pmd_present(*pmdp)) {
		ptep = early_alloc_pgtable(PAGE_SIZE, nid,
					   region_start, region_end);
		pmd_populate_kernel(&init_mm, pmdp, ptep);
	}
	ptep = pte_offset_kernel(pmdp, ea);

set_the_pte:
	set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
	smp_wmb();
	return 0;
}
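
/*
 * __map_kernel_page() below is the common helper for mapping kernel pages:
 * until the slab allocator is available it falls back to
 * early_map_kernel_page() above, which takes its page tables straight from
 * memblock.
 */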
/*
 * nid, region_start, and region_end are hints to try to place the page
 * table memory in the same node or region.
 */
static int __map_kernel_page(unsigned long ea, unsigned long pa,
			     pgprot_t flags,
			     unsigned int map_page_size,
			     int nid,
			     unsigned long region_start, unsigned long region_end)
{
	unsigned long pfn = pa >> PAGE_SHIFT;
	pgd_t *pgdp;
	pud_t *pudp;
	pmd_t *pmdp;
	pte_t *ptep;
	/*
	 * Make sure task size is correct as per the max addr
	 */
	BUILD_BUG_ON(TASK_SIZE_USER64 > RADIX_PGTABLE_RANGE);

#ifdef CONFIG_PPC_64K_PAGES
	BUILD_BUG_ON(RADIX_KERN_MAP_SIZE != (1UL << MAX_EA_BITS_PER_CONTEXT));
#endif

	if (unlikely(!slab_is_available()))
		return early_map_kernel_page(ea, pa, flags, map_page_size,
					     nid, region_start, region_end);

	/*
	 * Should make page table allocation functions be able to take a
	 * node, so we can place kernel page tables on the right nodes after
	 * boot.
	 */
	pgdp = pgd_offset_k(ea);
	pudp = pud_alloc(&init_mm, pgdp, ea);
	if (!pudp)
		return -ENOMEM;
	if (map_page_size == PUD_SIZE) {
		ptep = (pte_t *)pudp;
		goto set_the_pte;
	}
	pmdp = pmd_alloc(&init_mm, pudp, ea);
	if (!pmdp)
		return -ENOMEM;
	if (map_page_size == PMD_SIZE) {
		ptep = pmdp_ptep(pmdp);
		goto set_the_pte;
	}
	ptep = pte_alloc_kernel(pmdp, ea);
	if (!ptep)
		return -ENOMEM;

set_the_pte:
	set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
	smp_wmb();
	return 0;
}

int radix__map_kernel_page(unsigned long ea, unsigned long pa,
			   pgprot_t flags,
			   unsigned int map_page_size)
{
	return __map_kernel_page(ea, pa, flags, map_page_size, -1, 0, 0);
}

#ifdef CONFIG_STRICT_KERNEL_RWX
void radix__change_memory_range(unsigned long start, unsigned long end,
				unsigned long clear)
{
	unsigned long idx;
	pgd_t *pgdp;
	pud_t *pudp;
	pmd_t *pmdp;
	pte_t *ptep;

	start = ALIGN_DOWN(start, PAGE_SIZE);
	end = PAGE_ALIGN(end); // aligns up

	pr_debug("Changing flags on range %lx-%lx removing 0x%lx\n",
		 start, end, clear);

	for (idx = start; idx < end; idx += PAGE_SIZE) {
		pgdp = pgd_offset_k(idx);
		pudp = pud_alloc(&init_mm, pgdp, idx);
		if (!pudp)
			continue;
		if (pud_huge(*pudp)) {
			ptep = (pte_t *)pudp;
			goto update_the_pte;
		}
		pmdp = pmd_alloc(&init_mm, pudp, idx);
		if (!pmdp)
			continue;
		if (pmd_huge(*pmdp)) {
			ptep = pmdp_ptep(pmdp);
			goto update_the_pte;
		}
		ptep = pte_alloc_kernel(pmdp, idx);
		if (!ptep)
			continue;
update_the_pte:
		radix__pte_update(&init_mm, idx, ptep, clear, 0, 0);
	}

	radix__flush_tlb_kernel_range(start, end);
}

void radix__mark_rodata_ro(void)
{
	unsigned long start, end;

	start = (unsigned long)_stext;
	end = (unsigned long)__init_begin;

	radix__change_memory_range(start, end, _PAGE_WRITE);
}

void radix__mark_initmem_nx(void)
{
	unsigned long start = (unsigned long)__init_begin;
	unsigned long end = (unsigned long)__init_end;

	radix__change_memory_range(start, end, _PAGE_EXEC);
}
#endif /* CONFIG_STRICT_KERNEL_RWX */

static inline void __meminit
print_mapping(unsigned long start, unsigned long end, unsigned long size, bool exec)
{
	char buf[10];

	if (end <= start)
		return;

	string_get_size(size, 1, STRING_UNITS_2, buf, sizeof(buf));

	pr_info("Mapped 0x%016lx-0x%016lx with %s pages%s\n", start, end, buf,
		exec ? " (exec)" : "");
}

static unsigned long next_boundary(unsigned long addr, unsigned long end)
{
#ifdef CONFIG_STRICT_KERNEL_RWX
	if (addr < __pa_symbol(__init_begin))
		return __pa_symbol(__init_begin);
#endif
	return end;
}

static int __meminit create_physical_mapping(unsigned long start,
					     unsigned long end,
					     int nid)
{
	unsigned long vaddr, addr, mapping_size = 0;
	bool prev_exec, exec = false;
	pgprot_t prot;
	int psize;

	start = _ALIGN_UP(start, PAGE_SIZE);
	for (addr = start; addr < end; addr += mapping_size) {
		unsigned long gap, previous_size;
		int rc;

		gap = next_boundary(addr, end) - addr;
		previous_size = mapping_size;
		prev_exec = exec;

		if (IS_ALIGNED(addr, PUD_SIZE) && gap >= PUD_SIZE &&
		    mmu_psize_defs[MMU_PAGE_1G].shift) {
			mapping_size = PUD_SIZE;
			psize = MMU_PAGE_1G;
		} else if (IS_ALIGNED(addr, PMD_SIZE) && gap >= PMD_SIZE &&
			   mmu_psize_defs[MMU_PAGE_2M].shift) {
			mapping_size = PMD_SIZE;
			psize = MMU_PAGE_2M;
		} else {
			mapping_size = PAGE_SIZE;
			psize = mmu_virtual_psize;
		}

		vaddr = (unsigned long)__va(addr);

		if (overlaps_kernel_text(vaddr, vaddr + mapping_size) ||
		    overlaps_interrupt_vector_text(vaddr, vaddr + mapping_size)) {
			prot = PAGE_KERNEL_X;
			exec = true;
		} else {
			prot = PAGE_KERNEL;
			exec = false;
		}

		if (mapping_size != previous_size || exec != prev_exec) {
			print_mapping(start, addr, previous_size, prev_exec);
			start = addr;
		}

		rc = __map_kernel_page(vaddr, addr, prot, mapping_size, nid, start, end);
		if (rc)
			return rc;

		update_page_count(psize, 1);
	}

	print_mapping(start, addr, mapping_size, exec);
	return 0;
}

void __init radix_init_pgtable(void)
{
	unsigned long rts_field;
	struct memblock_region *reg;

	/* We don't support slb for radix */
	mmu_slb_size = 0;
	/*
	 * Create the linear mapping, using standard page size for now
	 */
	for_each_memblock(memory, reg) {
		/*
		 * The memblock allocator is up at this point, so the
		 * page tables will be allocated within the range. No
		 * need for a node (which we don't have yet).
		 */

		if ((reg->base + reg->size) >= RADIX_VMALLOC_START) {
			pr_warn("Outside the supported range\n");
			continue;
		}

		WARN_ON(create_physical_mapping(reg->base,
						reg->base + reg->size,
						-1));
	}

	/* Find out how many PID bits are supported */
	if (cpu_has_feature(CPU_FTR_HVMODE)) {
		if (!mmu_pid_bits)
			mmu_pid_bits = 20;
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
		/*
		 * When KVM is possible, we only use the top half of the
		 * PID space to avoid collisions between host and guest PIDs
		 * which can cause problems due to prefetch when exiting the
		 * guest with AIL=3
		 */
		mmu_base_pid = 1 << (mmu_pid_bits - 1);
#else
		mmu_base_pid = 1;
#endif
	} else {
		/* The guest uses the bottom half of the PID space */
		if (!mmu_pid_bits)
			mmu_pid_bits = 19;
		mmu_base_pid = 1;
	}

	/*
	 * Allocate Partition table and process table for the
	 * host.
	 */
	BUG_ON(PRTB_SIZE_SHIFT > 36);
	process_tb = early_alloc_pgtable(1UL << PRTB_SIZE_SHIFT, -1, 0, 0);
	/*
	 * Fill in the process table.
	 */
	rts_field = radix__get_tree_size();
	process_tb->prtb0 = cpu_to_be64(rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE);
	/*
	 * Fill in the partition table. We are supposed to use the effective
	 * address of the process table here, but our linear mapping also
	 * allows us to use the physical address.
	 */
	register_process_table(__pa(process_tb), 0, PRTB_SIZE_SHIFT - 12);
	pr_info("Process table %p and radix root for kernel: %p\n", process_tb, init_mm.pgd);
	asm volatile("ptesync" : : : "memory");
	asm volatile(PPC_TLBIE_5(%0,%1,2,1,1) : :
		     "r" (TLBIEL_INVAL_SET_LPID), "r" (0));
	asm volatile("eieio; tlbsync; ptesync" : : : "memory");
	trace_tlbie(0, 0, TLBIEL_INVAL_SET_LPID, 0, 2, 1, 1);

	/*
	 * The init_mm context is given the first available (non-zero) PID,
	 * which is the "guard PID" and contains no page table. PIDR should
	 * never be set to zero because that duplicates the kernel address
	 * space at the 0x0... offset (quadrant 0)!
	 *
	 * An arbitrary PID that may later be allocated by the PID allocator
	 * for userspace processes must not be used either, because that
	 * would cause stale user mappings for that PID on CPUs outside of
	 * the TLB invalidation scheme (because it won't be in mm_cpumask).
	 *
	 * So permanently carve out one PID for the purpose of a guard PID.
	 */
	init_mm.context.id = mmu_base_pid;
	mmu_base_pid++;
}

static void __init radix_init_partition_table(void)
{
	unsigned long rts_field, dw0;

	mmu_partition_table_init();
	rts_field = radix__get_tree_size();
	dw0 = rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE | PATB_HR;
	mmu_partition_table_set_entry(0, dw0, 0);

	pr_info("Initializing Radix MMU\n");
	pr_info("Partition table %p\n", partition_tb);
}

void __init radix_init_native(void)
{
	register_process_table = native_register_process_table;
}

static int __init get_idx_from_shift(unsigned int shift)
{
	int idx = -1;

	switch (shift) {
	case 0xc:
		idx = MMU_PAGE_4K;
		break;
	case 0x10:
		idx = MMU_PAGE_64K;
		break;
	case 0x15:
		idx = MMU_PAGE_2M;
		break;
	case 0x1e:
		idx = MMU_PAGE_1G;
		break;
	}
	return idx;
}

static int __init radix_dt_scan_page_sizes(unsigned long node,
					   const char *uname, int depth,
					   void *data)
{
	int size = 0;
	int shift, idx;
	unsigned int ap;
	const __be32 *prop;
	const char *type = of_get_flat_dt_prop(node, "device_type", NULL);

	/* We are scanning "cpu" nodes only */
	if (type == NULL || strcmp(type, "cpu") != 0)
		return 0;

	/* Find MMU PID size */
	prop = of_get_flat_dt_prop(node, "ibm,mmu-pid-bits", &size);
	if (prop && size == 4)
		mmu_pid_bits = be32_to_cpup(prop);

	/* Grab page size encodings */
	prop = of_get_flat_dt_prop(node, "ibm,processor-radix-AP-encodings", &size);
	if (!prop)
		return 0;

	pr_info("Page sizes from device-tree:\n");
	for (; size >= 4; size -= 4, ++prop) {

		struct mmu_psize_def *def;

		/* top 3 bits are the AP encoding */
		shift = be32_to_cpu(prop[0]) & ~(0xe << 28);
		ap = be32_to_cpu(prop[0]) >> 29;
		pr_info("Page size shift = %d AP=0x%x\n", shift, ap);

		idx = get_idx_from_shift(shift);
		if (idx < 0)
			continue;

		def = &mmu_psize_defs[idx];
		def->shift = shift;
		def->ap = ap;
	}

	/* needed ? */
	cur_cpu_spec->mmu_features &= ~MMU_FTR_NO_SLBIE_B;
	return 1;
}
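
/*
 * Probe the radix page sizes early from the flattened device tree; if the
 * "ibm,processor-radix-AP-encodings" property is absent, fall back to
 * assuming 4K and 64K support below.
 */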
void __init radix__early_init_devtree(void)
{
	int rc;

	/*
	 * Try to find the available page sizes in the device-tree
	 */
	rc = of_scan_flat_dt(radix_dt_scan_page_sizes, NULL);
	if (rc != 0) /* Found */
		goto found;
	/*
	 * let's assume we have page 4k and 64k support
	 */
	mmu_psize_defs[MMU_PAGE_4K].shift = 12;
	mmu_psize_defs[MMU_PAGE_4K].ap = 0x0;

	mmu_psize_defs[MMU_PAGE_64K].shift = 16;
	mmu_psize_defs[MMU_PAGE_64K].ap = 0x5;
found:
#ifdef CONFIG_SPARSEMEM_VMEMMAP
	if (mmu_psize_defs[MMU_PAGE_2M].shift) {
		/*
		 * map vmemmap using 2M if available
		 */
		mmu_vmemmap_psize = MMU_PAGE_2M;
	}
#endif /* CONFIG_SPARSEMEM_VMEMMAP */
	return;
}

static void radix_init_amor(void)
{
	/*
	 * In HV mode, we init AMOR (Authority Mask Override Register) so that
	 * the hypervisor and guest can set up IAMR (Instruction Authority Mask
	 * Register), enable key 0 and set it to 1.
	 *
	 * AMOR = 0b1100 .... 0000 (Mask for key 0 is 11)
	 */
	mtspr(SPRN_AMOR, (3ul << 62));
}

#ifdef CONFIG_PPC_KUEP
void setup_kuep(bool disabled)
{
	if (disabled || !early_radix_enabled())
		return;

	if (smp_processor_id() == boot_cpuid)
		pr_info("Activating Kernel Userspace Execution Prevention\n");

	/*
	 * Radix always uses key0 of the IAMR to determine if an access is
	 * allowed. We set bit 0 (IBM bit 1) of key0, to prevent instruction
	 * fetch.
	 */
	mtspr(SPRN_IAMR, (1ul << 62));
}
#endif

#ifdef CONFIG_PPC_KUAP
void setup_kuap(bool disabled)
{
	if (disabled || !early_radix_enabled())
		return;

	if (smp_processor_id() == boot_cpuid) {
		pr_info("Activating Kernel Userspace Access Prevention\n");
		cur_cpu_spec->mmu_features |= MMU_FTR_RADIX_KUAP;
	}

	/* Make sure userspace can't change the AMR */
	mtspr(SPRN_UAMOR, 0);
	mtspr(SPRN_AMR, AMR_KUAP_BLOCKED);
	isync();
}
#endif

void __init radix__early_init_mmu(void)
{
	unsigned long lpcr;

#ifdef CONFIG_PPC_64K_PAGES
	/* PAGE_SIZE mappings */
	mmu_virtual_psize = MMU_PAGE_64K;
#else
	mmu_virtual_psize = MMU_PAGE_4K;
#endif

#ifdef CONFIG_SPARSEMEM_VMEMMAP
	/* vmemmap mapping */
	mmu_vmemmap_psize = mmu_virtual_psize;
#endif
	/*
	 * initialize page table size
	 */
	__pte_index_size = RADIX_PTE_INDEX_SIZE;
	__pmd_index_size = RADIX_PMD_INDEX_SIZE;
	__pud_index_size = RADIX_PUD_INDEX_SIZE;
	__pgd_index_size = RADIX_PGD_INDEX_SIZE;
	__pud_cache_index = RADIX_PUD_INDEX_SIZE;
	__pte_table_size = RADIX_PTE_TABLE_SIZE;
	__pmd_table_size = RADIX_PMD_TABLE_SIZE;
	__pud_table_size = RADIX_PUD_TABLE_SIZE;
	__pgd_table_size = RADIX_PGD_TABLE_SIZE;

	__pmd_val_bits = RADIX_PMD_VAL_BITS;
	__pud_val_bits = RADIX_PUD_VAL_BITS;
	__pgd_val_bits = RADIX_PGD_VAL_BITS;

	__kernel_virt_start = RADIX_KERN_VIRT_START;
	__vmalloc_start = RADIX_VMALLOC_START;
	__vmalloc_end = RADIX_VMALLOC_END;
	__kernel_io_start = RADIX_KERN_IO_START;
	__kernel_io_end = RADIX_KERN_IO_END;
	vmemmap = (struct page *)RADIX_VMEMMAP_START;
	ioremap_bot = IOREMAP_BASE;

#ifdef CONFIG_PCI
	pci_io_base = ISA_IO_BASE;
#endif
	__pte_frag_nr = RADIX_PTE_FRAG_NR;
	__pte_frag_size_shift = RADIX_PTE_FRAG_SIZE_SHIFT;
	__pmd_frag_nr = RADIX_PMD_FRAG_NR;
	__pmd_frag_size_shift = RADIX_PMD_FRAG_SIZE_SHIFT;
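
	/*
	 * Bare metal (no hypervisor): own the LPCR and partition table and
	 * register the process table natively. Under an LPAR the hypervisor
	 * owns the partition table, so radix_init_pseries() sets up the
	 * firmware-assisted process table registration instead.
	 */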
	if (!firmware_has_feature(FW_FEATURE_LPAR)) {
		radix_init_native();
		lpcr = mfspr(SPRN_LPCR);
		mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
		radix_init_partition_table();
		radix_init_amor();
	} else {
		radix_init_pseries();
	}

	memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE);

	radix_init_pgtable();
	/* Switch to the guard PID before turning on MMU */
	radix__switch_mmu_context(NULL, &init_mm);
	if (cpu_has_feature(CPU_FTR_HVMODE))
		tlbiel_all();
}

void radix__early_init_mmu_secondary(void)
{
	unsigned long lpcr;
	/*
	 * update partition table control register and UPRT
	 */
	if (!firmware_has_feature(FW_FEATURE_LPAR)) {
		lpcr = mfspr(SPRN_LPCR);
		mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);

		mtspr(SPRN_PTCR,
		      __pa(partition_tb) | (PATB_SIZE_SHIFT - 12));
		radix_init_amor();
	}

	radix__switch_mmu_context(NULL, &init_mm);
	if (cpu_has_feature(CPU_FTR_HVMODE))
		tlbiel_all();
}

void radix__mmu_cleanup_all(void)
{
	unsigned long lpcr;

	if (!firmware_has_feature(FW_FEATURE_LPAR)) {
		lpcr = mfspr(SPRN_LPCR);
		mtspr(SPRN_LPCR, lpcr & ~LPCR_UPRT);
		mtspr(SPRN_PTCR, 0);
		powernv_set_nmmu_ptcr(0);
		radix__flush_tlb_all();
	}
}

void radix__setup_initial_memory_limit(phys_addr_t first_memblock_base,
				       phys_addr_t first_memblock_size)
{
	/*
	 * We don't currently support the first MEMBLOCK not mapping 0
	 * physical on those processors
	 */
	BUG_ON(first_memblock_base != 0);

	/*
	 * Radix mode is not limited by RMA / VRMA addressing.
	 */
	ppc64_rma_size = ULONG_MAX;
}

#ifdef CONFIG_MEMORY_HOTPLUG
static void free_pte_table(pte_t *pte_start, pmd_t *pmd)
{
	pte_t *pte;
	int i;

	for (i = 0; i < PTRS_PER_PTE; i++) {
		pte = pte_start + i;
		if (!pte_none(*pte))
			return;
	}

	pte_free_kernel(&init_mm, pte_start);
	pmd_clear(pmd);
}

static void free_pmd_table(pmd_t *pmd_start, pud_t *pud)
{
	pmd_t *pmd;
	int i;

	for (i = 0; i < PTRS_PER_PMD; i++) {
		pmd = pmd_start + i;
		if (!pmd_none(*pmd))
			return;
	}

	pmd_free(&init_mm, pmd_start);
	pud_clear(pud);
}

struct change_mapping_params {
	pte_t *pte;
	unsigned long start;
	unsigned long end;
	unsigned long aligned_start;
	unsigned long aligned_end;
};

static int __meminit stop_machine_change_mapping(void *data)
{
	struct change_mapping_params *params =
		(struct change_mapping_params *)data;

	if (!data)
		return -1;

	spin_unlock(&init_mm.page_table_lock);
	pte_clear(&init_mm, params->aligned_start, params->pte);
	create_physical_mapping(params->aligned_start, params->start, -1);
	create_physical_mapping(params->end, params->aligned_end, -1);
	spin_lock(&init_mm.page_table_lock);
	return 0;
}

static void remove_pte_table(pte_t *pte_start, unsigned long addr,
			     unsigned long end)
{
	unsigned long next;
	pte_t *pte;

	pte = pte_start + pte_index(addr);
	for (; addr < end; addr = next, pte++) {
		next = (addr + PAGE_SIZE) & PAGE_MASK;
		if (next > end)
			next = end;

		if (!pte_present(*pte))
			continue;

		if (!PAGE_ALIGNED(addr) || !PAGE_ALIGNED(next)) {
			/*
			 * The vmemmap_free() and remove_section_mapping()
			 * codepaths call us with aligned addresses.
			 */
			WARN_ONCE(1, "%s: unaligned range\n", __func__);
			continue;
		}

		pte_clear(&init_mm, addr, pte);
	}
}

/*
 * Helper to clear the pte and potentially split the mapping.
 */
static void __meminit split_kernel_mapping(unsigned long addr, unsigned long end,
					   unsigned long size, pte_t *pte)
{
	unsigned long mask = ~(size - 1);
	unsigned long aligned_start = addr & mask;
	unsigned long aligned_end = addr + size;
	struct change_mapping_params params;
	bool split_region = false;

	if ((end - addr) < size) {
		/*
		 * We're going to clear the PTE, but haven't flushed
		 * the mapping yet, so it is time to remap and flush.
		 * If the effects are visible outside the processor,
		 * or if we are running in code close to the mapping
		 * we cleared, we are in trouble.
		 */
		if (overlaps_kernel_text(aligned_start, addr) ||
			overlaps_kernel_text(end, aligned_end)) {
			/*
			 * Hack, just return, don't pte_clear
			 */
			WARN_ONCE(1, "Linear mapping %lx->%lx overlaps kernel "
				  "text, not splitting\n", addr, end);
			return;
		}
		split_region = true;
	}

	if (split_region) {
		params.pte = pte;
		params.start = addr;
		params.end = end;
		params.aligned_start = addr & ~(size - 1);
		params.aligned_end = min_t(unsigned long, aligned_end,
				(unsigned long)__va(memblock_end_of_DRAM()));
		stop_machine(stop_machine_change_mapping, &params, NULL);
		return;
	}

	pte_clear(&init_mm, addr, pte);
}

static void remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
			     unsigned long end)
{
	unsigned long next;
	pte_t *pte_base;
	pmd_t *pmd;

	pmd = pmd_start + pmd_index(addr);
	for (; addr < end; addr = next, pmd++) {
		next = pmd_addr_end(addr, end);

		if (!pmd_present(*pmd))
			continue;

		if (pmd_huge(*pmd)) {
			split_kernel_mapping(addr, end, PMD_SIZE, (pte_t *)pmd);
			continue;
		}

		pte_base = (pte_t *)pmd_page_vaddr(*pmd);
		remove_pte_table(pte_base, addr, next);
		free_pte_table(pte_base, pmd);
	}
}

static void remove_pud_table(pud_t *pud_start, unsigned long addr,
			     unsigned long end)
{
	unsigned long next;
	pmd_t *pmd_base;
	pud_t *pud;

	pud = pud_start + pud_index(addr);
	for (; addr < end; addr = next, pud++) {
		next = pud_addr_end(addr, end);

		if (!pud_present(*pud))
			continue;

		if (pud_huge(*pud)) {
			split_kernel_mapping(addr, end, PUD_SIZE, (pte_t *)pud);
			continue;
		}

		pmd_base = (pmd_t *)pud_page_vaddr(*pud);
		remove_pmd_table(pmd_base, addr, next);
		free_pmd_table(pmd_base, pud);
	}
}

static void __meminit remove_pagetable(unsigned long start, unsigned long end)
{
	unsigned long addr, next;
	pud_t *pud_base;
	pgd_t *pgd;

	spin_lock(&init_mm.page_table_lock);

	for (addr = start; addr < end; addr = next) {
		next = pgd_addr_end(addr, end);

		pgd = pgd_offset_k(addr);
		if (!pgd_present(*pgd))
			continue;

		if (pgd_huge(*pgd)) {
			split_kernel_mapping(addr, end, PGDIR_SIZE, (pte_t *)pgd);
			continue;
		}

		pud_base = (pud_t *)pgd_page_vaddr(*pgd);
		remove_pud_table(pud_base, addr, next);
	}

	spin_unlock(&init_mm.page_table_lock);
	radix__flush_tlb_kernel_range(start, end);
}
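
/*
 * Memory hotplug entry points: create or tear down the linear mapping for a
 * memory section that is being added or removed.
 */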
int __meminit radix__create_section_mapping(unsigned long start, unsigned long end, int nid)
{
	if (end >= RADIX_VMALLOC_START) {
		pr_warn("Outside the supported range\n");
		return -1;
	}

	return create_physical_mapping(start, end, nid);
}

int __meminit radix__remove_section_mapping(unsigned long start, unsigned long end)
{
	remove_pagetable(start, end);
	return 0;
}
#endif /* CONFIG_MEMORY_HOTPLUG */

#ifdef CONFIG_SPARSEMEM_VMEMMAP
static int __map_kernel_page_nid(unsigned long ea, unsigned long pa,
				 pgprot_t flags, unsigned int map_page_size,
				 int nid)
{
	return __map_kernel_page(ea, pa, flags, map_page_size, nid, 0, 0);
}

int __meminit radix__vmemmap_create_mapping(unsigned long start,
					    unsigned long page_size,
					    unsigned long phys)
{
	/* Create a PTE encoding */
	unsigned long flags = _PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_KERNEL_RW;
	int nid = early_pfn_to_nid(phys >> PAGE_SHIFT);
	int ret;

	if ((start + page_size) >= RADIX_VMEMMAP_END) {
		pr_warn("Outside the supported range\n");
		return -1;
	}

	ret = __map_kernel_page_nid(start, phys, __pgprot(flags), page_size, nid);
	BUG_ON(ret);

	return 0;
}

#ifdef CONFIG_MEMORY_HOTPLUG
void __meminit radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size)
{
	remove_pagetable(start, start + page_size);
}
#endif
#endif

#ifdef CONFIG_TRANSPARENT_HUGEPAGE

unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
					 pmd_t *pmdp, unsigned long clr,
					 unsigned long set)
{
	unsigned long old;

#ifdef CONFIG_DEBUG_VM
	WARN_ON(!radix__pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
	assert_spin_locked(pmd_lockptr(mm, pmdp));
#endif

	old = radix__pte_update(mm, addr, (pte_t *)pmdp, clr, set, 1);
	trace_hugepage_update(addr, old, clr, set);

	return old;
}

pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
				 pmd_t *pmdp)
{
	pmd_t pmd;

	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	VM_BUG_ON(radix__pmd_trans_huge(*pmdp));
	VM_BUG_ON(pmd_devmap(*pmdp));
	/*
	 * khugepaged calls this for normal pmd
	 */
	pmd = *pmdp;
	pmd_clear(pmdp);

	/* FIXME!! Verify whether we need this kick below */
	serialize_against_pte_lookup(vma->vm_mm);

	radix__flush_tlb_collapsed_pmd(vma->vm_mm, address);

	return pmd;
}

/*
 * For us pgtable_t is pte_t *. In order to save the deposited
 * page table, we consider the allocated page table as a list
 * head. On withdraw we need to make sure we zero out the used
 * list_head memory area.
 */
void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
				       pgtable_t pgtable)
{
	struct list_head *lh = (struct list_head *) pgtable;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	if (!pmd_huge_pte(mm, pmdp))
		INIT_LIST_HEAD(lh);
	else
		list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
	pmd_huge_pte(mm, pmdp) = pgtable;
}

pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
	pte_t *ptep;
	pgtable_t pgtable;
	struct list_head *lh;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	pgtable = pmd_huge_pte(mm, pmdp);
	lh = (struct list_head *) pgtable;
	if (list_empty(lh))
		pmd_huge_pte(mm, pmdp) = NULL;
	else {
		pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
		list_del(lh);
	}
	ptep = (pte_t *) pgtable;
	*ptep = __pte(0);
	ptep++;
	*ptep = __pte(0);
	return pgtable;
}

pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm,
				     unsigned long addr, pmd_t *pmdp)
{
	pmd_t old_pmd;
	unsigned long old;

	old = radix__pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
	old_pmd = __pmd(old);
	/*
	 * Serialize against find_current_mm_pte which does lock-less
	 * lookup in page tables with local interrupts disabled. For huge pages
	 * it casts pmd_t to pte_t. Since format of pte_t is different from
	 * pmd_t we want to prevent transit from pmd pointing to page table
	 * to pmd pointing to huge page (and back) while interrupts are disabled.
	 * We clear pmd to possibly replace it with page table pointer in
	 * different code paths. So make sure we wait for the parallel
	 * find_current_mm_pte to finish.
	 */
	serialize_against_pte_lookup(mm);
	return old_pmd;
}

int radix__has_transparent_hugepage(void)
{
	/* For radix 2M at PMD level means thp */
	if (mmu_psize_defs[MMU_PAGE_2M].shift == PMD_SHIFT)
		return 1;
	return 0;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep,
				  pte_t entry, unsigned long address, int psize)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_ACCESSED |
					      _PAGE_RW | _PAGE_EXEC);

	unsigned long change = pte_val(entry) ^ pte_val(*ptep);
	/*
	 * To avoid NMMU hang while relaxing access, we need to mark
	 * the pte invalid in between.
	 */
	if ((change & _PAGE_RW) && atomic_read(&mm->context.copros) > 0) {
		unsigned long old_pte, new_pte;

		old_pte = __radix_pte_update(ptep, _PAGE_PRESENT, _PAGE_INVALID);
		/*
		 * new value of pte
		 */
		new_pte = old_pte | set;
		radix__flush_tlb_page_psize(mm, address, psize);
		__radix_pte_update(ptep, _PAGE_INVALID, new_pte);
	} else {
		__radix_pte_update(ptep, 0, set);
		/*
		 * Book3S does not require a TLB flush when relaxing access
		 * restrictions when the address space is not attached to a
		 * NMMU, because the core MMU will reload the pte after taking
		 * an access fault, which is defined by the architecture.
		 */
	}
	/* See ptesync comment in radix__set_pte_at */
}
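
/*
 * Second half of the ptep_modify_prot_start()/commit() pair: install the new
 * PTE value, flushing the TLB first when a coprocessor (NMMU) context is
 * attached and the change grants write access.
 */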
void radix__ptep_modify_prot_commit(struct vm_area_struct *vma,
				    unsigned long addr, pte_t *ptep,
				    pte_t old_pte, pte_t pte)
{
	struct mm_struct *mm = vma->vm_mm;

	/*
	 * To avoid NMMU hang while relaxing access, we need to flush the tlb
	 * before we set the new value. We need to do this only for radix,
	 * because hash translation does flush when updating the linux pte.
	 */
	if (is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) &&
	    (atomic_read(&mm->context.copros) > 0))
		radix__flush_tlb_page(vma, addr);

	set_pte_at(mm, addr, ptep, pte);
}