1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * Page table handling routines for radix page table. 4 * 5 * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation. 6 */ 7 8 #define pr_fmt(fmt) "radix-mmu: " fmt 9 10 #include <linux/io.h> 11 #include <linux/kernel.h> 12 #include <linux/sched/mm.h> 13 #include <linux/memblock.h> 14 #include <linux/of_fdt.h> 15 #include <linux/mm.h> 16 #include <linux/string_helpers.h> 17 #include <linux/stop_machine.h> 18 19 #include <asm/pgtable.h> 20 #include <asm/pgalloc.h> 21 #include <asm/mmu_context.h> 22 #include <asm/dma.h> 23 #include <asm/machdep.h> 24 #include <asm/mmu.h> 25 #include <asm/firmware.h> 26 #include <asm/powernv.h> 27 #include <asm/sections.h> 28 #include <asm/trace.h> 29 #include <asm/uaccess.h> 30 31 #include <trace/events/thp.h> 32 33 unsigned int mmu_pid_bits; 34 unsigned int mmu_base_pid; 35 36 static int native_register_process_table(unsigned long base, unsigned long pg_sz, 37 unsigned long table_size) 38 { 39 unsigned long patb0, patb1; 40 41 patb0 = be64_to_cpu(partition_tb[0].patb0); 42 patb1 = base | table_size | PATB_GR; 43 44 mmu_partition_table_set_entry(0, patb0, patb1); 45 46 return 0; 47 } 48 49 static __ref void *early_alloc_pgtable(unsigned long size, int nid, 50 unsigned long region_start, unsigned long region_end) 51 { 52 phys_addr_t min_addr = MEMBLOCK_LOW_LIMIT; 53 phys_addr_t max_addr = MEMBLOCK_ALLOC_ANYWHERE; 54 void *ptr; 55 56 if (region_start) 57 min_addr = region_start; 58 if (region_end) 59 max_addr = region_end; 60 61 ptr = memblock_alloc_try_nid(size, size, min_addr, max_addr, nid); 62 63 if (!ptr) 64 panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa max_addr=%pa\n", 65 __func__, size, size, nid, &min_addr, &max_addr); 66 67 return ptr; 68 } 69 70 static int early_map_kernel_page(unsigned long ea, unsigned long pa, 71 pgprot_t flags, 72 unsigned int map_page_size, 73 int nid, 74 unsigned long region_start, unsigned long region_end) 75 { 76 unsigned long pfn = pa >> PAGE_SHIFT; 77 pgd_t *pgdp; 78 pud_t *pudp; 79 pmd_t *pmdp; 80 pte_t *ptep; 81 82 pgdp = pgd_offset_k(ea); 83 if (pgd_none(*pgdp)) { 84 pudp = early_alloc_pgtable(PUD_TABLE_SIZE, nid, 85 region_start, region_end); 86 pgd_populate(&init_mm, pgdp, pudp); 87 } 88 pudp = pud_offset(pgdp, ea); 89 if (map_page_size == PUD_SIZE) { 90 ptep = (pte_t *)pudp; 91 goto set_the_pte; 92 } 93 if (pud_none(*pudp)) { 94 pmdp = early_alloc_pgtable(PMD_TABLE_SIZE, nid, 95 region_start, region_end); 96 pud_populate(&init_mm, pudp, pmdp); 97 } 98 pmdp = pmd_offset(pudp, ea); 99 if (map_page_size == PMD_SIZE) { 100 ptep = pmdp_ptep(pmdp); 101 goto set_the_pte; 102 } 103 if (!pmd_present(*pmdp)) { 104 ptep = early_alloc_pgtable(PAGE_SIZE, nid, 105 region_start, region_end); 106 pmd_populate_kernel(&init_mm, pmdp, ptep); 107 } 108 ptep = pte_offset_kernel(pmdp, ea); 109 110 set_the_pte: 111 set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags)); 112 smp_wmb(); 113 return 0; 114 } 115 116 /* 117 * nid, region_start, and region_end are hints to try to place the page 118 * table memory in the same node or region. 119 */ 120 static int __map_kernel_page(unsigned long ea, unsigned long pa, 121 pgprot_t flags, 122 unsigned int map_page_size, 123 int nid, 124 unsigned long region_start, unsigned long region_end) 125 { 126 unsigned long pfn = pa >> PAGE_SHIFT; 127 pgd_t *pgdp; 128 pud_t *pudp; 129 pmd_t *pmdp; 130 pte_t *ptep; 131 /* 132 * Make sure task size is correct as per the max adddr 133 */ 134 BUILD_BUG_ON(TASK_SIZE_USER64 > RADIX_PGTABLE_RANGE); 135 136 #ifdef CONFIG_PPC_64K_PAGES 137 BUILD_BUG_ON(RADIX_KERN_MAP_SIZE != (1UL << MAX_EA_BITS_PER_CONTEXT)); 138 #endif 139 140 if (unlikely(!slab_is_available())) 141 return early_map_kernel_page(ea, pa, flags, map_page_size, 142 nid, region_start, region_end); 143 144 /* 145 * Should make page table allocation functions be able to take a 146 * node, so we can place kernel page tables on the right nodes after 147 * boot. 148 */ 149 pgdp = pgd_offset_k(ea); 150 pudp = pud_alloc(&init_mm, pgdp, ea); 151 if (!pudp) 152 return -ENOMEM; 153 if (map_page_size == PUD_SIZE) { 154 ptep = (pte_t *)pudp; 155 goto set_the_pte; 156 } 157 pmdp = pmd_alloc(&init_mm, pudp, ea); 158 if (!pmdp) 159 return -ENOMEM; 160 if (map_page_size == PMD_SIZE) { 161 ptep = pmdp_ptep(pmdp); 162 goto set_the_pte; 163 } 164 ptep = pte_alloc_kernel(pmdp, ea); 165 if (!ptep) 166 return -ENOMEM; 167 168 set_the_pte: 169 set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags)); 170 smp_wmb(); 171 return 0; 172 } 173 174 int radix__map_kernel_page(unsigned long ea, unsigned long pa, 175 pgprot_t flags, 176 unsigned int map_page_size) 177 { 178 return __map_kernel_page(ea, pa, flags, map_page_size, -1, 0, 0); 179 } 180 181 #ifdef CONFIG_STRICT_KERNEL_RWX 182 void radix__change_memory_range(unsigned long start, unsigned long end, 183 unsigned long clear) 184 { 185 unsigned long idx; 186 pgd_t *pgdp; 187 pud_t *pudp; 188 pmd_t *pmdp; 189 pte_t *ptep; 190 191 start = ALIGN_DOWN(start, PAGE_SIZE); 192 end = PAGE_ALIGN(end); // aligns up 193 194 pr_debug("Changing flags on range %lx-%lx removing 0x%lx\n", 195 start, end, clear); 196 197 for (idx = start; idx < end; idx += PAGE_SIZE) { 198 pgdp = pgd_offset_k(idx); 199 pudp = pud_alloc(&init_mm, pgdp, idx); 200 if (!pudp) 201 continue; 202 if (pud_is_leaf(*pudp)) { 203 ptep = (pte_t *)pudp; 204 goto update_the_pte; 205 } 206 pmdp = pmd_alloc(&init_mm, pudp, idx); 207 if (!pmdp) 208 continue; 209 if (pmd_is_leaf(*pmdp)) { 210 ptep = pmdp_ptep(pmdp); 211 goto update_the_pte; 212 } 213 ptep = pte_alloc_kernel(pmdp, idx); 214 if (!ptep) 215 continue; 216 update_the_pte: 217 radix__pte_update(&init_mm, idx, ptep, clear, 0, 0); 218 } 219 220 radix__flush_tlb_kernel_range(start, end); 221 } 222 223 void radix__mark_rodata_ro(void) 224 { 225 unsigned long start, end; 226 227 start = (unsigned long)_stext; 228 end = (unsigned long)__init_begin; 229 230 radix__change_memory_range(start, end, _PAGE_WRITE); 231 } 232 233 void radix__mark_initmem_nx(void) 234 { 235 unsigned long start = (unsigned long)__init_begin; 236 unsigned long end = (unsigned long)__init_end; 237 238 radix__change_memory_range(start, end, _PAGE_EXEC); 239 } 240 #endif /* CONFIG_STRICT_KERNEL_RWX */ 241 242 static inline void __meminit 243 print_mapping(unsigned long start, unsigned long end, unsigned long size, bool exec) 244 { 245 char buf[10]; 246 247 if (end <= start) 248 return; 249 250 string_get_size(size, 1, STRING_UNITS_2, buf, sizeof(buf)); 251 252 pr_info("Mapped 0x%016lx-0x%016lx with %s pages%s\n", start, end, buf, 253 exec ? " (exec)" : ""); 254 } 255 256 static unsigned long next_boundary(unsigned long addr, unsigned long end) 257 { 258 #ifdef CONFIG_STRICT_KERNEL_RWX 259 if (addr < __pa_symbol(__init_begin)) 260 return __pa_symbol(__init_begin); 261 #endif 262 return end; 263 } 264 265 static int __meminit create_physical_mapping(unsigned long start, 266 unsigned long end, 267 int nid) 268 { 269 unsigned long vaddr, addr, mapping_size = 0; 270 bool prev_exec, exec = false; 271 pgprot_t prot; 272 int psize; 273 274 start = _ALIGN_UP(start, PAGE_SIZE); 275 for (addr = start; addr < end; addr += mapping_size) { 276 unsigned long gap, previous_size; 277 int rc; 278 279 gap = next_boundary(addr, end) - addr; 280 previous_size = mapping_size; 281 prev_exec = exec; 282 283 if (IS_ALIGNED(addr, PUD_SIZE) && gap >= PUD_SIZE && 284 mmu_psize_defs[MMU_PAGE_1G].shift) { 285 mapping_size = PUD_SIZE; 286 psize = MMU_PAGE_1G; 287 } else if (IS_ALIGNED(addr, PMD_SIZE) && gap >= PMD_SIZE && 288 mmu_psize_defs[MMU_PAGE_2M].shift) { 289 mapping_size = PMD_SIZE; 290 psize = MMU_PAGE_2M; 291 } else { 292 mapping_size = PAGE_SIZE; 293 psize = mmu_virtual_psize; 294 } 295 296 vaddr = (unsigned long)__va(addr); 297 298 if (overlaps_kernel_text(vaddr, vaddr + mapping_size) || 299 overlaps_interrupt_vector_text(vaddr, vaddr + mapping_size)) { 300 prot = PAGE_KERNEL_X; 301 exec = true; 302 } else { 303 prot = PAGE_KERNEL; 304 exec = false; 305 } 306 307 if (mapping_size != previous_size || exec != prev_exec) { 308 print_mapping(start, addr, previous_size, prev_exec); 309 start = addr; 310 } 311 312 rc = __map_kernel_page(vaddr, addr, prot, mapping_size, nid, start, end); 313 if (rc) 314 return rc; 315 316 update_page_count(psize, 1); 317 } 318 319 print_mapping(start, addr, mapping_size, exec); 320 return 0; 321 } 322 323 static void __init radix_init_pgtable(void) 324 { 325 unsigned long rts_field; 326 struct memblock_region *reg; 327 328 /* We don't support slb for radix */ 329 mmu_slb_size = 0; 330 /* 331 * Create the linear mapping, using standard page size for now 332 */ 333 for_each_memblock(memory, reg) { 334 /* 335 * The memblock allocator is up at this point, so the 336 * page tables will be allocated within the range. No 337 * need or a node (which we don't have yet). 338 */ 339 340 if ((reg->base + reg->size) >= RADIX_VMALLOC_START) { 341 pr_warn("Outside the supported range\n"); 342 continue; 343 } 344 345 WARN_ON(create_physical_mapping(reg->base, 346 reg->base + reg->size, 347 -1)); 348 } 349 350 /* Find out how many PID bits are supported */ 351 if (cpu_has_feature(CPU_FTR_HVMODE)) { 352 if (!mmu_pid_bits) 353 mmu_pid_bits = 20; 354 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE 355 /* 356 * When KVM is possible, we only use the top half of the 357 * PID space to avoid collisions between host and guest PIDs 358 * which can cause problems due to prefetch when exiting the 359 * guest with AIL=3 360 */ 361 mmu_base_pid = 1 << (mmu_pid_bits - 1); 362 #else 363 mmu_base_pid = 1; 364 #endif 365 } else { 366 /* The guest uses the bottom half of the PID space */ 367 if (!mmu_pid_bits) 368 mmu_pid_bits = 19; 369 mmu_base_pid = 1; 370 } 371 372 /* 373 * Allocate Partition table and process table for the 374 * host. 375 */ 376 BUG_ON(PRTB_SIZE_SHIFT > 36); 377 process_tb = early_alloc_pgtable(1UL << PRTB_SIZE_SHIFT, -1, 0, 0); 378 /* 379 * Fill in the process table. 380 */ 381 rts_field = radix__get_tree_size(); 382 process_tb->prtb0 = cpu_to_be64(rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE); 383 /* 384 * Fill in the partition table. We are suppose to use effective address 385 * of process table here. But our linear mapping also enable us to use 386 * physical address here. 387 */ 388 register_process_table(__pa(process_tb), 0, PRTB_SIZE_SHIFT - 12); 389 pr_info("Process table %p and radix root for kernel: %p\n", process_tb, init_mm.pgd); 390 asm volatile("ptesync" : : : "memory"); 391 asm volatile(PPC_TLBIE_5(%0,%1,2,1,1) : : 392 "r" (TLBIEL_INVAL_SET_LPID), "r" (0)); 393 asm volatile("eieio; tlbsync; ptesync" : : : "memory"); 394 trace_tlbie(0, 0, TLBIEL_INVAL_SET_LPID, 0, 2, 1, 1); 395 396 /* 397 * The init_mm context is given the first available (non-zero) PID, 398 * which is the "guard PID" and contains no page table. PIDR should 399 * never be set to zero because that duplicates the kernel address 400 * space at the 0x0... offset (quadrant 0)! 401 * 402 * An arbitrary PID that may later be allocated by the PID allocator 403 * for userspace processes must not be used either, because that 404 * would cause stale user mappings for that PID on CPUs outside of 405 * the TLB invalidation scheme (because it won't be in mm_cpumask). 406 * 407 * So permanently carve out one PID for the purpose of a guard PID. 408 */ 409 init_mm.context.id = mmu_base_pid; 410 mmu_base_pid++; 411 } 412 413 static void __init radix_init_partition_table(void) 414 { 415 unsigned long rts_field, dw0; 416 417 mmu_partition_table_init(); 418 rts_field = radix__get_tree_size(); 419 dw0 = rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE | PATB_HR; 420 mmu_partition_table_set_entry(0, dw0, 0); 421 422 pr_info("Initializing Radix MMU\n"); 423 pr_info("Partition table %p\n", partition_tb); 424 } 425 426 void __init radix_init_native(void) 427 { 428 register_process_table = native_register_process_table; 429 } 430 431 static int __init get_idx_from_shift(unsigned int shift) 432 { 433 int idx = -1; 434 435 switch (shift) { 436 case 0xc: 437 idx = MMU_PAGE_4K; 438 break; 439 case 0x10: 440 idx = MMU_PAGE_64K; 441 break; 442 case 0x15: 443 idx = MMU_PAGE_2M; 444 break; 445 case 0x1e: 446 idx = MMU_PAGE_1G; 447 break; 448 } 449 return idx; 450 } 451 452 static int __init radix_dt_scan_page_sizes(unsigned long node, 453 const char *uname, int depth, 454 void *data) 455 { 456 int size = 0; 457 int shift, idx; 458 unsigned int ap; 459 const __be32 *prop; 460 const char *type = of_get_flat_dt_prop(node, "device_type", NULL); 461 462 /* We are scanning "cpu" nodes only */ 463 if (type == NULL || strcmp(type, "cpu") != 0) 464 return 0; 465 466 /* Find MMU PID size */ 467 prop = of_get_flat_dt_prop(node, "ibm,mmu-pid-bits", &size); 468 if (prop && size == 4) 469 mmu_pid_bits = be32_to_cpup(prop); 470 471 /* Grab page size encodings */ 472 prop = of_get_flat_dt_prop(node, "ibm,processor-radix-AP-encodings", &size); 473 if (!prop) 474 return 0; 475 476 pr_info("Page sizes from device-tree:\n"); 477 for (; size >= 4; size -= 4, ++prop) { 478 479 struct mmu_psize_def *def; 480 481 /* top 3 bit is AP encoding */ 482 shift = be32_to_cpu(prop[0]) & ~(0xe << 28); 483 ap = be32_to_cpu(prop[0]) >> 29; 484 pr_info("Page size shift = %d AP=0x%x\n", shift, ap); 485 486 idx = get_idx_from_shift(shift); 487 if (idx < 0) 488 continue; 489 490 def = &mmu_psize_defs[idx]; 491 def->shift = shift; 492 def->ap = ap; 493 } 494 495 /* needed ? */ 496 cur_cpu_spec->mmu_features &= ~MMU_FTR_NO_SLBIE_B; 497 return 1; 498 } 499 500 void __init radix__early_init_devtree(void) 501 { 502 int rc; 503 504 /* 505 * Try to find the available page sizes in the device-tree 506 */ 507 rc = of_scan_flat_dt(radix_dt_scan_page_sizes, NULL); 508 if (rc != 0) /* Found */ 509 goto found; 510 /* 511 * let's assume we have page 4k and 64k support 512 */ 513 mmu_psize_defs[MMU_PAGE_4K].shift = 12; 514 mmu_psize_defs[MMU_PAGE_4K].ap = 0x0; 515 516 mmu_psize_defs[MMU_PAGE_64K].shift = 16; 517 mmu_psize_defs[MMU_PAGE_64K].ap = 0x5; 518 found: 519 return; 520 } 521 522 static void radix_init_amor(void) 523 { 524 /* 525 * In HV mode, we init AMOR (Authority Mask Override Register) so that 526 * the hypervisor and guest can setup IAMR (Instruction Authority Mask 527 * Register), enable key 0 and set it to 1. 528 * 529 * AMOR = 0b1100 .... 0000 (Mask for key 0 is 11) 530 */ 531 mtspr(SPRN_AMOR, (3ul << 62)); 532 } 533 534 #ifdef CONFIG_PPC_KUEP 535 void setup_kuep(bool disabled) 536 { 537 if (disabled || !early_radix_enabled()) 538 return; 539 540 if (smp_processor_id() == boot_cpuid) 541 pr_info("Activating Kernel Userspace Execution Prevention\n"); 542 543 /* 544 * Radix always uses key0 of the IAMR to determine if an access is 545 * allowed. We set bit 0 (IBM bit 1) of key0, to prevent instruction 546 * fetch. 547 */ 548 mtspr(SPRN_IAMR, (1ul << 62)); 549 } 550 #endif 551 552 #ifdef CONFIG_PPC_KUAP 553 void setup_kuap(bool disabled) 554 { 555 if (disabled || !early_radix_enabled()) 556 return; 557 558 if (smp_processor_id() == boot_cpuid) { 559 pr_info("Activating Kernel Userspace Access Prevention\n"); 560 cur_cpu_spec->mmu_features |= MMU_FTR_RADIX_KUAP; 561 } 562 563 /* Make sure userspace can't change the AMR */ 564 mtspr(SPRN_UAMOR, 0); 565 mtspr(SPRN_AMR, AMR_KUAP_BLOCKED); 566 isync(); 567 } 568 #endif 569 570 void __init radix__early_init_mmu(void) 571 { 572 unsigned long lpcr; 573 574 #ifdef CONFIG_PPC_64K_PAGES 575 /* PAGE_SIZE mappings */ 576 mmu_virtual_psize = MMU_PAGE_64K; 577 #else 578 mmu_virtual_psize = MMU_PAGE_4K; 579 #endif 580 581 #ifdef CONFIG_SPARSEMEM_VMEMMAP 582 /* vmemmap mapping */ 583 if (mmu_psize_defs[MMU_PAGE_2M].shift) { 584 /* 585 * map vmemmap using 2M if available 586 */ 587 mmu_vmemmap_psize = MMU_PAGE_2M; 588 } else 589 mmu_vmemmap_psize = mmu_virtual_psize; 590 #endif 591 /* 592 * initialize page table size 593 */ 594 __pte_index_size = RADIX_PTE_INDEX_SIZE; 595 __pmd_index_size = RADIX_PMD_INDEX_SIZE; 596 __pud_index_size = RADIX_PUD_INDEX_SIZE; 597 __pgd_index_size = RADIX_PGD_INDEX_SIZE; 598 __pud_cache_index = RADIX_PUD_INDEX_SIZE; 599 __pte_table_size = RADIX_PTE_TABLE_SIZE; 600 __pmd_table_size = RADIX_PMD_TABLE_SIZE; 601 __pud_table_size = RADIX_PUD_TABLE_SIZE; 602 __pgd_table_size = RADIX_PGD_TABLE_SIZE; 603 604 __pmd_val_bits = RADIX_PMD_VAL_BITS; 605 __pud_val_bits = RADIX_PUD_VAL_BITS; 606 __pgd_val_bits = RADIX_PGD_VAL_BITS; 607 608 __kernel_virt_start = RADIX_KERN_VIRT_START; 609 __vmalloc_start = RADIX_VMALLOC_START; 610 __vmalloc_end = RADIX_VMALLOC_END; 611 __kernel_io_start = RADIX_KERN_IO_START; 612 __kernel_io_end = RADIX_KERN_IO_END; 613 vmemmap = (struct page *)RADIX_VMEMMAP_START; 614 ioremap_bot = IOREMAP_BASE; 615 616 #ifdef CONFIG_PCI 617 pci_io_base = ISA_IO_BASE; 618 #endif 619 __pte_frag_nr = RADIX_PTE_FRAG_NR; 620 __pte_frag_size_shift = RADIX_PTE_FRAG_SIZE_SHIFT; 621 __pmd_frag_nr = RADIX_PMD_FRAG_NR; 622 __pmd_frag_size_shift = RADIX_PMD_FRAG_SIZE_SHIFT; 623 624 if (!firmware_has_feature(FW_FEATURE_LPAR)) { 625 radix_init_native(); 626 lpcr = mfspr(SPRN_LPCR); 627 mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR); 628 radix_init_partition_table(); 629 radix_init_amor(); 630 } else { 631 radix_init_pseries(); 632 } 633 634 memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE); 635 636 radix_init_pgtable(); 637 /* Switch to the guard PID before turning on MMU */ 638 radix__switch_mmu_context(NULL, &init_mm); 639 if (cpu_has_feature(CPU_FTR_HVMODE)) 640 tlbiel_all(); 641 } 642 643 void radix__early_init_mmu_secondary(void) 644 { 645 unsigned long lpcr; 646 /* 647 * update partition table control register and UPRT 648 */ 649 if (!firmware_has_feature(FW_FEATURE_LPAR)) { 650 lpcr = mfspr(SPRN_LPCR); 651 mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR); 652 653 mtspr(SPRN_PTCR, 654 __pa(partition_tb) | (PATB_SIZE_SHIFT - 12)); 655 radix_init_amor(); 656 } 657 658 radix__switch_mmu_context(NULL, &init_mm); 659 if (cpu_has_feature(CPU_FTR_HVMODE)) 660 tlbiel_all(); 661 } 662 663 void radix__mmu_cleanup_all(void) 664 { 665 unsigned long lpcr; 666 667 if (!firmware_has_feature(FW_FEATURE_LPAR)) { 668 lpcr = mfspr(SPRN_LPCR); 669 mtspr(SPRN_LPCR, lpcr & ~LPCR_UPRT); 670 mtspr(SPRN_PTCR, 0); 671 powernv_set_nmmu_ptcr(0); 672 radix__flush_tlb_all(); 673 } 674 } 675 676 void radix__setup_initial_memory_limit(phys_addr_t first_memblock_base, 677 phys_addr_t first_memblock_size) 678 { 679 /* 680 * We don't currently support the first MEMBLOCK not mapping 0 681 * physical on those processors 682 */ 683 BUG_ON(first_memblock_base != 0); 684 685 /* 686 * Radix mode is not limited by RMA / VRMA addressing. 687 */ 688 ppc64_rma_size = ULONG_MAX; 689 } 690 691 #ifdef CONFIG_MEMORY_HOTPLUG 692 static void free_pte_table(pte_t *pte_start, pmd_t *pmd) 693 { 694 pte_t *pte; 695 int i; 696 697 for (i = 0; i < PTRS_PER_PTE; i++) { 698 pte = pte_start + i; 699 if (!pte_none(*pte)) 700 return; 701 } 702 703 pte_free_kernel(&init_mm, pte_start); 704 pmd_clear(pmd); 705 } 706 707 static void free_pmd_table(pmd_t *pmd_start, pud_t *pud) 708 { 709 pmd_t *pmd; 710 int i; 711 712 for (i = 0; i < PTRS_PER_PMD; i++) { 713 pmd = pmd_start + i; 714 if (!pmd_none(*pmd)) 715 return; 716 } 717 718 pmd_free(&init_mm, pmd_start); 719 pud_clear(pud); 720 } 721 722 struct change_mapping_params { 723 pte_t *pte; 724 unsigned long start; 725 unsigned long end; 726 unsigned long aligned_start; 727 unsigned long aligned_end; 728 }; 729 730 static int __meminit stop_machine_change_mapping(void *data) 731 { 732 struct change_mapping_params *params = 733 (struct change_mapping_params *)data; 734 735 if (!data) 736 return -1; 737 738 spin_unlock(&init_mm.page_table_lock); 739 pte_clear(&init_mm, params->aligned_start, params->pte); 740 create_physical_mapping(params->aligned_start, params->start, -1); 741 create_physical_mapping(params->end, params->aligned_end, -1); 742 spin_lock(&init_mm.page_table_lock); 743 return 0; 744 } 745 746 static void remove_pte_table(pte_t *pte_start, unsigned long addr, 747 unsigned long end) 748 { 749 unsigned long next; 750 pte_t *pte; 751 752 pte = pte_start + pte_index(addr); 753 for (; addr < end; addr = next, pte++) { 754 next = (addr + PAGE_SIZE) & PAGE_MASK; 755 if (next > end) 756 next = end; 757 758 if (!pte_present(*pte)) 759 continue; 760 761 if (!PAGE_ALIGNED(addr) || !PAGE_ALIGNED(next)) { 762 /* 763 * The vmemmap_free() and remove_section_mapping() 764 * codepaths call us with aligned addresses. 765 */ 766 WARN_ONCE(1, "%s: unaligned range\n", __func__); 767 continue; 768 } 769 770 pte_clear(&init_mm, addr, pte); 771 } 772 } 773 774 /* 775 * clear the pte and potentially split the mapping helper 776 */ 777 static void __meminit split_kernel_mapping(unsigned long addr, unsigned long end, 778 unsigned long size, pte_t *pte) 779 { 780 unsigned long mask = ~(size - 1); 781 unsigned long aligned_start = addr & mask; 782 unsigned long aligned_end = addr + size; 783 struct change_mapping_params params; 784 bool split_region = false; 785 786 if ((end - addr) < size) { 787 /* 788 * We're going to clear the PTE, but not flushed 789 * the mapping, time to remap and flush. The 790 * effects if visible outside the processor or 791 * if we are running in code close to the 792 * mapping we cleared, we are in trouble. 793 */ 794 if (overlaps_kernel_text(aligned_start, addr) || 795 overlaps_kernel_text(end, aligned_end)) { 796 /* 797 * Hack, just return, don't pte_clear 798 */ 799 WARN_ONCE(1, "Linear mapping %lx->%lx overlaps kernel " 800 "text, not splitting\n", addr, end); 801 return; 802 } 803 split_region = true; 804 } 805 806 if (split_region) { 807 params.pte = pte; 808 params.start = addr; 809 params.end = end; 810 params.aligned_start = addr & ~(size - 1); 811 params.aligned_end = min_t(unsigned long, aligned_end, 812 (unsigned long)__va(memblock_end_of_DRAM())); 813 stop_machine(stop_machine_change_mapping, ¶ms, NULL); 814 return; 815 } 816 817 pte_clear(&init_mm, addr, pte); 818 } 819 820 static void remove_pmd_table(pmd_t *pmd_start, unsigned long addr, 821 unsigned long end) 822 { 823 unsigned long next; 824 pte_t *pte_base; 825 pmd_t *pmd; 826 827 pmd = pmd_start + pmd_index(addr); 828 for (; addr < end; addr = next, pmd++) { 829 next = pmd_addr_end(addr, end); 830 831 if (!pmd_present(*pmd)) 832 continue; 833 834 if (pmd_is_leaf(*pmd)) { 835 split_kernel_mapping(addr, end, PMD_SIZE, (pte_t *)pmd); 836 continue; 837 } 838 839 pte_base = (pte_t *)pmd_page_vaddr(*pmd); 840 remove_pte_table(pte_base, addr, next); 841 free_pte_table(pte_base, pmd); 842 } 843 } 844 845 static void remove_pud_table(pud_t *pud_start, unsigned long addr, 846 unsigned long end) 847 { 848 unsigned long next; 849 pmd_t *pmd_base; 850 pud_t *pud; 851 852 pud = pud_start + pud_index(addr); 853 for (; addr < end; addr = next, pud++) { 854 next = pud_addr_end(addr, end); 855 856 if (!pud_present(*pud)) 857 continue; 858 859 if (pud_is_leaf(*pud)) { 860 split_kernel_mapping(addr, end, PUD_SIZE, (pte_t *)pud); 861 continue; 862 } 863 864 pmd_base = (pmd_t *)pud_page_vaddr(*pud); 865 remove_pmd_table(pmd_base, addr, next); 866 free_pmd_table(pmd_base, pud); 867 } 868 } 869 870 static void __meminit remove_pagetable(unsigned long start, unsigned long end) 871 { 872 unsigned long addr, next; 873 pud_t *pud_base; 874 pgd_t *pgd; 875 876 spin_lock(&init_mm.page_table_lock); 877 878 for (addr = start; addr < end; addr = next) { 879 next = pgd_addr_end(addr, end); 880 881 pgd = pgd_offset_k(addr); 882 if (!pgd_present(*pgd)) 883 continue; 884 885 if (pgd_is_leaf(*pgd)) { 886 split_kernel_mapping(addr, end, PGDIR_SIZE, (pte_t *)pgd); 887 continue; 888 } 889 890 pud_base = (pud_t *)pgd_page_vaddr(*pgd); 891 remove_pud_table(pud_base, addr, next); 892 } 893 894 spin_unlock(&init_mm.page_table_lock); 895 radix__flush_tlb_kernel_range(start, end); 896 } 897 898 int __meminit radix__create_section_mapping(unsigned long start, unsigned long end, int nid) 899 { 900 if (end >= RADIX_VMALLOC_START) { 901 pr_warn("Outside the supported range\n"); 902 return -1; 903 } 904 905 return create_physical_mapping(start, end, nid); 906 } 907 908 int __meminit radix__remove_section_mapping(unsigned long start, unsigned long end) 909 { 910 remove_pagetable(start, end); 911 return 0; 912 } 913 #endif /* CONFIG_MEMORY_HOTPLUG */ 914 915 #ifdef CONFIG_SPARSEMEM_VMEMMAP 916 static int __map_kernel_page_nid(unsigned long ea, unsigned long pa, 917 pgprot_t flags, unsigned int map_page_size, 918 int nid) 919 { 920 return __map_kernel_page(ea, pa, flags, map_page_size, nid, 0, 0); 921 } 922 923 int __meminit radix__vmemmap_create_mapping(unsigned long start, 924 unsigned long page_size, 925 unsigned long phys) 926 { 927 /* Create a PTE encoding */ 928 unsigned long flags = _PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_KERNEL_RW; 929 int nid = early_pfn_to_nid(phys >> PAGE_SHIFT); 930 int ret; 931 932 if ((start + page_size) >= RADIX_VMEMMAP_END) { 933 pr_warn("Outside the supported range\n"); 934 return -1; 935 } 936 937 ret = __map_kernel_page_nid(start, phys, __pgprot(flags), page_size, nid); 938 BUG_ON(ret); 939 940 return 0; 941 } 942 943 #ifdef CONFIG_MEMORY_HOTPLUG 944 void __meminit radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size) 945 { 946 remove_pagetable(start, start + page_size); 947 } 948 #endif 949 #endif 950 951 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 952 953 unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, 954 pmd_t *pmdp, unsigned long clr, 955 unsigned long set) 956 { 957 unsigned long old; 958 959 #ifdef CONFIG_DEBUG_VM 960 WARN_ON(!radix__pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp)); 961 assert_spin_locked(pmd_lockptr(mm, pmdp)); 962 #endif 963 964 old = radix__pte_update(mm, addr, (pte_t *)pmdp, clr, set, 1); 965 trace_hugepage_update(addr, old, clr, set); 966 967 return old; 968 } 969 970 pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, 971 pmd_t *pmdp) 972 973 { 974 pmd_t pmd; 975 976 VM_BUG_ON(address & ~HPAGE_PMD_MASK); 977 VM_BUG_ON(radix__pmd_trans_huge(*pmdp)); 978 VM_BUG_ON(pmd_devmap(*pmdp)); 979 /* 980 * khugepaged calls this for normal pmd 981 */ 982 pmd = *pmdp; 983 pmd_clear(pmdp); 984 985 /*FIXME!! Verify whether we need this kick below */ 986 serialize_against_pte_lookup(vma->vm_mm); 987 988 radix__flush_tlb_collapsed_pmd(vma->vm_mm, address); 989 990 return pmd; 991 } 992 993 /* 994 * For us pgtable_t is pte_t *. Inorder to save the deposisted 995 * page table, we consider the allocated page table as a list 996 * head. On withdraw we need to make sure we zero out the used 997 * list_head memory area. 998 */ 999 void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, 1000 pgtable_t pgtable) 1001 { 1002 struct list_head *lh = (struct list_head *) pgtable; 1003 1004 assert_spin_locked(pmd_lockptr(mm, pmdp)); 1005 1006 /* FIFO */ 1007 if (!pmd_huge_pte(mm, pmdp)) 1008 INIT_LIST_HEAD(lh); 1009 else 1010 list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp)); 1011 pmd_huge_pte(mm, pmdp) = pgtable; 1012 } 1013 1014 pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) 1015 { 1016 pte_t *ptep; 1017 pgtable_t pgtable; 1018 struct list_head *lh; 1019 1020 assert_spin_locked(pmd_lockptr(mm, pmdp)); 1021 1022 /* FIFO */ 1023 pgtable = pmd_huge_pte(mm, pmdp); 1024 lh = (struct list_head *) pgtable; 1025 if (list_empty(lh)) 1026 pmd_huge_pte(mm, pmdp) = NULL; 1027 else { 1028 pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next; 1029 list_del(lh); 1030 } 1031 ptep = (pte_t *) pgtable; 1032 *ptep = __pte(0); 1033 ptep++; 1034 *ptep = __pte(0); 1035 return pgtable; 1036 } 1037 1038 pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm, 1039 unsigned long addr, pmd_t *pmdp) 1040 { 1041 pmd_t old_pmd; 1042 unsigned long old; 1043 1044 old = radix__pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0); 1045 old_pmd = __pmd(old); 1046 /* 1047 * Serialize against find_current_mm_pte which does lock-less 1048 * lookup in page tables with local interrupts disabled. For huge pages 1049 * it casts pmd_t to pte_t. Since format of pte_t is different from 1050 * pmd_t we want to prevent transit from pmd pointing to page table 1051 * to pmd pointing to huge page (and back) while interrupts are disabled. 1052 * We clear pmd to possibly replace it with page table pointer in 1053 * different code paths. So make sure we wait for the parallel 1054 * find_current_mm_pte to finish. 1055 */ 1056 serialize_against_pte_lookup(mm); 1057 return old_pmd; 1058 } 1059 1060 int radix__has_transparent_hugepage(void) 1061 { 1062 /* For radix 2M at PMD level means thp */ 1063 if (mmu_psize_defs[MMU_PAGE_2M].shift == PMD_SHIFT) 1064 return 1; 1065 return 0; 1066 } 1067 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 1068 1069 void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep, 1070 pte_t entry, unsigned long address, int psize) 1071 { 1072 struct mm_struct *mm = vma->vm_mm; 1073 unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_ACCESSED | 1074 _PAGE_RW | _PAGE_EXEC); 1075 1076 unsigned long change = pte_val(entry) ^ pte_val(*ptep); 1077 /* 1078 * To avoid NMMU hang while relaxing access, we need mark 1079 * the pte invalid in between. 1080 */ 1081 if ((change & _PAGE_RW) && atomic_read(&mm->context.copros) > 0) { 1082 unsigned long old_pte, new_pte; 1083 1084 old_pte = __radix_pte_update(ptep, _PAGE_PRESENT, _PAGE_INVALID); 1085 /* 1086 * new value of pte 1087 */ 1088 new_pte = old_pte | set; 1089 radix__flush_tlb_page_psize(mm, address, psize); 1090 __radix_pte_update(ptep, _PAGE_INVALID, new_pte); 1091 } else { 1092 __radix_pte_update(ptep, 0, set); 1093 /* 1094 * Book3S does not require a TLB flush when relaxing access 1095 * restrictions when the address space is not attached to a 1096 * NMMU, because the core MMU will reload the pte after taking 1097 * an access fault, which is defined by the architectue. 1098 */ 1099 } 1100 /* See ptesync comment in radix__set_pte_at */ 1101 } 1102 1103 void radix__ptep_modify_prot_commit(struct vm_area_struct *vma, 1104 unsigned long addr, pte_t *ptep, 1105 pte_t old_pte, pte_t pte) 1106 { 1107 struct mm_struct *mm = vma->vm_mm; 1108 1109 /* 1110 * To avoid NMMU hang while relaxing access we need to flush the tlb before 1111 * we set the new value. We need to do this only for radix, because hash 1112 * translation does flush when updating the linux pte. 1113 */ 1114 if (is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) && 1115 (atomic_read(&mm->context.copros) > 0)) 1116 radix__flush_tlb_page(vma, addr); 1117 1118 set_pte_at(mm, addr, ptep, pte); 1119 } 1120 1121 int __init arch_ioremap_pud_supported(void) 1122 { 1123 /* HPT does not cope with large pages in the vmalloc area */ 1124 return radix_enabled(); 1125 } 1126 1127 int __init arch_ioremap_pmd_supported(void) 1128 { 1129 return radix_enabled(); 1130 } 1131 1132 int p4d_free_pud_page(p4d_t *p4d, unsigned long addr) 1133 { 1134 return 0; 1135 } 1136 1137 int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) 1138 { 1139 pte_t *ptep = (pte_t *)pud; 1140 pte_t new_pud = pfn_pte(__phys_to_pfn(addr), prot); 1141 1142 if (!radix_enabled()) 1143 return 0; 1144 1145 set_pte_at(&init_mm, 0 /* radix unused */, ptep, new_pud); 1146 1147 return 1; 1148 } 1149 1150 int pud_clear_huge(pud_t *pud) 1151 { 1152 if (pud_huge(*pud)) { 1153 pud_clear(pud); 1154 return 1; 1155 } 1156 1157 return 0; 1158 } 1159 1160 int pud_free_pmd_page(pud_t *pud, unsigned long addr) 1161 { 1162 pmd_t *pmd; 1163 int i; 1164 1165 pmd = (pmd_t *)pud_page_vaddr(*pud); 1166 pud_clear(pud); 1167 1168 flush_tlb_kernel_range(addr, addr + PUD_SIZE); 1169 1170 for (i = 0; i < PTRS_PER_PMD; i++) { 1171 if (!pmd_none(pmd[i])) { 1172 pte_t *pte; 1173 pte = (pte_t *)pmd_page_vaddr(pmd[i]); 1174 1175 pte_free_kernel(&init_mm, pte); 1176 } 1177 } 1178 1179 pmd_free(&init_mm, pmd); 1180 1181 return 1; 1182 } 1183 1184 int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) 1185 { 1186 pte_t *ptep = (pte_t *)pmd; 1187 pte_t new_pmd = pfn_pte(__phys_to_pfn(addr), prot); 1188 1189 if (!radix_enabled()) 1190 return 0; 1191 1192 set_pte_at(&init_mm, 0 /* radix unused */, ptep, new_pmd); 1193 1194 return 1; 1195 } 1196 1197 int pmd_clear_huge(pmd_t *pmd) 1198 { 1199 if (pmd_huge(*pmd)) { 1200 pmd_clear(pmd); 1201 return 1; 1202 } 1203 1204 return 0; 1205 } 1206 1207 int pmd_free_pte_page(pmd_t *pmd, unsigned long addr) 1208 { 1209 pte_t *pte; 1210 1211 pte = (pte_t *)pmd_page_vaddr(*pmd); 1212 pmd_clear(pmd); 1213 1214 flush_tlb_kernel_range(addr, addr + PMD_SIZE); 1215 1216 pte_free_kernel(&init_mm, pte); 1217 1218 return 1; 1219 } 1220 1221 int radix__ioremap_range(unsigned long ea, phys_addr_t pa, unsigned long size, 1222 pgprot_t prot, int nid) 1223 { 1224 if (likely(slab_is_available())) { 1225 int err = ioremap_page_range(ea, ea + size, pa, prot); 1226 if (err) 1227 unmap_kernel_range(ea, size); 1228 return err; 1229 } else { 1230 unsigned long i; 1231 1232 for (i = 0; i < size; i += PAGE_SIZE) { 1233 int err = map_kernel_page(ea + i, pa + i, prot); 1234 if (WARN_ON_ONCE(err)) /* Should clean up */ 1235 return err; 1236 } 1237 return 0; 1238 } 1239 } 1240 1241 int __init arch_ioremap_p4d_supported(void) 1242 { 1243 return 0; 1244 } 1245