// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Page table handling routines for radix page table.
 *
 * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
 */

#define pr_fmt(fmt) "radix-mmu: " fmt

#include <linux/io.h>
#include <linux/kernel.h>
#include <linux/sched/mm.h>
#include <linux/memblock.h>
#include <linux/of_fdt.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/string_helpers.h>
#include <linux/stop_machine.h>

#include <asm/pgalloc.h>
#include <asm/mmu_context.h>
#include <asm/dma.h>
#include <asm/machdep.h>
#include <asm/mmu.h>
#include <asm/firmware.h>
#include <asm/powernv.h>
#include <asm/sections.h>
#include <asm/smp.h>
#include <asm/trace.h>
#include <asm/uaccess.h>
#include <asm/ultravisor.h>

#include <trace/events/thp.h>

unsigned int mmu_pid_bits;
unsigned int mmu_base_pid;

static __ref void *early_alloc_pgtable(unsigned long size, int nid,
				       unsigned long region_start,
				       unsigned long region_end)
{
	phys_addr_t min_addr = MEMBLOCK_LOW_LIMIT;
	phys_addr_t max_addr = MEMBLOCK_ALLOC_ANYWHERE;
	void *ptr;

	if (region_start)
		min_addr = region_start;
	if (region_end)
		max_addr = region_end;

	ptr = memblock_alloc_try_nid(size, size, min_addr, max_addr, nid);

	if (!ptr)
		panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa max_addr=%pa\n",
		      __func__, size, size, nid, &min_addr, &max_addr);

	return ptr;
}

static int early_map_kernel_page(unsigned long ea, unsigned long pa,
				 pgprot_t flags,
				 unsigned int map_page_size,
				 int nid,
				 unsigned long region_start,
				 unsigned long region_end)
{
	unsigned long pfn = pa >> PAGE_SHIFT;
	pgd_t *pgdp;
	p4d_t *p4dp;
	pud_t *pudp;
	pmd_t *pmdp;
	pte_t *ptep;

	pgdp = pgd_offset_k(ea);
	p4dp = p4d_offset(pgdp, ea);
	if (p4d_none(*p4dp)) {
		pudp = early_alloc_pgtable(PUD_TABLE_SIZE, nid,
					   region_start, region_end);
		p4d_populate(&init_mm, p4dp, pudp);
	}
	pudp = pud_offset(p4dp, ea);
	if (map_page_size == PUD_SIZE) {
		ptep = (pte_t *)pudp;
		goto set_the_pte;
	}
	if (pud_none(*pudp)) {
		pmdp = early_alloc_pgtable(PMD_TABLE_SIZE, nid,
					   region_start, region_end);
		pud_populate(&init_mm, pudp, pmdp);
	}
	pmdp = pmd_offset(pudp, ea);
	if (map_page_size == PMD_SIZE) {
		ptep = pmdp_ptep(pmdp);
		goto set_the_pte;
	}
	if (!pmd_present(*pmdp)) {
		ptep = early_alloc_pgtable(PAGE_SIZE, nid,
					   region_start, region_end);
		pmd_populate_kernel(&init_mm, pmdp, ptep);
	}
	ptep = pte_offset_kernel(pmdp, ea);

set_the_pte:
	set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
	smp_wmb();
	return 0;
}

/*
 * nid, region_start, and region_end are hints to try to place the page
 * table memory in the same node or region.
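 *
 * If the slab allocator is not yet available, the early memblock-backed
 * path (early_map_kernel_page() above) is used instead.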
 */
static int __map_kernel_page(unsigned long ea, unsigned long pa,
			     pgprot_t flags,
			     unsigned int map_page_size,
			     int nid,
			     unsigned long region_start, unsigned long region_end)
{
	unsigned long pfn = pa >> PAGE_SHIFT;
	pgd_t *pgdp;
	p4d_t *p4dp;
	pud_t *pudp;
	pmd_t *pmdp;
	pte_t *ptep;
	/*
	 * Make sure task size is correct as per the max addr
	 */
	BUILD_BUG_ON(TASK_SIZE_USER64 > RADIX_PGTABLE_RANGE);

#ifdef CONFIG_PPC_64K_PAGES
	BUILD_BUG_ON(RADIX_KERN_MAP_SIZE != (1UL << MAX_EA_BITS_PER_CONTEXT));
#endif

	if (unlikely(!slab_is_available()))
		return early_map_kernel_page(ea, pa, flags, map_page_size,
					     nid, region_start, region_end);

	/*
	 * Should make page table allocation functions be able to take a
	 * node, so we can place kernel page tables on the right nodes after
	 * boot.
	 */
	pgdp = pgd_offset_k(ea);
	p4dp = p4d_offset(pgdp, ea);
	pudp = pud_alloc(&init_mm, p4dp, ea);
	if (!pudp)
		return -ENOMEM;
	if (map_page_size == PUD_SIZE) {
		ptep = (pte_t *)pudp;
		goto set_the_pte;
	}
	pmdp = pmd_alloc(&init_mm, pudp, ea);
	if (!pmdp)
		return -ENOMEM;
	if (map_page_size == PMD_SIZE) {
		ptep = pmdp_ptep(pmdp);
		goto set_the_pte;
	}
	ptep = pte_alloc_kernel(pmdp, ea);
	if (!ptep)
		return -ENOMEM;

set_the_pte:
	set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
	smp_wmb();
	return 0;
}

int radix__map_kernel_page(unsigned long ea, unsigned long pa,
			   pgprot_t flags,
			   unsigned int map_page_size)
{
	return __map_kernel_page(ea, pa, flags, map_page_size, -1, 0, 0);
}

#ifdef CONFIG_STRICT_KERNEL_RWX
void radix__change_memory_range(unsigned long start, unsigned long end,
				unsigned long clear)
{
	unsigned long idx;
	pgd_t *pgdp;
	p4d_t *p4dp;
	pud_t *pudp;
	pmd_t *pmdp;
	pte_t *ptep;

	start = ALIGN_DOWN(start, PAGE_SIZE);
	end = PAGE_ALIGN(end); // aligns up

	pr_debug("Changing flags on range %lx-%lx removing 0x%lx\n",
		 start, end, clear);

	for (idx = start; idx < end; idx += PAGE_SIZE) {
		pgdp = pgd_offset_k(idx);
		p4dp = p4d_offset(pgdp, idx);
		pudp = pud_alloc(&init_mm, p4dp, idx);
		if (!pudp)
			continue;
		if (pud_is_leaf(*pudp)) {
			ptep = (pte_t *)pudp;
			goto update_the_pte;
		}
		pmdp = pmd_alloc(&init_mm, pudp, idx);
		if (!pmdp)
			continue;
		if (pmd_is_leaf(*pmdp)) {
			ptep = pmdp_ptep(pmdp);
			goto update_the_pte;
		}
		ptep = pte_alloc_kernel(pmdp, idx);
		if (!ptep)
			continue;
update_the_pte:
		radix__pte_update(&init_mm, idx, ptep, clear, 0, 0);
	}

	radix__flush_tlb_kernel_range(start, end);
}

void radix__mark_rodata_ro(void)
{
	unsigned long start, end;

	start = (unsigned long)_stext;
	end = (unsigned long)__init_begin;

	radix__change_memory_range(start, end, _PAGE_WRITE);
}

void radix__mark_initmem_nx(void)
{
	unsigned long start = (unsigned long)__init_begin;
	unsigned long end = (unsigned long)__init_end;

	radix__change_memory_range(start, end, _PAGE_EXEC);
}
#endif /* CONFIG_STRICT_KERNEL_RWX */

static inline void __meminit
print_mapping(unsigned long start, unsigned long end, unsigned long size, bool exec)
{
	char buf[10];

	if (end <= start)
		return;

	string_get_size(size, 1, STRING_UNITS_2, buf, sizeof(buf));

	pr_info("Mapped 0x%016lx-0x%016lx with %s pages%s\n", start, end, buf,
		exec ? " (exec)" : "");
}

static unsigned long next_boundary(unsigned long addr, unsigned long end)
{
#ifdef CONFIG_STRICT_KERNEL_RWX
	if (addr < __pa_symbol(__init_begin))
		return __pa_symbol(__init_begin);
#endif
	return end;
}

static int __meminit create_physical_mapping(unsigned long start,
					     unsigned long end,
					     int nid, pgprot_t _prot)
{
	unsigned long vaddr, addr, mapping_size = 0;
	bool prev_exec, exec = false;
	pgprot_t prot;
	int psize;

	start = ALIGN(start, PAGE_SIZE);
	for (addr = start; addr < end; addr += mapping_size) {
		unsigned long gap, previous_size;
		int rc;

		gap = next_boundary(addr, end) - addr;
		previous_size = mapping_size;
		prev_exec = exec;

		if (IS_ALIGNED(addr, PUD_SIZE) && gap >= PUD_SIZE &&
		    mmu_psize_defs[MMU_PAGE_1G].shift) {
			mapping_size = PUD_SIZE;
			psize = MMU_PAGE_1G;
		} else if (IS_ALIGNED(addr, PMD_SIZE) && gap >= PMD_SIZE &&
			   mmu_psize_defs[MMU_PAGE_2M].shift) {
			mapping_size = PMD_SIZE;
			psize = MMU_PAGE_2M;
		} else {
			mapping_size = PAGE_SIZE;
			psize = mmu_virtual_psize;
		}

		vaddr = (unsigned long)__va(addr);

		if (overlaps_kernel_text(vaddr, vaddr + mapping_size) ||
		    overlaps_interrupt_vector_text(vaddr, vaddr + mapping_size)) {
			prot = PAGE_KERNEL_X;
			exec = true;
		} else {
			prot = _prot;
			exec = false;
		}

		if (mapping_size != previous_size || exec != prev_exec) {
			print_mapping(start, addr, previous_size, prev_exec);
			start = addr;
		}

		rc = __map_kernel_page(vaddr, addr, prot, mapping_size, nid, start, end);
		if (rc)
			return rc;

		update_page_count(psize, 1);
	}

	print_mapping(start, addr, mapping_size, exec);
	return 0;
}

static void __init radix_init_pgtable(void)
{
	unsigned long rts_field;
	struct memblock_region *reg;

	/* We don't support slb for radix */
	mmu_slb_size = 0;
	/*
	 * Create the linear mapping, using standard page size for now
	 */
	for_each_memblock(memory, reg) {
		/*
		 * The memblock allocator is up at this point, so the
		 * page tables will be allocated within the range. No
		 * need for a node (which we don't have yet).
		 */

		if ((reg->base + reg->size) >= RADIX_VMALLOC_START) {
			pr_warn("Outside the supported range\n");
			continue;
		}

		WARN_ON(create_physical_mapping(reg->base,
						reg->base + reg->size,
						-1, PAGE_KERNEL));
	}

	/* Find out how many PID bits are supported */
	if (!cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG)) {
		if (!mmu_pid_bits)
			mmu_pid_bits = 20;
		mmu_base_pid = 1;
	} else if (cpu_has_feature(CPU_FTR_HVMODE)) {
		if (!mmu_pid_bits)
			mmu_pid_bits = 20;
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
		/*
		 * When KVM is possible, we only use the top half of the
		 * PID space to avoid collisions between host and guest PIDs
		 * which can cause problems due to prefetch when exiting the
		 * guest with AIL=3
		 */
		mmu_base_pid = 1 << (mmu_pid_bits - 1);
#else
		mmu_base_pid = 1;
#endif
	} else {
		/* The guest uses the bottom half of the PID space */
		if (!mmu_pid_bits)
			mmu_pid_bits = 19;
		mmu_base_pid = 1;
	}

	/*
	 * Allocate Partition table and process table for the
	 * host.
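	 *
	 * The process table is 1UL << PRTB_SIZE_SHIFT bytes, allocated from
	 * memblock; its first entry is pointed at the kernel page table
	 * (init_mm.pgd) below.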
	 */
	BUG_ON(PRTB_SIZE_SHIFT > 36);
	process_tb = early_alloc_pgtable(1UL << PRTB_SIZE_SHIFT, -1, 0, 0);
	/*
	 * Fill in the process table.
	 */
	rts_field = radix__get_tree_size();
	process_tb->prtb0 = cpu_to_be64(rts_field | __pa(init_mm.pgd) |
					RADIX_PGD_INDEX_SIZE);

	/*
	 * The init_mm context is given the first available (non-zero) PID,
	 * which is the "guard PID" and contains no page table. PIDR should
	 * never be set to zero because that duplicates the kernel address
	 * space at the 0x0... offset (quadrant 0)!
	 *
	 * An arbitrary PID that may later be allocated by the PID allocator
	 * for userspace processes must not be used either, because that
	 * would cause stale user mappings for that PID on CPUs outside of
	 * the TLB invalidation scheme (because it won't be in mm_cpumask).
	 *
	 * So permanently carve out one PID for the purpose of a guard PID.
	 */
	init_mm.context.id = mmu_base_pid;
	mmu_base_pid++;
}

static void __init radix_init_partition_table(void)
{
	unsigned long rts_field, dw0, dw1;

	mmu_partition_table_init();
	rts_field = radix__get_tree_size();
	dw0 = rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE | PATB_HR;
	dw1 = __pa(process_tb) | (PRTB_SIZE_SHIFT - 12) | PATB_GR;
	mmu_partition_table_set_entry(0, dw0, dw1, false);

	pr_info("Initializing Radix MMU\n");
}

static int __init get_idx_from_shift(unsigned int shift)
{
	int idx = -1;

	switch (shift) {
	case 0xc:
		idx = MMU_PAGE_4K;
		break;
	case 0x10:
		idx = MMU_PAGE_64K;
		break;
	case 0x15:
		idx = MMU_PAGE_2M;
		break;
	case 0x1e:
		idx = MMU_PAGE_1G;
		break;
	}
	return idx;
}

static int __init radix_dt_scan_page_sizes(unsigned long node,
					   const char *uname, int depth,
					   void *data)
{
	int size = 0;
	int shift, idx;
	unsigned int ap;
	const __be32 *prop;
	const char *type = of_get_flat_dt_prop(node, "device_type", NULL);

	/* We are scanning "cpu" nodes only */
	if (type == NULL || strcmp(type, "cpu") != 0)
		return 0;

	/* Find MMU PID size */
	prop = of_get_flat_dt_prop(node, "ibm,mmu-pid-bits", &size);
	if (prop && size == 4)
		mmu_pid_bits = be32_to_cpup(prop);

	/* Grab page size encodings */
	prop = of_get_flat_dt_prop(node, "ibm,processor-radix-AP-encodings", &size);
	if (!prop)
		return 0;

	pr_info("Page sizes from device-tree:\n");
	for (; size >= 4; size -= 4, ++prop) {

		struct mmu_psize_def *def;

		/* top 3 bits are the AP encoding */
		shift = be32_to_cpu(prop[0]) & ~(0xe << 28);
		ap = be32_to_cpu(prop[0]) >> 29;
		pr_info("Page size shift = %d AP=0x%x\n", shift, ap);

		idx = get_idx_from_shift(shift);
		if (idx < 0)
			continue;

		def = &mmu_psize_defs[idx];
		def->shift = shift;
		def->ap = ap;
	}

	/* needed ? */
	cur_cpu_spec->mmu_features &= ~MMU_FTR_NO_SLBIE_B;
	return 1;
}

void __init radix__early_init_devtree(void)
{
	int rc;

	/*
	 * Try to find the available page sizes in the device-tree
	 */
	rc = of_scan_flat_dt(radix_dt_scan_page_sizes, NULL);
	if (rc != 0) /* Found */
		goto found;
	/*
	 * let's assume we have page 4k and 64k support
	 */
	mmu_psize_defs[MMU_PAGE_4K].shift = 12;
	mmu_psize_defs[MMU_PAGE_4K].ap = 0x0;

	mmu_psize_defs[MMU_PAGE_64K].shift = 16;
	mmu_psize_defs[MMU_PAGE_64K].ap = 0x5;
found:
	return;
}

static void radix_init_amor(void)
{
	/*
	 * In HV mode, we init AMOR (Authority Mask Override Register) so that
	 * the hypervisor and guest can setup IAMR (Instruction Authority Mask
	 * Register), enable key 0 and set it to 1.
	 *
	 * AMOR = 0b1100 .... 0000 (Mask for key 0 is 11)
	 */
	mtspr(SPRN_AMOR, (3ul << 62));
}

#ifdef CONFIG_PPC_KUEP
void setup_kuep(bool disabled)
{
	if (disabled || !early_radix_enabled())
		return;

	if (smp_processor_id() == boot_cpuid)
		pr_info("Activating Kernel Userspace Execution Prevention\n");

	/*
	 * Radix always uses key0 of the IAMR to determine if an access is
	 * allowed. We set bit 0 (IBM bit 1) of key0, to prevent instruction
	 * fetch.
	 */
	mtspr(SPRN_IAMR, (1ul << 62));
}
#endif

#ifdef CONFIG_PPC_KUAP
void setup_kuap(bool disabled)
{
	if (disabled || !early_radix_enabled())
		return;

	if (smp_processor_id() == boot_cpuid) {
		pr_info("Activating Kernel Userspace Access Prevention\n");
		cur_cpu_spec->mmu_features |= MMU_FTR_RADIX_KUAP;
	}

	/* Make sure userspace can't change the AMR */
	mtspr(SPRN_UAMOR, 0);
	mtspr(SPRN_AMR, AMR_KUAP_BLOCKED);
	isync();
}
#endif

void __init radix__early_init_mmu(void)
{
	unsigned long lpcr;

#ifdef CONFIG_PPC_64K_PAGES
	/* PAGE_SIZE mappings */
	mmu_virtual_psize = MMU_PAGE_64K;
#else
	mmu_virtual_psize = MMU_PAGE_4K;
#endif

#ifdef CONFIG_SPARSEMEM_VMEMMAP
	/* vmemmap mapping */
	if (mmu_psize_defs[MMU_PAGE_2M].shift) {
		/*
		 * map vmemmap using 2M if available
		 */
		mmu_vmemmap_psize = MMU_PAGE_2M;
	} else
		mmu_vmemmap_psize = mmu_virtual_psize;
#endif
	/*
	 * initialize page table size
	 */
	__pte_index_size = RADIX_PTE_INDEX_SIZE;
	__pmd_index_size = RADIX_PMD_INDEX_SIZE;
	__pud_index_size = RADIX_PUD_INDEX_SIZE;
	__pgd_index_size = RADIX_PGD_INDEX_SIZE;
	__pud_cache_index = RADIX_PUD_INDEX_SIZE;
	__pte_table_size = RADIX_PTE_TABLE_SIZE;
	__pmd_table_size = RADIX_PMD_TABLE_SIZE;
	__pud_table_size = RADIX_PUD_TABLE_SIZE;
	__pgd_table_size = RADIX_PGD_TABLE_SIZE;

	__pmd_val_bits = RADIX_PMD_VAL_BITS;
	__pud_val_bits = RADIX_PUD_VAL_BITS;
	__pgd_val_bits = RADIX_PGD_VAL_BITS;

	__kernel_virt_start = RADIX_KERN_VIRT_START;
	__vmalloc_start = RADIX_VMALLOC_START;
	__vmalloc_end = RADIX_VMALLOC_END;
	__kernel_io_start = RADIX_KERN_IO_START;
	__kernel_io_end = RADIX_KERN_IO_END;
	vmemmap = (struct page *)RADIX_VMEMMAP_START;
	ioremap_bot = IOREMAP_BASE;

#ifdef CONFIG_PCI
	pci_io_base = ISA_IO_BASE;
#endif
	__pte_frag_nr = RADIX_PTE_FRAG_NR;
	__pte_frag_size_shift = RADIX_PTE_FRAG_SIZE_SHIFT;
	__pmd_frag_nr = RADIX_PMD_FRAG_NR;
	__pmd_frag_size_shift = RADIX_PMD_FRAG_SIZE_SHIFT;

	radix_init_pgtable();

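	/*
	 * Bare metal owns the partition table: switch the LPCR to radix
	 * host mode (UPRT/HR), install the partition table and set up AMOR
	 * here. Under a hypervisor the partition table is not ours to set
	 * up, so only the pseries-side setup is done.
	 */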
	if (!firmware_has_feature(FW_FEATURE_LPAR)) {
		lpcr = mfspr(SPRN_LPCR);
		mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
		radix_init_partition_table();
		radix_init_amor();
	} else {
		radix_init_pseries();
	}

	memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE);

	/* Switch to the guard PID before turning on MMU */
	radix__switch_mmu_context(NULL, &init_mm);
	tlbiel_all();
}

void radix__early_init_mmu_secondary(void)
{
	unsigned long lpcr;
	/*
	 * update partition table control register and UPRT
	 */
	if (!firmware_has_feature(FW_FEATURE_LPAR)) {
		lpcr = mfspr(SPRN_LPCR);
		mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);

		set_ptcr_when_no_uv(__pa(partition_tb) |
				    (PATB_SIZE_SHIFT - 12));

		radix_init_amor();
	}

	radix__switch_mmu_context(NULL, &init_mm);
	tlbiel_all();
}

void radix__mmu_cleanup_all(void)
{
	unsigned long lpcr;

	if (!firmware_has_feature(FW_FEATURE_LPAR)) {
		lpcr = mfspr(SPRN_LPCR);
		mtspr(SPRN_LPCR, lpcr & ~LPCR_UPRT);
		set_ptcr_when_no_uv(0);
		powernv_set_nmmu_ptcr(0);
		radix__flush_tlb_all();
	}
}

void radix__setup_initial_memory_limit(phys_addr_t first_memblock_base,
				       phys_addr_t first_memblock_size)
{
	/*
	 * We don't currently support the first MEMBLOCK not mapping 0
	 * physical on those processors
	 */
	BUG_ON(first_memblock_base != 0);

	/*
	 * Radix mode is not limited by RMA / VRMA addressing.
	 */
	ppc64_rma_size = ULONG_MAX;
}

#ifdef CONFIG_MEMORY_HOTPLUG
static void free_pte_table(pte_t *pte_start, pmd_t *pmd)
{
	pte_t *pte;
	int i;

	for (i = 0; i < PTRS_PER_PTE; i++) {
		pte = pte_start + i;
		if (!pte_none(*pte))
			return;
	}

	pte_free_kernel(&init_mm, pte_start);
	pmd_clear(pmd);
}

static void free_pmd_table(pmd_t *pmd_start, pud_t *pud)
{
	pmd_t *pmd;
	int i;

	for (i = 0; i < PTRS_PER_PMD; i++) {
		pmd = pmd_start + i;
		if (!pmd_none(*pmd))
			return;
	}

	pmd_free(&init_mm, pmd_start);
	pud_clear(pud);
}

struct change_mapping_params {
	pte_t *pte;
	unsigned long start;
	unsigned long end;
	unsigned long aligned_start;
	unsigned long aligned_end;
};

static int __meminit stop_machine_change_mapping(void *data)
{
	struct change_mapping_params *params =
			(struct change_mapping_params *)data;

	if (!data)
		return -1;

	spin_unlock(&init_mm.page_table_lock);
	pte_clear(&init_mm, params->aligned_start, params->pte);
	create_physical_mapping(__pa(params->aligned_start),
				__pa(params->start), -1, PAGE_KERNEL);
	create_physical_mapping(__pa(params->end), __pa(params->aligned_end),
				-1, PAGE_KERNEL);
	spin_lock(&init_mm.page_table_lock);
	return 0;
}

static void remove_pte_table(pte_t *pte_start, unsigned long addr,
			     unsigned long end)
{
	unsigned long next;
	pte_t *pte;

	pte = pte_start + pte_index(addr);
	for (; addr < end; addr = next, pte++) {
		next = (addr + PAGE_SIZE) & PAGE_MASK;
		if (next > end)
			next = end;

		if (!pte_present(*pte))
			continue;

		if (!PAGE_ALIGNED(addr) || !PAGE_ALIGNED(next)) {
			/*
			 * The vmemmap_free() and remove_section_mapping()
			 * codepaths call us with aligned addresses.
			 */
			WARN_ONCE(1, "%s: unaligned range\n", __func__);
			continue;
		}

		pte_clear(&init_mm, addr, pte);
	}
}

/*
 * clear the pte and potentially split the mapping helper
 */
static void __meminit split_kernel_mapping(unsigned long addr, unsigned long end,
					   unsigned long size, pte_t *pte)
{
	unsigned long mask = ~(size - 1);
	unsigned long aligned_start = addr & mask;
	unsigned long aligned_end = addr + size;
	struct change_mapping_params params;
	bool split_region = false;

	if ((end - addr) < size) {
		/*
		 * We are going to clear the PTE without having flushed
		 * the mapping yet, so we must remap and flush. If the
		 * effects become visible outside the processor, or if we
		 * are running in code close to the mapping we cleared,
		 * we are in trouble.
		 */
		if (overlaps_kernel_text(aligned_start, addr) ||
		    overlaps_kernel_text(end, aligned_end)) {
			/*
			 * Hack, just return, don't pte_clear
			 */
			WARN_ONCE(1, "Linear mapping %lx->%lx overlaps kernel "
				  "text, not splitting\n", addr, end);
			return;
		}
		split_region = true;
	}

	if (split_region) {
		params.pte = pte;
		params.start = addr;
		params.end = end;
		params.aligned_start = addr & ~(size - 1);
		params.aligned_end = min_t(unsigned long, aligned_end,
				(unsigned long)__va(memblock_end_of_DRAM()));
		stop_machine(stop_machine_change_mapping, &params, NULL);
		return;
	}

	pte_clear(&init_mm, addr, pte);
}

static void remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
			     unsigned long end)
{
	unsigned long next;
	pte_t *pte_base;
	pmd_t *pmd;

	pmd = pmd_start + pmd_index(addr);
	for (; addr < end; addr = next, pmd++) {
		next = pmd_addr_end(addr, end);

		if (!pmd_present(*pmd))
			continue;

		if (pmd_is_leaf(*pmd)) {
			split_kernel_mapping(addr, end, PMD_SIZE, (pte_t *)pmd);
			continue;
		}

		pte_base = (pte_t *)pmd_page_vaddr(*pmd);
		remove_pte_table(pte_base, addr, next);
		free_pte_table(pte_base, pmd);
	}
}

static void remove_pud_table(pud_t *pud_start, unsigned long addr,
			     unsigned long end)
{
	unsigned long next;
	pmd_t *pmd_base;
	pud_t *pud;

	pud = pud_start + pud_index(addr);
	for (; addr < end; addr = next, pud++) {
		next = pud_addr_end(addr, end);

		if (!pud_present(*pud))
			continue;

		if (pud_is_leaf(*pud)) {
			split_kernel_mapping(addr, end, PUD_SIZE, (pte_t *)pud);
			continue;
		}

		pmd_base = (pmd_t *)pud_page_vaddr(*pud);
		remove_pmd_table(pmd_base, addr, next);
		free_pmd_table(pmd_base, pud);
	}
}

static void __meminit remove_pagetable(unsigned long start, unsigned long end)
{
	unsigned long addr, next;
	pud_t *pud_base;
	pgd_t *pgd;
	p4d_t *p4d;

	spin_lock(&init_mm.page_table_lock);

	for (addr = start; addr < end; addr = next) {
		next = pgd_addr_end(addr, end);

		pgd = pgd_offset_k(addr);
		p4d = p4d_offset(pgd, addr);
		if (!p4d_present(*p4d))
			continue;

		if (p4d_is_leaf(*p4d)) {
			split_kernel_mapping(addr, end, P4D_SIZE, (pte_t *)p4d);
			continue;
		}

		pud_base = (pud_t *)p4d_page_vaddr(*p4d);
		remove_pud_table(pud_base, addr, next);
	}

	spin_unlock(&init_mm.page_table_lock);
	radix__flush_tlb_kernel_range(start, end);
}

int __meminit radix__create_section_mapping(unsigned long start,
					    unsigned long end, int nid,
					    pgprot_t prot)
{
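	/* Hot-plugged memory must stay below the start of the vmalloc region. */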
	if (end >= RADIX_VMALLOC_START) {
		pr_warn("Outside the supported range\n");
		return -1;
	}

	return create_physical_mapping(__pa(start), __pa(end), nid, prot);
}

int __meminit radix__remove_section_mapping(unsigned long start, unsigned long end)
{
	remove_pagetable(start, end);
	return 0;
}
#endif /* CONFIG_MEMORY_HOTPLUG */

#ifdef CONFIG_SPARSEMEM_VMEMMAP
static int __map_kernel_page_nid(unsigned long ea, unsigned long pa,
				 pgprot_t flags, unsigned int map_page_size,
				 int nid)
{
	return __map_kernel_page(ea, pa, flags, map_page_size, nid, 0, 0);
}

int __meminit radix__vmemmap_create_mapping(unsigned long start,
					    unsigned long page_size,
					    unsigned long phys)
{
	/* Create a PTE encoding */
	unsigned long flags = _PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_KERNEL_RW;
	int nid = early_pfn_to_nid(phys >> PAGE_SHIFT);
	int ret;

	if ((start + page_size) >= RADIX_VMEMMAP_END) {
		pr_warn("Outside the supported range\n");
		return -1;
	}

	ret = __map_kernel_page_nid(start, phys, __pgprot(flags), page_size, nid);
	BUG_ON(ret);

	return 0;
}

#ifdef CONFIG_MEMORY_HOTPLUG
void __meminit radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size)
{
	remove_pagetable(start, start + page_size);
}
#endif
#endif

#ifdef CONFIG_TRANSPARENT_HUGEPAGE

unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
					 pmd_t *pmdp, unsigned long clr,
					 unsigned long set)
{
	unsigned long old;

#ifdef CONFIG_DEBUG_VM
	WARN_ON(!radix__pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
	assert_spin_locked(pmd_lockptr(mm, pmdp));
#endif

	old = radix__pte_update(mm, addr, (pte_t *)pmdp, clr, set, 1);
	trace_hugepage_update(addr, old, clr, set);

	return old;
}

pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
				 pmd_t *pmdp)
{
	pmd_t pmd;

	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	VM_BUG_ON(radix__pmd_trans_huge(*pmdp));
	VM_BUG_ON(pmd_devmap(*pmdp));
	/*
	 * khugepaged calls this for normal pmd
	 */
	pmd = *pmdp;
	pmd_clear(pmdp);

	/*
	 * pmdp_collapse_flush needs to ensure that there are no parallel
	 * gup walks after this call. This is needed so that we can have a
	 * stable page ref count when collapsing a page. We don't allow
	 * collapsing a page if gup has taken a reference on it. We can
	 * ensure that by sending an IPI, because gup walks happen with
	 * IRQs disabled.
	 */
	serialize_against_pte_lookup(vma->vm_mm);

	radix__flush_tlb_collapsed_pmd(vma->vm_mm, address);

	return pmd;
}

/*
 * For us pgtable_t is pte_t *. In order to save the deposited
 * page table, we consider the allocated page table as a list
 * head. On withdraw we need to make sure we zero out the used
 * list_head memory area.
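 * (struct list_head is two pointers, which overlay the first two pte_t
 * slots of the deposited table; those are the slots cleared on withdraw.)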
 */
void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
				       pgtable_t pgtable)
{
	struct list_head *lh = (struct list_head *) pgtable;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	if (!pmd_huge_pte(mm, pmdp))
		INIT_LIST_HEAD(lh);
	else
		list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
	pmd_huge_pte(mm, pmdp) = pgtable;
}

pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
	pte_t *ptep;
	pgtable_t pgtable;
	struct list_head *lh;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	pgtable = pmd_huge_pte(mm, pmdp);
	lh = (struct list_head *) pgtable;
	if (list_empty(lh))
		pmd_huge_pte(mm, pmdp) = NULL;
	else {
		pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
		list_del(lh);
	}
	ptep = (pte_t *) pgtable;
	*ptep = __pte(0);
	ptep++;
	*ptep = __pte(0);
	return pgtable;
}

pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm,
				     unsigned long addr, pmd_t *pmdp)
{
	pmd_t old_pmd;
	unsigned long old;

	old = radix__pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
	old_pmd = __pmd(old);
	return old_pmd;
}

#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep,
				  pte_t entry, unsigned long address, int psize)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_ACCESSED |
					      _PAGE_RW | _PAGE_EXEC);

	unsigned long change = pte_val(entry) ^ pte_val(*ptep);
	/*
	 * To avoid NMMU hang while relaxing access, we need to mark
	 * the pte invalid in between.
	 */
	if ((change & _PAGE_RW) && atomic_read(&mm->context.copros) > 0) {
		unsigned long old_pte, new_pte;

		old_pte = __radix_pte_update(ptep, _PAGE_PRESENT, _PAGE_INVALID);
		/*
		 * new value of pte
		 */
		new_pte = old_pte | set;
		radix__flush_tlb_page_psize(mm, address, psize);
		__radix_pte_update(ptep, _PAGE_INVALID, new_pte);
	} else {
		__radix_pte_update(ptep, 0, set);
		/*
		 * Book3S does not require a TLB flush when relaxing access
		 * restrictions when the address space is not attached to a
		 * NMMU, because the core MMU will reload the pte after taking
		 * an access fault, which is defined by the architecture.
		 */
	}
	/* See ptesync comment in radix__set_pte_at */
}

void radix__ptep_modify_prot_commit(struct vm_area_struct *vma,
				    unsigned long addr, pte_t *ptep,
				    pte_t old_pte, pte_t pte)
{
	struct mm_struct *mm = vma->vm_mm;

	/*
	 * To avoid NMMU hang while relaxing access we need to flush the TLB
	 * before we set the new value. We need to do this only for radix,
	 * because hash translation does flush when updating the linux pte.
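	 *
	 * The flush is only needed when a coprocessor context is attached
	 * to the mm, which is what the mm->context.copros check below tests.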
	 */
	if (is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) &&
	    (atomic_read(&mm->context.copros) > 0))
		radix__flush_tlb_page(vma, addr);

	set_pte_at(mm, addr, ptep, pte);
}

int __init arch_ioremap_pud_supported(void)
{
	/* HPT does not cope with large pages in the vmalloc area */
	return radix_enabled();
}

int __init arch_ioremap_pmd_supported(void)
{
	return radix_enabled();
}

int p4d_free_pud_page(p4d_t *p4d, unsigned long addr)
{
	return 0;
}

int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
{
	pte_t *ptep = (pte_t *)pud;
	pte_t new_pud = pfn_pte(__phys_to_pfn(addr), prot);

	if (!radix_enabled())
		return 0;

	set_pte_at(&init_mm, 0 /* radix unused */, ptep, new_pud);

	return 1;
}

int pud_clear_huge(pud_t *pud)
{
	if (pud_huge(*pud)) {
		pud_clear(pud);
		return 1;
	}

	return 0;
}

int pud_free_pmd_page(pud_t *pud, unsigned long addr)
{
	pmd_t *pmd;
	int i;

	pmd = (pmd_t *)pud_page_vaddr(*pud);
	pud_clear(pud);

	flush_tlb_kernel_range(addr, addr + PUD_SIZE);

	for (i = 0; i < PTRS_PER_PMD; i++) {
		if (!pmd_none(pmd[i])) {
			pte_t *pte;
			pte = (pte_t *)pmd_page_vaddr(pmd[i]);

			pte_free_kernel(&init_mm, pte);
		}
	}

	pmd_free(&init_mm, pmd);

	return 1;
}

int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
{
	pte_t *ptep = (pte_t *)pmd;
	pte_t new_pmd = pfn_pte(__phys_to_pfn(addr), prot);

	if (!radix_enabled())
		return 0;

	set_pte_at(&init_mm, 0 /* radix unused */, ptep, new_pmd);

	return 1;
}

int pmd_clear_huge(pmd_t *pmd)
{
	if (pmd_huge(*pmd)) {
		pmd_clear(pmd);
		return 1;
	}

	return 0;
}

int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
{
	pte_t *pte;

	pte = (pte_t *)pmd_page_vaddr(*pmd);
	pmd_clear(pmd);

	flush_tlb_kernel_range(addr, addr + PMD_SIZE);

	pte_free_kernel(&init_mm, pte);

	return 1;
}

int __init arch_ioremap_p4d_supported(void)
{
	return 0;
}