// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Page table handling routines for radix page table.
 *
 * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
 */

#define pr_fmt(fmt) "radix-mmu: " fmt

#include <linux/io.h>
#include <linux/kernel.h>
#include <linux/sched/mm.h>
#include <linux/memblock.h>
#include <linux/of_fdt.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/string_helpers.h>
#include <linux/stop_machine.h>

#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/mmu_context.h>
#include <asm/dma.h>
#include <asm/machdep.h>
#include <asm/mmu.h>
#include <asm/firmware.h>
#include <asm/powernv.h>
#include <asm/sections.h>
#include <asm/smp.h>
#include <asm/trace.h>
#include <asm/uaccess.h>
#include <asm/ultravisor.h>

#include <trace/events/thp.h>

unsigned int mmu_pid_bits;
unsigned int mmu_base_pid;

static __ref void *early_alloc_pgtable(unsigned long size, int nid,
			unsigned long region_start, unsigned long region_end)
{
	phys_addr_t min_addr = MEMBLOCK_LOW_LIMIT;
	phys_addr_t max_addr = MEMBLOCK_ALLOC_ANYWHERE;
	void *ptr;

	if (region_start)
		min_addr = region_start;
	if (region_end)
		max_addr = region_end;

	ptr = memblock_alloc_try_nid(size, size, min_addr, max_addr, nid);

	if (!ptr)
		panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa max_addr=%pa\n",
		      __func__, size, size, nid, &min_addr, &max_addr);

	return ptr;
}
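
/*
 * Map one kernel page before the slab allocator is available.  Intermediate
 * page-table levels are allocated from memblock via early_alloc_pgtable();
 * for PUD_SIZE or PMD_SIZE requests the leaf entry is installed at that
 * level instead of descending to a PTE page.
 */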
static int early_map_kernel_page(unsigned long ea, unsigned long pa,
			  pgprot_t flags,
			  unsigned int map_page_size,
			  int nid,
			  unsigned long region_start, unsigned long region_end)
{
	unsigned long pfn = pa >> PAGE_SHIFT;
	pgd_t *pgdp;
	p4d_t *p4dp;
	pud_t *pudp;
	pmd_t *pmdp;
	pte_t *ptep;

	pgdp = pgd_offset_k(ea);
	p4dp = p4d_offset(pgdp, ea);
	if (p4d_none(*p4dp)) {
		pudp = early_alloc_pgtable(PUD_TABLE_SIZE, nid,
						region_start, region_end);
		p4d_populate(&init_mm, p4dp, pudp);
	}
	pudp = pud_offset(p4dp, ea);
	if (map_page_size == PUD_SIZE) {
		ptep = (pte_t *)pudp;
		goto set_the_pte;
	}
	if (pud_none(*pudp)) {
		pmdp = early_alloc_pgtable(PMD_TABLE_SIZE, nid,
						region_start, region_end);
		pud_populate(&init_mm, pudp, pmdp);
	}
	pmdp = pmd_offset(pudp, ea);
	if (map_page_size == PMD_SIZE) {
		ptep = pmdp_ptep(pmdp);
		goto set_the_pte;
	}
	if (!pmd_present(*pmdp)) {
		ptep = early_alloc_pgtable(PAGE_SIZE, nid,
						region_start, region_end);
		pmd_populate_kernel(&init_mm, pmdp, ptep);
	}
	ptep = pte_offset_kernel(pmdp, ea);

set_the_pte:
	set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
	smp_wmb();
	return 0;
}

/*
 * nid, region_start, and region_end are hints to try to place the page
 * table memory in the same node or region.
 */
static int __map_kernel_page(unsigned long ea, unsigned long pa,
			  pgprot_t flags,
			  unsigned int map_page_size,
			  int nid,
			  unsigned long region_start, unsigned long region_end)
{
	unsigned long pfn = pa >> PAGE_SHIFT;
	pgd_t *pgdp;
	p4d_t *p4dp;
	pud_t *pudp;
	pmd_t *pmdp;
	pte_t *ptep;
	/*
	 * Make sure task size is correct as per the max addr
	 */
	BUILD_BUG_ON(TASK_SIZE_USER64 > RADIX_PGTABLE_RANGE);

#ifdef CONFIG_PPC_64K_PAGES
	BUILD_BUG_ON(RADIX_KERN_MAP_SIZE != (1UL << MAX_EA_BITS_PER_CONTEXT));
#endif

	if (unlikely(!slab_is_available()))
		return early_map_kernel_page(ea, pa, flags, map_page_size,
						nid, region_start, region_end);

	/*
	 * Should make page table allocation functions be able to take a
	 * node, so we can place kernel page tables on the right nodes after
	 * boot.
	 */
	pgdp = pgd_offset_k(ea);
	p4dp = p4d_offset(pgdp, ea);
	pudp = pud_alloc(&init_mm, p4dp, ea);
	if (!pudp)
		return -ENOMEM;
	if (map_page_size == PUD_SIZE) {
		ptep = (pte_t *)pudp;
		goto set_the_pte;
	}
	pmdp = pmd_alloc(&init_mm, pudp, ea);
	if (!pmdp)
		return -ENOMEM;
	if (map_page_size == PMD_SIZE) {
		ptep = pmdp_ptep(pmdp);
		goto set_the_pte;
	}
	ptep = pte_alloc_kernel(pmdp, ea);
	if (!ptep)
		return -ENOMEM;

set_the_pte:
	set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
	smp_wmb();
	return 0;
}

int radix__map_kernel_page(unsigned long ea, unsigned long pa,
			  pgprot_t flags,
			  unsigned int map_page_size)
{
	return __map_kernel_page(ea, pa, flags, map_page_size, -1, 0, 0);
}

#ifdef CONFIG_STRICT_KERNEL_RWX
void radix__change_memory_range(unsigned long start, unsigned long end,
				unsigned long clear)
{
	unsigned long idx;
	pgd_t *pgdp;
	p4d_t *p4dp;
	pud_t *pudp;
	pmd_t *pmdp;
	pte_t *ptep;

	start = ALIGN_DOWN(start, PAGE_SIZE);
	end = PAGE_ALIGN(end); // aligns up

	pr_debug("Changing flags on range %lx-%lx removing 0x%lx\n",
		 start, end, clear);

	for (idx = start; idx < end; idx += PAGE_SIZE) {
		pgdp = pgd_offset_k(idx);
		p4dp = p4d_offset(pgdp, idx);
		pudp = pud_alloc(&init_mm, p4dp, idx);
		if (!pudp)
			continue;
		if (pud_is_leaf(*pudp)) {
			ptep = (pte_t *)pudp;
			goto update_the_pte;
		}
		pmdp = pmd_alloc(&init_mm, pudp, idx);
		if (!pmdp)
			continue;
		if (pmd_is_leaf(*pmdp)) {
			ptep = pmdp_ptep(pmdp);
			goto update_the_pte;
		}
		ptep = pte_alloc_kernel(pmdp, idx);
		if (!ptep)
			continue;
update_the_pte:
		radix__pte_update(&init_mm, idx, ptep, clear, 0, 0);
	}

	radix__flush_tlb_kernel_range(start, end);
}

void radix__mark_rodata_ro(void)
{
	unsigned long start, end;

	start = (unsigned long)_stext;
	end = (unsigned long)__init_begin;

	radix__change_memory_range(start, end, _PAGE_WRITE);
}

void radix__mark_initmem_nx(void)
{
	unsigned long start = (unsigned long)__init_begin;
	unsigned long end = (unsigned long)__init_end;

	radix__change_memory_range(start, end, _PAGE_EXEC);
}
#endif /* CONFIG_STRICT_KERNEL_RWX */

static inline void __meminit
print_mapping(unsigned long start, unsigned long end, unsigned long size, bool exec)
{
	char buf[10];

	if (end <= start)
		return;

	string_get_size(size, 1, STRING_UNITS_2, buf, sizeof(buf));

	pr_info("Mapped 0x%016lx-0x%016lx with %s pages%s\n", start, end, buf,
		exec ? " (exec)" : "");
}
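
/*
 * With STRICT_KERNEL_RWX the linear map must not cross __init_begin with a
 * single large page, so that kernel text/rodata can later be marked
 * read-only by radix__mark_rodata_ro(); next_boundary() caps a mapping at
 * that address.
 */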
static unsigned long next_boundary(unsigned long addr, unsigned long end)
{
#ifdef CONFIG_STRICT_KERNEL_RWX
	if (addr < __pa_symbol(__init_begin))
		return __pa_symbol(__init_begin);
#endif
	return end;
}

static int __meminit create_physical_mapping(unsigned long start,
					     unsigned long end,
					     int nid, pgprot_t _prot)
{
	unsigned long vaddr, addr, mapping_size = 0;
	bool prev_exec, exec = false;
	pgprot_t prot;
	int psize;

	start = _ALIGN_UP(start, PAGE_SIZE);
	for (addr = start; addr < end; addr += mapping_size) {
		unsigned long gap, previous_size;
		int rc;

		gap = next_boundary(addr, end) - addr;
		previous_size = mapping_size;
		prev_exec = exec;

		if (IS_ALIGNED(addr, PUD_SIZE) && gap >= PUD_SIZE &&
		    mmu_psize_defs[MMU_PAGE_1G].shift) {
			mapping_size = PUD_SIZE;
			psize = MMU_PAGE_1G;
		} else if (IS_ALIGNED(addr, PMD_SIZE) && gap >= PMD_SIZE &&
			   mmu_psize_defs[MMU_PAGE_2M].shift) {
			mapping_size = PMD_SIZE;
			psize = MMU_PAGE_2M;
		} else {
			mapping_size = PAGE_SIZE;
			psize = mmu_virtual_psize;
		}

		vaddr = (unsigned long)__va(addr);

		if (overlaps_kernel_text(vaddr, vaddr + mapping_size) ||
		    overlaps_interrupt_vector_text(vaddr, vaddr + mapping_size)) {
			prot = PAGE_KERNEL_X;
			exec = true;
		} else {
			prot = _prot;
			exec = false;
		}

		if (mapping_size != previous_size || exec != prev_exec) {
			print_mapping(start, addr, previous_size, prev_exec);
			start = addr;
		}

		rc = __map_kernel_page(vaddr, addr, prot, mapping_size, nid, start, end);
		if (rc)
			return rc;

		update_page_count(psize, 1);
	}

	print_mapping(start, addr, mapping_size, exec);
	return 0;
}

static void __init radix_init_pgtable(void)
{
	unsigned long rts_field;
	struct memblock_region *reg;

	/* We don't support slb for radix */
	mmu_slb_size = 0;
	/*
	 * Create the linear mapping, using standard page size for now
	 */
	for_each_memblock(memory, reg) {
		/*
		 * The memblock allocator is up at this point, so the
		 * page tables will be allocated within the range. No
		 * need for a node (which we don't have yet).
		 */

		if ((reg->base + reg->size) >= RADIX_VMALLOC_START) {
			pr_warn("Outside the supported range\n");
			continue;
		}

		WARN_ON(create_physical_mapping(reg->base,
						reg->base + reg->size,
						-1, PAGE_KERNEL));
	}

	/* Find out how many PID bits are supported */
	if (!cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG)) {
		if (!mmu_pid_bits)
			mmu_pid_bits = 20;
		mmu_base_pid = 1;
	} else if (cpu_has_feature(CPU_FTR_HVMODE)) {
		if (!mmu_pid_bits)
			mmu_pid_bits = 20;
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
		/*
		 * When KVM is possible, we only use the top half of the
		 * PID space to avoid collisions between host and guest PIDs
		 * which can cause problems due to prefetch when exiting the
		 * guest with AIL=3
		 */
		mmu_base_pid = 1 << (mmu_pid_bits - 1);
#else
		mmu_base_pid = 1;
#endif
	} else {
		/* The guest uses the bottom half of the PID space */
		if (!mmu_pid_bits)
			mmu_pid_bits = 19;
		mmu_base_pid = 1;
	}

	/*
	 * Allocate Partition table and process table for the
	 * host.
	 */
	BUG_ON(PRTB_SIZE_SHIFT > 36);
	process_tb = early_alloc_pgtable(1UL << PRTB_SIZE_SHIFT, -1, 0, 0);
	/*
	 * Fill in the process table.
	 */
	rts_field = radix__get_tree_size();
	process_tb->prtb0 = cpu_to_be64(rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE);

	/*
	 * The init_mm context is given the first available (non-zero) PID,
	 * which is the "guard PID" and contains no page table. PIDR should
	 * never be set to zero because that duplicates the kernel address
	 * space at the 0x0... offset (quadrant 0)!
	 *
	 * An arbitrary PID that may later be allocated by the PID allocator
	 * for userspace processes must not be used either, because that
	 * would cause stale user mappings for that PID on CPUs outside of
	 * the TLB invalidation scheme (because it won't be in mm_cpumask).
	 *
	 * So permanently carve out one PID for the purpose of a guard PID.
	 */
	init_mm.context.id = mmu_base_pid;
	mmu_base_pid++;
}
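
/*
 * Bare-metal boot path: install partition-table entry 0 pointing at the
 * host radix PGD and the process table allocated in radix_init_pgtable().
 */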
static void __init radix_init_partition_table(void)
{
	unsigned long rts_field, dw0, dw1;

	mmu_partition_table_init();
	rts_field = radix__get_tree_size();
	dw0 = rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE | PATB_HR;
	dw1 = __pa(process_tb) | (PRTB_SIZE_SHIFT - 12) | PATB_GR;
	mmu_partition_table_set_entry(0, dw0, dw1, false);

	pr_info("Initializing Radix MMU\n");
}

static int __init get_idx_from_shift(unsigned int shift)
{
	int idx = -1;

	switch (shift) {
	case 0xc:
		idx = MMU_PAGE_4K;
		break;
	case 0x10:
		idx = MMU_PAGE_64K;
		break;
	case 0x15:
		idx = MMU_PAGE_2M;
		break;
	case 0x1e:
		idx = MMU_PAGE_1G;
		break;
	}
	return idx;
}

static int __init radix_dt_scan_page_sizes(unsigned long node,
					   const char *uname, int depth,
					   void *data)
{
	int size = 0;
	int shift, idx;
	unsigned int ap;
	const __be32 *prop;
	const char *type = of_get_flat_dt_prop(node, "device_type", NULL);

	/* We are scanning "cpu" nodes only */
	if (type == NULL || strcmp(type, "cpu") != 0)
		return 0;

	/* Find MMU PID size */
	prop = of_get_flat_dt_prop(node, "ibm,mmu-pid-bits", &size);
	if (prop && size == 4)
		mmu_pid_bits = be32_to_cpup(prop);

	/* Grab page size encodings */
	prop = of_get_flat_dt_prop(node, "ibm,processor-radix-AP-encodings", &size);
	if (!prop)
		return 0;

	pr_info("Page sizes from device-tree:\n");
	for (; size >= 4; size -= 4, ++prop) {

		struct mmu_psize_def *def;

		/* top 3 bits are the AP encoding */
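		/*
		 * Example: a cell of 0xa0000010 decodes to ap = 0x5 and
		 * shift = 0x10, matching the 64K defaults assumed in
		 * radix__early_init_devtree() below.
		 */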
		shift = be32_to_cpu(prop[0]) & ~(0xe << 28);
		ap = be32_to_cpu(prop[0]) >> 29;
		pr_info("Page size shift = %d AP=0x%x\n", shift, ap);

		idx = get_idx_from_shift(shift);
		if (idx < 0)
			continue;

		def = &mmu_psize_defs[idx];
		def->shift = shift;
		def->ap = ap;
	}

	/* needed ? */
	cur_cpu_spec->mmu_features &= ~MMU_FTR_NO_SLBIE_B;
	return 1;
}

void __init radix__early_init_devtree(void)
{
	int rc;

	/*
	 * Try to find the available page sizes in the device-tree
	 */
	rc = of_scan_flat_dt(radix_dt_scan_page_sizes, NULL);
	if (rc != 0) /* Found */
		goto found;
	/*
	 * let's assume we have page 4k and 64k support
	 */
	mmu_psize_defs[MMU_PAGE_4K].shift = 12;
	mmu_psize_defs[MMU_PAGE_4K].ap = 0x0;

	mmu_psize_defs[MMU_PAGE_64K].shift = 16;
	mmu_psize_defs[MMU_PAGE_64K].ap = 0x5;
found:
	return;
}

static void radix_init_amor(void)
{
	/*
	 * In HV mode, we init AMOR (Authority Mask Override Register) so that
	 * the hypervisor and guest can setup IAMR (Instruction Authority Mask
	 * Register), enable key 0 and set it to 1.
	 *
	 * AMOR = 0b1100 .... 0000 (Mask for key 0 is 11)
	 */
	mtspr(SPRN_AMOR, (3ul << 62));
}

#ifdef CONFIG_PPC_KUEP
void setup_kuep(bool disabled)
{
	if (disabled || !early_radix_enabled())
		return;

	if (smp_processor_id() == boot_cpuid)
		pr_info("Activating Kernel Userspace Execution Prevention\n");

	/*
	 * Radix always uses key0 of the IAMR to determine if an access is
	 * allowed. We set bit 0 (IBM bit 1) of key0, to prevent instruction
	 * fetch.
	 */
	mtspr(SPRN_IAMR, (1ul << 62));
}
#endif

#ifdef CONFIG_PPC_KUAP
void setup_kuap(bool disabled)
{
	if (disabled || !early_radix_enabled())
		return;

	if (smp_processor_id() == boot_cpuid) {
		pr_info("Activating Kernel Userspace Access Prevention\n");
		cur_cpu_spec->mmu_features |= MMU_FTR_RADIX_KUAP;
	}

	/* Make sure userspace can't change the AMR */
	mtspr(SPRN_UAMOR, 0);
	mtspr(SPRN_AMR, AMR_KUAP_BLOCKED);
	isync();
}
#endif

void __init radix__early_init_mmu(void)
{
	unsigned long lpcr;

#ifdef CONFIG_PPC_64K_PAGES
	/* PAGE_SIZE mappings */
	mmu_virtual_psize = MMU_PAGE_64K;
#else
	mmu_virtual_psize = MMU_PAGE_4K;
#endif

#ifdef CONFIG_SPARSEMEM_VMEMMAP
	/* vmemmap mapping */
	if (mmu_psize_defs[MMU_PAGE_2M].shift) {
		/*
		 * map vmemmap using 2M if available
		 */
		mmu_vmemmap_psize = MMU_PAGE_2M;
	} else
		mmu_vmemmap_psize = mmu_virtual_psize;
#endif
	/*
	 * initialize page table size
	 */
	__pte_index_size = RADIX_PTE_INDEX_SIZE;
	__pmd_index_size = RADIX_PMD_INDEX_SIZE;
	__pud_index_size = RADIX_PUD_INDEX_SIZE;
	__pgd_index_size = RADIX_PGD_INDEX_SIZE;
	__pud_cache_index = RADIX_PUD_INDEX_SIZE;
	__pte_table_size = RADIX_PTE_TABLE_SIZE;
	__pmd_table_size = RADIX_PMD_TABLE_SIZE;
	__pud_table_size = RADIX_PUD_TABLE_SIZE;
	__pgd_table_size = RADIX_PGD_TABLE_SIZE;

	__pmd_val_bits = RADIX_PMD_VAL_BITS;
	__pud_val_bits = RADIX_PUD_VAL_BITS;
	__pgd_val_bits = RADIX_PGD_VAL_BITS;

	__kernel_virt_start = RADIX_KERN_VIRT_START;
	__vmalloc_start = RADIX_VMALLOC_START;
	__vmalloc_end = RADIX_VMALLOC_END;
	__kernel_io_start = RADIX_KERN_IO_START;
	__kernel_io_end = RADIX_KERN_IO_END;
	vmemmap = (struct page *)RADIX_VMEMMAP_START;
	ioremap_bot = IOREMAP_BASE;

#ifdef CONFIG_PCI
	pci_io_base = ISA_IO_BASE;
#endif
	__pte_frag_nr = RADIX_PTE_FRAG_NR;
	__pte_frag_size_shift = RADIX_PTE_FRAG_SIZE_SHIFT;
	__pmd_frag_nr = RADIX_PMD_FRAG_NR;
	__pmd_frag_size_shift = RADIX_PMD_FRAG_SIZE_SHIFT;

	radix_init_pgtable();

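	/*
	 * On bare metal (no LPAR firmware feature) the kernel owns the
	 * partition table: enable host radix mode in the LPCR and set the
	 * table up here.  Under a hypervisor, radix_init_pseries() performs
	 * the guest-side setup instead.
	 */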
	if (!firmware_has_feature(FW_FEATURE_LPAR)) {
		lpcr = mfspr(SPRN_LPCR);
		mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
		radix_init_partition_table();
		radix_init_amor();
	} else {
		radix_init_pseries();
	}

	memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE);

	/* Switch to the guard PID before turning on MMU */
	radix__switch_mmu_context(NULL, &init_mm);
	tlbiel_all();
}

void radix__early_init_mmu_secondary(void)
{
	unsigned long lpcr;
	/*
	 * update partition table control register and UPRT
	 */
	if (!firmware_has_feature(FW_FEATURE_LPAR)) {
		lpcr = mfspr(SPRN_LPCR);
		mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);

		set_ptcr_when_no_uv(__pa(partition_tb) |
				    (PATB_SIZE_SHIFT - 12));

		radix_init_amor();
	}

	radix__switch_mmu_context(NULL, &init_mm);
	tlbiel_all();
}

void radix__mmu_cleanup_all(void)
{
	unsigned long lpcr;

	if (!firmware_has_feature(FW_FEATURE_LPAR)) {
		lpcr = mfspr(SPRN_LPCR);
		mtspr(SPRN_LPCR, lpcr & ~LPCR_UPRT);
		set_ptcr_when_no_uv(0);
		powernv_set_nmmu_ptcr(0);
		radix__flush_tlb_all();
	}
}

void radix__setup_initial_memory_limit(phys_addr_t first_memblock_base,
				phys_addr_t first_memblock_size)
{
	/*
	 * We don't currently support the first MEMBLOCK not mapping 0
	 * physical on those processors
	 */
	BUG_ON(first_memblock_base != 0);

	/*
	 * Radix mode is not limited by RMA / VRMA addressing.
	 */
	ppc64_rma_size = ULONG_MAX;
}

#ifdef CONFIG_MEMORY_HOTPLUG
static void free_pte_table(pte_t *pte_start, pmd_t *pmd)
{
	pte_t *pte;
	int i;

	for (i = 0; i < PTRS_PER_PTE; i++) {
		pte = pte_start + i;
		if (!pte_none(*pte))
			return;
	}

	pte_free_kernel(&init_mm, pte_start);
	pmd_clear(pmd);
}

static void free_pmd_table(pmd_t *pmd_start, pud_t *pud)
{
	pmd_t *pmd;
	int i;

	for (i = 0; i < PTRS_PER_PMD; i++) {
		pmd = pmd_start + i;
		if (!pmd_none(*pmd))
			return;
	}

	pmd_free(&init_mm, pmd_start);
	pud_clear(pud);
}
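
/*
 * When only part of a huge linear mapping must go away, the split is done
 * under stop_machine(): the leaf entry is cleared and the head and tail of
 * the aligned region are remapped with smaller pages while other CPUs are
 * held off.
 */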
struct change_mapping_params {
	pte_t *pte;
	unsigned long start;
	unsigned long end;
	unsigned long aligned_start;
	unsigned long aligned_end;
};

static int __meminit stop_machine_change_mapping(void *data)
{
	struct change_mapping_params *params =
			(struct change_mapping_params *)data;

	if (!data)
		return -1;

	spin_unlock(&init_mm.page_table_lock);
	pte_clear(&init_mm, params->aligned_start, params->pte);
	create_physical_mapping(__pa(params->aligned_start),
				__pa(params->start), -1, PAGE_KERNEL);
	create_physical_mapping(__pa(params->end), __pa(params->aligned_end),
				-1, PAGE_KERNEL);
	spin_lock(&init_mm.page_table_lock);
	return 0;
}

static void remove_pte_table(pte_t *pte_start, unsigned long addr,
			     unsigned long end)
{
	unsigned long next;
	pte_t *pte;

	pte = pte_start + pte_index(addr);
	for (; addr < end; addr = next, pte++) {
		next = (addr + PAGE_SIZE) & PAGE_MASK;
		if (next > end)
			next = end;

		if (!pte_present(*pte))
			continue;

		if (!PAGE_ALIGNED(addr) || !PAGE_ALIGNED(next)) {
			/*
			 * The vmemmap_free() and remove_section_mapping()
			 * codepaths call us with aligned addresses.
			 */
			WARN_ONCE(1, "%s: unaligned range\n", __func__);
			continue;
		}

		pte_clear(&init_mm, addr, pte);
	}
}

/*
 * clear the pte and potentially split the mapping helper
 */
static void __meminit split_kernel_mapping(unsigned long addr, unsigned long end,
					   unsigned long size, pte_t *pte)
{
	unsigned long mask = ~(size - 1);
	unsigned long aligned_start = addr & mask;
	unsigned long aligned_end = addr + size;
	struct change_mapping_params params;
	bool split_region = false;

	if ((end - addr) < size) {
		/*
		 * We are going to clear the PTE without having flushed the
		 * mapping, so we must remap and flush. If the effects are
		 * visible outside the processor, or if we are running in
		 * code close to the mapping we cleared, we are in trouble.
		 */
		if (overlaps_kernel_text(aligned_start, addr) ||
			overlaps_kernel_text(end, aligned_end)) {
			/*
			 * Hack, just return, don't pte_clear
			 */
			WARN_ONCE(1, "Linear mapping %lx->%lx overlaps kernel "
				  "text, not splitting\n", addr, end);
			return;
		}
		split_region = true;
	}

	if (split_region) {
		params.pte = pte;
		params.start = addr;
		params.end = end;
		params.aligned_start = addr & ~(size - 1);
		params.aligned_end = min_t(unsigned long, aligned_end,
				(unsigned long)__va(memblock_end_of_DRAM()));
		stop_machine(stop_machine_change_mapping, &params, NULL);
		return;
	}

	pte_clear(&init_mm, addr, pte);
}

static void remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
			     unsigned long end)
{
	unsigned long next;
	pte_t *pte_base;
	pmd_t *pmd;

	pmd = pmd_start + pmd_index(addr);
	for (; addr < end; addr = next, pmd++) {
		next = pmd_addr_end(addr, end);

		if (!pmd_present(*pmd))
			continue;

		if (pmd_is_leaf(*pmd)) {
			split_kernel_mapping(addr, end, PMD_SIZE, (pte_t *)pmd);
			continue;
		}

		pte_base = (pte_t *)pmd_page_vaddr(*pmd);
		remove_pte_table(pte_base, addr, next);
		free_pte_table(pte_base, pmd);
	}
}

static void remove_pud_table(pud_t *pud_start, unsigned long addr,
			     unsigned long end)
{
	unsigned long next;
	pmd_t *pmd_base;
	pud_t *pud;

	pud = pud_start + pud_index(addr);
	for (; addr < end; addr = next, pud++) {
		next = pud_addr_end(addr, end);

		if (!pud_present(*pud))
			continue;

		if (pud_is_leaf(*pud)) {
			split_kernel_mapping(addr, end, PUD_SIZE, (pte_t *)pud);
			continue;
		}

		pmd_base = (pmd_t *)pud_page_vaddr(*pud);
		remove_pmd_table(pmd_base, addr, next);
		free_pmd_table(pmd_base, pud);
	}
}

static void __meminit remove_pagetable(unsigned long start, unsigned long end)
{
	unsigned long addr, next;
	pud_t *pud_base;
	pgd_t *pgd;
	p4d_t *p4d;

	spin_lock(&init_mm.page_table_lock);

	for (addr = start; addr < end; addr = next) {
		next = pgd_addr_end(addr, end);

		pgd = pgd_offset_k(addr);
		p4d = p4d_offset(pgd, addr);
		if (!p4d_present(*p4d))
			continue;

		if (p4d_is_leaf(*p4d)) {
			split_kernel_mapping(addr, end, P4D_SIZE, (pte_t *)p4d);
			continue;
		}

		pud_base = (pud_t *)p4d_page_vaddr(*p4d);
		remove_pud_table(pud_base, addr, next);
	}

	spin_unlock(&init_mm.page_table_lock);
	radix__flush_tlb_kernel_range(start, end);
}

int __meminit radix__create_section_mapping(unsigned long start,
					    unsigned long end, int nid,
					    pgprot_t prot)
{
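	/*
	 * Hot-plugged memory has to fit below RADIX_VMALLOC_START in the
	 * linear map; refuse anything that would spill past it.
	 */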
	if (end >= RADIX_VMALLOC_START) {
		pr_warn("Outside the supported range\n");
		return -1;
	}

	return create_physical_mapping(__pa(start), __pa(end), nid, prot);
}

int __meminit radix__remove_section_mapping(unsigned long start, unsigned long end)
{
	remove_pagetable(start, end);
	return 0;
}
#endif /* CONFIG_MEMORY_HOTPLUG */

#ifdef CONFIG_SPARSEMEM_VMEMMAP
static int __map_kernel_page_nid(unsigned long ea, unsigned long pa,
				 pgprot_t flags, unsigned int map_page_size,
				 int nid)
{
	return __map_kernel_page(ea, pa, flags, map_page_size, nid, 0, 0);
}

int __meminit radix__vmemmap_create_mapping(unsigned long start,
				      unsigned long page_size,
				      unsigned long phys)
{
	/* Create a PTE encoding */
	unsigned long flags = _PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_KERNEL_RW;
	int nid = early_pfn_to_nid(phys >> PAGE_SHIFT);
	int ret;

	if ((start + page_size) >= RADIX_VMEMMAP_END) {
		pr_warn("Outside the supported range\n");
		return -1;
	}

	ret = __map_kernel_page_nid(start, phys, __pgprot(flags), page_size, nid);
	BUG_ON(ret);

	return 0;
}

#ifdef CONFIG_MEMORY_HOTPLUG
void __meminit radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size)
{
	remove_pagetable(start, start + page_size);
}
#endif
#endif

#ifdef CONFIG_TRANSPARENT_HUGEPAGE

unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
				  pmd_t *pmdp, unsigned long clr,
				  unsigned long set)
{
	unsigned long old;

#ifdef CONFIG_DEBUG_VM
	WARN_ON(!radix__pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
	assert_spin_locked(pmd_lockptr(mm, pmdp));
#endif

	old = radix__pte_update(mm, addr, (pte_t *)pmdp, clr, set, 1);
	trace_hugepage_update(addr, old, clr, set);

	return old;
}

pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
			pmd_t *pmdp)
{
	pmd_t pmd;

	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	VM_BUG_ON(radix__pmd_trans_huge(*pmdp));
	VM_BUG_ON(pmd_devmap(*pmdp));
	/*
	 * khugepaged calls this for normal pmd
	 */
	pmd = *pmdp;
	pmd_clear(pmdp);

	/*FIXME!! Verify whether we need this kick below */
	serialize_against_pte_lookup(vma->vm_mm);

	radix__flush_tlb_collapsed_pmd(vma->vm_mm, address);

	return pmd;
}

/*
 * For us pgtable_t is pte_t *. In order to save the deposited
 * page table, we consider the allocated page table as a list
 * head. On withdraw we need to make sure we zero out the used
 * list_head memory area.
 */
void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
				 pgtable_t pgtable)
{
	struct list_head *lh = (struct list_head *) pgtable;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	if (!pmd_huge_pte(mm, pmdp))
		INIT_LIST_HEAD(lh);
	else
		list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
	pmd_huge_pte(mm, pmdp) = pgtable;
}
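
/*
 * Withdraw in the same FIFO order and scrub the two pte-sized slots that
 * were used as the list_head while the table was deposited.
 */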
pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
	pte_t *ptep;
	pgtable_t pgtable;
	struct list_head *lh;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	pgtable = pmd_huge_pte(mm, pmdp);
	lh = (struct list_head *) pgtable;
	if (list_empty(lh))
		pmd_huge_pte(mm, pmdp) = NULL;
	else {
		pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
		list_del(lh);
	}
	ptep = (pte_t *) pgtable;
	*ptep = __pte(0);
	ptep++;
	*ptep = __pte(0);
	return pgtable;
}

pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm,
				     unsigned long addr, pmd_t *pmdp)
{
	pmd_t old_pmd;
	unsigned long old;

	old = radix__pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
	old_pmd = __pmd(old);
	/*
	 * Serialize against find_current_mm_pte which does lock-less
	 * lookup in page tables with local interrupts disabled. For huge pages
	 * it casts pmd_t to pte_t. Since format of pte_t is different from
	 * pmd_t we want to prevent transit from pmd pointing to page table
	 * to pmd pointing to huge page (and back) while interrupts are disabled.
	 * We clear pmd to possibly replace it with page table pointer in
	 * different code paths. So make sure we wait for the parallel
	 * find_current_mm_pte to finish.
	 */
	serialize_against_pte_lookup(mm);
	return old_pmd;
}

#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep,
				  pte_t entry, unsigned long address, int psize)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_ACCESSED |
					      _PAGE_RW | _PAGE_EXEC);

	unsigned long change = pte_val(entry) ^ pte_val(*ptep);
	/*
	 * To avoid NMMU hang while relaxing access, we need to mark
	 * the pte invalid in between.
	 */
	if ((change & _PAGE_RW) && atomic_read(&mm->context.copros) > 0) {
		unsigned long old_pte, new_pte;

		old_pte = __radix_pte_update(ptep, _PAGE_PRESENT, _PAGE_INVALID);
		/*
		 * new value of pte
		 */
		new_pte = old_pte | set;
		radix__flush_tlb_page_psize(mm, address, psize);
		__radix_pte_update(ptep, _PAGE_INVALID, new_pte);
	} else {
		__radix_pte_update(ptep, 0, set);
		/*
		 * Book3S does not require a TLB flush when relaxing access
		 * restrictions when the address space is not attached to a
		 * NMMU, because the core MMU will reload the pte after taking
		 * an access fault, which is defined by the architecture.
		 */
	}
	/* See ptesync comment in radix__set_pte_at */
}

void radix__ptep_modify_prot_commit(struct vm_area_struct *vma,
				    unsigned long addr, pte_t *ptep,
				    pte_t old_pte, pte_t pte)
{
	struct mm_struct *mm = vma->vm_mm;

	/*
	 * To avoid NMMU hang while relaxing access we need to flush the tlb before
	 * we set the new value. We need to do this only for radix, because hash
	 * translation does flush when updating the linux pte.
	 */
	if (is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) &&
	    (atomic_read(&mm->context.copros) > 0))
		radix__flush_tlb_page(vma, addr);

	set_pte_at(mm, addr, ptep, pte);
}
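
/*
 * Huge-page ioremap/vmap support: radix can install leaf entries at the PUD
 * and PMD levels, and the *_free_*_page() helpers below tear such mappings
 * back down to regular page tables.
 */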
int __init arch_ioremap_pud_supported(void)
{
	/* HPT does not cope with large pages in the vmalloc area */
	return radix_enabled();
}

int __init arch_ioremap_pmd_supported(void)
{
	return radix_enabled();
}

int p4d_free_pud_page(p4d_t *p4d, unsigned long addr)
{
	return 0;
}

int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
{
	pte_t *ptep = (pte_t *)pud;
	pte_t new_pud = pfn_pte(__phys_to_pfn(addr), prot);

	if (!radix_enabled())
		return 0;

	set_pte_at(&init_mm, 0 /* radix unused */, ptep, new_pud);

	return 1;
}

int pud_clear_huge(pud_t *pud)
{
	if (pud_huge(*pud)) {
		pud_clear(pud);
		return 1;
	}

	return 0;
}

int pud_free_pmd_page(pud_t *pud, unsigned long addr)
{
	pmd_t *pmd;
	int i;

	pmd = (pmd_t *)pud_page_vaddr(*pud);
	pud_clear(pud);

	flush_tlb_kernel_range(addr, addr + PUD_SIZE);

	for (i = 0; i < PTRS_PER_PMD; i++) {
		if (!pmd_none(pmd[i])) {
			pte_t *pte;
			pte = (pte_t *)pmd_page_vaddr(pmd[i]);

			pte_free_kernel(&init_mm, pte);
		}
	}

	pmd_free(&init_mm, pmd);

	return 1;
}

int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
{
	pte_t *ptep = (pte_t *)pmd;
	pte_t new_pmd = pfn_pte(__phys_to_pfn(addr), prot);

	if (!radix_enabled())
		return 0;

	set_pte_at(&init_mm, 0 /* radix unused */, ptep, new_pmd);

	return 1;
}

int pmd_clear_huge(pmd_t *pmd)
{
	if (pmd_huge(*pmd)) {
		pmd_clear(pmd);
		return 1;
	}

	return 0;
}

int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
{
	pte_t *pte;

	pte = (pte_t *)pmd_page_vaddr(*pmd);
	pmd_clear(pmd);

	flush_tlb_kernel_range(addr, addr + PMD_SIZE);

	pte_free_kernel(&init_mm, pte);

	return 1;
}

int __init arch_ioremap_p4d_supported(void)
{
	return 0;
}