1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * Page table handling routines for radix page table. 4 * 5 * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation. 6 */ 7 8 #define pr_fmt(fmt) "radix-mmu: " fmt 9 10 #include <linux/io.h> 11 #include <linux/kernel.h> 12 #include <linux/sched/mm.h> 13 #include <linux/memblock.h> 14 #include <linux/of_fdt.h> 15 #include <linux/mm.h> 16 #include <linux/hugetlb.h> 17 #include <linux/string_helpers.h> 18 #include <linux/stop_machine.h> 19 20 #include <asm/pgtable.h> 21 #include <asm/pgalloc.h> 22 #include <asm/mmu_context.h> 23 #include <asm/dma.h> 24 #include <asm/machdep.h> 25 #include <asm/mmu.h> 26 #include <asm/firmware.h> 27 #include <asm/powernv.h> 28 #include <asm/sections.h> 29 #include <asm/trace.h> 30 #include <asm/uaccess.h> 31 #include <asm/ultravisor.h> 32 33 #include <trace/events/thp.h> 34 35 unsigned int mmu_pid_bits; 36 unsigned int mmu_base_pid; 37 38 static __ref void *early_alloc_pgtable(unsigned long size, int nid, 39 unsigned long region_start, unsigned long region_end) 40 { 41 phys_addr_t min_addr = MEMBLOCK_LOW_LIMIT; 42 phys_addr_t max_addr = MEMBLOCK_ALLOC_ANYWHERE; 43 void *ptr; 44 45 if (region_start) 46 min_addr = region_start; 47 if (region_end) 48 max_addr = region_end; 49 50 ptr = memblock_alloc_try_nid(size, size, min_addr, max_addr, nid); 51 52 if (!ptr) 53 panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa max_addr=%pa\n", 54 __func__, size, size, nid, &min_addr, &max_addr); 55 56 return ptr; 57 } 58 59 static int early_map_kernel_page(unsigned long ea, unsigned long pa, 60 pgprot_t flags, 61 unsigned int map_page_size, 62 int nid, 63 unsigned long region_start, unsigned long region_end) 64 { 65 unsigned long pfn = pa >> PAGE_SHIFT; 66 pgd_t *pgdp; 67 pud_t *pudp; 68 pmd_t *pmdp; 69 pte_t *ptep; 70 71 pgdp = pgd_offset_k(ea); 72 if (pgd_none(*pgdp)) { 73 pudp = early_alloc_pgtable(PUD_TABLE_SIZE, nid, 74 region_start, region_end); 75 pgd_populate(&init_mm, pgdp, pudp); 76 } 77 pudp = pud_offset(pgdp, ea); 78 if (map_page_size == PUD_SIZE) { 79 ptep = (pte_t *)pudp; 80 goto set_the_pte; 81 } 82 if (pud_none(*pudp)) { 83 pmdp = early_alloc_pgtable(PMD_TABLE_SIZE, nid, 84 region_start, region_end); 85 pud_populate(&init_mm, pudp, pmdp); 86 } 87 pmdp = pmd_offset(pudp, ea); 88 if (map_page_size == PMD_SIZE) { 89 ptep = pmdp_ptep(pmdp); 90 goto set_the_pte; 91 } 92 if (!pmd_present(*pmdp)) { 93 ptep = early_alloc_pgtable(PAGE_SIZE, nid, 94 region_start, region_end); 95 pmd_populate_kernel(&init_mm, pmdp, ptep); 96 } 97 ptep = pte_offset_kernel(pmdp, ea); 98 99 set_the_pte: 100 set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags)); 101 smp_wmb(); 102 return 0; 103 } 104 105 /* 106 * nid, region_start, and region_end are hints to try to place the page 107 * table memory in the same node or region. 108 */ 109 static int __map_kernel_page(unsigned long ea, unsigned long pa, 110 pgprot_t flags, 111 unsigned int map_page_size, 112 int nid, 113 unsigned long region_start, unsigned long region_end) 114 { 115 unsigned long pfn = pa >> PAGE_SHIFT; 116 pgd_t *pgdp; 117 pud_t *pudp; 118 pmd_t *pmdp; 119 pte_t *ptep; 120 /* 121 * Make sure task size is correct as per the max adddr 122 */ 123 BUILD_BUG_ON(TASK_SIZE_USER64 > RADIX_PGTABLE_RANGE); 124 125 #ifdef CONFIG_PPC_64K_PAGES 126 BUILD_BUG_ON(RADIX_KERN_MAP_SIZE != (1UL << MAX_EA_BITS_PER_CONTEXT)); 127 #endif 128 129 if (unlikely(!slab_is_available())) 130 return early_map_kernel_page(ea, pa, flags, map_page_size, 131 nid, region_start, region_end); 132 133 /* 134 * Should make page table allocation functions be able to take a 135 * node, so we can place kernel page tables on the right nodes after 136 * boot. 137 */ 138 pgdp = pgd_offset_k(ea); 139 pudp = pud_alloc(&init_mm, pgdp, ea); 140 if (!pudp) 141 return -ENOMEM; 142 if (map_page_size == PUD_SIZE) { 143 ptep = (pte_t *)pudp; 144 goto set_the_pte; 145 } 146 pmdp = pmd_alloc(&init_mm, pudp, ea); 147 if (!pmdp) 148 return -ENOMEM; 149 if (map_page_size == PMD_SIZE) { 150 ptep = pmdp_ptep(pmdp); 151 goto set_the_pte; 152 } 153 ptep = pte_alloc_kernel(pmdp, ea); 154 if (!ptep) 155 return -ENOMEM; 156 157 set_the_pte: 158 set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags)); 159 smp_wmb(); 160 return 0; 161 } 162 163 int radix__map_kernel_page(unsigned long ea, unsigned long pa, 164 pgprot_t flags, 165 unsigned int map_page_size) 166 { 167 return __map_kernel_page(ea, pa, flags, map_page_size, -1, 0, 0); 168 } 169 170 #ifdef CONFIG_STRICT_KERNEL_RWX 171 void radix__change_memory_range(unsigned long start, unsigned long end, 172 unsigned long clear) 173 { 174 unsigned long idx; 175 pgd_t *pgdp; 176 pud_t *pudp; 177 pmd_t *pmdp; 178 pte_t *ptep; 179 180 start = ALIGN_DOWN(start, PAGE_SIZE); 181 end = PAGE_ALIGN(end); // aligns up 182 183 pr_debug("Changing flags on range %lx-%lx removing 0x%lx\n", 184 start, end, clear); 185 186 for (idx = start; idx < end; idx += PAGE_SIZE) { 187 pgdp = pgd_offset_k(idx); 188 pudp = pud_alloc(&init_mm, pgdp, idx); 189 if (!pudp) 190 continue; 191 if (pud_is_leaf(*pudp)) { 192 ptep = (pte_t *)pudp; 193 goto update_the_pte; 194 } 195 pmdp = pmd_alloc(&init_mm, pudp, idx); 196 if (!pmdp) 197 continue; 198 if (pmd_is_leaf(*pmdp)) { 199 ptep = pmdp_ptep(pmdp); 200 goto update_the_pte; 201 } 202 ptep = pte_alloc_kernel(pmdp, idx); 203 if (!ptep) 204 continue; 205 update_the_pte: 206 radix__pte_update(&init_mm, idx, ptep, clear, 0, 0); 207 } 208 209 radix__flush_tlb_kernel_range(start, end); 210 } 211 212 void radix__mark_rodata_ro(void) 213 { 214 unsigned long start, end; 215 216 start = (unsigned long)_stext; 217 end = (unsigned long)__init_begin; 218 219 radix__change_memory_range(start, end, _PAGE_WRITE); 220 } 221 222 void radix__mark_initmem_nx(void) 223 { 224 unsigned long start = (unsigned long)__init_begin; 225 unsigned long end = (unsigned long)__init_end; 226 227 radix__change_memory_range(start, end, _PAGE_EXEC); 228 } 229 #endif /* CONFIG_STRICT_KERNEL_RWX */ 230 231 static inline void __meminit 232 print_mapping(unsigned long start, unsigned long end, unsigned long size, bool exec) 233 { 234 char buf[10]; 235 236 if (end <= start) 237 return; 238 239 string_get_size(size, 1, STRING_UNITS_2, buf, sizeof(buf)); 240 241 pr_info("Mapped 0x%016lx-0x%016lx with %s pages%s\n", start, end, buf, 242 exec ? " (exec)" : ""); 243 } 244 245 static unsigned long next_boundary(unsigned long addr, unsigned long end) 246 { 247 #ifdef CONFIG_STRICT_KERNEL_RWX 248 if (addr < __pa_symbol(__init_begin)) 249 return __pa_symbol(__init_begin); 250 #endif 251 return end; 252 } 253 254 static int __meminit create_physical_mapping(unsigned long start, 255 unsigned long end, 256 int nid) 257 { 258 unsigned long vaddr, addr, mapping_size = 0; 259 bool prev_exec, exec = false; 260 pgprot_t prot; 261 int psize; 262 263 start = _ALIGN_UP(start, PAGE_SIZE); 264 for (addr = start; addr < end; addr += mapping_size) { 265 unsigned long gap, previous_size; 266 int rc; 267 268 gap = next_boundary(addr, end) - addr; 269 previous_size = mapping_size; 270 prev_exec = exec; 271 272 if (IS_ALIGNED(addr, PUD_SIZE) && gap >= PUD_SIZE && 273 mmu_psize_defs[MMU_PAGE_1G].shift) { 274 mapping_size = PUD_SIZE; 275 psize = MMU_PAGE_1G; 276 } else if (IS_ALIGNED(addr, PMD_SIZE) && gap >= PMD_SIZE && 277 mmu_psize_defs[MMU_PAGE_2M].shift) { 278 mapping_size = PMD_SIZE; 279 psize = MMU_PAGE_2M; 280 } else { 281 mapping_size = PAGE_SIZE; 282 psize = mmu_virtual_psize; 283 } 284 285 vaddr = (unsigned long)__va(addr); 286 287 if (overlaps_kernel_text(vaddr, vaddr + mapping_size) || 288 overlaps_interrupt_vector_text(vaddr, vaddr + mapping_size)) { 289 prot = PAGE_KERNEL_X; 290 exec = true; 291 } else { 292 prot = PAGE_KERNEL; 293 exec = false; 294 } 295 296 if (mapping_size != previous_size || exec != prev_exec) { 297 print_mapping(start, addr, previous_size, prev_exec); 298 start = addr; 299 } 300 301 rc = __map_kernel_page(vaddr, addr, prot, mapping_size, nid, start, end); 302 if (rc) 303 return rc; 304 305 update_page_count(psize, 1); 306 } 307 308 print_mapping(start, addr, mapping_size, exec); 309 return 0; 310 } 311 312 static void __init radix_init_pgtable(void) 313 { 314 unsigned long rts_field; 315 struct memblock_region *reg; 316 317 /* We don't support slb for radix */ 318 mmu_slb_size = 0; 319 /* 320 * Create the linear mapping, using standard page size for now 321 */ 322 for_each_memblock(memory, reg) { 323 /* 324 * The memblock allocator is up at this point, so the 325 * page tables will be allocated within the range. No 326 * need or a node (which we don't have yet). 327 */ 328 329 if ((reg->base + reg->size) >= RADIX_VMALLOC_START) { 330 pr_warn("Outside the supported range\n"); 331 continue; 332 } 333 334 WARN_ON(create_physical_mapping(reg->base, 335 reg->base + reg->size, 336 -1)); 337 } 338 339 /* Find out how many PID bits are supported */ 340 if (cpu_has_feature(CPU_FTR_HVMODE)) { 341 if (!mmu_pid_bits) 342 mmu_pid_bits = 20; 343 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE 344 /* 345 * When KVM is possible, we only use the top half of the 346 * PID space to avoid collisions between host and guest PIDs 347 * which can cause problems due to prefetch when exiting the 348 * guest with AIL=3 349 */ 350 mmu_base_pid = 1 << (mmu_pid_bits - 1); 351 #else 352 mmu_base_pid = 1; 353 #endif 354 } else { 355 /* The guest uses the bottom half of the PID space */ 356 if (!mmu_pid_bits) 357 mmu_pid_bits = 19; 358 mmu_base_pid = 1; 359 } 360 361 /* 362 * Allocate Partition table and process table for the 363 * host. 364 */ 365 BUG_ON(PRTB_SIZE_SHIFT > 36); 366 process_tb = early_alloc_pgtable(1UL << PRTB_SIZE_SHIFT, -1, 0, 0); 367 /* 368 * Fill in the process table. 369 */ 370 rts_field = radix__get_tree_size(); 371 process_tb->prtb0 = cpu_to_be64(rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE); 372 373 /* 374 * The init_mm context is given the first available (non-zero) PID, 375 * which is the "guard PID" and contains no page table. PIDR should 376 * never be set to zero because that duplicates the kernel address 377 * space at the 0x0... offset (quadrant 0)! 378 * 379 * An arbitrary PID that may later be allocated by the PID allocator 380 * for userspace processes must not be used either, because that 381 * would cause stale user mappings for that PID on CPUs outside of 382 * the TLB invalidation scheme (because it won't be in mm_cpumask). 383 * 384 * So permanently carve out one PID for the purpose of a guard PID. 385 */ 386 init_mm.context.id = mmu_base_pid; 387 mmu_base_pid++; 388 } 389 390 static void __init radix_init_partition_table(void) 391 { 392 unsigned long rts_field, dw0, dw1; 393 394 mmu_partition_table_init(); 395 rts_field = radix__get_tree_size(); 396 dw0 = rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE | PATB_HR; 397 dw1 = __pa(process_tb) | (PRTB_SIZE_SHIFT - 12) | PATB_GR; 398 mmu_partition_table_set_entry(0, dw0, dw1, false); 399 400 pr_info("Initializing Radix MMU\n"); 401 } 402 403 static int __init get_idx_from_shift(unsigned int shift) 404 { 405 int idx = -1; 406 407 switch (shift) { 408 case 0xc: 409 idx = MMU_PAGE_4K; 410 break; 411 case 0x10: 412 idx = MMU_PAGE_64K; 413 break; 414 case 0x15: 415 idx = MMU_PAGE_2M; 416 break; 417 case 0x1e: 418 idx = MMU_PAGE_1G; 419 break; 420 } 421 return idx; 422 } 423 424 static int __init radix_dt_scan_page_sizes(unsigned long node, 425 const char *uname, int depth, 426 void *data) 427 { 428 int size = 0; 429 int shift, idx; 430 unsigned int ap; 431 const __be32 *prop; 432 const char *type = of_get_flat_dt_prop(node, "device_type", NULL); 433 434 /* We are scanning "cpu" nodes only */ 435 if (type == NULL || strcmp(type, "cpu") != 0) 436 return 0; 437 438 /* Find MMU PID size */ 439 prop = of_get_flat_dt_prop(node, "ibm,mmu-pid-bits", &size); 440 if (prop && size == 4) 441 mmu_pid_bits = be32_to_cpup(prop); 442 443 /* Grab page size encodings */ 444 prop = of_get_flat_dt_prop(node, "ibm,processor-radix-AP-encodings", &size); 445 if (!prop) 446 return 0; 447 448 pr_info("Page sizes from device-tree:\n"); 449 for (; size >= 4; size -= 4, ++prop) { 450 451 struct mmu_psize_def *def; 452 453 /* top 3 bit is AP encoding */ 454 shift = be32_to_cpu(prop[0]) & ~(0xe << 28); 455 ap = be32_to_cpu(prop[0]) >> 29; 456 pr_info("Page size shift = %d AP=0x%x\n", shift, ap); 457 458 idx = get_idx_from_shift(shift); 459 if (idx < 0) 460 continue; 461 462 def = &mmu_psize_defs[idx]; 463 def->shift = shift; 464 def->ap = ap; 465 } 466 467 /* needed ? */ 468 cur_cpu_spec->mmu_features &= ~MMU_FTR_NO_SLBIE_B; 469 return 1; 470 } 471 472 void __init radix__early_init_devtree(void) 473 { 474 int rc; 475 476 /* 477 * Try to find the available page sizes in the device-tree 478 */ 479 rc = of_scan_flat_dt(radix_dt_scan_page_sizes, NULL); 480 if (rc != 0) /* Found */ 481 goto found; 482 /* 483 * let's assume we have page 4k and 64k support 484 */ 485 mmu_psize_defs[MMU_PAGE_4K].shift = 12; 486 mmu_psize_defs[MMU_PAGE_4K].ap = 0x0; 487 488 mmu_psize_defs[MMU_PAGE_64K].shift = 16; 489 mmu_psize_defs[MMU_PAGE_64K].ap = 0x5; 490 found: 491 return; 492 } 493 494 static void radix_init_amor(void) 495 { 496 /* 497 * In HV mode, we init AMOR (Authority Mask Override Register) so that 498 * the hypervisor and guest can setup IAMR (Instruction Authority Mask 499 * Register), enable key 0 and set it to 1. 500 * 501 * AMOR = 0b1100 .... 0000 (Mask for key 0 is 11) 502 */ 503 mtspr(SPRN_AMOR, (3ul << 62)); 504 } 505 506 #ifdef CONFIG_PPC_KUEP 507 void setup_kuep(bool disabled) 508 { 509 if (disabled || !early_radix_enabled()) 510 return; 511 512 if (smp_processor_id() == boot_cpuid) 513 pr_info("Activating Kernel Userspace Execution Prevention\n"); 514 515 /* 516 * Radix always uses key0 of the IAMR to determine if an access is 517 * allowed. We set bit 0 (IBM bit 1) of key0, to prevent instruction 518 * fetch. 519 */ 520 mtspr(SPRN_IAMR, (1ul << 62)); 521 } 522 #endif 523 524 #ifdef CONFIG_PPC_KUAP 525 void setup_kuap(bool disabled) 526 { 527 if (disabled || !early_radix_enabled()) 528 return; 529 530 if (smp_processor_id() == boot_cpuid) { 531 pr_info("Activating Kernel Userspace Access Prevention\n"); 532 cur_cpu_spec->mmu_features |= MMU_FTR_RADIX_KUAP; 533 } 534 535 /* Make sure userspace can't change the AMR */ 536 mtspr(SPRN_UAMOR, 0); 537 mtspr(SPRN_AMR, AMR_KUAP_BLOCKED); 538 isync(); 539 } 540 #endif 541 542 void __init radix__early_init_mmu(void) 543 { 544 unsigned long lpcr; 545 546 #ifdef CONFIG_PPC_64K_PAGES 547 /* PAGE_SIZE mappings */ 548 mmu_virtual_psize = MMU_PAGE_64K; 549 #else 550 mmu_virtual_psize = MMU_PAGE_4K; 551 #endif 552 553 #ifdef CONFIG_SPARSEMEM_VMEMMAP 554 /* vmemmap mapping */ 555 if (mmu_psize_defs[MMU_PAGE_2M].shift) { 556 /* 557 * map vmemmap using 2M if available 558 */ 559 mmu_vmemmap_psize = MMU_PAGE_2M; 560 } else 561 mmu_vmemmap_psize = mmu_virtual_psize; 562 #endif 563 /* 564 * initialize page table size 565 */ 566 __pte_index_size = RADIX_PTE_INDEX_SIZE; 567 __pmd_index_size = RADIX_PMD_INDEX_SIZE; 568 __pud_index_size = RADIX_PUD_INDEX_SIZE; 569 __pgd_index_size = RADIX_PGD_INDEX_SIZE; 570 __pud_cache_index = RADIX_PUD_INDEX_SIZE; 571 __pte_table_size = RADIX_PTE_TABLE_SIZE; 572 __pmd_table_size = RADIX_PMD_TABLE_SIZE; 573 __pud_table_size = RADIX_PUD_TABLE_SIZE; 574 __pgd_table_size = RADIX_PGD_TABLE_SIZE; 575 576 __pmd_val_bits = RADIX_PMD_VAL_BITS; 577 __pud_val_bits = RADIX_PUD_VAL_BITS; 578 __pgd_val_bits = RADIX_PGD_VAL_BITS; 579 580 __kernel_virt_start = RADIX_KERN_VIRT_START; 581 __vmalloc_start = RADIX_VMALLOC_START; 582 __vmalloc_end = RADIX_VMALLOC_END; 583 __kernel_io_start = RADIX_KERN_IO_START; 584 __kernel_io_end = RADIX_KERN_IO_END; 585 vmemmap = (struct page *)RADIX_VMEMMAP_START; 586 ioremap_bot = IOREMAP_BASE; 587 588 #ifdef CONFIG_PCI 589 pci_io_base = ISA_IO_BASE; 590 #endif 591 __pte_frag_nr = RADIX_PTE_FRAG_NR; 592 __pte_frag_size_shift = RADIX_PTE_FRAG_SIZE_SHIFT; 593 __pmd_frag_nr = RADIX_PMD_FRAG_NR; 594 __pmd_frag_size_shift = RADIX_PMD_FRAG_SIZE_SHIFT; 595 596 radix_init_pgtable(); 597 598 if (!firmware_has_feature(FW_FEATURE_LPAR)) { 599 lpcr = mfspr(SPRN_LPCR); 600 mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR); 601 radix_init_partition_table(); 602 radix_init_amor(); 603 } else { 604 radix_init_pseries(); 605 } 606 607 memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE); 608 609 /* Switch to the guard PID before turning on MMU */ 610 radix__switch_mmu_context(NULL, &init_mm); 611 tlbiel_all(); 612 } 613 614 void radix__early_init_mmu_secondary(void) 615 { 616 unsigned long lpcr; 617 /* 618 * update partition table control register and UPRT 619 */ 620 if (!firmware_has_feature(FW_FEATURE_LPAR)) { 621 lpcr = mfspr(SPRN_LPCR); 622 mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR); 623 624 set_ptcr_when_no_uv(__pa(partition_tb) | 625 (PATB_SIZE_SHIFT - 12)); 626 627 radix_init_amor(); 628 } 629 630 radix__switch_mmu_context(NULL, &init_mm); 631 tlbiel_all(); 632 } 633 634 void radix__mmu_cleanup_all(void) 635 { 636 unsigned long lpcr; 637 638 if (!firmware_has_feature(FW_FEATURE_LPAR)) { 639 lpcr = mfspr(SPRN_LPCR); 640 mtspr(SPRN_LPCR, lpcr & ~LPCR_UPRT); 641 set_ptcr_when_no_uv(0); 642 powernv_set_nmmu_ptcr(0); 643 radix__flush_tlb_all(); 644 } 645 } 646 647 void radix__setup_initial_memory_limit(phys_addr_t first_memblock_base, 648 phys_addr_t first_memblock_size) 649 { 650 /* 651 * We don't currently support the first MEMBLOCK not mapping 0 652 * physical on those processors 653 */ 654 BUG_ON(first_memblock_base != 0); 655 656 /* 657 * Radix mode is not limited by RMA / VRMA addressing. 658 */ 659 ppc64_rma_size = ULONG_MAX; 660 } 661 662 #ifdef CONFIG_MEMORY_HOTPLUG 663 static void free_pte_table(pte_t *pte_start, pmd_t *pmd) 664 { 665 pte_t *pte; 666 int i; 667 668 for (i = 0; i < PTRS_PER_PTE; i++) { 669 pte = pte_start + i; 670 if (!pte_none(*pte)) 671 return; 672 } 673 674 pte_free_kernel(&init_mm, pte_start); 675 pmd_clear(pmd); 676 } 677 678 static void free_pmd_table(pmd_t *pmd_start, pud_t *pud) 679 { 680 pmd_t *pmd; 681 int i; 682 683 for (i = 0; i < PTRS_PER_PMD; i++) { 684 pmd = pmd_start + i; 685 if (!pmd_none(*pmd)) 686 return; 687 } 688 689 pmd_free(&init_mm, pmd_start); 690 pud_clear(pud); 691 } 692 693 struct change_mapping_params { 694 pte_t *pte; 695 unsigned long start; 696 unsigned long end; 697 unsigned long aligned_start; 698 unsigned long aligned_end; 699 }; 700 701 static int __meminit stop_machine_change_mapping(void *data) 702 { 703 struct change_mapping_params *params = 704 (struct change_mapping_params *)data; 705 706 if (!data) 707 return -1; 708 709 spin_unlock(&init_mm.page_table_lock); 710 pte_clear(&init_mm, params->aligned_start, params->pte); 711 create_physical_mapping(__pa(params->aligned_start), __pa(params->start), -1); 712 create_physical_mapping(__pa(params->end), __pa(params->aligned_end), -1); 713 spin_lock(&init_mm.page_table_lock); 714 return 0; 715 } 716 717 static void remove_pte_table(pte_t *pte_start, unsigned long addr, 718 unsigned long end) 719 { 720 unsigned long next; 721 pte_t *pte; 722 723 pte = pte_start + pte_index(addr); 724 for (; addr < end; addr = next, pte++) { 725 next = (addr + PAGE_SIZE) & PAGE_MASK; 726 if (next > end) 727 next = end; 728 729 if (!pte_present(*pte)) 730 continue; 731 732 if (!PAGE_ALIGNED(addr) || !PAGE_ALIGNED(next)) { 733 /* 734 * The vmemmap_free() and remove_section_mapping() 735 * codepaths call us with aligned addresses. 736 */ 737 WARN_ONCE(1, "%s: unaligned range\n", __func__); 738 continue; 739 } 740 741 pte_clear(&init_mm, addr, pte); 742 } 743 } 744 745 /* 746 * clear the pte and potentially split the mapping helper 747 */ 748 static void __meminit split_kernel_mapping(unsigned long addr, unsigned long end, 749 unsigned long size, pte_t *pte) 750 { 751 unsigned long mask = ~(size - 1); 752 unsigned long aligned_start = addr & mask; 753 unsigned long aligned_end = addr + size; 754 struct change_mapping_params params; 755 bool split_region = false; 756 757 if ((end - addr) < size) { 758 /* 759 * We're going to clear the PTE, but not flushed 760 * the mapping, time to remap and flush. The 761 * effects if visible outside the processor or 762 * if we are running in code close to the 763 * mapping we cleared, we are in trouble. 764 */ 765 if (overlaps_kernel_text(aligned_start, addr) || 766 overlaps_kernel_text(end, aligned_end)) { 767 /* 768 * Hack, just return, don't pte_clear 769 */ 770 WARN_ONCE(1, "Linear mapping %lx->%lx overlaps kernel " 771 "text, not splitting\n", addr, end); 772 return; 773 } 774 split_region = true; 775 } 776 777 if (split_region) { 778 params.pte = pte; 779 params.start = addr; 780 params.end = end; 781 params.aligned_start = addr & ~(size - 1); 782 params.aligned_end = min_t(unsigned long, aligned_end, 783 (unsigned long)__va(memblock_end_of_DRAM())); 784 stop_machine(stop_machine_change_mapping, ¶ms, NULL); 785 return; 786 } 787 788 pte_clear(&init_mm, addr, pte); 789 } 790 791 static void remove_pmd_table(pmd_t *pmd_start, unsigned long addr, 792 unsigned long end) 793 { 794 unsigned long next; 795 pte_t *pte_base; 796 pmd_t *pmd; 797 798 pmd = pmd_start + pmd_index(addr); 799 for (; addr < end; addr = next, pmd++) { 800 next = pmd_addr_end(addr, end); 801 802 if (!pmd_present(*pmd)) 803 continue; 804 805 if (pmd_is_leaf(*pmd)) { 806 split_kernel_mapping(addr, end, PMD_SIZE, (pte_t *)pmd); 807 continue; 808 } 809 810 pte_base = (pte_t *)pmd_page_vaddr(*pmd); 811 remove_pte_table(pte_base, addr, next); 812 free_pte_table(pte_base, pmd); 813 } 814 } 815 816 static void remove_pud_table(pud_t *pud_start, unsigned long addr, 817 unsigned long end) 818 { 819 unsigned long next; 820 pmd_t *pmd_base; 821 pud_t *pud; 822 823 pud = pud_start + pud_index(addr); 824 for (; addr < end; addr = next, pud++) { 825 next = pud_addr_end(addr, end); 826 827 if (!pud_present(*pud)) 828 continue; 829 830 if (pud_is_leaf(*pud)) { 831 split_kernel_mapping(addr, end, PUD_SIZE, (pte_t *)pud); 832 continue; 833 } 834 835 pmd_base = (pmd_t *)pud_page_vaddr(*pud); 836 remove_pmd_table(pmd_base, addr, next); 837 free_pmd_table(pmd_base, pud); 838 } 839 } 840 841 static void __meminit remove_pagetable(unsigned long start, unsigned long end) 842 { 843 unsigned long addr, next; 844 pud_t *pud_base; 845 pgd_t *pgd; 846 847 spin_lock(&init_mm.page_table_lock); 848 849 for (addr = start; addr < end; addr = next) { 850 next = pgd_addr_end(addr, end); 851 852 pgd = pgd_offset_k(addr); 853 if (!pgd_present(*pgd)) 854 continue; 855 856 if (pgd_is_leaf(*pgd)) { 857 split_kernel_mapping(addr, end, PGDIR_SIZE, (pte_t *)pgd); 858 continue; 859 } 860 861 pud_base = (pud_t *)pgd_page_vaddr(*pgd); 862 remove_pud_table(pud_base, addr, next); 863 } 864 865 spin_unlock(&init_mm.page_table_lock); 866 radix__flush_tlb_kernel_range(start, end); 867 } 868 869 int __meminit radix__create_section_mapping(unsigned long start, unsigned long end, int nid) 870 { 871 if (end >= RADIX_VMALLOC_START) { 872 pr_warn("Outside the supported range\n"); 873 return -1; 874 } 875 876 return create_physical_mapping(__pa(start), __pa(end), nid); 877 } 878 879 int __meminit radix__remove_section_mapping(unsigned long start, unsigned long end) 880 { 881 remove_pagetable(start, end); 882 return 0; 883 } 884 #endif /* CONFIG_MEMORY_HOTPLUG */ 885 886 #ifdef CONFIG_SPARSEMEM_VMEMMAP 887 static int __map_kernel_page_nid(unsigned long ea, unsigned long pa, 888 pgprot_t flags, unsigned int map_page_size, 889 int nid) 890 { 891 return __map_kernel_page(ea, pa, flags, map_page_size, nid, 0, 0); 892 } 893 894 int __meminit radix__vmemmap_create_mapping(unsigned long start, 895 unsigned long page_size, 896 unsigned long phys) 897 { 898 /* Create a PTE encoding */ 899 unsigned long flags = _PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_KERNEL_RW; 900 int nid = early_pfn_to_nid(phys >> PAGE_SHIFT); 901 int ret; 902 903 if ((start + page_size) >= RADIX_VMEMMAP_END) { 904 pr_warn("Outside the supported range\n"); 905 return -1; 906 } 907 908 ret = __map_kernel_page_nid(start, phys, __pgprot(flags), page_size, nid); 909 BUG_ON(ret); 910 911 return 0; 912 } 913 914 #ifdef CONFIG_MEMORY_HOTPLUG 915 void __meminit radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size) 916 { 917 remove_pagetable(start, start + page_size); 918 } 919 #endif 920 #endif 921 922 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 923 924 unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, 925 pmd_t *pmdp, unsigned long clr, 926 unsigned long set) 927 { 928 unsigned long old; 929 930 #ifdef CONFIG_DEBUG_VM 931 WARN_ON(!radix__pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp)); 932 assert_spin_locked(pmd_lockptr(mm, pmdp)); 933 #endif 934 935 old = radix__pte_update(mm, addr, (pte_t *)pmdp, clr, set, 1); 936 trace_hugepage_update(addr, old, clr, set); 937 938 return old; 939 } 940 941 pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, 942 pmd_t *pmdp) 943 944 { 945 pmd_t pmd; 946 947 VM_BUG_ON(address & ~HPAGE_PMD_MASK); 948 VM_BUG_ON(radix__pmd_trans_huge(*pmdp)); 949 VM_BUG_ON(pmd_devmap(*pmdp)); 950 /* 951 * khugepaged calls this for normal pmd 952 */ 953 pmd = *pmdp; 954 pmd_clear(pmdp); 955 956 /*FIXME!! Verify whether we need this kick below */ 957 serialize_against_pte_lookup(vma->vm_mm); 958 959 radix__flush_tlb_collapsed_pmd(vma->vm_mm, address); 960 961 return pmd; 962 } 963 964 /* 965 * For us pgtable_t is pte_t *. Inorder to save the deposisted 966 * page table, we consider the allocated page table as a list 967 * head. On withdraw we need to make sure we zero out the used 968 * list_head memory area. 969 */ 970 void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, 971 pgtable_t pgtable) 972 { 973 struct list_head *lh = (struct list_head *) pgtable; 974 975 assert_spin_locked(pmd_lockptr(mm, pmdp)); 976 977 /* FIFO */ 978 if (!pmd_huge_pte(mm, pmdp)) 979 INIT_LIST_HEAD(lh); 980 else 981 list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp)); 982 pmd_huge_pte(mm, pmdp) = pgtable; 983 } 984 985 pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) 986 { 987 pte_t *ptep; 988 pgtable_t pgtable; 989 struct list_head *lh; 990 991 assert_spin_locked(pmd_lockptr(mm, pmdp)); 992 993 /* FIFO */ 994 pgtable = pmd_huge_pte(mm, pmdp); 995 lh = (struct list_head *) pgtable; 996 if (list_empty(lh)) 997 pmd_huge_pte(mm, pmdp) = NULL; 998 else { 999 pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next; 1000 list_del(lh); 1001 } 1002 ptep = (pte_t *) pgtable; 1003 *ptep = __pte(0); 1004 ptep++; 1005 *ptep = __pte(0); 1006 return pgtable; 1007 } 1008 1009 pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm, 1010 unsigned long addr, pmd_t *pmdp) 1011 { 1012 pmd_t old_pmd; 1013 unsigned long old; 1014 1015 old = radix__pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0); 1016 old_pmd = __pmd(old); 1017 /* 1018 * Serialize against find_current_mm_pte which does lock-less 1019 * lookup in page tables with local interrupts disabled. For huge pages 1020 * it casts pmd_t to pte_t. Since format of pte_t is different from 1021 * pmd_t we want to prevent transit from pmd pointing to page table 1022 * to pmd pointing to huge page (and back) while interrupts are disabled. 1023 * We clear pmd to possibly replace it with page table pointer in 1024 * different code paths. So make sure we wait for the parallel 1025 * find_current_mm_pte to finish. 1026 */ 1027 serialize_against_pte_lookup(mm); 1028 return old_pmd; 1029 } 1030 1031 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 1032 1033 void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep, 1034 pte_t entry, unsigned long address, int psize) 1035 { 1036 struct mm_struct *mm = vma->vm_mm; 1037 unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_ACCESSED | 1038 _PAGE_RW | _PAGE_EXEC); 1039 1040 unsigned long change = pte_val(entry) ^ pte_val(*ptep); 1041 /* 1042 * To avoid NMMU hang while relaxing access, we need mark 1043 * the pte invalid in between. 1044 */ 1045 if ((change & _PAGE_RW) && atomic_read(&mm->context.copros) > 0) { 1046 unsigned long old_pte, new_pte; 1047 1048 old_pte = __radix_pte_update(ptep, _PAGE_PRESENT, _PAGE_INVALID); 1049 /* 1050 * new value of pte 1051 */ 1052 new_pte = old_pte | set; 1053 radix__flush_tlb_page_psize(mm, address, psize); 1054 __radix_pte_update(ptep, _PAGE_INVALID, new_pte); 1055 } else { 1056 __radix_pte_update(ptep, 0, set); 1057 /* 1058 * Book3S does not require a TLB flush when relaxing access 1059 * restrictions when the address space is not attached to a 1060 * NMMU, because the core MMU will reload the pte after taking 1061 * an access fault, which is defined by the architectue. 1062 */ 1063 } 1064 /* See ptesync comment in radix__set_pte_at */ 1065 } 1066 1067 void radix__ptep_modify_prot_commit(struct vm_area_struct *vma, 1068 unsigned long addr, pte_t *ptep, 1069 pte_t old_pte, pte_t pte) 1070 { 1071 struct mm_struct *mm = vma->vm_mm; 1072 1073 /* 1074 * To avoid NMMU hang while relaxing access we need to flush the tlb before 1075 * we set the new value. We need to do this only for radix, because hash 1076 * translation does flush when updating the linux pte. 1077 */ 1078 if (is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) && 1079 (atomic_read(&mm->context.copros) > 0)) 1080 radix__flush_tlb_page(vma, addr); 1081 1082 set_pte_at(mm, addr, ptep, pte); 1083 } 1084 1085 int __init arch_ioremap_pud_supported(void) 1086 { 1087 /* HPT does not cope with large pages in the vmalloc area */ 1088 return radix_enabled(); 1089 } 1090 1091 int __init arch_ioremap_pmd_supported(void) 1092 { 1093 return radix_enabled(); 1094 } 1095 1096 int p4d_free_pud_page(p4d_t *p4d, unsigned long addr) 1097 { 1098 return 0; 1099 } 1100 1101 int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) 1102 { 1103 pte_t *ptep = (pte_t *)pud; 1104 pte_t new_pud = pfn_pte(__phys_to_pfn(addr), prot); 1105 1106 if (!radix_enabled()) 1107 return 0; 1108 1109 set_pte_at(&init_mm, 0 /* radix unused */, ptep, new_pud); 1110 1111 return 1; 1112 } 1113 1114 int pud_clear_huge(pud_t *pud) 1115 { 1116 if (pud_huge(*pud)) { 1117 pud_clear(pud); 1118 return 1; 1119 } 1120 1121 return 0; 1122 } 1123 1124 int pud_free_pmd_page(pud_t *pud, unsigned long addr) 1125 { 1126 pmd_t *pmd; 1127 int i; 1128 1129 pmd = (pmd_t *)pud_page_vaddr(*pud); 1130 pud_clear(pud); 1131 1132 flush_tlb_kernel_range(addr, addr + PUD_SIZE); 1133 1134 for (i = 0; i < PTRS_PER_PMD; i++) { 1135 if (!pmd_none(pmd[i])) { 1136 pte_t *pte; 1137 pte = (pte_t *)pmd_page_vaddr(pmd[i]); 1138 1139 pte_free_kernel(&init_mm, pte); 1140 } 1141 } 1142 1143 pmd_free(&init_mm, pmd); 1144 1145 return 1; 1146 } 1147 1148 int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) 1149 { 1150 pte_t *ptep = (pte_t *)pmd; 1151 pte_t new_pmd = pfn_pte(__phys_to_pfn(addr), prot); 1152 1153 if (!radix_enabled()) 1154 return 0; 1155 1156 set_pte_at(&init_mm, 0 /* radix unused */, ptep, new_pmd); 1157 1158 return 1; 1159 } 1160 1161 int pmd_clear_huge(pmd_t *pmd) 1162 { 1163 if (pmd_huge(*pmd)) { 1164 pmd_clear(pmd); 1165 return 1; 1166 } 1167 1168 return 0; 1169 } 1170 1171 int pmd_free_pte_page(pmd_t *pmd, unsigned long addr) 1172 { 1173 pte_t *pte; 1174 1175 pte = (pte_t *)pmd_page_vaddr(*pmd); 1176 pmd_clear(pmd); 1177 1178 flush_tlb_kernel_range(addr, addr + PMD_SIZE); 1179 1180 pte_free_kernel(&init_mm, pte); 1181 1182 return 1; 1183 } 1184 1185 int __init arch_ioremap_p4d_supported(void) 1186 { 1187 return 0; 1188 } 1189