// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Page table handling routines for radix page table.
 *
 * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
 */

#define pr_fmt(fmt) "radix-mmu: " fmt

#include <linux/io.h>
#include <linux/kernel.h>
#include <linux/sched/mm.h>
#include <linux/memblock.h>
#include <linux/of_fdt.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/string_helpers.h>
#include <linux/stop_machine.h>

#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/mmu_context.h>
#include <asm/dma.h>
#include <asm/machdep.h>
#include <asm/mmu.h>
#include <asm/firmware.h>
#include <asm/powernv.h>
#include <asm/sections.h>
#include <asm/trace.h>
#include <asm/uaccess.h>
#include <asm/ultravisor.h>

#include <trace/events/thp.h>

unsigned int mmu_pid_bits;
unsigned int mmu_base_pid;

static __ref void *early_alloc_pgtable(unsigned long size, int nid,
					unsigned long region_start,
					unsigned long region_end)
{
        phys_addr_t min_addr = MEMBLOCK_LOW_LIMIT;
        phys_addr_t max_addr = MEMBLOCK_ALLOC_ANYWHERE;
        void *ptr;

        if (region_start)
                min_addr = region_start;
        if (region_end)
                max_addr = region_end;

        ptr = memblock_alloc_try_nid(size, size, min_addr, max_addr, nid);

        if (!ptr)
                panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa max_addr=%pa\n",
                      __func__, size, size, nid, &min_addr, &max_addr);

        return ptr;
}

static int early_map_kernel_page(unsigned long ea, unsigned long pa,
                          pgprot_t flags,
                          unsigned int map_page_size,
                          int nid,
                          unsigned long region_start, unsigned long region_end)
{
        unsigned long pfn = pa >> PAGE_SHIFT;
        pgd_t *pgdp;
        pud_t *pudp;
        pmd_t *pmdp;
        pte_t *ptep;

        pgdp = pgd_offset_k(ea);
        if (pgd_none(*pgdp)) {
                pudp = early_alloc_pgtable(PUD_TABLE_SIZE, nid,
                                           region_start, region_end);
                pgd_populate(&init_mm, pgdp, pudp);
        }
        pudp = pud_offset(pgdp, ea);
        if (map_page_size == PUD_SIZE) {
                ptep = (pte_t *)pudp;
                goto set_the_pte;
        }
        if (pud_none(*pudp)) {
                pmdp = early_alloc_pgtable(PMD_TABLE_SIZE, nid,
                                           region_start, region_end);
                pud_populate(&init_mm, pudp, pmdp);
        }
        pmdp = pmd_offset(pudp, ea);
        if (map_page_size == PMD_SIZE) {
                ptep = pmdp_ptep(pmdp);
                goto set_the_pte;
        }
        if (!pmd_present(*pmdp)) {
                ptep = early_alloc_pgtable(PAGE_SIZE, nid,
                                           region_start, region_end);
                pmd_populate_kernel(&init_mm, pmdp, ptep);
        }
        ptep = pte_offset_kernel(pmdp, ea);

set_the_pte:
        set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
        smp_wmb();
        return 0;
}

/*
 * nid, region_start, and region_end are hints to try to place the page
 * table memory in the same node or region.
 */
static int __map_kernel_page(unsigned long ea, unsigned long pa,
                          pgprot_t flags,
                          unsigned int map_page_size,
                          int nid,
                          unsigned long region_start, unsigned long region_end)
{
        unsigned long pfn = pa >> PAGE_SHIFT;
        pgd_t *pgdp;
        pud_t *pudp;
        pmd_t *pmdp;
        pte_t *ptep;
        /*
         * Make sure task size is correct as per the max addr
         */
        BUILD_BUG_ON(TASK_SIZE_USER64 > RADIX_PGTABLE_RANGE);

#ifdef CONFIG_PPC_64K_PAGES
        BUILD_BUG_ON(RADIX_KERN_MAP_SIZE != (1UL << MAX_EA_BITS_PER_CONTEXT));
#endif

        if (unlikely(!slab_is_available()))
                return early_map_kernel_page(ea, pa, flags, map_page_size,
                                             nid, region_start, region_end);

        /*
         * Should make page table allocation functions be able to take a
         * node, so we can place kernel page tables on the right nodes after
         * boot.
         */
        pgdp = pgd_offset_k(ea);
        pudp = pud_alloc(&init_mm, pgdp, ea);
        if (!pudp)
                return -ENOMEM;
        if (map_page_size == PUD_SIZE) {
                ptep = (pte_t *)pudp;
                goto set_the_pte;
        }
        pmdp = pmd_alloc(&init_mm, pudp, ea);
        if (!pmdp)
                return -ENOMEM;
        if (map_page_size == PMD_SIZE) {
                ptep = pmdp_ptep(pmdp);
                goto set_the_pte;
        }
        ptep = pte_alloc_kernel(pmdp, ea);
        if (!ptep)
                return -ENOMEM;

set_the_pte:
        set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
        smp_wmb();
        return 0;
}

int radix__map_kernel_page(unsigned long ea, unsigned long pa,
                          pgprot_t flags,
                          unsigned int map_page_size)
{
        return __map_kernel_page(ea, pa, flags, map_page_size, -1, 0, 0);
}

#ifdef CONFIG_STRICT_KERNEL_RWX
void radix__change_memory_range(unsigned long start, unsigned long end,
                                unsigned long clear)
{
        unsigned long idx;
        pgd_t *pgdp;
        pud_t *pudp;
        pmd_t *pmdp;
        pte_t *ptep;

        start = ALIGN_DOWN(start, PAGE_SIZE);
        end = PAGE_ALIGN(end); // aligns up

        pr_debug("Changing flags on range %lx-%lx removing 0x%lx\n",
                 start, end, clear);

        for (idx = start; idx < end; idx += PAGE_SIZE) {
                pgdp = pgd_offset_k(idx);
                pudp = pud_alloc(&init_mm, pgdp, idx);
                if (!pudp)
                        continue;
                if (pud_is_leaf(*pudp)) {
                        ptep = (pte_t *)pudp;
                        goto update_the_pte;
                }
                pmdp = pmd_alloc(&init_mm, pudp, idx);
                if (!pmdp)
                        continue;
                if (pmd_is_leaf(*pmdp)) {
                        ptep = pmdp_ptep(pmdp);
                        goto update_the_pte;
                }
                ptep = pte_alloc_kernel(pmdp, idx);
                if (!ptep)
                        continue;
update_the_pte:
                radix__pte_update(&init_mm, idx, ptep, clear, 0, 0);
        }

        radix__flush_tlb_kernel_range(start, end);
}

void radix__mark_rodata_ro(void)
{
        unsigned long start, end;

        start = (unsigned long)_stext;
        end = (unsigned long)__init_begin;

        radix__change_memory_range(start, end, _PAGE_WRITE);
}

void radix__mark_initmem_nx(void)
{
        unsigned long start = (unsigned long)__init_begin;
        unsigned long end = (unsigned long)__init_end;

        radix__change_memory_range(start, end, _PAGE_EXEC);
}
#endif /* CONFIG_STRICT_KERNEL_RWX */

static inline void __meminit
print_mapping(unsigned long start, unsigned long end, unsigned long size, bool exec)
{
        char buf[10];

        if (end <= start)
                return;

        string_get_size(size, 1, STRING_UNITS_2, buf, sizeof(buf));

        pr_info("Mapped 0x%016lx-0x%016lx with %s pages%s\n", start, end, buf,
                exec ? " (exec)" : "");
}

static unsigned long next_boundary(unsigned long addr, unsigned long end)
{
#ifdef CONFIG_STRICT_KERNEL_RWX
        if (addr < __pa_symbol(__init_begin))
                return __pa_symbol(__init_begin);
#endif
        return end;
}

static int __meminit create_physical_mapping(unsigned long start,
                                             unsigned long end,
                                             int nid)
{
        unsigned long vaddr, addr, mapping_size = 0;
        bool prev_exec, exec = false;
        pgprot_t prot;
        int psize;

        start = _ALIGN_UP(start, PAGE_SIZE);
        for (addr = start; addr < end; addr += mapping_size) {
                unsigned long gap, previous_size;
                int rc;

                gap = next_boundary(addr, end) - addr;
                previous_size = mapping_size;
                prev_exec = exec;

                if (IS_ALIGNED(addr, PUD_SIZE) && gap >= PUD_SIZE &&
                    mmu_psize_defs[MMU_PAGE_1G].shift) {
                        mapping_size = PUD_SIZE;
                        psize = MMU_PAGE_1G;
                } else if (IS_ALIGNED(addr, PMD_SIZE) && gap >= PMD_SIZE &&
                           mmu_psize_defs[MMU_PAGE_2M].shift) {
                        mapping_size = PMD_SIZE;
                        psize = MMU_PAGE_2M;
                } else {
                        mapping_size = PAGE_SIZE;
                        psize = mmu_virtual_psize;
                }

                vaddr = (unsigned long)__va(addr);

                if (overlaps_kernel_text(vaddr, vaddr + mapping_size) ||
                    overlaps_interrupt_vector_text(vaddr, vaddr + mapping_size)) {
                        prot = PAGE_KERNEL_X;
                        exec = true;
                } else {
                        prot = PAGE_KERNEL;
                        exec = false;
                }

                if (mapping_size != previous_size || exec != prev_exec) {
                        print_mapping(start, addr, previous_size, prev_exec);
                        start = addr;
                }

                rc = __map_kernel_page(vaddr, addr, prot, mapping_size, nid, start, end);
                if (rc)
                        return rc;

                update_page_count(psize, 1);
        }

        print_mapping(start, addr, mapping_size, exec);
        return 0;
}

static void __init radix_init_pgtable(void)
{
        unsigned long rts_field;
        struct memblock_region *reg;

        /* We don't support slb for radix */
        mmu_slb_size = 0;
        /*
         * Create the linear mapping, using standard page size for now
         */
        for_each_memblock(memory, reg) {
                /*
                 * The memblock allocator is up at this point, so the
                 * page tables will be allocated within the range. No
                 * need for a node (which we don't have yet).
                 */

                if ((reg->base + reg->size) >= RADIX_VMALLOC_START) {
                        pr_warn("Outside the supported range\n");
                        continue;
                }

                WARN_ON(create_physical_mapping(reg->base,
                                                reg->base + reg->size,
                                                -1));
        }

        /* Find out how many PID bits are supported */
        if (!cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG)) {
                if (!mmu_pid_bits)
                        mmu_pid_bits = 20;
                mmu_base_pid = 1;
        } else if (cpu_has_feature(CPU_FTR_HVMODE)) {
                if (!mmu_pid_bits)
                        mmu_pid_bits = 20;
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
                /*
                 * When KVM is possible, we only use the top half of the
                 * PID space to avoid collisions between host and guest PIDs
                 * which can cause problems due to prefetch when exiting the
                 * guest with AIL=3
                 */
                mmu_base_pid = 1 << (mmu_pid_bits - 1);
#else
                mmu_base_pid = 1;
#endif
        } else {
                /* The guest uses the bottom half of the PID space */
                if (!mmu_pid_bits)
                        mmu_pid_bits = 19;
                mmu_base_pid = 1;
        }

        /*
         * Allocate Partition table and process table for the
         * host.
         */
        BUG_ON(PRTB_SIZE_SHIFT > 36);
        process_tb = early_alloc_pgtable(1UL << PRTB_SIZE_SHIFT, -1, 0, 0);
        /*
         * Fill in the process table.
         */
        rts_field = radix__get_tree_size();
        process_tb->prtb0 = cpu_to_be64(rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE);

        /*
         * The init_mm context is given the first available (non-zero) PID,
         * which is the "guard PID" and contains no page table. PIDR should
         * never be set to zero because that duplicates the kernel address
         * space at the 0x0... offset (quadrant 0)!
         *
         * An arbitrary PID that may later be allocated by the PID allocator
         * for userspace processes must not be used either, because that
         * would cause stale user mappings for that PID on CPUs outside of
         * the TLB invalidation scheme (because it won't be in mm_cpumask).
         *
         * So permanently carve out one PID for the purpose of a guard PID.
         */
        init_mm.context.id = mmu_base_pid;
        mmu_base_pid++;
}

static void __init radix_init_partition_table(void)
{
        unsigned long rts_field, dw0, dw1;

        mmu_partition_table_init();
        rts_field = radix__get_tree_size();
        dw0 = rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE | PATB_HR;
        dw1 = __pa(process_tb) | (PRTB_SIZE_SHIFT - 12) | PATB_GR;
        mmu_partition_table_set_entry(0, dw0, dw1, false);

        pr_info("Initializing Radix MMU\n");
}

static int __init get_idx_from_shift(unsigned int shift)
{
        int idx = -1;

        switch (shift) {
        case 0xc:
                idx = MMU_PAGE_4K;
                break;
        case 0x10:
                idx = MMU_PAGE_64K;
                break;
        case 0x15:
                idx = MMU_PAGE_2M;
                break;
        case 0x1e:
                idx = MMU_PAGE_1G;
                break;
        }
        return idx;
}

static int __init radix_dt_scan_page_sizes(unsigned long node,
                                           const char *uname, int depth,
                                           void *data)
{
        int size = 0;
        int shift, idx;
        unsigned int ap;
        const __be32 *prop;
        const char *type = of_get_flat_dt_prop(node, "device_type", NULL);

        /* We are scanning "cpu" nodes only */
        if (type == NULL || strcmp(type, "cpu") != 0)
                return 0;

        /* Find MMU PID size */
        prop = of_get_flat_dt_prop(node, "ibm,mmu-pid-bits", &size);
        if (prop && size == 4)
                mmu_pid_bits = be32_to_cpup(prop);

        /* Grab page size encodings */
        prop = of_get_flat_dt_prop(node, "ibm,processor-radix-AP-encodings", &size);
        if (!prop)
                return 0;

        pr_info("Page sizes from device-tree:\n");
        for (; size >= 4; size -= 4, ++prop) {

                struct mmu_psize_def *def;

                /* top 3 bits are the AP encoding */
                shift = be32_to_cpu(prop[0]) & ~(0xe << 28);
                ap = be32_to_cpu(prop[0]) >> 29;
                pr_info("Page size shift = %d AP=0x%x\n", shift, ap);

                idx = get_idx_from_shift(shift);
                if (idx < 0)
                        continue;

                def = &mmu_psize_defs[idx];
                def->shift = shift;
                def->ap = ap;
        }

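        /*
         * For example, an encoding word of 0xa0000010 decodes with the
         * masks above to ap = 0x5 and shift = 0x10, i.e. 64K pages.
         */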
        /* needed ? */
        cur_cpu_spec->mmu_features &= ~MMU_FTR_NO_SLBIE_B;
        return 1;
}

void __init radix__early_init_devtree(void)
{
        int rc;

        /*
         * Try to find the available page sizes in the device-tree
         */
        rc = of_scan_flat_dt(radix_dt_scan_page_sizes, NULL);
        if (rc != 0) /* Found */
                goto found;
        /*
         * let's assume we have page 4k and 64k support
         */
        mmu_psize_defs[MMU_PAGE_4K].shift = 12;
        mmu_psize_defs[MMU_PAGE_4K].ap = 0x0;

        mmu_psize_defs[MMU_PAGE_64K].shift = 16;
        mmu_psize_defs[MMU_PAGE_64K].ap = 0x5;
found:
        return;
}

static void radix_init_amor(void)
{
        /*
         * In HV mode, we init AMOR (Authority Mask Override Register) so that
         * the hypervisor and guest can setup IAMR (Instruction Authority Mask
         * Register), enable key 0 and set it to 1.
         *
         * AMOR = 0b1100 .... 0000 (Mask for key 0 is 11)
         */
        mtspr(SPRN_AMOR, (3ul << 62));
}

#ifdef CONFIG_PPC_KUEP
void setup_kuep(bool disabled)
{
        if (disabled || !early_radix_enabled())
                return;

        if (smp_processor_id() == boot_cpuid)
                pr_info("Activating Kernel Userspace Execution Prevention\n");

        /*
         * Radix always uses key0 of the IAMR to determine if an access is
         * allowed. We set bit 0 (IBM bit 1) of key0, to prevent instruction
         * fetch.
         */
        mtspr(SPRN_IAMR, (1ul << 62));
}
#endif

#ifdef CONFIG_PPC_KUAP
void setup_kuap(bool disabled)
{
        if (disabled || !early_radix_enabled())
                return;

        if (smp_processor_id() == boot_cpuid) {
                pr_info("Activating Kernel Userspace Access Prevention\n");
                cur_cpu_spec->mmu_features |= MMU_FTR_RADIX_KUAP;
        }

        /* Make sure userspace can't change the AMR */
        mtspr(SPRN_UAMOR, 0);
        mtspr(SPRN_AMR, AMR_KUAP_BLOCKED);
        isync();
}
#endif

void __init radix__early_init_mmu(void)
{
        unsigned long lpcr;

#ifdef CONFIG_PPC_64K_PAGES
        /* PAGE_SIZE mappings */
        mmu_virtual_psize = MMU_PAGE_64K;
#else
        mmu_virtual_psize = MMU_PAGE_4K;
#endif

#ifdef CONFIG_SPARSEMEM_VMEMMAP
        /* vmemmap mapping */
        if (mmu_psize_defs[MMU_PAGE_2M].shift) {
                /*
                 * map vmemmap using 2M if available
                 */
                mmu_vmemmap_psize = MMU_PAGE_2M;
        } else
                mmu_vmemmap_psize = mmu_virtual_psize;
#endif
        /*
         * initialize page table size
         */
        __pte_index_size = RADIX_PTE_INDEX_SIZE;
        __pmd_index_size = RADIX_PMD_INDEX_SIZE;
        __pud_index_size = RADIX_PUD_INDEX_SIZE;
        __pgd_index_size = RADIX_PGD_INDEX_SIZE;
        __pud_cache_index = RADIX_PUD_INDEX_SIZE;
        __pte_table_size = RADIX_PTE_TABLE_SIZE;
        __pmd_table_size = RADIX_PMD_TABLE_SIZE;
        __pud_table_size = RADIX_PUD_TABLE_SIZE;
        __pgd_table_size = RADIX_PGD_TABLE_SIZE;

        __pmd_val_bits = RADIX_PMD_VAL_BITS;
        __pud_val_bits = RADIX_PUD_VAL_BITS;
        __pgd_val_bits = RADIX_PGD_VAL_BITS;

        __kernel_virt_start = RADIX_KERN_VIRT_START;
        __vmalloc_start = RADIX_VMALLOC_START;
        __vmalloc_end = RADIX_VMALLOC_END;
        __kernel_io_start = RADIX_KERN_IO_START;
        __kernel_io_end = RADIX_KERN_IO_END;
        vmemmap = (struct page *)RADIX_VMEMMAP_START;
        ioremap_bot = IOREMAP_BASE;

#ifdef CONFIG_PCI
        pci_io_base = ISA_IO_BASE;
#endif
        __pte_frag_nr = RADIX_PTE_FRAG_NR;
        __pte_frag_size_shift = RADIX_PTE_FRAG_SIZE_SHIFT;
        __pmd_frag_nr = RADIX_PMD_FRAG_NR;
        __pmd_frag_size_shift = RADIX_PMD_FRAG_SIZE_SHIFT;

        radix_init_pgtable();

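        /*
         * Running bare metal (no FW_FEATURE_LPAR) we own the partition
         * table: enable process tables and host radix in the LPCR and
         * install the table below. As a pseries guest, radix_init_pseries()
         * instead registers the process table with the hypervisor.
         */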
        if (!firmware_has_feature(FW_FEATURE_LPAR)) {
                lpcr = mfspr(SPRN_LPCR);
                mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
                radix_init_partition_table();
                radix_init_amor();
        } else {
                radix_init_pseries();
        }

        memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE);

        /* Switch to the guard PID before turning on MMU */
        radix__switch_mmu_context(NULL, &init_mm);
        tlbiel_all();
}

void radix__early_init_mmu_secondary(void)
{
        unsigned long lpcr;
        /*
         * update partition table control register and UPRT
         */
        if (!firmware_has_feature(FW_FEATURE_LPAR)) {
                lpcr = mfspr(SPRN_LPCR);
                mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);

                set_ptcr_when_no_uv(__pa(partition_tb) |
                                    (PATB_SIZE_SHIFT - 12));

                radix_init_amor();
        }

        radix__switch_mmu_context(NULL, &init_mm);
        tlbiel_all();
}

void radix__mmu_cleanup_all(void)
{
        unsigned long lpcr;

        if (!firmware_has_feature(FW_FEATURE_LPAR)) {
                lpcr = mfspr(SPRN_LPCR);
                mtspr(SPRN_LPCR, lpcr & ~LPCR_UPRT);
                set_ptcr_when_no_uv(0);
                powernv_set_nmmu_ptcr(0);
                radix__flush_tlb_all();
        }
}

void radix__setup_initial_memory_limit(phys_addr_t first_memblock_base,
                                       phys_addr_t first_memblock_size)
{
        /*
         * We don't currently support the first MEMBLOCK not mapping 0
         * physical on those processors
         */
        BUG_ON(first_memblock_base != 0);

        /*
         * Radix mode is not limited by RMA / VRMA addressing.
         */
        ppc64_rma_size = ULONG_MAX;
}

#ifdef CONFIG_MEMORY_HOTPLUG
static void free_pte_table(pte_t *pte_start, pmd_t *pmd)
{
        pte_t *pte;
        int i;

        for (i = 0; i < PTRS_PER_PTE; i++) {
                pte = pte_start + i;
                if (!pte_none(*pte))
                        return;
        }

        pte_free_kernel(&init_mm, pte_start);
        pmd_clear(pmd);
}

static void free_pmd_table(pmd_t *pmd_start, pud_t *pud)
{
        pmd_t *pmd;
        int i;

        for (i = 0; i < PTRS_PER_PMD; i++) {
                pmd = pmd_start + i;
                if (!pmd_none(*pmd))
                        return;
        }

        pmd_free(&init_mm, pmd_start);
        pud_clear(pud);
}

struct change_mapping_params {
        pte_t *pte;
        unsigned long start;
        unsigned long end;
        unsigned long aligned_start;
        unsigned long aligned_end;
};

static int __meminit stop_machine_change_mapping(void *data)
{
        struct change_mapping_params *params =
                (struct change_mapping_params *)data;

        if (!data)
                return -1;

        spin_unlock(&init_mm.page_table_lock);
        pte_clear(&init_mm, params->aligned_start, params->pte);
        create_physical_mapping(__pa(params->aligned_start), __pa(params->start), -1);
        create_physical_mapping(__pa(params->end), __pa(params->aligned_end), -1);
        spin_lock(&init_mm.page_table_lock);
        return 0;
}

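/*
 * The remove_{pte,pmd,pud}_table() helpers below walk the kernel page
 * tables for the range being removed and clear the entries; a huge leaf
 * entry that extends beyond the range is handed to split_kernel_mapping(),
 * which remaps the parts that must remain.
 */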
static void remove_pte_table(pte_t *pte_start, unsigned long addr,
                             unsigned long end)
{
        unsigned long next;
        pte_t *pte;

        pte = pte_start + pte_index(addr);
        for (; addr < end; addr = next, pte++) {
                next = (addr + PAGE_SIZE) & PAGE_MASK;
                if (next > end)
                        next = end;

                if (!pte_present(*pte))
                        continue;

                if (!PAGE_ALIGNED(addr) || !PAGE_ALIGNED(next)) {
                        /*
                         * The vmemmap_free() and remove_section_mapping()
                         * codepaths call us with aligned addresses.
                         */
                        WARN_ONCE(1, "%s: unaligned range\n", __func__);
                        continue;
                }

                pte_clear(&init_mm, addr, pte);
        }
}

/*
 * clear the pte and potentially split the mapping helper
 */
static void __meminit split_kernel_mapping(unsigned long addr, unsigned long end,
                                           unsigned long size, pte_t *pte)
{
        unsigned long mask = ~(size - 1);
        unsigned long aligned_start = addr & mask;
        unsigned long aligned_end = addr + size;
        struct change_mapping_params params;
        bool split_region = false;

        if ((end - addr) < size) {
                /*
                 * We're going to clear the PTE, but have not flushed the
                 * mapping, so it is time to remap and flush. If the effects
                 * are visible outside the processor, or if we are running in
                 * code close to the mapping we cleared, we are in trouble.
                 */
                if (overlaps_kernel_text(aligned_start, addr) ||
                    overlaps_kernel_text(end, aligned_end)) {
                        /*
                         * Hack, just return, don't pte_clear
                         */
                        WARN_ONCE(1, "Linear mapping %lx->%lx overlaps kernel "
                                  "text, not splitting\n", addr, end);
                        return;
                }
                split_region = true;
        }

        if (split_region) {
                params.pte = pte;
                params.start = addr;
                params.end = end;
                params.aligned_start = addr & ~(size - 1);
                params.aligned_end = min_t(unsigned long, aligned_end,
                                (unsigned long)__va(memblock_end_of_DRAM()));
                stop_machine(stop_machine_change_mapping, &params, NULL);
                return;
        }

        pte_clear(&init_mm, addr, pte);
}

static void remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
                             unsigned long end)
{
        unsigned long next;
        pte_t *pte_base;
        pmd_t *pmd;

        pmd = pmd_start + pmd_index(addr);
        for (; addr < end; addr = next, pmd++) {
                next = pmd_addr_end(addr, end);

                if (!pmd_present(*pmd))
                        continue;

                if (pmd_is_leaf(*pmd)) {
                        split_kernel_mapping(addr, end, PMD_SIZE, (pte_t *)pmd);
                        continue;
                }

                pte_base = (pte_t *)pmd_page_vaddr(*pmd);
                remove_pte_table(pte_base, addr, next);
                free_pte_table(pte_base, pmd);
        }
}

static void remove_pud_table(pud_t *pud_start, unsigned long addr,
                             unsigned long end)
{
        unsigned long next;
        pmd_t *pmd_base;
        pud_t *pud;

        pud = pud_start + pud_index(addr);
        for (; addr < end; addr = next, pud++) {
                next = pud_addr_end(addr, end);

                if (!pud_present(*pud))
                        continue;

                if (pud_is_leaf(*pud)) {
                        split_kernel_mapping(addr, end, PUD_SIZE, (pte_t *)pud);
                        continue;
                }

                pmd_base = (pmd_t *)pud_page_vaddr(*pud);
                remove_pmd_table(pmd_base, addr, next);
                free_pmd_table(pmd_base, pud);
        }
}

static void __meminit remove_pagetable(unsigned long start, unsigned long end)
{
        unsigned long addr, next;
        pud_t *pud_base;
        pgd_t *pgd;

        spin_lock(&init_mm.page_table_lock);

        for (addr = start; addr < end; addr = next) {
                next = pgd_addr_end(addr, end);

                pgd = pgd_offset_k(addr);
                if (!pgd_present(*pgd))
                        continue;

                if (pgd_is_leaf(*pgd)) {
                        split_kernel_mapping(addr, end, PGDIR_SIZE, (pte_t *)pgd);
                        continue;
                }

                pud_base = (pud_t *)pgd_page_vaddr(*pgd);
                remove_pud_table(pud_base, addr, next);
        }

        spin_unlock(&init_mm.page_table_lock);
        radix__flush_tlb_kernel_range(start, end);
}

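/*
 * Hot-plugged memory must still map below RADIX_VMALLOC_START, i.e. fit
 * within the address range covered by the kernel linear mapping.
 */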
int __meminit radix__create_section_mapping(unsigned long start, unsigned long end, int nid)
{
        if (end >= RADIX_VMALLOC_START) {
                pr_warn("Outside the supported range\n");
                return -1;
        }

        return create_physical_mapping(__pa(start), __pa(end), nid);
}

int __meminit radix__remove_section_mapping(unsigned long start, unsigned long end)
{
        remove_pagetable(start, end);
        return 0;
}
#endif /* CONFIG_MEMORY_HOTPLUG */

#ifdef CONFIG_SPARSEMEM_VMEMMAP
static int __map_kernel_page_nid(unsigned long ea, unsigned long pa,
                                 pgprot_t flags, unsigned int map_page_size,
                                 int nid)
{
        return __map_kernel_page(ea, pa, flags, map_page_size, nid, 0, 0);
}

int __meminit radix__vmemmap_create_mapping(unsigned long start,
                                      unsigned long page_size,
                                      unsigned long phys)
{
        /* Create a PTE encoding */
        unsigned long flags = _PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_KERNEL_RW;
        int nid = early_pfn_to_nid(phys >> PAGE_SHIFT);
        int ret;

        if ((start + page_size) >= RADIX_VMEMMAP_END) {
                pr_warn("Outside the supported range\n");
                return -1;
        }

        ret = __map_kernel_page_nid(start, phys, __pgprot(flags), page_size, nid);
        BUG_ON(ret);

        return 0;
}

#ifdef CONFIG_MEMORY_HOTPLUG
void __meminit radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size)
{
        remove_pagetable(start, start + page_size);
}
#endif
#endif

#ifdef CONFIG_TRANSPARENT_HUGEPAGE

unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
                                         pmd_t *pmdp, unsigned long clr,
                                         unsigned long set)
{
        unsigned long old;

#ifdef CONFIG_DEBUG_VM
        WARN_ON(!radix__pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
        assert_spin_locked(pmd_lockptr(mm, pmdp));
#endif

        old = radix__pte_update(mm, addr, (pte_t *)pmdp, clr, set, 1);
        trace_hugepage_update(addr, old, clr, set);

        return old;
}

pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
                        pmd_t *pmdp)

{
        pmd_t pmd;

        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
        VM_BUG_ON(radix__pmd_trans_huge(*pmdp));
        VM_BUG_ON(pmd_devmap(*pmdp));
        /*
         * khugepaged calls this for normal pmd
         */
        pmd = *pmdp;
        pmd_clear(pmdp);

        /*FIXME!! Verify whether we need this kick below */
        serialize_against_pte_lookup(vma->vm_mm);

        radix__flush_tlb_collapsed_pmd(vma->vm_mm, address);

        return pmd;
}

/*
 * For us pgtable_t is pte_t *. In order to save the deposited
 * page table, we consider the allocated page table as a list
 * head. On withdraw we need to make sure we zero out the used
 * list_head memory area.
 */
void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
                                 pgtable_t pgtable)
{
        struct list_head *lh = (struct list_head *) pgtable;

        assert_spin_locked(pmd_lockptr(mm, pmdp));

        /* FIFO */
        if (!pmd_huge_pte(mm, pmdp))
                INIT_LIST_HEAD(lh);
        else
                list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
        pmd_huge_pte(mm, pmdp) = pgtable;
}

pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
        pte_t *ptep;
        pgtable_t pgtable;
        struct list_head *lh;

        assert_spin_locked(pmd_lockptr(mm, pmdp));

        /* FIFO */
        pgtable = pmd_huge_pte(mm, pmdp);
        lh = (struct list_head *) pgtable;
        if (list_empty(lh))
                pmd_huge_pte(mm, pmdp) = NULL;
        else {
                pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
                list_del(lh);
        }
        ptep = (pte_t *) pgtable;
        *ptep = __pte(0);
        ptep++;
        *ptep = __pte(0);
        return pgtable;
}

pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm,
                                     unsigned long addr, pmd_t *pmdp)
{
        pmd_t old_pmd;
        unsigned long old;

        old = radix__pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
        old_pmd = __pmd(old);
        /*
         * Serialize against find_current_mm_pte which does lock-less
         * lookup in page tables with local interrupts disabled. For huge pages
         * it casts pmd_t to pte_t. Since format of pte_t is different from
         * pmd_t we want to prevent transit from pmd pointing to page table
         * to pmd pointing to huge page (and back) while interrupts are disabled.
         * We clear pmd to possibly replace it with page table pointer in
         * different code paths. So make sure we wait for the parallel
         * find_current_mm_pte to finish.
         */
        serialize_against_pte_lookup(mm);
        return old_pmd;
}

#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep,
                                  pte_t entry, unsigned long address, int psize)
{
        struct mm_struct *mm = vma->vm_mm;
        unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_ACCESSED |
                                              _PAGE_RW | _PAGE_EXEC);

        unsigned long change = pte_val(entry) ^ pte_val(*ptep);
        /*
         * To avoid NMMU hang while relaxing access, we need to mark
         * the pte invalid in between.
         */
        if ((change & _PAGE_RW) && atomic_read(&mm->context.copros) > 0) {
                unsigned long old_pte, new_pte;

                old_pte = __radix_pte_update(ptep, _PAGE_PRESENT, _PAGE_INVALID);
                /*
                 * new value of pte
                 */
                new_pte = old_pte | set;
                radix__flush_tlb_page_psize(mm, address, psize);
                __radix_pte_update(ptep, _PAGE_INVALID, new_pte);
        } else {
                __radix_pte_update(ptep, 0, set);
                /*
                 * Book3S does not require a TLB flush when relaxing access
                 * restrictions when the address space is not attached to a
                 * NMMU, because the core MMU will reload the pte after taking
                 * an access fault, which is defined by the architecture.
                 */
        }
        /* See ptesync comment in radix__set_pte_at */
}

void radix__ptep_modify_prot_commit(struct vm_area_struct *vma,
                                    unsigned long addr, pte_t *ptep,
                                    pte_t old_pte, pte_t pte)
{
        struct mm_struct *mm = vma->vm_mm;

        /*
         * To avoid NMMU hang while relaxing access we need to flush the tlb before
         * we set the new value. We need to do this only for radix, because hash
         * translation does flush when updating the linux pte.
         */
        if (is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) &&
            (atomic_read(&mm->context.copros) > 0))
                radix__flush_tlb_page(vma, addr);

        set_pte_at(mm, addr, ptep, pte);
}

int __init arch_ioremap_pud_supported(void)
{
        /* HPT does not cope with large pages in the vmalloc area */
        return radix_enabled();
}

int __init arch_ioremap_pmd_supported(void)
{
        return radix_enabled();
}

int p4d_free_pud_page(p4d_t *p4d, unsigned long addr)
{
        return 0;
}

int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
{
        pte_t *ptep = (pte_t *)pud;
        pte_t new_pud = pfn_pte(__phys_to_pfn(addr), prot);

        if (!radix_enabled())
                return 0;

        set_pte_at(&init_mm, 0 /* radix unused */, ptep, new_pud);

        return 1;
}

int pud_clear_huge(pud_t *pud)
{
        if (pud_huge(*pud)) {
                pud_clear(pud);
                return 1;
        }

        return 0;
}

int pud_free_pmd_page(pud_t *pud, unsigned long addr)
{
        pmd_t *pmd;
        int i;

        pmd = (pmd_t *)pud_page_vaddr(*pud);
        pud_clear(pud);

        flush_tlb_kernel_range(addr, addr + PUD_SIZE);

        for (i = 0; i < PTRS_PER_PMD; i++) {
                if (!pmd_none(pmd[i])) {
                        pte_t *pte;
                        pte = (pte_t *)pmd_page_vaddr(pmd[i]);

                        pte_free_kernel(&init_mm, pte);
                }
        }

        pmd_free(&init_mm, pmd);

        return 1;
}

int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
{
        pte_t *ptep = (pte_t *)pmd;
        pte_t new_pmd = pfn_pte(__phys_to_pfn(addr), prot);

        if (!radix_enabled())
                return 0;

        set_pte_at(&init_mm, 0 /* radix unused */, ptep, new_pmd);

        return 1;
}

int pmd_clear_huge(pmd_t *pmd)
{
        if (pmd_huge(*pmd)) {
                pmd_clear(pmd);
                return 1;
        }

        return 0;
}

int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
{
        pte_t *pte;

        pte = (pte_t *)pmd_page_vaddr(*pmd);
        pmd_clear(pmd);

        flush_tlb_kernel_range(addr, addr + PMD_SIZE);

        pte_free_kernel(&init_mm, pte);

        return 1;
}

int __init arch_ioremap_p4d_supported(void)
{
        return 0;
}