// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Page table handling routines for radix page table.
 *
 * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
 */

#define pr_fmt(fmt) "radix-mmu: " fmt

#include <linux/io.h>
#include <linux/kernel.h>
#include <linux/sched/mm.h>
#include <linux/memblock.h>
#include <linux/of_fdt.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/string_helpers.h>
#include <linux/stop_machine.h>

#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/mmu_context.h>
#include <asm/dma.h>
#include <asm/machdep.h>
#include <asm/mmu.h>
#include <asm/firmware.h>
#include <asm/powernv.h>
#include <asm/sections.h>
#include <asm/smp.h>
#include <asm/trace.h>
#include <asm/uaccess.h>
#include <asm/ultravisor.h>

#include <trace/events/thp.h>

unsigned int mmu_pid_bits;
unsigned int mmu_base_pid;

static __ref void *early_alloc_pgtable(unsigned long size, int nid,
			unsigned long region_start, unsigned long region_end)
{
	phys_addr_t min_addr = MEMBLOCK_LOW_LIMIT;
	phys_addr_t max_addr = MEMBLOCK_ALLOC_ANYWHERE;
	void *ptr;

	if (region_start)
		min_addr = region_start;
	if (region_end)
		max_addr = region_end;

	ptr = memblock_alloc_try_nid(size, size, min_addr, max_addr, nid);

	if (!ptr)
		panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa max_addr=%pa\n",
		      __func__, size, size, nid, &min_addr, &max_addr);

	return ptr;
}

static int early_map_kernel_page(unsigned long ea, unsigned long pa,
			  pgprot_t flags,
			  unsigned int map_page_size,
			  int nid,
			  unsigned long region_start, unsigned long region_end)
{
	unsigned long pfn = pa >> PAGE_SHIFT;
	pgd_t *pgdp;
	pud_t *pudp;
	pmd_t *pmdp;
	pte_t *ptep;

	pgdp = pgd_offset_k(ea);
	if (pgd_none(*pgdp)) {
		pudp = early_alloc_pgtable(PUD_TABLE_SIZE, nid,
					   region_start, region_end);
		pgd_populate(&init_mm, pgdp, pudp);
	}
	pudp = pud_offset(pgdp, ea);
	if (map_page_size == PUD_SIZE) {
		ptep = (pte_t *)pudp;
		goto set_the_pte;
	}
	if (pud_none(*pudp)) {
		pmdp = early_alloc_pgtable(PMD_TABLE_SIZE, nid,
					   region_start, region_end);
		pud_populate(&init_mm, pudp, pmdp);
	}
	pmdp = pmd_offset(pudp, ea);
	if (map_page_size == PMD_SIZE) {
		ptep = pmdp_ptep(pmdp);
		goto set_the_pte;
	}
	if (!pmd_present(*pmdp)) {
		ptep = early_alloc_pgtable(PAGE_SIZE, nid,
					   region_start, region_end);
		pmd_populate_kernel(&init_mm, pmdp, ptep);
	}
	ptep = pte_offset_kernel(pmdp, ea);

set_the_pte:
	set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
	smp_wmb();
	return 0;
}

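/*
 * Illustrative only: callers choose the mapping geometry via
 * map_page_size. A single small kernel page would be installed with
 *
 *	radix__map_kernel_page(vaddr, paddr, PAGE_KERNEL, PAGE_SIZE);
 *
 * while passing PMD_SIZE or PUD_SIZE installs a 2M or 1G leaf entry at
 * the corresponding page-table level, as handled above and in
 * __map_kernel_page() below.
 */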
/*
 * nid, region_start, and region_end are hints to try to place the page
 * table memory in the same node or region.
 */
static int __map_kernel_page(unsigned long ea, unsigned long pa,
			  pgprot_t flags,
			  unsigned int map_page_size,
			  int nid,
			  unsigned long region_start, unsigned long region_end)
{
	unsigned long pfn = pa >> PAGE_SHIFT;
	pgd_t *pgdp;
	pud_t *pudp;
	pmd_t *pmdp;
	pte_t *ptep;
	/*
	 * Make sure task size is correct as per the max addr
	 */
	BUILD_BUG_ON(TASK_SIZE_USER64 > RADIX_PGTABLE_RANGE);

#ifdef CONFIG_PPC_64K_PAGES
	BUILD_BUG_ON(RADIX_KERN_MAP_SIZE != (1UL << MAX_EA_BITS_PER_CONTEXT));
#endif

	if (unlikely(!slab_is_available()))
		return early_map_kernel_page(ea, pa, flags, map_page_size,
					     nid, region_start, region_end);

	/*
	 * Should make page table allocation functions be able to take a
	 * node, so we can place kernel page tables on the right nodes after
	 * boot.
	 */
	pgdp = pgd_offset_k(ea);
	pudp = pud_alloc(&init_mm, pgdp, ea);
	if (!pudp)
		return -ENOMEM;
	if (map_page_size == PUD_SIZE) {
		ptep = (pte_t *)pudp;
		goto set_the_pte;
	}
	pmdp = pmd_alloc(&init_mm, pudp, ea);
	if (!pmdp)
		return -ENOMEM;
	if (map_page_size == PMD_SIZE) {
		ptep = pmdp_ptep(pmdp);
		goto set_the_pte;
	}
	ptep = pte_alloc_kernel(pmdp, ea);
	if (!ptep)
		return -ENOMEM;

set_the_pte:
	set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
	smp_wmb();
	return 0;
}

int radix__map_kernel_page(unsigned long ea, unsigned long pa,
			  pgprot_t flags,
			  unsigned int map_page_size)
{
	return __map_kernel_page(ea, pa, flags, map_page_size, -1, 0, 0);
}

#ifdef CONFIG_STRICT_KERNEL_RWX
void radix__change_memory_range(unsigned long start, unsigned long end,
				unsigned long clear)
{
	unsigned long idx;
	pgd_t *pgdp;
	pud_t *pudp;
	pmd_t *pmdp;
	pte_t *ptep;

	start = ALIGN_DOWN(start, PAGE_SIZE);
	end = PAGE_ALIGN(end); // aligns up

	pr_debug("Changing flags on range %lx-%lx removing 0x%lx\n",
		 start, end, clear);

	for (idx = start; idx < end; idx += PAGE_SIZE) {
		pgdp = pgd_offset_k(idx);
		pudp = pud_alloc(&init_mm, pgdp, idx);
		if (!pudp)
			continue;
		if (pud_is_leaf(*pudp)) {
			ptep = (pte_t *)pudp;
			goto update_the_pte;
		}
		pmdp = pmd_alloc(&init_mm, pudp, idx);
		if (!pmdp)
			continue;
		if (pmd_is_leaf(*pmdp)) {
			ptep = pmdp_ptep(pmdp);
			goto update_the_pte;
		}
		ptep = pte_alloc_kernel(pmdp, idx);
		if (!ptep)
			continue;
update_the_pte:
		radix__pte_update(&init_mm, idx, ptep, clear, 0, 0);
	}

	radix__flush_tlb_kernel_range(start, end);
}

void radix__mark_rodata_ro(void)
{
	unsigned long start, end;

	start = (unsigned long)_stext;
	end = (unsigned long)__init_begin;

	radix__change_memory_range(start, end, _PAGE_WRITE);
}

void radix__mark_initmem_nx(void)
{
	unsigned long start = (unsigned long)__init_begin;
	unsigned long end = (unsigned long)__init_end;

	radix__change_memory_range(start, end, _PAGE_EXEC);
}
#endif /* CONFIG_STRICT_KERNEL_RWX */

static inline void __meminit
print_mapping(unsigned long start, unsigned long end, unsigned long size, bool exec)
{
	char buf[10];

	if (end <= start)
		return;

	string_get_size(size, 1, STRING_UNITS_2, buf, sizeof(buf));

	pr_info("Mapped 0x%016lx-0x%016lx with %s pages%s\n", start, end, buf,
		exec ? " (exec)" : "");
}

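/*
 * Size-selection sketch for create_physical_mapping() below (numbers are
 * illustrative): with 1G and 2M page sizes available, a 2M-aligned address
 * with a 3M gap to the next boundary gets one 2M mapping and the remaining
 * 1M is covered with base pages; a 1G mapping is only used when the address
 * is 1G aligned and at least 1G remains before the boundary.
 */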
" (exec)" : ""); 244 } 245 246 static unsigned long next_boundary(unsigned long addr, unsigned long end) 247 { 248 #ifdef CONFIG_STRICT_KERNEL_RWX 249 if (addr < __pa_symbol(__init_begin)) 250 return __pa_symbol(__init_begin); 251 #endif 252 return end; 253 } 254 255 static int __meminit create_physical_mapping(unsigned long start, 256 unsigned long end, 257 int nid, pgprot_t _prot) 258 { 259 unsigned long vaddr, addr, mapping_size = 0; 260 bool prev_exec, exec = false; 261 pgprot_t prot; 262 int psize; 263 264 start = _ALIGN_UP(start, PAGE_SIZE); 265 for (addr = start; addr < end; addr += mapping_size) { 266 unsigned long gap, previous_size; 267 int rc; 268 269 gap = next_boundary(addr, end) - addr; 270 previous_size = mapping_size; 271 prev_exec = exec; 272 273 if (IS_ALIGNED(addr, PUD_SIZE) && gap >= PUD_SIZE && 274 mmu_psize_defs[MMU_PAGE_1G].shift) { 275 mapping_size = PUD_SIZE; 276 psize = MMU_PAGE_1G; 277 } else if (IS_ALIGNED(addr, PMD_SIZE) && gap >= PMD_SIZE && 278 mmu_psize_defs[MMU_PAGE_2M].shift) { 279 mapping_size = PMD_SIZE; 280 psize = MMU_PAGE_2M; 281 } else { 282 mapping_size = PAGE_SIZE; 283 psize = mmu_virtual_psize; 284 } 285 286 vaddr = (unsigned long)__va(addr); 287 288 if (overlaps_kernel_text(vaddr, vaddr + mapping_size) || 289 overlaps_interrupt_vector_text(vaddr, vaddr + mapping_size)) { 290 prot = PAGE_KERNEL_X; 291 exec = true; 292 } else { 293 prot = _prot; 294 exec = false; 295 } 296 297 if (mapping_size != previous_size || exec != prev_exec) { 298 print_mapping(start, addr, previous_size, prev_exec); 299 start = addr; 300 } 301 302 rc = __map_kernel_page(vaddr, addr, prot, mapping_size, nid, start, end); 303 if (rc) 304 return rc; 305 306 update_page_count(psize, 1); 307 } 308 309 print_mapping(start, addr, mapping_size, exec); 310 return 0; 311 } 312 313 static void __init radix_init_pgtable(void) 314 { 315 unsigned long rts_field; 316 struct memblock_region *reg; 317 318 /* We don't support slb for radix */ 319 mmu_slb_size = 0; 320 /* 321 * Create the linear mapping, using standard page size for now 322 */ 323 for_each_memblock(memory, reg) { 324 /* 325 * The memblock allocator is up at this point, so the 326 * page tables will be allocated within the range. No 327 * need or a node (which we don't have yet). 328 */ 329 330 if ((reg->base + reg->size) >= RADIX_VMALLOC_START) { 331 pr_warn("Outside the supported range\n"); 332 continue; 333 } 334 335 WARN_ON(create_physical_mapping(reg->base, 336 reg->base + reg->size, 337 -1, PAGE_KERNEL)); 338 } 339 340 /* Find out how many PID bits are supported */ 341 if (!cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG)) { 342 if (!mmu_pid_bits) 343 mmu_pid_bits = 20; 344 mmu_base_pid = 1; 345 } else if (cpu_has_feature(CPU_FTR_HVMODE)) { 346 if (!mmu_pid_bits) 347 mmu_pid_bits = 20; 348 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE 349 /* 350 * When KVM is possible, we only use the top half of the 351 * PID space to avoid collisions between host and guest PIDs 352 * which can cause problems due to prefetch when exiting the 353 * guest with AIL=3 354 */ 355 mmu_base_pid = 1 << (mmu_pid_bits - 1); 356 #else 357 mmu_base_pid = 1; 358 #endif 359 } else { 360 /* The guest uses the bottom half of the PID space */ 361 if (!mmu_pid_bits) 362 mmu_pid_bits = 19; 363 mmu_base_pid = 1; 364 } 365 366 /* 367 * Allocate Partition table and process table for the 368 * host. 369 */ 370 BUG_ON(PRTB_SIZE_SHIFT > 36); 371 process_tb = early_alloc_pgtable(1UL << PRTB_SIZE_SHIFT, -1, 0, 0); 372 /* 373 * Fill in the process table. 
static void __init radix_init_partition_table(void)
{
	unsigned long rts_field, dw0, dw1;

	mmu_partition_table_init();
	rts_field = radix__get_tree_size();
	dw0 = rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE | PATB_HR;
	dw1 = __pa(process_tb) | (PRTB_SIZE_SHIFT - 12) | PATB_GR;
	mmu_partition_table_set_entry(0, dw0, dw1, false);

	pr_info("Initializing Radix MMU\n");
}

static int __init get_idx_from_shift(unsigned int shift)
{
	int idx = -1;

	switch (shift) {
	case 0xc:
		idx = MMU_PAGE_4K;
		break;
	case 0x10:
		idx = MMU_PAGE_64K;
		break;
	case 0x15:
		idx = MMU_PAGE_2M;
		break;
	case 0x1e:
		idx = MMU_PAGE_1G;
		break;
	}
	return idx;
}

static int __init radix_dt_scan_page_sizes(unsigned long node,
					   const char *uname, int depth,
					   void *data)
{
	int size = 0;
	int shift, idx;
	unsigned int ap;
	const __be32 *prop;
	const char *type = of_get_flat_dt_prop(node, "device_type", NULL);

	/* We are scanning "cpu" nodes only */
	if (type == NULL || strcmp(type, "cpu") != 0)
		return 0;

	/* Find MMU PID size */
	prop = of_get_flat_dt_prop(node, "ibm,mmu-pid-bits", &size);
	if (prop && size == 4)
		mmu_pid_bits = be32_to_cpup(prop);

	/* Grab page size encodings */
	prop = of_get_flat_dt_prop(node, "ibm,processor-radix-AP-encodings", &size);
	if (!prop)
		return 0;

	pr_info("Page sizes from device-tree:\n");
	for (; size >= 4; size -= 4, ++prop) {

		struct mmu_psize_def *def;

		/* top 3 bits are the AP encoding */
		shift = be32_to_cpu(prop[0]) & ~(0xe << 28);
		ap = be32_to_cpu(prop[0]) >> 29;
		pr_info("Page size shift = %d AP=0x%x\n", shift, ap);

		idx = get_idx_from_shift(shift);
		if (idx < 0)
			continue;

		def = &mmu_psize_defs[idx];
		def->shift = shift;
		def->ap = ap;
	}

	/* needed ? */
	cur_cpu_spec->mmu_features &= ~MMU_FTR_NO_SLBIE_B;
	return 1;
}

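/*
 * Decode example for the loop above (the cell value is illustrative, not
 * taken from a real device tree): 0xa0000010 splits into
 * ap = 0xa0000010 >> 29 = 0x5 and shift = 0xa0000010 & 0x1fffffff = 0x10,
 * i.e. a 64K page size with AP encoding 5, matching the fallback values
 * used in radix__early_init_devtree() below.
 */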
void __init radix__early_init_devtree(void)
{
	int rc;

	/*
	 * Try to find the available page sizes in the device-tree
	 */
	rc = of_scan_flat_dt(radix_dt_scan_page_sizes, NULL);
	if (rc != 0) /* Found */
		goto found;
	/*
	 * let's assume we have page 4k and 64k support
	 */
	mmu_psize_defs[MMU_PAGE_4K].shift = 12;
	mmu_psize_defs[MMU_PAGE_4K].ap = 0x0;

	mmu_psize_defs[MMU_PAGE_64K].shift = 16;
	mmu_psize_defs[MMU_PAGE_64K].ap = 0x5;
found:
	return;
}

static void radix_init_amor(void)
{
	/*
	 * In HV mode, we init AMOR (Authority Mask Override Register) so that
	 * the hypervisor and guest can set up the IAMR (Instruction Authority
	 * Mask Register), enable key 0 and set it to 1.
	 *
	 * AMOR = 0b1100 .... 0000 (Mask for key 0 is 11)
	 */
	mtspr(SPRN_AMOR, (3ul << 62));
}

#ifdef CONFIG_PPC_KUEP
void setup_kuep(bool disabled)
{
	if (disabled || !early_radix_enabled())
		return;

	if (smp_processor_id() == boot_cpuid)
		pr_info("Activating Kernel Userspace Execution Prevention\n");

	/*
	 * Radix always uses key0 of the IAMR to determine if an access is
	 * allowed. We set bit 0 (IBM bit 1) of key0, to prevent instruction
	 * fetch.
	 */
	mtspr(SPRN_IAMR, (1ul << 62));
}
#endif

#ifdef CONFIG_PPC_KUAP
void setup_kuap(bool disabled)
{
	if (disabled || !early_radix_enabled())
		return;

	if (smp_processor_id() == boot_cpuid) {
		pr_info("Activating Kernel Userspace Access Prevention\n");
		cur_cpu_spec->mmu_features |= MMU_FTR_RADIX_KUAP;
	}

	/* Make sure userspace can't change the AMR */
	mtspr(SPRN_UAMOR, 0);
	mtspr(SPRN_AMR, AMR_KUAP_BLOCKED);
	isync();
}
#endif

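/*
 * Note on the storage-key SPRs programmed above: keys are numbered from
 * the most-significant end, so key 0 occupies the top bits of
 * AMOR/IAMR/AMR (IBM bits 0-1). radix_init_amor() opens key 0 for
 * override (0b11 in the top two AMOR bits), setup_kuep() sets the key 0
 * fetch-block bit in the IAMR, and setup_kuap() writes AMR_KUAP_BLOCKED
 * and zeroes UAMOR so userspace cannot relax the AMR.
 */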
void __init radix__early_init_mmu(void)
{
	unsigned long lpcr;

#ifdef CONFIG_PPC_64K_PAGES
	/* PAGE_SIZE mappings */
	mmu_virtual_psize = MMU_PAGE_64K;
#else
	mmu_virtual_psize = MMU_PAGE_4K;
#endif

#ifdef CONFIG_SPARSEMEM_VMEMMAP
	/* vmemmap mapping */
	if (mmu_psize_defs[MMU_PAGE_2M].shift) {
		/*
		 * map vmemmap using 2M if available
		 */
		mmu_vmemmap_psize = MMU_PAGE_2M;
	} else
		mmu_vmemmap_psize = mmu_virtual_psize;
#endif
	/*
	 * initialize page table size
	 */
	__pte_index_size = RADIX_PTE_INDEX_SIZE;
	__pmd_index_size = RADIX_PMD_INDEX_SIZE;
	__pud_index_size = RADIX_PUD_INDEX_SIZE;
	__pgd_index_size = RADIX_PGD_INDEX_SIZE;
	__pud_cache_index = RADIX_PUD_INDEX_SIZE;
	__pte_table_size = RADIX_PTE_TABLE_SIZE;
	__pmd_table_size = RADIX_PMD_TABLE_SIZE;
	__pud_table_size = RADIX_PUD_TABLE_SIZE;
	__pgd_table_size = RADIX_PGD_TABLE_SIZE;

	__pmd_val_bits = RADIX_PMD_VAL_BITS;
	__pud_val_bits = RADIX_PUD_VAL_BITS;
	__pgd_val_bits = RADIX_PGD_VAL_BITS;

	__kernel_virt_start = RADIX_KERN_VIRT_START;
	__vmalloc_start = RADIX_VMALLOC_START;
	__vmalloc_end = RADIX_VMALLOC_END;
	__kernel_io_start = RADIX_KERN_IO_START;
	__kernel_io_end = RADIX_KERN_IO_END;
	vmemmap = (struct page *)RADIX_VMEMMAP_START;
	ioremap_bot = IOREMAP_BASE;

#ifdef CONFIG_PCI
	pci_io_base = ISA_IO_BASE;
#endif
	__pte_frag_nr = RADIX_PTE_FRAG_NR;
	__pte_frag_size_shift = RADIX_PTE_FRAG_SIZE_SHIFT;
	__pmd_frag_nr = RADIX_PMD_FRAG_NR;
	__pmd_frag_size_shift = RADIX_PMD_FRAG_SIZE_SHIFT;

	radix_init_pgtable();

	if (!firmware_has_feature(FW_FEATURE_LPAR)) {
		lpcr = mfspr(SPRN_LPCR);
		mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
		radix_init_partition_table();
		radix_init_amor();
	} else {
		radix_init_pseries();
	}

	memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE);

	/* Switch to the guard PID before turning on MMU */
	radix__switch_mmu_context(NULL, &init_mm);
	tlbiel_all();
}

void radix__early_init_mmu_secondary(void)
{
	unsigned long lpcr;
	/*
	 * update partition table control register and UPRT
	 */
	if (!firmware_has_feature(FW_FEATURE_LPAR)) {
		lpcr = mfspr(SPRN_LPCR);
		mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);

		set_ptcr_when_no_uv(__pa(partition_tb) |
				    (PATB_SIZE_SHIFT - 12));

		radix_init_amor();
	}

	radix__switch_mmu_context(NULL, &init_mm);
	tlbiel_all();
}

void radix__mmu_cleanup_all(void)
{
	unsigned long lpcr;

	if (!firmware_has_feature(FW_FEATURE_LPAR)) {
		lpcr = mfspr(SPRN_LPCR);
		mtspr(SPRN_LPCR, lpcr & ~LPCR_UPRT);
		set_ptcr_when_no_uv(0);
		powernv_set_nmmu_ptcr(0);
		radix__flush_tlb_all();
	}
}

void radix__setup_initial_memory_limit(phys_addr_t first_memblock_base,
				phys_addr_t first_memblock_size)
{
	/*
	 * We don't currently support the first MEMBLOCK not mapping 0
	 * physical on those processors
	 */
	BUG_ON(first_memblock_base != 0);

	/*
	 * Radix mode is not limited by RMA / VRMA addressing.
	 */
	ppc64_rma_size = ULONG_MAX;
}

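/*
 * Memory hot-unplug teardown below walks the kernel page table top-down:
 * remove_pagetable() -> remove_pud_table() -> remove_pmd_table() ->
 * remove_pte_table(). Leaf entries are cleared (huge mappings that only
 * partially overlap the removed range are split under stop_machine()),
 * page-table pages that become empty are freed, and the kernel TLB range
 * is flushed at the end.
 */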
#ifdef CONFIG_MEMORY_HOTPLUG
static void free_pte_table(pte_t *pte_start, pmd_t *pmd)
{
	pte_t *pte;
	int i;

	for (i = 0; i < PTRS_PER_PTE; i++) {
		pte = pte_start + i;
		if (!pte_none(*pte))
			return;
	}

	pte_free_kernel(&init_mm, pte_start);
	pmd_clear(pmd);
}

static void free_pmd_table(pmd_t *pmd_start, pud_t *pud)
{
	pmd_t *pmd;
	int i;

	for (i = 0; i < PTRS_PER_PMD; i++) {
		pmd = pmd_start + i;
		if (!pmd_none(*pmd))
			return;
	}

	pmd_free(&init_mm, pmd_start);
	pud_clear(pud);
}

struct change_mapping_params {
	pte_t *pte;
	unsigned long start;
	unsigned long end;
	unsigned long aligned_start;
	unsigned long aligned_end;
};

static int __meminit stop_machine_change_mapping(void *data)
{
	struct change_mapping_params *params =
			(struct change_mapping_params *)data;

	if (!data)
		return -1;

	spin_unlock(&init_mm.page_table_lock);
	pte_clear(&init_mm, params->aligned_start, params->pte);
	create_physical_mapping(__pa(params->aligned_start),
				__pa(params->start), -1, PAGE_KERNEL);
	create_physical_mapping(__pa(params->end), __pa(params->aligned_end),
				-1, PAGE_KERNEL);
	spin_lock(&init_mm.page_table_lock);
	return 0;
}

static void remove_pte_table(pte_t *pte_start, unsigned long addr,
			     unsigned long end)
{
	unsigned long next;
	pte_t *pte;

	pte = pte_start + pte_index(addr);
	for (; addr < end; addr = next, pte++) {
		next = (addr + PAGE_SIZE) & PAGE_MASK;
		if (next > end)
			next = end;

		if (!pte_present(*pte))
			continue;

		if (!PAGE_ALIGNED(addr) || !PAGE_ALIGNED(next)) {
			/*
			 * The vmemmap_free() and remove_section_mapping()
			 * codepaths call us with aligned addresses.
			 */
			WARN_ONCE(1, "%s: unaligned range\n", __func__);
			continue;
		}

		pte_clear(&init_mm, addr, pte);
	}
}

/*
 * clear the pte and potentially split the mapping helper
 */
static void __meminit split_kernel_mapping(unsigned long addr, unsigned long end,
				unsigned long size, pte_t *pte)
{
	unsigned long mask = ~(size - 1);
	unsigned long aligned_start = addr & mask;
	unsigned long aligned_end = addr + size;
	struct change_mapping_params params;
	bool split_region = false;

	if ((end - addr) < size) {
		/*
		 * We're going to clear the PTE, but have not yet flushed
		 * the mapping, so it is time to remap and flush. If the
		 * effects are visible outside the processor, or if we are
		 * running in code close to the mapping we cleared, we are
		 * in trouble.
		 */
		if (overlaps_kernel_text(aligned_start, addr) ||
			overlaps_kernel_text(end, aligned_end)) {
			/*
			 * Hack, just return, don't pte_clear
			 */
			WARN_ONCE(1, "Linear mapping %lx->%lx overlaps kernel "
				  "text, not splitting\n", addr, end);
			return;
		}
		split_region = true;
	}

	if (split_region) {
		params.pte = pte;
		params.start = addr;
		params.end = end;
		params.aligned_start = addr & ~(size - 1);
		params.aligned_end = min_t(unsigned long, aligned_end,
				(unsigned long)__va(memblock_end_of_DRAM()));
		stop_machine(stop_machine_change_mapping, &params, NULL);
		return;
	}

	pte_clear(&init_mm, addr, pte);
}

static void remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
			     unsigned long end)
{
	unsigned long next;
	pte_t *pte_base;
	pmd_t *pmd;

	pmd = pmd_start + pmd_index(addr);
	for (; addr < end; addr = next, pmd++) {
		next = pmd_addr_end(addr, end);

		if (!pmd_present(*pmd))
			continue;

		if (pmd_is_leaf(*pmd)) {
			split_kernel_mapping(addr, end, PMD_SIZE, (pte_t *)pmd);
			continue;
		}

		pte_base = (pte_t *)pmd_page_vaddr(*pmd);
		remove_pte_table(pte_base, addr, next);
		free_pte_table(pte_base, pmd);
	}
}

static void remove_pud_table(pud_t *pud_start, unsigned long addr,
			     unsigned long end)
{
	unsigned long next;
	pmd_t *pmd_base;
	pud_t *pud;

	pud = pud_start + pud_index(addr);
	for (; addr < end; addr = next, pud++) {
		next = pud_addr_end(addr, end);

		if (!pud_present(*pud))
			continue;

		if (pud_is_leaf(*pud)) {
			split_kernel_mapping(addr, end, PUD_SIZE, (pte_t *)pud);
			continue;
		}

		pmd_base = (pmd_t *)pud_page_vaddr(*pud);
		remove_pmd_table(pmd_base, addr, next);
		free_pmd_table(pmd_base, pud);
	}
}

static void __meminit remove_pagetable(unsigned long start, unsigned long end)
{
	unsigned long addr, next;
	pud_t *pud_base;
	pgd_t *pgd;

	spin_lock(&init_mm.page_table_lock);

	for (addr = start; addr < end; addr = next) {
		next = pgd_addr_end(addr, end);

		pgd = pgd_offset_k(addr);
		if (!pgd_present(*pgd))
			continue;

		if (pgd_is_leaf(*pgd)) {
			split_kernel_mapping(addr, end, PGDIR_SIZE, (pte_t *)pgd);
			continue;
		}

		pud_base = (pud_t *)pgd_page_vaddr(*pgd);
		remove_pud_table(pud_base, addr, next);
	}

	spin_unlock(&init_mm.page_table_lock);
	radix__flush_tlb_kernel_range(start, end);
}

int __meminit radix__create_section_mapping(unsigned long start,
					    unsigned long end, int nid,
					    pgprot_t prot)
{
	if (end >= RADIX_VMALLOC_START) {
		pr_warn("Outside the supported range\n");
		return -1;
	}

	return create_physical_mapping(__pa(start), __pa(end), nid, prot);
}

int __meminit radix__remove_section_mapping(unsigned long start, unsigned long end)
{
	remove_pagetable(start, end);
	return 0;
}
#endif /* CONFIG_MEMORY_HOTPLUG */

pr_warn("Outside the supported range\n"); 882 return -1; 883 } 884 885 return create_physical_mapping(__pa(start), __pa(end), nid, prot); 886 } 887 888 int __meminit radix__remove_section_mapping(unsigned long start, unsigned long end) 889 { 890 remove_pagetable(start, end); 891 return 0; 892 } 893 #endif /* CONFIG_MEMORY_HOTPLUG */ 894 895 #ifdef CONFIG_SPARSEMEM_VMEMMAP 896 static int __map_kernel_page_nid(unsigned long ea, unsigned long pa, 897 pgprot_t flags, unsigned int map_page_size, 898 int nid) 899 { 900 return __map_kernel_page(ea, pa, flags, map_page_size, nid, 0, 0); 901 } 902 903 int __meminit radix__vmemmap_create_mapping(unsigned long start, 904 unsigned long page_size, 905 unsigned long phys) 906 { 907 /* Create a PTE encoding */ 908 unsigned long flags = _PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_KERNEL_RW; 909 int nid = early_pfn_to_nid(phys >> PAGE_SHIFT); 910 int ret; 911 912 if ((start + page_size) >= RADIX_VMEMMAP_END) { 913 pr_warn("Outside the supported range\n"); 914 return -1; 915 } 916 917 ret = __map_kernel_page_nid(start, phys, __pgprot(flags), page_size, nid); 918 BUG_ON(ret); 919 920 return 0; 921 } 922 923 #ifdef CONFIG_MEMORY_HOTPLUG 924 void __meminit radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size) 925 { 926 remove_pagetable(start, start + page_size); 927 } 928 #endif 929 #endif 930 931 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 932 933 unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, 934 pmd_t *pmdp, unsigned long clr, 935 unsigned long set) 936 { 937 unsigned long old; 938 939 #ifdef CONFIG_DEBUG_VM 940 WARN_ON(!radix__pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp)); 941 assert_spin_locked(pmd_lockptr(mm, pmdp)); 942 #endif 943 944 old = radix__pte_update(mm, addr, (pte_t *)pmdp, clr, set, 1); 945 trace_hugepage_update(addr, old, clr, set); 946 947 return old; 948 } 949 950 pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, 951 pmd_t *pmdp) 952 953 { 954 pmd_t pmd; 955 956 VM_BUG_ON(address & ~HPAGE_PMD_MASK); 957 VM_BUG_ON(radix__pmd_trans_huge(*pmdp)); 958 VM_BUG_ON(pmd_devmap(*pmdp)); 959 /* 960 * khugepaged calls this for normal pmd 961 */ 962 pmd = *pmdp; 963 pmd_clear(pmdp); 964 965 /*FIXME!! Verify whether we need this kick below */ 966 serialize_against_pte_lookup(vma->vm_mm); 967 968 radix__flush_tlb_collapsed_pmd(vma->vm_mm, address); 969 970 return pmd; 971 } 972 973 /* 974 * For us pgtable_t is pte_t *. Inorder to save the deposisted 975 * page table, we consider the allocated page table as a list 976 * head. On withdraw we need to make sure we zero out the used 977 * list_head memory area. 
pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
	pte_t *ptep;
	pgtable_t pgtable;
	struct list_head *lh;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	pgtable = pmd_huge_pte(mm, pmdp);
	lh = (struct list_head *) pgtable;
	if (list_empty(lh))
		pmd_huge_pte(mm, pmdp) = NULL;
	else {
		pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
		list_del(lh);
	}
	ptep = (pte_t *) pgtable;
	*ptep = __pte(0);
	ptep++;
	*ptep = __pte(0);
	return pgtable;
}

pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm,
				     unsigned long addr, pmd_t *pmdp)
{
	pmd_t old_pmd;
	unsigned long old;

	old = radix__pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
	old_pmd = __pmd(old);
	/*
	 * Serialize against find_current_mm_pte which does lock-less
	 * lookup in page tables with local interrupts disabled. For huge pages
	 * it casts pmd_t to pte_t. Since format of pte_t is different from
	 * pmd_t we want to prevent transit from pmd pointing to page table
	 * to pmd pointing to huge page (and back) while interrupts are disabled.
	 * We clear pmd to possibly replace it with page table pointer in
	 * different code paths. So make sure we wait for the parallel
	 * find_current_mm_pte to finish.
	 */
	serialize_against_pte_lookup(mm);
	return old_pmd;
}

#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep,
				  pte_t entry, unsigned long address, int psize)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_ACCESSED |
					      _PAGE_RW | _PAGE_EXEC);

	unsigned long change = pte_val(entry) ^ pte_val(*ptep);
	/*
	 * To avoid NMMU hang while relaxing access, we need to mark
	 * the pte invalid in between.
	 */
	if ((change & _PAGE_RW) && atomic_read(&mm->context.copros) > 0) {
		unsigned long old_pte, new_pte;

		old_pte = __radix_pte_update(ptep, _PAGE_PRESENT, _PAGE_INVALID);
		/*
		 * new value of pte
		 */
		new_pte = old_pte | set;
		radix__flush_tlb_page_psize(mm, address, psize);
		__radix_pte_update(ptep, _PAGE_INVALID, new_pte);
	} else {
		__radix_pte_update(ptep, 0, set);
		/*
		 * Book3S does not require a TLB flush when relaxing access
		 * restrictions when the address space is not attached to a
		 * NMMU, because the core MMU will reload the pte after taking
		 * an access fault, which is defined by the architecture.
		 */
	}
	/* See ptesync comment in radix__set_pte_at */
}

void radix__ptep_modify_prot_commit(struct vm_area_struct *vma,
				    unsigned long addr, pte_t *ptep,
				    pte_t old_pte, pte_t pte)
{
	struct mm_struct *mm = vma->vm_mm;

	/*
	 * To avoid NMMU hang while relaxing access we need to flush the tlb
	 * before we set the new value. We need to do this only for radix,
	 * because hash translation does flush when updating the linux pte.
	 */
	if (is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) &&
	    (atomic_read(&mm->context.copros) > 0))
		radix__flush_tlb_page(vma, addr);

	set_pte_at(mm, addr, ptep, pte);
}

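/*
 * The helpers below back the generic huge-vmap/ioremap code:
 * pud_set_huge()/pmd_set_huge() install a 1G/2M leaf entry in the kernel
 * table, pud_clear_huge()/pmd_clear_huge() remove one, and the
 * *_free_*_page() variants tear down a no-longer-needed lower-level table
 * (flushing the kernel TLB range) so the region can be remapped at the
 * larger size.
 */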
int __init arch_ioremap_pud_supported(void)
{
	/* HPT does not cope with large pages in the vmalloc area */
	return radix_enabled();
}

int __init arch_ioremap_pmd_supported(void)
{
	return radix_enabled();
}

int p4d_free_pud_page(p4d_t *p4d, unsigned long addr)
{
	return 0;
}

int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
{
	pte_t *ptep = (pte_t *)pud;
	pte_t new_pud = pfn_pte(__phys_to_pfn(addr), prot);

	if (!radix_enabled())
		return 0;

	set_pte_at(&init_mm, 0 /* radix unused */, ptep, new_pud);

	return 1;
}

int pud_clear_huge(pud_t *pud)
{
	if (pud_huge(*pud)) {
		pud_clear(pud);
		return 1;
	}

	return 0;
}

int pud_free_pmd_page(pud_t *pud, unsigned long addr)
{
	pmd_t *pmd;
	int i;

	pmd = (pmd_t *)pud_page_vaddr(*pud);
	pud_clear(pud);

	flush_tlb_kernel_range(addr, addr + PUD_SIZE);

	for (i = 0; i < PTRS_PER_PMD; i++) {
		if (!pmd_none(pmd[i])) {
			pte_t *pte;
			pte = (pte_t *)pmd_page_vaddr(pmd[i]);

			pte_free_kernel(&init_mm, pte);
		}
	}

	pmd_free(&init_mm, pmd);

	return 1;
}

int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
{
	pte_t *ptep = (pte_t *)pmd;
	pte_t new_pmd = pfn_pte(__phys_to_pfn(addr), prot);

	if (!radix_enabled())
		return 0;

	set_pte_at(&init_mm, 0 /* radix unused */, ptep, new_pmd);

	return 1;
}

int pmd_clear_huge(pmd_t *pmd)
{
	if (pmd_huge(*pmd)) {
		pmd_clear(pmd);
		return 1;
	}

	return 0;
}

int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
{
	pte_t *pte;

	pte = (pte_t *)pmd_page_vaddr(*pmd);
	pmd_clear(pmd);

	flush_tlb_kernel_range(addr, addr + PMD_SIZE);

	pte_free_kernel(&init_mm, pte);

	return 1;
}

int __init arch_ioremap_p4d_supported(void)
{
	return 0;
}