/*
 * Page table handling routines for radix page table.
 *
 * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) "radix-mmu: " fmt

#include <linux/kernel.h>
#include <linux/sched/mm.h>
#include <linux/memblock.h>
#include <linux/of_fdt.h>
#include <linux/mm.h>
#include <linux/string_helpers.h>
#include <linux/stop_machine.h>

#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/mmu_context.h>
#include <asm/dma.h>
#include <asm/machdep.h>
#include <asm/mmu.h>
#include <asm/firmware.h>
#include <asm/powernv.h>
#include <asm/sections.h>
#include <asm/trace.h>
#include <asm/uaccess.h>

#include <trace/events/thp.h>

unsigned int mmu_pid_bits;
unsigned int mmu_base_pid;

static int native_register_process_table(unsigned long base, unsigned long pg_sz,
					  unsigned long table_size)
{
	unsigned long patb0, patb1;

	patb0 = be64_to_cpu(partition_tb[0].patb0);
	patb1 = base | table_size | PATB_GR;

	mmu_partition_table_set_entry(0, patb0, patb1);

	return 0;
}

static __ref void *early_alloc_pgtable(unsigned long size, int nid,
				       unsigned long region_start, unsigned long region_end)
{
	phys_addr_t min_addr = MEMBLOCK_LOW_LIMIT;
	phys_addr_t max_addr = MEMBLOCK_ALLOC_ANYWHERE;
	void *ptr;

	if (region_start)
		min_addr = region_start;
	if (region_end)
		max_addr = region_end;

	ptr = memblock_alloc_try_nid(size, size, min_addr, max_addr, nid);

	if (!ptr)
		panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa max_addr=%pa\n",
		      __func__, size, size, nid, &min_addr, &max_addr);

	return ptr;
}

static int early_map_kernel_page(unsigned long ea, unsigned long pa,
				 pgprot_t flags,
				 unsigned int map_page_size,
				 int nid,
				 unsigned long region_start, unsigned long region_end)
{
	unsigned long pfn = pa >> PAGE_SHIFT;
	pgd_t *pgdp;
	pud_t *pudp;
	pmd_t *pmdp;
	pte_t *ptep;

	pgdp = pgd_offset_k(ea);
	if (pgd_none(*pgdp)) {
		pudp = early_alloc_pgtable(PUD_TABLE_SIZE, nid,
					   region_start, region_end);
		pgd_populate(&init_mm, pgdp, pudp);
	}
	pudp = pud_offset(pgdp, ea);
	if (map_page_size == PUD_SIZE) {
		ptep = (pte_t *)pudp;
		goto set_the_pte;
	}
	if (pud_none(*pudp)) {
		pmdp = early_alloc_pgtable(PMD_TABLE_SIZE, nid,
					   region_start, region_end);
		pud_populate(&init_mm, pudp, pmdp);
	}
	pmdp = pmd_offset(pudp, ea);
	if (map_page_size == PMD_SIZE) {
		ptep = pmdp_ptep(pmdp);
		goto set_the_pte;
	}
	if (!pmd_present(*pmdp)) {
		ptep = early_alloc_pgtable(PAGE_SIZE, nid,
					   region_start, region_end);
		pmd_populate_kernel(&init_mm, pmdp, ptep);
	}
	ptep = pte_offset_kernel(pmdp, ea);

set_the_pte:
	set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
	smp_wmb();
	return 0;
}

/*
 * nid, region_start, and region_end are hints to try to place the page
 * table memory in the same node or region.
 */
static int __map_kernel_page(unsigned long ea, unsigned long pa,
			     pgprot_t flags,
			     unsigned int map_page_size,
			     int nid,
			     unsigned long region_start, unsigned long region_end)
{
	unsigned long pfn = pa >> PAGE_SHIFT;
	pgd_t *pgdp;
	pud_t *pudp;
	pmd_t *pmdp;
	pte_t *ptep;
	/*
	 * Make sure the task size is correct as per the max addr
	 */
	BUILD_BUG_ON(TASK_SIZE_USER64 > RADIX_PGTABLE_RANGE);

#ifdef CONFIG_PPC_64K_PAGES
	BUILD_BUG_ON(RADIX_KERN_MAP_SIZE != (1UL << MAX_EA_BITS_PER_CONTEXT));
#endif

	if (unlikely(!slab_is_available()))
		return early_map_kernel_page(ea, pa, flags, map_page_size,
					     nid, region_start, region_end);

	/*
	 * We should make the page table allocation functions able to take a
	 * node, so we can place kernel page tables on the right nodes after
	 * boot.
	 */
	pgdp = pgd_offset_k(ea);
	pudp = pud_alloc(&init_mm, pgdp, ea);
	if (!pudp)
		return -ENOMEM;
	if (map_page_size == PUD_SIZE) {
		ptep = (pte_t *)pudp;
		goto set_the_pte;
	}
	pmdp = pmd_alloc(&init_mm, pudp, ea);
	if (!pmdp)
		return -ENOMEM;
	if (map_page_size == PMD_SIZE) {
		ptep = pmdp_ptep(pmdp);
		goto set_the_pte;
	}
	ptep = pte_alloc_kernel(pmdp, ea);
	if (!ptep)
		return -ENOMEM;

set_the_pte:
	set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
	smp_wmb();
	return 0;
}

int radix__map_kernel_page(unsigned long ea, unsigned long pa,
			   pgprot_t flags,
			   unsigned int map_page_size)
{
	return __map_kernel_page(ea, pa, flags, map_page_size, -1, 0, 0);
}

#ifdef CONFIG_STRICT_KERNEL_RWX
void radix__change_memory_range(unsigned long start, unsigned long end,
				unsigned long clear)
{
	unsigned long idx;
	pgd_t *pgdp;
	pud_t *pudp;
	pmd_t *pmdp;
	pte_t *ptep;

	start = ALIGN_DOWN(start, PAGE_SIZE);
	end = PAGE_ALIGN(end); /* aligns up */

	pr_debug("Changing flags on range %lx-%lx removing 0x%lx\n",
		 start, end, clear);

	for (idx = start; idx < end; idx += PAGE_SIZE) {
		pgdp = pgd_offset_k(idx);
		pudp = pud_alloc(&init_mm, pgdp, idx);
		if (!pudp)
			continue;
		if (pud_huge(*pudp)) {
			ptep = (pte_t *)pudp;
			goto update_the_pte;
		}
		pmdp = pmd_alloc(&init_mm, pudp, idx);
		if (!pmdp)
			continue;
		if (pmd_huge(*pmdp)) {
			ptep = pmdp_ptep(pmdp);
			goto update_the_pte;
		}
		ptep = pte_alloc_kernel(pmdp, idx);
		if (!ptep)
			continue;
update_the_pte:
		radix__pte_update(&init_mm, idx, ptep, clear, 0, 0);
	}

	radix__flush_tlb_kernel_range(start, end);
}

void radix__mark_rodata_ro(void)
{
	unsigned long start, end;

	start = (unsigned long)_stext;
	end = (unsigned long)__init_begin;

	radix__change_memory_range(start, end, _PAGE_WRITE);
}

void radix__mark_initmem_nx(void)
{
	unsigned long start = (unsigned long)__init_begin;
	unsigned long end = (unsigned long)__init_end;

	radix__change_memory_range(start, end, _PAGE_EXEC);
}
#endif /* CONFIG_STRICT_KERNEL_RWX */

static inline void __meminit
print_mapping(unsigned long start, unsigned long end, unsigned long size, bool exec)
{
	char buf[10];

	if (end <= start)
		return;

	string_get_size(size, 1, STRING_UNITS_2, buf, sizeof(buf));

	pr_info("Mapped 0x%016lx-0x%016lx with %s pages%s\n", start, end, buf,
" (exec)" : ""); 257 } 258 259 static unsigned long next_boundary(unsigned long addr, unsigned long end) 260 { 261 #ifdef CONFIG_STRICT_KERNEL_RWX 262 if (addr < __pa_symbol(__init_begin)) 263 return __pa_symbol(__init_begin); 264 #endif 265 return end; 266 } 267 268 static int __meminit create_physical_mapping(unsigned long start, 269 unsigned long end, 270 int nid) 271 { 272 unsigned long vaddr, addr, mapping_size = 0; 273 bool prev_exec, exec = false; 274 pgprot_t prot; 275 int psize; 276 277 start = _ALIGN_UP(start, PAGE_SIZE); 278 for (addr = start; addr < end; addr += mapping_size) { 279 unsigned long gap, previous_size; 280 int rc; 281 282 gap = next_boundary(addr, end) - addr; 283 previous_size = mapping_size; 284 prev_exec = exec; 285 286 if (IS_ALIGNED(addr, PUD_SIZE) && gap >= PUD_SIZE && 287 mmu_psize_defs[MMU_PAGE_1G].shift) { 288 mapping_size = PUD_SIZE; 289 psize = MMU_PAGE_1G; 290 } else if (IS_ALIGNED(addr, PMD_SIZE) && gap >= PMD_SIZE && 291 mmu_psize_defs[MMU_PAGE_2M].shift) { 292 mapping_size = PMD_SIZE; 293 psize = MMU_PAGE_2M; 294 } else { 295 mapping_size = PAGE_SIZE; 296 psize = mmu_virtual_psize; 297 } 298 299 vaddr = (unsigned long)__va(addr); 300 301 if (overlaps_kernel_text(vaddr, vaddr + mapping_size) || 302 overlaps_interrupt_vector_text(vaddr, vaddr + mapping_size)) { 303 prot = PAGE_KERNEL_X; 304 exec = true; 305 } else { 306 prot = PAGE_KERNEL; 307 exec = false; 308 } 309 310 if (mapping_size != previous_size || exec != prev_exec) { 311 print_mapping(start, addr, previous_size, prev_exec); 312 start = addr; 313 } 314 315 rc = __map_kernel_page(vaddr, addr, prot, mapping_size, nid, start, end); 316 if (rc) 317 return rc; 318 319 update_page_count(psize, 1); 320 } 321 322 print_mapping(start, addr, mapping_size, exec); 323 return 0; 324 } 325 326 void __init radix_init_pgtable(void) 327 { 328 unsigned long rts_field; 329 struct memblock_region *reg; 330 331 /* We don't support slb for radix */ 332 mmu_slb_size = 0; 333 /* 334 * Create the linear mapping, using standard page size for now 335 */ 336 for_each_memblock(memory, reg) { 337 /* 338 * The memblock allocator is up at this point, so the 339 * page tables will be allocated within the range. No 340 * need or a node (which we don't have yet). 341 */ 342 343 if ((reg->base + reg->size) >= RADIX_VMALLOC_START) { 344 pr_warn("Outside the supported range\n"); 345 continue; 346 } 347 348 WARN_ON(create_physical_mapping(reg->base, 349 reg->base + reg->size, 350 -1)); 351 } 352 353 /* Find out how many PID bits are supported */ 354 if (cpu_has_feature(CPU_FTR_HVMODE)) { 355 if (!mmu_pid_bits) 356 mmu_pid_bits = 20; 357 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE 358 /* 359 * When KVM is possible, we only use the top half of the 360 * PID space to avoid collisions between host and guest PIDs 361 * which can cause problems due to prefetch when exiting the 362 * guest with AIL=3 363 */ 364 mmu_base_pid = 1 << (mmu_pid_bits - 1); 365 #else 366 mmu_base_pid = 1; 367 #endif 368 } else { 369 /* The guest uses the bottom half of the PID space */ 370 if (!mmu_pid_bits) 371 mmu_pid_bits = 19; 372 mmu_base_pid = 1; 373 } 374 375 /* 376 * Allocate Partition table and process table for the 377 * host. 378 */ 379 BUG_ON(PRTB_SIZE_SHIFT > 36); 380 process_tb = early_alloc_pgtable(1UL << PRTB_SIZE_SHIFT, -1, 0, 0); 381 /* 382 * Fill in the process table. 383 */ 384 rts_field = radix__get_tree_size(); 385 process_tb->prtb0 = cpu_to_be64(rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE); 386 /* 387 * Fill in the partition table. 
	 * Fill in the partition table. We are supposed to use the effective
	 * address of the process table here. But our linear mapping also
	 * enables us to use the physical address here.
	 */
	register_process_table(__pa(process_tb), 0, PRTB_SIZE_SHIFT - 12);
	pr_info("Process table %p and radix root for kernel: %p\n", process_tb, init_mm.pgd);
	asm volatile("ptesync" : : : "memory");
	asm volatile(PPC_TLBIE_5(%0,%1,2,1,1) : :
		     "r" (TLBIEL_INVAL_SET_LPID), "r" (0));
	asm volatile("eieio; tlbsync; ptesync" : : : "memory");
	trace_tlbie(0, 0, TLBIEL_INVAL_SET_LPID, 0, 2, 1, 1);

	/*
	 * The init_mm context is given the first available (non-zero) PID,
	 * which is the "guard PID" and contains no page table. PIDR should
	 * never be set to zero because that duplicates the kernel address
	 * space at the 0x0... offset (quadrant 0)!
	 *
	 * An arbitrary PID that may later be allocated by the PID allocator
	 * for userspace processes must not be used either, because that
	 * would cause stale user mappings for that PID on CPUs outside of
	 * the TLB invalidation scheme (because it won't be in mm_cpumask).
	 *
	 * So permanently carve out one PID for the purpose of a guard PID.
	 */
	init_mm.context.id = mmu_base_pid;
	mmu_base_pid++;
}

static void __init radix_init_partition_table(void)
{
	unsigned long rts_field, dw0;

	mmu_partition_table_init();
	rts_field = radix__get_tree_size();
	dw0 = rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE | PATB_HR;
	mmu_partition_table_set_entry(0, dw0, 0);

	pr_info("Initializing Radix MMU\n");
	pr_info("Partition table %p\n", partition_tb);
}

void __init radix_init_native(void)
{
	register_process_table = native_register_process_table;
}

static int __init get_idx_from_shift(unsigned int shift)
{
	int idx = -1;

	switch (shift) {
	case 0xc:
		idx = MMU_PAGE_4K;
		break;
	case 0x10:
		idx = MMU_PAGE_64K;
		break;
	case 0x15:
		idx = MMU_PAGE_2M;
		break;
	case 0x1e:
		idx = MMU_PAGE_1G;
		break;
	}
	return idx;
}

static int __init radix_dt_scan_page_sizes(unsigned long node,
					   const char *uname, int depth,
					   void *data)
{
	int size = 0;
	int shift, idx;
	unsigned int ap;
	const __be32 *prop;
	const char *type = of_get_flat_dt_prop(node, "device_type", NULL);

	/* We are scanning "cpu" nodes only */
	if (type == NULL || strcmp(type, "cpu") != 0)
		return 0;

	/* Find MMU PID size */
	prop = of_get_flat_dt_prop(node, "ibm,mmu-pid-bits", &size);
	if (prop && size == 4)
		mmu_pid_bits = be32_to_cpup(prop);

	/* Grab page size encodings */
	prop = of_get_flat_dt_prop(node, "ibm,processor-radix-AP-encodings", &size);
	if (!prop)
		return 0;

	pr_info("Page sizes from device-tree:\n");
	for (; size >= 4; size -= 4, ++prop) {

		struct mmu_psize_def *def;

		/* top 3 bits are the AP encoding */
		shift = be32_to_cpu(prop[0]) & ~(0xe << 28);
		ap = be32_to_cpu(prop[0]) >> 29;
		pr_info("Page size shift = %d AP=0x%x\n", shift, ap);

		idx = get_idx_from_shift(shift);
		if (idx < 0)
			continue;

		def = &mmu_psize_defs[idx];
		def->shift = shift;
		def->ap = ap;
	}

	/* needed ? */
	cur_cpu_spec->mmu_features &= ~MMU_FTR_NO_SLBIE_B;
	return 1;
}

void __init radix__early_init_devtree(void)
{
	int rc;

	/*
	 * Try to find the available page sizes in the device-tree
	 */
	rc = of_scan_flat_dt(radix_dt_scan_page_sizes, NULL);
	if (rc != 0) /* Found */
		goto found;
	/*
	 * Let's assume we have 4K and 64K page support
	 */
	mmu_psize_defs[MMU_PAGE_4K].shift = 12;
	mmu_psize_defs[MMU_PAGE_4K].ap = 0x0;

	mmu_psize_defs[MMU_PAGE_64K].shift = 16;
	mmu_psize_defs[MMU_PAGE_64K].ap = 0x5;
found:
#ifdef CONFIG_SPARSEMEM_VMEMMAP
	if (mmu_psize_defs[MMU_PAGE_2M].shift) {
		/*
		 * map vmemmap using 2M if available
		 */
		mmu_vmemmap_psize = MMU_PAGE_2M;
	}
#endif /* CONFIG_SPARSEMEM_VMEMMAP */
	return;
}

static void radix_init_amor(void)
{
	/*
	 * In HV mode, we init AMOR (Authority Mask Override Register) so that
	 * the hypervisor and guest can set up IAMR (Instruction Authority Mask
	 * Register), enable key 0 and set it to 1.
	 *
	 * AMOR = 0b1100 .... 0000 (Mask for key 0 is 11)
	 */
	mtspr(SPRN_AMOR, (3ul << 62));
}

#ifdef CONFIG_PPC_KUEP
void setup_kuep(bool disabled)
{
	if (disabled || !early_radix_enabled())
		return;

	if (smp_processor_id() == boot_cpuid)
		pr_info("Activating Kernel Userspace Execution Prevention\n");

	/*
	 * Radix always uses key0 of the IAMR to determine if an access is
	 * allowed. We set bit 0 (IBM bit 1) of key0, to prevent instruction
	 * fetch.
	 */
	mtspr(SPRN_IAMR, (1ul << 62));
}
#endif

#ifdef CONFIG_PPC_KUAP
void setup_kuap(bool disabled)
{
	if (disabled || !early_radix_enabled())
		return;

	if (smp_processor_id() == boot_cpuid) {
		pr_info("Activating Kernel Userspace Access Prevention\n");
		cur_cpu_spec->mmu_features |= MMU_FTR_RADIX_KUAP;
	}

	/* Make sure userspace can't change the AMR */
	mtspr(SPRN_UAMOR, 0);
	mtspr(SPRN_AMR, AMR_KUAP_BLOCKED);
	isync();
}
#endif

void __init radix__early_init_mmu(void)
{
	unsigned long lpcr;

#ifdef CONFIG_PPC_64K_PAGES
	/* PAGE_SIZE mappings */
	mmu_virtual_psize = MMU_PAGE_64K;
#else
	mmu_virtual_psize = MMU_PAGE_4K;
#endif

#ifdef CONFIG_SPARSEMEM_VMEMMAP
	/* vmemmap mapping */
	mmu_vmemmap_psize = mmu_virtual_psize;
#endif
	/*
	 * initialize page table size
	 */
	__pte_index_size = RADIX_PTE_INDEX_SIZE;
	__pmd_index_size = RADIX_PMD_INDEX_SIZE;
	__pud_index_size = RADIX_PUD_INDEX_SIZE;
	__pgd_index_size = RADIX_PGD_INDEX_SIZE;
	__pud_cache_index = RADIX_PUD_INDEX_SIZE;
	__pte_table_size = RADIX_PTE_TABLE_SIZE;
	__pmd_table_size = RADIX_PMD_TABLE_SIZE;
	__pud_table_size = RADIX_PUD_TABLE_SIZE;
	__pgd_table_size = RADIX_PGD_TABLE_SIZE;

	__pmd_val_bits = RADIX_PMD_VAL_BITS;
	__pud_val_bits = RADIX_PUD_VAL_BITS;
	__pgd_val_bits = RADIX_PGD_VAL_BITS;

	__kernel_virt_start = RADIX_KERN_VIRT_START;
	__vmalloc_start = RADIX_VMALLOC_START;
	__vmalloc_end = RADIX_VMALLOC_END;
	__kernel_io_start = RADIX_KERN_IO_START;
	__kernel_io_end = RADIX_KERN_IO_END;
	vmemmap = (struct page *)RADIX_VMEMMAP_START;
	ioremap_bot = IOREMAP_BASE;

#ifdef CONFIG_PCI
	pci_io_base = ISA_IO_BASE;
#endif
	__pte_frag_nr = RADIX_PTE_FRAG_NR;
	__pte_frag_size_shift = RADIX_PTE_FRAG_SIZE_SHIFT;
	__pmd_frag_nr = RADIX_PMD_FRAG_NR;
	__pmd_frag_size_shift = RADIX_PMD_FRAG_SIZE_SHIFT;

	if (!firmware_has_feature(FW_FEATURE_LPAR)) {
		radix_init_native();
		lpcr = mfspr(SPRN_LPCR);
		mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
		radix_init_partition_table();
		radix_init_amor();
	} else {
		radix_init_pseries();
	}

	memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE);

	radix_init_pgtable();
	/* Switch to the guard PID before turning on MMU */
	radix__switch_mmu_context(NULL, &init_mm);
	if (cpu_has_feature(CPU_FTR_HVMODE))
		tlbiel_all();
}

void radix__early_init_mmu_secondary(void)
{
	unsigned long lpcr;
	/*
	 * update partition table control register and UPRT
	 */
	if (!firmware_has_feature(FW_FEATURE_LPAR)) {
		lpcr = mfspr(SPRN_LPCR);
		mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);

		mtspr(SPRN_PTCR,
		      __pa(partition_tb) | (PATB_SIZE_SHIFT - 12));
		radix_init_amor();
	}

	radix__switch_mmu_context(NULL, &init_mm);
	if (cpu_has_feature(CPU_FTR_HVMODE))
		tlbiel_all();
}

void radix__mmu_cleanup_all(void)
{
	unsigned long lpcr;

	if (!firmware_has_feature(FW_FEATURE_LPAR)) {
		lpcr = mfspr(SPRN_LPCR);
		mtspr(SPRN_LPCR, lpcr & ~LPCR_UPRT);
		mtspr(SPRN_PTCR, 0);
		powernv_set_nmmu_ptcr(0);
		radix__flush_tlb_all();
	}
}

void radix__setup_initial_memory_limit(phys_addr_t first_memblock_base,
				       phys_addr_t first_memblock_size)
{
	/*
	 * We don't currently support the first MEMBLOCK not mapping 0
	 * physical on those processors
	 */
	BUG_ON(first_memblock_base != 0);

	/*
	 * Radix mode is not limited by RMA / VRMA addressing.
	 */
	ppc64_rma_size = ULONG_MAX;
}

#ifdef CONFIG_MEMORY_HOTPLUG
static void free_pte_table(pte_t *pte_start, pmd_t *pmd)
{
	pte_t *pte;
	int i;

	for (i = 0; i < PTRS_PER_PTE; i++) {
		pte = pte_start + i;
		if (!pte_none(*pte))
			return;
	}

	pte_free_kernel(&init_mm, pte_start);
	pmd_clear(pmd);
}

static void free_pmd_table(pmd_t *pmd_start, pud_t *pud)
{
	pmd_t *pmd;
	int i;

	for (i = 0; i < PTRS_PER_PMD; i++) {
		pmd = pmd_start + i;
		if (!pmd_none(*pmd))
			return;
	}

	pmd_free(&init_mm, pmd_start);
	pud_clear(pud);
}

struct change_mapping_params {
	pte_t *pte;
	unsigned long start;
	unsigned long end;
	unsigned long aligned_start;
	unsigned long aligned_end;
};

static int __meminit stop_machine_change_mapping(void *data)
{
	struct change_mapping_params *params =
		(struct change_mapping_params *)data;

	if (!data)
		return -1;

	spin_unlock(&init_mm.page_table_lock);
	pte_clear(&init_mm, params->aligned_start, params->pte);
	create_physical_mapping(params->aligned_start, params->start, -1);
	create_physical_mapping(params->end, params->aligned_end, -1);
	spin_lock(&init_mm.page_table_lock);
	return 0;
}

static void remove_pte_table(pte_t *pte_start, unsigned long addr,
			     unsigned long end)
{
	unsigned long next;
	pte_t *pte;

	pte = pte_start + pte_index(addr);
	for (; addr < end; addr = next, pte++) {
		next = (addr + PAGE_SIZE) & PAGE_MASK;
		if (next > end)
			next = end;

		if (!pte_present(*pte))
			continue;

		if (!PAGE_ALIGNED(addr) || !PAGE_ALIGNED(next)) {
			/*
			 * The vmemmap_free() and remove_section_mapping()
			 * codepaths call us with aligned addresses.
			 */
			WARN_ONCE(1, "%s: unaligned range\n", __func__);
			continue;
		}

		pte_clear(&init_mm, addr, pte);
	}
}

/*
 * Helper to clear the pte and potentially split the mapping.
 */
static void __meminit split_kernel_mapping(unsigned long addr, unsigned long end,
					   unsigned long size, pte_t *pte)
{
	unsigned long mask = ~(size - 1);
	unsigned long aligned_start = addr & mask;
	unsigned long aligned_end = addr + size;
	struct change_mapping_params params;
	bool split_region = false;

	if ((end - addr) < size) {
		/*
		 * We're going to clear the PTE, but we have not flushed
		 * the mapping yet, so it is time to remap and flush. If
		 * the effects are visible outside the processor, or if
		 * we are running in code close to the mapping we
		 * cleared, we are in trouble.
		 */
		if (overlaps_kernel_text(aligned_start, addr) ||
		    overlaps_kernel_text(end, aligned_end)) {
			/*
			 * Hack, just return, don't pte_clear
			 */
			WARN_ONCE(1, "Linear mapping %lx->%lx overlaps kernel "
				  "text, not splitting\n", addr, end);
			return;
		}
		split_region = true;
	}

	if (split_region) {
		params.pte = pte;
		params.start = addr;
		params.end = end;
		params.aligned_start = addr & ~(size - 1);
		params.aligned_end = min_t(unsigned long, aligned_end,
				(unsigned long)__va(memblock_end_of_DRAM()));
		stop_machine(stop_machine_change_mapping, &params, NULL);
		return;
	}

	pte_clear(&init_mm, addr, pte);
}

static void remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
			     unsigned long end)
{
	unsigned long next;
	pte_t *pte_base;
	pmd_t *pmd;

	pmd = pmd_start + pmd_index(addr);
	for (; addr < end; addr = next, pmd++) {
		next = pmd_addr_end(addr, end);

		if (!pmd_present(*pmd))
			continue;

		if (pmd_huge(*pmd)) {
			split_kernel_mapping(addr, end, PMD_SIZE, (pte_t *)pmd);
			continue;
		}

		pte_base = (pte_t *)pmd_page_vaddr(*pmd);
		remove_pte_table(pte_base, addr, next);
		free_pte_table(pte_base, pmd);
	}
}

static void remove_pud_table(pud_t *pud_start, unsigned long addr,
			     unsigned long end)
{
	unsigned long next;
	pmd_t *pmd_base;
	pud_t *pud;

	pud = pud_start + pud_index(addr);
	for (; addr < end; addr = next, pud++) {
		next = pud_addr_end(addr, end);

		if (!pud_present(*pud))
			continue;

		if (pud_huge(*pud)) {
			split_kernel_mapping(addr, end, PUD_SIZE, (pte_t *)pud);
			continue;
		}

		pmd_base = (pmd_t *)pud_page_vaddr(*pud);
		remove_pmd_table(pmd_base, addr, next);
		free_pmd_table(pmd_base, pud);
	}
}

static void __meminit remove_pagetable(unsigned long start, unsigned long end)
{
	unsigned long addr, next;
	pud_t *pud_base;
	pgd_t *pgd;

	spin_lock(&init_mm.page_table_lock);

	for (addr = start; addr < end; addr = next) {
		next = pgd_addr_end(addr, end);

		pgd = pgd_offset_k(addr);
		if (!pgd_present(*pgd))
			continue;

		if (pgd_huge(*pgd)) {
			split_kernel_mapping(addr, end, PGDIR_SIZE, (pte_t *)pgd);
			continue;
		}

		pud_base = (pud_t *)pgd_page_vaddr(*pgd);
		remove_pud_table(pud_base, addr, next);
	}

	spin_unlock(&init_mm.page_table_lock);
	radix__flush_tlb_kernel_range(start, end);
}

int __meminit radix__create_section_mapping(unsigned long start, unsigned long end, int nid)
{
	if (end >= RADIX_VMALLOC_START) {
range\n"); 907 return -1; 908 } 909 910 return create_physical_mapping(start, end, nid); 911 } 912 913 int __meminit radix__remove_section_mapping(unsigned long start, unsigned long end) 914 { 915 remove_pagetable(start, end); 916 return 0; 917 } 918 #endif /* CONFIG_MEMORY_HOTPLUG */ 919 920 #ifdef CONFIG_SPARSEMEM_VMEMMAP 921 static int __map_kernel_page_nid(unsigned long ea, unsigned long pa, 922 pgprot_t flags, unsigned int map_page_size, 923 int nid) 924 { 925 return __map_kernel_page(ea, pa, flags, map_page_size, nid, 0, 0); 926 } 927 928 int __meminit radix__vmemmap_create_mapping(unsigned long start, 929 unsigned long page_size, 930 unsigned long phys) 931 { 932 /* Create a PTE encoding */ 933 unsigned long flags = _PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_KERNEL_RW; 934 int nid = early_pfn_to_nid(phys >> PAGE_SHIFT); 935 int ret; 936 937 if ((start + page_size) >= RADIX_VMEMMAP_END) { 938 pr_warn("Outside the supported range\n"); 939 return -1; 940 } 941 942 ret = __map_kernel_page_nid(start, phys, __pgprot(flags), page_size, nid); 943 BUG_ON(ret); 944 945 return 0; 946 } 947 948 #ifdef CONFIG_MEMORY_HOTPLUG 949 void __meminit radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size) 950 { 951 remove_pagetable(start, start + page_size); 952 } 953 #endif 954 #endif 955 956 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 957 958 unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, 959 pmd_t *pmdp, unsigned long clr, 960 unsigned long set) 961 { 962 unsigned long old; 963 964 #ifdef CONFIG_DEBUG_VM 965 WARN_ON(!radix__pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp)); 966 assert_spin_locked(pmd_lockptr(mm, pmdp)); 967 #endif 968 969 old = radix__pte_update(mm, addr, (pte_t *)pmdp, clr, set, 1); 970 trace_hugepage_update(addr, old, clr, set); 971 972 return old; 973 } 974 975 pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, 976 pmd_t *pmdp) 977 978 { 979 pmd_t pmd; 980 981 VM_BUG_ON(address & ~HPAGE_PMD_MASK); 982 VM_BUG_ON(radix__pmd_trans_huge(*pmdp)); 983 VM_BUG_ON(pmd_devmap(*pmdp)); 984 /* 985 * khugepaged calls this for normal pmd 986 */ 987 pmd = *pmdp; 988 pmd_clear(pmdp); 989 990 /*FIXME!! Verify whether we need this kick below */ 991 serialize_against_pte_lookup(vma->vm_mm); 992 993 radix__flush_tlb_collapsed_pmd(vma->vm_mm, address); 994 995 return pmd; 996 } 997 998 /* 999 * For us pgtable_t is pte_t *. Inorder to save the deposisted 1000 * page table, we consider the allocated page table as a list 1001 * head. On withdraw we need to make sure we zero out the used 1002 * list_head memory area. 
 */
void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
				       pgtable_t pgtable)
{
	struct list_head *lh = (struct list_head *) pgtable;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	if (!pmd_huge_pte(mm, pmdp))
		INIT_LIST_HEAD(lh);
	else
		list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
	pmd_huge_pte(mm, pmdp) = pgtable;
}

pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
	pte_t *ptep;
	pgtable_t pgtable;
	struct list_head *lh;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	pgtable = pmd_huge_pte(mm, pmdp);
	lh = (struct list_head *) pgtable;
	if (list_empty(lh))
		pmd_huge_pte(mm, pmdp) = NULL;
	else {
		pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
		list_del(lh);
	}
	ptep = (pte_t *) pgtable;
	*ptep = __pte(0);
	ptep++;
	*ptep = __pte(0);
	return pgtable;
}

pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm,
				     unsigned long addr, pmd_t *pmdp)
{
	pmd_t old_pmd;
	unsigned long old;

	old = radix__pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
	old_pmd = __pmd(old);
	/*
	 * Serialize against find_current_mm_pte which does lock-less
	 * lookup in page tables with local interrupts disabled. For huge pages
	 * it casts pmd_t to pte_t. Since the format of pte_t is different from
	 * pmd_t we want to prevent transit from pmd pointing to page table
	 * to pmd pointing to huge page (and back) while interrupts are disabled.
	 * We clear pmd to possibly replace it with page table pointer in
	 * different code paths. So make sure we wait for the parallel
	 * find_current_mm_pte to finish.
	 */
	serialize_against_pte_lookup(mm);
	return old_pmd;
}

int radix__has_transparent_hugepage(void)
{
	/* For radix 2M at PMD level means thp */
	if (mmu_psize_defs[MMU_PAGE_2M].shift == PMD_SHIFT)
		return 1;
	return 0;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep,
				  pte_t entry, unsigned long address, int psize)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_ACCESSED |
					      _PAGE_RW | _PAGE_EXEC);

	unsigned long change = pte_val(entry) ^ pte_val(*ptep);
	/*
	 * To avoid an NMMU hang while relaxing access, we need to mark
	 * the pte invalid in between.
	 */
	if ((change & _PAGE_RW) && atomic_read(&mm->context.copros) > 0) {
		unsigned long old_pte, new_pte;

		old_pte = __radix_pte_update(ptep, _PAGE_PRESENT, _PAGE_INVALID);
		/*
		 * new value of pte
		 */
		new_pte = old_pte | set;
		radix__flush_tlb_page_psize(mm, address, psize);
		__radix_pte_update(ptep, _PAGE_INVALID, new_pte);
	} else {
		__radix_pte_update(ptep, 0, set);
		/*
		 * Book3S does not require a TLB flush when relaxing access
		 * restrictions when the address space is not attached to a
		 * NMMU, because the core MMU will reload the pte after taking
		 * an access fault, which is defined by the architecture.
		 */
	}
	/* See ptesync comment in radix__set_pte_at */
}

void radix__ptep_modify_prot_commit(struct vm_area_struct *vma,
				    unsigned long addr, pte_t *ptep,
				    pte_t old_pte, pte_t pte)
{
	struct mm_struct *mm = vma->vm_mm;

	/*
	 * To avoid an NMMU hang while relaxing access, we need to flush the
	 * TLB before we set the new value. We need to do this only for radix,
	 * because hash translation does a flush when updating the linux pte.
	 */
	if (is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) &&
	    (atomic_read(&mm->context.copros) > 0))
		radix__flush_tlb_page(vma, addr);

	set_pte_at(mm, addr, ptep, pte);
}