// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright IBM Corp. 2006
 */

#include <linux/memory_hotplug.h>
#include <linux/memblock.h>
#include <linux/pfn.h>
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/list.h>
#include <linux/hugetlb.h>
#include <linux/slab.h>
#include <linux/sort.h>
#include <asm/page-states.h>
#include <asm/cacheflush.h>
#include <asm/nospec-branch.h>
#include <asm/pgalloc.h>
#include <asm/setup.h>
#include <asm/tlbflush.h>
#include <asm/sections.h>
#include <asm/set_memory.h>

static DEFINE_MUTEX(vmem_mutex);

static void __ref *vmem_alloc_pages(unsigned int order)
{
	unsigned long size = PAGE_SIZE << order;

	if (slab_is_available())
		return (void *)__get_free_pages(GFP_KERNEL, order);
	return memblock_alloc(size, size);
}

static void vmem_free_pages(unsigned long addr, int order)
{
	/* We don't expect boot memory to be removed ever. */
	if (!slab_is_available() ||
	    WARN_ON_ONCE(PageReserved(virt_to_page((void *)addr))))
		return;
	free_pages(addr, order);
}

void *vmem_crst_alloc(unsigned long val)
{
	unsigned long *table;

	table = vmem_alloc_pages(CRST_ALLOC_ORDER);
	if (!table)
		return NULL;
	crst_table_init(table, val);
	if (slab_is_available())
		arch_set_page_dat(virt_to_page(table), CRST_ALLOC_ORDER);
	return table;
}

pte_t __ref *vmem_pte_alloc(void)
{
	unsigned long size = PTRS_PER_PTE * sizeof(pte_t);
	pte_t *pte;

	if (slab_is_available())
		pte = (pte_t *) page_table_alloc(&init_mm);
	else
		pte = (pte_t *) memblock_alloc(size, size);
	if (!pte)
		return NULL;
	memset64((u64 *)pte, _PAGE_INVALID, PTRS_PER_PTE);
	return pte;
}

static void vmem_pte_free(unsigned long *table)
{
	/* We don't expect boot memory to be removed ever. */
	if (!slab_is_available() ||
	    WARN_ON_ONCE(PageReserved(virt_to_page(table))))
		return;
	page_table_free(&init_mm, table);
}

#define PAGE_UNUSED 0xFD

/*
 * The unused vmemmap range, which was not yet memset(PAGE_UNUSED), ranges
 * from unused_sub_pmd_start to next PMD_SIZE boundary.
 */
static unsigned long unused_sub_pmd_start;

static void vmemmap_flush_unused_sub_pmd(void)
{
	if (!unused_sub_pmd_start)
		return;
	memset((void *)unused_sub_pmd_start, PAGE_UNUSED,
	       ALIGN(unused_sub_pmd_start, PMD_SIZE) - unused_sub_pmd_start);
	unused_sub_pmd_start = 0;
}

static void vmemmap_mark_sub_pmd_used(unsigned long start, unsigned long end)
{
	/*
	 * As we expect to add in the same granularity as we remove, it's
	 * sufficient to mark only some piece used to block the memmap page from
	 * getting removed (just in case the memmap never gets initialized,
	 * e.g., because the memory block never gets onlined).
	 */
	memset((void *)start, 0, sizeof(struct page));
}
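
/*
 * Mark the vmemmap range [start, end) within a PMD-mapped memmap page as
 * used. If the range directly follows the previously recorded unused range,
 * only advance unused_sub_pmd_start; otherwise flush the pending unused
 * range and mark a piece of the new range as used.
 */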
static void vmemmap_use_sub_pmd(unsigned long start, unsigned long end)
{
	/*
	 * We only optimize if the new used range directly follows the
	 * previously unused range (esp., when populating consecutive sections).
	 */
	if (unused_sub_pmd_start == start) {
		unused_sub_pmd_start = end;
		if (likely(IS_ALIGNED(unused_sub_pmd_start, PMD_SIZE)))
			unused_sub_pmd_start = 0;
		return;
	}
	vmemmap_flush_unused_sub_pmd();
	vmemmap_mark_sub_pmd_used(start, end);
}

static void vmemmap_use_new_sub_pmd(unsigned long start, unsigned long end)
{
	unsigned long page = ALIGN_DOWN(start, PMD_SIZE);

	vmemmap_flush_unused_sub_pmd();

	/* Could be our memmap page is filled with PAGE_UNUSED already ... */
	vmemmap_mark_sub_pmd_used(start, end);

	/* Mark the unused parts of the new memmap page PAGE_UNUSED. */
	if (!IS_ALIGNED(start, PMD_SIZE))
		memset((void *)page, PAGE_UNUSED, start - page);
	/*
	 * We want to avoid memset(PAGE_UNUSED) when populating the vmemmap of
	 * consecutive sections. Remember for the last added PMD the last
	 * unused range in the populated PMD.
	 */
	if (!IS_ALIGNED(end, PMD_SIZE))
		unused_sub_pmd_start = end;
}

/* Returns true if the PMD is completely unused and can be freed. */
static bool vmemmap_unuse_sub_pmd(unsigned long start, unsigned long end)
{
	unsigned long page = ALIGN_DOWN(start, PMD_SIZE);

	vmemmap_flush_unused_sub_pmd();
	memset((void *)start, PAGE_UNUSED, end - start);
	return !memchr_inv((void *)page, PAGE_UNUSED, PMD_SIZE);
}

/* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */
static int __ref modify_pte_table(pmd_t *pmd, unsigned long addr,
				  unsigned long end, bool add, bool direct)
{
	unsigned long prot, pages = 0;
	int ret = -ENOMEM;
	pte_t *pte;

	prot = pgprot_val(PAGE_KERNEL);
	if (!MACHINE_HAS_NX)
		prot &= ~_PAGE_NOEXEC;

	pte = pte_offset_kernel(pmd, addr);
	for (; addr < end; addr += PAGE_SIZE, pte++) {
		if (!add) {
			if (pte_none(*pte))
				continue;
			if (!direct)
				vmem_free_pages((unsigned long) pfn_to_virt(pte_pfn(*pte)), 0);
			pte_clear(&init_mm, addr, pte);
		} else if (pte_none(*pte)) {
			if (!direct) {
				void *new_page = vmemmap_alloc_block(PAGE_SIZE, NUMA_NO_NODE);

				if (!new_page)
					goto out;
				set_pte(pte, __pte(__pa(new_page) | prot));
			} else {
				set_pte(pte, __pte(__pa(addr) | prot));
			}
		} else {
			continue;
		}
		pages++;
	}
	ret = 0;
out:
	if (direct)
		update_page_count(PG_DIRECT_MAP_4K, add ? pages : -pages);
	return ret;
}
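
/*
 * Free the PTE table referenced by @pmd and clear the PMD entry, but only
 * if all PTEs in the table are empty.
 */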
static void try_free_pte_table(pmd_t *pmd, unsigned long start)
{
	pte_t *pte;
	int i;

	/* We can safely assume this is fully in 1:1 mapping & vmemmap area */
	pte = pte_offset_kernel(pmd, start);
	for (i = 0; i < PTRS_PER_PTE; i++, pte++) {
		if (!pte_none(*pte))
			return;
	}
	vmem_pte_free((unsigned long *) pmd_deref(*pmd));
	pmd_clear(pmd);
}

/* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */
static int __ref modify_pmd_table(pud_t *pud, unsigned long addr,
				  unsigned long end, bool add, bool direct)
{
	unsigned long next, prot, pages = 0;
	int ret = -ENOMEM;
	pmd_t *pmd;
	pte_t *pte;

	prot = pgprot_val(SEGMENT_KERNEL);
	if (!MACHINE_HAS_NX)
		prot &= ~_SEGMENT_ENTRY_NOEXEC;

	pmd = pmd_offset(pud, addr);
	for (; addr < end; addr = next, pmd++) {
		next = pmd_addr_end(addr, end);
		if (!add) {
			if (pmd_none(*pmd))
				continue;
			if (pmd_large(*pmd)) {
				if (IS_ALIGNED(addr, PMD_SIZE) &&
				    IS_ALIGNED(next, PMD_SIZE)) {
					if (!direct)
						vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE));
					pmd_clear(pmd);
					pages++;
				} else if (!direct && vmemmap_unuse_sub_pmd(addr, next)) {
					vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE));
					pmd_clear(pmd);
				}
				continue;
			}
		} else if (pmd_none(*pmd)) {
			if (IS_ALIGNED(addr, PMD_SIZE) &&
			    IS_ALIGNED(next, PMD_SIZE) &&
			    MACHINE_HAS_EDAT1 && direct &&
			    !debug_pagealloc_enabled()) {
				set_pmd(pmd, __pmd(__pa(addr) | prot));
				pages++;
				continue;
			} else if (!direct && MACHINE_HAS_EDAT1) {
				void *new_page;

				/*
				 * Use 1MB frames for vmemmap if available. We
				 * always use large frames even if they are only
				 * partially used. Otherwise we would have also
				 * page tables since vmemmap_populate gets
				 * called for each section separately.
				 */
				new_page = vmemmap_alloc_block(PMD_SIZE, NUMA_NO_NODE);
				if (new_page) {
					set_pmd(pmd, __pmd(__pa(new_page) | prot));
					if (!IS_ALIGNED(addr, PMD_SIZE) ||
					    !IS_ALIGNED(next, PMD_SIZE)) {
						vmemmap_use_new_sub_pmd(addr, next);
					}
					continue;
				}
			}
			pte = vmem_pte_alloc();
			if (!pte)
				goto out;
			pmd_populate(&init_mm, pmd, pte);
		} else if (pmd_large(*pmd)) {
			if (!direct)
				vmemmap_use_sub_pmd(addr, next);
			continue;
		}
		ret = modify_pte_table(pmd, addr, next, add, direct);
		if (ret)
			goto out;
		if (!add)
			try_free_pte_table(pmd, addr & PMD_MASK);
	}
	ret = 0;
out:
	if (direct)
		update_page_count(PG_DIRECT_MAP_1M, add ? pages : -pages);
	return ret;
}
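
/*
 * Free the PMD table referenced by @pud and clear the PUD entry, but only
 * if all PMD entries in the table are empty.
 */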
static void try_free_pmd_table(pud_t *pud, unsigned long start)
{
	pmd_t *pmd;
	int i;

	pmd = pmd_offset(pud, start);
	for (i = 0; i < PTRS_PER_PMD; i++, pmd++)
		if (!pmd_none(*pmd))
			return;
	vmem_free_pages(pud_deref(*pud), CRST_ALLOC_ORDER);
	pud_clear(pud);
}

static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end,
			    bool add, bool direct)
{
	unsigned long next, prot, pages = 0;
	int ret = -ENOMEM;
	pud_t *pud;
	pmd_t *pmd;

	prot = pgprot_val(REGION3_KERNEL);
	if (!MACHINE_HAS_NX)
		prot &= ~_REGION_ENTRY_NOEXEC;
	pud = pud_offset(p4d, addr);
	for (; addr < end; addr = next, pud++) {
		next = pud_addr_end(addr, end);
		if (!add) {
			if (pud_none(*pud))
				continue;
			if (pud_leaf(*pud)) {
				if (IS_ALIGNED(addr, PUD_SIZE) &&
				    IS_ALIGNED(next, PUD_SIZE)) {
					pud_clear(pud);
					pages++;
				}
				continue;
			}
		} else if (pud_none(*pud)) {
			if (IS_ALIGNED(addr, PUD_SIZE) &&
			    IS_ALIGNED(next, PUD_SIZE) &&
			    MACHINE_HAS_EDAT2 && direct &&
			    !debug_pagealloc_enabled()) {
				set_pud(pud, __pud(__pa(addr) | prot));
				pages++;
				continue;
			}
			pmd = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY);
			if (!pmd)
				goto out;
			pud_populate(&init_mm, pud, pmd);
		} else if (pud_leaf(*pud)) {
			continue;
		}
		ret = modify_pmd_table(pud, addr, next, add, direct);
		if (ret)
			goto out;
		if (!add)
			try_free_pmd_table(pud, addr & PUD_MASK);
	}
	ret = 0;
out:
	if (direct)
		update_page_count(PG_DIRECT_MAP_2G, add ? pages : -pages);
	return ret;
}

static void try_free_pud_table(p4d_t *p4d, unsigned long start)
{
	pud_t *pud;
	int i;

	pud = pud_offset(p4d, start);
	for (i = 0; i < PTRS_PER_PUD; i++, pud++) {
		if (!pud_none(*pud))
			return;
	}
	vmem_free_pages(p4d_deref(*p4d), CRST_ALLOC_ORDER);
	p4d_clear(p4d);
}

static int modify_p4d_table(pgd_t *pgd, unsigned long addr, unsigned long end,
			    bool add, bool direct)
{
	unsigned long next;
	int ret = -ENOMEM;
	p4d_t *p4d;
	pud_t *pud;

	p4d = p4d_offset(pgd, addr);
	for (; addr < end; addr = next, p4d++) {
		next = p4d_addr_end(addr, end);
		if (!add) {
			if (p4d_none(*p4d))
				continue;
		} else if (p4d_none(*p4d)) {
			pud = vmem_crst_alloc(_REGION3_ENTRY_EMPTY);
			if (!pud)
				goto out;
			p4d_populate(&init_mm, p4d, pud);
		}
		ret = modify_pud_table(p4d, addr, next, add, direct);
		if (ret)
			goto out;
		if (!add)
			try_free_pud_table(p4d, addr & P4D_MASK);
	}
	ret = 0;
out:
	return ret;
}

static void try_free_p4d_table(pgd_t *pgd, unsigned long start)
{
	p4d_t *p4d;
	int i;

	p4d = p4d_offset(pgd, start);
	for (i = 0; i < PTRS_PER_P4D; i++, p4d++) {
		if (!p4d_none(*p4d))
			return;
	}
	vmem_free_pages(pgd_deref(*pgd), CRST_ALLOC_ORDER);
	pgd_clear(pgd);
}
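
/*
 * Walk the kernel page tables for [start, end) and either populate (add) or
 * unmap (!add) the range. With direct set, the range is part of the 1:1
 * mapping of physical memory; otherwise it belongs to the vmemmap. When
 * removing, empty intermediate tables are freed and the TLB is flushed.
 */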
static int modify_pagetable(unsigned long start, unsigned long end, bool add,
			    bool direct)
{
	unsigned long addr, next;
	int ret = -ENOMEM;
	pgd_t *pgd;
	p4d_t *p4d;

	if (WARN_ON_ONCE(!PAGE_ALIGNED(start | end)))
		return -EINVAL;
	/* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */
	if (WARN_ON_ONCE(end > VMALLOC_START))
		return -EINVAL;
	for (addr = start; addr < end; addr = next) {
		next = pgd_addr_end(addr, end);
		pgd = pgd_offset_k(addr);

		if (!add) {
			if (pgd_none(*pgd))
				continue;
		} else if (pgd_none(*pgd)) {
			p4d = vmem_crst_alloc(_REGION2_ENTRY_EMPTY);
			if (!p4d)
				goto out;
			pgd_populate(&init_mm, pgd, p4d);
		}
		ret = modify_p4d_table(pgd, addr, next, add, direct);
		if (ret)
			goto out;
		if (!add)
			try_free_p4d_table(pgd, addr & PGDIR_MASK);
	}
	ret = 0;
out:
	if (!add)
		flush_tlb_kernel_range(start, end);
	return ret;
}

static int add_pagetable(unsigned long start, unsigned long end, bool direct)
{
	return modify_pagetable(start, end, true, direct);
}

static int remove_pagetable(unsigned long start, unsigned long end, bool direct)
{
	return modify_pagetable(start, end, false, direct);
}

/*
 * Add a physical memory range to the 1:1 mapping.
 */
static int vmem_add_range(unsigned long start, unsigned long size)
{
	start = (unsigned long)__va(start);
	return add_pagetable(start, start + size, true);
}

/*
 * Remove a physical memory range from the 1:1 mapping.
 */
static void vmem_remove_range(unsigned long start, unsigned long size)
{
	start = (unsigned long)__va(start);
	remove_pagetable(start, start + size, true);
}

/*
 * Add a backed mem_map array to the virtual mem_map array.
 */
int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
			       struct vmem_altmap *altmap)
{
	int ret;

	mutex_lock(&vmem_mutex);
	/* We don't care about the node, just use NUMA_NO_NODE on allocations */
	ret = add_pagetable(start, end, false);
	if (ret)
		remove_pagetable(start, end, false);
	mutex_unlock(&vmem_mutex);
	return ret;
}

void vmemmap_free(unsigned long start, unsigned long end,
		  struct vmem_altmap *altmap)
{
	mutex_lock(&vmem_mutex);
	remove_pagetable(start, end, false);
	mutex_unlock(&vmem_mutex);
}

void vmem_remove_mapping(unsigned long start, unsigned long size)
{
	mutex_lock(&vmem_mutex);
	vmem_remove_range(start, size);
	mutex_unlock(&vmem_mutex);
}
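
/*
 * Report the physical address range that can be added via memory hotplug:
 * everything from 0 up to max_mappable.
 */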
struct range arch_get_mappable_range(void)
{
	struct range mhp_range;

	mhp_range.start = 0;
	mhp_range.end = max_mappable - 1;
	return mhp_range;
}

int vmem_add_mapping(unsigned long start, unsigned long size)
{
	struct range range = arch_get_mappable_range();
	int ret;

	if (start < range.start ||
	    start + size > range.end + 1 ||
	    start + size < start)
		return -ERANGE;

	mutex_lock(&vmem_mutex);
	ret = vmem_add_range(start, size);
	if (ret)
		vmem_remove_range(start, size);
	mutex_unlock(&vmem_mutex);
	return ret;
}

/*
 * Allocate new or return existing page-table entry, but do not map it
 * to any physical address. If missing, allocate segment- and region-
 * table entries along. Meeting a large segment- or region-table entry
 * while traversing is an error, since the function is expected to be
 * called against virtual regions reserved for 4KB mappings only.
 */
pte_t *vmem_get_alloc_pte(unsigned long addr, bool alloc)
{
	pte_t *ptep = NULL;
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = pgd_offset_k(addr);
	if (pgd_none(*pgd)) {
		if (!alloc)
			goto out;
		p4d = vmem_crst_alloc(_REGION2_ENTRY_EMPTY);
		if (!p4d)
			goto out;
		pgd_populate(&init_mm, pgd, p4d);
	}
	p4d = p4d_offset(pgd, addr);
	if (p4d_none(*p4d)) {
		if (!alloc)
			goto out;
		pud = vmem_crst_alloc(_REGION3_ENTRY_EMPTY);
		if (!pud)
			goto out;
		p4d_populate(&init_mm, p4d, pud);
	}
	pud = pud_offset(p4d, addr);
	if (pud_none(*pud)) {
		if (!alloc)
			goto out;
		pmd = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY);
		if (!pmd)
			goto out;
		pud_populate(&init_mm, pud, pmd);
	} else if (WARN_ON_ONCE(pud_leaf(*pud))) {
		goto out;
	}
	pmd = pmd_offset(pud, addr);
	if (pmd_none(*pmd)) {
		if (!alloc)
			goto out;
		pte = vmem_pte_alloc();
		if (!pte)
			goto out;
		pmd_populate(&init_mm, pmd, pte);
	} else if (WARN_ON_ONCE(pmd_large(*pmd))) {
		goto out;
	}
	ptep = pte_offset_kernel(pmd, addr);
out:
	return ptep;
}

int __vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot, bool alloc)
{
	pte_t *ptep, pte;

	if (!IS_ALIGNED(addr, PAGE_SIZE))
		return -EINVAL;
	ptep = vmem_get_alloc_pte(addr, alloc);
	if (!ptep)
		return -ENOMEM;
	__ptep_ipte(addr, ptep, 0, 0, IPTE_GLOBAL);
	pte = mk_pte_phys(phys, prot);
	set_pte(ptep, pte);
	return 0;
}

int vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot)
{
	int rc;

	mutex_lock(&vmem_mutex);
	rc = __vmem_map_4k_page(addr, phys, prot, true);
	mutex_unlock(&vmem_mutex);
	return rc;
}

void vmem_unmap_4k_page(unsigned long addr)
{
	pte_t *ptep;

	mutex_lock(&vmem_mutex);
	ptep = virt_to_kpte(addr);
	__ptep_ipte(addr, ptep, 0, 0, IPTE_GLOBAL);
	pte_clear(&init_mm, addr, ptep);
	mutex_unlock(&vmem_mutex);
}

void __init vmem_map_init(void)
{
	__set_memory_rox(_stext, _etext);
	__set_memory_ro(_etext, __end_rodata);
	__set_memory_rox(_sinittext, _einittext);
	__set_memory_rox(__stext_amode31, __etext_amode31);
	/*
	 * If the BEAR-enhancement facility is not installed the first
	 * prefix page is used to return to the previous context with
	 * an LPSWE instruction and therefore must be executable.
	 */
	if (!static_key_enabled(&cpu_has_bear))
		set_memory_x(0, 1);
	if (debug_pagealloc_enabled()) {
		/*
		 * Use RELOC_HIDE() as long as __va(0) translates to NULL,
		 * since performing pointer arithmetic on a NULL pointer
		 * has undefined behavior and generates compiler warnings.
		 */
		__set_memory_4k(__va(0), RELOC_HIDE(__va(0), ident_map_size));
	}
	if (MACHINE_HAS_NX)
		ctl_set_bit(0, 20);
	pr_info("Write protected kernel read-only data: %luk\n",
		(unsigned long)(__end_rodata - _stext) >> 10);
}