// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright IBM Corp. 2006
 * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com>
 */

#include <linux/memory_hotplug.h>
#include <linux/memblock.h>
#include <linux/pfn.h>
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/list.h>
#include <linux/hugetlb.h>
#include <linux/slab.h>
#include <asm/cacheflush.h>
#include <asm/nospec-branch.h>
#include <asm/pgalloc.h>
#include <asm/setup.h>
#include <asm/tlbflush.h>
#include <asm/sections.h>
#include <asm/set_memory.h>

static DEFINE_MUTEX(vmem_mutex);

static void __ref *vmem_alloc_pages(unsigned int order)
{
	unsigned long size = PAGE_SIZE << order;

	if (slab_is_available())
		return (void *)__get_free_pages(GFP_KERNEL, order);
	return memblock_alloc(size, size);
}

static void vmem_free_pages(unsigned long addr, int order)
{
	/* We don't expect boot memory to be removed ever. */
	if (!slab_is_available() ||
	    WARN_ON_ONCE(PageReserved(virt_to_page(addr))))
		return;
	free_pages(addr, order);
}

void *vmem_crst_alloc(unsigned long val)
{
	unsigned long *table;

	table = vmem_alloc_pages(CRST_ALLOC_ORDER);
	if (table)
		crst_table_init(table, val);
	return table;
}

pte_t __ref *vmem_pte_alloc(void)
{
	unsigned long size = PTRS_PER_PTE * sizeof(pte_t);
	pte_t *pte;

	if (slab_is_available())
		pte = (pte_t *) page_table_alloc(&init_mm);
	else
		pte = (pte_t *) memblock_alloc(size, size);
	if (!pte)
		return NULL;
	memset64((u64 *)pte, _PAGE_INVALID, PTRS_PER_PTE);
	return pte;
}

static void vmem_pte_free(unsigned long *table)
{
	/* We don't expect boot memory to be removed ever. */
	if (!slab_is_available() ||
	    WARN_ON_ONCE(PageReserved(virt_to_page(table))))
		return;
	page_table_free(&init_mm, table);
}

#define PAGE_UNUSED 0xFD

/*
 * The unused vmemmap range, which was not yet memset(PAGE_UNUSED) ranges
 * from unused_sub_pmd_start to next PMD_SIZE boundary.
 */
static unsigned long unused_sub_pmd_start;

static void vmemmap_flush_unused_sub_pmd(void)
{
	if (!unused_sub_pmd_start)
		return;
	memset((void *)unused_sub_pmd_start, PAGE_UNUSED,
	       ALIGN(unused_sub_pmd_start, PMD_SIZE) - unused_sub_pmd_start);
	unused_sub_pmd_start = 0;
}

static void vmemmap_mark_sub_pmd_used(unsigned long start, unsigned long end)
{
	/*
	 * As we expect to add in the same granularity as we remove, it's
	 * sufficient to mark only some piece used to block the memmap page from
	 * getting removed (just in case the memmap never gets initialized,
	 * e.g., because the memory block never gets onlined).
	 */
	memset((void *)start, 0, sizeof(struct page));
}

static void vmemmap_use_sub_pmd(unsigned long start, unsigned long end)
{
	/*
	 * We only optimize if the new used range directly follows the
	 * previously unused range (esp., when populating consecutive sections).
	 */
	if (unused_sub_pmd_start == start) {
		unused_sub_pmd_start = end;
		if (likely(IS_ALIGNED(unused_sub_pmd_start, PMD_SIZE)))
			unused_sub_pmd_start = 0;
		return;
	}
	vmemmap_flush_unused_sub_pmd();
	vmemmap_mark_sub_pmd_used(start, end);
}

static void vmemmap_use_new_sub_pmd(unsigned long start, unsigned long end)
{
	unsigned long page = ALIGN_DOWN(start, PMD_SIZE);

	vmemmap_flush_unused_sub_pmd();

	/* Could be our memmap page is filled with PAGE_UNUSED already ... */
	vmemmap_mark_sub_pmd_used(start, end);

	/* Mark the unused parts of the new memmap page PAGE_UNUSED. */
	if (!IS_ALIGNED(start, PMD_SIZE))
		memset((void *)page, PAGE_UNUSED, start - page);
	/*
	 * We want to avoid memset(PAGE_UNUSED) when populating the vmemmap of
	 * consecutive sections. Remember for the last added PMD the last
	 * unused range in the populated PMD.
	 */
	if (!IS_ALIGNED(end, PMD_SIZE))
		unused_sub_pmd_start = end;
}

/* Returns true if the PMD is completely unused and can be freed. */
static bool vmemmap_unuse_sub_pmd(unsigned long start, unsigned long end)
{
	unsigned long page = ALIGN_DOWN(start, PMD_SIZE);

	vmemmap_flush_unused_sub_pmd();
	memset((void *)start, PAGE_UNUSED, end - start);
	return !memchr_inv((void *)page, PAGE_UNUSED, PMD_SIZE);
}

/* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */
static int __ref modify_pte_table(pmd_t *pmd, unsigned long addr,
				  unsigned long end, bool add, bool direct)
{
	unsigned long prot, pages = 0;
	int ret = -ENOMEM;
	pte_t *pte;

	prot = pgprot_val(PAGE_KERNEL);
	if (!MACHINE_HAS_NX)
		prot &= ~_PAGE_NOEXEC;

	pte = pte_offset_kernel(pmd, addr);
	for (; addr < end; addr += PAGE_SIZE, pte++) {
		if (!add) {
			if (pte_none(*pte))
				continue;
			if (!direct)
				vmem_free_pages((unsigned long) pfn_to_virt(pte_pfn(*pte)), 0);
			pte_clear(&init_mm, addr, pte);
		} else if (pte_none(*pte)) {
			if (!direct) {
				void *new_page = vmemmap_alloc_block(PAGE_SIZE, NUMA_NO_NODE);

				if (!new_page)
					goto out;
				pte_val(*pte) = __pa(new_page) | prot;
			} else {
				pte_val(*pte) = __pa(addr) | prot;
			}
		} else {
			continue;
		}
		pages++;
	}
	ret = 0;
out:
	if (direct)
		update_page_count(PG_DIRECT_MAP_4K, add ? pages : -pages);
	return ret;
}

static void try_free_pte_table(pmd_t *pmd, unsigned long start)
{
	pte_t *pte;
	int i;

	/* We can safely assume this is fully in 1:1 mapping & vmemmap area */
	pte = pte_offset_kernel(pmd, start);
	for (i = 0; i < PTRS_PER_PTE; i++, pte++) {
		if (!pte_none(*pte))
			return;
	}
	vmem_pte_free((unsigned long *) pmd_deref(*pmd));
	pmd_clear(pmd);
}

/* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */
static int __ref modify_pmd_table(pud_t *pud, unsigned long addr,
				  unsigned long end, bool add, bool direct)
{
	unsigned long next, prot, pages = 0;
	int ret = -ENOMEM;
	pmd_t *pmd;
	pte_t *pte;

	prot = pgprot_val(SEGMENT_KERNEL);
	if (!MACHINE_HAS_NX)
		prot &= ~_SEGMENT_ENTRY_NOEXEC;

	pmd = pmd_offset(pud, addr);
	for (; addr < end; addr = next, pmd++) {
		next = pmd_addr_end(addr, end);
		if (!add) {
			if (pmd_none(*pmd))
				continue;
			if (pmd_large(*pmd)) {
				if (IS_ALIGNED(addr, PMD_SIZE) &&
				    IS_ALIGNED(next, PMD_SIZE)) {
					if (!direct)
						vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE));
					pmd_clear(pmd);
					pages++;
				} else if (!direct && vmemmap_unuse_sub_pmd(addr, next)) {
					vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE));
					pmd_clear(pmd);
				}
				continue;
			}
		} else if (pmd_none(*pmd)) {
			if (IS_ALIGNED(addr, PMD_SIZE) &&
			    IS_ALIGNED(next, PMD_SIZE) &&
			    MACHINE_HAS_EDAT1 && addr && direct &&
			    !debug_pagealloc_enabled()) {
				pmd_val(*pmd) = __pa(addr) | prot;
				pages++;
				continue;
			} else if (!direct && MACHINE_HAS_EDAT1) {
				void *new_page;

				/*
				 * Use 1MB frames for vmemmap if available. We
				 * always use large frames even if they are only
				 * partially used. Otherwise we would have also
				 * page tables since vmemmap_populate gets
				 * called for each section separately.
				 */
				new_page = vmemmap_alloc_block(PMD_SIZE, NUMA_NO_NODE);
				if (new_page) {
					pmd_val(*pmd) = __pa(new_page) | prot;
					if (!IS_ALIGNED(addr, PMD_SIZE) ||
					    !IS_ALIGNED(next, PMD_SIZE)) {
						vmemmap_use_new_sub_pmd(addr, next);
					}
					continue;
				}
			}
			pte = vmem_pte_alloc();
			if (!pte)
				goto out;
			pmd_populate(&init_mm, pmd, pte);
		} else if (pmd_large(*pmd)) {
			if (!direct)
				vmemmap_use_sub_pmd(addr, next);
			continue;
		}
		ret = modify_pte_table(pmd, addr, next, add, direct);
		if (ret)
			goto out;
		if (!add)
			try_free_pte_table(pmd, addr & PMD_MASK);
	}
	ret = 0;
out:
	if (direct)
		update_page_count(PG_DIRECT_MAP_1M, add ? pages : -pages);
	return ret;
}

static void try_free_pmd_table(pud_t *pud, unsigned long start)
{
	const unsigned long end = start + PUD_SIZE;
	pmd_t *pmd;
	int i;

	/* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */
	if (end > VMALLOC_START)
		return;
#ifdef CONFIG_KASAN
	if (start < KASAN_SHADOW_END && KASAN_SHADOW_START > end)
		return;
#endif
	pmd = pmd_offset(pud, start);
	for (i = 0; i < PTRS_PER_PMD; i++, pmd++)
		if (!pmd_none(*pmd))
			return;
	vmem_free_pages(pud_deref(*pud), CRST_ALLOC_ORDER);
	pud_clear(pud);
}

static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end,
			    bool add, bool direct)
{
	unsigned long next, prot, pages = 0;
	int ret = -ENOMEM;
	pud_t *pud;
	pmd_t *pmd;

	prot = pgprot_val(REGION3_KERNEL);
	if (!MACHINE_HAS_NX)
		prot &= ~_REGION_ENTRY_NOEXEC;
	pud = pud_offset(p4d, addr);
	for (; addr < end; addr = next, pud++) {
		next = pud_addr_end(addr, end);
		if (!add) {
			if (pud_none(*pud))
				continue;
			if (pud_large(*pud)) {
				if (IS_ALIGNED(addr, PUD_SIZE) &&
				    IS_ALIGNED(next, PUD_SIZE)) {
					pud_clear(pud);
					pages++;
				}
				continue;
			}
		} else if (pud_none(*pud)) {
			if (IS_ALIGNED(addr, PUD_SIZE) &&
			    IS_ALIGNED(next, PUD_SIZE) &&
			    MACHINE_HAS_EDAT2 && addr && direct &&
			    !debug_pagealloc_enabled()) {
				pud_val(*pud) = __pa(addr) | prot;
				pages++;
				continue;
			}
			pmd = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY);
			if (!pmd)
				goto out;
			pud_populate(&init_mm, pud, pmd);
		} else if (pud_large(*pud)) {
			continue;
		}
		ret = modify_pmd_table(pud, addr, next, add, direct);
		if (ret)
			goto out;
		if (!add)
			try_free_pmd_table(pud, addr & PUD_MASK);
	}
	ret = 0;
out:
	if (direct)
		update_page_count(PG_DIRECT_MAP_2G, add ? pages : -pages);
	return ret;
}

static void try_free_pud_table(p4d_t *p4d, unsigned long start)
{
	const unsigned long end = start + P4D_SIZE;
	pud_t *pud;
	int i;

	/* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */
	if (end > VMALLOC_START)
		return;
#ifdef CONFIG_KASAN
	if (start < KASAN_SHADOW_END && KASAN_SHADOW_START > end)
		return;
#endif

	pud = pud_offset(p4d, start);
	for (i = 0; i < PTRS_PER_PUD; i++, pud++) {
		if (!pud_none(*pud))
			return;
	}
	vmem_free_pages(p4d_deref(*p4d), CRST_ALLOC_ORDER);
	p4d_clear(p4d);
}

static int modify_p4d_table(pgd_t *pgd, unsigned long addr, unsigned long end,
			    bool add, bool direct)
{
	unsigned long next;
	int ret = -ENOMEM;
	p4d_t *p4d;
	pud_t *pud;

	p4d = p4d_offset(pgd, addr);
	for (; addr < end; addr = next, p4d++) {
		next = p4d_addr_end(addr, end);
		if (!add) {
			if (p4d_none(*p4d))
				continue;
		} else if (p4d_none(*p4d)) {
			pud = vmem_crst_alloc(_REGION3_ENTRY_EMPTY);
			if (!pud)
				goto out;
			p4d_populate(&init_mm, p4d, pud);
		}
		ret = modify_pud_table(p4d, addr, next, add, direct);
		if (ret)
			goto out;
		if (!add)
			try_free_pud_table(p4d, addr & P4D_MASK);
	}
	ret = 0;
out:
	return ret;
}

static void try_free_p4d_table(pgd_t *pgd, unsigned long start)
{
	const unsigned long end = start + PGDIR_SIZE;
	p4d_t *p4d;
	int i;

	/* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */
	if (end > VMALLOC_START)
		return;
#ifdef CONFIG_KASAN
	if (start < KASAN_SHADOW_END && KASAN_SHADOW_START > end)
		return;
#endif

	p4d = p4d_offset(pgd, start);
	for (i = 0; i < PTRS_PER_P4D; i++, p4d++) {
		if (!p4d_none(*p4d))
			return;
	}
	vmem_free_pages(pgd_deref(*pgd), CRST_ALLOC_ORDER);
	pgd_clear(pgd);
}

static int modify_pagetable(unsigned long start, unsigned long end, bool add,
			    bool direct)
{
	unsigned long addr, next;
	int ret = -ENOMEM;
	pgd_t *pgd;
	p4d_t *p4d;

	if (WARN_ON_ONCE(!PAGE_ALIGNED(start | end)))
		return -EINVAL;
	for (addr = start; addr < end; addr = next) {
		next = pgd_addr_end(addr, end);
		pgd = pgd_offset_k(addr);

		if (!add) {
			if (pgd_none(*pgd))
				continue;
		} else if (pgd_none(*pgd)) {
			p4d = vmem_crst_alloc(_REGION2_ENTRY_EMPTY);
			if (!p4d)
				goto out;
			pgd_populate(&init_mm, pgd, p4d);
		}
		ret = modify_p4d_table(pgd, addr, next, add, direct);
		if (ret)
			goto out;
		if (!add)
			try_free_p4d_table(pgd, addr & PGDIR_MASK);
	}
	ret = 0;
out:
	if (!add)
		flush_tlb_kernel_range(start, end);
	return ret;
}

static int add_pagetable(unsigned long start, unsigned long end, bool direct)
{
	return modify_pagetable(start, end, true, direct);
}

static int remove_pagetable(unsigned long start, unsigned long end, bool direct)
{
	return modify_pagetable(start, end, false, direct);
}

/*
 * Add a physical memory range to the 1:1 mapping.
 */
static int vmem_add_range(unsigned long start, unsigned long size)
{
	return add_pagetable(start, start + size, true);
}

/*
 * Remove a physical memory range from the 1:1 mapping.
 */
static void vmem_remove_range(unsigned long start, unsigned long size)
{
	remove_pagetable(start, start + size, true);
}

/*
 * Add a backed mem_map array to the virtual mem_map array.
 */
int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
			       struct vmem_altmap *altmap)
{
	int ret;

	mutex_lock(&vmem_mutex);
	/* We don't care about the node, just use NUMA_NO_NODE on allocations */
	ret = add_pagetable(start, end, false);
	if (ret)
		remove_pagetable(start, end, false);
	mutex_unlock(&vmem_mutex);
	return ret;
}

void vmemmap_free(unsigned long start, unsigned long end,
		  struct vmem_altmap *altmap)
{
	mutex_lock(&vmem_mutex);
	remove_pagetable(start, end, false);
	mutex_unlock(&vmem_mutex);
}

void vmem_remove_mapping(unsigned long start, unsigned long size)
{
	mutex_lock(&vmem_mutex);
	vmem_remove_range(start, size);
	mutex_unlock(&vmem_mutex);
}

struct range arch_get_mappable_range(void)
{
	struct range mhp_range;

	mhp_range.start = 0;
	mhp_range.end = VMEM_MAX_PHYS - 1;
	return mhp_range;
}

int vmem_add_mapping(unsigned long start, unsigned long size)
{
	struct range range = arch_get_mappable_range();
	int ret;

	if (start < range.start ||
	    start + size > range.end + 1 ||
	    start + size < start)
		return -ERANGE;

	mutex_lock(&vmem_mutex);
	ret = vmem_add_range(start, size);
	if (ret)
		vmem_remove_range(start, size);
	mutex_unlock(&vmem_mutex);
	return ret;
}

/*
 * map whole physical memory to virtual memory (identity mapping)
 * we reserve enough space in the vmalloc area for vmemmap to hotplug
 * additional memory segments.
 */
void __init vmem_map_init(void)
{
	phys_addr_t base, end;
	u64 i;

	for_each_mem_range(i, &base, &end)
		vmem_add_range(base, end - base);
	__set_memory((unsigned long)_stext,
		     (unsigned long)(_etext - _stext) >> PAGE_SHIFT,
		     SET_MEMORY_RO | SET_MEMORY_X);
	__set_memory((unsigned long)_etext,
		     (unsigned long)(__end_rodata - _etext) >> PAGE_SHIFT,
		     SET_MEMORY_RO);
	__set_memory((unsigned long)_sinittext,
		     (unsigned long)(_einittext - _sinittext) >> PAGE_SHIFT,
		     SET_MEMORY_RO | SET_MEMORY_X);
	__set_memory(__stext_amode31, (__etext_amode31 - __stext_amode31) >> PAGE_SHIFT,
		     SET_MEMORY_RO | SET_MEMORY_X);

	if (nospec_uses_trampoline() || !static_key_enabled(&cpu_has_bear)) {
		/*
		 * Lowcore must be executable for LPSWE
		 * and expoline trampoline branch instructions.
		 */
		set_memory_x(0, 1);
	}

	pr_info("Write protected kernel read-only data: %luk\n",
		(unsigned long)(__end_rodata - _stext) >> 10);
}