// SPDX-License-Identifier: GPL-2.0
/*
 * Virtual Memory Map support
 *
 * (C) 2007 sgi. Christoph Lameter.
 *
 * Virtual memory maps allow VM primitives pfn_to_page(), page_to_pfn(),
 * virt_to_page() and page_address() to be implemented as a base offset
 * calculation without memory access.
 *
 * However, virtual mappings need a page table and TLBs. Many Linux
 * architectures already map their physical space using 1-1 mappings
 * via TLBs. For those arches the virtual memory map is essentially
 * for free if we use the same page size as the 1-1 mappings. In that
 * case the overhead consists of a few additional pages that are
 * allocated to create a view of memory for vmemmap.
 *
 * The architecture is expected to provide a vmemmap_populate() function
 * to instantiate the mapping.
 */
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/memblock.h>
#include <linux/memremap.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
#include <linux/sched.h>
#include <linux/pgtable.h>
#include <linux/bootmem_info.h>

#include <asm/dma.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>

/**
 * struct vmemmap_remap_walk - walk vmemmap page table
 *
 * @remap_pte:		called for each lowest-level entry (PTE).
 * @nr_walked:		the number of walked PTEs.
 * @reuse_page:		the page which is reused for the tail vmemmap pages.
 * @reuse_addr:		the virtual address of the @reuse_page page.
 * @vmemmap_pages:	the list head of the vmemmap pages that can be freed
 *			(when remapping), or of the pages to be mapped in
 *			(when restoring).
 */
struct vmemmap_remap_walk {
	void (*remap_pte)(pte_t *pte, unsigned long addr,
			  struct vmemmap_remap_walk *walk);
	unsigned long nr_walked;
	struct page *reuse_page;
	unsigned long reuse_addr;
	struct list_head *vmemmap_pages;
};

static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start,
				  struct vmemmap_remap_walk *walk)
{
	pmd_t __pmd;
	int i;
	unsigned long addr = start;
	struct page *page = pmd_page(*pmd);
	pte_t *pgtable = pte_alloc_one_kernel(&init_mm);

	if (!pgtable)
		return -ENOMEM;

	pmd_populate_kernel(&init_mm, &__pmd, pgtable);

	for (i = 0; i < PMD_SIZE / PAGE_SIZE; i++, addr += PAGE_SIZE) {
		pte_t entry, *pte;
		pgprot_t pgprot = PAGE_KERNEL;

		entry = mk_pte(page + i, pgprot);
		pte = pte_offset_kernel(&__pmd, addr);
		set_pte_at(&init_mm, addr, pte, entry);
	}

	/* Make pte visible before pmd. See comment in __pte_alloc(). */
	smp_wmb();
	pmd_populate_kernel(&init_mm, pmd, pgtable);

	flush_tlb_kernel_range(start, start + PMD_SIZE);

	return 0;
}

static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr,
			      unsigned long end,
			      struct vmemmap_remap_walk *walk)
{
	pte_t *pte = pte_offset_kernel(pmd, addr);

	/*
	 * The reuse_page is found 'first' in the table walk, before we start
	 * remapping (i.e. before calling @walk->remap_pte).
	 */
	if (!walk->reuse_page) {
		walk->reuse_page = pte_page(*pte);
		/*
		 * Because the reuse address is part of the range that we are
		 * walking, skip the reuse address range.
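		 * Note that the callers set things up so that @reuse is the
		 * very first address of the walked range (see the BUG_ON()
		 * in vmemmap_remap_free() and vmemmap_remap_alloc()), which
		 * is why the reuse page is simply the first pte hit here.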
		 */
		addr += PAGE_SIZE;
		pte++;
		walk->nr_walked++;
	}

	for (; addr != end; addr += PAGE_SIZE, pte++) {
		walk->remap_pte(pte, addr, walk);
		walk->nr_walked++;
	}
}

static int vmemmap_pmd_range(pud_t *pud, unsigned long addr,
			     unsigned long end,
			     struct vmemmap_remap_walk *walk)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_offset(pud, addr);
	do {
		if (pmd_leaf(*pmd)) {
			int ret;

			ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK, walk);
			if (ret)
				return ret;
		}
		next = pmd_addr_end(addr, end);
		vmemmap_pte_range(pmd, addr, next, walk);
	} while (pmd++, addr = next, addr != end);

	return 0;
}

static int vmemmap_pud_range(p4d_t *p4d, unsigned long addr,
			     unsigned long end,
			     struct vmemmap_remap_walk *walk)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_offset(p4d, addr);
	do {
		int ret;

		next = pud_addr_end(addr, end);
		ret = vmemmap_pmd_range(pud, addr, next, walk);
		if (ret)
			return ret;
	} while (pud++, addr = next, addr != end);

	return 0;
}

static int vmemmap_p4d_range(pgd_t *pgd, unsigned long addr,
			     unsigned long end,
			     struct vmemmap_remap_walk *walk)
{
	p4d_t *p4d;
	unsigned long next;

	p4d = p4d_offset(pgd, addr);
	do {
		int ret;

		next = p4d_addr_end(addr, end);
		ret = vmemmap_pud_range(p4d, addr, next, walk);
		if (ret)
			return ret;
	} while (p4d++, addr = next, addr != end);

	return 0;
}

static int vmemmap_remap_range(unsigned long start, unsigned long end,
			       struct vmemmap_remap_walk *walk)
{
	unsigned long addr = start;
	unsigned long next;
	pgd_t *pgd;

	VM_BUG_ON(!IS_ALIGNED(start, PAGE_SIZE));
	VM_BUG_ON(!IS_ALIGNED(end, PAGE_SIZE));

	pgd = pgd_offset_k(addr);
	do {
		int ret;

		next = pgd_addr_end(addr, end);
		ret = vmemmap_p4d_range(pgd, addr, next, walk);
		if (ret)
			return ret;
	} while (pgd++, addr = next, addr != end);

	/*
	 * We only change the mapping of the vmemmap virtual address range
	 * [@start + PAGE_SIZE, end), so we only need to flush the TLB for
	 * that range.
	 */
	flush_tlb_kernel_range(start + PAGE_SIZE, end);

	return 0;
}

/*
 * Free a vmemmap page. A vmemmap page can be allocated from the memblock
 * allocator or the buddy allocator. If the PG_reserved flag is set, it was
 * allocated from the memblock allocator, so free it via free_bootmem_page().
 * Otherwise, use __free_page().
 */
static inline void free_vmemmap_page(struct page *page)
{
	if (PageReserved(page))
		free_bootmem_page(page);
	else
		__free_page(page);
}

/* Free a list of the vmemmap pages */
static void free_vmemmap_page_list(struct list_head *list)
{
	struct page *page, *next;

	list_for_each_entry_safe(page, next, list, lru) {
		list_del(&page->lru);
		free_vmemmap_page(page);
	}
}

static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
			      struct vmemmap_remap_walk *walk)
{
	/*
	 * Remap the tail pages as read-only to catch illegal write operations
	 * to the tail pages.
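	 * Each pte in the range is pointed at walk->reuse_page, and the page
	 * that previously backed it is queued on walk->vmemmap_pages so that
	 * the caller can free it afterwards.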
	 */
	pgprot_t pgprot = PAGE_KERNEL_RO;
	pte_t entry = mk_pte(walk->reuse_page, pgprot);
	struct page *page = pte_page(*pte);

	list_add_tail(&page->lru, walk->vmemmap_pages);
	set_pte_at(&init_mm, addr, pte, entry);
}

static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
				struct vmemmap_remap_walk *walk)
{
	pgprot_t pgprot = PAGE_KERNEL;
	struct page *page;
	void *to;

	BUG_ON(pte_page(*pte) != walk->reuse_page);

	page = list_first_entry(walk->vmemmap_pages, struct page, lru);
	list_del(&page->lru);
	to = page_to_virt(page);
	copy_page(to, (void *)walk->reuse_addr);

	set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
}

/**
 * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
 *			to the page which @reuse is mapped to, then free the
 *			vmemmap pages which the range was mapped to.
 * @start:	start address of the vmemmap virtual address range that we want
 *		to remap.
 * @end:	end address of the vmemmap virtual address range that we want to
 *		remap.
 * @reuse:	reuse address.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int vmemmap_remap_free(unsigned long start, unsigned long end,
		       unsigned long reuse)
{
	int ret;
	LIST_HEAD(vmemmap_pages);
	struct vmemmap_remap_walk walk = {
		.remap_pte	= vmemmap_remap_pte,
		.reuse_addr	= reuse,
		.vmemmap_pages	= &vmemmap_pages,
	};

	/*
	 * To make the remapping routine most efficient for huge pages, the
	 * vmemmap page table walk follows these rules (see
	 * vmemmap_pte_range() for more details):
	 *
	 * - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE)
	 *   must be contiguous.
	 * - The @reuse address is part of the range [@reuse, @end) that is
	 *   passed to vmemmap_remap_range() and walked.
	 * - The @reuse address is the first in the complete range.
	 *
	 * So we need to make sure that @start and @reuse meet the above rules.
	 */
	BUG_ON(start - reuse != PAGE_SIZE);

	mmap_write_lock(&init_mm);
	ret = vmemmap_remap_range(reuse, end, &walk);
	mmap_write_downgrade(&init_mm);

	if (ret && walk.nr_walked) {
		end = reuse + walk.nr_walked * PAGE_SIZE;
		/*
		 * vmemmap_pages contains pages from the previous
		 * vmemmap_remap_range call which failed. These
		 * are pages which were removed from the vmemmap.
		 * They will be restored in the following call.
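		 * vmemmap_restore_pte() pops them off this list one by one
		 * and maps each of them back at its old address.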
		 */
		walk = (struct vmemmap_remap_walk) {
			.remap_pte	= vmemmap_restore_pte,
			.reuse_addr	= reuse,
			.vmemmap_pages	= &vmemmap_pages,
		};

		vmemmap_remap_range(reuse, end, &walk);
	}
	mmap_read_unlock(&init_mm);

	free_vmemmap_page_list(&vmemmap_pages);

	return ret;
}

static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
				   gfp_t gfp_mask, struct list_head *list)
{
	unsigned long nr_pages = (end - start) >> PAGE_SHIFT;
	int nid = page_to_nid((struct page *)start);
	struct page *page, *next;

	while (nr_pages--) {
		page = alloc_pages_node(nid, gfp_mask, 0);
		if (!page)
			goto out;
		list_add_tail(&page->lru, list);
	}

	return 0;
out:
	list_for_each_entry_safe(page, next, list, lru)
		__free_pages(page, 0);
	return -ENOMEM;
}

/**
 * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, @end)
 *			 so that each page in it is backed by a newly
 *			 allocated page.
 * @start:	start address of the vmemmap virtual address range that we want
 *		to remap.
 * @end:	end address of the vmemmap virtual address range that we want to
 *		remap.
 * @reuse:	reuse address.
 * @gfp_mask:	GFP flag for allocating vmemmap pages.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int vmemmap_remap_alloc(unsigned long start, unsigned long end,
			unsigned long reuse, gfp_t gfp_mask)
{
	LIST_HEAD(vmemmap_pages);
	struct vmemmap_remap_walk walk = {
		.remap_pte	= vmemmap_restore_pte,
		.reuse_addr	= reuse,
		.vmemmap_pages	= &vmemmap_pages,
	};

	/* See the comment in vmemmap_remap_free(). */
	BUG_ON(start - reuse != PAGE_SIZE);

	if (alloc_vmemmap_page_list(start, end, gfp_mask, &vmemmap_pages))
		return -ENOMEM;

	mmap_read_lock(&init_mm);
	vmemmap_remap_range(reuse, end, &walk);
	mmap_read_unlock(&init_mm);

	return 0;
}

/*
 * Allocate a block of memory to be used to back the virtual memory map
 * or to back the page tables that are used to create the mapping.
 * Uses the main allocators if they are available, else bootmem.
 */

static void * __ref __earlyonly_bootmem_alloc(int node,
				unsigned long size,
				unsigned long align,
				unsigned long goal)
{
	return memblock_alloc_try_nid_raw(size, align, goal,
					  MEMBLOCK_ALLOC_ACCESSIBLE, node);
}

void * __meminit vmemmap_alloc_block(unsigned long size, int node)
{
	/* If the main allocator is up, use that; fall back to bootmem otherwise. */
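	/*
	 * Allocation failures are reported once via warn_alloc() below, so
	 * __GFP_NOWARN suppresses the allocator's own warning here.
	 */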
	if (slab_is_available()) {
		gfp_t gfp_mask = GFP_KERNEL|__GFP_RETRY_MAYFAIL|__GFP_NOWARN;
		int order = get_order(size);
		static bool warned;
		struct page *page;

		page = alloc_pages_node(node, gfp_mask, order);
		if (page)
			return page_address(page);

		if (!warned) {
			warn_alloc(gfp_mask & ~__GFP_NOWARN, NULL,
				   "vmemmap alloc failure: order:%u", order);
			warned = true;
		}
		return NULL;
	} else
		return __earlyonly_bootmem_alloc(node, size, size,
				__pa(MAX_DMA_ADDRESS));
}

static void * __meminit altmap_alloc_block_buf(unsigned long size,
					       struct vmem_altmap *altmap);

/* need to make sure size is all the same during early stage */
void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node,
					 struct vmem_altmap *altmap)
{
	void *ptr;

	if (altmap)
		return altmap_alloc_block_buf(size, altmap);

	ptr = sparse_buffer_alloc(size);
	if (!ptr)
		ptr = vmemmap_alloc_block(size, node);
	return ptr;
}

static unsigned long __meminit vmem_altmap_next_pfn(struct vmem_altmap *altmap)
{
	return altmap->base_pfn + altmap->reserve + altmap->alloc
		+ altmap->align;
}

static unsigned long __meminit vmem_altmap_nr_free(struct vmem_altmap *altmap)
{
	unsigned long allocated = altmap->alloc + altmap->align;

	if (altmap->free > allocated)
		return altmap->free - allocated;
	return 0;
}

static void * __meminit altmap_alloc_block_buf(unsigned long size,
					       struct vmem_altmap *altmap)
{
	unsigned long pfn, nr_pfns, nr_align;

	if (size & ~PAGE_MASK) {
		pr_warn_once("%s: allocations must be multiple of PAGE_SIZE (%ld)\n",
			     __func__, size);
		return NULL;
	}

	pfn = vmem_altmap_next_pfn(altmap);
	nr_pfns = size >> PAGE_SHIFT;
	nr_align = 1UL << find_first_bit(&nr_pfns, BITS_PER_LONG);
	nr_align = ALIGN(pfn, nr_align) - pfn;
	if (nr_pfns + nr_align > vmem_altmap_nr_free(altmap))
		return NULL;

	altmap->alloc += nr_pfns;
	altmap->align += nr_align;
	pfn += nr_align;

	pr_debug("%s: pfn: %#lx alloc: %ld align: %ld nr: %#lx\n",
		 __func__, pfn, altmap->alloc, altmap->align, nr_pfns);
	return __va(__pfn_to_phys(pfn));
}

void __meminit vmemmap_verify(pte_t *pte, int node,
			      unsigned long start, unsigned long end)
{
	unsigned long pfn = pte_pfn(*pte);
	int actual_node = early_pfn_to_nid(pfn);

	if (node_distance(actual_node, node) > LOCAL_DISTANCE)
		pr_warn("[%lx-%lx] potential offnode page_structs\n",
			start, end - 1);
}

pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
				       struct vmem_altmap *altmap)
{
	pte_t *pte = pte_offset_kernel(pmd, addr);
	if (pte_none(*pte)) {
		pte_t entry;
		void *p;

		p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
		if (!p)
			return NULL;
		entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
		set_pte_at(&init_mm, addr, pte, entry);
	}
	return pte;
}

static void * __meminit vmemmap_alloc_block_zero(unsigned long size, int node)
{
	void *p = vmemmap_alloc_block(size, node);

	if (!p)
		return NULL;
	memset(p, 0, size);

	return p;
}

pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node)
{
	pmd_t *pmd = pmd_offset(pud, addr);
	if (pmd_none(*pmd)) {
		void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
		if (!p)
			return NULL;
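		/* Install the zeroed page as this pmd's pte table. */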
		pmd_populate_kernel(&init_mm, pmd, p);
	}
	return pmd;
}

pud_t * __meminit vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node)
{
	pud_t *pud = pud_offset(p4d, addr);
	if (pud_none(*pud)) {
		void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
		if (!p)
			return NULL;
		pud_populate(&init_mm, pud, p);
	}
	return pud;
}

p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node)
{
	p4d_t *p4d = p4d_offset(pgd, addr);
	if (p4d_none(*p4d)) {
		void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
		if (!p)
			return NULL;
		p4d_populate(&init_mm, p4d, p);
	}
	return p4d;
}

pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node)
{
	pgd_t *pgd = pgd_offset_k(addr);
	if (pgd_none(*pgd)) {
		void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
		if (!p)
			return NULL;
		pgd_populate(&init_mm, pgd, p);
	}
	return pgd;
}

int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end,
					 int node, struct vmem_altmap *altmap)
{
	unsigned long addr = start;
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	for (; addr < end; addr += PAGE_SIZE) {
		pgd = vmemmap_pgd_populate(addr, node);
		if (!pgd)
			return -ENOMEM;
		p4d = vmemmap_p4d_populate(pgd, addr, node);
		if (!p4d)
			return -ENOMEM;
		pud = vmemmap_pud_populate(p4d, addr, node);
		if (!pud)
			return -ENOMEM;
		pmd = vmemmap_pmd_populate(pud, addr, node);
		if (!pmd)
			return -ENOMEM;
		pte = vmemmap_pte_populate(pmd, addr, node, altmap);
		if (!pte)
			return -ENOMEM;
		vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
	}

	return 0;
}

struct page * __meminit __populate_section_memmap(unsigned long pfn,
		unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
{
	unsigned long start = (unsigned long) pfn_to_page(pfn);
	unsigned long end = start + nr_pages * sizeof(struct page);

	if (WARN_ON_ONCE(!IS_ALIGNED(pfn, PAGES_PER_SUBSECTION) ||
			 !IS_ALIGNED(nr_pages, PAGES_PER_SUBSECTION)))
		return NULL;

	if (vmemmap_populate(start, end, nid, altmap))
		return NULL;

	return pfn_to_page(pfn);
}
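
/*
 * Illustrative sketch (not part of this file): an architecture whose vmemmap
 * is backed by base pages only can satisfy the vmemmap_populate() requirement
 * mentioned at the top of this file by simply delegating to
 * vmemmap_populate_basepages(), e.g.:
 *
 *	int __meminit vmemmap_populate(unsigned long start, unsigned long end,
 *				       int node, struct vmem_altmap *altmap)
 *	{
 *		return vmemmap_populate_basepages(start, end, node, altmap);
 *	}
 */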