// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 1993 Linus Torvalds
 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
 * SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
 * Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002
 * Numa awareness, Christoph Lameter, SGI, June 2005
 * Improving global KVA allocator, Uladzislau Rezki, Sony, May 2019
 */

#include <linux/vmalloc.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/highmem.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/interrupt.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/set_memory.h>
#include <linux/debugobjects.h>
#include <linux/kallsyms.h>
#include <linux/list.h>
#include <linux/notifier.h>
#include <linux/rbtree.h>
#include <linux/xarray.h>
#include <linux/io.h>
#include <linux/rcupdate.h>
#include <linux/pfn.h>
#include <linux/kmemleak.h>
#include <linux/atomic.h>
#include <linux/compiler.h>
#include <linux/llist.h>
#include <linux/bitops.h>
#include <linux/rbtree_augmented.h>
#include <linux/overflow.h>
#include <linux/pgtable.h>
#include <linux/uaccess.h>
#include <linux/hugetlb.h>
#include <asm/tlbflush.h>
#include <asm/shmparam.h>

#include "internal.h"
#include "pgalloc-track.h"

#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
static bool __ro_after_init vmap_allow_huge = true;

static int __init set_nohugevmalloc(char *str)
{
	vmap_allow_huge = false;
	return 0;
}
early_param("nohugevmalloc", set_nohugevmalloc);
#else /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
static const bool vmap_allow_huge = false;
#endif /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */

bool is_vmalloc_addr(const void *x)
{
	unsigned long addr = (unsigned long)x;

	return addr >= VMALLOC_START && addr < VMALLOC_END;
}
EXPORT_SYMBOL(is_vmalloc_addr);

struct vfree_deferred {
	struct llist_head list;
	struct work_struct wq;
};
static DEFINE_PER_CPU(struct vfree_deferred, vfree_deferred);

static void __vunmap(const void *, int);

static void free_work(struct work_struct *w)
{
	struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq);
	struct llist_node *t, *llnode;

	llist_for_each_safe(llnode, t, llist_del_all(&p->list))
		__vunmap((void *)llnode, 1);
}

/*** Page table manipulation functions ***/
static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
			phys_addr_t phys_addr, pgprot_t prot,
			unsigned int max_page_shift, pgtbl_mod_mask *mask)
{
	pte_t *pte;
	u64 pfn;
	unsigned long size = PAGE_SIZE;

	pfn = phys_addr >> PAGE_SHIFT;
	pte = pte_alloc_kernel_track(pmd, addr, mask);
	if (!pte)
		return -ENOMEM;
	do {
		BUG_ON(!pte_none(*pte));

#ifdef CONFIG_HUGETLB_PAGE
		size = arch_vmap_pte_range_map_size(addr, end, pfn, max_page_shift);
		if (size != PAGE_SIZE) {
			pte_t entry = pfn_pte(pfn, prot);

			entry = pte_mkhuge(entry);
			entry = arch_make_huge_pte(entry, ilog2(size), 0);
			set_huge_pte_at(&init_mm, addr, pte, entry);
			pfn += PFN_DOWN(size);
			continue;
		}
#endif
		set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot));
		pfn++;
	} while (pte += PFN_DOWN(size), addr += size, addr != end);
	*mask |= PGTBL_PTE_MODIFIED;
	return 0;
}

static int vmap_try_huge_pmd(pmd_t *pmd, unsigned long addr, unsigned long end,
			phys_addr_t phys_addr, pgprot_t prot,
			unsigned int max_page_shift)
{
	if (max_page_shift < PMD_SHIFT)
		return 0;

	if (!arch_vmap_pmd_supported(prot))
		return 0;

	if ((end - addr) != PMD_SIZE)
		return 0;

	if (!IS_ALIGNED(addr, PMD_SIZE))
		return 0;

	if (!IS_ALIGNED(phys_addr, PMD_SIZE))
		return 0;

	if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr))
		return 0;

	return pmd_set_huge(pmd, phys_addr, prot);
}

static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
			phys_addr_t phys_addr, pgprot_t prot,
			unsigned int max_page_shift, pgtbl_mod_mask *mask)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_alloc_track(&init_mm, pud, addr, mask);
	if (!pmd)
		return -ENOMEM;
	do {
		next = pmd_addr_end(addr, end);

		if (vmap_try_huge_pmd(pmd, addr, next, phys_addr, prot,
					max_page_shift)) {
			*mask |= PGTBL_PMD_MODIFIED;
			continue;
		}

		if (vmap_pte_range(pmd, addr, next, phys_addr, prot, max_page_shift, mask))
			return -ENOMEM;
	} while (pmd++, phys_addr += (next - addr), addr = next, addr != end);
	return 0;
}

static int vmap_try_huge_pud(pud_t *pud, unsigned long addr, unsigned long end,
			phys_addr_t phys_addr, pgprot_t prot,
			unsigned int max_page_shift)
{
	if (max_page_shift < PUD_SHIFT)
		return 0;

	if (!arch_vmap_pud_supported(prot))
		return 0;

	if ((end - addr) != PUD_SIZE)
		return 0;

	if (!IS_ALIGNED(addr, PUD_SIZE))
		return 0;

	if (!IS_ALIGNED(phys_addr, PUD_SIZE))
		return 0;

	if (pud_present(*pud) && !pud_free_pmd_page(pud, addr))
		return 0;

	return pud_set_huge(pud, phys_addr, prot);
}

static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
			phys_addr_t phys_addr, pgprot_t prot,
			unsigned int max_page_shift, pgtbl_mod_mask *mask)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_alloc_track(&init_mm, p4d, addr, mask);
	if (!pud)
		return -ENOMEM;
	do {
		next = pud_addr_end(addr, end);

		if (vmap_try_huge_pud(pud, addr, next, phys_addr, prot,
					max_page_shift)) {
			*mask |= PGTBL_PUD_MODIFIED;
			continue;
		}

		if (vmap_pmd_range(pud, addr, next, phys_addr, prot,
					max_page_shift, mask))
			return -ENOMEM;
	} while (pud++, phys_addr += (next - addr), addr = next, addr != end);
	return 0;
}

static int vmap_try_huge_p4d(p4d_t *p4d, unsigned long addr, unsigned long end,
			phys_addr_t phys_addr, pgprot_t prot,
			unsigned int max_page_shift)
{
	if (max_page_shift < P4D_SHIFT)
		return 0;

	if (!arch_vmap_p4d_supported(prot))
		return 0;

	if ((end - addr) != P4D_SIZE)
		return 0;

	if (!IS_ALIGNED(addr, P4D_SIZE))
		return 0;

	if (!IS_ALIGNED(phys_addr, P4D_SIZE))
		return 0;

	if (p4d_present(*p4d) && !p4d_free_pud_page(p4d, addr))
		return 0;

	return p4d_set_huge(p4d, phys_addr, prot);
}

static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
			phys_addr_t phys_addr, pgprot_t prot,
			unsigned int max_page_shift, pgtbl_mod_mask *mask)
{
	p4d_t *p4d;
	unsigned long next;

	p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);
	if (!p4d)
		return -ENOMEM;
	do {
		next = p4d_addr_end(addr, end);

		if (vmap_try_huge_p4d(p4d, addr, next, phys_addr, prot,
					max_page_shift)) {
			*mask |= PGTBL_P4D_MODIFIED;
			continue;
		}

		if (vmap_pud_range(p4d, addr, next, phys_addr, prot,
					max_page_shift, mask))
			return -ENOMEM;
	} while (p4d++, phys_addr += (next - addr), addr = next, addr != end);
	return 0;
}

static int vmap_range_noflush(unsigned long addr, unsigned long end,
			phys_addr_t phys_addr, pgprot_t prot,
			unsigned int max_page_shift)
{
	pgd_t *pgd;
	unsigned long start;
	unsigned long next;
	int err;
	pgtbl_mod_mask mask = 0;

	might_sleep();
	BUG_ON(addr >= end);

	start = addr;
	pgd = pgd_offset_k(addr);
	do {
		next = pgd_addr_end(addr, end);
		err = vmap_p4d_range(pgd, addr, next, phys_addr, prot,
					max_page_shift, &mask);
		if (err)
			break;
	} while (pgd++, phys_addr += (next - addr), addr = next, addr != end);

	if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
		arch_sync_kernel_mappings(start, end);

	return err;
}

int vmap_range(unsigned long addr, unsigned long end,
			phys_addr_t phys_addr, pgprot_t prot,
			unsigned int max_page_shift)
{
	int err;

	err = vmap_range_noflush(addr, end, phys_addr, prot, max_page_shift);
	flush_cache_vmap(addr, end);

	return err;
}

static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
			     pgtbl_mod_mask *mask)
{
	pte_t *pte;

	pte = pte_offset_kernel(pmd, addr);
	do {
		pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte);
		WARN_ON(!pte_none(ptent) && !pte_present(ptent));
	} while (pte++, addr += PAGE_SIZE, addr != end);
	*mask |= PGTBL_PTE_MODIFIED;
}

static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
			     pgtbl_mod_mask *mask)
{
	pmd_t *pmd;
	unsigned long next;
	int cleared;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);

		cleared = pmd_clear_huge(pmd);
		if (cleared || pmd_bad(*pmd))
			*mask |= PGTBL_PMD_MODIFIED;

		if (cleared)
			continue;
		if (pmd_none_or_clear_bad(pmd))
			continue;
		vunmap_pte_range(pmd, addr, next, mask);

		cond_resched();
	} while (pmd++, addr = next, addr != end);
}

static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
			     pgtbl_mod_mask *mask)
{
	pud_t *pud;
	unsigned long next;
	int cleared;

	pud = pud_offset(p4d, addr);
	do {
		next = pud_addr_end(addr, end);

		cleared = pud_clear_huge(pud);
		if (cleared || pud_bad(*pud))
			*mask |= PGTBL_PUD_MODIFIED;

		if (cleared)
			continue;
		if (pud_none_or_clear_bad(pud))
			continue;
		vunmap_pmd_range(pud, addr, next, mask);
	} while (pud++, addr = next, addr != end);
}

static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
			     pgtbl_mod_mask *mask)
{
	p4d_t *p4d;
	unsigned long next;
	int cleared;

	p4d = p4d_offset(pgd, addr);
	do {
		next = p4d_addr_end(addr, end);

		cleared = p4d_clear_huge(p4d);
		if (cleared || p4d_bad(*p4d))
			*mask |= PGTBL_P4D_MODIFIED;

		if (cleared)
			continue;
		if (p4d_none_or_clear_bad(p4d))
			continue;
		vunmap_pud_range(p4d, addr, next, mask);
	} while (p4d++, addr = next, addr != end);
}

/*
 * vunmap_range_noflush is similar to vunmap_range, but does not
 * flush caches or TLBs.
 *
 * The caller is responsible for calling flush_cache_vunmap() before calling
 * this function, and flush_tlb_kernel_range after it has returned
 * successfully (and before the addresses are expected to cause a page fault
 * or be re-mapped for something else, if TLB flushes are being delayed or
 * coalesced).
 *
 * This is an internal function only. Do not use outside mm/.
 */
void vunmap_range_noflush(unsigned long start, unsigned long end)
{
	unsigned long next;
	pgd_t *pgd;
	unsigned long addr = start;
	pgtbl_mod_mask mask = 0;

	BUG_ON(addr >= end);
	pgd = pgd_offset_k(addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_bad(*pgd))
			mask |= PGTBL_PGD_MODIFIED;
		if (pgd_none_or_clear_bad(pgd))
			continue;
		vunmap_p4d_range(pgd, addr, next, &mask);
	} while (pgd++, addr = next, addr != end);

	if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
		arch_sync_kernel_mappings(start, end);
}

/**
 * vunmap_range - unmap kernel virtual addresses
 * @addr: start of the VM area to unmap
 * @end: end of the VM area to unmap (non-inclusive)
 *
 * Clears any present PTEs in the virtual address range, flushes TLBs and
 * caches. Any subsequent access to the address before it has been re-mapped
 * is a kernel bug.
 */
void vunmap_range(unsigned long addr, unsigned long end)
{
	flush_cache_vunmap(addr, end);
	vunmap_range_noflush(addr, end);
	flush_tlb_kernel_range(addr, end);
}

static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
		unsigned long end, pgprot_t prot, struct page **pages, int *nr,
		pgtbl_mod_mask *mask)
{
	pte_t *pte;

	/*
	 * nr is a running index into the array which helps higher level
	 * callers keep track of where we're up to.
	 */

	pte = pte_alloc_kernel_track(pmd, addr, mask);
	if (!pte)
		return -ENOMEM;
	do {
		struct page *page = pages[*nr];

		if (WARN_ON(!pte_none(*pte)))
			return -EBUSY;
		if (WARN_ON(!page))
			return -ENOMEM;
		set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
		(*nr)++;
	} while (pte++, addr += PAGE_SIZE, addr != end);
	*mask |= PGTBL_PTE_MODIFIED;
	return 0;
}

static int vmap_pages_pmd_range(pud_t *pud, unsigned long addr,
		unsigned long end, pgprot_t prot, struct page **pages, int *nr,
		pgtbl_mod_mask *mask)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_alloc_track(&init_mm, pud, addr, mask);
	if (!pmd)
		return -ENOMEM;
	do {
		next = pmd_addr_end(addr, end);
		if (vmap_pages_pte_range(pmd, addr, next, prot, pages, nr, mask))
			return -ENOMEM;
	} while (pmd++, addr = next, addr != end);
	return 0;
}

static int vmap_pages_pud_range(p4d_t *p4d, unsigned long addr,
		unsigned long end, pgprot_t prot, struct page **pages, int *nr,
		pgtbl_mod_mask *mask)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_alloc_track(&init_mm, p4d, addr, mask);
	if (!pud)
		return -ENOMEM;
	do {
		next = pud_addr_end(addr, end);
		if (vmap_pages_pmd_range(pud, addr, next, prot, pages, nr, mask))
			return -ENOMEM;
	} while (pud++, addr = next, addr != end);
	return 0;
}

static int vmap_pages_p4d_range(pgd_t *pgd, unsigned long addr,
		unsigned long end, pgprot_t prot, struct page **pages, int *nr,
		pgtbl_mod_mask *mask)
{
	p4d_t *p4d;
	unsigned long next;

	p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);
	if (!p4d)
		return -ENOMEM;
	do {
		next = p4d_addr_end(addr, end);
		if (vmap_pages_pud_range(p4d, addr, next, prot, pages, nr, mask))
			return -ENOMEM;
	} while (p4d++, addr = next, addr != end);
	return 0;
}

static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end,
		pgprot_t prot, struct page **pages)
{
	unsigned long start = addr;
	pgd_t *pgd;
	unsigned long next;
	int err = 0;
	int nr = 0;
	pgtbl_mod_mask mask = 0;

	BUG_ON(addr >= end);
	pgd = pgd_offset_k(addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_bad(*pgd))
			mask |= PGTBL_PGD_MODIFIED;
		err = vmap_pages_p4d_range(pgd, addr, next, prot, pages, &nr, &mask);
		if (err)
			return err;
	} while (pgd++, addr = next, addr != end);

	if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
		arch_sync_kernel_mappings(start, end);

	return 0;
}

/*
 * vmap_pages_range_noflush is similar to vmap_pages_range, but does not
 * flush caches.
 *
 * The caller is responsible for calling flush_cache_vmap() after this
 * function returns successfully and before the addresses are accessed.
 *
 * This is an internal function only. Do not use outside mm/.
 */
int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
		pgprot_t prot, struct page **pages, unsigned int page_shift)
{
	unsigned int i, nr = (end - addr) >> PAGE_SHIFT;

	WARN_ON(page_shift < PAGE_SHIFT);

	if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMALLOC) ||
			page_shift == PAGE_SHIFT)
		return vmap_small_pages_range_noflush(addr, end, prot, pages);

	for (i = 0; i < nr; i += 1U << (page_shift - PAGE_SHIFT)) {
		int err;

		err = vmap_range_noflush(addr, addr + (1UL << page_shift),
					__pa(page_address(pages[i])), prot,
					page_shift);
		if (err)
			return err;

		addr += 1UL << page_shift;
	}

	return 0;
}

/**
 * vmap_pages_range - map pages to a kernel virtual address
 * @addr: start of the VM area to map
 * @end: end of the VM area to map (non-inclusive)
 * @prot: page protection flags to use
 * @pages: pages to map (always PAGE_SIZE pages)
 * @page_shift: maximum shift that the pages may be mapped with, @pages must
 * be aligned and contiguous up to at least this shift.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
static int vmap_pages_range(unsigned long addr, unsigned long end,
		pgprot_t prot, struct page **pages, unsigned int page_shift)
{
	int err;

	err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift);
	flush_cache_vmap(addr, end);
	return err;
}

int is_vmalloc_or_module_addr(const void *x)
{
	/*
	 * ARM, x86-64 and sparc64 put modules in a special place,
	 * and fall back on vmalloc() if that fails. Others
	 * just put it in the vmalloc space.
	 */
#if defined(CONFIG_MODULES) && defined(MODULES_VADDR)
	unsigned long addr = (unsigned long)x;
	if (addr >= MODULES_VADDR && addr < MODULES_END)
		return 1;
#endif
	return is_vmalloc_addr(x);
}

/*
 * Walk a vmap address to the struct page it maps. Huge vmap mappings will
 * return the tail page that corresponds to the base page address, which
 * matches small vmap mappings.
 */
struct page *vmalloc_to_page(const void *vmalloc_addr)
{
	unsigned long addr = (unsigned long) vmalloc_addr;
	struct page *page = NULL;
	pgd_t *pgd = pgd_offset_k(addr);
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *ptep, pte;

	/*
	 * XXX we might need to change this if we add VIRTUAL_BUG_ON for
	 * architectures that do not vmalloc module space
	 */
	VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr));

	if (pgd_none(*pgd))
		return NULL;
	if (WARN_ON_ONCE(pgd_leaf(*pgd)))
		return NULL; /* XXX: no allowance for huge pgd */
	if (WARN_ON_ONCE(pgd_bad(*pgd)))
		return NULL;

	p4d = p4d_offset(pgd, addr);
	if (p4d_none(*p4d))
		return NULL;
	if (p4d_leaf(*p4d))
		return p4d_page(*p4d) + ((addr & ~P4D_MASK) >> PAGE_SHIFT);
	if (WARN_ON_ONCE(p4d_bad(*p4d)))
		return NULL;

	pud = pud_offset(p4d, addr);
	if (pud_none(*pud))
		return NULL;
	if (pud_leaf(*pud))
		return pud_page(*pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
	if (WARN_ON_ONCE(pud_bad(*pud)))
		return NULL;

	pmd = pmd_offset(pud, addr);
	if (pmd_none(*pmd))
		return NULL;
	if (pmd_leaf(*pmd))
		return pmd_page(*pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
	if (WARN_ON_ONCE(pmd_bad(*pmd)))
		return NULL;

	ptep = pte_offset_map(pmd, addr);
	pte = *ptep;
	if (pte_present(pte))
		page = pte_page(pte);
	pte_unmap(ptep);

	return page;
}
EXPORT_SYMBOL(vmalloc_to_page);

/*
 * Map a vmalloc()-space virtual address to the physical page frame number.
 */
unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
{
	return page_to_pfn(vmalloc_to_page(vmalloc_addr));
}
EXPORT_SYMBOL(vmalloc_to_pfn);

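/*
 * Illustrative sketch (not part of the original file): a typical use of the
 * two helpers above is translating a vmalloc()'ed buffer into its backing
 * struct pages, for example when building a scatterlist. The buffer name and
 * size below are hypothetical.
 *
 *	void *buf = vmalloc(4 * PAGE_SIZE);
 *	struct page *first = vmalloc_to_page(buf);
 *	unsigned long second_pfn = vmalloc_to_pfn(buf + PAGE_SIZE);
 *	...
 *	vfree(buf);
 */
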
/*** Global kva allocator ***/

#define DEBUG_AUGMENT_PROPAGATE_CHECK 0
#define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0


static DEFINE_SPINLOCK(vmap_area_lock);
static DEFINE_SPINLOCK(free_vmap_area_lock);
/* Export for kexec only */
LIST_HEAD(vmap_area_list);
static struct rb_root vmap_area_root = RB_ROOT;
static bool vmap_initialized __read_mostly;

static struct rb_root purge_vmap_area_root = RB_ROOT;
static LIST_HEAD(purge_vmap_area_list);
static DEFINE_SPINLOCK(purge_vmap_area_lock);

/*
 * This kmem_cache is used for vmap_area objects. Instead of
 * allocating from slab we reuse an object from this cache to
 * make things faster, especially in the "no edge" splitting of
 * a free block.
 */
static struct kmem_cache *vmap_area_cachep;

/*
 * This linked list is used in pair with free_vmap_area_root.
 * It gives O(1) access to prev/next to perform fast coalescing.
 */
static LIST_HEAD(free_vmap_area_list);

/*
 * This augment red-black tree represents the free vmap space.
 * All vmap_area objects in this tree are sorted by va->va_start
 * address. It is used for allocation and merging when a vmap
 * object is released.
 *
 * Each vmap_area node contains a maximum available free block
 * of its sub-tree, right or left. Therefore it is possible to
 * find a lowest match of free area.
 */
static struct rb_root free_vmap_area_root = RB_ROOT;

/*
 * Preload a CPU with one object for "no edge" split case. The
 * aim is to get rid of allocations from the atomic context, thus
 * to use more permissive allocation masks.
 */
static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node);

static __always_inline unsigned long
va_size(struct vmap_area *va)
{
	return (va->va_end - va->va_start);
}

static __always_inline unsigned long
get_subtree_max_size(struct rb_node *node)
{
	struct vmap_area *va;

	va = rb_entry_safe(node, struct vmap_area, rb_node);
	return va ? va->subtree_max_size : 0;
}

/*
 * Gets called when the node is removed or rotated.
 */
static __always_inline unsigned long
compute_subtree_max_size(struct vmap_area *va)
{
	return max3(va_size(va),
		get_subtree_max_size(va->rb_node.rb_left),
		get_subtree_max_size(va->rb_node.rb_right));
}

RB_DECLARE_CALLBACKS_MAX(static, free_vmap_area_rb_augment_cb,
	struct vmap_area, rb_node, unsigned long, subtree_max_size, va_size)

static void purge_vmap_area_lazy(void);
static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
static unsigned long lazy_max_pages(void);

static atomic_long_t nr_vmalloc_pages;

unsigned long vmalloc_nr_pages(void)
{
	return atomic_long_read(&nr_vmalloc_pages);
}

static struct vmap_area *find_vmap_area_exceed_addr(unsigned long addr)
{
	struct vmap_area *va = NULL;
	struct rb_node *n = vmap_area_root.rb_node;

	while (n) {
		struct vmap_area *tmp;

		tmp = rb_entry(n, struct vmap_area, rb_node);
		if (tmp->va_end > addr) {
			va = tmp;
			if (tmp->va_start <= addr)
				break;

			n = n->rb_left;
		} else
			n = n->rb_right;
	}

	return va;
}

static struct vmap_area *__find_vmap_area(unsigned long addr)
{
	struct rb_node *n = vmap_area_root.rb_node;

	while (n) {
		struct vmap_area *va;

		va = rb_entry(n, struct vmap_area, rb_node);
		if (addr < va->va_start)
			n = n->rb_left;
		else if (addr >= va->va_end)
			n = n->rb_right;
		else
			return va;
	}

	return NULL;
}

/*
 * This function returns the address of the parent node and its
 * left or right link for further processing.
 *
 * Otherwise NULL is returned. In that case the new range conflicts
 * (overlaps) with an existing one, the insertion has to be declined
 * and is considered a bug.
 */
static __always_inline struct rb_node **
find_va_links(struct vmap_area *va,
	struct rb_root *root, struct rb_node *from,
	struct rb_node **parent)
{
	struct vmap_area *tmp_va;
	struct rb_node **link;

	if (root) {
		link = &root->rb_node;
		if (unlikely(!*link)) {
			*parent = NULL;
			return link;
		}
	} else {
		link = &from;
	}

	/*
	 * Go to the bottom of the tree. When we hit the last point
	 * we end up with the parent rb_node and the correct direction,
	 * named "link" here, where the new va->rb_node will be attached.
	 */
	do {
		tmp_va = rb_entry(*link, struct vmap_area, rb_node);

		/*
		 * During the traversal we also do some sanity checks.
		 * Trigger the WARN() if the new area overlaps an existing
		 * one, either partially or fully.
		 */
		if (va->va_start < tmp_va->va_end &&
				va->va_end <= tmp_va->va_start)
			link = &(*link)->rb_left;
		else if (va->va_end > tmp_va->va_start &&
				va->va_start >= tmp_va->va_end)
			link = &(*link)->rb_right;
		else {
			WARN(1, "vmalloc bug: 0x%lx-0x%lx overlaps with 0x%lx-0x%lx\n",
				va->va_start, va->va_end, tmp_va->va_start, tmp_va->va_end);

			return NULL;
		}
	} while (*link);

	*parent = &tmp_va->rb_node;
	return link;
}

static __always_inline struct list_head *
get_va_next_sibling(struct rb_node *parent, struct rb_node **link)
{
	struct list_head *list;

	if (unlikely(!parent))
		/*
		 * The red-black tree where we try to find VA neighbors
		 * before merging or inserting is empty, i.e. it means
		 * there is no free vmap space. Normally it does not
		 * happen but we handle this case anyway.
		 */
		return NULL;

	list = &rb_entry(parent, struct vmap_area, rb_node)->list;
	return (&parent->rb_right == link ? list->next : list);
}

static __always_inline void
link_va(struct vmap_area *va, struct rb_root *root,
	struct rb_node *parent, struct rb_node **link, struct list_head *head)
{
	/*
	 * VA is still not in the list, but we can
	 * identify its future previous list_head node.
	 */
	if (likely(parent)) {
		head = &rb_entry(parent, struct vmap_area, rb_node)->list;
		if (&parent->rb_right != link)
			head = head->prev;
	}

	/* Insert to the rb-tree */
	rb_link_node(&va->rb_node, parent, link);
	if (root == &free_vmap_area_root) {
		/*
		 * Some explanation here. Just perform simple insertion
		 * to the tree. We do not set va->subtree_max_size to
		 * its current size before calling rb_insert_augmented().
		 * This is because we populate the tree from the bottom
		 * up to the parent levels once the node _is_ in the tree.
		 *
		 * Therefore we set subtree_max_size to zero after insertion,
		 * to let __augment_tree_propagate_from() put everything
		 * into the correct order later on.
		 */
		rb_insert_augmented(&va->rb_node,
			root, &free_vmap_area_rb_augment_cb);
		va->subtree_max_size = 0;
	} else {
		rb_insert_color(&va->rb_node, root);
	}

	/* Address-sort this list */
	list_add(&va->list, head);
}

static __always_inline void
unlink_va(struct vmap_area *va, struct rb_root *root)
{
	if (WARN_ON(RB_EMPTY_NODE(&va->rb_node)))
		return;

	if (root == &free_vmap_area_root)
		rb_erase_augmented(&va->rb_node,
			root, &free_vmap_area_rb_augment_cb);
	else
		rb_erase(&va->rb_node, root);

	list_del(&va->list);
	RB_CLEAR_NODE(&va->rb_node);
}

#if DEBUG_AUGMENT_PROPAGATE_CHECK
static void
augment_tree_propagate_check(void)
{
	struct vmap_area *va;
	unsigned long computed_size;

	list_for_each_entry(va, &free_vmap_area_list, list) {
		computed_size = compute_subtree_max_size(va);
		if (computed_size != va->subtree_max_size)
			pr_emerg("tree is corrupted: %lu, %lu\n",
				va_size(va), va->subtree_max_size);
	}
}
#endif

/*
 * This function populates subtree_max_size from bottom to upper
 * levels starting from the VA point. The propagation must be done
 * when the VA size is modified by changing its va_start/va_end, or
 * when a new VA is inserted into the tree.
 *
 * It means that __augment_tree_propagate_from() must be called:
 * - After VA has been inserted to the tree(free path);
 * - After VA has been shrunk(allocation path);
 * - After VA has been increased(merging path).
 *
 * Please note that, it does not mean that upper parent nodes
 * and their subtree_max_size are recalculated all the time up
 * to the root node.
 *
 *       4--8
 *        /\
 *       /  \
 *      /    \
 *    2--2  8--8
 *
 * For example if we modify the node 4, shrinking it to 2, then
 * no modification is required. If we shrink the node 2 to 1
 * its subtree_max_size is updated only, and set to 1. If we shrink
 * the node 8 to 6, then its subtree_max_size is set to 6 and parent
 * node becomes 4--6.
 */
static __always_inline void
augment_tree_propagate_from(struct vmap_area *va)
{
	/*
	 * Populate the tree from bottom towards the root until
	 * the calculated maximum available size of checked node
	 * is equal to its current one.
	 */
	free_vmap_area_rb_augment_cb_propagate(&va->rb_node, NULL);

#if DEBUG_AUGMENT_PROPAGATE_CHECK
	augment_tree_propagate_check();
#endif
}

static void
insert_vmap_area(struct vmap_area *va,
	struct rb_root *root, struct list_head *head)
{
	struct rb_node **link;
	struct rb_node *parent;

	link = find_va_links(va, root, NULL, &parent);
	if (link)
		link_va(va, root, parent, link, head);
}

static void
insert_vmap_area_augment(struct vmap_area *va,
	struct rb_node *from, struct rb_root *root,
	struct list_head *head)
{
	struct rb_node **link;
	struct rb_node *parent;

	if (from)
		link = find_va_links(va, NULL, from, &parent);
	else
		link = find_va_links(va, root, NULL, &parent);

	if (link) {
		link_va(va, root, parent, link, head);
		augment_tree_propagate_from(va);
	}
}

/*
 * Merge a de-allocated chunk of VA memory with the previous
 * and next free blocks. If no coalescing is done, a new
 * free area is inserted. If the VA has been merged, it is
 * freed.
 *
 * Please note, it can return NULL in case of overlapping
 * ranges, followed by a WARN() report. Despite that being
 * buggy behaviour, the system can stay alive and keep going.
 */
static __always_inline struct vmap_area *
merge_or_add_vmap_area(struct vmap_area *va,
	struct rb_root *root, struct list_head *head)
{
	struct vmap_area *sibling;
	struct list_head *next;
	struct rb_node **link;
	struct rb_node *parent;
	bool merged = false;

	/*
	 * Find a place in the tree where VA potentially will be
	 * inserted, unless it is merged with its sibling/siblings.
	 */
	link = find_va_links(va, root, NULL, &parent);
	if (!link)
		return NULL;

	/*
	 * Get next node of VA to check if merging can be done.
	 */
	next = get_va_next_sibling(parent, link);
	if (unlikely(next == NULL))
		goto insert;

	/*
	 * start            end
	 * |                |
	 * |<------VA------>|<-----Next----->|
	 *                  |                |
	 *                  start            end
	 */
	if (next != head) {
		sibling = list_entry(next, struct vmap_area, list);
		if (sibling->va_start == va->va_end) {
			sibling->va_start = va->va_start;

			/* Free vmap_area object. */
			kmem_cache_free(vmap_area_cachep, va);

			/* Point to the new merged area. */
			va = sibling;
			merged = true;
		}
	}

	/*
	 * start            end
	 * |                |
	 * |<-----Prev----->|<------VA------>|
	 *                  |                |
	 *                  start            end
	 */
	if (next->prev != head) {
		sibling = list_entry(next->prev, struct vmap_area, list);
		if (sibling->va_end == va->va_start) {
			/*
			 * If both neighbors are coalesced, it is important
			 * to unlink the "next" node first, followed by merging
			 * with "previous" one. Otherwise the tree might not be
			 * fully populated if a sibling's augmented value is
			 * "normalized" because of rotation operations.
			 */
			if (merged)
				unlink_va(va, root);

			sibling->va_end = va->va_end;

			/* Free vmap_area object. */
			kmem_cache_free(vmap_area_cachep, va);

			/* Point to the new merged area. */
			va = sibling;
			merged = true;
		}
	}

insert:
	if (!merged)
		link_va(va, root, parent, link, head);

	return va;
}

static __always_inline struct vmap_area *
merge_or_add_vmap_area_augment(struct vmap_area *va,
	struct rb_root *root, struct list_head *head)
{
	va = merge_or_add_vmap_area(va, root, head);
	if (va)
		augment_tree_propagate_from(va);

	return va;
}

static __always_inline bool
is_within_this_va(struct vmap_area *va, unsigned long size,
	unsigned long align, unsigned long vstart)
{
	unsigned long nva_start_addr;

	if (va->va_start > vstart)
		nva_start_addr = ALIGN(va->va_start, align);
	else
		nva_start_addr = ALIGN(vstart, align);

	/* Can be overflowed due to big size or alignment. */
	if (nva_start_addr + size < nva_start_addr ||
			nva_start_addr < vstart)
		return false;

	return (nva_start_addr + size <= va->va_end);
}

/*
 * Find the first free block (lowest start address) in the tree
 * that satisfies the request with the passed parameters.
 */
static __always_inline struct vmap_area *
find_vmap_lowest_match(unsigned long size,
	unsigned long align, unsigned long vstart)
{
	struct vmap_area *va;
	struct rb_node *node;
	unsigned long length;

	/* Start from the root. */
	node = free_vmap_area_root.rb_node;

	/* Adjust the search size for alignment overhead. */
	length = size + align - 1;

	while (node) {
		va = rb_entry(node, struct vmap_area, rb_node);

		if (get_subtree_max_size(node->rb_left) >= length &&
				vstart < va->va_start) {
			node = node->rb_left;
		} else {
			if (is_within_this_va(va, size, align, vstart))
				return va;

			/*
			 * Does not make sense to go deeper towards the right
			 * sub-tree if it does not have a free block that is
			 * equal to or bigger than the requested search length.
			 */
			if (get_subtree_max_size(node->rb_right) >= length) {
				node = node->rb_right;
				continue;
			}

			/*
			 * OK. We roll back and find the first right sub-tree,
			 * that will satisfy the search criteria. It can happen
			 * only once due to "vstart" restriction.
			 */
			while ((node = rb_parent(node))) {
				va = rb_entry(node, struct vmap_area, rb_node);
				if (is_within_this_va(va, size, align, vstart))
					return va;

				if (get_subtree_max_size(node->rb_right) >= length &&
						vstart <= va->va_start) {
					node = node->rb_right;
					break;
				}
			}
		}
	}

	return NULL;
}

#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
#include <linux/random.h>

static struct vmap_area *
find_vmap_lowest_linear_match(unsigned long size,
	unsigned long align, unsigned long vstart)
{
	struct vmap_area *va;

	list_for_each_entry(va, &free_vmap_area_list, list) {
		if (!is_within_this_va(va, size, align, vstart))
			continue;

		return va;
	}

	return NULL;
}

static void
find_vmap_lowest_match_check(unsigned long size)
{
	struct vmap_area *va_1, *va_2;
	unsigned long vstart;
	unsigned int rnd;

	get_random_bytes(&rnd, sizeof(rnd));
	vstart = VMALLOC_START + rnd;

	va_1 = find_vmap_lowest_match(size, 1, vstart);
	va_2 = find_vmap_lowest_linear_match(size, 1, vstart);

	if (va_1 != va_2)
		pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n",
			va_1, va_2, vstart);
}
#endif

enum fit_type {
	NOTHING_FIT = 0,
	FL_FIT_TYPE = 1,	/* full fit */
	LE_FIT_TYPE = 2,	/* left edge fit */
	RE_FIT_TYPE = 3,	/* right edge fit */
	NE_FIT_TYPE = 4		/* no edge fit */
};

static __always_inline enum fit_type
classify_va_fit_type(struct vmap_area *va,
	unsigned long nva_start_addr, unsigned long size)
{
	enum fit_type type;

	/* Check if it is within VA. */
	if (nva_start_addr < va->va_start ||
			nva_start_addr + size > va->va_end)
		return NOTHING_FIT;

	/* Now classify. */
	if (va->va_start == nva_start_addr) {
		if (va->va_end == nva_start_addr + size)
			type = FL_FIT_TYPE;
		else
			type = LE_FIT_TYPE;
	} else if (va->va_end == nva_start_addr + size) {
		type = RE_FIT_TYPE;
	} else {
		type = NE_FIT_TYPE;
	}

	return type;
}

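/*
 * Worked example (illustrative only, not part of the original file): for a
 * free VA spanning [0x1000, 0x9000), a request of size 0x8000 placed at
 * nva_start_addr 0x1000 is an FL_FIT (the whole block is consumed); a
 * request of 0x2000 at 0x1000 is an LE_FIT; 0x2000 ending at 0x9000 is an
 * RE_FIT; and 0x2000 at 0x4000 is an NE_FIT, which splits the free VA in two.
 */
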
static __always_inline int
adjust_va_to_fit_type(struct vmap_area *va,
	unsigned long nva_start_addr, unsigned long size,
	enum fit_type type)
{
	struct vmap_area *lva = NULL;

	if (type == FL_FIT_TYPE) {
		/*
		 * No need to split VA, it fully fits.
		 *
		 * |               |
		 * V      NVA      V
		 * |---------------|
		 */
		unlink_va(va, &free_vmap_area_root);
		kmem_cache_free(vmap_area_cachep, va);
	} else if (type == LE_FIT_TYPE) {
		/*
		 * Split left edge of fit VA.
		 *
		 * |       |
		 * V  NVA  V   R
		 * |-------|-------|
		 */
		va->va_start += size;
	} else if (type == RE_FIT_TYPE) {
		/*
		 * Split right edge of fit VA.
		 *
		 *         |       |
		 *     L   V  NVA  V
		 * |-------|-------|
		 */
		va->va_end = nva_start_addr;
	} else if (type == NE_FIT_TYPE) {
		/*
		 * Split no edge of fit VA.
		 *
		 *     |       |
		 *   L V  NVA  V R
		 * |---|-------|---|
		 */
		lva = __this_cpu_xchg(ne_fit_preload_node, NULL);
		if (unlikely(!lva)) {
			/*
			 * For the percpu allocator we do not do any
			 * pre-allocation and leave it as it is. The reason is
			 * that it most likely never ends up with NE_FIT_TYPE
			 * splitting. In case of percpu allocations, offsets
			 * and sizes are aligned to a fixed align request, i.e.
			 * RE_FIT_TYPE and FL_FIT_TYPE are its main fitting
			 * cases.
			 *
			 * There are a few exceptions though, for example the
			 * first allocation (early boot up) when we have "one"
			 * big free space that has to be split.
			 *
			 * Also we can hit this path in case of regular "vmap"
			 * allocations, if "this" current CPU was not preloaded.
			 * See the comment in alloc_vmap_area() why. If so, then
			 * GFP_NOWAIT is used instead to get an extra object for
			 * split purposes. That is rare and most of the time
			 * does not occur.
			 *
			 * If that allocation fails, an "overflow" path is
			 * triggered to purge lazily freed areas in order to
			 * free some memory, then the "retry" path is triggered
			 * to repeat one more time. See more details in
			 * alloc_vmap_area().
			 */
			lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT);
			if (!lva)
				return -1;
		}

		/*
		 * Build the remainder.
		 */
		lva->va_start = va->va_start;
		lva->va_end = nva_start_addr;

		/*
		 * Shrink this VA to remaining size.
		 */
		va->va_start = nva_start_addr + size;
	} else {
		return -1;
	}

	if (type != FL_FIT_TYPE) {
		augment_tree_propagate_from(va);

		if (lva)	/* type == NE_FIT_TYPE */
			insert_vmap_area_augment(lva, &va->rb_node,
				&free_vmap_area_root, &free_vmap_area_list);
	}

	return 0;
}

/*
 * Returns a start address of the newly allocated area, if success.
 * Otherwise a vend is returned that indicates failure.
 */
static __always_inline unsigned long
__alloc_vmap_area(unsigned long size, unsigned long align,
	unsigned long vstart, unsigned long vend)
{
	unsigned long nva_start_addr;
	struct vmap_area *va;
	enum fit_type type;
	int ret;

	va = find_vmap_lowest_match(size, align, vstart);
	if (unlikely(!va))
		return vend;

	if (va->va_start > vstart)
		nva_start_addr = ALIGN(va->va_start, align);
	else
		nva_start_addr = ALIGN(vstart, align);

	/* Check the "vend" restriction. */
	if (nva_start_addr + size > vend)
		return vend;

	/* Classify what we have found. */
	type = classify_va_fit_type(va, nva_start_addr, size);
	if (WARN_ON_ONCE(type == NOTHING_FIT))
		return vend;

	/* Update the free vmap_area. */
	ret = adjust_va_to_fit_type(va, nva_start_addr, size, type);
	if (ret)
		return vend;

#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
	find_vmap_lowest_match_check(size);
#endif

	return nva_start_addr;
}

/*
 * Free a region of KVA allocated by alloc_vmap_area
 */
static void free_vmap_area(struct vmap_area *va)
{
	/*
	 * Remove from the busy tree/list.
	 */
	spin_lock(&vmap_area_lock);
	unlink_va(va, &vmap_area_root);
	spin_unlock(&vmap_area_lock);

	/*
	 * Insert/Merge it back to the free tree/list.
	 */
	spin_lock(&free_vmap_area_lock);
	merge_or_add_vmap_area_augment(va, &free_vmap_area_root, &free_vmap_area_list);
	spin_unlock(&free_vmap_area_lock);
}

static inline void
preload_this_cpu_lock(spinlock_t *lock, gfp_t gfp_mask, int node)
{
	struct vmap_area *va = NULL;

	/*
	 * Preload this CPU with one extra vmap_area object. It is used
	 * when fit type of free area is NE_FIT_TYPE. It guarantees that
	 * a CPU that does an allocation is preloaded.
	 *
	 * We do it in non-atomic context, thus it allows us to use more
	 * permissive allocation masks to be more stable under low memory
	 * condition and high memory pressure.
	 */
	if (!this_cpu_read(ne_fit_preload_node))
		va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);

	spin_lock(lock);

	if (va && __this_cpu_cmpxchg(ne_fit_preload_node, NULL, va))
		kmem_cache_free(vmap_area_cachep, va);
}

/*
 * Allocate a region of KVA of the specified size and alignment, within the
 * vstart and vend.
 */
static struct vmap_area *alloc_vmap_area(unsigned long size,
				unsigned long align,
				unsigned long vstart, unsigned long vend,
				int node, gfp_t gfp_mask)
{
	struct vmap_area *va;
	unsigned long freed;
	unsigned long addr;
	int purged = 0;
	int ret;

	BUG_ON(!size);
	BUG_ON(offset_in_page(size));
	BUG_ON(!is_power_of_2(align));

	if (unlikely(!vmap_initialized))
		return ERR_PTR(-EBUSY);

	might_sleep();
	gfp_mask = gfp_mask & GFP_RECLAIM_MASK;

	va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
	if (unlikely(!va))
		return ERR_PTR(-ENOMEM);

	/*
	 * Only scan the relevant parts containing pointers to other objects
	 * to avoid false negatives.
	 */
	kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask);

retry:
	preload_this_cpu_lock(&free_vmap_area_lock, gfp_mask, node);
	addr = __alloc_vmap_area(size, align, vstart, vend);
	spin_unlock(&free_vmap_area_lock);

	/*
	 * If an allocation fails, the "vend" address is
	 * returned. Therefore trigger the overflow path.
	 */
	if (unlikely(addr == vend))
		goto overflow;

	va->va_start = addr;
	va->va_end = addr + size;
	va->vm = NULL;

	spin_lock(&vmap_area_lock);
	insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
	spin_unlock(&vmap_area_lock);

	BUG_ON(!IS_ALIGNED(va->va_start, align));
	BUG_ON(va->va_start < vstart);
	BUG_ON(va->va_end > vend);

	ret = kasan_populate_vmalloc(addr, size);
	if (ret) {
		free_vmap_area(va);
		return ERR_PTR(ret);
	}

	return va;

overflow:
	if (!purged) {
		purge_vmap_area_lazy();
		purged = 1;
		goto retry;
	}

	freed = 0;
	blocking_notifier_call_chain(&vmap_notify_list, 0, &freed);

	if (freed > 0) {
		purged = 0;
		goto retry;
	}

	if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit())
		pr_warn("vmap allocation for size %lu failed: use vmalloc=<size> to increase size\n",
			size);

	kmem_cache_free(vmap_area_cachep, va);
	return ERR_PTR(-EBUSY);
}

int register_vmap_purge_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_register(&vmap_notify_list, nb);
}
EXPORT_SYMBOL_GPL(register_vmap_purge_notifier);

int unregister_vmap_purge_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_unregister(&vmap_notify_list, nb);
}
EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier);

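/*
 * Illustrative sketch (not part of the original file): a subsystem that can
 * release its own vmalloc()/vmap() usage may register a purge notifier. The
 * chain is invoked from alloc_vmap_area()'s overflow path above with a
 * pointer to a "freed" page counter; reporting progress triggers a retry.
 * The names my_shrink() and my_nb are hypothetical.
 *
 *	static int my_vmap_notify(struct notifier_block *nb,
 *				  unsigned long unused, void *ptr)
 *	{
 *		unsigned long *freed = ptr;
 *
 *		*freed += my_shrink();
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block my_nb = {
 *		.notifier_call = my_vmap_notify,
 *	};
 *
 *	register_vmap_purge_notifier(&my_nb);
 */
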
/*
 * lazy_max_pages is the maximum amount of virtual address space we gather up
 * before attempting to purge with a TLB flush.
 *
 * There is a tradeoff here: a larger number will cover more kernel page tables
 * and take slightly longer to purge, but it will linearly reduce the number of
 * global TLB flushes that must be performed. It would seem natural to scale
 * this number up linearly with the number of CPUs (because vmapping activity
 * could also scale linearly with the number of CPUs), however it is likely
 * that in practice, workloads might be constrained in other ways that mean
 * vmap activity will not scale linearly with CPUs. Also, I want to be
 * conservative and not introduce a big latency on huge systems, so go with
 * a less aggressive log scale. It will still be an improvement over the old
 * code, and it will be simple to change the scale factor if we find that it
 * becomes a problem on bigger systems.
 */
static unsigned long lazy_max_pages(void)
{
	unsigned int log;

	log = fls(num_online_cpus());

	return log * (32UL * 1024 * 1024 / PAGE_SIZE);
}

static atomic_long_t vmap_lazy_nr = ATOMIC_LONG_INIT(0);

/*
 * Serialize vmap purging. There is no actual critical section protected
 * by this lock, but we want to avoid concurrent calls for performance
 * reasons and to make the pcpu_get_vm_areas more deterministic.
 */
static DEFINE_MUTEX(vmap_purge_lock);

/* for per-CPU blocks */
static void purge_fragmented_blocks_allcpus(void);

#ifdef CONFIG_X86_64
/*
 * called before a call to iounmap() if the caller wants vm_area_struct's
 * immediately freed.
 */
void set_iounmap_nonlazy(void)
{
	atomic_long_set(&vmap_lazy_nr, lazy_max_pages()+1);
}
#endif /* CONFIG_X86_64 */

/*
 * Purges all lazily-freed vmap areas.
 */
static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
{
	unsigned long resched_threshold;
	struct list_head local_pure_list;
	struct vmap_area *va, *n_va;

	lockdep_assert_held(&vmap_purge_lock);

	spin_lock(&purge_vmap_area_lock);
	purge_vmap_area_root = RB_ROOT;
	list_replace_init(&purge_vmap_area_list, &local_pure_list);
	spin_unlock(&purge_vmap_area_lock);

	if (unlikely(list_empty(&local_pure_list)))
		return false;

	start = min(start,
		list_first_entry(&local_pure_list,
			struct vmap_area, list)->va_start);

	end = max(end,
		list_last_entry(&local_pure_list,
			struct vmap_area, list)->va_end);

	flush_tlb_kernel_range(start, end);
	resched_threshold = lazy_max_pages() << 1;

	spin_lock(&free_vmap_area_lock);
	list_for_each_entry_safe(va, n_va, &local_pure_list, list) {
		unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
		unsigned long orig_start = va->va_start;
		unsigned long orig_end = va->va_end;

		/*
		 * Finally insert or merge lazily-freed area. It is
		 * detached and there is no need to "unlink" it from
		 * anything.
		 */
		va = merge_or_add_vmap_area_augment(va, &free_vmap_area_root,
					&free_vmap_area_list);

		if (!va)
			continue;

		if (is_vmalloc_or_module_addr((void *)orig_start))
			kasan_release_vmalloc(orig_start, orig_end,
					      va->va_start, va->va_end);

		atomic_long_sub(nr, &vmap_lazy_nr);

		if (atomic_long_read(&vmap_lazy_nr) < resched_threshold)
			cond_resched_lock(&free_vmap_area_lock);
	}
	spin_unlock(&free_vmap_area_lock);
	return true;
}

/*
 * Kick off a purge of the outstanding lazy areas. Don't bother if somebody
 * is already purging.
 */
static void try_purge_vmap_area_lazy(void)
{
	if (mutex_trylock(&vmap_purge_lock)) {
		__purge_vmap_area_lazy(ULONG_MAX, 0);
		mutex_unlock(&vmap_purge_lock);
	}
}

/*
 * Kick off a purge of the outstanding lazy areas.
 */
static void purge_vmap_area_lazy(void)
{
	mutex_lock(&vmap_purge_lock);
	purge_fragmented_blocks_allcpus();
	__purge_vmap_area_lazy(ULONG_MAX, 0);
	mutex_unlock(&vmap_purge_lock);
}

/*
 * Free a vmap area, caller ensuring that the area has been unmapped
 * and flush_cache_vunmap had been called for the correct range
 * previously.
 */
static void free_vmap_area_noflush(struct vmap_area *va)
{
	unsigned long nr_lazy;

	spin_lock(&vmap_area_lock);
	unlink_va(va, &vmap_area_root);
	spin_unlock(&vmap_area_lock);

	nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >>
				PAGE_SHIFT, &vmap_lazy_nr);

	/*
	 * Merge or place it to the purge tree/list.
	 */
	spin_lock(&purge_vmap_area_lock);
	merge_or_add_vmap_area(va,
		&purge_vmap_area_root, &purge_vmap_area_list);
	spin_unlock(&purge_vmap_area_lock);

	/* After this point, we may free va at any time */
	if (unlikely(nr_lazy > lazy_max_pages()))
		try_purge_vmap_area_lazy();
}

/*
 * Free and unmap a vmap area
 */
static void free_unmap_vmap_area(struct vmap_area *va)
{
	flush_cache_vunmap(va->va_start, va->va_end);
	vunmap_range_noflush(va->va_start, va->va_end);
	if (debug_pagealloc_enabled_static())
		flush_tlb_kernel_range(va->va_start, va->va_end);

	free_vmap_area_noflush(va);
}

static struct vmap_area *find_vmap_area(unsigned long addr)
{
	struct vmap_area *va;

	spin_lock(&vmap_area_lock);
	va = __find_vmap_area(addr);
	spin_unlock(&vmap_area_lock);

	return va;
}

/*** Per cpu kva allocator ***/

/*
 * vmap space is limited especially on 32 bit architectures. Ensure there is
 * room for at least 16 percpu vmap blocks per CPU.
 */
/*
 * If we had a constant VMALLOC_START and VMALLOC_END, we'd like to be able
 * to #define VMALLOC_SPACE		(VMALLOC_END-VMALLOC_START). Guess
 * instead (we just need a rough idea)
 */
#if BITS_PER_LONG == 32
#define VMALLOC_SPACE		(128UL*1024*1024)
#else
#define VMALLOC_SPACE		(128UL*1024*1024*1024)
#endif

#define VMALLOC_PAGES		(VMALLOC_SPACE / PAGE_SIZE)
#define VMAP_MAX_ALLOC		BITS_PER_LONG	/* 256K with 4K pages */
#define VMAP_BBMAP_BITS_MAX	1024	/* 4MB with 4K pages */
#define VMAP_BBMAP_BITS_MIN	(VMAP_MAX_ALLOC*2)
#define VMAP_MIN(x, y)		((x) < (y) ? (x) : (y)) /* can't use min() */
#define VMAP_MAX(x, y)		((x) > (y) ? (x) : (y)) /* can't use max() */
#define VMAP_BBMAP_BITS		\
		VMAP_MIN(VMAP_BBMAP_BITS_MAX,	\
		VMAP_MAX(VMAP_BBMAP_BITS_MIN,	\
			VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16))

#define VMAP_BLOCK_SIZE		(VMAP_BBMAP_BITS * PAGE_SIZE)

struct vmap_block_queue {
	spinlock_t lock;
	struct list_head free;
};

struct vmap_block {
	spinlock_t lock;
	struct vmap_area *va;
	unsigned long free, dirty;
	unsigned long dirty_min, dirty_max; /*< dirty range */
	struct list_head free_list;
	struct rcu_head rcu_head;
	struct list_head purge;
};

/* Queue of free and dirty vmap blocks, for allocation and flushing purposes */
static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue);

/*
 * XArray of vmap blocks, indexed by address, to quickly find a vmap block
 * in the free path. Could get rid of this if we change the API to return a
 * "cookie" from alloc, to be passed to free. But no big deal yet.
 */
static DEFINE_XARRAY(vmap_blocks);

/*
 * We should probably have a fallback mechanism to allocate virtual memory
 * out of partially filled vmap blocks. However vmap block sizing should be
 * fairly reasonable according to the vmalloc size, so it shouldn't be a
 * big problem.
 */

static unsigned long addr_to_vb_idx(unsigned long addr)
{
	addr -= VMALLOC_START & ~(VMAP_BLOCK_SIZE-1);
	addr /= VMAP_BLOCK_SIZE;
	return addr;
}

static void *vmap_block_vaddr(unsigned long va_start, unsigned long pages_off)
{
	unsigned long addr;

	addr = va_start + (pages_off << PAGE_SHIFT);
	BUG_ON(addr_to_vb_idx(addr) != addr_to_vb_idx(va_start));
	return (void *)addr;
}

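/*
 * Worked example (illustrative, not part of the original file), assuming 4K
 * pages and a 64-bit kernel: VMAP_MAX_ALLOC is 64 pages (256KB), so
 * VMAP_BBMAP_BITS_MIN is 128 and VMAP_BBMAP_BITS_MAX is 1024. The per-CPU
 * share VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16 is clamped into
 * [128, 1024], so each vmap block covers between 512KB and 4MB of KVA.
 */
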
/**
 * new_vmap_block - allocates new vmap_block and occupies 2^order pages in this
 *                  block. Of course pages number can't exceed VMAP_BBMAP_BITS
 * @order: how many 2^order pages should be occupied in newly allocated block
 * @gfp_mask: flags for the page level allocator
 *
 * Return: virtual address in a newly allocated block or ERR_PTR(-errno)
 */
static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
{
	struct vmap_block_queue *vbq;
	struct vmap_block *vb;
	struct vmap_area *va;
	unsigned long vb_idx;
	int node, err;
	void *vaddr;

	node = numa_node_id();

	vb = kmalloc_node(sizeof(struct vmap_block),
			gfp_mask & GFP_RECLAIM_MASK, node);
	if (unlikely(!vb))
		return ERR_PTR(-ENOMEM);

	va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
					VMALLOC_START, VMALLOC_END,
					node, gfp_mask);
	if (IS_ERR(va)) {
		kfree(vb);
		return ERR_CAST(va);
	}

	vaddr = vmap_block_vaddr(va->va_start, 0);
	spin_lock_init(&vb->lock);
	vb->va = va;
	/* At least something should be left free */
	BUG_ON(VMAP_BBMAP_BITS <= (1UL << order));
	vb->free = VMAP_BBMAP_BITS - (1UL << order);
	vb->dirty = 0;
	vb->dirty_min = VMAP_BBMAP_BITS;
	vb->dirty_max = 0;
	INIT_LIST_HEAD(&vb->free_list);

	vb_idx = addr_to_vb_idx(va->va_start);
	err = xa_insert(&vmap_blocks, vb_idx, vb, gfp_mask);
	if (err) {
		kfree(vb);
		free_vmap_area(va);
		return ERR_PTR(err);
	}

	vbq = &get_cpu_var(vmap_block_queue);
	spin_lock(&vbq->lock);
	list_add_tail_rcu(&vb->free_list, &vbq->free);
	spin_unlock(&vbq->lock);
	put_cpu_var(vmap_block_queue);

	return vaddr;
}

static void free_vmap_block(struct vmap_block *vb)
{
	struct vmap_block *tmp;

	tmp = xa_erase(&vmap_blocks, addr_to_vb_idx(vb->va->va_start));
	BUG_ON(tmp != vb);

	free_vmap_area_noflush(vb->va);
	kfree_rcu(vb, rcu_head);
}

static void purge_fragmented_blocks(int cpu)
{
	LIST_HEAD(purge);
	struct vmap_block *vb;
	struct vmap_block *n_vb;
	struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);

	rcu_read_lock();
	list_for_each_entry_rcu(vb, &vbq->free, free_list) {

		if (!(vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS))
			continue;

		spin_lock(&vb->lock);
		if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) {
			vb->free = 0; /* prevent further allocs after releasing lock */
			vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */
			vb->dirty_min = 0;
			vb->dirty_max = VMAP_BBMAP_BITS;
			spin_lock(&vbq->lock);
			list_del_rcu(&vb->free_list);
			spin_unlock(&vbq->lock);
			spin_unlock(&vb->lock);
			list_add_tail(&vb->purge, &purge);
		} else
			spin_unlock(&vb->lock);
	}
	rcu_read_unlock();

	list_for_each_entry_safe(vb, n_vb, &purge, purge) {
		list_del(&vb->purge);
		free_vmap_block(vb);
	}
}

static void purge_fragmented_blocks_allcpus(void)
{
	int cpu;

	for_each_possible_cpu(cpu)
		purge_fragmented_blocks(cpu);
}

static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
{
	struct vmap_block_queue *vbq;
	struct vmap_block *vb;
	void *vaddr = NULL;
	unsigned int order;

	BUG_ON(offset_in_page(size));
	BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
	if (WARN_ON(size == 0)) {
		/*
		 * Allocating 0 bytes isn't what caller wants since
		 * get_order(0) returns funny result. Just warn and terminate
		 * early.
		 */
		return NULL;
	}
	order = get_order(size);

	rcu_read_lock();
	vbq = &get_cpu_var(vmap_block_queue);
	list_for_each_entry_rcu(vb, &vbq->free, free_list) {
		unsigned long pages_off;

		spin_lock(&vb->lock);
		if (vb->free < (1UL << order)) {
			spin_unlock(&vb->lock);
			continue;
		}

		pages_off = VMAP_BBMAP_BITS - vb->free;
		vaddr = vmap_block_vaddr(vb->va->va_start, pages_off);
		vb->free -= 1UL << order;
		if (vb->free == 0) {
			spin_lock(&vbq->lock);
			list_del_rcu(&vb->free_list);
			spin_unlock(&vbq->lock);
		}

		spin_unlock(&vb->lock);
		break;
	}

	put_cpu_var(vmap_block_queue);
	rcu_read_unlock();

	/* Allocate new block if nothing was found */
	if (!vaddr)
		vaddr = new_vmap_block(order, gfp_mask);

	return vaddr;
}

static void vb_free(unsigned long addr, unsigned long size)
{
	unsigned long offset;
	unsigned int order;
	struct vmap_block *vb;

	BUG_ON(offset_in_page(size));
	BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);

	flush_cache_vunmap(addr, addr + size);

	order = get_order(size);
	offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT;
	vb = xa_load(&vmap_blocks, addr_to_vb_idx(addr));

	vunmap_range_noflush(addr, addr + size);

	if (debug_pagealloc_enabled_static())
		flush_tlb_kernel_range(addr, addr + size);

	spin_lock(&vb->lock);

	/* Expand dirty range */
	vb->dirty_min = min(vb->dirty_min, offset);
	vb->dirty_max = max(vb->dirty_max, offset + (1UL << order));

	vb->dirty += 1UL << order;
	if (vb->dirty == VMAP_BBMAP_BITS) {
		BUG_ON(vb->free);
		spin_unlock(&vb->lock);
		free_vmap_block(vb);
	} else
		spin_unlock(&vb->lock);
}

static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush)
{
	int cpu;

	if (unlikely(!vmap_initialized))
		return;

	might_sleep();

	for_each_possible_cpu(cpu) {
		struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
		struct vmap_block *vb;

		rcu_read_lock();
		list_for_each_entry_rcu(vb, &vbq->free, free_list) {
			spin_lock(&vb->lock);
			if (vb->dirty && vb->dirty != VMAP_BBMAP_BITS) {
				unsigned long va_start = vb->va->va_start;
				unsigned long s, e;

				s = va_start + (vb->dirty_min << PAGE_SHIFT);
				e = va_start + (vb->dirty_max << PAGE_SHIFT);

				start = min(s, start);
				end = max(e, end);

				flush = 1;
			}
			spin_unlock(&vb->lock);
		}
		rcu_read_unlock();
	}

	mutex_lock(&vmap_purge_lock);
	purge_fragmented_blocks_allcpus();
	if (!__purge_vmap_area_lazy(start, end) && flush)
		flush_tlb_kernel_range(start, end);
	mutex_unlock(&vmap_purge_lock);
}

/**
 * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer
 *
 * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily
 * to amortize TLB flushing overheads. What this means is that any page you
 * have now, may, in a former life, have been mapped into kernel virtual
 * address by the vmap layer and so there might be some CPUs with TLB entries
 * still referencing that page (additional to the regular 1:1 kernel mapping).
 *
 * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can
After it returns, we can 2110 * be sure that none of the pages we have control over will have any aliases 2111 * from the vmap layer. 2112 */ 2113 void vm_unmap_aliases(void) 2114 { 2115 unsigned long start = ULONG_MAX, end = 0; 2116 int flush = 0; 2117 2118 _vm_unmap_aliases(start, end, flush); 2119 } 2120 EXPORT_SYMBOL_GPL(vm_unmap_aliases); 2121 2122 /** 2123 * vm_unmap_ram - unmap linear kernel address space set up by vm_map_ram 2124 * @mem: the pointer returned by vm_map_ram 2125 * @count: the count passed to that vm_map_ram call (cannot unmap partial) 2126 */ 2127 void vm_unmap_ram(const void *mem, unsigned int count) 2128 { 2129 unsigned long size = (unsigned long)count << PAGE_SHIFT; 2130 unsigned long addr = (unsigned long)mem; 2131 struct vmap_area *va; 2132 2133 might_sleep(); 2134 BUG_ON(!addr); 2135 BUG_ON(addr < VMALLOC_START); 2136 BUG_ON(addr > VMALLOC_END); 2137 BUG_ON(!PAGE_ALIGNED(addr)); 2138 2139 kasan_poison_vmalloc(mem, size); 2140 2141 if (likely(count <= VMAP_MAX_ALLOC)) { 2142 debug_check_no_locks_freed(mem, size); 2143 vb_free(addr, size); 2144 return; 2145 } 2146 2147 va = find_vmap_area(addr); 2148 BUG_ON(!va); 2149 debug_check_no_locks_freed((void *)va->va_start, 2150 (va->va_end - va->va_start)); 2151 free_unmap_vmap_area(va); 2152 } 2153 EXPORT_SYMBOL(vm_unmap_ram); 2154 2155 /** 2156 * vm_map_ram - map pages linearly into kernel virtual address (vmalloc space) 2157 * @pages: an array of pointers to the pages to be mapped 2158 * @count: number of pages 2159 * @node: prefer to allocate data structures on this node 2160 * 2161 * If you use this function for less than VMAP_MAX_ALLOC pages, it can be 2162 * faster than vmap(). However, mixing long-lived and short-lived objects 2163 * with vm_map_ram() can consume a lot of address space through 2164 * fragmentation (especially on a 32bit machine), and you may eventually see 2165 * allocation failures. Please use this function only for short-lived objects.
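 *
 * A minimal usage sketch (pages, nr_pages, data and len are hypothetical;
 * error handling is trimmed):
 *
 *	void *va = vm_map_ram(pages, nr_pages, NUMA_NO_NODE);
 *
 *	if (va) {
 *		memcpy(va, data, len);		// temporary linear view of the pages
 *		vm_unmap_ram(va, nr_pages);	// must pass the same page count
 *	}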
2166 * 2167 * Returns: a pointer to the address that has been mapped, or %NULL on failure 2168 */ 2169 void *vm_map_ram(struct page **pages, unsigned int count, int node) 2170 { 2171 unsigned long size = (unsigned long)count << PAGE_SHIFT; 2172 unsigned long addr; 2173 void *mem; 2174 2175 if (likely(count <= VMAP_MAX_ALLOC)) { 2176 mem = vb_alloc(size, GFP_KERNEL); 2177 if (IS_ERR(mem)) 2178 return NULL; 2179 addr = (unsigned long)mem; 2180 } else { 2181 struct vmap_area *va; 2182 va = alloc_vmap_area(size, PAGE_SIZE, 2183 VMALLOC_START, VMALLOC_END, node, GFP_KERNEL); 2184 if (IS_ERR(va)) 2185 return NULL; 2186 2187 addr = va->va_start; 2188 mem = (void *)addr; 2189 } 2190 2191 kasan_unpoison_vmalloc(mem, size); 2192 2193 if (vmap_pages_range(addr, addr + size, PAGE_KERNEL, 2194 pages, PAGE_SHIFT) < 0) { 2195 vm_unmap_ram(mem, count); 2196 return NULL; 2197 } 2198 2199 return mem; 2200 } 2201 EXPORT_SYMBOL(vm_map_ram); 2202 2203 static struct vm_struct *vmlist __initdata; 2204 2205 static inline unsigned int vm_area_page_order(struct vm_struct *vm) 2206 { 2207 #ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC 2208 return vm->page_order; 2209 #else 2210 return 0; 2211 #endif 2212 } 2213 2214 static inline void set_vm_area_page_order(struct vm_struct *vm, unsigned int order) 2215 { 2216 #ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC 2217 vm->page_order = order; 2218 #else 2219 BUG_ON(order != 0); 2220 #endif 2221 } 2222 2223 /** 2224 * vm_area_add_early - add vmap area early during boot 2225 * @vm: vm_struct to add 2226 * 2227 * This function is used to add fixed kernel vm area to vmlist before 2228 * vmalloc_init() is called. @vm->addr, @vm->size, and @vm->flags 2229 * should contain proper values and the other fields should be zero. 2230 * 2231 * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING. 2232 */ 2233 void __init vm_area_add_early(struct vm_struct *vm) 2234 { 2235 struct vm_struct *tmp, **p; 2236 2237 BUG_ON(vmap_initialized); 2238 for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) { 2239 if (tmp->addr >= vm->addr) { 2240 BUG_ON(tmp->addr < vm->addr + vm->size); 2241 break; 2242 } else 2243 BUG_ON(tmp->addr + tmp->size > vm->addr); 2244 } 2245 vm->next = *p; 2246 *p = vm; 2247 } 2248 2249 /** 2250 * vm_area_register_early - register vmap area early during boot 2251 * @vm: vm_struct to register 2252 * @align: requested alignment 2253 * 2254 * This function is used to register kernel vm area before 2255 * vmalloc_init() is called. @vm->size and @vm->flags should contain 2256 * proper values on entry and other fields should be zero. On return, 2257 * vm->addr contains the allocated address. 2258 * 2259 * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING. 
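 * (Typical callers are early per-cpu and arch setup code; once
 * vmalloc_init() has run, use get_vm_area() and friends instead.)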
2260 */ 2261 void __init vm_area_register_early(struct vm_struct *vm, size_t align) 2262 { 2263 static size_t vm_init_off __initdata; 2264 unsigned long addr; 2265 2266 addr = ALIGN(VMALLOC_START + vm_init_off, align); 2267 vm_init_off = PFN_ALIGN(addr + vm->size) - VMALLOC_START; 2268 2269 vm->addr = (void *)addr; 2270 2271 vm_area_add_early(vm); 2272 } 2273 2274 static void vmap_init_free_space(void) 2275 { 2276 unsigned long vmap_start = 1; 2277 const unsigned long vmap_end = ULONG_MAX; 2278 struct vmap_area *busy, *free; 2279 2280 /* 2281 * B F B B B F 2282 * -|-----|.....|-----|-----|-----|.....|- 2283 * | The KVA space | 2284 * |<--------------------------------->| 2285 */ 2286 list_for_each_entry(busy, &vmap_area_list, list) { 2287 if (busy->va_start - vmap_start > 0) { 2288 free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); 2289 if (!WARN_ON_ONCE(!free)) { 2290 free->va_start = vmap_start; 2291 free->va_end = busy->va_start; 2292 2293 insert_vmap_area_augment(free, NULL, 2294 &free_vmap_area_root, 2295 &free_vmap_area_list); 2296 } 2297 } 2298 2299 vmap_start = busy->va_end; 2300 } 2301 2302 if (vmap_end - vmap_start > 0) { 2303 free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); 2304 if (!WARN_ON_ONCE(!free)) { 2305 free->va_start = vmap_start; 2306 free->va_end = vmap_end; 2307 2308 insert_vmap_area_augment(free, NULL, 2309 &free_vmap_area_root, 2310 &free_vmap_area_list); 2311 } 2312 } 2313 } 2314 2315 void __init vmalloc_init(void) 2316 { 2317 struct vmap_area *va; 2318 struct vm_struct *tmp; 2319 int i; 2320 2321 /* 2322 * Create the cache for vmap_area objects. 2323 */ 2324 vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC); 2325 2326 for_each_possible_cpu(i) { 2327 struct vmap_block_queue *vbq; 2328 struct vfree_deferred *p; 2329 2330 vbq = &per_cpu(vmap_block_queue, i); 2331 spin_lock_init(&vbq->lock); 2332 INIT_LIST_HEAD(&vbq->free); 2333 p = &per_cpu(vfree_deferred, i); 2334 init_llist_head(&p->list); 2335 INIT_WORK(&p->wq, free_work); 2336 } 2337 2338 /* Import existing vmlist entries. */ 2339 for (tmp = vmlist; tmp; tmp = tmp->next) { 2340 va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); 2341 if (WARN_ON_ONCE(!va)) 2342 continue; 2343 2344 va->va_start = (unsigned long)tmp->addr; 2345 va->va_end = va->va_start + tmp->size; 2346 va->vm = tmp; 2347 insert_vmap_area(va, &vmap_area_root, &vmap_area_list); 2348 } 2349 2350 /* 2351 * Now we can initialize a free vmap space. 2352 */ 2353 vmap_init_free_space(); 2354 vmap_initialized = true; 2355 } 2356 2357 static inline void setup_vmalloc_vm_locked(struct vm_struct *vm, 2358 struct vmap_area *va, unsigned long flags, const void *caller) 2359 { 2360 vm->flags = flags; 2361 vm->addr = (void *)va->va_start; 2362 vm->size = va->va_end - va->va_start; 2363 vm->caller = caller; 2364 va->vm = vm; 2365 } 2366 2367 static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, 2368 unsigned long flags, const void *caller) 2369 { 2370 spin_lock(&vmap_area_lock); 2371 setup_vmalloc_vm_locked(vm, va, flags, caller); 2372 spin_unlock(&vmap_area_lock); 2373 } 2374 2375 static void clear_vm_uninitialized_flag(struct vm_struct *vm) 2376 { 2377 /* 2378 * Before removing VM_UNINITIALIZED, 2379 * we should make sure that vm has proper values. 2380 * Pair with smp_rmb() in show_numa_info(). 
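 * The barrier orders the earlier stores that initialize the vm_struct
 * (addr, size, nr_pages, pages, caller) before the store clearing the
 * flag, so a reader that sees VM_UNINITIALIZED cleared also sees a fully
 * set up area.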
2381 */ 2382 smp_wmb(); 2383 vm->flags &= ~VM_UNINITIALIZED; 2384 } 2385 2386 static struct vm_struct *__get_vm_area_node(unsigned long size, 2387 unsigned long align, unsigned long shift, unsigned long flags, 2388 unsigned long start, unsigned long end, int node, 2389 gfp_t gfp_mask, const void *caller) 2390 { 2391 struct vmap_area *va; 2392 struct vm_struct *area; 2393 unsigned long requested_size = size; 2394 2395 BUG_ON(in_interrupt()); 2396 size = ALIGN(size, 1ul << shift); 2397 if (unlikely(!size)) 2398 return NULL; 2399 2400 if (flags & VM_IOREMAP) 2401 align = 1ul << clamp_t(int, get_count_order_long(size), 2402 PAGE_SHIFT, IOREMAP_MAX_ORDER); 2403 2404 area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node); 2405 if (unlikely(!area)) 2406 return NULL; 2407 2408 if (!(flags & VM_NO_GUARD)) 2409 size += PAGE_SIZE; 2410 2411 va = alloc_vmap_area(size, align, start, end, node, gfp_mask); 2412 if (IS_ERR(va)) { 2413 kfree(area); 2414 return NULL; 2415 } 2416 2417 kasan_unpoison_vmalloc((void *)va->va_start, requested_size); 2418 2419 setup_vmalloc_vm(area, va, flags, caller); 2420 2421 return area; 2422 } 2423 2424 struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, 2425 unsigned long start, unsigned long end, 2426 const void *caller) 2427 { 2428 return __get_vm_area_node(size, 1, PAGE_SHIFT, flags, start, end, 2429 NUMA_NO_NODE, GFP_KERNEL, caller); 2430 } 2431 2432 /** 2433 * get_vm_area - reserve a contiguous kernel virtual area 2434 * @size: size of the area 2435 * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC 2436 * 2437 * Search an area of @size in the kernel virtual mapping area, 2438 * and reserved it for out purposes. Returns the area descriptor 2439 * on success or %NULL on failure. 2440 * 2441 * Return: the area descriptor on success or %NULL on failure. 2442 */ 2443 struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) 2444 { 2445 return __get_vm_area_node(size, 1, PAGE_SHIFT, flags, 2446 VMALLOC_START, VMALLOC_END, 2447 NUMA_NO_NODE, GFP_KERNEL, 2448 __builtin_return_address(0)); 2449 } 2450 2451 struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, 2452 const void *caller) 2453 { 2454 return __get_vm_area_node(size, 1, PAGE_SHIFT, flags, 2455 VMALLOC_START, VMALLOC_END, 2456 NUMA_NO_NODE, GFP_KERNEL, caller); 2457 } 2458 2459 /** 2460 * find_vm_area - find a continuous kernel virtual area 2461 * @addr: base address 2462 * 2463 * Search for the kernel VM area starting at @addr, and return it. 2464 * It is up to the caller to do all required locking to keep the returned 2465 * pointer valid. 2466 * 2467 * Return: the area descriptor on success or %NULL on failure. 2468 */ 2469 struct vm_struct *find_vm_area(const void *addr) 2470 { 2471 struct vmap_area *va; 2472 2473 va = find_vmap_area((unsigned long)addr); 2474 if (!va) 2475 return NULL; 2476 2477 return va->vm; 2478 } 2479 2480 /** 2481 * remove_vm_area - find and remove a continuous kernel virtual area 2482 * @addr: base address 2483 * 2484 * Search for the kernel VM area starting at @addr, and remove it. 2485 * This function returns the found VM area, but using it is NOT safe 2486 * on SMP machines, except for its size or flags. 2487 * 2488 * Return: the area descriptor on success or %NULL on failure. 
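 *
 * Note: on success the caller also becomes responsible for the returned
 * vm_struct itself and is expected to kfree() it when done; see
 * free_vm_area() for the usual remove-and-free pattern.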
2489 */ 2490 struct vm_struct *remove_vm_area(const void *addr) 2491 { 2492 struct vmap_area *va; 2493 2494 might_sleep(); 2495 2496 spin_lock(&vmap_area_lock); 2497 va = __find_vmap_area((unsigned long)addr); 2498 if (va && va->vm) { 2499 struct vm_struct *vm = va->vm; 2500 2501 va->vm = NULL; 2502 spin_unlock(&vmap_area_lock); 2503 2504 kasan_free_shadow(vm); 2505 free_unmap_vmap_area(va); 2506 2507 return vm; 2508 } 2509 2510 spin_unlock(&vmap_area_lock); 2511 return NULL; 2512 } 2513 2514 static inline void set_area_direct_map(const struct vm_struct *area, 2515 int (*set_direct_map)(struct page *page)) 2516 { 2517 int i; 2518 2519 /* HUGE_VMALLOC passes small pages to set_direct_map */ 2520 for (i = 0; i < area->nr_pages; i++) 2521 if (page_address(area->pages[i])) 2522 set_direct_map(area->pages[i]); 2523 } 2524 2525 /* Handle removing and resetting vm mappings related to the vm_struct. */ 2526 static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages) 2527 { 2528 unsigned long start = ULONG_MAX, end = 0; 2529 unsigned int page_order = vm_area_page_order(area); 2530 int flush_reset = area->flags & VM_FLUSH_RESET_PERMS; 2531 int flush_dmap = 0; 2532 int i; 2533 2534 remove_vm_area(area->addr); 2535 2536 /* If this is not VM_FLUSH_RESET_PERMS memory, no need for the below. */ 2537 if (!flush_reset) 2538 return; 2539 2540 /* 2541 * If not deallocating pages, just do the flush of the VM area and 2542 * return. 2543 */ 2544 if (!deallocate_pages) { 2545 vm_unmap_aliases(); 2546 return; 2547 } 2548 2549 /* 2550 * If execution gets here, flush the vm mapping and reset the direct 2551 * map. Find the start and end range of the direct mappings to make sure 2552 * the vm_unmap_aliases() flush includes the direct map. 2553 */ 2554 for (i = 0; i < area->nr_pages; i += 1U << page_order) { 2555 unsigned long addr = (unsigned long)page_address(area->pages[i]); 2556 if (addr) { 2557 unsigned long page_size; 2558 2559 page_size = PAGE_SIZE << page_order; 2560 start = min(addr, start); 2561 end = max(addr + page_size, end); 2562 flush_dmap = 1; 2563 } 2564 } 2565 2566 /* 2567 * Set direct map to something invalid so that it won't be cached if 2568 * there are any accesses after the TLB flush, then flush the TLB and 2569 * reset the direct map permissions to the default. 
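 * The two set_area_direct_map() calls below only rewrite the linear-map
 * PTEs without flushing; the single TLB flush issued by _vm_unmap_aliases()
 * in between covers both the vmalloc alias and the direct-map range
 * computed above.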
2570 */ 2571 set_area_direct_map(area, set_direct_map_invalid_noflush); 2572 _vm_unmap_aliases(start, end, flush_dmap); 2573 set_area_direct_map(area, set_direct_map_default_noflush); 2574 } 2575 2576 static void __vunmap(const void *addr, int deallocate_pages) 2577 { 2578 struct vm_struct *area; 2579 2580 if (!addr) 2581 return; 2582 2583 if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n", 2584 addr)) 2585 return; 2586 2587 area = find_vm_area(addr); 2588 if (unlikely(!area)) { 2589 WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", 2590 addr); 2591 return; 2592 } 2593 2594 debug_check_no_locks_freed(area->addr, get_vm_area_size(area)); 2595 debug_check_no_obj_freed(area->addr, get_vm_area_size(area)); 2596 2597 kasan_poison_vmalloc(area->addr, get_vm_area_size(area)); 2598 2599 vm_remove_mappings(area, deallocate_pages); 2600 2601 if (deallocate_pages) { 2602 unsigned int page_order = vm_area_page_order(area); 2603 int i; 2604 2605 for (i = 0; i < area->nr_pages; i += 1U << page_order) { 2606 struct page *page = area->pages[i]; 2607 2608 BUG_ON(!page); 2609 __free_pages(page, page_order); 2610 cond_resched(); 2611 } 2612 atomic_long_sub(area->nr_pages, &nr_vmalloc_pages); 2613 2614 kvfree(area->pages); 2615 } 2616 2617 kfree(area); 2618 } 2619 2620 static inline void __vfree_deferred(const void *addr) 2621 { 2622 /* 2623 * Use raw_cpu_ptr() because this can be called from preemptible 2624 * context. Preemption is absolutely fine here, because the llist_add() 2625 * implementation is lockless, so it works even if we are adding to 2626 * another cpu's list. schedule_work() should be fine with this too. 2627 */ 2628 struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred); 2629 2630 if (llist_add((struct llist_node *)addr, &p->list)) 2631 schedule_work(&p->wq); 2632 } 2633 2634 /** 2635 * vfree_atomic - release memory allocated by vmalloc() 2636 * @addr: memory base address 2637 * 2638 * This one is just like vfree() but can be called in any atomic context 2639 * except NMIs. 2640 */ 2641 void vfree_atomic(const void *addr) 2642 { 2643 BUG_ON(in_nmi()); 2644 2645 kmemleak_free(addr); 2646 2647 if (!addr) 2648 return; 2649 __vfree_deferred(addr); 2650 } 2651 2652 static void __vfree(const void *addr) 2653 { 2654 if (unlikely(in_interrupt())) 2655 __vfree_deferred(addr); 2656 else 2657 __vunmap(addr, 1); 2658 } 2659 2660 /** 2661 * vfree - Release memory allocated by vmalloc() 2662 * @addr: Memory base address 2663 * 2664 * Free the virtually continuous memory area starting at @addr, as obtained 2665 * from one of the vmalloc() family of APIs. This will usually also free the 2666 * physical memory underlying the virtual allocation, but that memory is 2667 * reference counted, so it will not be freed until the last user goes away. 2668 * 2669 * If @addr is NULL, no operation is performed. 2670 * 2671 * Context: 2672 * May sleep if called *not* from interrupt context. 2673 * Must not be called in NMI context (strictly speaking, it could be 2674 * if we have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling 2675 * conventions for vfree() arch-dependent would be a really bad idea). 
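 *
 * A trivial sketch of a teardown path (my_ctx is hypothetical); no NULL
 * check is needed, and callers in atomic context should use vfree_atomic()
 * instead:
 *
 *	vfree(my_ctx->buf);
 *	my_ctx->buf = NULL;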
2676 */ 2677 void vfree(const void *addr) 2678 { 2679 BUG_ON(in_nmi()); 2680 2681 kmemleak_free(addr); 2682 2683 might_sleep_if(!in_interrupt()); 2684 2685 if (!addr) 2686 return; 2687 2688 __vfree(addr); 2689 } 2690 EXPORT_SYMBOL(vfree); 2691 2692 /** 2693 * vunmap - release virtual mapping obtained by vmap() 2694 * @addr: memory base address 2695 * 2696 * Free the virtually contiguous memory area starting at @addr, 2697 * which was created from the page array passed to vmap(). 2698 * 2699 * Must not be called in interrupt context. 2700 */ 2701 void vunmap(const void *addr) 2702 { 2703 BUG_ON(in_interrupt()); 2704 might_sleep(); 2705 if (addr) 2706 __vunmap(addr, 0); 2707 } 2708 EXPORT_SYMBOL(vunmap); 2709 2710 /** 2711 * vmap - map an array of pages into virtually contiguous space 2712 * @pages: array of page pointers 2713 * @count: number of pages to map 2714 * @flags: vm_area->flags 2715 * @prot: page protection for the mapping 2716 * 2717 * Maps @count pages from @pages into contiguous kernel virtual space. 2718 * If @flags contains %VM_MAP_PUT_PAGES the ownership of the pages array itself 2719 * (which must be kmalloc or vmalloc memory) and one reference per pages in it 2720 * are transferred from the caller to vmap(), and will be freed / dropped when 2721 * vfree() is called on the return value. 2722 * 2723 * Return: the address of the area or %NULL on failure 2724 */ 2725 void *vmap(struct page **pages, unsigned int count, 2726 unsigned long flags, pgprot_t prot) 2727 { 2728 struct vm_struct *area; 2729 unsigned long addr; 2730 unsigned long size; /* In bytes */ 2731 2732 might_sleep(); 2733 2734 if (count > totalram_pages()) 2735 return NULL; 2736 2737 size = (unsigned long)count << PAGE_SHIFT; 2738 area = get_vm_area_caller(size, flags, __builtin_return_address(0)); 2739 if (!area) 2740 return NULL; 2741 2742 addr = (unsigned long)area->addr; 2743 if (vmap_pages_range(addr, addr + size, pgprot_nx(prot), 2744 pages, PAGE_SHIFT) < 0) { 2745 vunmap(area->addr); 2746 return NULL; 2747 } 2748 2749 if (flags & VM_MAP_PUT_PAGES) { 2750 area->pages = pages; 2751 area->nr_pages = count; 2752 } 2753 return area->addr; 2754 } 2755 EXPORT_SYMBOL(vmap); 2756 2757 #ifdef CONFIG_VMAP_PFN 2758 struct vmap_pfn_data { 2759 unsigned long *pfns; 2760 pgprot_t prot; 2761 unsigned int idx; 2762 }; 2763 2764 static int vmap_pfn_apply(pte_t *pte, unsigned long addr, void *private) 2765 { 2766 struct vmap_pfn_data *data = private; 2767 2768 if (WARN_ON_ONCE(pfn_valid(data->pfns[data->idx]))) 2769 return -EINVAL; 2770 *pte = pte_mkspecial(pfn_pte(data->pfns[data->idx++], data->prot)); 2771 return 0; 2772 } 2773 2774 /** 2775 * vmap_pfn - map an array of PFNs into virtually contiguous space 2776 * @pfns: array of PFNs 2777 * @count: number of pages to map 2778 * @prot: page protection for the mapping 2779 * 2780 * Maps @count PFNs from @pfns into contiguous kernel virtual space and returns 2781 * the start address of the mapping. 
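 *
 * The PFNs must not be backed by struct pages; the helper is intended for
 * PFNMAP ranges such as device memory. A hedged sketch, where bar_pfn and
 * nr describe a hypothetical MMIO region and error handling is trimmed:
 *
 *	unsigned long *pfns = kmalloc_array(nr, sizeof(*pfns), GFP_KERNEL);
 *	unsigned int i;
 *	void *va;
 *
 *	for (i = 0; i < nr; i++)
 *		pfns[i] = bar_pfn + i;
 *	va = vmap_pfn(pfns, nr, pgprot_noncached(PAGE_KERNEL));
 *	kfree(pfns);	// the PFN array is not retained by vmap_pfn()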
2782 */ 2783 void *vmap_pfn(unsigned long *pfns, unsigned int count, pgprot_t prot) 2784 { 2785 struct vmap_pfn_data data = { .pfns = pfns, .prot = pgprot_nx(prot) }; 2786 struct vm_struct *area; 2787 2788 area = get_vm_area_caller(count * PAGE_SIZE, VM_IOREMAP, 2789 __builtin_return_address(0)); 2790 if (!area) 2791 return NULL; 2792 if (apply_to_page_range(&init_mm, (unsigned long)area->addr, 2793 count * PAGE_SIZE, vmap_pfn_apply, &data)) { 2794 free_vm_area(area); 2795 return NULL; 2796 } 2797 return area->addr; 2798 } 2799 EXPORT_SYMBOL_GPL(vmap_pfn); 2800 #endif /* CONFIG_VMAP_PFN */ 2801 2802 static inline unsigned int 2803 vm_area_alloc_pages(gfp_t gfp, int nid, 2804 unsigned int order, unsigned int nr_pages, struct page **pages) 2805 { 2806 unsigned int nr_allocated = 0; 2807 2808 /* 2809 * For order-0 pages we make use of bulk allocator, if 2810 * the page array is partly or not at all populated due 2811 * to fails, fallback to a single page allocator that is 2812 * more permissive. 2813 */ 2814 if (!order) { 2815 while (nr_allocated < nr_pages) { 2816 unsigned int nr, nr_pages_request; 2817 2818 /* 2819 * A maximum allowed request is hard-coded and is 100 2820 * pages per call. That is done in order to prevent a 2821 * long preemption off scenario in the bulk-allocator 2822 * so the range is [1:100]. 2823 */ 2824 nr_pages_request = min(100U, nr_pages - nr_allocated); 2825 2826 nr = alloc_pages_bulk_array_node(gfp, nid, 2827 nr_pages_request, pages + nr_allocated); 2828 2829 nr_allocated += nr; 2830 cond_resched(); 2831 2832 /* 2833 * If zero or pages were obtained partly, 2834 * fallback to a single page allocator. 2835 */ 2836 if (nr != nr_pages_request) 2837 break; 2838 } 2839 } else 2840 /* 2841 * Compound pages required for remap_vmalloc_page if 2842 * high-order pages. 2843 */ 2844 gfp |= __GFP_COMP; 2845 2846 /* High-order pages or fallback path if "bulk" fails. */ 2847 while (nr_allocated < nr_pages) { 2848 struct page *page; 2849 int i; 2850 2851 page = alloc_pages_node(nid, gfp, order); 2852 if (unlikely(!page)) 2853 break; 2854 2855 /* 2856 * Careful, we allocate and map page-order pages, but 2857 * tracking is done per PAGE_SIZE page so as to keep the 2858 * vm_struct APIs independent of the physical/mapped size. 2859 */ 2860 for (i = 0; i < (1U << order); i++) 2861 pages[nr_allocated + i] = page + i; 2862 2863 cond_resched(); 2864 nr_allocated += 1U << order; 2865 } 2866 2867 return nr_allocated; 2868 } 2869 2870 static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, 2871 pgprot_t prot, unsigned int page_shift, 2872 int node) 2873 { 2874 const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; 2875 unsigned long addr = (unsigned long)area->addr; 2876 unsigned long size = get_vm_area_size(area); 2877 unsigned long array_size; 2878 unsigned int nr_small_pages = size >> PAGE_SHIFT; 2879 unsigned int page_order; 2880 2881 array_size = (unsigned long)nr_small_pages * sizeof(struct page *); 2882 gfp_mask |= __GFP_NOWARN; 2883 if (!(gfp_mask & (GFP_DMA | GFP_DMA32))) 2884 gfp_mask |= __GFP_HIGHMEM; 2885 2886 /* Please note that the recursion is strictly bounded. 
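 * The nested __vmalloc_node() call only has to hold the page array, which
 * is smaller than the original request by a factor of roughly
 * PAGE_SIZE / sizeof(struct page *), so the nesting terminates after a
 * couple of levels.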
*/ 2887 if (array_size > PAGE_SIZE) { 2888 area->pages = __vmalloc_node(array_size, 1, nested_gfp, node, 2889 area->caller); 2890 } else { 2891 area->pages = kmalloc_node(array_size, nested_gfp, node); 2892 } 2893 2894 if (!area->pages) { 2895 warn_alloc(gfp_mask, NULL, 2896 "vmalloc error: size %lu, failed to allocated page array size %lu", 2897 nr_small_pages * PAGE_SIZE, array_size); 2898 free_vm_area(area); 2899 return NULL; 2900 } 2901 2902 set_vm_area_page_order(area, page_shift - PAGE_SHIFT); 2903 page_order = vm_area_page_order(area); 2904 2905 area->nr_pages = vm_area_alloc_pages(gfp_mask, node, 2906 page_order, nr_small_pages, area->pages); 2907 2908 atomic_long_add(area->nr_pages, &nr_vmalloc_pages); 2909 2910 /* 2911 * If not enough pages were obtained to accomplish an 2912 * allocation request, free them via __vfree() if any. 2913 */ 2914 if (area->nr_pages != nr_small_pages) { 2915 warn_alloc(gfp_mask, NULL, 2916 "vmalloc error: size %lu, page order %u, failed to allocate pages", 2917 area->nr_pages * PAGE_SIZE, page_order); 2918 goto fail; 2919 } 2920 2921 if (vmap_pages_range(addr, addr + size, prot, area->pages, 2922 page_shift) < 0) { 2923 warn_alloc(gfp_mask, NULL, 2924 "vmalloc error: size %lu, failed to map pages", 2925 area->nr_pages * PAGE_SIZE); 2926 goto fail; 2927 } 2928 2929 return area->addr; 2930 2931 fail: 2932 __vfree(area->addr); 2933 return NULL; 2934 } 2935 2936 /** 2937 * __vmalloc_node_range - allocate virtually contiguous memory 2938 * @size: allocation size 2939 * @align: desired alignment 2940 * @start: vm area range start 2941 * @end: vm area range end 2942 * @gfp_mask: flags for the page level allocator 2943 * @prot: protection mask for the allocated pages 2944 * @vm_flags: additional vm area flags (e.g. %VM_NO_GUARD) 2945 * @node: node to use for allocation or NUMA_NO_NODE 2946 * @caller: caller's return address 2947 * 2948 * Allocate enough pages to cover @size from the page level 2949 * allocator with @gfp_mask flags. Map them into contiguous 2950 * kernel virtual space, using a pagetable protection of @prot. 2951 * 2952 * Return: the address of the area or %NULL on failure 2953 */ 2954 void *__vmalloc_node_range(unsigned long size, unsigned long align, 2955 unsigned long start, unsigned long end, gfp_t gfp_mask, 2956 pgprot_t prot, unsigned long vm_flags, int node, 2957 const void *caller) 2958 { 2959 struct vm_struct *area; 2960 void *addr; 2961 unsigned long real_size = size; 2962 unsigned long real_align = align; 2963 unsigned int shift = PAGE_SHIFT; 2964 2965 if (WARN_ON_ONCE(!size)) 2966 return NULL; 2967 2968 if ((size >> PAGE_SHIFT) > totalram_pages()) { 2969 warn_alloc(gfp_mask, NULL, 2970 "vmalloc error: size %lu, exceeds total pages", 2971 real_size); 2972 return NULL; 2973 } 2974 2975 if (vmap_allow_huge && !(vm_flags & VM_NO_HUGE_VMAP)) { 2976 unsigned long size_per_node; 2977 2978 /* 2979 * Try huge pages. Only try for PAGE_KERNEL allocations, 2980 * others like modules don't yet expect huge pages in 2981 * their allocations due to apply_to_page_range not 2982 * supporting them. 
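 * When no node is specified the request is assumed to be spread across
 * all online nodes, so the per-node share (rather than the total size)
 * decides whether a PMD-sized mapping is worth attempting.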
2983 */ 2984 2985 size_per_node = size; 2986 if (node == NUMA_NO_NODE) 2987 size_per_node /= num_online_nodes(); 2988 if (arch_vmap_pmd_supported(prot) && size_per_node >= PMD_SIZE) 2989 shift = PMD_SHIFT; 2990 else 2991 shift = arch_vmap_pte_supported_shift(size_per_node); 2992 2993 align = max(real_align, 1UL << shift); 2994 size = ALIGN(real_size, 1UL << shift); 2995 } 2996 2997 again: 2998 area = __get_vm_area_node(real_size, align, shift, VM_ALLOC | 2999 VM_UNINITIALIZED | vm_flags, start, end, node, 3000 gfp_mask, caller); 3001 if (!area) { 3002 warn_alloc(gfp_mask, NULL, 3003 "vmalloc error: size %lu, vm_struct allocation failed", 3004 real_size); 3005 goto fail; 3006 } 3007 3008 addr = __vmalloc_area_node(area, gfp_mask, prot, shift, node); 3009 if (!addr) 3010 goto fail; 3011 3012 /* 3013 * In this function, newly allocated vm_struct has VM_UNINITIALIZED 3014 * flag. It means that vm_struct is not fully initialized. 3015 * Now, it is fully initialized, so remove this flag here. 3016 */ 3017 clear_vm_uninitialized_flag(area); 3018 3019 size = PAGE_ALIGN(size); 3020 kmemleak_vmalloc(area, size, gfp_mask); 3021 3022 return addr; 3023 3024 fail: 3025 if (shift > PAGE_SHIFT) { 3026 shift = PAGE_SHIFT; 3027 align = real_align; 3028 size = real_size; 3029 goto again; 3030 } 3031 3032 return NULL; 3033 } 3034 3035 /** 3036 * __vmalloc_node - allocate virtually contiguous memory 3037 * @size: allocation size 3038 * @align: desired alignment 3039 * @gfp_mask: flags for the page level allocator 3040 * @node: node to use for allocation or NUMA_NO_NODE 3041 * @caller: caller's return address 3042 * 3043 * Allocate enough pages to cover @size from the page level allocator with 3044 * @gfp_mask flags. Map them into contiguous kernel virtual space. 3045 * 3046 * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL 3047 * and __GFP_NOFAIL are not supported 3048 * 3049 * Any use of gfp flags outside of GFP_KERNEL should be consulted 3050 * with mm people. 3051 * 3052 * Return: pointer to the allocated memory or %NULL on error 3053 */ 3054 void *__vmalloc_node(unsigned long size, unsigned long align, 3055 gfp_t gfp_mask, int node, const void *caller) 3056 { 3057 return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, 3058 gfp_mask, PAGE_KERNEL, 0, node, caller); 3059 } 3060 /* 3061 * This is only for performance analysis of vmalloc and stress purpose. 3062 * It is required by vmalloc test module, therefore do not use it other 3063 * than that. 3064 */ 3065 #ifdef CONFIG_TEST_VMALLOC_MODULE 3066 EXPORT_SYMBOL_GPL(__vmalloc_node); 3067 #endif 3068 3069 void *__vmalloc(unsigned long size, gfp_t gfp_mask) 3070 { 3071 return __vmalloc_node(size, 1, gfp_mask, NUMA_NO_NODE, 3072 __builtin_return_address(0)); 3073 } 3074 EXPORT_SYMBOL(__vmalloc); 3075 3076 /** 3077 * vmalloc - allocate virtually contiguous memory 3078 * @size: allocation size 3079 * 3080 * Allocate enough pages to cover @size from the page level 3081 * allocator and map them into contiguous kernel virtual space. 3082 * 3083 * For tight control over page level allocator and protection flags 3084 * use __vmalloc() instead. 
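 *
 * A minimal usage sketch (nbytes is hypothetical):
 *
 *	u8 *buf = vmalloc(nbytes);
 *
 *	if (!buf)
 *		return -ENOMEM;
 *	...
 *	vfree(buf);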
3085 * 3086 * Return: pointer to the allocated memory or %NULL on error 3087 */ 3088 void *vmalloc(unsigned long size) 3089 { 3090 return __vmalloc_node(size, 1, GFP_KERNEL, NUMA_NO_NODE, 3091 __builtin_return_address(0)); 3092 } 3093 EXPORT_SYMBOL(vmalloc); 3094 3095 /** 3096 * vmalloc_no_huge - allocate virtually contiguous memory using small pages 3097 * @size: allocation size 3098 * 3099 * Allocate enough non-huge pages to cover @size from the page level 3100 * allocator and map them into contiguous kernel virtual space. 3101 * 3102 * Return: pointer to the allocated memory or %NULL on error 3103 */ 3104 void *vmalloc_no_huge(unsigned long size) 3105 { 3106 return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, 3107 GFP_KERNEL, PAGE_KERNEL, VM_NO_HUGE_VMAP, 3108 NUMA_NO_NODE, __builtin_return_address(0)); 3109 } 3110 EXPORT_SYMBOL(vmalloc_no_huge); 3111 3112 /** 3113 * vzalloc - allocate virtually contiguous memory with zero fill 3114 * @size: allocation size 3115 * 3116 * Allocate enough pages to cover @size from the page level 3117 * allocator and map them into contiguous kernel virtual space. 3118 * The memory allocated is set to zero. 3119 * 3120 * For tight control over page level allocator and protection flags 3121 * use __vmalloc() instead. 3122 * 3123 * Return: pointer to the allocated memory or %NULL on error 3124 */ 3125 void *vzalloc(unsigned long size) 3126 { 3127 return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE, 3128 __builtin_return_address(0)); 3129 } 3130 EXPORT_SYMBOL(vzalloc); 3131 3132 /** 3133 * vmalloc_user - allocate zeroed virtually contiguous memory for userspace 3134 * @size: allocation size 3135 * 3136 * The resulting memory area is zeroed so it can be mapped to userspace 3137 * without leaking data. 3138 * 3139 * Return: pointer to the allocated memory or %NULL on error 3140 */ 3141 void *vmalloc_user(unsigned long size) 3142 { 3143 return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, 3144 GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL, 3145 VM_USERMAP, NUMA_NO_NODE, 3146 __builtin_return_address(0)); 3147 } 3148 EXPORT_SYMBOL(vmalloc_user); 3149 3150 /** 3151 * vmalloc_node - allocate memory on a specific node 3152 * @size: allocation size 3153 * @node: numa node 3154 * 3155 * Allocate enough pages to cover @size from the page level 3156 * allocator and map them into contiguous kernel virtual space. 3157 * 3158 * For tight control over page level allocator and protection flags 3159 * use __vmalloc() instead. 3160 * 3161 * Return: pointer to the allocated memory or %NULL on error 3162 */ 3163 void *vmalloc_node(unsigned long size, int node) 3164 { 3165 return __vmalloc_node(size, 1, GFP_KERNEL, node, 3166 __builtin_return_address(0)); 3167 } 3168 EXPORT_SYMBOL(vmalloc_node); 3169 3170 /** 3171 * vzalloc_node - allocate memory on a specific node with zero fill 3172 * @size: allocation size 3173 * @node: numa node 3174 * 3175 * Allocate enough pages to cover @size from the page level 3176 * allocator and map them into contiguous kernel virtual space. 3177 * The memory allocated is set to zero. 
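 *
 * For example, a node-local table might be allocated as (nentries and
 * table are hypothetical):
 *
 *	table = vzalloc_node(array_size(nentries, sizeof(*table)), node);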
3178 * 3179 * Return: pointer to the allocated memory or %NULL on error 3180 */ 3181 void *vzalloc_node(unsigned long size, int node) 3182 { 3183 return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, node, 3184 __builtin_return_address(0)); 3185 } 3186 EXPORT_SYMBOL(vzalloc_node); 3187 3188 #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) 3189 #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL) 3190 #elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA) 3191 #define GFP_VMALLOC32 (GFP_DMA | GFP_KERNEL) 3192 #else 3193 /* 3194 * 64b systems should always have either DMA or DMA32 zones. For others 3195 * GFP_DMA32 should do the right thing and use the normal zone. 3196 */ 3197 #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL) 3198 #endif 3199 3200 /** 3201 * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) 3202 * @size: allocation size 3203 * 3204 * Allocate enough 32bit PA addressable pages to cover @size from the 3205 * page level allocator and map them into contiguous kernel virtual space. 3206 * 3207 * Return: pointer to the allocated memory or %NULL on error 3208 */ 3209 void *vmalloc_32(unsigned long size) 3210 { 3211 return __vmalloc_node(size, 1, GFP_VMALLOC32, NUMA_NO_NODE, 3212 __builtin_return_address(0)); 3213 } 3214 EXPORT_SYMBOL(vmalloc_32); 3215 3216 /** 3217 * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory 3218 * @size: allocation size 3219 * 3220 * The resulting memory area is 32bit addressable and zeroed so it can be 3221 * mapped to userspace without leaking data. 3222 * 3223 * Return: pointer to the allocated memory or %NULL on error 3224 */ 3225 void *vmalloc_32_user(unsigned long size) 3226 { 3227 return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, 3228 GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, 3229 VM_USERMAP, NUMA_NO_NODE, 3230 __builtin_return_address(0)); 3231 } 3232 EXPORT_SYMBOL(vmalloc_32_user); 3233 3234 /* 3235 * small helper routine , copy contents to buf from addr. 3236 * If the page is not present, fill zero. 3237 */ 3238 3239 static int aligned_vread(char *buf, char *addr, unsigned long count) 3240 { 3241 struct page *p; 3242 int copied = 0; 3243 3244 while (count) { 3245 unsigned long offset, length; 3246 3247 offset = offset_in_page(addr); 3248 length = PAGE_SIZE - offset; 3249 if (length > count) 3250 length = count; 3251 p = vmalloc_to_page(addr); 3252 /* 3253 * To do safe access to this _mapped_ area, we need 3254 * lock. But adding lock here means that we need to add 3255 * overhead of vmalloc()/vfree() calls for this _debug_ 3256 * interface, rarely used. Instead of that, we'll use 3257 * kmap() and get small overhead in this access function. 3258 */ 3259 if (p) { 3260 /* We can expect USER0 is not used -- see vread() */ 3261 void *map = kmap_atomic(p); 3262 memcpy(buf, map + offset, length); 3263 kunmap_atomic(map); 3264 } else 3265 memset(buf, 0, length); 3266 3267 addr += length; 3268 buf += length; 3269 copied += length; 3270 count -= length; 3271 } 3272 return copied; 3273 } 3274 3275 /** 3276 * vread() - read vmalloc area in a safe way. 3277 * @buf: buffer for reading data 3278 * @addr: vm address. 3279 * @count: number of bytes to be read. 3280 * 3281 * This function checks that addr is a valid vmalloc'ed area, and 3282 * copy data from that area to a given buffer. If the given memory range 3283 * of [addr...addr+count) includes some valid address, data is copied to 3284 * proper area of @buf. If there are memory holes, they'll be zero-filled. 
3285 * IOREMAP area is treated as memory hole and no copy is done. 3286 * 3287 * If [addr...addr+count) doesn't includes any intersects with alive 3288 * vm_struct area, returns 0. @buf should be kernel's buffer. 3289 * 3290 * Note: In usual ops, vread() is never necessary because the caller 3291 * should know vmalloc() area is valid and can use memcpy(). 3292 * This is for routines which have to access vmalloc area without 3293 * any information, as /proc/kcore. 3294 * 3295 * Return: number of bytes for which addr and buf should be increased 3296 * (same number as @count) or %0 if [addr...addr+count) doesn't 3297 * include any intersection with valid vmalloc area 3298 */ 3299 long vread(char *buf, char *addr, unsigned long count) 3300 { 3301 struct vmap_area *va; 3302 struct vm_struct *vm; 3303 char *vaddr, *buf_start = buf; 3304 unsigned long buflen = count; 3305 unsigned long n; 3306 3307 /* Don't allow overflow */ 3308 if ((unsigned long) addr + count < count) 3309 count = -(unsigned long) addr; 3310 3311 spin_lock(&vmap_area_lock); 3312 va = find_vmap_area_exceed_addr((unsigned long)addr); 3313 if (!va) 3314 goto finished; 3315 3316 /* no intersects with alive vmap_area */ 3317 if ((unsigned long)addr + count <= va->va_start) 3318 goto finished; 3319 3320 list_for_each_entry_from(va, &vmap_area_list, list) { 3321 if (!count) 3322 break; 3323 3324 if (!va->vm) 3325 continue; 3326 3327 vm = va->vm; 3328 vaddr = (char *) vm->addr; 3329 if (addr >= vaddr + get_vm_area_size(vm)) 3330 continue; 3331 while (addr < vaddr) { 3332 if (count == 0) 3333 goto finished; 3334 *buf = '\0'; 3335 buf++; 3336 addr++; 3337 count--; 3338 } 3339 n = vaddr + get_vm_area_size(vm) - addr; 3340 if (n > count) 3341 n = count; 3342 if (!(vm->flags & VM_IOREMAP)) 3343 aligned_vread(buf, addr, n); 3344 else /* IOREMAP area is treated as memory hole */ 3345 memset(buf, 0, n); 3346 buf += n; 3347 addr += n; 3348 count -= n; 3349 } 3350 finished: 3351 spin_unlock(&vmap_area_lock); 3352 3353 if (buf == buf_start) 3354 return 0; 3355 /* zero-fill memory holes */ 3356 if (buf != buf_start + buflen) 3357 memset(buf, 0, buflen - (buf - buf_start)); 3358 3359 return buflen; 3360 } 3361 3362 /** 3363 * remap_vmalloc_range_partial - map vmalloc pages to userspace 3364 * @vma: vma to cover 3365 * @uaddr: target user address to start at 3366 * @kaddr: virtual address of vmalloc kernel memory 3367 * @pgoff: offset from @kaddr to start at 3368 * @size: size of map area 3369 * 3370 * Returns: 0 for success, -Exxx on failure 3371 * 3372 * This function checks that @kaddr is a valid vmalloc'ed area, 3373 * and that it is big enough to cover the range starting at 3374 * @uaddr in @vma. Will return failure if that criteria isn't 3375 * met. 
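 *
 * The kernel memory must have been set up with VM_USERMAP (e.g. via
 * vmalloc_user()) or VM_DMA_COHERENT. A hedged sketch of an ->mmap()
 * handler built on the non-partial wrapper below, assuming my_dev->vbuf is
 * a vmalloc_user() buffer large enough to back the whole VMA:
 *
 *	static int my_mmap(struct file *file, struct vm_area_struct *vma)
 *	{
 *		struct my_dev *my_dev = file->private_data;
 *
 *		return remap_vmalloc_range(vma, my_dev->vbuf, 0);
 *	}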
3376 * 3377 * Similar to remap_pfn_range() (see mm/memory.c) 3378 */ 3379 int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr, 3380 void *kaddr, unsigned long pgoff, 3381 unsigned long size) 3382 { 3383 struct vm_struct *area; 3384 unsigned long off; 3385 unsigned long end_index; 3386 3387 if (check_shl_overflow(pgoff, PAGE_SHIFT, &off)) 3388 return -EINVAL; 3389 3390 size = PAGE_ALIGN(size); 3391 3392 if (!PAGE_ALIGNED(uaddr) || !PAGE_ALIGNED(kaddr)) 3393 return -EINVAL; 3394 3395 area = find_vm_area(kaddr); 3396 if (!area) 3397 return -EINVAL; 3398 3399 if (!(area->flags & (VM_USERMAP | VM_DMA_COHERENT))) 3400 return -EINVAL; 3401 3402 if (check_add_overflow(size, off, &end_index) || 3403 end_index > get_vm_area_size(area)) 3404 return -EINVAL; 3405 kaddr += off; 3406 3407 do { 3408 struct page *page = vmalloc_to_page(kaddr); 3409 int ret; 3410 3411 ret = vm_insert_page(vma, uaddr, page); 3412 if (ret) 3413 return ret; 3414 3415 uaddr += PAGE_SIZE; 3416 kaddr += PAGE_SIZE; 3417 size -= PAGE_SIZE; 3418 } while (size > 0); 3419 3420 vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; 3421 3422 return 0; 3423 } 3424 3425 /** 3426 * remap_vmalloc_range - map vmalloc pages to userspace 3427 * @vma: vma to cover (map full range of vma) 3428 * @addr: vmalloc memory 3429 * @pgoff: number of pages into addr before first page to map 3430 * 3431 * Returns: 0 for success, -Exxx on failure 3432 * 3433 * This function checks that addr is a valid vmalloc'ed area, and 3434 * that it is big enough to cover the vma. Will return failure if 3435 * that criteria isn't met. 3436 * 3437 * Similar to remap_pfn_range() (see mm/memory.c) 3438 */ 3439 int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, 3440 unsigned long pgoff) 3441 { 3442 return remap_vmalloc_range_partial(vma, vma->vm_start, 3443 addr, pgoff, 3444 vma->vm_end - vma->vm_start); 3445 } 3446 EXPORT_SYMBOL(remap_vmalloc_range); 3447 3448 void free_vm_area(struct vm_struct *area) 3449 { 3450 struct vm_struct *ret; 3451 ret = remove_vm_area(area->addr); 3452 BUG_ON(ret != area); 3453 kfree(area); 3454 } 3455 EXPORT_SYMBOL_GPL(free_vm_area); 3456 3457 #ifdef CONFIG_SMP 3458 static struct vmap_area *node_to_va(struct rb_node *n) 3459 { 3460 return rb_entry_safe(n, struct vmap_area, rb_node); 3461 } 3462 3463 /** 3464 * pvm_find_va_enclose_addr - find the vmap_area @addr belongs to 3465 * @addr: target address 3466 * 3467 * Returns: vmap_area if it is found. If there is no such area 3468 * the first highest(reverse order) vmap_area is returned 3469 * i.e. va->va_start < addr && va->va_end < addr or NULL 3470 * if there are no any areas before @addr. 3471 */ 3472 static struct vmap_area * 3473 pvm_find_va_enclose_addr(unsigned long addr) 3474 { 3475 struct vmap_area *va, *tmp; 3476 struct rb_node *n; 3477 3478 n = free_vmap_area_root.rb_node; 3479 va = NULL; 3480 3481 while (n) { 3482 tmp = rb_entry(n, struct vmap_area, rb_node); 3483 if (tmp->va_start <= addr) { 3484 va = tmp; 3485 if (tmp->va_end >= addr) 3486 break; 3487 3488 n = n->rb_right; 3489 } else { 3490 n = n->rb_left; 3491 } 3492 } 3493 3494 return va; 3495 } 3496 3497 /** 3498 * pvm_determine_end_from_reverse - find the highest aligned address 3499 * of free block below VMALLOC_END 3500 * @va: 3501 * in - the VA we start the search(reverse order); 3502 * out - the VA with the highest aligned end address. 
3503 * @align: alignment for required highest address 3504 * 3505 * Returns: determined end address within vmap_area 3506 */ 3507 static unsigned long 3508 pvm_determine_end_from_reverse(struct vmap_area **va, unsigned long align) 3509 { 3510 unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); 3511 unsigned long addr; 3512 3513 if (likely(*va)) { 3514 list_for_each_entry_from_reverse((*va), 3515 &free_vmap_area_list, list) { 3516 addr = min((*va)->va_end & ~(align - 1), vmalloc_end); 3517 if ((*va)->va_start < addr) 3518 return addr; 3519 } 3520 } 3521 3522 return 0; 3523 } 3524 3525 /** 3526 * pcpu_get_vm_areas - allocate vmalloc areas for percpu allocator 3527 * @offsets: array containing offset of each area 3528 * @sizes: array containing size of each area 3529 * @nr_vms: the number of areas to allocate 3530 * @align: alignment, all entries in @offsets and @sizes must be aligned to this 3531 * 3532 * Returns: kmalloc'd vm_struct pointer array pointing to allocated 3533 * vm_structs on success, %NULL on failure 3534 * 3535 * Percpu allocator wants to use congruent vm areas so that it can 3536 * maintain the offsets among percpu areas. This function allocates 3537 * congruent vmalloc areas for it with GFP_KERNEL. These areas tend to 3538 * be scattered pretty far, distance between two areas easily going up 3539 * to gigabytes. To avoid interacting with regular vmallocs, these 3540 * areas are allocated from top. 3541 * 3542 * Despite its complicated look, this allocator is rather simple. It 3543 * does everything top-down and scans free blocks from the end looking 3544 * for matching base. While scanning, if any of the areas do not fit the 3545 * base address is pulled down to fit the area. Scanning is repeated till 3546 * all the areas fit and then all necessary data structures are inserted 3547 * and the result is returned. 3548 */ 3549 struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, 3550 const size_t *sizes, int nr_vms, 3551 size_t align) 3552 { 3553 const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align); 3554 const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); 3555 struct vmap_area **vas, *va; 3556 struct vm_struct **vms; 3557 int area, area2, last_area, term_area; 3558 unsigned long base, start, size, end, last_end, orig_start, orig_end; 3559 bool purged = false; 3560 enum fit_type type; 3561 3562 /* verify parameters and allocate data structures */ 3563 BUG_ON(offset_in_page(align) || !is_power_of_2(align)); 3564 for (last_area = 0, area = 0; area < nr_vms; area++) { 3565 start = offsets[area]; 3566 end = start + sizes[area]; 3567 3568 /* is everything aligned properly? 
*/ 3569 BUG_ON(!IS_ALIGNED(offsets[area], align)); 3570 BUG_ON(!IS_ALIGNED(sizes[area], align)); 3571 3572 /* detect the area with the highest address */ 3573 if (start > offsets[last_area]) 3574 last_area = area; 3575 3576 for (area2 = area + 1; area2 < nr_vms; area2++) { 3577 unsigned long start2 = offsets[area2]; 3578 unsigned long end2 = start2 + sizes[area2]; 3579 3580 BUG_ON(start2 < end && start < end2); 3581 } 3582 } 3583 last_end = offsets[last_area] + sizes[last_area]; 3584 3585 if (vmalloc_end - vmalloc_start < last_end) { 3586 WARN_ON(true); 3587 return NULL; 3588 } 3589 3590 vms = kcalloc(nr_vms, sizeof(vms[0]), GFP_KERNEL); 3591 vas = kcalloc(nr_vms, sizeof(vas[0]), GFP_KERNEL); 3592 if (!vas || !vms) 3593 goto err_free2; 3594 3595 for (area = 0; area < nr_vms; area++) { 3596 vas[area] = kmem_cache_zalloc(vmap_area_cachep, GFP_KERNEL); 3597 vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL); 3598 if (!vas[area] || !vms[area]) 3599 goto err_free; 3600 } 3601 retry: 3602 spin_lock(&free_vmap_area_lock); 3603 3604 /* start scanning - we scan from the top, begin with the last area */ 3605 area = term_area = last_area; 3606 start = offsets[area]; 3607 end = start + sizes[area]; 3608 3609 va = pvm_find_va_enclose_addr(vmalloc_end); 3610 base = pvm_determine_end_from_reverse(&va, align) - end; 3611 3612 while (true) { 3613 /* 3614 * base might have underflowed, add last_end before 3615 * comparing. 3616 */ 3617 if (base + last_end < vmalloc_start + last_end) 3618 goto overflow; 3619 3620 /* 3621 * Fitting base has not been found. 3622 */ 3623 if (va == NULL) 3624 goto overflow; 3625 3626 /* 3627 * If required width exceeds current VA block, move 3628 * base downwards and then recheck. 3629 */ 3630 if (base + end > va->va_end) { 3631 base = pvm_determine_end_from_reverse(&va, align) - end; 3632 term_area = area; 3633 continue; 3634 } 3635 3636 /* 3637 * If this VA does not fit, move base downwards and recheck. 3638 */ 3639 if (base + start < va->va_start) { 3640 va = node_to_va(rb_prev(&va->rb_node)); 3641 base = pvm_determine_end_from_reverse(&va, align) - end; 3642 term_area = area; 3643 continue; 3644 } 3645 3646 /* 3647 * This area fits, move on to the previous one. If 3648 * the previous one is the terminal one, we're done. 3649 */ 3650 area = (area + nr_vms - 1) % nr_vms; 3651 if (area == term_area) 3652 break; 3653 3654 start = offsets[area]; 3655 end = start + sizes[area]; 3656 va = pvm_find_va_enclose_addr(base + end); 3657 } 3658 3659 /* we've found a fitting base, insert all va's */ 3660 for (area = 0; area < nr_vms; area++) { 3661 int ret; 3662 3663 start = base + offsets[area]; 3664 size = sizes[area]; 3665 3666 va = pvm_find_va_enclose_addr(start); 3667 if (WARN_ON_ONCE(va == NULL)) 3668 /* It is a BUG(), but trigger recovery instead. */ 3669 goto recovery; 3670 3671 type = classify_va_fit_type(va, start, size); 3672 if (WARN_ON_ONCE(type == NOTHING_FIT)) 3673 /* It is a BUG(), but trigger recovery instead. */ 3674 goto recovery; 3675 3676 ret = adjust_va_to_fit_type(va, start, size, type); 3677 if (unlikely(ret)) 3678 goto recovery; 3679 3680 /* Allocated area. 
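 * Record the carved-out range in the preallocated vmap_area; it is
 * inserted into the busy tree only after every requested area has been
 * fitted.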
*/ 3681 va = vas[area]; 3682 va->va_start = start; 3683 va->va_end = start + size; 3684 } 3685 3686 spin_unlock(&free_vmap_area_lock); 3687 3688 /* populate the kasan shadow space */ 3689 for (area = 0; area < nr_vms; area++) { 3690 if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area])) 3691 goto err_free_shadow; 3692 3693 kasan_unpoison_vmalloc((void *)vas[area]->va_start, 3694 sizes[area]); 3695 } 3696 3697 /* insert all vm's */ 3698 spin_lock(&vmap_area_lock); 3699 for (area = 0; area < nr_vms; area++) { 3700 insert_vmap_area(vas[area], &vmap_area_root, &vmap_area_list); 3701 3702 setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC, 3703 pcpu_get_vm_areas); 3704 } 3705 spin_unlock(&vmap_area_lock); 3706 3707 kfree(vas); 3708 return vms; 3709 3710 recovery: 3711 /* 3712 * Remove previously allocated areas. There is no 3713 * need in removing these areas from the busy tree, 3714 * because they are inserted only on the final step 3715 * and when pcpu_get_vm_areas() is success. 3716 */ 3717 while (area--) { 3718 orig_start = vas[area]->va_start; 3719 orig_end = vas[area]->va_end; 3720 va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root, 3721 &free_vmap_area_list); 3722 if (va) 3723 kasan_release_vmalloc(orig_start, orig_end, 3724 va->va_start, va->va_end); 3725 vas[area] = NULL; 3726 } 3727 3728 overflow: 3729 spin_unlock(&free_vmap_area_lock); 3730 if (!purged) { 3731 purge_vmap_area_lazy(); 3732 purged = true; 3733 3734 /* Before "retry", check if we recover. */ 3735 for (area = 0; area < nr_vms; area++) { 3736 if (vas[area]) 3737 continue; 3738 3739 vas[area] = kmem_cache_zalloc( 3740 vmap_area_cachep, GFP_KERNEL); 3741 if (!vas[area]) 3742 goto err_free; 3743 } 3744 3745 goto retry; 3746 } 3747 3748 err_free: 3749 for (area = 0; area < nr_vms; area++) { 3750 if (vas[area]) 3751 kmem_cache_free(vmap_area_cachep, vas[area]); 3752 3753 kfree(vms[area]); 3754 } 3755 err_free2: 3756 kfree(vas); 3757 kfree(vms); 3758 return NULL; 3759 3760 err_free_shadow: 3761 spin_lock(&free_vmap_area_lock); 3762 /* 3763 * We release all the vmalloc shadows, even the ones for regions that 3764 * hadn't been successfully added. This relies on kasan_release_vmalloc 3765 * being able to tolerate this case. 3766 */ 3767 for (area = 0; area < nr_vms; area++) { 3768 orig_start = vas[area]->va_start; 3769 orig_end = vas[area]->va_end; 3770 va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root, 3771 &free_vmap_area_list); 3772 if (va) 3773 kasan_release_vmalloc(orig_start, orig_end, 3774 va->va_start, va->va_end); 3775 vas[area] = NULL; 3776 kfree(vms[area]); 3777 } 3778 spin_unlock(&free_vmap_area_lock); 3779 kfree(vas); 3780 kfree(vms); 3781 return NULL; 3782 } 3783 3784 /** 3785 * pcpu_free_vm_areas - free vmalloc areas for percpu allocator 3786 * @vms: vm_struct pointer array returned by pcpu_get_vm_areas() 3787 * @nr_vms: the number of allocated areas 3788 * 3789 * Free vm_structs and the array allocated by pcpu_get_vm_areas(). 
3790 */ 3791 void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms) 3792 { 3793 int i; 3794 3795 for (i = 0; i < nr_vms; i++) 3796 free_vm_area(vms[i]); 3797 kfree(vms); 3798 } 3799 #endif /* CONFIG_SMP */ 3800 3801 #ifdef CONFIG_PRINTK 3802 bool vmalloc_dump_obj(void *object) 3803 { 3804 struct vm_struct *vm; 3805 void *objp = (void *)PAGE_ALIGN((unsigned long)object); 3806 3807 vm = find_vm_area(objp); 3808 if (!vm) 3809 return false; 3810 pr_cont(" %u-page vmalloc region starting at %#lx allocated at %pS\n", 3811 vm->nr_pages, (unsigned long)vm->addr, vm->caller); 3812 return true; 3813 } 3814 #endif 3815 3816 #ifdef CONFIG_PROC_FS 3817 static void *s_start(struct seq_file *m, loff_t *pos) 3818 __acquires(&vmap_purge_lock) 3819 __acquires(&vmap_area_lock) 3820 { 3821 mutex_lock(&vmap_purge_lock); 3822 spin_lock(&vmap_area_lock); 3823 3824 return seq_list_start(&vmap_area_list, *pos); 3825 } 3826 3827 static void *s_next(struct seq_file *m, void *p, loff_t *pos) 3828 { 3829 return seq_list_next(p, &vmap_area_list, pos); 3830 } 3831 3832 static void s_stop(struct seq_file *m, void *p) 3833 __releases(&vmap_area_lock) 3834 __releases(&vmap_purge_lock) 3835 { 3836 spin_unlock(&vmap_area_lock); 3837 mutex_unlock(&vmap_purge_lock); 3838 } 3839 3840 static void show_numa_info(struct seq_file *m, struct vm_struct *v) 3841 { 3842 if (IS_ENABLED(CONFIG_NUMA)) { 3843 unsigned int nr, *counters = m->private; 3844 3845 if (!counters) 3846 return; 3847 3848 if (v->flags & VM_UNINITIALIZED) 3849 return; 3850 /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */ 3851 smp_rmb(); 3852 3853 memset(counters, 0, nr_node_ids * sizeof(unsigned int)); 3854 3855 for (nr = 0; nr < v->nr_pages; nr++) 3856 counters[page_to_nid(v->pages[nr])]++; 3857 3858 for_each_node_state(nr, N_HIGH_MEMORY) 3859 if (counters[nr]) 3860 seq_printf(m, " N%u=%u", nr, counters[nr]); 3861 } 3862 } 3863 3864 static void show_purge_info(struct seq_file *m) 3865 { 3866 struct vmap_area *va; 3867 3868 spin_lock(&purge_vmap_area_lock); 3869 list_for_each_entry(va, &purge_vmap_area_list, list) { 3870 seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n", 3871 (void *)va->va_start, (void *)va->va_end, 3872 va->va_end - va->va_start); 3873 } 3874 spin_unlock(&purge_vmap_area_lock); 3875 } 3876 3877 static int s_show(struct seq_file *m, void *p) 3878 { 3879 struct vmap_area *va; 3880 struct vm_struct *v; 3881 3882 va = list_entry(p, struct vmap_area, list); 3883 3884 /* 3885 * s_show can encounter race with remove_vm_area, !vm on behalf 3886 * of vmap area is being tear down or vm_map_ram allocation. 
3887 */ 3888 if (!va->vm) { 3889 seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n", 3890 (void *)va->va_start, (void *)va->va_end, 3891 va->va_end - va->va_start); 3892 3893 return 0; 3894 } 3895 3896 v = va->vm; 3897 3898 seq_printf(m, "0x%pK-0x%pK %7ld", 3899 v->addr, v->addr + v->size, v->size); 3900 3901 if (v->caller) 3902 seq_printf(m, " %pS", v->caller); 3903 3904 if (v->nr_pages) 3905 seq_printf(m, " pages=%d", v->nr_pages); 3906 3907 if (v->phys_addr) 3908 seq_printf(m, " phys=%pa", &v->phys_addr); 3909 3910 if (v->flags & VM_IOREMAP) 3911 seq_puts(m, " ioremap"); 3912 3913 if (v->flags & VM_ALLOC) 3914 seq_puts(m, " vmalloc"); 3915 3916 if (v->flags & VM_MAP) 3917 seq_puts(m, " vmap"); 3918 3919 if (v->flags & VM_USERMAP) 3920 seq_puts(m, " user"); 3921 3922 if (v->flags & VM_DMA_COHERENT) 3923 seq_puts(m, " dma-coherent"); 3924 3925 if (is_vmalloc_addr(v->pages)) 3926 seq_puts(m, " vpages"); 3927 3928 show_numa_info(m, v); 3929 seq_putc(m, '\n'); 3930 3931 /* 3932 * As a final step, dump "unpurged" areas. 3933 */ 3934 if (list_is_last(&va->list, &vmap_area_list)) 3935 show_purge_info(m); 3936 3937 return 0; 3938 } 3939 3940 static const struct seq_operations vmalloc_op = { 3941 .start = s_start, 3942 .next = s_next, 3943 .stop = s_stop, 3944 .show = s_show, 3945 }; 3946 3947 static int __init proc_vmalloc_init(void) 3948 { 3949 if (IS_ENABLED(CONFIG_NUMA)) 3950 proc_create_seq_private("vmallocinfo", 0400, NULL, 3951 &vmalloc_op, 3952 nr_node_ids * sizeof(unsigned int), NULL); 3953 else 3954 proc_create_seq("vmallocinfo", 0400, NULL, &vmalloc_op); 3955 return 0; 3956 } 3957 module_init(proc_vmalloc_init); 3958 3959 #endif 3960