1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright (C) 1993 Linus Torvalds 4 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 5 * SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000 6 * Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002 7 * Numa awareness, Christoph Lameter, SGI, June 2005 8 * Improving global KVA allocator, Uladzislau Rezki, Sony, May 2019 9 */ 10 11 #include <linux/vmalloc.h> 12 #include <linux/mm.h> 13 #include <linux/module.h> 14 #include <linux/highmem.h> 15 #include <linux/sched/signal.h> 16 #include <linux/slab.h> 17 #include <linux/spinlock.h> 18 #include <linux/interrupt.h> 19 #include <linux/proc_fs.h> 20 #include <linux/seq_file.h> 21 #include <linux/set_memory.h> 22 #include <linux/debugobjects.h> 23 #include <linux/kallsyms.h> 24 #include <linux/list.h> 25 #include <linux/notifier.h> 26 #include <linux/rbtree.h> 27 #include <linux/xarray.h> 28 #include <linux/rcupdate.h> 29 #include <linux/pfn.h> 30 #include <linux/kmemleak.h> 31 #include <linux/atomic.h> 32 #include <linux/compiler.h> 33 #include <linux/llist.h> 34 #include <linux/bitops.h> 35 #include <linux/rbtree_augmented.h> 36 #include <linux/overflow.h> 37 38 #include <linux/uaccess.h> 39 #include <asm/tlbflush.h> 40 #include <asm/shmparam.h> 41 42 #include "internal.h" 43 #include "pgalloc-track.h" 44 45 bool is_vmalloc_addr(const void *x) 46 { 47 unsigned long addr = (unsigned long)x; 48 49 return addr >= VMALLOC_START && addr < VMALLOC_END; 50 } 51 EXPORT_SYMBOL(is_vmalloc_addr); 52 53 struct vfree_deferred { 54 struct llist_head list; 55 struct work_struct wq; 56 }; 57 static DEFINE_PER_CPU(struct vfree_deferred, vfree_deferred); 58 59 static void __vunmap(const void *, int); 60 61 static void free_work(struct work_struct *w) 62 { 63 struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq); 64 struct llist_node *t, *llnode; 65 66 llist_for_each_safe(llnode, t, llist_del_all(&p->list)) 67 __vunmap((void *)llnode, 1); 68 } 69 70 /*** Page table manipulation functions ***/ 71 72 static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, 73 pgtbl_mod_mask *mask) 74 { 75 pte_t *pte; 76 77 pte = pte_offset_kernel(pmd, addr); 78 do { 79 pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte); 80 WARN_ON(!pte_none(ptent) && !pte_present(ptent)); 81 } while (pte++, addr += PAGE_SIZE, addr != end); 82 *mask |= PGTBL_PTE_MODIFIED; 83 } 84 85 static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, 86 pgtbl_mod_mask *mask) 87 { 88 pmd_t *pmd; 89 unsigned long next; 90 int cleared; 91 92 pmd = pmd_offset(pud, addr); 93 do { 94 next = pmd_addr_end(addr, end); 95 96 cleared = pmd_clear_huge(pmd); 97 if (cleared || pmd_bad(*pmd)) 98 *mask |= PGTBL_PMD_MODIFIED; 99 100 if (cleared) 101 continue; 102 if (pmd_none_or_clear_bad(pmd)) 103 continue; 104 vunmap_pte_range(pmd, addr, next, mask); 105 106 cond_resched(); 107 } while (pmd++, addr = next, addr != end); 108 } 109 110 static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, 111 pgtbl_mod_mask *mask) 112 { 113 pud_t *pud; 114 unsigned long next; 115 int cleared; 116 117 pud = pud_offset(p4d, addr); 118 do { 119 next = pud_addr_end(addr, end); 120 121 cleared = pud_clear_huge(pud); 122 if (cleared || pud_bad(*pud)) 123 *mask |= PGTBL_PUD_MODIFIED; 124 125 if (cleared) 126 continue; 127 if (pud_none_or_clear_bad(pud)) 128 continue; 129 vunmap_pmd_range(pud, addr, next, mask); 130 } while (pud++, addr = next, addr != end); 131 } 132 133 static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, 134 pgtbl_mod_mask *mask) 135 { 136 p4d_t *p4d; 137 unsigned long next; 138 int cleared; 139 140 p4d = p4d_offset(pgd, addr); 141 do { 142 next = p4d_addr_end(addr, end); 143 144 cleared = p4d_clear_huge(p4d); 145 if (cleared || p4d_bad(*p4d)) 146 *mask |= PGTBL_P4D_MODIFIED; 147 148 if (cleared) 149 continue; 150 if (p4d_none_or_clear_bad(p4d)) 151 continue; 152 vunmap_pud_range(p4d, addr, next, mask); 153 } while (p4d++, addr = next, addr != end); 154 } 155 156 /** 157 * unmap_kernel_range_noflush - unmap kernel VM area 158 * @start: start of the VM area to unmap 159 * @size: size of the VM area to unmap 160 * 161 * Unmap PFN_UP(@size) pages at @addr. The VM area @addr and @size specify 162 * should have been allocated using get_vm_area() and its friends. 163 * 164 * NOTE: 165 * This function does NOT do any cache flushing. The caller is responsible 166 * for calling flush_cache_vunmap() on to-be-mapped areas before calling this 167 * function and flush_tlb_kernel_range() after. 168 */ 169 void unmap_kernel_range_noflush(unsigned long start, unsigned long size) 170 { 171 unsigned long end = start + size; 172 unsigned long next; 173 pgd_t *pgd; 174 unsigned long addr = start; 175 pgtbl_mod_mask mask = 0; 176 177 BUG_ON(addr >= end); 178 pgd = pgd_offset_k(addr); 179 do { 180 next = pgd_addr_end(addr, end); 181 if (pgd_bad(*pgd)) 182 mask |= PGTBL_PGD_MODIFIED; 183 if (pgd_none_or_clear_bad(pgd)) 184 continue; 185 vunmap_p4d_range(pgd, addr, next, &mask); 186 } while (pgd++, addr = next, addr != end); 187 188 if (mask & ARCH_PAGE_TABLE_SYNC_MASK) 189 arch_sync_kernel_mappings(start, end); 190 } 191 192 static int vmap_pte_range(pmd_t *pmd, unsigned long addr, 193 unsigned long end, pgprot_t prot, struct page **pages, int *nr, 194 pgtbl_mod_mask *mask) 195 { 196 pte_t *pte; 197 198 /* 199 * nr is a running index into the array which helps higher level 200 * callers keep track of where we're up to. 201 */ 202 203 pte = pte_alloc_kernel_track(pmd, addr, mask); 204 if (!pte) 205 return -ENOMEM; 206 do { 207 struct page *page = pages[*nr]; 208 209 if (WARN_ON(!pte_none(*pte))) 210 return -EBUSY; 211 if (WARN_ON(!page)) 212 return -ENOMEM; 213 set_pte_at(&init_mm, addr, pte, mk_pte(page, prot)); 214 (*nr)++; 215 } while (pte++, addr += PAGE_SIZE, addr != end); 216 *mask |= PGTBL_PTE_MODIFIED; 217 return 0; 218 } 219 220 static int vmap_pmd_range(pud_t *pud, unsigned long addr, 221 unsigned long end, pgprot_t prot, struct page **pages, int *nr, 222 pgtbl_mod_mask *mask) 223 { 224 pmd_t *pmd; 225 unsigned long next; 226 227 pmd = pmd_alloc_track(&init_mm, pud, addr, mask); 228 if (!pmd) 229 return -ENOMEM; 230 do { 231 next = pmd_addr_end(addr, end); 232 if (vmap_pte_range(pmd, addr, next, prot, pages, nr, mask)) 233 return -ENOMEM; 234 } while (pmd++, addr = next, addr != end); 235 return 0; 236 } 237 238 static int vmap_pud_range(p4d_t *p4d, unsigned long addr, 239 unsigned long end, pgprot_t prot, struct page **pages, int *nr, 240 pgtbl_mod_mask *mask) 241 { 242 pud_t *pud; 243 unsigned long next; 244 245 pud = pud_alloc_track(&init_mm, p4d, addr, mask); 246 if (!pud) 247 return -ENOMEM; 248 do { 249 next = pud_addr_end(addr, end); 250 if (vmap_pmd_range(pud, addr, next, prot, pages, nr, mask)) 251 return -ENOMEM; 252 } while (pud++, addr = next, addr != end); 253 return 0; 254 } 255 256 static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, 257 unsigned long end, pgprot_t prot, struct page **pages, int *nr, 258 pgtbl_mod_mask *mask) 259 { 260 p4d_t *p4d; 261 unsigned long next; 262 263 p4d = p4d_alloc_track(&init_mm, pgd, addr, mask); 264 if (!p4d) 265 return -ENOMEM; 266 do { 267 next = p4d_addr_end(addr, end); 268 if (vmap_pud_range(p4d, addr, next, prot, pages, nr, mask)) 269 return -ENOMEM; 270 } while (p4d++, addr = next, addr != end); 271 return 0; 272 } 273 274 /** 275 * map_kernel_range_noflush - map kernel VM area with the specified pages 276 * @addr: start of the VM area to map 277 * @size: size of the VM area to map 278 * @prot: page protection flags to use 279 * @pages: pages to map 280 * 281 * Map PFN_UP(@size) pages at @addr. The VM area @addr and @size specify should 282 * have been allocated using get_vm_area() and its friends. 283 * 284 * NOTE: 285 * This function does NOT do any cache flushing. The caller is responsible for 286 * calling flush_cache_vmap() on to-be-mapped areas before calling this 287 * function. 288 * 289 * RETURNS: 290 * 0 on success, -errno on failure. 291 */ 292 int map_kernel_range_noflush(unsigned long addr, unsigned long size, 293 pgprot_t prot, struct page **pages) 294 { 295 unsigned long start = addr; 296 unsigned long end = addr + size; 297 unsigned long next; 298 pgd_t *pgd; 299 int err = 0; 300 int nr = 0; 301 pgtbl_mod_mask mask = 0; 302 303 BUG_ON(addr >= end); 304 pgd = pgd_offset_k(addr); 305 do { 306 next = pgd_addr_end(addr, end); 307 if (pgd_bad(*pgd)) 308 mask |= PGTBL_PGD_MODIFIED; 309 err = vmap_p4d_range(pgd, addr, next, prot, pages, &nr, &mask); 310 if (err) 311 return err; 312 } while (pgd++, addr = next, addr != end); 313 314 if (mask & ARCH_PAGE_TABLE_SYNC_MASK) 315 arch_sync_kernel_mappings(start, end); 316 317 return 0; 318 } 319 320 int map_kernel_range(unsigned long start, unsigned long size, pgprot_t prot, 321 struct page **pages) 322 { 323 int ret; 324 325 ret = map_kernel_range_noflush(start, size, prot, pages); 326 flush_cache_vmap(start, start + size); 327 return ret; 328 } 329 330 int is_vmalloc_or_module_addr(const void *x) 331 { 332 /* 333 * ARM, x86-64 and sparc64 put modules in a special place, 334 * and fall back on vmalloc() if that fails. Others 335 * just put it in the vmalloc space. 336 */ 337 #if defined(CONFIG_MODULES) && defined(MODULES_VADDR) 338 unsigned long addr = (unsigned long)x; 339 if (addr >= MODULES_VADDR && addr < MODULES_END) 340 return 1; 341 #endif 342 return is_vmalloc_addr(x); 343 } 344 345 /* 346 * Walk a vmap address to the struct page it maps. 347 */ 348 struct page *vmalloc_to_page(const void *vmalloc_addr) 349 { 350 unsigned long addr = (unsigned long) vmalloc_addr; 351 struct page *page = NULL; 352 pgd_t *pgd = pgd_offset_k(addr); 353 p4d_t *p4d; 354 pud_t *pud; 355 pmd_t *pmd; 356 pte_t *ptep, pte; 357 358 /* 359 * XXX we might need to change this if we add VIRTUAL_BUG_ON for 360 * architectures that do not vmalloc module space 361 */ 362 VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr)); 363 364 if (pgd_none(*pgd)) 365 return NULL; 366 p4d = p4d_offset(pgd, addr); 367 if (p4d_none(*p4d)) 368 return NULL; 369 pud = pud_offset(p4d, addr); 370 371 /* 372 * Don't dereference bad PUD or PMD (below) entries. This will also 373 * identify huge mappings, which we may encounter on architectures 374 * that define CONFIG_HAVE_ARCH_HUGE_VMAP=y. Such regions will be 375 * identified as vmalloc addresses by is_vmalloc_addr(), but are 376 * not [unambiguously] associated with a struct page, so there is 377 * no correct value to return for them. 378 */ 379 WARN_ON_ONCE(pud_bad(*pud)); 380 if (pud_none(*pud) || pud_bad(*pud)) 381 return NULL; 382 pmd = pmd_offset(pud, addr); 383 WARN_ON_ONCE(pmd_bad(*pmd)); 384 if (pmd_none(*pmd) || pmd_bad(*pmd)) 385 return NULL; 386 387 ptep = pte_offset_map(pmd, addr); 388 pte = *ptep; 389 if (pte_present(pte)) 390 page = pte_page(pte); 391 pte_unmap(ptep); 392 return page; 393 } 394 EXPORT_SYMBOL(vmalloc_to_page); 395 396 /* 397 * Map a vmalloc()-space virtual address to the physical page frame number. 398 */ 399 unsigned long vmalloc_to_pfn(const void *vmalloc_addr) 400 { 401 return page_to_pfn(vmalloc_to_page(vmalloc_addr)); 402 } 403 EXPORT_SYMBOL(vmalloc_to_pfn); 404 405 406 /*** Global kva allocator ***/ 407 408 #define DEBUG_AUGMENT_PROPAGATE_CHECK 0 409 #define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0 410 411 412 static DEFINE_SPINLOCK(vmap_area_lock); 413 static DEFINE_SPINLOCK(free_vmap_area_lock); 414 /* Export for kexec only */ 415 LIST_HEAD(vmap_area_list); 416 static struct rb_root vmap_area_root = RB_ROOT; 417 static bool vmap_initialized __read_mostly; 418 419 static struct rb_root purge_vmap_area_root = RB_ROOT; 420 static LIST_HEAD(purge_vmap_area_list); 421 static DEFINE_SPINLOCK(purge_vmap_area_lock); 422 423 /* 424 * This kmem_cache is used for vmap_area objects. Instead of 425 * allocating from slab we reuse an object from this cache to 426 * make things faster. Especially in "no edge" splitting of 427 * free block. 428 */ 429 static struct kmem_cache *vmap_area_cachep; 430 431 /* 432 * This linked list is used in pair with free_vmap_area_root. 433 * It gives O(1) access to prev/next to perform fast coalescing. 434 */ 435 static LIST_HEAD(free_vmap_area_list); 436 437 /* 438 * This augment red-black tree represents the free vmap space. 439 * All vmap_area objects in this tree are sorted by va->va_start 440 * address. It is used for allocation and merging when a vmap 441 * object is released. 442 * 443 * Each vmap_area node contains a maximum available free block 444 * of its sub-tree, right or left. Therefore it is possible to 445 * find a lowest match of free area. 446 */ 447 static struct rb_root free_vmap_area_root = RB_ROOT; 448 449 /* 450 * Preload a CPU with one object for "no edge" split case. The 451 * aim is to get rid of allocations from the atomic context, thus 452 * to use more permissive allocation masks. 453 */ 454 static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node); 455 456 static __always_inline unsigned long 457 va_size(struct vmap_area *va) 458 { 459 return (va->va_end - va->va_start); 460 } 461 462 static __always_inline unsigned long 463 get_subtree_max_size(struct rb_node *node) 464 { 465 struct vmap_area *va; 466 467 va = rb_entry_safe(node, struct vmap_area, rb_node); 468 return va ? va->subtree_max_size : 0; 469 } 470 471 /* 472 * Gets called when remove the node and rotate. 473 */ 474 static __always_inline unsigned long 475 compute_subtree_max_size(struct vmap_area *va) 476 { 477 return max3(va_size(va), 478 get_subtree_max_size(va->rb_node.rb_left), 479 get_subtree_max_size(va->rb_node.rb_right)); 480 } 481 482 RB_DECLARE_CALLBACKS_MAX(static, free_vmap_area_rb_augment_cb, 483 struct vmap_area, rb_node, unsigned long, subtree_max_size, va_size) 484 485 static void purge_vmap_area_lazy(void); 486 static BLOCKING_NOTIFIER_HEAD(vmap_notify_list); 487 static unsigned long lazy_max_pages(void); 488 489 static atomic_long_t nr_vmalloc_pages; 490 491 unsigned long vmalloc_nr_pages(void) 492 { 493 return atomic_long_read(&nr_vmalloc_pages); 494 } 495 496 static struct vmap_area *__find_vmap_area(unsigned long addr) 497 { 498 struct rb_node *n = vmap_area_root.rb_node; 499 500 while (n) { 501 struct vmap_area *va; 502 503 va = rb_entry(n, struct vmap_area, rb_node); 504 if (addr < va->va_start) 505 n = n->rb_left; 506 else if (addr >= va->va_end) 507 n = n->rb_right; 508 else 509 return va; 510 } 511 512 return NULL; 513 } 514 515 /* 516 * This function returns back addresses of parent node 517 * and its left or right link for further processing. 518 * 519 * Otherwise NULL is returned. In that case all further 520 * steps regarding inserting of conflicting overlap range 521 * have to be declined and actually considered as a bug. 522 */ 523 static __always_inline struct rb_node ** 524 find_va_links(struct vmap_area *va, 525 struct rb_root *root, struct rb_node *from, 526 struct rb_node **parent) 527 { 528 struct vmap_area *tmp_va; 529 struct rb_node **link; 530 531 if (root) { 532 link = &root->rb_node; 533 if (unlikely(!*link)) { 534 *parent = NULL; 535 return link; 536 } 537 } else { 538 link = &from; 539 } 540 541 /* 542 * Go to the bottom of the tree. When we hit the last point 543 * we end up with parent rb_node and correct direction, i name 544 * it link, where the new va->rb_node will be attached to. 545 */ 546 do { 547 tmp_va = rb_entry(*link, struct vmap_area, rb_node); 548 549 /* 550 * During the traversal we also do some sanity check. 551 * Trigger the BUG() if there are sides(left/right) 552 * or full overlaps. 553 */ 554 if (va->va_start < tmp_va->va_end && 555 va->va_end <= tmp_va->va_start) 556 link = &(*link)->rb_left; 557 else if (va->va_end > tmp_va->va_start && 558 va->va_start >= tmp_va->va_end) 559 link = &(*link)->rb_right; 560 else { 561 WARN(1, "vmalloc bug: 0x%lx-0x%lx overlaps with 0x%lx-0x%lx\n", 562 va->va_start, va->va_end, tmp_va->va_start, tmp_va->va_end); 563 564 return NULL; 565 } 566 } while (*link); 567 568 *parent = &tmp_va->rb_node; 569 return link; 570 } 571 572 static __always_inline struct list_head * 573 get_va_next_sibling(struct rb_node *parent, struct rb_node **link) 574 { 575 struct list_head *list; 576 577 if (unlikely(!parent)) 578 /* 579 * The red-black tree where we try to find VA neighbors 580 * before merging or inserting is empty, i.e. it means 581 * there is no free vmap space. Normally it does not 582 * happen but we handle this case anyway. 583 */ 584 return NULL; 585 586 list = &rb_entry(parent, struct vmap_area, rb_node)->list; 587 return (&parent->rb_right == link ? list->next : list); 588 } 589 590 static __always_inline void 591 link_va(struct vmap_area *va, struct rb_root *root, 592 struct rb_node *parent, struct rb_node **link, struct list_head *head) 593 { 594 /* 595 * VA is still not in the list, but we can 596 * identify its future previous list_head node. 597 */ 598 if (likely(parent)) { 599 head = &rb_entry(parent, struct vmap_area, rb_node)->list; 600 if (&parent->rb_right != link) 601 head = head->prev; 602 } 603 604 /* Insert to the rb-tree */ 605 rb_link_node(&va->rb_node, parent, link); 606 if (root == &free_vmap_area_root) { 607 /* 608 * Some explanation here. Just perform simple insertion 609 * to the tree. We do not set va->subtree_max_size to 610 * its current size before calling rb_insert_augmented(). 611 * It is because of we populate the tree from the bottom 612 * to parent levels when the node _is_ in the tree. 613 * 614 * Therefore we set subtree_max_size to zero after insertion, 615 * to let __augment_tree_propagate_from() puts everything to 616 * the correct order later on. 617 */ 618 rb_insert_augmented(&va->rb_node, 619 root, &free_vmap_area_rb_augment_cb); 620 va->subtree_max_size = 0; 621 } else { 622 rb_insert_color(&va->rb_node, root); 623 } 624 625 /* Address-sort this list */ 626 list_add(&va->list, head); 627 } 628 629 static __always_inline void 630 unlink_va(struct vmap_area *va, struct rb_root *root) 631 { 632 if (WARN_ON(RB_EMPTY_NODE(&va->rb_node))) 633 return; 634 635 if (root == &free_vmap_area_root) 636 rb_erase_augmented(&va->rb_node, 637 root, &free_vmap_area_rb_augment_cb); 638 else 639 rb_erase(&va->rb_node, root); 640 641 list_del(&va->list); 642 RB_CLEAR_NODE(&va->rb_node); 643 } 644 645 #if DEBUG_AUGMENT_PROPAGATE_CHECK 646 static void 647 augment_tree_propagate_check(void) 648 { 649 struct vmap_area *va; 650 unsigned long computed_size; 651 652 list_for_each_entry(va, &free_vmap_area_list, list) { 653 computed_size = compute_subtree_max_size(va); 654 if (computed_size != va->subtree_max_size) 655 pr_emerg("tree is corrupted: %lu, %lu\n", 656 va_size(va), va->subtree_max_size); 657 } 658 } 659 #endif 660 661 /* 662 * This function populates subtree_max_size from bottom to upper 663 * levels starting from VA point. The propagation must be done 664 * when VA size is modified by changing its va_start/va_end. Or 665 * in case of newly inserting of VA to the tree. 666 * 667 * It means that __augment_tree_propagate_from() must be called: 668 * - After VA has been inserted to the tree(free path); 669 * - After VA has been shrunk(allocation path); 670 * - After VA has been increased(merging path). 671 * 672 * Please note that, it does not mean that upper parent nodes 673 * and their subtree_max_size are recalculated all the time up 674 * to the root node. 675 * 676 * 4--8 677 * /\ 678 * / \ 679 * / \ 680 * 2--2 8--8 681 * 682 * For example if we modify the node 4, shrinking it to 2, then 683 * no any modification is required. If we shrink the node 2 to 1 684 * its subtree_max_size is updated only, and set to 1. If we shrink 685 * the node 8 to 6, then its subtree_max_size is set to 6 and parent 686 * node becomes 4--6. 687 */ 688 static __always_inline void 689 augment_tree_propagate_from(struct vmap_area *va) 690 { 691 /* 692 * Populate the tree from bottom towards the root until 693 * the calculated maximum available size of checked node 694 * is equal to its current one. 695 */ 696 free_vmap_area_rb_augment_cb_propagate(&va->rb_node, NULL); 697 698 #if DEBUG_AUGMENT_PROPAGATE_CHECK 699 augment_tree_propagate_check(); 700 #endif 701 } 702 703 static void 704 insert_vmap_area(struct vmap_area *va, 705 struct rb_root *root, struct list_head *head) 706 { 707 struct rb_node **link; 708 struct rb_node *parent; 709 710 link = find_va_links(va, root, NULL, &parent); 711 if (link) 712 link_va(va, root, parent, link, head); 713 } 714 715 static void 716 insert_vmap_area_augment(struct vmap_area *va, 717 struct rb_node *from, struct rb_root *root, 718 struct list_head *head) 719 { 720 struct rb_node **link; 721 struct rb_node *parent; 722 723 if (from) 724 link = find_va_links(va, NULL, from, &parent); 725 else 726 link = find_va_links(va, root, NULL, &parent); 727 728 if (link) { 729 link_va(va, root, parent, link, head); 730 augment_tree_propagate_from(va); 731 } 732 } 733 734 /* 735 * Merge de-allocated chunk of VA memory with previous 736 * and next free blocks. If coalesce is not done a new 737 * free area is inserted. If VA has been merged, it is 738 * freed. 739 * 740 * Please note, it can return NULL in case of overlap 741 * ranges, followed by WARN() report. Despite it is a 742 * buggy behaviour, a system can be alive and keep 743 * ongoing. 744 */ 745 static __always_inline struct vmap_area * 746 merge_or_add_vmap_area(struct vmap_area *va, 747 struct rb_root *root, struct list_head *head) 748 { 749 struct vmap_area *sibling; 750 struct list_head *next; 751 struct rb_node **link; 752 struct rb_node *parent; 753 bool merged = false; 754 755 /* 756 * Find a place in the tree where VA potentially will be 757 * inserted, unless it is merged with its sibling/siblings. 758 */ 759 link = find_va_links(va, root, NULL, &parent); 760 if (!link) 761 return NULL; 762 763 /* 764 * Get next node of VA to check if merging can be done. 765 */ 766 next = get_va_next_sibling(parent, link); 767 if (unlikely(next == NULL)) 768 goto insert; 769 770 /* 771 * start end 772 * | | 773 * |<------VA------>|<-----Next----->| 774 * | | 775 * start end 776 */ 777 if (next != head) { 778 sibling = list_entry(next, struct vmap_area, list); 779 if (sibling->va_start == va->va_end) { 780 sibling->va_start = va->va_start; 781 782 /* Free vmap_area object. */ 783 kmem_cache_free(vmap_area_cachep, va); 784 785 /* Point to the new merged area. */ 786 va = sibling; 787 merged = true; 788 } 789 } 790 791 /* 792 * start end 793 * | | 794 * |<-----Prev----->|<------VA------>| 795 * | | 796 * start end 797 */ 798 if (next->prev != head) { 799 sibling = list_entry(next->prev, struct vmap_area, list); 800 if (sibling->va_end == va->va_start) { 801 /* 802 * If both neighbors are coalesced, it is important 803 * to unlink the "next" node first, followed by merging 804 * with "previous" one. Otherwise the tree might not be 805 * fully populated if a sibling's augmented value is 806 * "normalized" because of rotation operations. 807 */ 808 if (merged) 809 unlink_va(va, root); 810 811 sibling->va_end = va->va_end; 812 813 /* Free vmap_area object. */ 814 kmem_cache_free(vmap_area_cachep, va); 815 816 /* Point to the new merged area. */ 817 va = sibling; 818 merged = true; 819 } 820 } 821 822 insert: 823 if (!merged) 824 link_va(va, root, parent, link, head); 825 826 return va; 827 } 828 829 static __always_inline struct vmap_area * 830 merge_or_add_vmap_area_augment(struct vmap_area *va, 831 struct rb_root *root, struct list_head *head) 832 { 833 va = merge_or_add_vmap_area(va, root, head); 834 if (va) 835 augment_tree_propagate_from(va); 836 837 return va; 838 } 839 840 static __always_inline bool 841 is_within_this_va(struct vmap_area *va, unsigned long size, 842 unsigned long align, unsigned long vstart) 843 { 844 unsigned long nva_start_addr; 845 846 if (va->va_start > vstart) 847 nva_start_addr = ALIGN(va->va_start, align); 848 else 849 nva_start_addr = ALIGN(vstart, align); 850 851 /* Can be overflowed due to big size or alignment. */ 852 if (nva_start_addr + size < nva_start_addr || 853 nva_start_addr < vstart) 854 return false; 855 856 return (nva_start_addr + size <= va->va_end); 857 } 858 859 /* 860 * Find the first free block(lowest start address) in the tree, 861 * that will accomplish the request corresponding to passing 862 * parameters. 863 */ 864 static __always_inline struct vmap_area * 865 find_vmap_lowest_match(unsigned long size, 866 unsigned long align, unsigned long vstart) 867 { 868 struct vmap_area *va; 869 struct rb_node *node; 870 unsigned long length; 871 872 /* Start from the root. */ 873 node = free_vmap_area_root.rb_node; 874 875 /* Adjust the search size for alignment overhead. */ 876 length = size + align - 1; 877 878 while (node) { 879 va = rb_entry(node, struct vmap_area, rb_node); 880 881 if (get_subtree_max_size(node->rb_left) >= length && 882 vstart < va->va_start) { 883 node = node->rb_left; 884 } else { 885 if (is_within_this_va(va, size, align, vstart)) 886 return va; 887 888 /* 889 * Does not make sense to go deeper towards the right 890 * sub-tree if it does not have a free block that is 891 * equal or bigger to the requested search length. 892 */ 893 if (get_subtree_max_size(node->rb_right) >= length) { 894 node = node->rb_right; 895 continue; 896 } 897 898 /* 899 * OK. We roll back and find the first right sub-tree, 900 * that will satisfy the search criteria. It can happen 901 * only once due to "vstart" restriction. 902 */ 903 while ((node = rb_parent(node))) { 904 va = rb_entry(node, struct vmap_area, rb_node); 905 if (is_within_this_va(va, size, align, vstart)) 906 return va; 907 908 if (get_subtree_max_size(node->rb_right) >= length && 909 vstart <= va->va_start) { 910 node = node->rb_right; 911 break; 912 } 913 } 914 } 915 } 916 917 return NULL; 918 } 919 920 #if DEBUG_AUGMENT_LOWEST_MATCH_CHECK 921 #include <linux/random.h> 922 923 static struct vmap_area * 924 find_vmap_lowest_linear_match(unsigned long size, 925 unsigned long align, unsigned long vstart) 926 { 927 struct vmap_area *va; 928 929 list_for_each_entry(va, &free_vmap_area_list, list) { 930 if (!is_within_this_va(va, size, align, vstart)) 931 continue; 932 933 return va; 934 } 935 936 return NULL; 937 } 938 939 static void 940 find_vmap_lowest_match_check(unsigned long size) 941 { 942 struct vmap_area *va_1, *va_2; 943 unsigned long vstart; 944 unsigned int rnd; 945 946 get_random_bytes(&rnd, sizeof(rnd)); 947 vstart = VMALLOC_START + rnd; 948 949 va_1 = find_vmap_lowest_match(size, 1, vstart); 950 va_2 = find_vmap_lowest_linear_match(size, 1, vstart); 951 952 if (va_1 != va_2) 953 pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n", 954 va_1, va_2, vstart); 955 } 956 #endif 957 958 enum fit_type { 959 NOTHING_FIT = 0, 960 FL_FIT_TYPE = 1, /* full fit */ 961 LE_FIT_TYPE = 2, /* left edge fit */ 962 RE_FIT_TYPE = 3, /* right edge fit */ 963 NE_FIT_TYPE = 4 /* no edge fit */ 964 }; 965 966 static __always_inline enum fit_type 967 classify_va_fit_type(struct vmap_area *va, 968 unsigned long nva_start_addr, unsigned long size) 969 { 970 enum fit_type type; 971 972 /* Check if it is within VA. */ 973 if (nva_start_addr < va->va_start || 974 nva_start_addr + size > va->va_end) 975 return NOTHING_FIT; 976 977 /* Now classify. */ 978 if (va->va_start == nva_start_addr) { 979 if (va->va_end == nva_start_addr + size) 980 type = FL_FIT_TYPE; 981 else 982 type = LE_FIT_TYPE; 983 } else if (va->va_end == nva_start_addr + size) { 984 type = RE_FIT_TYPE; 985 } else { 986 type = NE_FIT_TYPE; 987 } 988 989 return type; 990 } 991 992 static __always_inline int 993 adjust_va_to_fit_type(struct vmap_area *va, 994 unsigned long nva_start_addr, unsigned long size, 995 enum fit_type type) 996 { 997 struct vmap_area *lva = NULL; 998 999 if (type == FL_FIT_TYPE) { 1000 /* 1001 * No need to split VA, it fully fits. 1002 * 1003 * | | 1004 * V NVA V 1005 * |---------------| 1006 */ 1007 unlink_va(va, &free_vmap_area_root); 1008 kmem_cache_free(vmap_area_cachep, va); 1009 } else if (type == LE_FIT_TYPE) { 1010 /* 1011 * Split left edge of fit VA. 1012 * 1013 * | | 1014 * V NVA V R 1015 * |-------|-------| 1016 */ 1017 va->va_start += size; 1018 } else if (type == RE_FIT_TYPE) { 1019 /* 1020 * Split right edge of fit VA. 1021 * 1022 * | | 1023 * L V NVA V 1024 * |-------|-------| 1025 */ 1026 va->va_end = nva_start_addr; 1027 } else if (type == NE_FIT_TYPE) { 1028 /* 1029 * Split no edge of fit VA. 1030 * 1031 * | | 1032 * L V NVA V R 1033 * |---|-------|---| 1034 */ 1035 lva = __this_cpu_xchg(ne_fit_preload_node, NULL); 1036 if (unlikely(!lva)) { 1037 /* 1038 * For percpu allocator we do not do any pre-allocation 1039 * and leave it as it is. The reason is it most likely 1040 * never ends up with NE_FIT_TYPE splitting. In case of 1041 * percpu allocations offsets and sizes are aligned to 1042 * fixed align request, i.e. RE_FIT_TYPE and FL_FIT_TYPE 1043 * are its main fitting cases. 1044 * 1045 * There are a few exceptions though, as an example it is 1046 * a first allocation (early boot up) when we have "one" 1047 * big free space that has to be split. 1048 * 1049 * Also we can hit this path in case of regular "vmap" 1050 * allocations, if "this" current CPU was not preloaded. 1051 * See the comment in alloc_vmap_area() why. If so, then 1052 * GFP_NOWAIT is used instead to get an extra object for 1053 * split purpose. That is rare and most time does not 1054 * occur. 1055 * 1056 * What happens if an allocation gets failed. Basically, 1057 * an "overflow" path is triggered to purge lazily freed 1058 * areas to free some memory, then, the "retry" path is 1059 * triggered to repeat one more time. See more details 1060 * in alloc_vmap_area() function. 1061 */ 1062 lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT); 1063 if (!lva) 1064 return -1; 1065 } 1066 1067 /* 1068 * Build the remainder. 1069 */ 1070 lva->va_start = va->va_start; 1071 lva->va_end = nva_start_addr; 1072 1073 /* 1074 * Shrink this VA to remaining size. 1075 */ 1076 va->va_start = nva_start_addr + size; 1077 } else { 1078 return -1; 1079 } 1080 1081 if (type != FL_FIT_TYPE) { 1082 augment_tree_propagate_from(va); 1083 1084 if (lva) /* type == NE_FIT_TYPE */ 1085 insert_vmap_area_augment(lva, &va->rb_node, 1086 &free_vmap_area_root, &free_vmap_area_list); 1087 } 1088 1089 return 0; 1090 } 1091 1092 /* 1093 * Returns a start address of the newly allocated area, if success. 1094 * Otherwise a vend is returned that indicates failure. 1095 */ 1096 static __always_inline unsigned long 1097 __alloc_vmap_area(unsigned long size, unsigned long align, 1098 unsigned long vstart, unsigned long vend) 1099 { 1100 unsigned long nva_start_addr; 1101 struct vmap_area *va; 1102 enum fit_type type; 1103 int ret; 1104 1105 va = find_vmap_lowest_match(size, align, vstart); 1106 if (unlikely(!va)) 1107 return vend; 1108 1109 if (va->va_start > vstart) 1110 nva_start_addr = ALIGN(va->va_start, align); 1111 else 1112 nva_start_addr = ALIGN(vstart, align); 1113 1114 /* Check the "vend" restriction. */ 1115 if (nva_start_addr + size > vend) 1116 return vend; 1117 1118 /* Classify what we have found. */ 1119 type = classify_va_fit_type(va, nva_start_addr, size); 1120 if (WARN_ON_ONCE(type == NOTHING_FIT)) 1121 return vend; 1122 1123 /* Update the free vmap_area. */ 1124 ret = adjust_va_to_fit_type(va, nva_start_addr, size, type); 1125 if (ret) 1126 return vend; 1127 1128 #if DEBUG_AUGMENT_LOWEST_MATCH_CHECK 1129 find_vmap_lowest_match_check(size); 1130 #endif 1131 1132 return nva_start_addr; 1133 } 1134 1135 /* 1136 * Free a region of KVA allocated by alloc_vmap_area 1137 */ 1138 static void free_vmap_area(struct vmap_area *va) 1139 { 1140 /* 1141 * Remove from the busy tree/list. 1142 */ 1143 spin_lock(&vmap_area_lock); 1144 unlink_va(va, &vmap_area_root); 1145 spin_unlock(&vmap_area_lock); 1146 1147 /* 1148 * Insert/Merge it back to the free tree/list. 1149 */ 1150 spin_lock(&free_vmap_area_lock); 1151 merge_or_add_vmap_area_augment(va, &free_vmap_area_root, &free_vmap_area_list); 1152 spin_unlock(&free_vmap_area_lock); 1153 } 1154 1155 /* 1156 * Allocate a region of KVA of the specified size and alignment, within the 1157 * vstart and vend. 1158 */ 1159 static struct vmap_area *alloc_vmap_area(unsigned long size, 1160 unsigned long align, 1161 unsigned long vstart, unsigned long vend, 1162 int node, gfp_t gfp_mask) 1163 { 1164 struct vmap_area *va, *pva; 1165 unsigned long addr; 1166 int purged = 0; 1167 int ret; 1168 1169 BUG_ON(!size); 1170 BUG_ON(offset_in_page(size)); 1171 BUG_ON(!is_power_of_2(align)); 1172 1173 if (unlikely(!vmap_initialized)) 1174 return ERR_PTR(-EBUSY); 1175 1176 might_sleep(); 1177 gfp_mask = gfp_mask & GFP_RECLAIM_MASK; 1178 1179 va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node); 1180 if (unlikely(!va)) 1181 return ERR_PTR(-ENOMEM); 1182 1183 /* 1184 * Only scan the relevant parts containing pointers to other objects 1185 * to avoid false negatives. 1186 */ 1187 kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask); 1188 1189 retry: 1190 /* 1191 * Preload this CPU with one extra vmap_area object. It is used 1192 * when fit type of free area is NE_FIT_TYPE. Please note, it 1193 * does not guarantee that an allocation occurs on a CPU that 1194 * is preloaded, instead we minimize the case when it is not. 1195 * It can happen because of cpu migration, because there is a 1196 * race until the below spinlock is taken. 1197 * 1198 * The preload is done in non-atomic context, thus it allows us 1199 * to use more permissive allocation masks to be more stable under 1200 * low memory condition and high memory pressure. In rare case, 1201 * if not preloaded, GFP_NOWAIT is used. 1202 * 1203 * Set "pva" to NULL here, because of "retry" path. 1204 */ 1205 pva = NULL; 1206 1207 if (!this_cpu_read(ne_fit_preload_node)) 1208 /* 1209 * Even if it fails we do not really care about that. 1210 * Just proceed as it is. If needed "overflow" path 1211 * will refill the cache we allocate from. 1212 */ 1213 pva = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node); 1214 1215 spin_lock(&free_vmap_area_lock); 1216 1217 if (pva && __this_cpu_cmpxchg(ne_fit_preload_node, NULL, pva)) 1218 kmem_cache_free(vmap_area_cachep, pva); 1219 1220 /* 1221 * If an allocation fails, the "vend" address is 1222 * returned. Therefore trigger the overflow path. 1223 */ 1224 addr = __alloc_vmap_area(size, align, vstart, vend); 1225 spin_unlock(&free_vmap_area_lock); 1226 1227 if (unlikely(addr == vend)) 1228 goto overflow; 1229 1230 va->va_start = addr; 1231 va->va_end = addr + size; 1232 va->vm = NULL; 1233 1234 1235 spin_lock(&vmap_area_lock); 1236 insert_vmap_area(va, &vmap_area_root, &vmap_area_list); 1237 spin_unlock(&vmap_area_lock); 1238 1239 BUG_ON(!IS_ALIGNED(va->va_start, align)); 1240 BUG_ON(va->va_start < vstart); 1241 BUG_ON(va->va_end > vend); 1242 1243 ret = kasan_populate_vmalloc(addr, size); 1244 if (ret) { 1245 free_vmap_area(va); 1246 return ERR_PTR(ret); 1247 } 1248 1249 return va; 1250 1251 overflow: 1252 if (!purged) { 1253 purge_vmap_area_lazy(); 1254 purged = 1; 1255 goto retry; 1256 } 1257 1258 if (gfpflags_allow_blocking(gfp_mask)) { 1259 unsigned long freed = 0; 1260 blocking_notifier_call_chain(&vmap_notify_list, 0, &freed); 1261 if (freed > 0) { 1262 purged = 0; 1263 goto retry; 1264 } 1265 } 1266 1267 if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) 1268 pr_warn("vmap allocation for size %lu failed: use vmalloc=<size> to increase size\n", 1269 size); 1270 1271 kmem_cache_free(vmap_area_cachep, va); 1272 return ERR_PTR(-EBUSY); 1273 } 1274 1275 int register_vmap_purge_notifier(struct notifier_block *nb) 1276 { 1277 return blocking_notifier_chain_register(&vmap_notify_list, nb); 1278 } 1279 EXPORT_SYMBOL_GPL(register_vmap_purge_notifier); 1280 1281 int unregister_vmap_purge_notifier(struct notifier_block *nb) 1282 { 1283 return blocking_notifier_chain_unregister(&vmap_notify_list, nb); 1284 } 1285 EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier); 1286 1287 /* 1288 * lazy_max_pages is the maximum amount of virtual address space we gather up 1289 * before attempting to purge with a TLB flush. 1290 * 1291 * There is a tradeoff here: a larger number will cover more kernel page tables 1292 * and take slightly longer to purge, but it will linearly reduce the number of 1293 * global TLB flushes that must be performed. It would seem natural to scale 1294 * this number up linearly with the number of CPUs (because vmapping activity 1295 * could also scale linearly with the number of CPUs), however it is likely 1296 * that in practice, workloads might be constrained in other ways that mean 1297 * vmap activity will not scale linearly with CPUs. Also, I want to be 1298 * conservative and not introduce a big latency on huge systems, so go with 1299 * a less aggressive log scale. It will still be an improvement over the old 1300 * code, and it will be simple to change the scale factor if we find that it 1301 * becomes a problem on bigger systems. 1302 */ 1303 static unsigned long lazy_max_pages(void) 1304 { 1305 unsigned int log; 1306 1307 log = fls(num_online_cpus()); 1308 1309 return log * (32UL * 1024 * 1024 / PAGE_SIZE); 1310 } 1311 1312 static atomic_long_t vmap_lazy_nr = ATOMIC_LONG_INIT(0); 1313 1314 /* 1315 * Serialize vmap purging. There is no actual criticial section protected 1316 * by this look, but we want to avoid concurrent calls for performance 1317 * reasons and to make the pcpu_get_vm_areas more deterministic. 1318 */ 1319 static DEFINE_MUTEX(vmap_purge_lock); 1320 1321 /* for per-CPU blocks */ 1322 static void purge_fragmented_blocks_allcpus(void); 1323 1324 /* 1325 * called before a call to iounmap() if the caller wants vm_area_struct's 1326 * immediately freed. 1327 */ 1328 void set_iounmap_nonlazy(void) 1329 { 1330 atomic_long_set(&vmap_lazy_nr, lazy_max_pages()+1); 1331 } 1332 1333 /* 1334 * Purges all lazily-freed vmap areas. 1335 */ 1336 static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) 1337 { 1338 unsigned long resched_threshold; 1339 struct list_head local_pure_list; 1340 struct vmap_area *va, *n_va; 1341 1342 lockdep_assert_held(&vmap_purge_lock); 1343 1344 spin_lock(&purge_vmap_area_lock); 1345 purge_vmap_area_root = RB_ROOT; 1346 list_replace_init(&purge_vmap_area_list, &local_pure_list); 1347 spin_unlock(&purge_vmap_area_lock); 1348 1349 if (unlikely(list_empty(&local_pure_list))) 1350 return false; 1351 1352 start = min(start, 1353 list_first_entry(&local_pure_list, 1354 struct vmap_area, list)->va_start); 1355 1356 end = max(end, 1357 list_last_entry(&local_pure_list, 1358 struct vmap_area, list)->va_end); 1359 1360 flush_tlb_kernel_range(start, end); 1361 resched_threshold = lazy_max_pages() << 1; 1362 1363 spin_lock(&free_vmap_area_lock); 1364 list_for_each_entry_safe(va, n_va, &local_pure_list, list) { 1365 unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT; 1366 unsigned long orig_start = va->va_start; 1367 unsigned long orig_end = va->va_end; 1368 1369 /* 1370 * Finally insert or merge lazily-freed area. It is 1371 * detached and there is no need to "unlink" it from 1372 * anything. 1373 */ 1374 va = merge_or_add_vmap_area_augment(va, &free_vmap_area_root, 1375 &free_vmap_area_list); 1376 1377 if (!va) 1378 continue; 1379 1380 if (is_vmalloc_or_module_addr((void *)orig_start)) 1381 kasan_release_vmalloc(orig_start, orig_end, 1382 va->va_start, va->va_end); 1383 1384 atomic_long_sub(nr, &vmap_lazy_nr); 1385 1386 if (atomic_long_read(&vmap_lazy_nr) < resched_threshold) 1387 cond_resched_lock(&free_vmap_area_lock); 1388 } 1389 spin_unlock(&free_vmap_area_lock); 1390 return true; 1391 } 1392 1393 /* 1394 * Kick off a purge of the outstanding lazy areas. Don't bother if somebody 1395 * is already purging. 1396 */ 1397 static void try_purge_vmap_area_lazy(void) 1398 { 1399 if (mutex_trylock(&vmap_purge_lock)) { 1400 __purge_vmap_area_lazy(ULONG_MAX, 0); 1401 mutex_unlock(&vmap_purge_lock); 1402 } 1403 } 1404 1405 /* 1406 * Kick off a purge of the outstanding lazy areas. 1407 */ 1408 static void purge_vmap_area_lazy(void) 1409 { 1410 mutex_lock(&vmap_purge_lock); 1411 purge_fragmented_blocks_allcpus(); 1412 __purge_vmap_area_lazy(ULONG_MAX, 0); 1413 mutex_unlock(&vmap_purge_lock); 1414 } 1415 1416 /* 1417 * Free a vmap area, caller ensuring that the area has been unmapped 1418 * and flush_cache_vunmap had been called for the correct range 1419 * previously. 1420 */ 1421 static void free_vmap_area_noflush(struct vmap_area *va) 1422 { 1423 unsigned long nr_lazy; 1424 1425 spin_lock(&vmap_area_lock); 1426 unlink_va(va, &vmap_area_root); 1427 spin_unlock(&vmap_area_lock); 1428 1429 nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >> 1430 PAGE_SHIFT, &vmap_lazy_nr); 1431 1432 /* 1433 * Merge or place it to the purge tree/list. 1434 */ 1435 spin_lock(&purge_vmap_area_lock); 1436 merge_or_add_vmap_area(va, 1437 &purge_vmap_area_root, &purge_vmap_area_list); 1438 spin_unlock(&purge_vmap_area_lock); 1439 1440 /* After this point, we may free va at any time */ 1441 if (unlikely(nr_lazy > lazy_max_pages())) 1442 try_purge_vmap_area_lazy(); 1443 } 1444 1445 /* 1446 * Free and unmap a vmap area 1447 */ 1448 static void free_unmap_vmap_area(struct vmap_area *va) 1449 { 1450 flush_cache_vunmap(va->va_start, va->va_end); 1451 unmap_kernel_range_noflush(va->va_start, va->va_end - va->va_start); 1452 if (debug_pagealloc_enabled_static()) 1453 flush_tlb_kernel_range(va->va_start, va->va_end); 1454 1455 free_vmap_area_noflush(va); 1456 } 1457 1458 static struct vmap_area *find_vmap_area(unsigned long addr) 1459 { 1460 struct vmap_area *va; 1461 1462 spin_lock(&vmap_area_lock); 1463 va = __find_vmap_area(addr); 1464 spin_unlock(&vmap_area_lock); 1465 1466 return va; 1467 } 1468 1469 /*** Per cpu kva allocator ***/ 1470 1471 /* 1472 * vmap space is limited especially on 32 bit architectures. Ensure there is 1473 * room for at least 16 percpu vmap blocks per CPU. 1474 */ 1475 /* 1476 * If we had a constant VMALLOC_START and VMALLOC_END, we'd like to be able 1477 * to #define VMALLOC_SPACE (VMALLOC_END-VMALLOC_START). Guess 1478 * instead (we just need a rough idea) 1479 */ 1480 #if BITS_PER_LONG == 32 1481 #define VMALLOC_SPACE (128UL*1024*1024) 1482 #else 1483 #define VMALLOC_SPACE (128UL*1024*1024*1024) 1484 #endif 1485 1486 #define VMALLOC_PAGES (VMALLOC_SPACE / PAGE_SIZE) 1487 #define VMAP_MAX_ALLOC BITS_PER_LONG /* 256K with 4K pages */ 1488 #define VMAP_BBMAP_BITS_MAX 1024 /* 4MB with 4K pages */ 1489 #define VMAP_BBMAP_BITS_MIN (VMAP_MAX_ALLOC*2) 1490 #define VMAP_MIN(x, y) ((x) < (y) ? (x) : (y)) /* can't use min() */ 1491 #define VMAP_MAX(x, y) ((x) > (y) ? (x) : (y)) /* can't use max() */ 1492 #define VMAP_BBMAP_BITS \ 1493 VMAP_MIN(VMAP_BBMAP_BITS_MAX, \ 1494 VMAP_MAX(VMAP_BBMAP_BITS_MIN, \ 1495 VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16)) 1496 1497 #define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE) 1498 1499 struct vmap_block_queue { 1500 spinlock_t lock; 1501 struct list_head free; 1502 }; 1503 1504 struct vmap_block { 1505 spinlock_t lock; 1506 struct vmap_area *va; 1507 unsigned long free, dirty; 1508 unsigned long dirty_min, dirty_max; /*< dirty range */ 1509 struct list_head free_list; 1510 struct rcu_head rcu_head; 1511 struct list_head purge; 1512 }; 1513 1514 /* Queue of free and dirty vmap blocks, for allocation and flushing purposes */ 1515 static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue); 1516 1517 /* 1518 * XArray of vmap blocks, indexed by address, to quickly find a vmap block 1519 * in the free path. Could get rid of this if we change the API to return a 1520 * "cookie" from alloc, to be passed to free. But no big deal yet. 1521 */ 1522 static DEFINE_XARRAY(vmap_blocks); 1523 1524 /* 1525 * We should probably have a fallback mechanism to allocate virtual memory 1526 * out of partially filled vmap blocks. However vmap block sizing should be 1527 * fairly reasonable according to the vmalloc size, so it shouldn't be a 1528 * big problem. 1529 */ 1530 1531 static unsigned long addr_to_vb_idx(unsigned long addr) 1532 { 1533 addr -= VMALLOC_START & ~(VMAP_BLOCK_SIZE-1); 1534 addr /= VMAP_BLOCK_SIZE; 1535 return addr; 1536 } 1537 1538 static void *vmap_block_vaddr(unsigned long va_start, unsigned long pages_off) 1539 { 1540 unsigned long addr; 1541 1542 addr = va_start + (pages_off << PAGE_SHIFT); 1543 BUG_ON(addr_to_vb_idx(addr) != addr_to_vb_idx(va_start)); 1544 return (void *)addr; 1545 } 1546 1547 /** 1548 * new_vmap_block - allocates new vmap_block and occupies 2^order pages in this 1549 * block. Of course pages number can't exceed VMAP_BBMAP_BITS 1550 * @order: how many 2^order pages should be occupied in newly allocated block 1551 * @gfp_mask: flags for the page level allocator 1552 * 1553 * Return: virtual address in a newly allocated block or ERR_PTR(-errno) 1554 */ 1555 static void *new_vmap_block(unsigned int order, gfp_t gfp_mask) 1556 { 1557 struct vmap_block_queue *vbq; 1558 struct vmap_block *vb; 1559 struct vmap_area *va; 1560 unsigned long vb_idx; 1561 int node, err; 1562 void *vaddr; 1563 1564 node = numa_node_id(); 1565 1566 vb = kmalloc_node(sizeof(struct vmap_block), 1567 gfp_mask & GFP_RECLAIM_MASK, node); 1568 if (unlikely(!vb)) 1569 return ERR_PTR(-ENOMEM); 1570 1571 va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE, 1572 VMALLOC_START, VMALLOC_END, 1573 node, gfp_mask); 1574 if (IS_ERR(va)) { 1575 kfree(vb); 1576 return ERR_CAST(va); 1577 } 1578 1579 vaddr = vmap_block_vaddr(va->va_start, 0); 1580 spin_lock_init(&vb->lock); 1581 vb->va = va; 1582 /* At least something should be left free */ 1583 BUG_ON(VMAP_BBMAP_BITS <= (1UL << order)); 1584 vb->free = VMAP_BBMAP_BITS - (1UL << order); 1585 vb->dirty = 0; 1586 vb->dirty_min = VMAP_BBMAP_BITS; 1587 vb->dirty_max = 0; 1588 INIT_LIST_HEAD(&vb->free_list); 1589 1590 vb_idx = addr_to_vb_idx(va->va_start); 1591 err = xa_insert(&vmap_blocks, vb_idx, vb, gfp_mask); 1592 if (err) { 1593 kfree(vb); 1594 free_vmap_area(va); 1595 return ERR_PTR(err); 1596 } 1597 1598 vbq = &get_cpu_var(vmap_block_queue); 1599 spin_lock(&vbq->lock); 1600 list_add_tail_rcu(&vb->free_list, &vbq->free); 1601 spin_unlock(&vbq->lock); 1602 put_cpu_var(vmap_block_queue); 1603 1604 return vaddr; 1605 } 1606 1607 static void free_vmap_block(struct vmap_block *vb) 1608 { 1609 struct vmap_block *tmp; 1610 1611 tmp = xa_erase(&vmap_blocks, addr_to_vb_idx(vb->va->va_start)); 1612 BUG_ON(tmp != vb); 1613 1614 free_vmap_area_noflush(vb->va); 1615 kfree_rcu(vb, rcu_head); 1616 } 1617 1618 static void purge_fragmented_blocks(int cpu) 1619 { 1620 LIST_HEAD(purge); 1621 struct vmap_block *vb; 1622 struct vmap_block *n_vb; 1623 struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu); 1624 1625 rcu_read_lock(); 1626 list_for_each_entry_rcu(vb, &vbq->free, free_list) { 1627 1628 if (!(vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS)) 1629 continue; 1630 1631 spin_lock(&vb->lock); 1632 if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) { 1633 vb->free = 0; /* prevent further allocs after releasing lock */ 1634 vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */ 1635 vb->dirty_min = 0; 1636 vb->dirty_max = VMAP_BBMAP_BITS; 1637 spin_lock(&vbq->lock); 1638 list_del_rcu(&vb->free_list); 1639 spin_unlock(&vbq->lock); 1640 spin_unlock(&vb->lock); 1641 list_add_tail(&vb->purge, &purge); 1642 } else 1643 spin_unlock(&vb->lock); 1644 } 1645 rcu_read_unlock(); 1646 1647 list_for_each_entry_safe(vb, n_vb, &purge, purge) { 1648 list_del(&vb->purge); 1649 free_vmap_block(vb); 1650 } 1651 } 1652 1653 static void purge_fragmented_blocks_allcpus(void) 1654 { 1655 int cpu; 1656 1657 for_each_possible_cpu(cpu) 1658 purge_fragmented_blocks(cpu); 1659 } 1660 1661 static void *vb_alloc(unsigned long size, gfp_t gfp_mask) 1662 { 1663 struct vmap_block_queue *vbq; 1664 struct vmap_block *vb; 1665 void *vaddr = NULL; 1666 unsigned int order; 1667 1668 BUG_ON(offset_in_page(size)); 1669 BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); 1670 if (WARN_ON(size == 0)) { 1671 /* 1672 * Allocating 0 bytes isn't what caller wants since 1673 * get_order(0) returns funny result. Just warn and terminate 1674 * early. 1675 */ 1676 return NULL; 1677 } 1678 order = get_order(size); 1679 1680 rcu_read_lock(); 1681 vbq = &get_cpu_var(vmap_block_queue); 1682 list_for_each_entry_rcu(vb, &vbq->free, free_list) { 1683 unsigned long pages_off; 1684 1685 spin_lock(&vb->lock); 1686 if (vb->free < (1UL << order)) { 1687 spin_unlock(&vb->lock); 1688 continue; 1689 } 1690 1691 pages_off = VMAP_BBMAP_BITS - vb->free; 1692 vaddr = vmap_block_vaddr(vb->va->va_start, pages_off); 1693 vb->free -= 1UL << order; 1694 if (vb->free == 0) { 1695 spin_lock(&vbq->lock); 1696 list_del_rcu(&vb->free_list); 1697 spin_unlock(&vbq->lock); 1698 } 1699 1700 spin_unlock(&vb->lock); 1701 break; 1702 } 1703 1704 put_cpu_var(vmap_block_queue); 1705 rcu_read_unlock(); 1706 1707 /* Allocate new block if nothing was found */ 1708 if (!vaddr) 1709 vaddr = new_vmap_block(order, gfp_mask); 1710 1711 return vaddr; 1712 } 1713 1714 static void vb_free(unsigned long addr, unsigned long size) 1715 { 1716 unsigned long offset; 1717 unsigned int order; 1718 struct vmap_block *vb; 1719 1720 BUG_ON(offset_in_page(size)); 1721 BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); 1722 1723 flush_cache_vunmap(addr, addr + size); 1724 1725 order = get_order(size); 1726 offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT; 1727 vb = xa_load(&vmap_blocks, addr_to_vb_idx(addr)); 1728 1729 unmap_kernel_range_noflush(addr, size); 1730 1731 if (debug_pagealloc_enabled_static()) 1732 flush_tlb_kernel_range(addr, addr + size); 1733 1734 spin_lock(&vb->lock); 1735 1736 /* Expand dirty range */ 1737 vb->dirty_min = min(vb->dirty_min, offset); 1738 vb->dirty_max = max(vb->dirty_max, offset + (1UL << order)); 1739 1740 vb->dirty += 1UL << order; 1741 if (vb->dirty == VMAP_BBMAP_BITS) { 1742 BUG_ON(vb->free); 1743 spin_unlock(&vb->lock); 1744 free_vmap_block(vb); 1745 } else 1746 spin_unlock(&vb->lock); 1747 } 1748 1749 static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush) 1750 { 1751 int cpu; 1752 1753 if (unlikely(!vmap_initialized)) 1754 return; 1755 1756 might_sleep(); 1757 1758 for_each_possible_cpu(cpu) { 1759 struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu); 1760 struct vmap_block *vb; 1761 1762 rcu_read_lock(); 1763 list_for_each_entry_rcu(vb, &vbq->free, free_list) { 1764 spin_lock(&vb->lock); 1765 if (vb->dirty) { 1766 unsigned long va_start = vb->va->va_start; 1767 unsigned long s, e; 1768 1769 s = va_start + (vb->dirty_min << PAGE_SHIFT); 1770 e = va_start + (vb->dirty_max << PAGE_SHIFT); 1771 1772 start = min(s, start); 1773 end = max(e, end); 1774 1775 flush = 1; 1776 } 1777 spin_unlock(&vb->lock); 1778 } 1779 rcu_read_unlock(); 1780 } 1781 1782 mutex_lock(&vmap_purge_lock); 1783 purge_fragmented_blocks_allcpus(); 1784 if (!__purge_vmap_area_lazy(start, end) && flush) 1785 flush_tlb_kernel_range(start, end); 1786 mutex_unlock(&vmap_purge_lock); 1787 } 1788 1789 /** 1790 * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer 1791 * 1792 * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily 1793 * to amortize TLB flushing overheads. What this means is that any page you 1794 * have now, may, in a former life, have been mapped into kernel virtual 1795 * address by the vmap layer and so there might be some CPUs with TLB entries 1796 * still referencing that page (additional to the regular 1:1 kernel mapping). 1797 * 1798 * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can 1799 * be sure that none of the pages we have control over will have any aliases 1800 * from the vmap layer. 1801 */ 1802 void vm_unmap_aliases(void) 1803 { 1804 unsigned long start = ULONG_MAX, end = 0; 1805 int flush = 0; 1806 1807 _vm_unmap_aliases(start, end, flush); 1808 } 1809 EXPORT_SYMBOL_GPL(vm_unmap_aliases); 1810 1811 /** 1812 * vm_unmap_ram - unmap linear kernel address space set up by vm_map_ram 1813 * @mem: the pointer returned by vm_map_ram 1814 * @count: the count passed to that vm_map_ram call (cannot unmap partial) 1815 */ 1816 void vm_unmap_ram(const void *mem, unsigned int count) 1817 { 1818 unsigned long size = (unsigned long)count << PAGE_SHIFT; 1819 unsigned long addr = (unsigned long)mem; 1820 struct vmap_area *va; 1821 1822 might_sleep(); 1823 BUG_ON(!addr); 1824 BUG_ON(addr < VMALLOC_START); 1825 BUG_ON(addr > VMALLOC_END); 1826 BUG_ON(!PAGE_ALIGNED(addr)); 1827 1828 kasan_poison_vmalloc(mem, size); 1829 1830 if (likely(count <= VMAP_MAX_ALLOC)) { 1831 debug_check_no_locks_freed(mem, size); 1832 vb_free(addr, size); 1833 return; 1834 } 1835 1836 va = find_vmap_area(addr); 1837 BUG_ON(!va); 1838 debug_check_no_locks_freed((void *)va->va_start, 1839 (va->va_end - va->va_start)); 1840 free_unmap_vmap_area(va); 1841 } 1842 EXPORT_SYMBOL(vm_unmap_ram); 1843 1844 /** 1845 * vm_map_ram - map pages linearly into kernel virtual address (vmalloc space) 1846 * @pages: an array of pointers to the pages to be mapped 1847 * @count: number of pages 1848 * @node: prefer to allocate data structures on this node 1849 * 1850 * If you use this function for less than VMAP_MAX_ALLOC pages, it could be 1851 * faster than vmap so it's good. But if you mix long-life and short-life 1852 * objects with vm_map_ram(), it could consume lots of address space through 1853 * fragmentation (especially on a 32bit machine). You could see failures in 1854 * the end. Please use this function for short-lived objects. 1855 * 1856 * Returns: a pointer to the address that has been mapped, or %NULL on failure 1857 */ 1858 void *vm_map_ram(struct page **pages, unsigned int count, int node) 1859 { 1860 unsigned long size = (unsigned long)count << PAGE_SHIFT; 1861 unsigned long addr; 1862 void *mem; 1863 1864 if (likely(count <= VMAP_MAX_ALLOC)) { 1865 mem = vb_alloc(size, GFP_KERNEL); 1866 if (IS_ERR(mem)) 1867 return NULL; 1868 addr = (unsigned long)mem; 1869 } else { 1870 struct vmap_area *va; 1871 va = alloc_vmap_area(size, PAGE_SIZE, 1872 VMALLOC_START, VMALLOC_END, node, GFP_KERNEL); 1873 if (IS_ERR(va)) 1874 return NULL; 1875 1876 addr = va->va_start; 1877 mem = (void *)addr; 1878 } 1879 1880 kasan_unpoison_vmalloc(mem, size); 1881 1882 if (map_kernel_range(addr, size, PAGE_KERNEL, pages) < 0) { 1883 vm_unmap_ram(mem, count); 1884 return NULL; 1885 } 1886 return mem; 1887 } 1888 EXPORT_SYMBOL(vm_map_ram); 1889 1890 static struct vm_struct *vmlist __initdata; 1891 1892 /** 1893 * vm_area_add_early - add vmap area early during boot 1894 * @vm: vm_struct to add 1895 * 1896 * This function is used to add fixed kernel vm area to vmlist before 1897 * vmalloc_init() is called. @vm->addr, @vm->size, and @vm->flags 1898 * should contain proper values and the other fields should be zero. 1899 * 1900 * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING. 1901 */ 1902 void __init vm_area_add_early(struct vm_struct *vm) 1903 { 1904 struct vm_struct *tmp, **p; 1905 1906 BUG_ON(vmap_initialized); 1907 for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) { 1908 if (tmp->addr >= vm->addr) { 1909 BUG_ON(tmp->addr < vm->addr + vm->size); 1910 break; 1911 } else 1912 BUG_ON(tmp->addr + tmp->size > vm->addr); 1913 } 1914 vm->next = *p; 1915 *p = vm; 1916 } 1917 1918 /** 1919 * vm_area_register_early - register vmap area early during boot 1920 * @vm: vm_struct to register 1921 * @align: requested alignment 1922 * 1923 * This function is used to register kernel vm area before 1924 * vmalloc_init() is called. @vm->size and @vm->flags should contain 1925 * proper values on entry and other fields should be zero. On return, 1926 * vm->addr contains the allocated address. 1927 * 1928 * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING. 1929 */ 1930 void __init vm_area_register_early(struct vm_struct *vm, size_t align) 1931 { 1932 static size_t vm_init_off __initdata; 1933 unsigned long addr; 1934 1935 addr = ALIGN(VMALLOC_START + vm_init_off, align); 1936 vm_init_off = PFN_ALIGN(addr + vm->size) - VMALLOC_START; 1937 1938 vm->addr = (void *)addr; 1939 1940 vm_area_add_early(vm); 1941 } 1942 1943 static void vmap_init_free_space(void) 1944 { 1945 unsigned long vmap_start = 1; 1946 const unsigned long vmap_end = ULONG_MAX; 1947 struct vmap_area *busy, *free; 1948 1949 /* 1950 * B F B B B F 1951 * -|-----|.....|-----|-----|-----|.....|- 1952 * | The KVA space | 1953 * |<--------------------------------->| 1954 */ 1955 list_for_each_entry(busy, &vmap_area_list, list) { 1956 if (busy->va_start - vmap_start > 0) { 1957 free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); 1958 if (!WARN_ON_ONCE(!free)) { 1959 free->va_start = vmap_start; 1960 free->va_end = busy->va_start; 1961 1962 insert_vmap_area_augment(free, NULL, 1963 &free_vmap_area_root, 1964 &free_vmap_area_list); 1965 } 1966 } 1967 1968 vmap_start = busy->va_end; 1969 } 1970 1971 if (vmap_end - vmap_start > 0) { 1972 free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); 1973 if (!WARN_ON_ONCE(!free)) { 1974 free->va_start = vmap_start; 1975 free->va_end = vmap_end; 1976 1977 insert_vmap_area_augment(free, NULL, 1978 &free_vmap_area_root, 1979 &free_vmap_area_list); 1980 } 1981 } 1982 } 1983 1984 void __init vmalloc_init(void) 1985 { 1986 struct vmap_area *va; 1987 struct vm_struct *tmp; 1988 int i; 1989 1990 /* 1991 * Create the cache for vmap_area objects. 1992 */ 1993 vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC); 1994 1995 for_each_possible_cpu(i) { 1996 struct vmap_block_queue *vbq; 1997 struct vfree_deferred *p; 1998 1999 vbq = &per_cpu(vmap_block_queue, i); 2000 spin_lock_init(&vbq->lock); 2001 INIT_LIST_HEAD(&vbq->free); 2002 p = &per_cpu(vfree_deferred, i); 2003 init_llist_head(&p->list); 2004 INIT_WORK(&p->wq, free_work); 2005 } 2006 2007 /* Import existing vmlist entries. */ 2008 for (tmp = vmlist; tmp; tmp = tmp->next) { 2009 va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); 2010 if (WARN_ON_ONCE(!va)) 2011 continue; 2012 2013 va->va_start = (unsigned long)tmp->addr; 2014 va->va_end = va->va_start + tmp->size; 2015 va->vm = tmp; 2016 insert_vmap_area(va, &vmap_area_root, &vmap_area_list); 2017 } 2018 2019 /* 2020 * Now we can initialize a free vmap space. 2021 */ 2022 vmap_init_free_space(); 2023 vmap_initialized = true; 2024 } 2025 2026 /** 2027 * unmap_kernel_range - unmap kernel VM area and flush cache and TLB 2028 * @addr: start of the VM area to unmap 2029 * @size: size of the VM area to unmap 2030 * 2031 * Similar to unmap_kernel_range_noflush() but flushes vcache before 2032 * the unmapping and tlb after. 2033 */ 2034 void unmap_kernel_range(unsigned long addr, unsigned long size) 2035 { 2036 unsigned long end = addr + size; 2037 2038 flush_cache_vunmap(addr, end); 2039 unmap_kernel_range_noflush(addr, size); 2040 flush_tlb_kernel_range(addr, end); 2041 } 2042 2043 static inline void setup_vmalloc_vm_locked(struct vm_struct *vm, 2044 struct vmap_area *va, unsigned long flags, const void *caller) 2045 { 2046 vm->flags = flags; 2047 vm->addr = (void *)va->va_start; 2048 vm->size = va->va_end - va->va_start; 2049 vm->caller = caller; 2050 va->vm = vm; 2051 } 2052 2053 static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, 2054 unsigned long flags, const void *caller) 2055 { 2056 spin_lock(&vmap_area_lock); 2057 setup_vmalloc_vm_locked(vm, va, flags, caller); 2058 spin_unlock(&vmap_area_lock); 2059 } 2060 2061 static void clear_vm_uninitialized_flag(struct vm_struct *vm) 2062 { 2063 /* 2064 * Before removing VM_UNINITIALIZED, 2065 * we should make sure that vm has proper values. 2066 * Pair with smp_rmb() in show_numa_info(). 2067 */ 2068 smp_wmb(); 2069 vm->flags &= ~VM_UNINITIALIZED; 2070 } 2071 2072 static struct vm_struct *__get_vm_area_node(unsigned long size, 2073 unsigned long align, unsigned long flags, unsigned long start, 2074 unsigned long end, int node, gfp_t gfp_mask, const void *caller) 2075 { 2076 struct vmap_area *va; 2077 struct vm_struct *area; 2078 unsigned long requested_size = size; 2079 2080 BUG_ON(in_interrupt()); 2081 size = PAGE_ALIGN(size); 2082 if (unlikely(!size)) 2083 return NULL; 2084 2085 if (flags & VM_IOREMAP) 2086 align = 1ul << clamp_t(int, get_count_order_long(size), 2087 PAGE_SHIFT, IOREMAP_MAX_ORDER); 2088 2089 area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node); 2090 if (unlikely(!area)) 2091 return NULL; 2092 2093 if (!(flags & VM_NO_GUARD)) 2094 size += PAGE_SIZE; 2095 2096 va = alloc_vmap_area(size, align, start, end, node, gfp_mask); 2097 if (IS_ERR(va)) { 2098 kfree(area); 2099 return NULL; 2100 } 2101 2102 kasan_unpoison_vmalloc((void *)va->va_start, requested_size); 2103 2104 setup_vmalloc_vm(area, va, flags, caller); 2105 2106 return area; 2107 } 2108 2109 struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, 2110 unsigned long start, unsigned long end, 2111 const void *caller) 2112 { 2113 return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE, 2114 GFP_KERNEL, caller); 2115 } 2116 2117 /** 2118 * get_vm_area - reserve a contiguous kernel virtual area 2119 * @size: size of the area 2120 * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC 2121 * 2122 * Search an area of @size in the kernel virtual mapping area, 2123 * and reserved it for out purposes. Returns the area descriptor 2124 * on success or %NULL on failure. 2125 * 2126 * Return: the area descriptor on success or %NULL on failure. 2127 */ 2128 struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) 2129 { 2130 return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, 2131 NUMA_NO_NODE, GFP_KERNEL, 2132 __builtin_return_address(0)); 2133 } 2134 2135 struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, 2136 const void *caller) 2137 { 2138 return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, 2139 NUMA_NO_NODE, GFP_KERNEL, caller); 2140 } 2141 2142 /** 2143 * find_vm_area - find a continuous kernel virtual area 2144 * @addr: base address 2145 * 2146 * Search for the kernel VM area starting at @addr, and return it. 2147 * It is up to the caller to do all required locking to keep the returned 2148 * pointer valid. 2149 * 2150 * Return: the area descriptor on success or %NULL on failure. 2151 */ 2152 struct vm_struct *find_vm_area(const void *addr) 2153 { 2154 struct vmap_area *va; 2155 2156 va = find_vmap_area((unsigned long)addr); 2157 if (!va) 2158 return NULL; 2159 2160 return va->vm; 2161 } 2162 2163 /** 2164 * remove_vm_area - find and remove a continuous kernel virtual area 2165 * @addr: base address 2166 * 2167 * Search for the kernel VM area starting at @addr, and remove it. 2168 * This function returns the found VM area, but using it is NOT safe 2169 * on SMP machines, except for its size or flags. 2170 * 2171 * Return: the area descriptor on success or %NULL on failure. 2172 */ 2173 struct vm_struct *remove_vm_area(const void *addr) 2174 { 2175 struct vmap_area *va; 2176 2177 might_sleep(); 2178 2179 spin_lock(&vmap_area_lock); 2180 va = __find_vmap_area((unsigned long)addr); 2181 if (va && va->vm) { 2182 struct vm_struct *vm = va->vm; 2183 2184 va->vm = NULL; 2185 spin_unlock(&vmap_area_lock); 2186 2187 kasan_free_shadow(vm); 2188 free_unmap_vmap_area(va); 2189 2190 return vm; 2191 } 2192 2193 spin_unlock(&vmap_area_lock); 2194 return NULL; 2195 } 2196 2197 static inline void set_area_direct_map(const struct vm_struct *area, 2198 int (*set_direct_map)(struct page *page)) 2199 { 2200 int i; 2201 2202 for (i = 0; i < area->nr_pages; i++) 2203 if (page_address(area->pages[i])) 2204 set_direct_map(area->pages[i]); 2205 } 2206 2207 /* Handle removing and resetting vm mappings related to the vm_struct. */ 2208 static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages) 2209 { 2210 unsigned long start = ULONG_MAX, end = 0; 2211 int flush_reset = area->flags & VM_FLUSH_RESET_PERMS; 2212 int flush_dmap = 0; 2213 int i; 2214 2215 remove_vm_area(area->addr); 2216 2217 /* If this is not VM_FLUSH_RESET_PERMS memory, no need for the below. */ 2218 if (!flush_reset) 2219 return; 2220 2221 /* 2222 * If not deallocating pages, just do the flush of the VM area and 2223 * return. 2224 */ 2225 if (!deallocate_pages) { 2226 vm_unmap_aliases(); 2227 return; 2228 } 2229 2230 /* 2231 * If execution gets here, flush the vm mapping and reset the direct 2232 * map. Find the start and end range of the direct mappings to make sure 2233 * the vm_unmap_aliases() flush includes the direct map. 2234 */ 2235 for (i = 0; i < area->nr_pages; i++) { 2236 unsigned long addr = (unsigned long)page_address(area->pages[i]); 2237 if (addr) { 2238 start = min(addr, start); 2239 end = max(addr + PAGE_SIZE, end); 2240 flush_dmap = 1; 2241 } 2242 } 2243 2244 /* 2245 * Set direct map to something invalid so that it won't be cached if 2246 * there are any accesses after the TLB flush, then flush the TLB and 2247 * reset the direct map permissions to the default. 2248 */ 2249 set_area_direct_map(area, set_direct_map_invalid_noflush); 2250 _vm_unmap_aliases(start, end, flush_dmap); 2251 set_area_direct_map(area, set_direct_map_default_noflush); 2252 } 2253 2254 static void __vunmap(const void *addr, int deallocate_pages) 2255 { 2256 struct vm_struct *area; 2257 2258 if (!addr) 2259 return; 2260 2261 if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n", 2262 addr)) 2263 return; 2264 2265 area = find_vm_area(addr); 2266 if (unlikely(!area)) { 2267 WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", 2268 addr); 2269 return; 2270 } 2271 2272 debug_check_no_locks_freed(area->addr, get_vm_area_size(area)); 2273 debug_check_no_obj_freed(area->addr, get_vm_area_size(area)); 2274 2275 kasan_poison_vmalloc(area->addr, get_vm_area_size(area)); 2276 2277 vm_remove_mappings(area, deallocate_pages); 2278 2279 if (deallocate_pages) { 2280 int i; 2281 2282 for (i = 0; i < area->nr_pages; i++) { 2283 struct page *page = area->pages[i]; 2284 2285 BUG_ON(!page); 2286 __free_pages(page, 0); 2287 } 2288 atomic_long_sub(area->nr_pages, &nr_vmalloc_pages); 2289 2290 kvfree(area->pages); 2291 } 2292 2293 kfree(area); 2294 } 2295 2296 static inline void __vfree_deferred(const void *addr) 2297 { 2298 /* 2299 * Use raw_cpu_ptr() because this can be called from preemptible 2300 * context. Preemption is absolutely fine here, because the llist_add() 2301 * implementation is lockless, so it works even if we are adding to 2302 * another cpu's list. schedule_work() should be fine with this too. 2303 */ 2304 struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred); 2305 2306 if (llist_add((struct llist_node *)addr, &p->list)) 2307 schedule_work(&p->wq); 2308 } 2309 2310 /** 2311 * vfree_atomic - release memory allocated by vmalloc() 2312 * @addr: memory base address 2313 * 2314 * This one is just like vfree() but can be called in any atomic context 2315 * except NMIs. 2316 */ 2317 void vfree_atomic(const void *addr) 2318 { 2319 BUG_ON(in_nmi()); 2320 2321 kmemleak_free(addr); 2322 2323 if (!addr) 2324 return; 2325 __vfree_deferred(addr); 2326 } 2327 2328 static void __vfree(const void *addr) 2329 { 2330 if (unlikely(in_interrupt())) 2331 __vfree_deferred(addr); 2332 else 2333 __vunmap(addr, 1); 2334 } 2335 2336 /** 2337 * vfree - Release memory allocated by vmalloc() 2338 * @addr: Memory base address 2339 * 2340 * Free the virtually continuous memory area starting at @addr, as obtained 2341 * from one of the vmalloc() family of APIs. This will usually also free the 2342 * physical memory underlying the virtual allocation, but that memory is 2343 * reference counted, so it will not be freed until the last user goes away. 2344 * 2345 * If @addr is NULL, no operation is performed. 2346 * 2347 * Context: 2348 * May sleep if called *not* from interrupt context. 2349 * Must not be called in NMI context (strictly speaking, it could be 2350 * if we have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling 2351 * conventions for vfree() arch-depenedent would be a really bad idea). 2352 */ 2353 void vfree(const void *addr) 2354 { 2355 BUG_ON(in_nmi()); 2356 2357 kmemleak_free(addr); 2358 2359 might_sleep_if(!in_interrupt()); 2360 2361 if (!addr) 2362 return; 2363 2364 __vfree(addr); 2365 } 2366 EXPORT_SYMBOL(vfree); 2367 2368 /** 2369 * vunmap - release virtual mapping obtained by vmap() 2370 * @addr: memory base address 2371 * 2372 * Free the virtually contiguous memory area starting at @addr, 2373 * which was created from the page array passed to vmap(). 2374 * 2375 * Must not be called in interrupt context. 2376 */ 2377 void vunmap(const void *addr) 2378 { 2379 BUG_ON(in_interrupt()); 2380 might_sleep(); 2381 if (addr) 2382 __vunmap(addr, 0); 2383 } 2384 EXPORT_SYMBOL(vunmap); 2385 2386 /** 2387 * vmap - map an array of pages into virtually contiguous space 2388 * @pages: array of page pointers 2389 * @count: number of pages to map 2390 * @flags: vm_area->flags 2391 * @prot: page protection for the mapping 2392 * 2393 * Maps @count pages from @pages into contiguous kernel virtual space. 2394 * If @flags contains %VM_MAP_PUT_PAGES the ownership of the pages array itself 2395 * (which must be kmalloc or vmalloc memory) and one reference per pages in it 2396 * are transferred from the caller to vmap(), and will be freed / dropped when 2397 * vfree() is called on the return value. 2398 * 2399 * Return: the address of the area or %NULL on failure 2400 */ 2401 void *vmap(struct page **pages, unsigned int count, 2402 unsigned long flags, pgprot_t prot) 2403 { 2404 struct vm_struct *area; 2405 unsigned long size; /* In bytes */ 2406 2407 might_sleep(); 2408 2409 if (count > totalram_pages()) 2410 return NULL; 2411 2412 size = (unsigned long)count << PAGE_SHIFT; 2413 area = get_vm_area_caller(size, flags, __builtin_return_address(0)); 2414 if (!area) 2415 return NULL; 2416 2417 if (map_kernel_range((unsigned long)area->addr, size, pgprot_nx(prot), 2418 pages) < 0) { 2419 vunmap(area->addr); 2420 return NULL; 2421 } 2422 2423 if (flags & VM_MAP_PUT_PAGES) { 2424 area->pages = pages; 2425 area->nr_pages = count; 2426 } 2427 return area->addr; 2428 } 2429 EXPORT_SYMBOL(vmap); 2430 2431 #ifdef CONFIG_VMAP_PFN 2432 struct vmap_pfn_data { 2433 unsigned long *pfns; 2434 pgprot_t prot; 2435 unsigned int idx; 2436 }; 2437 2438 static int vmap_pfn_apply(pte_t *pte, unsigned long addr, void *private) 2439 { 2440 struct vmap_pfn_data *data = private; 2441 2442 if (WARN_ON_ONCE(pfn_valid(data->pfns[data->idx]))) 2443 return -EINVAL; 2444 *pte = pte_mkspecial(pfn_pte(data->pfns[data->idx++], data->prot)); 2445 return 0; 2446 } 2447 2448 /** 2449 * vmap_pfn - map an array of PFNs into virtually contiguous space 2450 * @pfns: array of PFNs 2451 * @count: number of pages to map 2452 * @prot: page protection for the mapping 2453 * 2454 * Maps @count PFNs from @pfns into contiguous kernel virtual space and returns 2455 * the start address of the mapping. 2456 */ 2457 void *vmap_pfn(unsigned long *pfns, unsigned int count, pgprot_t prot) 2458 { 2459 struct vmap_pfn_data data = { .pfns = pfns, .prot = pgprot_nx(prot) }; 2460 struct vm_struct *area; 2461 2462 area = get_vm_area_caller(count * PAGE_SIZE, VM_IOREMAP, 2463 __builtin_return_address(0)); 2464 if (!area) 2465 return NULL; 2466 if (apply_to_page_range(&init_mm, (unsigned long)area->addr, 2467 count * PAGE_SIZE, vmap_pfn_apply, &data)) { 2468 free_vm_area(area); 2469 return NULL; 2470 } 2471 return area->addr; 2472 } 2473 EXPORT_SYMBOL_GPL(vmap_pfn); 2474 #endif /* CONFIG_VMAP_PFN */ 2475 2476 static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, 2477 pgprot_t prot, int node) 2478 { 2479 const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; 2480 unsigned int nr_pages = get_vm_area_size(area) >> PAGE_SHIFT; 2481 unsigned long array_size; 2482 unsigned int i; 2483 struct page **pages; 2484 2485 array_size = (unsigned long)nr_pages * sizeof(struct page *); 2486 gfp_mask |= __GFP_NOWARN; 2487 if (!(gfp_mask & (GFP_DMA | GFP_DMA32))) 2488 gfp_mask |= __GFP_HIGHMEM; 2489 2490 /* Please note that the recursion is strictly bounded. */ 2491 if (array_size > PAGE_SIZE) { 2492 pages = __vmalloc_node(array_size, 1, nested_gfp, node, 2493 area->caller); 2494 } else { 2495 pages = kmalloc_node(array_size, nested_gfp, node); 2496 } 2497 2498 if (!pages) { 2499 free_vm_area(area); 2500 return NULL; 2501 } 2502 2503 area->pages = pages; 2504 area->nr_pages = nr_pages; 2505 2506 for (i = 0; i < area->nr_pages; i++) { 2507 struct page *page; 2508 2509 if (node == NUMA_NO_NODE) 2510 page = alloc_page(gfp_mask); 2511 else 2512 page = alloc_pages_node(node, gfp_mask, 0); 2513 2514 if (unlikely(!page)) { 2515 /* Successfully allocated i pages, free them in __vfree() */ 2516 area->nr_pages = i; 2517 atomic_long_add(area->nr_pages, &nr_vmalloc_pages); 2518 goto fail; 2519 } 2520 area->pages[i] = page; 2521 if (gfpflags_allow_blocking(gfp_mask)) 2522 cond_resched(); 2523 } 2524 atomic_long_add(area->nr_pages, &nr_vmalloc_pages); 2525 2526 if (map_kernel_range((unsigned long)area->addr, get_vm_area_size(area), 2527 prot, pages) < 0) 2528 goto fail; 2529 2530 return area->addr; 2531 2532 fail: 2533 warn_alloc(gfp_mask, NULL, 2534 "vmalloc: allocation failure, allocated %ld of %ld bytes", 2535 (area->nr_pages*PAGE_SIZE), area->size); 2536 __vfree(area->addr); 2537 return NULL; 2538 } 2539 2540 /** 2541 * __vmalloc_node_range - allocate virtually contiguous memory 2542 * @size: allocation size 2543 * @align: desired alignment 2544 * @start: vm area range start 2545 * @end: vm area range end 2546 * @gfp_mask: flags for the page level allocator 2547 * @prot: protection mask for the allocated pages 2548 * @vm_flags: additional vm area flags (e.g. %VM_NO_GUARD) 2549 * @node: node to use for allocation or NUMA_NO_NODE 2550 * @caller: caller's return address 2551 * 2552 * Allocate enough pages to cover @size from the page level 2553 * allocator with @gfp_mask flags. Map them into contiguous 2554 * kernel virtual space, using a pagetable protection of @prot. 2555 * 2556 * Return: the address of the area or %NULL on failure 2557 */ 2558 void *__vmalloc_node_range(unsigned long size, unsigned long align, 2559 unsigned long start, unsigned long end, gfp_t gfp_mask, 2560 pgprot_t prot, unsigned long vm_flags, int node, 2561 const void *caller) 2562 { 2563 struct vm_struct *area; 2564 void *addr; 2565 unsigned long real_size = size; 2566 2567 size = PAGE_ALIGN(size); 2568 if (!size || (size >> PAGE_SHIFT) > totalram_pages()) 2569 goto fail; 2570 2571 area = __get_vm_area_node(real_size, align, VM_ALLOC | VM_UNINITIALIZED | 2572 vm_flags, start, end, node, gfp_mask, caller); 2573 if (!area) 2574 goto fail; 2575 2576 addr = __vmalloc_area_node(area, gfp_mask, prot, node); 2577 if (!addr) 2578 return NULL; 2579 2580 /* 2581 * In this function, newly allocated vm_struct has VM_UNINITIALIZED 2582 * flag. It means that vm_struct is not fully initialized. 2583 * Now, it is fully initialized, so remove this flag here. 2584 */ 2585 clear_vm_uninitialized_flag(area); 2586 2587 kmemleak_vmalloc(area, size, gfp_mask); 2588 2589 return addr; 2590 2591 fail: 2592 warn_alloc(gfp_mask, NULL, 2593 "vmalloc: allocation failure: %lu bytes", real_size); 2594 return NULL; 2595 } 2596 2597 /** 2598 * __vmalloc_node - allocate virtually contiguous memory 2599 * @size: allocation size 2600 * @align: desired alignment 2601 * @gfp_mask: flags for the page level allocator 2602 * @node: node to use for allocation or NUMA_NO_NODE 2603 * @caller: caller's return address 2604 * 2605 * Allocate enough pages to cover @size from the page level allocator with 2606 * @gfp_mask flags. Map them into contiguous kernel virtual space. 2607 * 2608 * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL 2609 * and __GFP_NOFAIL are not supported 2610 * 2611 * Any use of gfp flags outside of GFP_KERNEL should be consulted 2612 * with mm people. 2613 * 2614 * Return: pointer to the allocated memory or %NULL on error 2615 */ 2616 void *__vmalloc_node(unsigned long size, unsigned long align, 2617 gfp_t gfp_mask, int node, const void *caller) 2618 { 2619 return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, 2620 gfp_mask, PAGE_KERNEL, 0, node, caller); 2621 } 2622 /* 2623 * This is only for performance analysis of vmalloc and stress purpose. 2624 * It is required by vmalloc test module, therefore do not use it other 2625 * than that. 2626 */ 2627 #ifdef CONFIG_TEST_VMALLOC_MODULE 2628 EXPORT_SYMBOL_GPL(__vmalloc_node); 2629 #endif 2630 2631 void *__vmalloc(unsigned long size, gfp_t gfp_mask) 2632 { 2633 return __vmalloc_node(size, 1, gfp_mask, NUMA_NO_NODE, 2634 __builtin_return_address(0)); 2635 } 2636 EXPORT_SYMBOL(__vmalloc); 2637 2638 /** 2639 * vmalloc - allocate virtually contiguous memory 2640 * @size: allocation size 2641 * 2642 * Allocate enough pages to cover @size from the page level 2643 * allocator and map them into contiguous kernel virtual space. 2644 * 2645 * For tight control over page level allocator and protection flags 2646 * use __vmalloc() instead. 2647 * 2648 * Return: pointer to the allocated memory or %NULL on error 2649 */ 2650 void *vmalloc(unsigned long size) 2651 { 2652 return __vmalloc_node(size, 1, GFP_KERNEL, NUMA_NO_NODE, 2653 __builtin_return_address(0)); 2654 } 2655 EXPORT_SYMBOL(vmalloc); 2656 2657 /** 2658 * vzalloc - allocate virtually contiguous memory with zero fill 2659 * @size: allocation size 2660 * 2661 * Allocate enough pages to cover @size from the page level 2662 * allocator and map them into contiguous kernel virtual space. 2663 * The memory allocated is set to zero. 2664 * 2665 * For tight control over page level allocator and protection flags 2666 * use __vmalloc() instead. 2667 * 2668 * Return: pointer to the allocated memory or %NULL on error 2669 */ 2670 void *vzalloc(unsigned long size) 2671 { 2672 return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE, 2673 __builtin_return_address(0)); 2674 } 2675 EXPORT_SYMBOL(vzalloc); 2676 2677 /** 2678 * vmalloc_user - allocate zeroed virtually contiguous memory for userspace 2679 * @size: allocation size 2680 * 2681 * The resulting memory area is zeroed so it can be mapped to userspace 2682 * without leaking data. 2683 * 2684 * Return: pointer to the allocated memory or %NULL on error 2685 */ 2686 void *vmalloc_user(unsigned long size) 2687 { 2688 return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, 2689 GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL, 2690 VM_USERMAP, NUMA_NO_NODE, 2691 __builtin_return_address(0)); 2692 } 2693 EXPORT_SYMBOL(vmalloc_user); 2694 2695 /** 2696 * vmalloc_node - allocate memory on a specific node 2697 * @size: allocation size 2698 * @node: numa node 2699 * 2700 * Allocate enough pages to cover @size from the page level 2701 * allocator and map them into contiguous kernel virtual space. 2702 * 2703 * For tight control over page level allocator and protection flags 2704 * use __vmalloc() instead. 2705 * 2706 * Return: pointer to the allocated memory or %NULL on error 2707 */ 2708 void *vmalloc_node(unsigned long size, int node) 2709 { 2710 return __vmalloc_node(size, 1, GFP_KERNEL, node, 2711 __builtin_return_address(0)); 2712 } 2713 EXPORT_SYMBOL(vmalloc_node); 2714 2715 /** 2716 * vzalloc_node - allocate memory on a specific node with zero fill 2717 * @size: allocation size 2718 * @node: numa node 2719 * 2720 * Allocate enough pages to cover @size from the page level 2721 * allocator and map them into contiguous kernel virtual space. 2722 * The memory allocated is set to zero. 2723 * 2724 * Return: pointer to the allocated memory or %NULL on error 2725 */ 2726 void *vzalloc_node(unsigned long size, int node) 2727 { 2728 return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, node, 2729 __builtin_return_address(0)); 2730 } 2731 EXPORT_SYMBOL(vzalloc_node); 2732 2733 #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) 2734 #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL) 2735 #elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA) 2736 #define GFP_VMALLOC32 (GFP_DMA | GFP_KERNEL) 2737 #else 2738 /* 2739 * 64b systems should always have either DMA or DMA32 zones. For others 2740 * GFP_DMA32 should do the right thing and use the normal zone. 2741 */ 2742 #define GFP_VMALLOC32 GFP_DMA32 | GFP_KERNEL 2743 #endif 2744 2745 /** 2746 * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) 2747 * @size: allocation size 2748 * 2749 * Allocate enough 32bit PA addressable pages to cover @size from the 2750 * page level allocator and map them into contiguous kernel virtual space. 2751 * 2752 * Return: pointer to the allocated memory or %NULL on error 2753 */ 2754 void *vmalloc_32(unsigned long size) 2755 { 2756 return __vmalloc_node(size, 1, GFP_VMALLOC32, NUMA_NO_NODE, 2757 __builtin_return_address(0)); 2758 } 2759 EXPORT_SYMBOL(vmalloc_32); 2760 2761 /** 2762 * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory 2763 * @size: allocation size 2764 * 2765 * The resulting memory area is 32bit addressable and zeroed so it can be 2766 * mapped to userspace without leaking data. 2767 * 2768 * Return: pointer to the allocated memory or %NULL on error 2769 */ 2770 void *vmalloc_32_user(unsigned long size) 2771 { 2772 return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, 2773 GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, 2774 VM_USERMAP, NUMA_NO_NODE, 2775 __builtin_return_address(0)); 2776 } 2777 EXPORT_SYMBOL(vmalloc_32_user); 2778 2779 /* 2780 * small helper routine , copy contents to buf from addr. 2781 * If the page is not present, fill zero. 2782 */ 2783 2784 static int aligned_vread(char *buf, char *addr, unsigned long count) 2785 { 2786 struct page *p; 2787 int copied = 0; 2788 2789 while (count) { 2790 unsigned long offset, length; 2791 2792 offset = offset_in_page(addr); 2793 length = PAGE_SIZE - offset; 2794 if (length > count) 2795 length = count; 2796 p = vmalloc_to_page(addr); 2797 /* 2798 * To do safe access to this _mapped_ area, we need 2799 * lock. But adding lock here means that we need to add 2800 * overhead of vmalloc()/vfree() calles for this _debug_ 2801 * interface, rarely used. Instead of that, we'll use 2802 * kmap() and get small overhead in this access function. 2803 */ 2804 if (p) { 2805 /* 2806 * we can expect USER0 is not used (see vread/vwrite's 2807 * function description) 2808 */ 2809 void *map = kmap_atomic(p); 2810 memcpy(buf, map + offset, length); 2811 kunmap_atomic(map); 2812 } else 2813 memset(buf, 0, length); 2814 2815 addr += length; 2816 buf += length; 2817 copied += length; 2818 count -= length; 2819 } 2820 return copied; 2821 } 2822 2823 static int aligned_vwrite(char *buf, char *addr, unsigned long count) 2824 { 2825 struct page *p; 2826 int copied = 0; 2827 2828 while (count) { 2829 unsigned long offset, length; 2830 2831 offset = offset_in_page(addr); 2832 length = PAGE_SIZE - offset; 2833 if (length > count) 2834 length = count; 2835 p = vmalloc_to_page(addr); 2836 /* 2837 * To do safe access to this _mapped_ area, we need 2838 * lock. But adding lock here means that we need to add 2839 * overhead of vmalloc()/vfree() calles for this _debug_ 2840 * interface, rarely used. Instead of that, we'll use 2841 * kmap() and get small overhead in this access function. 2842 */ 2843 if (p) { 2844 /* 2845 * we can expect USER0 is not used (see vread/vwrite's 2846 * function description) 2847 */ 2848 void *map = kmap_atomic(p); 2849 memcpy(map + offset, buf, length); 2850 kunmap_atomic(map); 2851 } 2852 addr += length; 2853 buf += length; 2854 copied += length; 2855 count -= length; 2856 } 2857 return copied; 2858 } 2859 2860 /** 2861 * vread() - read vmalloc area in a safe way. 2862 * @buf: buffer for reading data 2863 * @addr: vm address. 2864 * @count: number of bytes to be read. 2865 * 2866 * This function checks that addr is a valid vmalloc'ed area, and 2867 * copy data from that area to a given buffer. If the given memory range 2868 * of [addr...addr+count) includes some valid address, data is copied to 2869 * proper area of @buf. If there are memory holes, they'll be zero-filled. 2870 * IOREMAP area is treated as memory hole and no copy is done. 2871 * 2872 * If [addr...addr+count) doesn't includes any intersects with alive 2873 * vm_struct area, returns 0. @buf should be kernel's buffer. 2874 * 2875 * Note: In usual ops, vread() is never necessary because the caller 2876 * should know vmalloc() area is valid and can use memcpy(). 2877 * This is for routines which have to access vmalloc area without 2878 * any information, as /dev/kmem. 2879 * 2880 * Return: number of bytes for which addr and buf should be increased 2881 * (same number as @count) or %0 if [addr...addr+count) doesn't 2882 * include any intersection with valid vmalloc area 2883 */ 2884 long vread(char *buf, char *addr, unsigned long count) 2885 { 2886 struct vmap_area *va; 2887 struct vm_struct *vm; 2888 char *vaddr, *buf_start = buf; 2889 unsigned long buflen = count; 2890 unsigned long n; 2891 2892 /* Don't allow overflow */ 2893 if ((unsigned long) addr + count < count) 2894 count = -(unsigned long) addr; 2895 2896 spin_lock(&vmap_area_lock); 2897 list_for_each_entry(va, &vmap_area_list, list) { 2898 if (!count) 2899 break; 2900 2901 if (!va->vm) 2902 continue; 2903 2904 vm = va->vm; 2905 vaddr = (char *) vm->addr; 2906 if (addr >= vaddr + get_vm_area_size(vm)) 2907 continue; 2908 while (addr < vaddr) { 2909 if (count == 0) 2910 goto finished; 2911 *buf = '\0'; 2912 buf++; 2913 addr++; 2914 count--; 2915 } 2916 n = vaddr + get_vm_area_size(vm) - addr; 2917 if (n > count) 2918 n = count; 2919 if (!(vm->flags & VM_IOREMAP)) 2920 aligned_vread(buf, addr, n); 2921 else /* IOREMAP area is treated as memory hole */ 2922 memset(buf, 0, n); 2923 buf += n; 2924 addr += n; 2925 count -= n; 2926 } 2927 finished: 2928 spin_unlock(&vmap_area_lock); 2929 2930 if (buf == buf_start) 2931 return 0; 2932 /* zero-fill memory holes */ 2933 if (buf != buf_start + buflen) 2934 memset(buf, 0, buflen - (buf - buf_start)); 2935 2936 return buflen; 2937 } 2938 2939 /** 2940 * vwrite() - write vmalloc area in a safe way. 2941 * @buf: buffer for source data 2942 * @addr: vm address. 2943 * @count: number of bytes to be read. 2944 * 2945 * This function checks that addr is a valid vmalloc'ed area, and 2946 * copy data from a buffer to the given addr. If specified range of 2947 * [addr...addr+count) includes some valid address, data is copied from 2948 * proper area of @buf. If there are memory holes, no copy to hole. 2949 * IOREMAP area is treated as memory hole and no copy is done. 2950 * 2951 * If [addr...addr+count) doesn't includes any intersects with alive 2952 * vm_struct area, returns 0. @buf should be kernel's buffer. 2953 * 2954 * Note: In usual ops, vwrite() is never necessary because the caller 2955 * should know vmalloc() area is valid and can use memcpy(). 2956 * This is for routines which have to access vmalloc area without 2957 * any information, as /dev/kmem. 2958 * 2959 * Return: number of bytes for which addr and buf should be 2960 * increased (same number as @count) or %0 if [addr...addr+count) 2961 * doesn't include any intersection with valid vmalloc area 2962 */ 2963 long vwrite(char *buf, char *addr, unsigned long count) 2964 { 2965 struct vmap_area *va; 2966 struct vm_struct *vm; 2967 char *vaddr; 2968 unsigned long n, buflen; 2969 int copied = 0; 2970 2971 /* Don't allow overflow */ 2972 if ((unsigned long) addr + count < count) 2973 count = -(unsigned long) addr; 2974 buflen = count; 2975 2976 spin_lock(&vmap_area_lock); 2977 list_for_each_entry(va, &vmap_area_list, list) { 2978 if (!count) 2979 break; 2980 2981 if (!va->vm) 2982 continue; 2983 2984 vm = va->vm; 2985 vaddr = (char *) vm->addr; 2986 if (addr >= vaddr + get_vm_area_size(vm)) 2987 continue; 2988 while (addr < vaddr) { 2989 if (count == 0) 2990 goto finished; 2991 buf++; 2992 addr++; 2993 count--; 2994 } 2995 n = vaddr + get_vm_area_size(vm) - addr; 2996 if (n > count) 2997 n = count; 2998 if (!(vm->flags & VM_IOREMAP)) { 2999 aligned_vwrite(buf, addr, n); 3000 copied++; 3001 } 3002 buf += n; 3003 addr += n; 3004 count -= n; 3005 } 3006 finished: 3007 spin_unlock(&vmap_area_lock); 3008 if (!copied) 3009 return 0; 3010 return buflen; 3011 } 3012 3013 /** 3014 * remap_vmalloc_range_partial - map vmalloc pages to userspace 3015 * @vma: vma to cover 3016 * @uaddr: target user address to start at 3017 * @kaddr: virtual address of vmalloc kernel memory 3018 * @pgoff: offset from @kaddr to start at 3019 * @size: size of map area 3020 * 3021 * Returns: 0 for success, -Exxx on failure 3022 * 3023 * This function checks that @kaddr is a valid vmalloc'ed area, 3024 * and that it is big enough to cover the range starting at 3025 * @uaddr in @vma. Will return failure if that criteria isn't 3026 * met. 3027 * 3028 * Similar to remap_pfn_range() (see mm/memory.c) 3029 */ 3030 int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr, 3031 void *kaddr, unsigned long pgoff, 3032 unsigned long size) 3033 { 3034 struct vm_struct *area; 3035 unsigned long off; 3036 unsigned long end_index; 3037 3038 if (check_shl_overflow(pgoff, PAGE_SHIFT, &off)) 3039 return -EINVAL; 3040 3041 size = PAGE_ALIGN(size); 3042 3043 if (!PAGE_ALIGNED(uaddr) || !PAGE_ALIGNED(kaddr)) 3044 return -EINVAL; 3045 3046 area = find_vm_area(kaddr); 3047 if (!area) 3048 return -EINVAL; 3049 3050 if (!(area->flags & (VM_USERMAP | VM_DMA_COHERENT))) 3051 return -EINVAL; 3052 3053 if (check_add_overflow(size, off, &end_index) || 3054 end_index > get_vm_area_size(area)) 3055 return -EINVAL; 3056 kaddr += off; 3057 3058 do { 3059 struct page *page = vmalloc_to_page(kaddr); 3060 int ret; 3061 3062 ret = vm_insert_page(vma, uaddr, page); 3063 if (ret) 3064 return ret; 3065 3066 uaddr += PAGE_SIZE; 3067 kaddr += PAGE_SIZE; 3068 size -= PAGE_SIZE; 3069 } while (size > 0); 3070 3071 vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; 3072 3073 return 0; 3074 } 3075 EXPORT_SYMBOL(remap_vmalloc_range_partial); 3076 3077 /** 3078 * remap_vmalloc_range - map vmalloc pages to userspace 3079 * @vma: vma to cover (map full range of vma) 3080 * @addr: vmalloc memory 3081 * @pgoff: number of pages into addr before first page to map 3082 * 3083 * Returns: 0 for success, -Exxx on failure 3084 * 3085 * This function checks that addr is a valid vmalloc'ed area, and 3086 * that it is big enough to cover the vma. Will return failure if 3087 * that criteria isn't met. 3088 * 3089 * Similar to remap_pfn_range() (see mm/memory.c) 3090 */ 3091 int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, 3092 unsigned long pgoff) 3093 { 3094 return remap_vmalloc_range_partial(vma, vma->vm_start, 3095 addr, pgoff, 3096 vma->vm_end - vma->vm_start); 3097 } 3098 EXPORT_SYMBOL(remap_vmalloc_range); 3099 3100 void free_vm_area(struct vm_struct *area) 3101 { 3102 struct vm_struct *ret; 3103 ret = remove_vm_area(area->addr); 3104 BUG_ON(ret != area); 3105 kfree(area); 3106 } 3107 EXPORT_SYMBOL_GPL(free_vm_area); 3108 3109 #ifdef CONFIG_SMP 3110 static struct vmap_area *node_to_va(struct rb_node *n) 3111 { 3112 return rb_entry_safe(n, struct vmap_area, rb_node); 3113 } 3114 3115 /** 3116 * pvm_find_va_enclose_addr - find the vmap_area @addr belongs to 3117 * @addr: target address 3118 * 3119 * Returns: vmap_area if it is found. If there is no such area 3120 * the first highest(reverse order) vmap_area is returned 3121 * i.e. va->va_start < addr && va->va_end < addr or NULL 3122 * if there are no any areas before @addr. 3123 */ 3124 static struct vmap_area * 3125 pvm_find_va_enclose_addr(unsigned long addr) 3126 { 3127 struct vmap_area *va, *tmp; 3128 struct rb_node *n; 3129 3130 n = free_vmap_area_root.rb_node; 3131 va = NULL; 3132 3133 while (n) { 3134 tmp = rb_entry(n, struct vmap_area, rb_node); 3135 if (tmp->va_start <= addr) { 3136 va = tmp; 3137 if (tmp->va_end >= addr) 3138 break; 3139 3140 n = n->rb_right; 3141 } else { 3142 n = n->rb_left; 3143 } 3144 } 3145 3146 return va; 3147 } 3148 3149 /** 3150 * pvm_determine_end_from_reverse - find the highest aligned address 3151 * of free block below VMALLOC_END 3152 * @va: 3153 * in - the VA we start the search(reverse order); 3154 * out - the VA with the highest aligned end address. 3155 * @align: alignment for required highest address 3156 * 3157 * Returns: determined end address within vmap_area 3158 */ 3159 static unsigned long 3160 pvm_determine_end_from_reverse(struct vmap_area **va, unsigned long align) 3161 { 3162 unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); 3163 unsigned long addr; 3164 3165 if (likely(*va)) { 3166 list_for_each_entry_from_reverse((*va), 3167 &free_vmap_area_list, list) { 3168 addr = min((*va)->va_end & ~(align - 1), vmalloc_end); 3169 if ((*va)->va_start < addr) 3170 return addr; 3171 } 3172 } 3173 3174 return 0; 3175 } 3176 3177 /** 3178 * pcpu_get_vm_areas - allocate vmalloc areas for percpu allocator 3179 * @offsets: array containing offset of each area 3180 * @sizes: array containing size of each area 3181 * @nr_vms: the number of areas to allocate 3182 * @align: alignment, all entries in @offsets and @sizes must be aligned to this 3183 * 3184 * Returns: kmalloc'd vm_struct pointer array pointing to allocated 3185 * vm_structs on success, %NULL on failure 3186 * 3187 * Percpu allocator wants to use congruent vm areas so that it can 3188 * maintain the offsets among percpu areas. This function allocates 3189 * congruent vmalloc areas for it with GFP_KERNEL. These areas tend to 3190 * be scattered pretty far, distance between two areas easily going up 3191 * to gigabytes. To avoid interacting with regular vmallocs, these 3192 * areas are allocated from top. 3193 * 3194 * Despite its complicated look, this allocator is rather simple. It 3195 * does everything top-down and scans free blocks from the end looking 3196 * for matching base. While scanning, if any of the areas do not fit the 3197 * base address is pulled down to fit the area. Scanning is repeated till 3198 * all the areas fit and then all necessary data structures are inserted 3199 * and the result is returned. 3200 */ 3201 struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, 3202 const size_t *sizes, int nr_vms, 3203 size_t align) 3204 { 3205 const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align); 3206 const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); 3207 struct vmap_area **vas, *va; 3208 struct vm_struct **vms; 3209 int area, area2, last_area, term_area; 3210 unsigned long base, start, size, end, last_end, orig_start, orig_end; 3211 bool purged = false; 3212 enum fit_type type; 3213 3214 /* verify parameters and allocate data structures */ 3215 BUG_ON(offset_in_page(align) || !is_power_of_2(align)); 3216 for (last_area = 0, area = 0; area < nr_vms; area++) { 3217 start = offsets[area]; 3218 end = start + sizes[area]; 3219 3220 /* is everything aligned properly? */ 3221 BUG_ON(!IS_ALIGNED(offsets[area], align)); 3222 BUG_ON(!IS_ALIGNED(sizes[area], align)); 3223 3224 /* detect the area with the highest address */ 3225 if (start > offsets[last_area]) 3226 last_area = area; 3227 3228 for (area2 = area + 1; area2 < nr_vms; area2++) { 3229 unsigned long start2 = offsets[area2]; 3230 unsigned long end2 = start2 + sizes[area2]; 3231 3232 BUG_ON(start2 < end && start < end2); 3233 } 3234 } 3235 last_end = offsets[last_area] + sizes[last_area]; 3236 3237 if (vmalloc_end - vmalloc_start < last_end) { 3238 WARN_ON(true); 3239 return NULL; 3240 } 3241 3242 vms = kcalloc(nr_vms, sizeof(vms[0]), GFP_KERNEL); 3243 vas = kcalloc(nr_vms, sizeof(vas[0]), GFP_KERNEL); 3244 if (!vas || !vms) 3245 goto err_free2; 3246 3247 for (area = 0; area < nr_vms; area++) { 3248 vas[area] = kmem_cache_zalloc(vmap_area_cachep, GFP_KERNEL); 3249 vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL); 3250 if (!vas[area] || !vms[area]) 3251 goto err_free; 3252 } 3253 retry: 3254 spin_lock(&free_vmap_area_lock); 3255 3256 /* start scanning - we scan from the top, begin with the last area */ 3257 area = term_area = last_area; 3258 start = offsets[area]; 3259 end = start + sizes[area]; 3260 3261 va = pvm_find_va_enclose_addr(vmalloc_end); 3262 base = pvm_determine_end_from_reverse(&va, align) - end; 3263 3264 while (true) { 3265 /* 3266 * base might have underflowed, add last_end before 3267 * comparing. 3268 */ 3269 if (base + last_end < vmalloc_start + last_end) 3270 goto overflow; 3271 3272 /* 3273 * Fitting base has not been found. 3274 */ 3275 if (va == NULL) 3276 goto overflow; 3277 3278 /* 3279 * If required width exceeds current VA block, move 3280 * base downwards and then recheck. 3281 */ 3282 if (base + end > va->va_end) { 3283 base = pvm_determine_end_from_reverse(&va, align) - end; 3284 term_area = area; 3285 continue; 3286 } 3287 3288 /* 3289 * If this VA does not fit, move base downwards and recheck. 3290 */ 3291 if (base + start < va->va_start) { 3292 va = node_to_va(rb_prev(&va->rb_node)); 3293 base = pvm_determine_end_from_reverse(&va, align) - end; 3294 term_area = area; 3295 continue; 3296 } 3297 3298 /* 3299 * This area fits, move on to the previous one. If 3300 * the previous one is the terminal one, we're done. 3301 */ 3302 area = (area + nr_vms - 1) % nr_vms; 3303 if (area == term_area) 3304 break; 3305 3306 start = offsets[area]; 3307 end = start + sizes[area]; 3308 va = pvm_find_va_enclose_addr(base + end); 3309 } 3310 3311 /* we've found a fitting base, insert all va's */ 3312 for (area = 0; area < nr_vms; area++) { 3313 int ret; 3314 3315 start = base + offsets[area]; 3316 size = sizes[area]; 3317 3318 va = pvm_find_va_enclose_addr(start); 3319 if (WARN_ON_ONCE(va == NULL)) 3320 /* It is a BUG(), but trigger recovery instead. */ 3321 goto recovery; 3322 3323 type = classify_va_fit_type(va, start, size); 3324 if (WARN_ON_ONCE(type == NOTHING_FIT)) 3325 /* It is a BUG(), but trigger recovery instead. */ 3326 goto recovery; 3327 3328 ret = adjust_va_to_fit_type(va, start, size, type); 3329 if (unlikely(ret)) 3330 goto recovery; 3331 3332 /* Allocated area. */ 3333 va = vas[area]; 3334 va->va_start = start; 3335 va->va_end = start + size; 3336 } 3337 3338 spin_unlock(&free_vmap_area_lock); 3339 3340 /* populate the kasan shadow space */ 3341 for (area = 0; area < nr_vms; area++) { 3342 if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area])) 3343 goto err_free_shadow; 3344 3345 kasan_unpoison_vmalloc((void *)vas[area]->va_start, 3346 sizes[area]); 3347 } 3348 3349 /* insert all vm's */ 3350 spin_lock(&vmap_area_lock); 3351 for (area = 0; area < nr_vms; area++) { 3352 insert_vmap_area(vas[area], &vmap_area_root, &vmap_area_list); 3353 3354 setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC, 3355 pcpu_get_vm_areas); 3356 } 3357 spin_unlock(&vmap_area_lock); 3358 3359 kfree(vas); 3360 return vms; 3361 3362 recovery: 3363 /* 3364 * Remove previously allocated areas. There is no 3365 * need in removing these areas from the busy tree, 3366 * because they are inserted only on the final step 3367 * and when pcpu_get_vm_areas() is success. 3368 */ 3369 while (area--) { 3370 orig_start = vas[area]->va_start; 3371 orig_end = vas[area]->va_end; 3372 va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root, 3373 &free_vmap_area_list); 3374 if (va) 3375 kasan_release_vmalloc(orig_start, orig_end, 3376 va->va_start, va->va_end); 3377 vas[area] = NULL; 3378 } 3379 3380 overflow: 3381 spin_unlock(&free_vmap_area_lock); 3382 if (!purged) { 3383 purge_vmap_area_lazy(); 3384 purged = true; 3385 3386 /* Before "retry", check if we recover. */ 3387 for (area = 0; area < nr_vms; area++) { 3388 if (vas[area]) 3389 continue; 3390 3391 vas[area] = kmem_cache_zalloc( 3392 vmap_area_cachep, GFP_KERNEL); 3393 if (!vas[area]) 3394 goto err_free; 3395 } 3396 3397 goto retry; 3398 } 3399 3400 err_free: 3401 for (area = 0; area < nr_vms; area++) { 3402 if (vas[area]) 3403 kmem_cache_free(vmap_area_cachep, vas[area]); 3404 3405 kfree(vms[area]); 3406 } 3407 err_free2: 3408 kfree(vas); 3409 kfree(vms); 3410 return NULL; 3411 3412 err_free_shadow: 3413 spin_lock(&free_vmap_area_lock); 3414 /* 3415 * We release all the vmalloc shadows, even the ones for regions that 3416 * hadn't been successfully added. This relies on kasan_release_vmalloc 3417 * being able to tolerate this case. 3418 */ 3419 for (area = 0; area < nr_vms; area++) { 3420 orig_start = vas[area]->va_start; 3421 orig_end = vas[area]->va_end; 3422 va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root, 3423 &free_vmap_area_list); 3424 if (va) 3425 kasan_release_vmalloc(orig_start, orig_end, 3426 va->va_start, va->va_end); 3427 vas[area] = NULL; 3428 kfree(vms[area]); 3429 } 3430 spin_unlock(&free_vmap_area_lock); 3431 kfree(vas); 3432 kfree(vms); 3433 return NULL; 3434 } 3435 3436 /** 3437 * pcpu_free_vm_areas - free vmalloc areas for percpu allocator 3438 * @vms: vm_struct pointer array returned by pcpu_get_vm_areas() 3439 * @nr_vms: the number of allocated areas 3440 * 3441 * Free vm_structs and the array allocated by pcpu_get_vm_areas(). 3442 */ 3443 void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms) 3444 { 3445 int i; 3446 3447 for (i = 0; i < nr_vms; i++) 3448 free_vm_area(vms[i]); 3449 kfree(vms); 3450 } 3451 #endif /* CONFIG_SMP */ 3452 3453 #ifdef CONFIG_PROC_FS 3454 static void *s_start(struct seq_file *m, loff_t *pos) 3455 __acquires(&vmap_purge_lock) 3456 __acquires(&vmap_area_lock) 3457 { 3458 mutex_lock(&vmap_purge_lock); 3459 spin_lock(&vmap_area_lock); 3460 3461 return seq_list_start(&vmap_area_list, *pos); 3462 } 3463 3464 static void *s_next(struct seq_file *m, void *p, loff_t *pos) 3465 { 3466 return seq_list_next(p, &vmap_area_list, pos); 3467 } 3468 3469 static void s_stop(struct seq_file *m, void *p) 3470 __releases(&vmap_area_lock) 3471 __releases(&vmap_purge_lock) 3472 { 3473 spin_unlock(&vmap_area_lock); 3474 mutex_unlock(&vmap_purge_lock); 3475 } 3476 3477 static void show_numa_info(struct seq_file *m, struct vm_struct *v) 3478 { 3479 if (IS_ENABLED(CONFIG_NUMA)) { 3480 unsigned int nr, *counters = m->private; 3481 3482 if (!counters) 3483 return; 3484 3485 if (v->flags & VM_UNINITIALIZED) 3486 return; 3487 /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */ 3488 smp_rmb(); 3489 3490 memset(counters, 0, nr_node_ids * sizeof(unsigned int)); 3491 3492 for (nr = 0; nr < v->nr_pages; nr++) 3493 counters[page_to_nid(v->pages[nr])]++; 3494 3495 for_each_node_state(nr, N_HIGH_MEMORY) 3496 if (counters[nr]) 3497 seq_printf(m, " N%u=%u", nr, counters[nr]); 3498 } 3499 } 3500 3501 static void show_purge_info(struct seq_file *m) 3502 { 3503 struct vmap_area *va; 3504 3505 spin_lock(&purge_vmap_area_lock); 3506 list_for_each_entry(va, &purge_vmap_area_list, list) { 3507 seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n", 3508 (void *)va->va_start, (void *)va->va_end, 3509 va->va_end - va->va_start); 3510 } 3511 spin_unlock(&purge_vmap_area_lock); 3512 } 3513 3514 static int s_show(struct seq_file *m, void *p) 3515 { 3516 struct vmap_area *va; 3517 struct vm_struct *v; 3518 3519 va = list_entry(p, struct vmap_area, list); 3520 3521 /* 3522 * s_show can encounter race with remove_vm_area, !vm on behalf 3523 * of vmap area is being tear down or vm_map_ram allocation. 3524 */ 3525 if (!va->vm) { 3526 seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n", 3527 (void *)va->va_start, (void *)va->va_end, 3528 va->va_end - va->va_start); 3529 3530 return 0; 3531 } 3532 3533 v = va->vm; 3534 3535 seq_printf(m, "0x%pK-0x%pK %7ld", 3536 v->addr, v->addr + v->size, v->size); 3537 3538 if (v->caller) 3539 seq_printf(m, " %pS", v->caller); 3540 3541 if (v->nr_pages) 3542 seq_printf(m, " pages=%d", v->nr_pages); 3543 3544 if (v->phys_addr) 3545 seq_printf(m, " phys=%pa", &v->phys_addr); 3546 3547 if (v->flags & VM_IOREMAP) 3548 seq_puts(m, " ioremap"); 3549 3550 if (v->flags & VM_ALLOC) 3551 seq_puts(m, " vmalloc"); 3552 3553 if (v->flags & VM_MAP) 3554 seq_puts(m, " vmap"); 3555 3556 if (v->flags & VM_USERMAP) 3557 seq_puts(m, " user"); 3558 3559 if (v->flags & VM_DMA_COHERENT) 3560 seq_puts(m, " dma-coherent"); 3561 3562 if (is_vmalloc_addr(v->pages)) 3563 seq_puts(m, " vpages"); 3564 3565 show_numa_info(m, v); 3566 seq_putc(m, '\n'); 3567 3568 /* 3569 * As a final step, dump "unpurged" areas. 3570 */ 3571 if (list_is_last(&va->list, &vmap_area_list)) 3572 show_purge_info(m); 3573 3574 return 0; 3575 } 3576 3577 static const struct seq_operations vmalloc_op = { 3578 .start = s_start, 3579 .next = s_next, 3580 .stop = s_stop, 3581 .show = s_show, 3582 }; 3583 3584 static int __init proc_vmalloc_init(void) 3585 { 3586 if (IS_ENABLED(CONFIG_NUMA)) 3587 proc_create_seq_private("vmallocinfo", 0400, NULL, 3588 &vmalloc_op, 3589 nr_node_ids * sizeof(unsigned int), NULL); 3590 else 3591 proc_create_seq("vmallocinfo", 0400, NULL, &vmalloc_op); 3592 return 0; 3593 } 3594 module_init(proc_vmalloc_init); 3595 3596 #endif 3597