// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/mm/vmalloc.c
 *
 *  Copyright (C) 1993  Linus Torvalds
 *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
 *  SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
 *  Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002
 *  Numa awareness, Christoph Lameter, SGI, June 2005
 *  Improving global KVA allocator, Uladzislau Rezki, Sony, May 2019
 */

#include <linux/vmalloc.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/highmem.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/interrupt.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/set_memory.h>
#include <linux/debugobjects.h>
#include <linux/kallsyms.h>
#include <linux/list.h>
#include <linux/notifier.h>
#include <linux/rbtree.h>
#include <linux/xarray.h>
#include <linux/rcupdate.h>
#include <linux/pfn.h>
#include <linux/kmemleak.h>
#include <linux/atomic.h>
#include <linux/compiler.h>
#include <linux/llist.h>
#include <linux/bitops.h>
#include <linux/rbtree_augmented.h>
#include <linux/overflow.h>

#include <linux/uaccess.h>
#include <asm/tlbflush.h>
#include <asm/shmparam.h>

#include "internal.h"
#include "pgalloc-track.h"

bool is_vmalloc_addr(const void *x)
{
	unsigned long addr = (unsigned long)x;

	return addr >= VMALLOC_START && addr < VMALLOC_END;
}
EXPORT_SYMBOL(is_vmalloc_addr);
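/*
 * Editor's note: an illustrative sketch (not part of the original file)
 * of what is_vmalloc_addr() distinguishes: addresses in the vmalloc range
 * versus addresses in the linear (kmalloc) mapping. The helper name below
 * is hypothetical.
 *
 *	static void example_check_addr(void)
 *	{
 *		void *lin = kmalloc(64, GFP_KERNEL);
 *		void *virt = vmalloc(PAGE_SIZE);
 *
 *		if (lin)
 *			pr_info("kmalloc addr in vmalloc space? %d\n",
 *				is_vmalloc_addr(lin));	// expected: 0
 *		if (virt)
 *			pr_info("vmalloc addr in vmalloc space? %d\n",
 *				is_vmalloc_addr(virt));	// expected: 1
 *
 *		kfree(lin);
 *		vfree(virt);
 *	}
 */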
struct vfree_deferred {
	struct llist_head list;
	struct work_struct wq;
};
static DEFINE_PER_CPU(struct vfree_deferred, vfree_deferred);

static void __vunmap(const void *, int);

static void free_work(struct work_struct *w)
{
	struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq);
	struct llist_node *t, *llnode;

	llist_for_each_safe(llnode, t, llist_del_all(&p->list))
		__vunmap((void *)llnode, 1);
}

/*** Page table manipulation functions ***/

static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
			     pgtbl_mod_mask *mask)
{
	pte_t *pte;

	pte = pte_offset_kernel(pmd, addr);
	do {
		pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte);
		WARN_ON(!pte_none(ptent) && !pte_present(ptent));
	} while (pte++, addr += PAGE_SIZE, addr != end);
	*mask |= PGTBL_PTE_MODIFIED;
}

static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
			     pgtbl_mod_mask *mask)
{
	pmd_t *pmd;
	unsigned long next;
	int cleared;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);

		cleared = pmd_clear_huge(pmd);
		if (cleared || pmd_bad(*pmd))
			*mask |= PGTBL_PMD_MODIFIED;

		if (cleared)
			continue;
		if (pmd_none_or_clear_bad(pmd))
			continue;
		vunmap_pte_range(pmd, addr, next, mask);
	} while (pmd++, addr = next, addr != end);
}

static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
			     pgtbl_mod_mask *mask)
{
	pud_t *pud;
	unsigned long next;
	int cleared;

	pud = pud_offset(p4d, addr);
	do {
		next = pud_addr_end(addr, end);

		cleared = pud_clear_huge(pud);
		if (cleared || pud_bad(*pud))
			*mask |= PGTBL_PUD_MODIFIED;

		if (cleared)
			continue;
		if (pud_none_or_clear_bad(pud))
			continue;
		vunmap_pmd_range(pud, addr, next, mask);
	} while (pud++, addr = next, addr != end);
}

static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
			     pgtbl_mod_mask *mask)
{
	p4d_t *p4d;
	unsigned long next;
	int cleared;

	p4d = p4d_offset(pgd, addr);
	do {
		next = p4d_addr_end(addr, end);

		cleared = p4d_clear_huge(p4d);
		if (cleared || p4d_bad(*p4d))
			*mask |= PGTBL_P4D_MODIFIED;

		if (cleared)
			continue;
		if (p4d_none_or_clear_bad(p4d))
			continue;
		vunmap_pud_range(p4d, addr, next, mask);
	} while (p4d++, addr = next, addr != end);
}

/**
 * unmap_kernel_range_noflush - unmap kernel VM area
 * @start: start of the VM area to unmap
 * @size: size of the VM area to unmap
 *
 * Unmap PFN_UP(@size) pages at @start. The VM area that @start and @size
 * specify should have been allocated using get_vm_area() and its friends.
 *
 * NOTE:
 * This function does NOT do any cache flushing. The caller is responsible
 * for calling flush_cache_vunmap() on to-be-unmapped areas before calling
 * this function and flush_tlb_kernel_range() after.
 */
void unmap_kernel_range_noflush(unsigned long start, unsigned long size)
{
	unsigned long end = start + size;
	unsigned long next;
	pgd_t *pgd;
	unsigned long addr = start;
	pgtbl_mod_mask mask = 0;

	BUG_ON(addr >= end);
	pgd = pgd_offset_k(addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_bad(*pgd))
			mask |= PGTBL_PGD_MODIFIED;
		if (pgd_none_or_clear_bad(pgd))
			continue;
		vunmap_p4d_range(pgd, addr, next, &mask);
	} while (pgd++, addr = next, addr != end);

	if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
		arch_sync_kernel_mappings(start, end);
}
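/*
 * Editor's note: an illustrative sketch (not part of the original file) of
 * the flushing discipline that the NOTE above requires from callers of
 * unmap_kernel_range_noflush(). It simply mirrors unmap_kernel_range()
 * further down in this file; "addr"/"size" are assumed to describe an area
 * obtained via get_vm_area() and friends.
 *
 *	static void example_unmap(unsigned long addr, unsigned long size)
 *	{
 *		unsigned long end = addr + size;
 *
 *		flush_cache_vunmap(addr, end);		// before unmapping
 *		unmap_kernel_range_noflush(addr, size);
 *		flush_tlb_kernel_range(addr, end);	// after unmapping
 *	}
 */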
static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
		unsigned long end, pgprot_t prot, struct page **pages, int *nr,
		pgtbl_mod_mask *mask)
{
	pte_t *pte;

	/*
	 * nr is a running index into the array which helps higher level
	 * callers keep track of where we're up to.
	 */

	pte = pte_alloc_kernel_track(pmd, addr, mask);
	if (!pte)
		return -ENOMEM;
	do {
		struct page *page = pages[*nr];

		if (WARN_ON(!pte_none(*pte)))
			return -EBUSY;
		if (WARN_ON(!page))
			return -ENOMEM;
		set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
		(*nr)++;
	} while (pte++, addr += PAGE_SIZE, addr != end);
	*mask |= PGTBL_PTE_MODIFIED;
	return 0;
}

static int vmap_pmd_range(pud_t *pud, unsigned long addr,
		unsigned long end, pgprot_t prot, struct page **pages, int *nr,
		pgtbl_mod_mask *mask)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_alloc_track(&init_mm, pud, addr, mask);
	if (!pmd)
		return -ENOMEM;
	do {
		next = pmd_addr_end(addr, end);
		if (vmap_pte_range(pmd, addr, next, prot, pages, nr, mask))
			return -ENOMEM;
	} while (pmd++, addr = next, addr != end);
	return 0;
}

static int vmap_pud_range(p4d_t *p4d, unsigned long addr,
		unsigned long end, pgprot_t prot, struct page **pages, int *nr,
		pgtbl_mod_mask *mask)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_alloc_track(&init_mm, p4d, addr, mask);
	if (!pud)
		return -ENOMEM;
	do {
		next = pud_addr_end(addr, end);
		if (vmap_pmd_range(pud, addr, next, prot, pages, nr, mask))
			return -ENOMEM;
	} while (pud++, addr = next, addr != end);
	return 0;
}

static int vmap_p4d_range(pgd_t *pgd, unsigned long addr,
		unsigned long end, pgprot_t prot, struct page **pages, int *nr,
		pgtbl_mod_mask *mask)
{
	p4d_t *p4d;
	unsigned long next;

	p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);
	if (!p4d)
		return -ENOMEM;
	do {
		next = p4d_addr_end(addr, end);
		if (vmap_pud_range(p4d, addr, next, prot, pages, nr, mask))
			return -ENOMEM;
	} while (p4d++, addr = next, addr != end);
	return 0;
}
/**
 * map_kernel_range_noflush - map kernel VM area with the specified pages
 * @addr: start of the VM area to map
 * @size: size of the VM area to map
 * @prot: page protection flags to use
 * @pages: pages to map
 *
 * Map PFN_UP(@size) pages at @addr. The VM area that @addr and @size
 * specify should have been allocated using get_vm_area() and its friends.
 *
 * NOTE:
 * This function does NOT do any cache flushing. The caller is responsible for
 * calling flush_cache_vmap() on to-be-mapped areas before calling this
 * function.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int map_kernel_range_noflush(unsigned long addr, unsigned long size,
			     pgprot_t prot, struct page **pages)
{
	unsigned long start = addr;
	unsigned long end = addr + size;
	unsigned long next;
	pgd_t *pgd;
	int err = 0;
	int nr = 0;
	pgtbl_mod_mask mask = 0;

	BUG_ON(addr >= end);
	pgd = pgd_offset_k(addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_bad(*pgd))
			mask |= PGTBL_PGD_MODIFIED;
		err = vmap_p4d_range(pgd, addr, next, prot, pages, &nr, &mask);
		if (err)
			return err;
	} while (pgd++, addr = next, addr != end);

	if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
		arch_sync_kernel_mappings(start, end);

	return 0;
}

int map_kernel_range(unsigned long start, unsigned long size, pgprot_t prot,
		struct page **pages)
{
	int ret;

	ret = map_kernel_range_noflush(start, size, prot, pages);
	flush_cache_vmap(start, start + size);
	return ret;
}

int is_vmalloc_or_module_addr(const void *x)
{
	/*
	 * ARM, x86-64 and sparc64 put modules in a special place,
	 * and fall back on vmalloc() if that fails. Others
	 * just put it in the vmalloc space.
	 */
#if defined(CONFIG_MODULES) && defined(MODULES_VADDR)
	unsigned long addr = (unsigned long)x;
	if (addr >= MODULES_VADDR && addr < MODULES_END)
		return 1;
#endif
	return is_vmalloc_addr(x);
}

/*
 * Walk a vmap address to the struct page it maps.
 */
struct page *vmalloc_to_page(const void *vmalloc_addr)
{
	unsigned long addr = (unsigned long) vmalloc_addr;
	struct page *page = NULL;
	pgd_t *pgd = pgd_offset_k(addr);
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *ptep, pte;

	/*
	 * XXX we might need to change this if we add VIRTUAL_BUG_ON for
	 * architectures that do not vmalloc module space
	 */
	VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr));

	if (pgd_none(*pgd))
		return NULL;
	p4d = p4d_offset(pgd, addr);
	if (p4d_none(*p4d))
		return NULL;
	pud = pud_offset(p4d, addr);

	/*
	 * Don't dereference bad PUD or PMD (below) entries. This will also
	 * identify huge mappings, which we may encounter on architectures
	 * that define CONFIG_HAVE_ARCH_HUGE_VMAP=y. Such regions will be
	 * identified as vmalloc addresses by is_vmalloc_addr(), but are
	 * not [unambiguously] associated with a struct page, so there is
	 * no correct value to return for them.
	 */
	WARN_ON_ONCE(pud_bad(*pud));
	if (pud_none(*pud) || pud_bad(*pud))
		return NULL;
	pmd = pmd_offset(pud, addr);
	WARN_ON_ONCE(pmd_bad(*pmd));
	if (pmd_none(*pmd) || pmd_bad(*pmd))
		return NULL;

	ptep = pte_offset_map(pmd, addr);
	pte = *ptep;
	if (pte_present(pte))
		page = pte_page(pte);
	pte_unmap(ptep);
	return page;
}
EXPORT_SYMBOL(vmalloc_to_page);
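/*
 * Editor's note: an illustrative sketch (not part of the original file)
 * showing vmalloc_to_page() walking a vmalloc'ed buffer page by page.
 * The helper name and the buffer size are hypothetical.
 *
 *	static void example_walk_pages(void)
 *	{
 *		void *buf = vmalloc(4 * PAGE_SIZE);
 *		unsigned long off;
 *
 *		if (!buf)
 *			return;
 *
 *		for (off = 0; off < 4 * PAGE_SIZE; off += PAGE_SIZE) {
 *			struct page *page = vmalloc_to_page(buf + off);
 *
 *			pr_info("offset %lu -> pfn %lu\n",
 *				off, (unsigned long)page_to_pfn(page));
 *		}
 *		vfree(buf);
 *	}
 */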
/*
 * Map a vmalloc()-space virtual address to the physical page frame number.
 */
unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
{
	return page_to_pfn(vmalloc_to_page(vmalloc_addr));
}
EXPORT_SYMBOL(vmalloc_to_pfn);


/*** Global kva allocator ***/

#define DEBUG_AUGMENT_PROPAGATE_CHECK 0
#define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0


static DEFINE_SPINLOCK(vmap_area_lock);
static DEFINE_SPINLOCK(free_vmap_area_lock);
/* Export for kexec only */
LIST_HEAD(vmap_area_list);
static LLIST_HEAD(vmap_purge_list);
static struct rb_root vmap_area_root = RB_ROOT;
static bool vmap_initialized __read_mostly;

/*
 * This kmem_cache is used for vmap_area objects. Instead of
 * allocating from slab we reuse an object from this cache to
 * make things faster, especially in the "no edge" splitting of
 * a free block.
 */
static struct kmem_cache *vmap_area_cachep;

/*
 * This linked list is used in pair with free_vmap_area_root.
 * It gives O(1) access to prev/next to perform fast coalescing.
 */
static LIST_HEAD(free_vmap_area_list);

/*
 * This augmented red-black tree represents the free vmap space.
 * All vmap_area objects in this tree are sorted by va->va_start
 * address. It is used for allocation and merging when a vmap
 * object is released.
 *
 * Each vmap_area node contains the maximum available free block
 * size of its sub-tree, right or left. Therefore it is possible
 * to find the lowest match of a free area.
 */
static struct rb_root free_vmap_area_root = RB_ROOT;

/*
 * Preload a CPU with one object for the "no edge" split case. The
 * aim is to get rid of allocations from the atomic context, thus
 * to use more permissive allocation masks.
 */
static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node);

static __always_inline unsigned long
va_size(struct vmap_area *va)
{
	return (va->va_end - va->va_start);
}

static __always_inline unsigned long
get_subtree_max_size(struct rb_node *node)
{
	struct vmap_area *va;

	va = rb_entry_safe(node, struct vmap_area, rb_node);
	return va ? va->subtree_max_size : 0;
}

/*
 * Called when a node is removed and the tree is rotated.
 */
static __always_inline unsigned long
compute_subtree_max_size(struct vmap_area *va)
{
	return max3(va_size(va),
		get_subtree_max_size(va->rb_node.rb_left),
		get_subtree_max_size(va->rb_node.rb_right));
}

RB_DECLARE_CALLBACKS_MAX(static, free_vmap_area_rb_augment_cb,
	struct vmap_area, rb_node, unsigned long, subtree_max_size, va_size)

static void purge_vmap_area_lazy(void);
static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
static unsigned long lazy_max_pages(void);

static atomic_long_t nr_vmalloc_pages;

unsigned long vmalloc_nr_pages(void)
{
	return atomic_long_read(&nr_vmalloc_pages);
}
static struct vmap_area *__find_vmap_area(unsigned long addr)
{
	struct rb_node *n = vmap_area_root.rb_node;

	while (n) {
		struct vmap_area *va;

		va = rb_entry(n, struct vmap_area, rb_node);
		if (addr < va->va_start)
			n = n->rb_left;
		else if (addr >= va->va_end)
			n = n->rb_right;
		else
			return va;
	}

	return NULL;
}

/*
 * This function returns the addresses of the parent node and its
 * left or right link for further processing.
 *
 * Otherwise NULL is returned. In that case all further steps
 * regarding insertion of the conflicting, overlapping range have
 * to be declined; it is actually considered a bug.
 */
static __always_inline struct rb_node **
find_va_links(struct vmap_area *va,
	struct rb_root *root, struct rb_node *from,
	struct rb_node **parent)
{
	struct vmap_area *tmp_va;
	struct rb_node **link;

	if (root) {
		link = &root->rb_node;
		if (unlikely(!*link)) {
			*parent = NULL;
			return link;
		}
	} else {
		link = &from;
	}

	/*
	 * Go to the bottom of the tree. When we hit the last point
	 * we end up with the parent rb_node and the correct direction,
	 * called "link" here, where the new va->rb_node will be attached.
	 */
	do {
		tmp_va = rb_entry(*link, struct vmap_area, rb_node);

		/*
		 * During the traversal we also do some sanity checks.
		 * Trigger a WARN() if there are side (left/right) or
		 * full overlaps.
		 */
		if (va->va_start < tmp_va->va_end &&
				va->va_end <= tmp_va->va_start)
			link = &(*link)->rb_left;
		else if (va->va_end > tmp_va->va_start &&
				va->va_start >= tmp_va->va_end)
			link = &(*link)->rb_right;
		else {
			WARN(1, "vmalloc bug: 0x%lx-0x%lx overlaps with 0x%lx-0x%lx\n",
				va->va_start, va->va_end, tmp_va->va_start, tmp_va->va_end);

			return NULL;
		}
	} while (*link);

	*parent = &tmp_va->rb_node;
	return link;
}

static __always_inline struct list_head *
get_va_next_sibling(struct rb_node *parent, struct rb_node **link)
{
	struct list_head *list;

	if (unlikely(!parent))
		/*
		 * The red-black tree where we try to find VA neighbors
		 * before merging or inserting is empty, i.e. it means
		 * there is no free vmap space. Normally it does not
		 * happen but we handle this case anyway.
		 */
		return NULL;

	list = &rb_entry(parent, struct vmap_area, rb_node)->list;
	return (&parent->rb_right == link ? list->next : list);
}

static __always_inline void
link_va(struct vmap_area *va, struct rb_root *root,
	struct rb_node *parent, struct rb_node **link, struct list_head *head)
{
	/*
	 * VA is still not in the list, but we can
	 * identify its future previous list_head node.
	 */
	if (likely(parent)) {
		head = &rb_entry(parent, struct vmap_area, rb_node)->list;
		if (&parent->rb_right != link)
			head = head->prev;
	}

	/* Insert to the rb-tree */
	rb_link_node(&va->rb_node, parent, link);
	if (root == &free_vmap_area_root) {
		/*
		 * Some explanation here. Just perform a simple insertion
		 * into the tree. We do not set va->subtree_max_size to
		 * its current size before calling rb_insert_augmented().
		 * That is because we populate the tree from the bottom
		 * towards the parent levels once the node _is_ in the tree.
		 *
		 * Therefore we set subtree_max_size to zero after insertion,
		 * to let __augment_tree_propagate_from() put everything in
		 * the correct order later on.
		 */
		rb_insert_augmented(&va->rb_node,
			root, &free_vmap_area_rb_augment_cb);
		va->subtree_max_size = 0;
	} else {
		rb_insert_color(&va->rb_node, root);
	}

	/* Address-sort this list */
	list_add(&va->list, head);
}
static __always_inline void
unlink_va(struct vmap_area *va, struct rb_root *root)
{
	if (WARN_ON(RB_EMPTY_NODE(&va->rb_node)))
		return;

	if (root == &free_vmap_area_root)
		rb_erase_augmented(&va->rb_node,
			root, &free_vmap_area_rb_augment_cb);
	else
		rb_erase(&va->rb_node, root);

	list_del(&va->list);
	RB_CLEAR_NODE(&va->rb_node);
}

#if DEBUG_AUGMENT_PROPAGATE_CHECK
static void
augment_tree_propagate_check(void)
{
	struct vmap_area *va;
	unsigned long computed_size;

	list_for_each_entry(va, &free_vmap_area_list, list) {
		computed_size = compute_subtree_max_size(va);
		if (computed_size != va->subtree_max_size)
			pr_emerg("tree is corrupted: %lu, %lu\n",
				va_size(va), va->subtree_max_size);
	}
}
#endif

/*
 * This function populates subtree_max_size from bottom to upper
 * levels starting from the VA point. The propagation must be done
 * when the VA size is modified by changing its va_start/va_end, or
 * when a new VA is inserted into the tree.
 *
 * It means that __augment_tree_propagate_from() must be called:
 * - After VA has been inserted to the tree(free path);
 * - After VA has been shrunk(allocation path);
 * - After VA has been increased(merging path).
 *
 * Please note that, it does not mean that upper parent nodes
 * and their subtree_max_size are recalculated all the time up
 * to the root node.
 *
 *       4--8
 *        /\
 *       /  \
 *      /    \
 *    2--2  8--8
 *
 * For example if we modify the node 4, shrinking it to 2, then
 * no modification is required at all. If we shrink the node 2 to 1
 * only its own subtree_max_size is updated, and set to 1. If we shrink
 * the node 8 to 6, then its subtree_max_size is set to 6 and the parent
 * node becomes 4--6.
 */
static __always_inline void
augment_tree_propagate_from(struct vmap_area *va)
{
	/*
	 * Populate the tree from bottom towards the root until
	 * the calculated maximum available size of checked node
	 * is equal to its current one.
	 */
	free_vmap_area_rb_augment_cb_propagate(&va->rb_node, NULL);

#if DEBUG_AUGMENT_PROPAGATE_CHECK
	augment_tree_propagate_check();
#endif
}

static void
insert_vmap_area(struct vmap_area *va,
	struct rb_root *root, struct list_head *head)
{
	struct rb_node **link;
	struct rb_node *parent;

	link = find_va_links(va, root, NULL, &parent);
	if (link)
		link_va(va, root, parent, link, head);
}

static void
insert_vmap_area_augment(struct vmap_area *va,
	struct rb_node *from, struct rb_root *root,
	struct list_head *head)
{
	struct rb_node **link;
	struct rb_node *parent;

	if (from)
		link = find_va_links(va, NULL, from, &parent);
	else
		link = find_va_links(va, root, NULL, &parent);

	if (link) {
		link_va(va, root, parent, link, head);
		augment_tree_propagate_from(va);
	}
}

/*
 * Merge a de-allocated chunk of VA memory with the previous
 * and next free blocks. If no coalescing is done, a new
 * free area is inserted. If VA has been merged, it is
 * freed.
 *
 * Please note, it can return NULL in case of overlapping
 * ranges, following a WARN() report. Despite being buggy
 * behaviour, the system can stay alive and keep going.
 */
static __always_inline struct vmap_area *
merge_or_add_vmap_area(struct vmap_area *va,
	struct rb_root *root, struct list_head *head)
{
	struct vmap_area *sibling;
	struct list_head *next;
	struct rb_node **link;
	struct rb_node *parent;
	bool merged = false;

	/*
	 * Find a place in the tree where VA potentially will be
	 * inserted, unless it is merged with its sibling/siblings.
	 */
	link = find_va_links(va, root, NULL, &parent);
	if (!link)
		return NULL;

	/*
	 * Get next node of VA to check if merging can be done.
	 */
	next = get_va_next_sibling(parent, link);
	if (unlikely(next == NULL))
		goto insert;

	/*
	 * start            end
	 * |                |
	 * |<------VA------>|<-----Next----->|
	 *                  |                |
	 *                  start            end
	 */
	if (next != head) {
		sibling = list_entry(next, struct vmap_area, list);
		if (sibling->va_start == va->va_end) {
			sibling->va_start = va->va_start;

			/* Free vmap_area object. */
			kmem_cache_free(vmap_area_cachep, va);

			/* Point to the new merged area. */
			va = sibling;
			merged = true;
		}
	}

	/*
	 * start            end
	 * |                |
	 * |<-----Prev----->|<------VA------>|
	 *                  |                |
	 *                  start            end
	 */
	if (next->prev != head) {
		sibling = list_entry(next->prev, struct vmap_area, list);
		if (sibling->va_end == va->va_start) {
			/*
			 * If both neighbors are coalesced, it is important
			 * to unlink the "next" node first, followed by merging
			 * with "previous" one. Otherwise the tree might not be
			 * fully populated if a sibling's augmented value is
			 * "normalized" because of rotation operations.
			 */
			if (merged)
				unlink_va(va, root);

			sibling->va_end = va->va_end;

			/* Free vmap_area object. */
			kmem_cache_free(vmap_area_cachep, va);

			/* Point to the new merged area. */
			va = sibling;
			merged = true;
		}
	}

insert:
	if (!merged)
		link_va(va, root, parent, link, head);

	/*
	 * Last step is to check and update the tree.
	 */
	augment_tree_propagate_from(va);
	return va;
}

static __always_inline bool
is_within_this_va(struct vmap_area *va, unsigned long size,
	unsigned long align, unsigned long vstart)
{
	unsigned long nva_start_addr;

	if (va->va_start > vstart)
		nva_start_addr = ALIGN(va->va_start, align);
	else
		nva_start_addr = ALIGN(vstart, align);

	/* Can be overflowed due to big size or alignment. */
	if (nva_start_addr + size < nva_start_addr ||
			nva_start_addr < vstart)
		return false;

	return (nva_start_addr + size <= va->va_end);
}

/*
 * Find the first free block (lowest start address) in the tree
 * that can satisfy the request described by the passed parameters.
 */
static __always_inline struct vmap_area *
find_vmap_lowest_match(unsigned long size,
	unsigned long align, unsigned long vstart)
{
	struct vmap_area *va;
	struct rb_node *node;
	unsigned long length;

	/* Start from the root. */
	node = free_vmap_area_root.rb_node;

	/* Adjust the search size for alignment overhead. */
	length = size + align - 1;

	while (node) {
		va = rb_entry(node, struct vmap_area, rb_node);

		if (get_subtree_max_size(node->rb_left) >= length &&
				vstart < va->va_start) {
			node = node->rb_left;
		} else {
			if (is_within_this_va(va, size, align, vstart))
				return va;

			/*
			 * Does not make sense to go deeper towards the right
			 * sub-tree if it does not have a free block that is
			 * equal to or bigger than the requested search length.
			 */
			if (get_subtree_max_size(node->rb_right) >= length) {
				node = node->rb_right;
				continue;
			}

			/*
			 * OK. We roll back and find the first right sub-tree
			 * that satisfies the search criteria. It can happen
			 * only once due to the "vstart" restriction.
			 */
			while ((node = rb_parent(node))) {
				va = rb_entry(node, struct vmap_area, rb_node);
				if (is_within_this_va(va, size, align, vstart))
					return va;

				if (get_subtree_max_size(node->rb_right) >= length &&
						vstart <= va->va_start) {
					node = node->rb_right;
					break;
				}
			}
		}
	}

	return NULL;
}

#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
#include <linux/random.h>

static struct vmap_area *
find_vmap_lowest_linear_match(unsigned long size,
	unsigned long align, unsigned long vstart)
{
	struct vmap_area *va;

	list_for_each_entry(va, &free_vmap_area_list, list) {
		if (!is_within_this_va(va, size, align, vstart))
			continue;

		return va;
	}

	return NULL;
}

static void
find_vmap_lowest_match_check(unsigned long size)
{
	struct vmap_area *va_1, *va_2;
	unsigned long vstart;
	unsigned int rnd;

	get_random_bytes(&rnd, sizeof(rnd));
	vstart = VMALLOC_START + rnd;

	va_1 = find_vmap_lowest_match(size, 1, vstart);
	va_2 = find_vmap_lowest_linear_match(size, 1, vstart);

	if (va_1 != va_2)
		pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n",
			va_1, va_2, vstart);
}
#endif

enum fit_type {
	NOTHING_FIT = 0,
	FL_FIT_TYPE = 1,	/* full fit */
	LE_FIT_TYPE = 2,	/* left edge fit */
	RE_FIT_TYPE = 3,	/* right edge fit */
	NE_FIT_TYPE = 4		/* no edge fit */
};

static __always_inline enum fit_type
classify_va_fit_type(struct vmap_area *va,
	unsigned long nva_start_addr, unsigned long size)
{
	enum fit_type type;

	/* Check if it is within VA. */
	if (nva_start_addr < va->va_start ||
			nva_start_addr + size > va->va_end)
		return NOTHING_FIT;

	/* Now classify. */
	if (va->va_start == nva_start_addr) {
		if (va->va_end == nva_start_addr + size)
			type = FL_FIT_TYPE;
		else
			type = LE_FIT_TYPE;
	} else if (va->va_end == nva_start_addr + size) {
		type = RE_FIT_TYPE;
	} else {
		type = NE_FIT_TYPE;
	}

	return type;
}
static __always_inline int
adjust_va_to_fit_type(struct vmap_area *va,
	unsigned long nva_start_addr, unsigned long size,
	enum fit_type type)
{
	struct vmap_area *lva = NULL;

	if (type == FL_FIT_TYPE) {
		/*
		 * No need to split VA, it fully fits.
		 *
		 * |               |
		 * V      NVA      V
		 * |---------------|
		 */
		unlink_va(va, &free_vmap_area_root);
		kmem_cache_free(vmap_area_cachep, va);
	} else if (type == LE_FIT_TYPE) {
		/*
		 * Split left edge of fit VA.
		 *
		 * |       |
		 * V  NVA  V   R
		 * |-------|-------|
		 */
		va->va_start += size;
	} else if (type == RE_FIT_TYPE) {
		/*
		 * Split right edge of fit VA.
		 *
		 *         |       |
		 *     L   V  NVA  V
		 * |-------|-------|
		 */
		va->va_end = nva_start_addr;
	} else if (type == NE_FIT_TYPE) {
		/*
		 * Split no edge of fit VA.
		 *
		 *     |       |
		 *   L V  NVA  V R
		 * |---|-------|---|
		 */
		lva = __this_cpu_xchg(ne_fit_preload_node, NULL);
		if (unlikely(!lva)) {
			/*
			 * For the percpu allocator we do not do any
			 * pre-allocation and leave it as it is. The reason is
			 * that it most likely never ends up with NE_FIT_TYPE
			 * splitting. In case of percpu allocations, offsets
			 * and sizes are aligned to a fixed align request,
			 * i.e. RE_FIT_TYPE and FL_FIT_TYPE are its main
			 * fitting cases.
			 *
			 * There are a few exceptions though; for example the
			 * first allocation (early boot up), when we have "one"
			 * big free space that has to be split.
			 *
			 * We can also hit this path in case of regular "vmap"
			 * allocations, if "this" current CPU was not preloaded.
			 * See the comment in alloc_vmap_area() why. If so, then
			 * GFP_NOWAIT is used instead to get an extra object for
			 * split purposes. That is rare and most of the time
			 * does not occur.
			 *
			 * What happens if an allocation fails? Basically, an
			 * "overflow" path is triggered to purge lazily freed
			 * areas to free some memory, then the "retry" path is
			 * triggered to repeat one more time. See more details
			 * in the alloc_vmap_area() function.
			 */
			lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT);
			if (!lva)
				return -1;
		}

		/*
		 * Build the remainder.
		 */
		lva->va_start = va->va_start;
		lva->va_end = nva_start_addr;

		/*
		 * Shrink this VA to remaining size.
		 */
		va->va_start = nva_start_addr + size;
	} else {
		return -1;
	}

	if (type != FL_FIT_TYPE) {
		augment_tree_propagate_from(va);

		if (lva)	/* type == NE_FIT_TYPE */
			insert_vmap_area_augment(lva, &va->rb_node,
				&free_vmap_area_root, &free_vmap_area_list);
	}

	return 0;
}

/*
 * Returns a start address of the newly allocated area, if success.
 * Otherwise a vend is returned that indicates failure.
 */
static __always_inline unsigned long
__alloc_vmap_area(unsigned long size, unsigned long align,
	unsigned long vstart, unsigned long vend)
{
	unsigned long nva_start_addr;
	struct vmap_area *va;
	enum fit_type type;
	int ret;

	va = find_vmap_lowest_match(size, align, vstart);
	if (unlikely(!va))
		return vend;

	if (va->va_start > vstart)
		nva_start_addr = ALIGN(va->va_start, align);
	else
		nva_start_addr = ALIGN(vstart, align);

	/* Check the "vend" restriction. */
	if (nva_start_addr + size > vend)
		return vend;

	/* Classify what we have found. */
	type = classify_va_fit_type(va, nva_start_addr, size);
	if (WARN_ON_ONCE(type == NOTHING_FIT))
		return vend;

	/* Update the free vmap_area. */
	ret = adjust_va_to_fit_type(va, nva_start_addr, size, type);
	if (ret)
		return vend;

#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
	find_vmap_lowest_match_check(size);
#endif

	return nva_start_addr;
}
/*
 * Free a region of KVA allocated by alloc_vmap_area
 */
static void free_vmap_area(struct vmap_area *va)
{
	/*
	 * Remove from the busy tree/list.
	 */
	spin_lock(&vmap_area_lock);
	unlink_va(va, &vmap_area_root);
	spin_unlock(&vmap_area_lock);

	/*
	 * Insert/Merge it back to the free tree/list.
	 */
	spin_lock(&free_vmap_area_lock);
	merge_or_add_vmap_area(va, &free_vmap_area_root, &free_vmap_area_list);
	spin_unlock(&free_vmap_area_lock);
}

/*
 * Allocate a region of KVA of the specified size and alignment, within the
 * vstart and vend.
 */
static struct vmap_area *alloc_vmap_area(unsigned long size,
				unsigned long align,
				unsigned long vstart, unsigned long vend,
				int node, gfp_t gfp_mask)
{
	struct vmap_area *va, *pva;
	unsigned long addr;
	int purged = 0;
	int ret;

	BUG_ON(!size);
	BUG_ON(offset_in_page(size));
	BUG_ON(!is_power_of_2(align));

	if (unlikely(!vmap_initialized))
		return ERR_PTR(-EBUSY);

	might_sleep();
	gfp_mask = gfp_mask & GFP_RECLAIM_MASK;

	va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
	if (unlikely(!va))
		return ERR_PTR(-ENOMEM);

	/*
	 * Only scan the relevant parts containing pointers to other objects
	 * to avoid false negatives.
	 */
	kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask);

retry:
	/*
	 * Preload this CPU with one extra vmap_area object. It is used
	 * when the fit type of the free area is NE_FIT_TYPE. Please note,
	 * it does not guarantee that an allocation occurs on a CPU that
	 * is preloaded; instead we minimize the case when it is not.
	 * That can still happen because of cpu migration, since there is
	 * a race until the spinlock below is taken.
	 *
	 * The preload is done in non-atomic context, thus it allows us
	 * to use more permissive allocation masks and to be more stable
	 * under low memory conditions and high memory pressure. In the
	 * rare case of not being preloaded, GFP_NOWAIT is used.
	 *
	 * Set "pva" to NULL here, because of the "retry" path.
	 */
	pva = NULL;

	if (!this_cpu_read(ne_fit_preload_node))
		/*
		 * Even if it fails we do not really care about that.
		 * Just proceed as it is. If needed the "overflow" path
		 * will refill the cache we allocate from.
		 */
		pva = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);

	spin_lock(&free_vmap_area_lock);

	if (pva && __this_cpu_cmpxchg(ne_fit_preload_node, NULL, pva))
		kmem_cache_free(vmap_area_cachep, pva);

	/*
	 * If an allocation fails, the "vend" address is
	 * returned. Therefore trigger the overflow path.
	 */
	addr = __alloc_vmap_area(size, align, vstart, vend);
	spin_unlock(&free_vmap_area_lock);

	if (unlikely(addr == vend))
		goto overflow;

	va->va_start = addr;
	va->va_end = addr + size;
	va->vm = NULL;


	spin_lock(&vmap_area_lock);
	insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
	spin_unlock(&vmap_area_lock);

	BUG_ON(!IS_ALIGNED(va->va_start, align));
	BUG_ON(va->va_start < vstart);
	BUG_ON(va->va_end > vend);

	ret = kasan_populate_vmalloc(addr, size);
	if (ret) {
		free_vmap_area(va);
		return ERR_PTR(ret);
	}

	return va;

overflow:
	if (!purged) {
		purge_vmap_area_lazy();
		purged = 1;
		goto retry;
	}

	if (gfpflags_allow_blocking(gfp_mask)) {
		unsigned long freed = 0;
		blocking_notifier_call_chain(&vmap_notify_list, 0, &freed);
		if (freed > 0) {
			purged = 0;
			goto retry;
		}
	}

	if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit())
		pr_warn("vmap allocation for size %lu failed: use vmalloc=<size> to increase size\n",
			size);

	kmem_cache_free(vmap_area_cachep, va);
	return ERR_PTR(-EBUSY);
}
int register_vmap_purge_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_register(&vmap_notify_list, nb);
}
EXPORT_SYMBOL_GPL(register_vmap_purge_notifier);

int unregister_vmap_purge_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_unregister(&vmap_notify_list, nb);
}
EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier);
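/*
 * Editor's note: an illustrative sketch (not part of the original file) of
 * how a subsystem might hook the purge notifier chain used by the overflow
 * path in alloc_vmap_area(). The callback name and the amount it reports
 * are hypothetical; the chain passes a pointer to a "freed" page counter,
 * as seen in the overflow path above.
 *
 *	static int example_vmap_purge(struct notifier_block *nb,
 *				      unsigned long action, void *data)
 *	{
 *		unsigned long *freed = data;
 *
 *		// Drop some subsystem-private vmalloc usage here and
 *		// report how many pages were released.
 *		*freed += 0;
 *		return NOTIFY_OK;
 *	}
 *
 *	static struct notifier_block example_vmap_nb = {
 *		.notifier_call = example_vmap_purge,
 *	};
 *
 *	// register_vmap_purge_notifier(&example_vmap_nb);
 *	// ...
 *	// unregister_vmap_purge_notifier(&example_vmap_nb);
 */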
/*
 * lazy_max_pages is the maximum amount of virtual address space we gather up
 * before attempting to purge with a TLB flush.
 *
 * There is a tradeoff here: a larger number will cover more kernel page tables
 * and take slightly longer to purge, but it will linearly reduce the number of
 * global TLB flushes that must be performed. It would seem natural to scale
 * this number up linearly with the number of CPUs (because vmapping activity
 * could also scale linearly with the number of CPUs), however it is likely
 * that in practice, workloads might be constrained in other ways that mean
 * vmap activity will not scale linearly with CPUs. Also, I want to be
 * conservative and not introduce a big latency on huge systems, so go with
 * a less aggressive log scale. It will still be an improvement over the old
 * code, and it will be simple to change the scale factor if we find that it
 * becomes a problem on bigger systems.
 */
static unsigned long lazy_max_pages(void)
{
	unsigned int log;

	log = fls(num_online_cpus());

	return log * (32UL * 1024 * 1024 / PAGE_SIZE);
}

static atomic_long_t vmap_lazy_nr = ATOMIC_LONG_INIT(0);

/*
 * Serialize vmap purging. There is no actual critical section protected
 * by this lock, but we want to avoid concurrent calls for performance
 * reasons and to make pcpu_get_vm_areas more deterministic.
 */
static DEFINE_MUTEX(vmap_purge_lock);

/* for per-CPU blocks */
static void purge_fragmented_blocks_allcpus(void);

/*
 * called before a call to iounmap() if the caller wants vm_area_struct's
 * immediately freed.
 */
void set_iounmap_nonlazy(void)
{
	atomic_long_set(&vmap_lazy_nr, lazy_max_pages()+1);
}

/*
 * Purges all lazily-freed vmap areas.
 */
static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
{
	unsigned long resched_threshold;
	struct llist_node *valist;
	struct vmap_area *va;
	struct vmap_area *n_va;

	lockdep_assert_held(&vmap_purge_lock);

	valist = llist_del_all(&vmap_purge_list);
	if (unlikely(valist == NULL))
		return false;

	/*
	 * TODO: calculate a flush range without looping.
	 * The list can be up to lazy_max_pages() elements.
	 */
	llist_for_each_entry(va, valist, purge_list) {
		if (va->va_start < start)
			start = va->va_start;
		if (va->va_end > end)
			end = va->va_end;
	}

	flush_tlb_kernel_range(start, end);
	resched_threshold = lazy_max_pages() << 1;

	spin_lock(&free_vmap_area_lock);
	llist_for_each_entry_safe(va, n_va, valist, purge_list) {
		unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
		unsigned long orig_start = va->va_start;
		unsigned long orig_end = va->va_end;

		/*
		 * Finally insert or merge lazily-freed area. It is
		 * detached and there is no need to "unlink" it from
		 * anything.
		 */
		va = merge_or_add_vmap_area(va, &free_vmap_area_root,
					    &free_vmap_area_list);

		if (!va)
			continue;

		if (is_vmalloc_or_module_addr((void *)orig_start))
			kasan_release_vmalloc(orig_start, orig_end,
					      va->va_start, va->va_end);

		atomic_long_sub(nr, &vmap_lazy_nr);

		if (atomic_long_read(&vmap_lazy_nr) < resched_threshold)
			cond_resched_lock(&free_vmap_area_lock);
	}
	spin_unlock(&free_vmap_area_lock);
	return true;
}

/*
 * Kick off a purge of the outstanding lazy areas. Don't bother if somebody
 * is already purging.
 */
static void try_purge_vmap_area_lazy(void)
{
	if (mutex_trylock(&vmap_purge_lock)) {
		__purge_vmap_area_lazy(ULONG_MAX, 0);
		mutex_unlock(&vmap_purge_lock);
	}
}

/*
 * Kick off a purge of the outstanding lazy areas.
 */
static void purge_vmap_area_lazy(void)
{
	mutex_lock(&vmap_purge_lock);
	purge_fragmented_blocks_allcpus();
	__purge_vmap_area_lazy(ULONG_MAX, 0);
	mutex_unlock(&vmap_purge_lock);
}
/*
 * Free a vmap area, caller ensuring that the area has been unmapped
 * and that flush_cache_vunmap has been called for the correct range
 * previously.
 */
static void free_vmap_area_noflush(struct vmap_area *va)
{
	unsigned long nr_lazy;

	spin_lock(&vmap_area_lock);
	unlink_va(va, &vmap_area_root);
	spin_unlock(&vmap_area_lock);

	nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >>
				PAGE_SHIFT, &vmap_lazy_nr);

	/* After this point, we may free va at any time */
	llist_add(&va->purge_list, &vmap_purge_list);

	if (unlikely(nr_lazy > lazy_max_pages()))
		try_purge_vmap_area_lazy();
}

/*
 * Free and unmap a vmap area
 */
static void free_unmap_vmap_area(struct vmap_area *va)
{
	flush_cache_vunmap(va->va_start, va->va_end);
	unmap_kernel_range_noflush(va->va_start, va->va_end - va->va_start);
	if (debug_pagealloc_enabled_static())
		flush_tlb_kernel_range(va->va_start, va->va_end);

	free_vmap_area_noflush(va);
}

static struct vmap_area *find_vmap_area(unsigned long addr)
{
	struct vmap_area *va;

	spin_lock(&vmap_area_lock);
	va = __find_vmap_area(addr);
	spin_unlock(&vmap_area_lock);

	return va;
}

/*** Per cpu kva allocator ***/

/*
 * vmap space is limited especially on 32 bit architectures. Ensure there is
 * room for at least 16 percpu vmap blocks per CPU.
 */
/*
 * If we had a constant VMALLOC_START and VMALLOC_END, we'd like to be able
 * to #define VMALLOC_SPACE	(VMALLOC_END-VMALLOC_START). Guess
 * instead (we just need a rough idea)
 */
#if BITS_PER_LONG == 32
#define VMALLOC_SPACE		(128UL*1024*1024)
#else
#define VMALLOC_SPACE		(128UL*1024*1024*1024)
#endif

#define VMALLOC_PAGES		(VMALLOC_SPACE / PAGE_SIZE)
#define VMAP_MAX_ALLOC		BITS_PER_LONG	/* 256K with 4K pages */
#define VMAP_BBMAP_BITS_MAX	1024	/* 4MB with 4K pages */
#define VMAP_BBMAP_BITS_MIN	(VMAP_MAX_ALLOC*2)
#define VMAP_MIN(x, y)		((x) < (y) ? (x) : (y)) /* can't use min() */
#define VMAP_MAX(x, y)		((x) > (y) ? (x) : (y)) /* can't use max() */
#define VMAP_BBMAP_BITS		\
		VMAP_MIN(VMAP_BBMAP_BITS_MAX,	\
		VMAP_MAX(VMAP_BBMAP_BITS_MIN,	\
			VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16))

#define VMAP_BLOCK_SIZE		(VMAP_BBMAP_BITS * PAGE_SIZE)
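/*
 * Editor's note (illustrative worked example, not part of the original
 * file): with the guesses above on a 64-bit kernel with 4K pages and
 * NR_CPUS = 64, VMALLOC_PAGES = 128G / 4K = 32M pages, so
 * VMALLOC_PAGES / 64 / 16 = 32768 bits, which is then clamped by
 * VMAP_BBMAP_BITS_MAX to 1024 bits, giving VMAP_BLOCK_SIZE = 4MB and
 * matching the "4MB with 4K pages" comment above.
 */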
struct vmap_block_queue {
	spinlock_t lock;
	struct list_head free;
};

struct vmap_block {
	spinlock_t lock;
	struct vmap_area *va;
	unsigned long free, dirty;
	unsigned long dirty_min, dirty_max; /*< dirty range */
	struct list_head free_list;
	struct rcu_head rcu_head;
	struct list_head purge;
};

/* Queue of free and dirty vmap blocks, for allocation and flushing purposes */
static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue);

/*
 * XArray of vmap blocks, indexed by address, to quickly find a vmap block
 * in the free path. Could get rid of this if we change the API to return a
 * "cookie" from alloc, to be passed to free. But no big deal yet.
 */
static DEFINE_XARRAY(vmap_blocks);

/*
 * We should probably have a fallback mechanism to allocate virtual memory
 * out of partially filled vmap blocks. However vmap block sizing should be
 * fairly reasonable according to the vmalloc size, so it shouldn't be a
 * big problem.
 */

static unsigned long addr_to_vb_idx(unsigned long addr)
{
	addr -= VMALLOC_START & ~(VMAP_BLOCK_SIZE-1);
	addr /= VMAP_BLOCK_SIZE;
	return addr;
}

static void *vmap_block_vaddr(unsigned long va_start, unsigned long pages_off)
{
	unsigned long addr;

	addr = va_start + (pages_off << PAGE_SHIFT);
	BUG_ON(addr_to_vb_idx(addr) != addr_to_vb_idx(va_start));
	return (void *)addr;
}

/**
 * new_vmap_block - allocate a new vmap_block and occupy 2^order pages in it.
 *                  The number of pages cannot exceed VMAP_BBMAP_BITS.
 * @order: how many 2^order pages should be occupied in the newly allocated block
 * @gfp_mask: flags for the page level allocator
 *
 * Return: virtual address in a newly allocated block or ERR_PTR(-errno)
 */
static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
{
	struct vmap_block_queue *vbq;
	struct vmap_block *vb;
	struct vmap_area *va;
	unsigned long vb_idx;
	int node, err;
	void *vaddr;

	node = numa_node_id();

	vb = kmalloc_node(sizeof(struct vmap_block),
			gfp_mask & GFP_RECLAIM_MASK, node);
	if (unlikely(!vb))
		return ERR_PTR(-ENOMEM);

	va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
					VMALLOC_START, VMALLOC_END,
					node, gfp_mask);
	if (IS_ERR(va)) {
		kfree(vb);
		return ERR_CAST(va);
	}

	vaddr = vmap_block_vaddr(va->va_start, 0);
	spin_lock_init(&vb->lock);
	vb->va = va;
	/* At least something should be left free */
	BUG_ON(VMAP_BBMAP_BITS <= (1UL << order));
	vb->free = VMAP_BBMAP_BITS - (1UL << order);
	vb->dirty = 0;
	vb->dirty_min = VMAP_BBMAP_BITS;
	vb->dirty_max = 0;
	INIT_LIST_HEAD(&vb->free_list);

	vb_idx = addr_to_vb_idx(va->va_start);
	err = xa_insert(&vmap_blocks, vb_idx, vb, gfp_mask);
	if (err) {
		kfree(vb);
		free_vmap_area(va);
		return ERR_PTR(err);
	}

	vbq = &get_cpu_var(vmap_block_queue);
	spin_lock(&vbq->lock);
	list_add_tail_rcu(&vb->free_list, &vbq->free);
	spin_unlock(&vbq->lock);
	put_cpu_var(vmap_block_queue);

	return vaddr;
}

static void free_vmap_block(struct vmap_block *vb)
{
	struct vmap_block *tmp;

	tmp = xa_erase(&vmap_blocks, addr_to_vb_idx(vb->va->va_start));
	BUG_ON(tmp != vb);

	free_vmap_area_noflush(vb->va);
	kfree_rcu(vb, rcu_head);
}

static void purge_fragmented_blocks(int cpu)
{
	LIST_HEAD(purge);
	struct vmap_block *vb;
	struct vmap_block *n_vb;
	struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);

	rcu_read_lock();
	list_for_each_entry_rcu(vb, &vbq->free, free_list) {

		if (!(vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS))
			continue;

		spin_lock(&vb->lock);
		if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) {
			vb->free = 0; /* prevent further allocs after releasing lock */
			vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */
			vb->dirty_min = 0;
			vb->dirty_max = VMAP_BBMAP_BITS;
			spin_lock(&vbq->lock);
			list_del_rcu(&vb->free_list);
			spin_unlock(&vbq->lock);
			spin_unlock(&vb->lock);
			list_add_tail(&vb->purge, &purge);
		} else
			spin_unlock(&vb->lock);
	}
	rcu_read_unlock();

	list_for_each_entry_safe(vb, n_vb, &purge, purge) {
		list_del(&vb->purge);
		free_vmap_block(vb);
	}
}
static void purge_fragmented_blocks_allcpus(void)
{
	int cpu;

	for_each_possible_cpu(cpu)
		purge_fragmented_blocks(cpu);
}

static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
{
	struct vmap_block_queue *vbq;
	struct vmap_block *vb;
	void *vaddr = NULL;
	unsigned int order;

	BUG_ON(offset_in_page(size));
	BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
	if (WARN_ON(size == 0)) {
		/*
		 * Allocating 0 bytes isn't what the caller wants since
		 * get_order(0) returns a funny result. Just warn and
		 * terminate early.
		 */
		return NULL;
	}
	order = get_order(size);

	rcu_read_lock();
	vbq = &get_cpu_var(vmap_block_queue);
	list_for_each_entry_rcu(vb, &vbq->free, free_list) {
		unsigned long pages_off;

		spin_lock(&vb->lock);
		if (vb->free < (1UL << order)) {
			spin_unlock(&vb->lock);
			continue;
		}

		pages_off = VMAP_BBMAP_BITS - vb->free;
		vaddr = vmap_block_vaddr(vb->va->va_start, pages_off);
		vb->free -= 1UL << order;
		if (vb->free == 0) {
			spin_lock(&vbq->lock);
			list_del_rcu(&vb->free_list);
			spin_unlock(&vbq->lock);
		}

		spin_unlock(&vb->lock);
		break;
	}

	put_cpu_var(vmap_block_queue);
	rcu_read_unlock();

	/* Allocate new block if nothing was found */
	if (!vaddr)
		vaddr = new_vmap_block(order, gfp_mask);

	return vaddr;
}

static void vb_free(unsigned long addr, unsigned long size)
{
	unsigned long offset;
	unsigned int order;
	struct vmap_block *vb;

	BUG_ON(offset_in_page(size));
	BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);

	flush_cache_vunmap(addr, addr + size);

	order = get_order(size);
	offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT;
	vb = xa_load(&vmap_blocks, addr_to_vb_idx(addr));

	unmap_kernel_range_noflush(addr, size);

	if (debug_pagealloc_enabled_static())
		flush_tlb_kernel_range(addr, addr + size);

	spin_lock(&vb->lock);

	/* Expand dirty range */
	vb->dirty_min = min(vb->dirty_min, offset);
	vb->dirty_max = max(vb->dirty_max, offset + (1UL << order));

	vb->dirty += 1UL << order;
	if (vb->dirty == VMAP_BBMAP_BITS) {
		BUG_ON(vb->free);
		spin_unlock(&vb->lock);
		free_vmap_block(vb);
	} else
		spin_unlock(&vb->lock);
}

static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush)
{
	int cpu;

	if (unlikely(!vmap_initialized))
		return;

	might_sleep();

	for_each_possible_cpu(cpu) {
		struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
		struct vmap_block *vb;

		rcu_read_lock();
		list_for_each_entry_rcu(vb, &vbq->free, free_list) {
			spin_lock(&vb->lock);
			if (vb->dirty) {
				unsigned long va_start = vb->va->va_start;
				unsigned long s, e;

				s = va_start + (vb->dirty_min << PAGE_SHIFT);
				e = va_start + (vb->dirty_max << PAGE_SHIFT);

				start = min(s, start);
				end   = max(e, end);

				flush = 1;
			}
			spin_unlock(&vb->lock);
		}
		rcu_read_unlock();
	}

	mutex_lock(&vmap_purge_lock);
	purge_fragmented_blocks_allcpus();
	if (!__purge_vmap_area_lazy(start, end) && flush)
		flush_tlb_kernel_range(start, end);
	mutex_unlock(&vmap_purge_lock);
}
/**
 * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer
 *
 * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily
 * to amortize TLB flushing overheads. What this means is that any page you
 * have now, may, in a former life, have been mapped into kernel virtual
 * address space by the vmap layer and so there might be some CPUs with TLB
 * entries still referencing that page (additional to the regular 1:1 kernel
 * mapping).
 *
 * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can
 * be sure that none of the pages we have control over will have any aliases
 * from the vmap layer.
 */
void vm_unmap_aliases(void)
{
	unsigned long start = ULONG_MAX, end = 0;
	int flush = 0;

	_vm_unmap_aliases(start, end, flush);
}
EXPORT_SYMBOL_GPL(vm_unmap_aliases);
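/*
 * Editor's note: an illustrative sketch (not part of the original file).
 * vmap()/vunmap() (defined later in this file) leave lazily-freed aliases
 * behind; a caller that is about to change attributes of the underlying
 * pages can use vm_unmap_aliases() to make sure no alias mappings or stale
 * TLB entries remain. The "pages"/"nr_pages" handling is hypothetical.
 *
 *	void *alias = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
 *
 *	if (alias) {
 *		// ... use the alias mapping ...
 *		vunmap(alias);		// the unmap is lazy
 *	}
 *	vm_unmap_aliases();		// flush all lazy aliases now
 */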
/**
 * vm_unmap_ram - unmap linear kernel address space set up by vm_map_ram
 * @mem: the pointer returned by vm_map_ram
 * @count: the count passed to that vm_map_ram call (cannot unmap partial)
 */
void vm_unmap_ram(const void *mem, unsigned int count)
{
	unsigned long size = (unsigned long)count << PAGE_SHIFT;
	unsigned long addr = (unsigned long)mem;
	struct vmap_area *va;

	might_sleep();
	BUG_ON(!addr);
	BUG_ON(addr < VMALLOC_START);
	BUG_ON(addr > VMALLOC_END);
	BUG_ON(!PAGE_ALIGNED(addr));

	kasan_poison_vmalloc(mem, size);

	if (likely(count <= VMAP_MAX_ALLOC)) {
		debug_check_no_locks_freed(mem, size);
		vb_free(addr, size);
		return;
	}

	va = find_vmap_area(addr);
	BUG_ON(!va);
	debug_check_no_locks_freed((void *)va->va_start,
				    (va->va_end - va->va_start));
	free_unmap_vmap_area(va);
}
EXPORT_SYMBOL(vm_unmap_ram);

/**
 * vm_map_ram - map pages linearly into kernel virtual address (vmalloc space)
 * @pages: an array of pointers to the pages to be mapped
 * @count: number of pages
 * @node: prefer to allocate data structures on this node
 *
 * If you use this function for less than VMAP_MAX_ALLOC pages, it could be
 * faster than vmap(). But if you mix long-lived and short-lived objects with
 * vm_map_ram(), it could consume lots of address space through fragmentation
 * (especially on a 32bit machine) and you could eventually see failures.
 * Please use this function for short-lived objects.
 *
 * Returns: a pointer to the address that has been mapped, or %NULL on failure
 */
void *vm_map_ram(struct page **pages, unsigned int count, int node)
{
	unsigned long size = (unsigned long)count << PAGE_SHIFT;
	unsigned long addr;
	void *mem;

	if (likely(count <= VMAP_MAX_ALLOC)) {
		mem = vb_alloc(size, GFP_KERNEL);
		if (IS_ERR(mem))
			return NULL;
		addr = (unsigned long)mem;
	} else {
		struct vmap_area *va;
		va = alloc_vmap_area(size, PAGE_SIZE,
				VMALLOC_START, VMALLOC_END, node, GFP_KERNEL);
		if (IS_ERR(va))
			return NULL;

		addr = va->va_start;
		mem = (void *)addr;
	}

	kasan_unpoison_vmalloc(mem, size);

	if (map_kernel_range(addr, size, PAGE_KERNEL, pages) < 0) {
		vm_unmap_ram(mem, count);
		return NULL;
	}
	return mem;
}
EXPORT_SYMBOL(vm_map_ram);
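/*
 * Editor's note: an illustrative sketch (not part of the original file) of
 * the pairing described in the kernel-doc above: the same page count must
 * be passed to vm_unmap_ram() as was used for vm_map_ram(). The "pages"
 * array and "nr_pages" are hypothetical.
 *
 *	void *addr = vm_map_ram(pages, nr_pages, NUMA_NO_NODE);
 *
 *	if (addr) {
 *		// ... short-lived access through the linear mapping ...
 *		vm_unmap_ram(addr, nr_pages);
 *	}
 */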
static struct vm_struct *vmlist __initdata;

/**
 * vm_area_add_early - add vmap area early during boot
 * @vm: vm_struct to add
 *
 * This function is used to add fixed kernel vm area to vmlist before
 * vmalloc_init() is called.  @vm->addr, @vm->size, and @vm->flags
 * should contain proper values and the other fields should be zero.
 *
 * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
 */
void __init vm_area_add_early(struct vm_struct *vm)
{
	struct vm_struct *tmp, **p;

	BUG_ON(vmap_initialized);
	for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
		if (tmp->addr >= vm->addr) {
			BUG_ON(tmp->addr < vm->addr + vm->size);
			break;
		} else
			BUG_ON(tmp->addr + tmp->size > vm->addr);
	}
	vm->next = *p;
	*p = vm;
}

/**
 * vm_area_register_early - register vmap area early during boot
 * @vm: vm_struct to register
 * @align: requested alignment
 *
 * This function is used to register kernel vm area before
 * vmalloc_init() is called.  @vm->size and @vm->flags should contain
 * proper values on entry and other fields should be zero.  On return,
 * vm->addr contains the allocated address.
 *
 * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
 */
void __init vm_area_register_early(struct vm_struct *vm, size_t align)
{
	static size_t vm_init_off __initdata;
	unsigned long addr;

	addr = ALIGN(VMALLOC_START + vm_init_off, align);
	vm_init_off = PFN_ALIGN(addr + vm->size) - VMALLOC_START;

	vm->addr = (void *)addr;

	vm_area_add_early(vm);
}

static void vmap_init_free_space(void)
{
	unsigned long vmap_start = 1;
	const unsigned long vmap_end = ULONG_MAX;
	struct vmap_area *busy, *free;

	/*
	 *     B     F     B     B     B     F
	 * -|-----|.....|-----|-----|-----|.....|-
	 *  |           The KVA space           |
	 *  |<--------------------------------->|
	 */
	list_for_each_entry(busy, &vmap_area_list, list) {
		if (busy->va_start - vmap_start > 0) {
			free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
			if (!WARN_ON_ONCE(!free)) {
				free->va_start = vmap_start;
				free->va_end = busy->va_start;

				insert_vmap_area_augment(free, NULL,
					&free_vmap_area_root,
						&free_vmap_area_list);
			}
		}

		vmap_start = busy->va_end;
	}

	if (vmap_end - vmap_start > 0) {
		free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
		if (!WARN_ON_ONCE(!free)) {
			free->va_start = vmap_start;
			free->va_end = vmap_end;

			insert_vmap_area_augment(free, NULL,
				&free_vmap_area_root,
					&free_vmap_area_list);
		}
	}
}

void __init vmalloc_init(void)
{
	struct vmap_area *va;
	struct vm_struct *tmp;
	int i;

	/*
	 * Create the cache for vmap_area objects.
	 */
	vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC);

	for_each_possible_cpu(i) {
		struct vmap_block_queue *vbq;
		struct vfree_deferred *p;

		vbq = &per_cpu(vmap_block_queue, i);
		spin_lock_init(&vbq->lock);
		INIT_LIST_HEAD(&vbq->free);
		p = &per_cpu(vfree_deferred, i);
		init_llist_head(&p->list);
		INIT_WORK(&p->wq, free_work);
	}

	/* Import existing vmlist entries. */
	for (tmp = vmlist; tmp; tmp = tmp->next) {
		va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
		if (WARN_ON_ONCE(!va))
			continue;

		va->va_start = (unsigned long)tmp->addr;
		va->va_end = va->va_start + tmp->size;
		va->vm = tmp;
		insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
	}

	/*
	 * Now we can initialize a free vmap space.
	 */
	vmap_init_free_space();
	vmap_initialized = true;
}
2017 */ 2018 void unmap_kernel_range(unsigned long addr, unsigned long size) 2019 { 2020 unsigned long end = addr + size; 2021 2022 flush_cache_vunmap(addr, end); 2023 unmap_kernel_range_noflush(addr, size); 2024 flush_tlb_kernel_range(addr, end); 2025 } 2026 2027 static inline void setup_vmalloc_vm_locked(struct vm_struct *vm, 2028 struct vmap_area *va, unsigned long flags, const void *caller) 2029 { 2030 vm->flags = flags; 2031 vm->addr = (void *)va->va_start; 2032 vm->size = va->va_end - va->va_start; 2033 vm->caller = caller; 2034 va->vm = vm; 2035 } 2036 2037 static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, 2038 unsigned long flags, const void *caller) 2039 { 2040 spin_lock(&vmap_area_lock); 2041 setup_vmalloc_vm_locked(vm, va, flags, caller); 2042 spin_unlock(&vmap_area_lock); 2043 } 2044 2045 static void clear_vm_uninitialized_flag(struct vm_struct *vm) 2046 { 2047 /* 2048 * Before removing VM_UNINITIALIZED, 2049 * we should make sure that vm has proper values. 2050 * Pair with smp_rmb() in show_numa_info(). 2051 */ 2052 smp_wmb(); 2053 vm->flags &= ~VM_UNINITIALIZED; 2054 } 2055 2056 static struct vm_struct *__get_vm_area_node(unsigned long size, 2057 unsigned long align, unsigned long flags, unsigned long start, 2058 unsigned long end, int node, gfp_t gfp_mask, const void *caller) 2059 { 2060 struct vmap_area *va; 2061 struct vm_struct *area; 2062 unsigned long requested_size = size; 2063 2064 BUG_ON(in_interrupt()); 2065 size = PAGE_ALIGN(size); 2066 if (unlikely(!size)) 2067 return NULL; 2068 2069 if (flags & VM_IOREMAP) 2070 align = 1ul << clamp_t(int, get_count_order_long(size), 2071 PAGE_SHIFT, IOREMAP_MAX_ORDER); 2072 2073 area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node); 2074 if (unlikely(!area)) 2075 return NULL; 2076 2077 if (!(flags & VM_NO_GUARD)) 2078 size += PAGE_SIZE; 2079 2080 va = alloc_vmap_area(size, align, start, end, node, gfp_mask); 2081 if (IS_ERR(va)) { 2082 kfree(area); 2083 return NULL; 2084 } 2085 2086 kasan_unpoison_vmalloc((void *)va->va_start, requested_size); 2087 2088 setup_vmalloc_vm(area, va, flags, caller); 2089 2090 return area; 2091 } 2092 2093 struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, 2094 unsigned long start, unsigned long end, 2095 const void *caller) 2096 { 2097 return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE, 2098 GFP_KERNEL, caller); 2099 } 2100 2101 /** 2102 * get_vm_area - reserve a contiguous kernel virtual area 2103 * @size: size of the area 2104 * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC 2105 * 2106 * Search an area of @size in the kernel virtual mapping area, 2107 * and reserve it for our purposes. Returns the area descriptor 2108 * on success or %NULL on failure. 2109 * 2110 * Return: the area descriptor on success or %NULL on failure. 2111 */ 2112 struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) 2113 { 2114 return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, 2115 NUMA_NO_NODE, GFP_KERNEL, 2116 __builtin_return_address(0)); 2117 } 2118 2119 struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, 2120 const void *caller) 2121 { 2122 return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, 2123 NUMA_NO_NODE, GFP_KERNEL, caller); 2124 } 2125 2126 /** 2127 * find_vm_area - find a contiguous kernel virtual area 2128 * @addr: base address 2129 * 2130 * Search for the kernel VM area starting at @addr, and return it.
2131 * It is up to the caller to do all required locking to keep the returned 2132 * pointer valid. 2133 * 2134 * Return: pointer to the found area or %NULL on failure 2135 */ 2136 struct vm_struct *find_vm_area(const void *addr) 2137 { 2138 struct vmap_area *va; 2139 2140 va = find_vmap_area((unsigned long)addr); 2141 if (!va) 2142 return NULL; 2143 2144 return va->vm; 2145 } 2146 2147 /** 2148 * remove_vm_area - find and remove a contiguous kernel virtual area 2149 * @addr: base address 2150 * 2151 * Search for the kernel VM area starting at @addr, and remove it. 2152 * This function returns the found VM area, but using it is NOT safe 2153 * on SMP machines, except for its size or flags. 2154 * 2155 * Return: pointer to the found area or %NULL on failure 2156 */ 2157 struct vm_struct *remove_vm_area(const void *addr) 2158 { 2159 struct vmap_area *va; 2160 2161 might_sleep(); 2162 2163 spin_lock(&vmap_area_lock); 2164 va = __find_vmap_area((unsigned long)addr); 2165 if (va && va->vm) { 2166 struct vm_struct *vm = va->vm; 2167 2168 va->vm = NULL; 2169 spin_unlock(&vmap_area_lock); 2170 2171 kasan_free_shadow(vm); 2172 free_unmap_vmap_area(va); 2173 2174 return vm; 2175 } 2176 2177 spin_unlock(&vmap_area_lock); 2178 return NULL; 2179 } 2180 2181 static inline void set_area_direct_map(const struct vm_struct *area, 2182 int (*set_direct_map)(struct page *page)) 2183 { 2184 int i; 2185 2186 for (i = 0; i < area->nr_pages; i++) 2187 if (page_address(area->pages[i])) 2188 set_direct_map(area->pages[i]); 2189 } 2190 2191 /* Handle removing and resetting vm mappings related to the vm_struct. */ 2192 static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages) 2193 { 2194 unsigned long start = ULONG_MAX, end = 0; 2195 int flush_reset = area->flags & VM_FLUSH_RESET_PERMS; 2196 int flush_dmap = 0; 2197 int i; 2198 2199 remove_vm_area(area->addr); 2200 2201 /* If this is not VM_FLUSH_RESET_PERMS memory, no need for the below. */ 2202 if (!flush_reset) 2203 return; 2204 2205 /* 2206 * If not deallocating pages, just do the flush of the VM area and 2207 * return. 2208 */ 2209 if (!deallocate_pages) { 2210 vm_unmap_aliases(); 2211 return; 2212 } 2213 2214 /* 2215 * If execution gets here, flush the vm mapping and reset the direct 2216 * map. Find the start and end range of the direct mappings to make sure 2217 * the vm_unmap_aliases() flush includes the direct map. 2218 */ 2219 for (i = 0; i < area->nr_pages; i++) { 2220 unsigned long addr = (unsigned long)page_address(area->pages[i]); 2221 if (addr) { 2222 start = min(addr, start); 2223 end = max(addr + PAGE_SIZE, end); 2224 flush_dmap = 1; 2225 } 2226 } 2227 2228 /* 2229 * Set direct map to something invalid so that it won't be cached if 2230 * there are any accesses after the TLB flush, then flush the TLB and 2231 * reset the direct map permissions to the default.
*/ 2233 set_area_direct_map(area, set_direct_map_invalid_noflush); 2234 _vm_unmap_aliases(start, end, flush_dmap); 2235 set_area_direct_map(area, set_direct_map_default_noflush); 2236 } 2237 2238 static void __vunmap(const void *addr, int deallocate_pages) 2239 { 2240 struct vm_struct *area; 2241 2242 if (!addr) 2243 return; 2244 2245 if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n", 2246 addr)) 2247 return; 2248 2249 area = find_vm_area(addr); 2250 if (unlikely(!area)) { 2251 WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", 2252 addr); 2253 return; 2254 } 2255 2256 debug_check_no_locks_freed(area->addr, get_vm_area_size(area)); 2257 debug_check_no_obj_freed(area->addr, get_vm_area_size(area)); 2258 2259 kasan_poison_vmalloc(area->addr, area->size); 2260 2261 vm_remove_mappings(area, deallocate_pages); 2262 2263 if (deallocate_pages) { 2264 int i; 2265 2266 for (i = 0; i < area->nr_pages; i++) { 2267 struct page *page = area->pages[i]; 2268 2269 BUG_ON(!page); 2270 __free_pages(page, 0); 2271 } 2272 atomic_long_sub(area->nr_pages, &nr_vmalloc_pages); 2273 2274 kvfree(area->pages); 2275 } 2276 2277 kfree(area); 2278 return; 2279 } 2280 2281 static inline void __vfree_deferred(const void *addr) 2282 { 2283 /* 2284 * Use raw_cpu_ptr() because this can be called from preemptible 2285 * context. Preemption is absolutely fine here, because the llist_add() 2286 * implementation is lockless, so it works even if we are adding to 2287 * another cpu's list. schedule_work() should be fine with this too. 2288 */ 2289 struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred); 2290 2291 if (llist_add((struct llist_node *)addr, &p->list)) 2292 schedule_work(&p->wq); 2293 } 2294 2295 /** 2296 * vfree_atomic - release memory allocated by vmalloc() 2297 * @addr: memory base address 2298 * 2299 * This one is just like vfree() but can be called in any atomic context 2300 * except NMIs. 2301 */ 2302 void vfree_atomic(const void *addr) 2303 { 2304 BUG_ON(in_nmi()); 2305 2306 kmemleak_free(addr); 2307 2308 if (!addr) 2309 return; 2310 __vfree_deferred(addr); 2311 } 2312 2313 static void __vfree(const void *addr) 2314 { 2315 if (unlikely(in_interrupt())) 2316 __vfree_deferred(addr); 2317 else 2318 __vunmap(addr, 1); 2319 } 2320 2321 /** 2322 * vfree - release memory allocated by vmalloc() 2323 * @addr: memory base address 2324 * 2325 * Free the virtually contiguous memory area starting at @addr, as 2326 * obtained from vmalloc(), vmalloc_32() or __vmalloc(). If @addr is 2327 * NULL, no operation is performed. 2328 * 2329 * Must not be called in NMI context (strictly speaking, only if we don't 2330 * have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling 2331 * conventions for vfree() arch-dependent would be a really bad idea) 2332 * 2333 * May sleep if called *not* from interrupt context. 2334 * 2335 * NOTE: assumes that the object at @addr has a size >= sizeof(llist_node) 2336 */ 2337 void vfree(const void *addr) 2338 { 2339 BUG_ON(in_nmi()); 2340 2341 kmemleak_free(addr); 2342 2343 might_sleep_if(!in_interrupt()); 2344 2345 if (!addr) 2346 return; 2347 2348 __vfree(addr); 2349 } 2350 EXPORT_SYMBOL(vfree); 2351 2352 /** 2353 * vunmap - release virtual mapping obtained by vmap() 2354 * @addr: memory base address 2355 * 2356 * Free the virtually contiguous memory area starting at @addr, 2357 * which was created from the page array passed to vmap(). 2358 * 2359 * Must not be called in interrupt context.
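 *
 * A minimal sketch of the vmap()/vunmap() pairing (the single page and the
 * memset() are illustrative only):
 *
 *	struct page *page = alloc_page(GFP_KERNEL);
 *	void *va;
 *
 *	if (page) {
 *		va = vmap(&page, 1, VM_MAP, PAGE_KERNEL);
 *		if (va) {
 *			memset(va, 0, PAGE_SIZE);
 *			vunmap(va);		/* drops the mapping only */
 *		}
 *		__free_page(page);	/* the page itself is still ours */
 *	}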
2360 */ 2361 void vunmap(const void *addr) 2362 { 2363 BUG_ON(in_interrupt()); 2364 might_sleep(); 2365 if (addr) 2366 __vunmap(addr, 0); 2367 } 2368 EXPORT_SYMBOL(vunmap); 2369 2370 /** 2371 * vmap - map an array of pages into virtually contiguous space 2372 * @pages: array of page pointers 2373 * @count: number of pages to map 2374 * @flags: vm_area->flags 2375 * @prot: page protection for the mapping 2376 * 2377 * Maps @count pages from @pages into contiguous kernel virtual 2378 * space. 2379 * 2380 * Return: the address of the area or %NULL on failure 2381 */ 2382 void *vmap(struct page **pages, unsigned int count, 2383 unsigned long flags, pgprot_t prot) 2384 { 2385 struct vm_struct *area; 2386 unsigned long size; /* In bytes */ 2387 2388 might_sleep(); 2389 2390 if (count > totalram_pages()) 2391 return NULL; 2392 2393 size = (unsigned long)count << PAGE_SHIFT; 2394 area = get_vm_area_caller(size, flags, __builtin_return_address(0)); 2395 if (!area) 2396 return NULL; 2397 2398 if (map_kernel_range((unsigned long)area->addr, size, pgprot_nx(prot), 2399 pages) < 0) { 2400 vunmap(area->addr); 2401 return NULL; 2402 } 2403 2404 return area->addr; 2405 } 2406 EXPORT_SYMBOL(vmap); 2407 2408 static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, 2409 pgprot_t prot, int node) 2410 { 2411 struct page **pages; 2412 unsigned int nr_pages, array_size, i; 2413 const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; 2414 const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN; 2415 const gfp_t highmem_mask = (gfp_mask & (GFP_DMA | GFP_DMA32)) ? 2416 0 : 2417 __GFP_HIGHMEM; 2418 2419 nr_pages = get_vm_area_size(area) >> PAGE_SHIFT; 2420 array_size = (nr_pages * sizeof(struct page *)); 2421 2422 /* Please note that the recursion is strictly bounded. */ 2423 if (array_size > PAGE_SIZE) { 2424 pages = __vmalloc_node(array_size, 1, nested_gfp|highmem_mask, 2425 node, area->caller); 2426 } else { 2427 pages = kmalloc_node(array_size, nested_gfp, node); 2428 } 2429 2430 if (!pages) { 2431 remove_vm_area(area->addr); 2432 kfree(area); 2433 return NULL; 2434 } 2435 2436 area->pages = pages; 2437 area->nr_pages = nr_pages; 2438 2439 for (i = 0; i < area->nr_pages; i++) { 2440 struct page *page; 2441 2442 if (node == NUMA_NO_NODE) 2443 page = alloc_page(alloc_mask|highmem_mask); 2444 else 2445 page = alloc_pages_node(node, alloc_mask|highmem_mask, 0); 2446 2447 if (unlikely(!page)) { 2448 /* Successfully allocated i pages, free them in __vunmap() */ 2449 area->nr_pages = i; 2450 atomic_long_add(area->nr_pages, &nr_vmalloc_pages); 2451 goto fail; 2452 } 2453 area->pages[i] = page; 2454 if (gfpflags_allow_blocking(gfp_mask)) 2455 cond_resched(); 2456 } 2457 atomic_long_add(area->nr_pages, &nr_vmalloc_pages); 2458 2459 if (map_kernel_range((unsigned long)area->addr, get_vm_area_size(area), 2460 prot, pages) < 0) 2461 goto fail; 2462 2463 return area->addr; 2464 2465 fail: 2466 warn_alloc(gfp_mask, NULL, 2467 "vmalloc: allocation failure, allocated %ld of %ld bytes", 2468 (area->nr_pages*PAGE_SIZE), area->size); 2469 __vfree(area->addr); 2470 return NULL; 2471 } 2472 2473 /** 2474 * __vmalloc_node_range - allocate virtually contiguous memory 2475 * @size: allocation size 2476 * @align: desired alignment 2477 * @start: vm area range start 2478 * @end: vm area range end 2479 * @gfp_mask: flags for the page level allocator 2480 * @prot: protection mask for the allocated pages 2481 * @vm_flags: additional vm area flags (e.g. 
%VM_NO_GUARD) 2482 * @node: node to use for allocation or NUMA_NO_NODE 2483 * @caller: caller's return address 2484 * 2485 * Allocate enough pages to cover @size from the page level 2486 * allocator with @gfp_mask flags. Map them into contiguous 2487 * kernel virtual space, using a pagetable protection of @prot. 2488 * 2489 * Return: the address of the area or %NULL on failure 2490 */ 2491 void *__vmalloc_node_range(unsigned long size, unsigned long align, 2492 unsigned long start, unsigned long end, gfp_t gfp_mask, 2493 pgprot_t prot, unsigned long vm_flags, int node, 2494 const void *caller) 2495 { 2496 struct vm_struct *area; 2497 void *addr; 2498 unsigned long real_size = size; 2499 2500 size = PAGE_ALIGN(size); 2501 if (!size || (size >> PAGE_SHIFT) > totalram_pages()) 2502 goto fail; 2503 2504 area = __get_vm_area_node(real_size, align, VM_ALLOC | VM_UNINITIALIZED | 2505 vm_flags, start, end, node, gfp_mask, caller); 2506 if (!area) 2507 goto fail; 2508 2509 addr = __vmalloc_area_node(area, gfp_mask, prot, node); 2510 if (!addr) 2511 return NULL; 2512 2513 /* 2514 * In this function, newly allocated vm_struct has VM_UNINITIALIZED 2515 * flag. It means that vm_struct is not fully initialized. 2516 * Now, it is fully initialized, so remove this flag here. 2517 */ 2518 clear_vm_uninitialized_flag(area); 2519 2520 kmemleak_vmalloc(area, size, gfp_mask); 2521 2522 return addr; 2523 2524 fail: 2525 warn_alloc(gfp_mask, NULL, 2526 "vmalloc: allocation failure: %lu bytes", real_size); 2527 return NULL; 2528 } 2529 2530 /** 2531 * __vmalloc_node - allocate virtually contiguous memory 2532 * @size: allocation size 2533 * @align: desired alignment 2534 * @gfp_mask: flags for the page level allocator 2535 * @node: node to use for allocation or NUMA_NO_NODE 2536 * @caller: caller's return address 2537 * 2538 * Allocate enough pages to cover @size from the page level allocator with 2539 * @gfp_mask flags. Map them into contiguous kernel virtual space. 2540 * 2541 * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL 2542 * and __GFP_NOFAIL are not supported 2543 * 2544 * Any use of gfp flags outside of GFP_KERNEL should be consulted 2545 * with mm people. 2546 * 2547 * Return: pointer to the allocated memory or %NULL on error 2548 */ 2549 void *__vmalloc_node(unsigned long size, unsigned long align, 2550 gfp_t gfp_mask, int node, const void *caller) 2551 { 2552 return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, 2553 gfp_mask, PAGE_KERNEL, 0, node, caller); 2554 } 2555 /* 2556 * This is only for performance analysis of vmalloc and stress purpose. 2557 * It is required by vmalloc test module, therefore do not use it other 2558 * than that. 2559 */ 2560 #ifdef CONFIG_TEST_VMALLOC_MODULE 2561 EXPORT_SYMBOL_GPL(__vmalloc_node); 2562 #endif 2563 2564 void *__vmalloc(unsigned long size, gfp_t gfp_mask) 2565 { 2566 return __vmalloc_node(size, 1, gfp_mask, NUMA_NO_NODE, 2567 __builtin_return_address(0)); 2568 } 2569 EXPORT_SYMBOL(__vmalloc); 2570 2571 /** 2572 * vmalloc - allocate virtually contiguous memory 2573 * @size: allocation size 2574 * 2575 * Allocate enough pages to cover @size from the page level 2576 * allocator and map them into contiguous kernel virtual space. 2577 * 2578 * For tight control over page level allocator and protection flags 2579 * use __vmalloc() instead. 
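 *
 * A minimal usage sketch (the 16-page size and the zeroing are arbitrary):
 *
 *	void *buf = vmalloc(16 * PAGE_SIZE);
 *
 *	if (buf) {
 *		memset(buf, 0, 16 * PAGE_SIZE);
 *		vfree(buf);
 *	}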
2580 * 2581 * Return: pointer to the allocated memory or %NULL on error 2582 */ 2583 void *vmalloc(unsigned long size) 2584 { 2585 return __vmalloc_node(size, 1, GFP_KERNEL, NUMA_NO_NODE, 2586 __builtin_return_address(0)); 2587 } 2588 EXPORT_SYMBOL(vmalloc); 2589 2590 /** 2591 * vzalloc - allocate virtually contiguous memory with zero fill 2592 * @size: allocation size 2593 * 2594 * Allocate enough pages to cover @size from the page level 2595 * allocator and map them into contiguous kernel virtual space. 2596 * The memory allocated is set to zero. 2597 * 2598 * For tight control over page level allocator and protection flags 2599 * use __vmalloc() instead. 2600 * 2601 * Return: pointer to the allocated memory or %NULL on error 2602 */ 2603 void *vzalloc(unsigned long size) 2604 { 2605 return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE, 2606 __builtin_return_address(0)); 2607 } 2608 EXPORT_SYMBOL(vzalloc); 2609 2610 /** 2611 * vmalloc_user - allocate zeroed virtually contiguous memory for userspace 2612 * @size: allocation size 2613 * 2614 * The resulting memory area is zeroed so it can be mapped to userspace 2615 * without leaking data. 2616 * 2617 * Return: pointer to the allocated memory or %NULL on error 2618 */ 2619 void *vmalloc_user(unsigned long size) 2620 { 2621 return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, 2622 GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL, 2623 VM_USERMAP, NUMA_NO_NODE, 2624 __builtin_return_address(0)); 2625 } 2626 EXPORT_SYMBOL(vmalloc_user); 2627 2628 /** 2629 * vmalloc_node - allocate memory on a specific node 2630 * @size: allocation size 2631 * @node: numa node 2632 * 2633 * Allocate enough pages to cover @size from the page level 2634 * allocator and map them into contiguous kernel virtual space. 2635 * 2636 * For tight control over page level allocator and protection flags 2637 * use __vmalloc() instead. 2638 * 2639 * Return: pointer to the allocated memory or %NULL on error 2640 */ 2641 void *vmalloc_node(unsigned long size, int node) 2642 { 2643 return __vmalloc_node(size, 1, GFP_KERNEL, node, 2644 __builtin_return_address(0)); 2645 } 2646 EXPORT_SYMBOL(vmalloc_node); 2647 2648 /** 2649 * vzalloc_node - allocate memory on a specific node with zero fill 2650 * @size: allocation size 2651 * @node: numa node 2652 * 2653 * Allocate enough pages to cover @size from the page level 2654 * allocator and map them into contiguous kernel virtual space. 2655 * The memory allocated is set to zero. 2656 * 2657 * Return: pointer to the allocated memory or %NULL on error 2658 */ 2659 void *vzalloc_node(unsigned long size, int node) 2660 { 2661 return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, node, 2662 __builtin_return_address(0)); 2663 } 2664 EXPORT_SYMBOL(vzalloc_node); 2665 2666 #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) 2667 #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL) 2668 #elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA) 2669 #define GFP_VMALLOC32 (GFP_DMA | GFP_KERNEL) 2670 #else 2671 /* 2672 * 64b systems should always have either DMA or DMA32 zones. For others 2673 * GFP_DMA32 should do the right thing and use the normal zone. 2674 */ 2675 #define GFP_VMALLOC32 GFP_DMA32 | GFP_KERNEL 2676 #endif 2677 2678 /** 2679 * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) 2680 * @size: allocation size 2681 * 2682 * Allocate enough 32bit PA addressable pages to cover @size from the 2683 * page level allocator and map them into contiguous kernel virtual space. 
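 *
 * A minimal usage sketch (the size is arbitrary); every backing page is
 * addressable with a 32-bit physical address:
 *
 *	void *buf = vmalloc_32(64 * 1024);
 *
 *	if (buf) {
 *		/* ... populate buf ... */
 *		vfree(buf);
 *	}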
2684 * 2685 * Return: pointer to the allocated memory or %NULL on error 2686 */ 2687 void *vmalloc_32(unsigned long size) 2688 { 2689 return __vmalloc_node(size, 1, GFP_VMALLOC32, NUMA_NO_NODE, 2690 __builtin_return_address(0)); 2691 } 2692 EXPORT_SYMBOL(vmalloc_32); 2693 2694 /** 2695 * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory 2696 * @size: allocation size 2697 * 2698 * The resulting memory area is 32bit addressable and zeroed so it can be 2699 * mapped to userspace without leaking data. 2700 * 2701 * Return: pointer to the allocated memory or %NULL on error 2702 */ 2703 void *vmalloc_32_user(unsigned long size) 2704 { 2705 return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, 2706 GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, 2707 VM_USERMAP, NUMA_NO_NODE, 2708 __builtin_return_address(0)); 2709 } 2710 EXPORT_SYMBOL(vmalloc_32_user); 2711 2712 /* 2713 * Small helper routine: copy contents from addr to buf. 2714 * If a page is not present, fill with zeroes. 2715 */ 2716 2717 static int aligned_vread(char *buf, char *addr, unsigned long count) 2718 { 2719 struct page *p; 2720 int copied = 0; 2721 2722 while (count) { 2723 unsigned long offset, length; 2724 2725 offset = offset_in_page(addr); 2726 length = PAGE_SIZE - offset; 2727 if (length > count) 2728 length = count; 2729 p = vmalloc_to_page(addr); 2730 /* 2731 * To do safe access to this _mapped_ area, we need a 2732 * lock. But adding a lock here means that we need to add the 2733 * overhead of vmalloc()/vfree() calls to this rarely used 2734 * _debug_ interface. Instead of that, we'll use kmap_atomic() 2735 * and accept a small overhead in this access function. 2736 */ 2737 if (p) { 2738 /* 2739 * we can expect USER0 is not used (see vread/vwrite's 2740 * function description) 2741 */ 2742 void *map = kmap_atomic(p); 2743 memcpy(buf, map + offset, length); 2744 kunmap_atomic(map); 2745 } else 2746 memset(buf, 0, length); 2747 2748 addr += length; 2749 buf += length; 2750 copied += length; 2751 count -= length; 2752 } 2753 return copied; 2754 } 2755 2756 static int aligned_vwrite(char *buf, char *addr, unsigned long count) 2757 { 2758 struct page *p; 2759 int copied = 0; 2760 2761 while (count) { 2762 unsigned long offset, length; 2763 2764 offset = offset_in_page(addr); 2765 length = PAGE_SIZE - offset; 2766 if (length > count) 2767 length = count; 2768 p = vmalloc_to_page(addr); 2769 /* 2770 * To do safe access to this _mapped_ area, we need a 2771 * lock. But adding a lock here means that we need to add the 2772 * overhead of vmalloc()/vfree() calls to this rarely used 2773 * _debug_ interface. Instead of that, we'll use kmap_atomic() 2774 * and accept a small overhead in this access function. 2775 */ 2776 if (p) { 2777 /* 2778 * we can expect USER0 is not used (see vread/vwrite's 2779 * function description) 2780 */ 2781 void *map = kmap_atomic(p); 2782 memcpy(map + offset, buf, length); 2783 kunmap_atomic(map); 2784 } 2785 addr += length; 2786 buf += length; 2787 copied += length; 2788 count -= length; 2789 } 2790 return copied; 2791 } 2792 2793 /** 2794 * vread() - read vmalloc area in a safe way. 2795 * @buf: buffer for reading data 2796 * @addr: vm address. 2797 * @count: number of bytes to be read. 2798 * 2799 * This function checks that addr is a valid vmalloc'ed area, and 2800 * copies data from that area to the given buffer. If the given memory range 2801 * of [addr...addr+count) includes some valid address, data is copied to 2802 * the proper area of @buf. If there are memory holes, they'll be zero-filled. 2803 * IOREMAP areas are treated as memory holes and no copy is done.
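 *
 * A minimal usage sketch (the vmalloc'ed source area and the small stack
 * buffer are illustrative only):
 *
 *	char *src = vmalloc(PAGE_SIZE);
 *	char out[64];
 *	long n;
 *
 *	if (src) {
 *		n = vread(out, src, sizeof(out));
 *		/* n == sizeof(out); any holes arrive zero-filled in out[] */
 *		vfree(src);
 *	}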
2804 * 2805 * If [addr...addr+count) doesn't include any intersection with a live 2806 * vm_struct area, 0 is returned. @buf should be a kernel buffer. 2807 * 2808 * Note: In usual ops, vread() is never necessary because the caller 2809 * should know the vmalloc() area is valid and can use memcpy(). 2810 * This is for routines which have to access the vmalloc area without 2811 * any information, such as /dev/kmem. 2812 * 2813 * Return: number of bytes for which addr and buf should be increased 2814 * (same number as @count) or %0 if [addr...addr+count) doesn't 2815 * include any intersection with valid vmalloc area 2816 */ 2817 long vread(char *buf, char *addr, unsigned long count) 2818 { 2819 struct vmap_area *va; 2820 struct vm_struct *vm; 2821 char *vaddr, *buf_start = buf; 2822 unsigned long buflen = count; 2823 unsigned long n; 2824 2825 /* Don't allow overflow */ 2826 if ((unsigned long) addr + count < count) 2827 count = -(unsigned long) addr; 2828 2829 spin_lock(&vmap_area_lock); 2830 list_for_each_entry(va, &vmap_area_list, list) { 2831 if (!count) 2832 break; 2833 2834 if (!va->vm) 2835 continue; 2836 2837 vm = va->vm; 2838 vaddr = (char *) vm->addr; 2839 if (addr >= vaddr + get_vm_area_size(vm)) 2840 continue; 2841 while (addr < vaddr) { 2842 if (count == 0) 2843 goto finished; 2844 *buf = '\0'; 2845 buf++; 2846 addr++; 2847 count--; 2848 } 2849 n = vaddr + get_vm_area_size(vm) - addr; 2850 if (n > count) 2851 n = count; 2852 if (!(vm->flags & VM_IOREMAP)) 2853 aligned_vread(buf, addr, n); 2854 else /* IOREMAP area is treated as memory hole */ 2855 memset(buf, 0, n); 2856 buf += n; 2857 addr += n; 2858 count -= n; 2859 } 2860 finished: 2861 spin_unlock(&vmap_area_lock); 2862 2863 if (buf == buf_start) 2864 return 0; 2865 /* zero-fill memory holes */ 2866 if (buf != buf_start + buflen) 2867 memset(buf, 0, buflen - (buf - buf_start)); 2868 2869 return buflen; 2870 } 2871 2872 /** 2873 * vwrite() - write vmalloc area in a safe way. 2874 * @buf: buffer for source data 2875 * @addr: vm address. 2876 * @count: number of bytes to be written. 2877 * 2878 * This function checks that addr is a valid vmalloc'ed area, and 2879 * copies data from the buffer to the given addr. If the specified range of 2880 * [addr...addr+count) includes some valid address, data is copied from 2881 * the proper area of @buf. If there are memory holes, nothing is copied to them. 2882 * IOREMAP areas are treated as memory holes and no copy is done. 2883 * 2884 * If [addr...addr+count) doesn't include any intersection with a live 2885 * vm_struct area, 0 is returned. @buf should be a kernel buffer. 2886 * 2887 * Note: In usual ops, vwrite() is never necessary because the caller 2888 * should know the vmalloc() area is valid and can use memcpy(). 2889 * This is for routines which have to access the vmalloc area without 2890 * any information, such as /dev/kmem.
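 *
 * A minimal usage sketch (the destination area and the source bytes are
 * illustrative only):
 *
 *	char *dst = vzalloc(PAGE_SIZE);
 *	char src[8] = "example";
 *
 *	if (dst) {
 *		vwrite(src, dst, sizeof(src));	/* copies 8 bytes into dst */
 *		vfree(dst);
 *	}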
2891 * 2892 * Return: number of bytes for which addr and buf should be 2893 * increased (same number as @count) or %0 if [addr...addr+count) 2894 * doesn't include any intersection with valid vmalloc area 2895 */ 2896 long vwrite(char *buf, char *addr, unsigned long count) 2897 { 2898 struct vmap_area *va; 2899 struct vm_struct *vm; 2900 char *vaddr; 2901 unsigned long n, buflen; 2902 int copied = 0; 2903 2904 /* Don't allow overflow */ 2905 if ((unsigned long) addr + count < count) 2906 count = -(unsigned long) addr; 2907 buflen = count; 2908 2909 spin_lock(&vmap_area_lock); 2910 list_for_each_entry(va, &vmap_area_list, list) { 2911 if (!count) 2912 break; 2913 2914 if (!va->vm) 2915 continue; 2916 2917 vm = va->vm; 2918 vaddr = (char *) vm->addr; 2919 if (addr >= vaddr + get_vm_area_size(vm)) 2920 continue; 2921 while (addr < vaddr) { 2922 if (count == 0) 2923 goto finished; 2924 buf++; 2925 addr++; 2926 count--; 2927 } 2928 n = vaddr + get_vm_area_size(vm) - addr; 2929 if (n > count) 2930 n = count; 2931 if (!(vm->flags & VM_IOREMAP)) { 2932 aligned_vwrite(buf, addr, n); 2933 copied++; 2934 } 2935 buf += n; 2936 addr += n; 2937 count -= n; 2938 } 2939 finished: 2940 spin_unlock(&vmap_area_lock); 2941 if (!copied) 2942 return 0; 2943 return buflen; 2944 } 2945 2946 /** 2947 * remap_vmalloc_range_partial - map vmalloc pages to userspace 2948 * @vma: vma to cover 2949 * @uaddr: target user address to start at 2950 * @kaddr: virtual address of vmalloc kernel memory 2951 * @pgoff: offset from @kaddr to start at 2952 * @size: size of map area 2953 * 2954 * Returns: 0 for success, -Exxx on failure 2955 * 2956 * This function checks that @kaddr is a valid vmalloc'ed area, 2957 * and that it is big enough to cover the range starting at 2958 * @uaddr in @vma. Will return failure if that criteria isn't 2959 * met. 2960 * 2961 * Similar to remap_pfn_range() (see mm/memory.c) 2962 */ 2963 int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr, 2964 void *kaddr, unsigned long pgoff, 2965 unsigned long size) 2966 { 2967 struct vm_struct *area; 2968 unsigned long off; 2969 unsigned long end_index; 2970 2971 if (check_shl_overflow(pgoff, PAGE_SHIFT, &off)) 2972 return -EINVAL; 2973 2974 size = PAGE_ALIGN(size); 2975 2976 if (!PAGE_ALIGNED(uaddr) || !PAGE_ALIGNED(kaddr)) 2977 return -EINVAL; 2978 2979 area = find_vm_area(kaddr); 2980 if (!area) 2981 return -EINVAL; 2982 2983 if (!(area->flags & (VM_USERMAP | VM_DMA_COHERENT))) 2984 return -EINVAL; 2985 2986 if (check_add_overflow(size, off, &end_index) || 2987 end_index > get_vm_area_size(area)) 2988 return -EINVAL; 2989 kaddr += off; 2990 2991 do { 2992 struct page *page = vmalloc_to_page(kaddr); 2993 int ret; 2994 2995 ret = vm_insert_page(vma, uaddr, page); 2996 if (ret) 2997 return ret; 2998 2999 uaddr += PAGE_SIZE; 3000 kaddr += PAGE_SIZE; 3001 size -= PAGE_SIZE; 3002 } while (size > 0); 3003 3004 vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; 3005 3006 return 0; 3007 } 3008 EXPORT_SYMBOL(remap_vmalloc_range_partial); 3009 3010 /** 3011 * remap_vmalloc_range - map vmalloc pages to userspace 3012 * @vma: vma to cover (map full range of vma) 3013 * @addr: vmalloc memory 3014 * @pgoff: number of pages into addr before first page to map 3015 * 3016 * Returns: 0 for success, -Exxx on failure 3017 * 3018 * This function checks that addr is a valid vmalloc'ed area, and 3019 * that it is big enough to cover the vma. Will return failure if 3020 * that criteria isn't met. 
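 *
 * A minimal mmap handler sketch (struct my_dev, its vmalloc_user()-backed
 * ->buf member and the handler name are hypothetical):
 *
 *	static int my_dev_mmap(struct file *file, struct vm_area_struct *vma)
 *	{
 *		struct my_dev *dev = file->private_data;
 *
 *		/* dev->buf was allocated with vmalloc_user() (VM_USERMAP) */
 *		return remap_vmalloc_range(vma, dev->buf, 0);
 *	}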
3021 * 3022 * Similar to remap_pfn_range() (see mm/memory.c) 3023 */ 3024 int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, 3025 unsigned long pgoff) 3026 { 3027 return remap_vmalloc_range_partial(vma, vma->vm_start, 3028 addr, pgoff, 3029 vma->vm_end - vma->vm_start); 3030 } 3031 EXPORT_SYMBOL(remap_vmalloc_range); 3032 3033 static int f(pte_t *pte, unsigned long addr, void *data) 3034 { 3035 pte_t ***p = data; 3036 3037 if (p) { 3038 *(*p) = pte; 3039 (*p)++; 3040 } 3041 return 0; 3042 } 3043 3044 /** 3045 * alloc_vm_area - allocate a range of kernel address space 3046 * @size: size of the area 3047 * @ptes: returns the PTEs for the address space 3048 * 3049 * Returns: NULL on failure, vm_struct on success 3050 * 3051 * This function reserves a range of kernel address space, and 3052 * allocates pagetables to map that range. No actual mappings 3053 * are created. 3054 * 3055 * If @ptes is non-NULL, pointers to the PTEs (in init_mm) 3056 * allocated for the VM area are returned. 3057 */ 3058 struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes) 3059 { 3060 struct vm_struct *area; 3061 3062 area = get_vm_area_caller(size, VM_IOREMAP, 3063 __builtin_return_address(0)); 3064 if (area == NULL) 3065 return NULL; 3066 3067 /* 3068 * This ensures that page tables are constructed for this region 3069 * of kernel virtual address space and mapped into init_mm. 3070 */ 3071 if (apply_to_page_range(&init_mm, (unsigned long)area->addr, 3072 size, f, ptes ? &ptes : NULL)) { 3073 free_vm_area(area); 3074 return NULL; 3075 } 3076 3077 return area; 3078 } 3079 EXPORT_SYMBOL_GPL(alloc_vm_area); 3080 3081 void free_vm_area(struct vm_struct *area) 3082 { 3083 struct vm_struct *ret; 3084 ret = remove_vm_area(area->addr); 3085 BUG_ON(ret != area); 3086 kfree(area); 3087 } 3088 EXPORT_SYMBOL_GPL(free_vm_area); 3089 3090 #ifdef CONFIG_SMP 3091 static struct vmap_area *node_to_va(struct rb_node *n) 3092 { 3093 return rb_entry_safe(n, struct vmap_area, rb_node); 3094 } 3095 3096 /** 3097 * pvm_find_va_enclose_addr - find the vmap_area @addr belongs to 3098 * @addr: target address 3099 * 3100 * Returns: the vmap_area if it is found. If there is no such area, 3101 * the first highest (reverse order) vmap_area is returned, 3102 * i.e. va->va_start < addr && va->va_end < addr, or NULL 3103 * if there are no areas before @addr. 3104 */ 3105 static struct vmap_area * 3106 pvm_find_va_enclose_addr(unsigned long addr) 3107 { 3108 struct vmap_area *va, *tmp; 3109 struct rb_node *n; 3110 3111 n = free_vmap_area_root.rb_node; 3112 va = NULL; 3113 3114 while (n) { 3115 tmp = rb_entry(n, struct vmap_area, rb_node); 3116 if (tmp->va_start <= addr) { 3117 va = tmp; 3118 if (tmp->va_end >= addr) 3119 break; 3120 3121 n = n->rb_right; 3122 } else { 3123 n = n->rb_left; 3124 } 3125 } 3126 3127 return va; 3128 } 3129 3130 /** 3131 * pvm_determine_end_from_reverse - find the highest aligned address 3132 * of free block below VMALLOC_END 3133 * @va: 3134 * in - the VA we start the search from (reverse order); 3135 * out - the VA with the highest aligned end address. * @align: required alignment of the returned end address.
3136 * 3137 * Returns: determined end address within vmap_area 3138 */ 3139 static unsigned long 3140 pvm_determine_end_from_reverse(struct vmap_area **va, unsigned long align) 3141 { 3142 unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); 3143 unsigned long addr; 3144 3145 if (likely(*va)) { 3146 list_for_each_entry_from_reverse((*va), 3147 &free_vmap_area_list, list) { 3148 addr = min((*va)->va_end & ~(align - 1), vmalloc_end); 3149 if ((*va)->va_start < addr) 3150 return addr; 3151 } 3152 } 3153 3154 return 0; 3155 } 3156 3157 /** 3158 * pcpu_get_vm_areas - allocate vmalloc areas for percpu allocator 3159 * @offsets: array containing offset of each area 3160 * @sizes: array containing size of each area 3161 * @nr_vms: the number of areas to allocate 3162 * @align: alignment, all entries in @offsets and @sizes must be aligned to this 3163 * 3164 * Returns: kmalloc'd vm_struct pointer array pointing to allocated 3165 * vm_structs on success, %NULL on failure 3166 * 3167 * Percpu allocator wants to use congruent vm areas so that it can 3168 * maintain the offsets among percpu areas. This function allocates 3169 * congruent vmalloc areas for it with GFP_KERNEL. These areas tend to 3170 * be scattered pretty far, distance between two areas easily going up 3171 * to gigabytes. To avoid interacting with regular vmallocs, these 3172 * areas are allocated from top. 3173 * 3174 * Despite its complicated look, this allocator is rather simple. It 3175 * does everything top-down and scans free blocks from the end looking 3176 * for matching base. While scanning, if any of the areas do not fit the 3177 * base address is pulled down to fit the area. Scanning is repeated till 3178 * all the areas fit and then all necessary data structures are inserted 3179 * and the result is returned. 3180 */ 3181 struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, 3182 const size_t *sizes, int nr_vms, 3183 size_t align) 3184 { 3185 const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align); 3186 const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); 3187 struct vmap_area **vas, *va; 3188 struct vm_struct **vms; 3189 int area, area2, last_area, term_area; 3190 unsigned long base, start, size, end, last_end, orig_start, orig_end; 3191 bool purged = false; 3192 enum fit_type type; 3193 3194 /* verify parameters and allocate data structures */ 3195 BUG_ON(offset_in_page(align) || !is_power_of_2(align)); 3196 for (last_area = 0, area = 0; area < nr_vms; area++) { 3197 start = offsets[area]; 3198 end = start + sizes[area]; 3199 3200 /* is everything aligned properly? 
*/ 3201 BUG_ON(!IS_ALIGNED(offsets[area], align)); 3202 BUG_ON(!IS_ALIGNED(sizes[area], align)); 3203 3204 /* detect the area with the highest address */ 3205 if (start > offsets[last_area]) 3206 last_area = area; 3207 3208 for (area2 = area + 1; area2 < nr_vms; area2++) { 3209 unsigned long start2 = offsets[area2]; 3210 unsigned long end2 = start2 + sizes[area2]; 3211 3212 BUG_ON(start2 < end && start < end2); 3213 } 3214 } 3215 last_end = offsets[last_area] + sizes[last_area]; 3216 3217 if (vmalloc_end - vmalloc_start < last_end) { 3218 WARN_ON(true); 3219 return NULL; 3220 } 3221 3222 vms = kcalloc(nr_vms, sizeof(vms[0]), GFP_KERNEL); 3223 vas = kcalloc(nr_vms, sizeof(vas[0]), GFP_KERNEL); 3224 if (!vas || !vms) 3225 goto err_free2; 3226 3227 for (area = 0; area < nr_vms; area++) { 3228 vas[area] = kmem_cache_zalloc(vmap_area_cachep, GFP_KERNEL); 3229 vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL); 3230 if (!vas[area] || !vms[area]) 3231 goto err_free; 3232 } 3233 retry: 3234 spin_lock(&free_vmap_area_lock); 3235 3236 /* start scanning - we scan from the top, begin with the last area */ 3237 area = term_area = last_area; 3238 start = offsets[area]; 3239 end = start + sizes[area]; 3240 3241 va = pvm_find_va_enclose_addr(vmalloc_end); 3242 base = pvm_determine_end_from_reverse(&va, align) - end; 3243 3244 while (true) { 3245 /* 3246 * base might have underflowed, add last_end before 3247 * comparing. 3248 */ 3249 if (base + last_end < vmalloc_start + last_end) 3250 goto overflow; 3251 3252 /* 3253 * Fitting base has not been found. 3254 */ 3255 if (va == NULL) 3256 goto overflow; 3257 3258 /* 3259 * If required width exceeds current VA block, move 3260 * base downwards and then recheck. 3261 */ 3262 if (base + end > va->va_end) { 3263 base = pvm_determine_end_from_reverse(&va, align) - end; 3264 term_area = area; 3265 continue; 3266 } 3267 3268 /* 3269 * If this VA does not fit, move base downwards and recheck. 3270 */ 3271 if (base + start < va->va_start) { 3272 va = node_to_va(rb_prev(&va->rb_node)); 3273 base = pvm_determine_end_from_reverse(&va, align) - end; 3274 term_area = area; 3275 continue; 3276 } 3277 3278 /* 3279 * This area fits, move on to the previous one. If 3280 * the previous one is the terminal one, we're done. 3281 */ 3282 area = (area + nr_vms - 1) % nr_vms; 3283 if (area == term_area) 3284 break; 3285 3286 start = offsets[area]; 3287 end = start + sizes[area]; 3288 va = pvm_find_va_enclose_addr(base + end); 3289 } 3290 3291 /* we've found a fitting base, insert all va's */ 3292 for (area = 0; area < nr_vms; area++) { 3293 int ret; 3294 3295 start = base + offsets[area]; 3296 size = sizes[area]; 3297 3298 va = pvm_find_va_enclose_addr(start); 3299 if (WARN_ON_ONCE(va == NULL)) 3300 /* It is a BUG(), but trigger recovery instead. */ 3301 goto recovery; 3302 3303 type = classify_va_fit_type(va, start, size); 3304 if (WARN_ON_ONCE(type == NOTHING_FIT)) 3305 /* It is a BUG(), but trigger recovery instead. */ 3306 goto recovery; 3307 3308 ret = adjust_va_to_fit_type(va, start, size, type); 3309 if (unlikely(ret)) 3310 goto recovery; 3311 3312 /* Allocated area. 
*/ 3313 va = vas[area]; 3314 va->va_start = start; 3315 va->va_end = start + size; 3316 } 3317 3318 spin_unlock(&free_vmap_area_lock); 3319 3320 /* populate the kasan shadow space */ 3321 for (area = 0; area < nr_vms; area++) { 3322 if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area])) 3323 goto err_free_shadow; 3324 3325 kasan_unpoison_vmalloc((void *)vas[area]->va_start, 3326 sizes[area]); 3327 } 3328 3329 /* insert all vm's */ 3330 spin_lock(&vmap_area_lock); 3331 for (area = 0; area < nr_vms; area++) { 3332 insert_vmap_area(vas[area], &vmap_area_root, &vmap_area_list); 3333 3334 setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC, 3335 pcpu_get_vm_areas); 3336 } 3337 spin_unlock(&vmap_area_lock); 3338 3339 kfree(vas); 3340 return vms; 3341 3342 recovery: 3343 /* 3344 * Remove previously allocated areas. There is no 3345 * need in removing these areas from the busy tree, 3346 * because they are inserted only on the final step 3347 * and when pcpu_get_vm_areas() is success. 3348 */ 3349 while (area--) { 3350 orig_start = vas[area]->va_start; 3351 orig_end = vas[area]->va_end; 3352 va = merge_or_add_vmap_area(vas[area], &free_vmap_area_root, 3353 &free_vmap_area_list); 3354 if (va) 3355 kasan_release_vmalloc(orig_start, orig_end, 3356 va->va_start, va->va_end); 3357 vas[area] = NULL; 3358 } 3359 3360 overflow: 3361 spin_unlock(&free_vmap_area_lock); 3362 if (!purged) { 3363 purge_vmap_area_lazy(); 3364 purged = true; 3365 3366 /* Before "retry", check if we recover. */ 3367 for (area = 0; area < nr_vms; area++) { 3368 if (vas[area]) 3369 continue; 3370 3371 vas[area] = kmem_cache_zalloc( 3372 vmap_area_cachep, GFP_KERNEL); 3373 if (!vas[area]) 3374 goto err_free; 3375 } 3376 3377 goto retry; 3378 } 3379 3380 err_free: 3381 for (area = 0; area < nr_vms; area++) { 3382 if (vas[area]) 3383 kmem_cache_free(vmap_area_cachep, vas[area]); 3384 3385 kfree(vms[area]); 3386 } 3387 err_free2: 3388 kfree(vas); 3389 kfree(vms); 3390 return NULL; 3391 3392 err_free_shadow: 3393 spin_lock(&free_vmap_area_lock); 3394 /* 3395 * We release all the vmalloc shadows, even the ones for regions that 3396 * hadn't been successfully added. This relies on kasan_release_vmalloc 3397 * being able to tolerate this case. 3398 */ 3399 for (area = 0; area < nr_vms; area++) { 3400 orig_start = vas[area]->va_start; 3401 orig_end = vas[area]->va_end; 3402 va = merge_or_add_vmap_area(vas[area], &free_vmap_area_root, 3403 &free_vmap_area_list); 3404 if (va) 3405 kasan_release_vmalloc(orig_start, orig_end, 3406 va->va_start, va->va_end); 3407 vas[area] = NULL; 3408 kfree(vms[area]); 3409 } 3410 spin_unlock(&free_vmap_area_lock); 3411 kfree(vas); 3412 kfree(vms); 3413 return NULL; 3414 } 3415 3416 /** 3417 * pcpu_free_vm_areas - free vmalloc areas for percpu allocator 3418 * @vms: vm_struct pointer array returned by pcpu_get_vm_areas() 3419 * @nr_vms: the number of allocated areas 3420 * 3421 * Free vm_structs and the array allocated by pcpu_get_vm_areas(). 
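 *
 * A minimal sketch of the pcpu_get_vm_areas()/pcpu_free_vm_areas() pairing
 * (the two offsets and sizes are hypothetical; only the percpu allocator is
 * expected to call this for real):
 *
 *	unsigned long offsets[] = { 0, 4 * PAGE_SIZE };
 *	size_t sizes[] = { 2 * PAGE_SIZE, 2 * PAGE_SIZE };
 *	struct vm_struct **vms;
 *
 *	vms = pcpu_get_vm_areas(offsets, sizes, 2, PAGE_SIZE);
 *	if (vms)
 *		pcpu_free_vm_areas(vms, 2);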
*/ 3423 void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms) 3424 { 3425 int i; 3426 3427 for (i = 0; i < nr_vms; i++) 3428 free_vm_area(vms[i]); 3429 kfree(vms); 3430 } 3431 #endif /* CONFIG_SMP */ 3432 3433 #ifdef CONFIG_PROC_FS 3434 static void *s_start(struct seq_file *m, loff_t *pos) 3435 __acquires(&vmap_purge_lock) 3436 __acquires(&vmap_area_lock) 3437 { 3438 mutex_lock(&vmap_purge_lock); 3439 spin_lock(&vmap_area_lock); 3440 3441 return seq_list_start(&vmap_area_list, *pos); 3442 } 3443 3444 static void *s_next(struct seq_file *m, void *p, loff_t *pos) 3445 { 3446 return seq_list_next(p, &vmap_area_list, pos); 3447 } 3448 3449 static void s_stop(struct seq_file *m, void *p) 3450 __releases(&vmap_purge_lock) 3451 __releases(&vmap_area_lock) 3452 { 3453 mutex_unlock(&vmap_purge_lock); 3454 spin_unlock(&vmap_area_lock); 3455 } 3456 3457 static void show_numa_info(struct seq_file *m, struct vm_struct *v) 3458 { 3459 if (IS_ENABLED(CONFIG_NUMA)) { 3460 unsigned int nr, *counters = m->private; 3461 3462 if (!counters) 3463 return; 3464 3465 if (v->flags & VM_UNINITIALIZED) 3466 return; 3467 /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */ 3468 smp_rmb(); 3469 3470 memset(counters, 0, nr_node_ids * sizeof(unsigned int)); 3471 3472 for (nr = 0; nr < v->nr_pages; nr++) 3473 counters[page_to_nid(v->pages[nr])]++; 3474 3475 for_each_node_state(nr, N_HIGH_MEMORY) 3476 if (counters[nr]) 3477 seq_printf(m, " N%u=%u", nr, counters[nr]); 3478 } 3479 } 3480 3481 static void show_purge_info(struct seq_file *m) 3482 { 3483 struct llist_node *head; 3484 struct vmap_area *va; 3485 3486 head = READ_ONCE(vmap_purge_list.first); 3487 if (head == NULL) 3488 return; 3489 3490 llist_for_each_entry(va, head, purge_list) { 3491 seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n", 3492 (void *)va->va_start, (void *)va->va_end, 3493 va->va_end - va->va_start); 3494 } 3495 } 3496 3497 static int s_show(struct seq_file *m, void *p) 3498 { 3499 struct vmap_area *va; 3500 struct vm_struct *v; 3501 3502 va = list_entry(p, struct vmap_area, list); 3503 3504 /* 3505 * s_show() can race with remove_vm_area(): a NULL ->vm means the vmap 3506 * area is being torn down or was set up by vm_map_ram(). 3507 */ 3508 if (!va->vm) { 3509 seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n", 3510 (void *)va->va_start, (void *)va->va_end, 3511 va->va_end - va->va_start); 3512 3513 return 0; 3514 } 3515 3516 v = va->vm; 3517 3518 seq_printf(m, "0x%pK-0x%pK %7ld", 3519 v->addr, v->addr + v->size, v->size); 3520 3521 if (v->caller) 3522 seq_printf(m, " %pS", v->caller); 3523 3524 if (v->nr_pages) 3525 seq_printf(m, " pages=%d", v->nr_pages); 3526 3527 if (v->phys_addr) 3528 seq_printf(m, " phys=%pa", &v->phys_addr); 3529 3530 if (v->flags & VM_IOREMAP) 3531 seq_puts(m, " ioremap"); 3532 3533 if (v->flags & VM_ALLOC) 3534 seq_puts(m, " vmalloc"); 3535 3536 if (v->flags & VM_MAP) 3537 seq_puts(m, " vmap"); 3538 3539 if (v->flags & VM_USERMAP) 3540 seq_puts(m, " user"); 3541 3542 if (v->flags & VM_DMA_COHERENT) 3543 seq_puts(m, " dma-coherent"); 3544 3545 if (is_vmalloc_addr(v->pages)) 3546 seq_puts(m, " vpages"); 3547 3548 show_numa_info(m, v); 3549 seq_putc(m, '\n'); 3550 3551 /* 3552 * As a final step, dump "unpurged" areas. Note that 3553 * the entire "/proc/vmallocinfo" output will not 3554 * be address sorted, because the purge list is not 3555 * sorted.
3556 */ 3557 if (list_is_last(&va->list, &vmap_area_list)) 3558 show_purge_info(m); 3559 3560 return 0; 3561 } 3562 3563 static const struct seq_operations vmalloc_op = { 3564 .start = s_start, 3565 .next = s_next, 3566 .stop = s_stop, 3567 .show = s_show, 3568 }; 3569 3570 static int __init proc_vmalloc_init(void) 3571 { 3572 if (IS_ENABLED(CONFIG_NUMA)) 3573 proc_create_seq_private("vmallocinfo", 0400, NULL, 3574 &vmalloc_op, 3575 nr_node_ids * sizeof(unsigned int), NULL); 3576 else 3577 proc_create_seq("vmallocinfo", 0400, NULL, &vmalloc_op); 3578 return 0; 3579 } 3580 module_init(proc_vmalloc_init); 3581 3582 #endif 3583
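/*
 * Illustrative /proc/vmallocinfo output as assembled by s_show() above; the
 * addresses, sizes and caller symbols below are made up, and %pK may hash or
 * zero the pointers depending on kptr_restrict:
 *
 *	0xffffc90000000000-0xffffc90000005000   20480 alloc_large_system_hash+0x18a/0x250 pages=4 vmalloc N0=4
 *	0xffffc90000007000-0xffffc90000009000    8192 bpf_prog_alloc+0x3a/0xa0 pages=1 vmalloc N0=1
 */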