// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 1993 Linus Torvalds
 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
 * SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
 * Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002
 * Numa awareness, Christoph Lameter, SGI, June 2005
 * Improving global KVA allocator, Uladzislau Rezki, Sony, May 2019
 */

#include <linux/vmalloc.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/highmem.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/interrupt.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/set_memory.h>
#include <linux/debugobjects.h>
#include <linux/kallsyms.h>
#include <linux/list.h>
#include <linux/notifier.h>
#include <linux/rbtree.h>
#include <linux/xarray.h>
#include <linux/rcupdate.h>
#include <linux/pfn.h>
#include <linux/kmemleak.h>
#include <linux/atomic.h>
#include <linux/compiler.h>
#include <linux/llist.h>
#include <linux/bitops.h>
#include <linux/rbtree_augmented.h>
#include <linux/overflow.h>

#include <linux/uaccess.h>
#include <asm/tlbflush.h>
#include <asm/shmparam.h>

#include "internal.h"
#include "pgalloc-track.h"

bool is_vmalloc_addr(const void *x)
{
	unsigned long addr = (unsigned long)x;

	return addr >= VMALLOC_START && addr < VMALLOC_END;
}
EXPORT_SYMBOL(is_vmalloc_addr);
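/*
 * Usage sketch (not part of the original file): a common use of
 * is_vmalloc_addr() is picking the matching deallocator for a buffer that
 * may have come from either kmalloc() or vmalloc(), mirroring what
 * kvfree() does. The helper name below is hypothetical.
 */
static inline void example_free_kvbuf(void *buf)
{
	if (is_vmalloc_addr(buf))
		vfree(buf);	/* buffer lives in the vmalloc/vmap range */
	else
		kfree(buf);	/* slab allocation */
}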
struct vfree_deferred {
	struct llist_head list;
	struct work_struct wq;
};
static DEFINE_PER_CPU(struct vfree_deferred, vfree_deferred);

static void __vunmap(const void *, int);

static void free_work(struct work_struct *w)
{
	struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq);
	struct llist_node *t, *llnode;

	llist_for_each_safe(llnode, t, llist_del_all(&p->list))
		__vunmap((void *)llnode, 1);
}

/*** Page table manipulation functions ***/

static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
			     pgtbl_mod_mask *mask)
{
	pte_t *pte;

	pte = pte_offset_kernel(pmd, addr);
	do {
		pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte);
		WARN_ON(!pte_none(ptent) && !pte_present(ptent));
	} while (pte++, addr += PAGE_SIZE, addr != end);
	*mask |= PGTBL_PTE_MODIFIED;
}

static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
			     pgtbl_mod_mask *mask)
{
	pmd_t *pmd;
	unsigned long next;
	int cleared;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);

		cleared = pmd_clear_huge(pmd);
		if (cleared || pmd_bad(*pmd))
			*mask |= PGTBL_PMD_MODIFIED;

		if (cleared)
			continue;
		if (pmd_none_or_clear_bad(pmd))
			continue;
		vunmap_pte_range(pmd, addr, next, mask);

		cond_resched();
	} while (pmd++, addr = next, addr != end);
}

static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
			     pgtbl_mod_mask *mask)
{
	pud_t *pud;
	unsigned long next;
	int cleared;

	pud = pud_offset(p4d, addr);
	do {
		next = pud_addr_end(addr, end);

		cleared = pud_clear_huge(pud);
		if (cleared || pud_bad(*pud))
			*mask |= PGTBL_PUD_MODIFIED;

		if (cleared)
			continue;
		if (pud_none_or_clear_bad(pud))
			continue;
		vunmap_pmd_range(pud, addr, next, mask);
	} while (pud++, addr = next, addr != end);
}

static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
			     pgtbl_mod_mask *mask)
{
	p4d_t *p4d;
	unsigned long next;
	int cleared;

	p4d = p4d_offset(pgd, addr);
	do {
		next = p4d_addr_end(addr, end);

		cleared = p4d_clear_huge(p4d);
		if (cleared || p4d_bad(*p4d))
			*mask |= PGTBL_P4D_MODIFIED;

		if (cleared)
			continue;
		if (p4d_none_or_clear_bad(p4d))
			continue;
		vunmap_pud_range(p4d, addr, next, mask);
	} while (p4d++, addr = next, addr != end);
}

/**
 * unmap_kernel_range_noflush - unmap kernel VM area
 * @start: start of the VM area to unmap
 * @size: size of the VM area to unmap
 *
 * Unmap PFN_UP(@size) pages at @start. The VM area specified by @start and
 * @size should have been allocated using get_vm_area() and its friends.
 *
 * NOTE:
 * This function does NOT do any cache flushing. The caller is responsible
 * for calling flush_cache_vunmap() on to-be-mapped areas before calling this
 * function and flush_tlb_kernel_range() after.
 */
void unmap_kernel_range_noflush(unsigned long start, unsigned long size)
{
	unsigned long end = start + size;
	unsigned long next;
	pgd_t *pgd;
	unsigned long addr = start;
	pgtbl_mod_mask mask = 0;

	BUG_ON(addr >= end);
	pgd = pgd_offset_k(addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_bad(*pgd))
			mask |= PGTBL_PGD_MODIFIED;
		if (pgd_none_or_clear_bad(pgd))
			continue;
		vunmap_p4d_range(pgd, addr, next, &mask);
	} while (pgd++, addr = next, addr != end);

	if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
		arch_sync_kernel_mappings(start, end);
}

static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
		unsigned long end, pgprot_t prot, struct page **pages, int *nr,
		pgtbl_mod_mask *mask)
{
	pte_t *pte;

	/*
	 * nr is a running index into the array which helps higher level
	 * callers keep track of where we're up to.
	 */

	pte = pte_alloc_kernel_track(pmd, addr, mask);
	if (!pte)
		return -ENOMEM;
	do {
		struct page *page = pages[*nr];

		if (WARN_ON(!pte_none(*pte)))
			return -EBUSY;
		if (WARN_ON(!page))
			return -ENOMEM;
		set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
		(*nr)++;
	} while (pte++, addr += PAGE_SIZE, addr != end);
	*mask |= PGTBL_PTE_MODIFIED;
	return 0;
}

static int vmap_pmd_range(pud_t *pud, unsigned long addr,
		unsigned long end, pgprot_t prot, struct page **pages, int *nr,
		pgtbl_mod_mask *mask)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_alloc_track(&init_mm, pud, addr, mask);
	if (!pmd)
		return -ENOMEM;
	do {
		next = pmd_addr_end(addr, end);
		if (vmap_pte_range(pmd, addr, next, prot, pages, nr, mask))
			return -ENOMEM;
	} while (pmd++, addr = next, addr != end);
	return 0;
}

static int vmap_pud_range(p4d_t *p4d, unsigned long addr,
		unsigned long end, pgprot_t prot, struct page **pages, int *nr,
		pgtbl_mod_mask *mask)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_alloc_track(&init_mm, p4d, addr, mask);
	if (!pud)
		return -ENOMEM;
	do {
		next = pud_addr_end(addr, end);
		if (vmap_pmd_range(pud, addr, next, prot, pages, nr, mask))
			return -ENOMEM;
	} while (pud++, addr = next, addr != end);
	return 0;
}

static int vmap_p4d_range(pgd_t *pgd, unsigned long addr,
		unsigned long end, pgprot_t prot, struct page **pages, int *nr,
		pgtbl_mod_mask *mask)
{
	p4d_t *p4d;
	unsigned long next;

	p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);
	if (!p4d)
		return -ENOMEM;
	do {
		next = p4d_addr_end(addr, end);
		if (vmap_pud_range(p4d, addr, next, prot, pages, nr, mask))
			return -ENOMEM;
	} while (p4d++, addr = next, addr != end);
	return 0;
}

/**
 * map_kernel_range_noflush - map kernel VM area with the specified pages
 * @addr: start of the VM area to map
 * @size: size of the VM area to map
 * @prot: page protection flags to use
 * @pages: pages to map
 *
 * Map PFN_UP(@size) pages at @addr. The VM area specified by @addr and @size
 * should have been allocated using get_vm_area() and its friends.
 *
 * NOTE:
 * This function does NOT do any cache flushing. The caller is responsible for
 * calling flush_cache_vmap() on to-be-mapped areas before calling this
 * function.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int map_kernel_range_noflush(unsigned long addr, unsigned long size,
			     pgprot_t prot, struct page **pages)
{
	unsigned long start = addr;
	unsigned long end = addr + size;
	unsigned long next;
	pgd_t *pgd;
	int err = 0;
	int nr = 0;
	pgtbl_mod_mask mask = 0;

	BUG_ON(addr >= end);
	pgd = pgd_offset_k(addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_bad(*pgd))
			mask |= PGTBL_PGD_MODIFIED;
		err = vmap_p4d_range(pgd, addr, next, prot, pages, &nr, &mask);
		if (err)
			return err;
	} while (pgd++, addr = next, addr != end);

	if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
		arch_sync_kernel_mappings(start, end);

	return 0;
}

int map_kernel_range(unsigned long start, unsigned long size, pgprot_t prot,
		struct page **pages)
{
	int ret;

	ret = map_kernel_range_noflush(start, size, prot, pages);
	flush_cache_vmap(start, start + size);
	return ret;
}

int is_vmalloc_or_module_addr(const void *x)
{
	/*
	 * ARM, x86-64 and sparc64 put modules in a special place,
	 * and fall back on vmalloc() if that fails. Others
	 * just put it in the vmalloc space.
	 */
#if defined(CONFIG_MODULES) && defined(MODULES_VADDR)
	unsigned long addr = (unsigned long)x;
	if (addr >= MODULES_VADDR && addr < MODULES_END)
		return 1;
#endif
	return is_vmalloc_addr(x);
}

/*
 * Walk a vmap address to the struct page it maps.
 */
struct page *vmalloc_to_page(const void *vmalloc_addr)
{
	unsigned long addr = (unsigned long) vmalloc_addr;
	struct page *page = NULL;
	pgd_t *pgd = pgd_offset_k(addr);
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *ptep, pte;

	/*
	 * XXX we might need to change this if we add VIRTUAL_BUG_ON for
	 * architectures that do not vmalloc module space
	 */
	VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr));

	if (pgd_none(*pgd))
		return NULL;
	p4d = p4d_offset(pgd, addr);
	if (p4d_none(*p4d))
		return NULL;
	pud = pud_offset(p4d, addr);

	/*
	 * Don't dereference bad PUD or PMD (below) entries. This will also
	 * identify huge mappings, which we may encounter on architectures
	 * that define CONFIG_HAVE_ARCH_HUGE_VMAP=y. Such regions will be
	 * identified as vmalloc addresses by is_vmalloc_addr(), but are
	 * not [unambiguously] associated with a struct page, so there is
	 * no correct value to return for them.
	 */
	WARN_ON_ONCE(pud_bad(*pud));
	if (pud_none(*pud) || pud_bad(*pud))
		return NULL;
	pmd = pmd_offset(pud, addr);
	WARN_ON_ONCE(pmd_bad(*pmd));
	if (pmd_none(*pmd) || pmd_bad(*pmd))
		return NULL;

	ptep = pte_offset_map(pmd, addr);
	pte = *ptep;
	if (pte_present(pte))
		page = pte_page(pte);
	pte_unmap(ptep);
	return page;
}
EXPORT_SYMBOL(vmalloc_to_page);

/*
 * Map a vmalloc()-space virtual address to the physical page frame number.
 */
unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
{
	return page_to_pfn(vmalloc_to_page(vmalloc_addr));
}
EXPORT_SYMBOL(vmalloc_to_pfn);
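/*
 * Usage sketch (not part of the original file): the documented contract
 * above, exercised end to end under the assumption that the declarations
 * from <linux/vmalloc.h> (get_vm_area, free_vm_area, unmap_kernel_range)
 * are visible. An area is reserved with get_vm_area(), backed by one
 * freshly allocated page via map_kernel_range(), resolved back to its
 * struct page with vmalloc_to_page(), and then torn down. The function
 * name is hypothetical and error handling is kept minimal on purpose.
 */
static int __maybe_unused example_map_one_page(void)
{
	struct vm_struct *area;
	struct page *page, *pages[1];

	page = alloc_page(GFP_KERNEL);
	if (!page)
		return -ENOMEM;

	area = get_vm_area(PAGE_SIZE, VM_MAP);
	if (!area) {
		__free_page(page);
		return -ENOMEM;
	}

	pages[0] = page;
	if (map_kernel_range((unsigned long)area->addr, PAGE_SIZE,
			     PAGE_KERNEL, pages) < 0)
		goto out;

	/* The page-table walk must give back the page we just mapped. */
	WARN_ON(vmalloc_to_page(area->addr) != page);

	unmap_kernel_range((unsigned long)area->addr, PAGE_SIZE);
out:
	free_vm_area(area);
	__free_page(page);
	return 0;
}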
/*** Global kva allocator ***/

#define DEBUG_AUGMENT_PROPAGATE_CHECK 0
#define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0


static DEFINE_SPINLOCK(vmap_area_lock);
static DEFINE_SPINLOCK(free_vmap_area_lock);
/* Export for kexec only */
LIST_HEAD(vmap_area_list);
static LLIST_HEAD(vmap_purge_list);
static struct rb_root vmap_area_root = RB_ROOT;
static bool vmap_initialized __read_mostly;

/*
 * This kmem_cache is used for vmap_area objects. Instead of
 * allocating from slab we reuse an object from this cache to
 * make things faster. Especially in "no edge" splitting of
 * free block.
 */
static struct kmem_cache *vmap_area_cachep;

/*
 * This linked list is used in pair with free_vmap_area_root.
 * It gives O(1) access to prev/next to perform fast coalescing.
 */
static LIST_HEAD(free_vmap_area_list);

/*
 * This augmented red-black tree represents the free vmap space.
 * All vmap_area objects in this tree are sorted by va->va_start
 * address. It is used for allocation and merging when a vmap
 * object is released.
 *
 * Each vmap_area node contains the maximum available free block
 * of its sub-tree, right or left. Therefore it is possible to
 * find the lowest match of a free area.
 */
static struct rb_root free_vmap_area_root = RB_ROOT;

/*
 * Preload a CPU with one object for "no edge" split case. The
 * aim is to get rid of allocations from the atomic context, thus
 * to use more permissive allocation masks.
 */
static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node);

static __always_inline unsigned long
va_size(struct vmap_area *va)
{
	return (va->va_end - va->va_start);
}

static __always_inline unsigned long
get_subtree_max_size(struct rb_node *node)
{
	struct vmap_area *va;

	va = rb_entry_safe(node, struct vmap_area, rb_node);
	return va ? va->subtree_max_size : 0;
}

/*
 * Gets called when the node is removed or rotated.
 */
static __always_inline unsigned long
compute_subtree_max_size(struct vmap_area *va)
{
	return max3(va_size(va),
		get_subtree_max_size(va->rb_node.rb_left),
		get_subtree_max_size(va->rb_node.rb_right));
}

RB_DECLARE_CALLBACKS_MAX(static, free_vmap_area_rb_augment_cb,
	struct vmap_area, rb_node, unsigned long, subtree_max_size, va_size)

static void purge_vmap_area_lazy(void);
static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
static unsigned long lazy_max_pages(void);

static atomic_long_t nr_vmalloc_pages;

unsigned long vmalloc_nr_pages(void)
{
	return atomic_long_read(&nr_vmalloc_pages);
}

static struct vmap_area *__find_vmap_area(unsigned long addr)
{
	struct rb_node *n = vmap_area_root.rb_node;

	while (n) {
		struct vmap_area *va;

		va = rb_entry(n, struct vmap_area, rb_node);
		if (addr < va->va_start)
			n = n->rb_left;
		else if (addr >= va->va_end)
			n = n->rb_right;
		else
			return va;
	}

	return NULL;
}

/*
 * This function returns back addresses of the parent node
 * and its left or right link for further processing.
 *
 * Otherwise NULL is returned. In that case all further
 * steps regarding inserting of a conflicting overlapping range
 * have to be declined and actually considered as a bug.
 */
static __always_inline struct rb_node **
find_va_links(struct vmap_area *va,
	struct rb_root *root, struct rb_node *from,
	struct rb_node **parent)
{
	struct vmap_area *tmp_va;
	struct rb_node **link;

	if (root) {
		link = &root->rb_node;
		if (unlikely(!*link)) {
			*parent = NULL;
			return link;
		}
	} else {
		link = &from;
	}

	/*
	 * Go to the bottom of the tree. When we hit the last point
	 * we end up with the parent rb_node and the correct direction,
	 * called "link" here, where the new va->rb_node will be attached.
	 */
	do {
		tmp_va = rb_entry(*link, struct vmap_area, rb_node);

		/*
		 * During the traversal we also do some sanity checks.
		 * Trigger the WARN() if there are side (left/right)
		 * or full overlaps.
		 */
		if (va->va_start < tmp_va->va_end &&
				va->va_end <= tmp_va->va_start)
			link = &(*link)->rb_left;
		else if (va->va_end > tmp_va->va_start &&
				va->va_start >= tmp_va->va_end)
			link = &(*link)->rb_right;
		else {
			WARN(1, "vmalloc bug: 0x%lx-0x%lx overlaps with 0x%lx-0x%lx\n",
				va->va_start, va->va_end, tmp_va->va_start, tmp_va->va_end);

			return NULL;
		}
	} while (*link);

	*parent = &tmp_va->rb_node;
	return link;
}

static __always_inline struct list_head *
get_va_next_sibling(struct rb_node *parent, struct rb_node **link)
{
	struct list_head *list;

	if (unlikely(!parent))
		/*
		 * The red-black tree where we try to find VA neighbors
		 * before merging or inserting is empty, i.e. it means
		 * there is no free vmap space. Normally it does not
		 * happen but we handle this case anyway.
		 */
		return NULL;

	list = &rb_entry(parent, struct vmap_area, rb_node)->list;
	return (&parent->rb_right == link ? list->next : list);
}

static __always_inline void
link_va(struct vmap_area *va, struct rb_root *root,
	struct rb_node *parent, struct rb_node **link, struct list_head *head)
{
	/*
	 * VA is still not in the list, but we can
	 * identify its future previous list_head node.
	 */
	if (likely(parent)) {
		head = &rb_entry(parent, struct vmap_area, rb_node)->list;
		if (&parent->rb_right != link)
			head = head->prev;
	}

	/* Insert to the rb-tree */
	rb_link_node(&va->rb_node, parent, link);
	if (root == &free_vmap_area_root) {
		/*
		 * Some explanation here. Just perform simple insertion
		 * to the tree. We do not set va->subtree_max_size to
		 * its current size before calling rb_insert_augmented().
		 * This is because we populate the tree from the bottom
		 * towards parent levels when the node _is_ in the tree.
		 *
		 * Therefore we set subtree_max_size to zero after insertion,
		 * to let __augment_tree_propagate_from() put everything in
		 * the correct order later on.
		 */
		rb_insert_augmented(&va->rb_node,
			root, &free_vmap_area_rb_augment_cb);
		va->subtree_max_size = 0;
	} else {
		rb_insert_color(&va->rb_node, root);
	}

	/* Address-sort this list */
	list_add(&va->list, head);
}

static __always_inline void
unlink_va(struct vmap_area *va, struct rb_root *root)
{
	if (WARN_ON(RB_EMPTY_NODE(&va->rb_node)))
		return;

	if (root == &free_vmap_area_root)
		rb_erase_augmented(&va->rb_node,
			root, &free_vmap_area_rb_augment_cb);
	else
		rb_erase(&va->rb_node, root);

	list_del(&va->list);
	RB_CLEAR_NODE(&va->rb_node);
}

#if DEBUG_AUGMENT_PROPAGATE_CHECK
static void
augment_tree_propagate_check(void)
{
	struct vmap_area *va;
	unsigned long computed_size;

	list_for_each_entry(va, &free_vmap_area_list, list) {
		computed_size = compute_subtree_max_size(va);
		if (computed_size != va->subtree_max_size)
			pr_emerg("tree is corrupted: %lu, %lu\n",
				va_size(va), va->subtree_max_size);
	}
}
#endif

/*
 * This function populates subtree_max_size from bottom to upper
 * levels starting from the VA point. The propagation must be done
 * when the VA size is modified by changing its va_start/va_end, or
 * in case a VA is newly inserted to the tree.
 *
 * It means that __augment_tree_propagate_from() must be called:
 * - After VA has been inserted to the tree(free path);
 * - After VA has been shrunk(allocation path);
 * - After VA has been increased(merging path).
 *
 * Please note that, it does not mean that upper parent nodes
 * and their subtree_max_size are recalculated all the time up
 * to the root node.
 *
 *       4--8
 *        /\
 *       /  \
 *      /    \
 *    2--2  8--8
 *
 * For example if we modify the node 4, shrinking it to 2, then
 * no modification is required. If we shrink the node 2 to 1
 * its subtree_max_size is updated only, and set to 1. If we shrink
 * the node 8 to 6, then its subtree_max_size is set to 6 and the
 * parent node becomes 4--6.
 */
static __always_inline void
augment_tree_propagate_from(struct vmap_area *va)
{
	/*
	 * Populate the tree from bottom towards the root until
	 * the calculated maximum available size of checked node
	 * is equal to its current one.
	 */
	free_vmap_area_rb_augment_cb_propagate(&va->rb_node, NULL);

#if DEBUG_AUGMENT_PROPAGATE_CHECK
	augment_tree_propagate_check();
#endif
}

static void
insert_vmap_area(struct vmap_area *va,
	struct rb_root *root, struct list_head *head)
{
	struct rb_node **link;
	struct rb_node *parent;

	link = find_va_links(va, root, NULL, &parent);
	if (link)
		link_va(va, root, parent, link, head);
}

static void
insert_vmap_area_augment(struct vmap_area *va,
	struct rb_node *from, struct rb_root *root,
	struct list_head *head)
{
	struct rb_node **link;
	struct rb_node *parent;

	if (from)
		link = find_va_links(va, NULL, from, &parent);
	else
		link = find_va_links(va, root, NULL, &parent);

	if (link) {
		link_va(va, root, parent, link, head);
		augment_tree_propagate_from(va);
	}
}

/*
 * Merge a de-allocated chunk of VA memory with previous
 * and next free blocks. If coalescing is not done, a new
 * free area is inserted. If VA has been merged, it is
 * freed.
 *
 * Please note, it can return NULL in case of overlapping
 * ranges, followed by a WARN() report. Despite being buggy
 * behaviour, the system can stay alive and keep going.
 */
static __always_inline struct vmap_area *
merge_or_add_vmap_area(struct vmap_area *va,
	struct rb_root *root, struct list_head *head)
{
	struct vmap_area *sibling;
	struct list_head *next;
	struct rb_node **link;
	struct rb_node *parent;
	bool merged = false;

	/*
	 * Find a place in the tree where VA potentially will be
	 * inserted, unless it is merged with its sibling/siblings.
	 */
	link = find_va_links(va, root, NULL, &parent);
	if (!link)
		return NULL;

	/*
	 * Get next node of VA to check if merging can be done.
	 */
	next = get_va_next_sibling(parent, link);
	if (unlikely(next == NULL))
		goto insert;

	/*
	 * start            end
	 * |                |
	 * |<------VA------>|<-----Next----->|
	 *                  |                |
	 *                  start            end
	 */
	if (next != head) {
		sibling = list_entry(next, struct vmap_area, list);
		if (sibling->va_start == va->va_end) {
			sibling->va_start = va->va_start;

			/* Free vmap_area object. */
			kmem_cache_free(vmap_area_cachep, va);

			/* Point to the new merged area. */
			va = sibling;
			merged = true;
		}
	}

	/*
	 * start            end
	 * |                |
	 * |<-----Prev----->|<------VA------>|
	 *                  |                |
	 *                  start            end
	 */
	if (next->prev != head) {
		sibling = list_entry(next->prev, struct vmap_area, list);
		if (sibling->va_end == va->va_start) {
			/*
			 * If both neighbors are coalesced, it is important
			 * to unlink the "next" node first, followed by merging
			 * with "previous" one. Otherwise the tree might not be
			 * fully populated if a sibling's augmented value is
			 * "normalized" because of rotation operations.
			 */
			if (merged)
				unlink_va(va, root);

			sibling->va_end = va->va_end;

			/* Free vmap_area object. */
			kmem_cache_free(vmap_area_cachep, va);

			/* Point to the new merged area. */
			va = sibling;
			merged = true;
		}
	}

insert:
	if (!merged)
		link_va(va, root, parent, link, head);

	/*
	 * Last step is to check and update the tree.
	 */
	augment_tree_propagate_from(va);
	return va;
}

static __always_inline bool
is_within_this_va(struct vmap_area *va, unsigned long size,
	unsigned long align, unsigned long vstart)
{
	unsigned long nva_start_addr;

	if (va->va_start > vstart)
		nva_start_addr = ALIGN(va->va_start, align);
	else
		nva_start_addr = ALIGN(vstart, align);

	/* Can be overflowed due to big size or alignment. */
	if (nva_start_addr + size < nva_start_addr ||
			nva_start_addr < vstart)
		return false;

	return (nva_start_addr + size <= va->va_end);
}

/*
 * Find the first free block (lowest start address) in the tree
 * that satisfies the request described by the passed parameters.
 */
static __always_inline struct vmap_area *
find_vmap_lowest_match(unsigned long size,
	unsigned long align, unsigned long vstart)
{
	struct vmap_area *va;
	struct rb_node *node;
	unsigned long length;

	/* Start from the root. */
	node = free_vmap_area_root.rb_node;

	/* Adjust the search size for alignment overhead. */
	length = size + align - 1;

	while (node) {
		va = rb_entry(node, struct vmap_area, rb_node);

		if (get_subtree_max_size(node->rb_left) >= length &&
				vstart < va->va_start) {
			node = node->rb_left;
		} else {
			if (is_within_this_va(va, size, align, vstart))
				return va;

			/*
			 * Does not make sense to go deeper towards the right
			 * sub-tree if it does not have a free block that is
			 * equal or bigger to the requested search length.
			 */
			if (get_subtree_max_size(node->rb_right) >= length) {
				node = node->rb_right;
				continue;
			}

			/*
			 * OK. We roll back and find the first right sub-tree,
			 * that will satisfy the search criteria. It can happen
			 * only once due to "vstart" restriction.
			 */
			while ((node = rb_parent(node))) {
				va = rb_entry(node, struct vmap_area, rb_node);
				if (is_within_this_va(va, size, align, vstart))
					return va;

				if (get_subtree_max_size(node->rb_right) >= length &&
						vstart <= va->va_start) {
					node = node->rb_right;
					break;
				}
			}
		}
	}

	return NULL;
}

#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
#include <linux/random.h>

static struct vmap_area *
find_vmap_lowest_linear_match(unsigned long size,
	unsigned long align, unsigned long vstart)
{
	struct vmap_area *va;

	list_for_each_entry(va, &free_vmap_area_list, list) {
		if (!is_within_this_va(va, size, align, vstart))
			continue;

		return va;
	}

	return NULL;
}

static void
find_vmap_lowest_match_check(unsigned long size)
{
	struct vmap_area *va_1, *va_2;
	unsigned long vstart;
	unsigned int rnd;

	get_random_bytes(&rnd, sizeof(rnd));
	vstart = VMALLOC_START + rnd;

	va_1 = find_vmap_lowest_match(size, 1, vstart);
	va_2 = find_vmap_lowest_linear_match(size, 1, vstart);

	if (va_1 != va_2)
		pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n",
			va_1, va_2, vstart);
}
#endif

enum fit_type {
	NOTHING_FIT = 0,
	FL_FIT_TYPE = 1,	/* full fit */
	LE_FIT_TYPE = 2,	/* left edge fit */
	RE_FIT_TYPE = 3,	/* right edge fit */
	NE_FIT_TYPE = 4		/* no edge fit */
};

static __always_inline enum fit_type
classify_va_fit_type(struct vmap_area *va,
	unsigned long nva_start_addr, unsigned long size)
{
	enum fit_type type;

	/* Check if it is within VA. */
	if (nva_start_addr < va->va_start ||
			nva_start_addr + size > va->va_end)
		return NOTHING_FIT;

	/* Now classify. */
	if (va->va_start == nva_start_addr) {
		if (va->va_end == nva_start_addr + size)
			type = FL_FIT_TYPE;
		else
			type = LE_FIT_TYPE;
	} else if (va->va_end == nva_start_addr + size) {
		type = RE_FIT_TYPE;
	} else {
		type = NE_FIT_TYPE;
	}

	return type;
}
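/*
 * Illustration (not part of the original file): how the fit types above map
 * onto concrete numbers. A free VA spans [0x1000, 0x5000) and a request of a
 * given size is placed at different offsets inside it. The helper is never
 * called; it only documents the classification.
 */
static void __maybe_unused classify_va_fit_type_example(void)
{
	struct vmap_area va = { .va_start = 0x1000, .va_end = 0x5000 };

	/* Request consumes the whole block: full fit. */
	WARN_ON(classify_va_fit_type(&va, 0x1000, 0x4000) != FL_FIT_TYPE);
	/* Request starts at va_start but leaves a tail: left edge fit. */
	WARN_ON(classify_va_fit_type(&va, 0x1000, 0x1000) != LE_FIT_TYPE);
	/* Request ends at va_end but leaves a head: right edge fit. */
	WARN_ON(classify_va_fit_type(&va, 0x4000, 0x1000) != RE_FIT_TYPE);
	/* Request sits in the middle, leaving both edges: no edge fit. */
	WARN_ON(classify_va_fit_type(&va, 0x2000, 0x1000) != NE_FIT_TYPE);
}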
static __always_inline int
adjust_va_to_fit_type(struct vmap_area *va,
	unsigned long nva_start_addr, unsigned long size,
	enum fit_type type)
{
	struct vmap_area *lva = NULL;

	if (type == FL_FIT_TYPE) {
		/*
		 * No need to split VA, it fully fits.
		 *
		 * |               |
		 * V      NVA      V
		 * |---------------|
		 */
		unlink_va(va, &free_vmap_area_root);
		kmem_cache_free(vmap_area_cachep, va);
	} else if (type == LE_FIT_TYPE) {
		/*
		 * Split left edge of fit VA.
		 *
		 * |       |
		 * V  NVA  V   R
		 * |-------|-------|
		 */
		va->va_start += size;
	} else if (type == RE_FIT_TYPE) {
		/*
		 * Split right edge of fit VA.
		 *
		 *         |       |
		 *     L   V  NVA  V
		 * |-------|-------|
		 */
		va->va_end = nva_start_addr;
	} else if (type == NE_FIT_TYPE) {
		/*
		 * Split no edge of fit VA.
		 *
		 *     |       |
		 *   L V  NVA  V R
		 * |---|-------|---|
		 */
		lva = __this_cpu_xchg(ne_fit_preload_node, NULL);
		if (unlikely(!lva)) {
			/*
			 * For percpu allocator we do not do any pre-allocation
			 * and leave it as it is. The reason is it most likely
			 * never ends up with NE_FIT_TYPE splitting. In case of
			 * percpu allocations offsets and sizes are aligned to
			 * fixed align request, i.e. RE_FIT_TYPE and FL_FIT_TYPE
			 * are its main fitting cases.
			 *
			 * There are a few exceptions though, as an example it is
			 * a first allocation (early boot up) when we have "one"
			 * big free space that has to be split.
			 *
			 * Also we can hit this path in case of regular "vmap"
			 * allocations, if "this" current CPU was not preloaded.
			 * See the comment in alloc_vmap_area() why. If so, then
			 * GFP_NOWAIT is used instead to get an extra object for
			 * split purpose. That is rare and most of the time does
			 * not occur.
			 *
			 * What happens if an allocation fails? Basically, an
			 * "overflow" path is triggered to purge lazily freed
			 * areas to free some memory, then, the "retry" path is
			 * triggered to repeat one more time. See more details
			 * in alloc_vmap_area() function.
			 */
			lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT);
			if (!lva)
				return -1;
		}

		/*
		 * Build the remainder.
		 */
		lva->va_start = va->va_start;
		lva->va_end = nva_start_addr;

		/*
		 * Shrink this VA to remaining size.
		 */
		va->va_start = nva_start_addr + size;
	} else {
		return -1;
	}

	if (type != FL_FIT_TYPE) {
		augment_tree_propagate_from(va);

		if (lva)	/* type == NE_FIT_TYPE */
			insert_vmap_area_augment(lva, &va->rb_node,
				&free_vmap_area_root, &free_vmap_area_list);
	}

	return 0;
}

/*
 * Returns a start address of the newly allocated area on success.
 * Otherwise "vend" is returned, which indicates failure.
 */
static __always_inline unsigned long
__alloc_vmap_area(unsigned long size, unsigned long align,
	unsigned long vstart, unsigned long vend)
{
	unsigned long nva_start_addr;
	struct vmap_area *va;
	enum fit_type type;
	int ret;

	va = find_vmap_lowest_match(size, align, vstart);
	if (unlikely(!va))
		return vend;

	if (va->va_start > vstart)
		nva_start_addr = ALIGN(va->va_start, align);
	else
		nva_start_addr = ALIGN(vstart, align);

	/* Check the "vend" restriction. */
	if (nva_start_addr + size > vend)
		return vend;

	/* Classify what we have found. */
	type = classify_va_fit_type(va, nva_start_addr, size);
	if (WARN_ON_ONCE(type == NOTHING_FIT))
		return vend;

	/* Update the free vmap_area. */
	ret = adjust_va_to_fit_type(va, nva_start_addr, size, type);
	if (ret)
		return vend;

#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
	find_vmap_lowest_match_check(size);
#endif

	return nva_start_addr;
}

/*
 * Free a region of KVA allocated by alloc_vmap_area
 */
static void free_vmap_area(struct vmap_area *va)
{
	/*
	 * Remove from the busy tree/list.
	 */
	spin_lock(&vmap_area_lock);
	unlink_va(va, &vmap_area_root);
	spin_unlock(&vmap_area_lock);

	/*
	 * Insert/Merge it back to the free tree/list.
	 */
	spin_lock(&free_vmap_area_lock);
	merge_or_add_vmap_area(va, &free_vmap_area_root, &free_vmap_area_list);
	spin_unlock(&free_vmap_area_lock);
}

/*
 * Allocate a region of KVA of the specified size and alignment, within the
 * vstart and vend.
 */
static struct vmap_area *alloc_vmap_area(unsigned long size,
				unsigned long align,
				unsigned long vstart, unsigned long vend,
				int node, gfp_t gfp_mask)
{
	struct vmap_area *va, *pva;
	unsigned long addr;
	int purged = 0;
	int ret;

	BUG_ON(!size);
	BUG_ON(offset_in_page(size));
	BUG_ON(!is_power_of_2(align));

	if (unlikely(!vmap_initialized))
		return ERR_PTR(-EBUSY);

	might_sleep();
	gfp_mask = gfp_mask & GFP_RECLAIM_MASK;

	va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
	if (unlikely(!va))
		return ERR_PTR(-ENOMEM);

	/*
	 * Only scan the relevant parts containing pointers to other objects
	 * to avoid false negatives.
	 */
	kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask);

retry:
	/*
	 * Preload this CPU with one extra vmap_area object. It is used
	 * when fit type of free area is NE_FIT_TYPE. Please note, it
	 * does not guarantee that an allocation occurs on a CPU that
	 * is preloaded, instead we minimize the case when it is not.
	 * It can happen because of cpu migration, because there is a
	 * race until the below spinlock is taken.
	 *
	 * The preload is done in non-atomic context, thus it allows us
	 * to use more permissive allocation masks to be more stable under
	 * low memory condition and high memory pressure. In rare case,
	 * if not preloaded, GFP_NOWAIT is used.
	 *
	 * Set "pva" to NULL here, because of "retry" path.
	 */
	pva = NULL;

	if (!this_cpu_read(ne_fit_preload_node))
		/*
		 * Even if it fails we do not really care about that.
		 * Just proceed as it is. If needed "overflow" path
		 * will refill the cache we allocate from.
		 */
		pva = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);

	spin_lock(&free_vmap_area_lock);

	if (pva && __this_cpu_cmpxchg(ne_fit_preload_node, NULL, pva))
		kmem_cache_free(vmap_area_cachep, pva);

	/*
	 * If an allocation fails, the "vend" address is
	 * returned. Therefore trigger the overflow path.
	 */
	addr = __alloc_vmap_area(size, align, vstart, vend);
	spin_unlock(&free_vmap_area_lock);

	if (unlikely(addr == vend))
		goto overflow;

	va->va_start = addr;
	va->va_end = addr + size;
	va->vm = NULL;


	spin_lock(&vmap_area_lock);
	insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
	spin_unlock(&vmap_area_lock);

	BUG_ON(!IS_ALIGNED(va->va_start, align));
	BUG_ON(va->va_start < vstart);
	BUG_ON(va->va_end > vend);

	ret = kasan_populate_vmalloc(addr, size);
	if (ret) {
		free_vmap_area(va);
		return ERR_PTR(ret);
	}

	return va;

overflow:
	if (!purged) {
		purge_vmap_area_lazy();
		purged = 1;
		goto retry;
	}

	if (gfpflags_allow_blocking(gfp_mask)) {
		unsigned long freed = 0;
		blocking_notifier_call_chain(&vmap_notify_list, 0, &freed);
		if (freed > 0) {
			purged = 0;
			goto retry;
		}
	}

	if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit())
		pr_warn("vmap allocation for size %lu failed: use vmalloc=<size> to increase size\n",
			size);

	kmem_cache_free(vmap_area_cachep, va);
	return ERR_PTR(-EBUSY);
}

int register_vmap_purge_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_register(&vmap_notify_list, nb);
}
EXPORT_SYMBOL_GPL(register_vmap_purge_notifier);

int unregister_vmap_purge_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_unregister(&vmap_notify_list, nb);
}
EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier);
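/*
 * Usage sketch (not part of the original file): a subsystem that caches
 * vmalloc'ed buffers can register on the purge notifier chain and drop its
 * cache when a vmap allocation runs out of space. The callback receives a
 * pointer to the "freed" counter used by the overflow path in
 * alloc_vmap_area() above and should add the number of pages it released.
 * All names below are hypothetical.
 */
static int example_vmap_purge_cb(struct notifier_block *nb,
				 unsigned long action, void *data)
{
	unsigned long *freed = data;

	/*
	 * A real callback would release cached vmalloc buffers here and
	 * report how many pages that freed; 0 means "nothing reclaimed".
	 */
	*freed += 0;
	return NOTIFY_OK;
}

static struct notifier_block example_vmap_purge_nb = {
	.notifier_call = example_vmap_purge_cb,
};

/* Typically registered once at init time:
 *	register_vmap_purge_notifier(&example_vmap_purge_nb);
 */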
/*
 * lazy_max_pages is the maximum amount of virtual address space we gather up
 * before attempting to purge with a TLB flush.
 *
 * There is a tradeoff here: a larger number will cover more kernel page tables
 * and take slightly longer to purge, but it will linearly reduce the number of
 * global TLB flushes that must be performed. It would seem natural to scale
 * this number up linearly with the number of CPUs (because vmapping activity
 * could also scale linearly with the number of CPUs), however it is likely
 * that in practice, workloads might be constrained in other ways that mean
 * vmap activity will not scale linearly with CPUs. Also, I want to be
 * conservative and not introduce a big latency on huge systems, so go with
 * a less aggressive log scale. It will still be an improvement over the old
 * code, and it will be simple to change the scale factor if we find that it
 * becomes a problem on bigger systems.
 */
static unsigned long lazy_max_pages(void)
{
	unsigned int log;

	log = fls(num_online_cpus());

	return log * (32UL * 1024 * 1024 / PAGE_SIZE);
}

static atomic_long_t vmap_lazy_nr = ATOMIC_LONG_INIT(0);

/*
 * Serialize vmap purging. There is no actual critical section protected
 * by this lock, but we want to avoid concurrent calls for performance
 * reasons and to make the pcpu_get_vm_areas more deterministic.
 */
static DEFINE_MUTEX(vmap_purge_lock);

/* for per-CPU blocks */
static void purge_fragmented_blocks_allcpus(void);

/*
 * called before a call to iounmap() if the caller wants vm_area_struct's
 * immediately freed.
 */
void set_iounmap_nonlazy(void)
{
	atomic_long_set(&vmap_lazy_nr, lazy_max_pages()+1);
}

/*
 * Purges all lazily-freed vmap areas.
 */
static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
{
	unsigned long resched_threshold;
	struct llist_node *valist;
	struct vmap_area *va;
	struct vmap_area *n_va;

	lockdep_assert_held(&vmap_purge_lock);

	valist = llist_del_all(&vmap_purge_list);
	if (unlikely(valist == NULL))
		return false;

	/*
	 * TODO: to calculate a flush range without looping.
	 * The list can be up to lazy_max_pages() elements.
	 */
	llist_for_each_entry(va, valist, purge_list) {
		if (va->va_start < start)
			start = va->va_start;
		if (va->va_end > end)
			end = va->va_end;
	}

	flush_tlb_kernel_range(start, end);
	resched_threshold = lazy_max_pages() << 1;

	spin_lock(&free_vmap_area_lock);
	llist_for_each_entry_safe(va, n_va, valist, purge_list) {
		unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
		unsigned long orig_start = va->va_start;
		unsigned long orig_end = va->va_end;

		/*
		 * Finally insert or merge lazily-freed area. It is
		 * detached and there is no need to "unlink" it from
		 * anything.
		 */
		va = merge_or_add_vmap_area(va, &free_vmap_area_root,
					    &free_vmap_area_list);

		if (!va)
			continue;

		if (is_vmalloc_or_module_addr((void *)orig_start))
			kasan_release_vmalloc(orig_start, orig_end,
					      va->va_start, va->va_end);

		atomic_long_sub(nr, &vmap_lazy_nr);

		if (atomic_long_read(&vmap_lazy_nr) < resched_threshold)
			cond_resched_lock(&free_vmap_area_lock);
	}
	spin_unlock(&free_vmap_area_lock);
	return true;
}

/*
 * Kick off a purge of the outstanding lazy areas. Don't bother if somebody
 * is already purging.
 */
static void try_purge_vmap_area_lazy(void)
{
	if (mutex_trylock(&vmap_purge_lock)) {
		__purge_vmap_area_lazy(ULONG_MAX, 0);
		mutex_unlock(&vmap_purge_lock);
	}
}

/*
 * Kick off a purge of the outstanding lazy areas.
 */
static void purge_vmap_area_lazy(void)
{
	mutex_lock(&vmap_purge_lock);
	purge_fragmented_blocks_allcpus();
	__purge_vmap_area_lazy(ULONG_MAX, 0);
	mutex_unlock(&vmap_purge_lock);
}

/*
 * Free a vmap area, caller ensuring that the area has been unmapped
 * and flush_cache_vunmap had been called for the correct range
 * previously.
 */
static void free_vmap_area_noflush(struct vmap_area *va)
{
	unsigned long nr_lazy;

	spin_lock(&vmap_area_lock);
	unlink_va(va, &vmap_area_root);
	spin_unlock(&vmap_area_lock);

	nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >>
				PAGE_SHIFT, &vmap_lazy_nr);

	/* After this point, we may free va at any time */
	llist_add(&va->purge_list, &vmap_purge_list);

	if (unlikely(nr_lazy > lazy_max_pages()))
		try_purge_vmap_area_lazy();
}

/*
 * Free and unmap a vmap area
 */
static void free_unmap_vmap_area(struct vmap_area *va)
{
	flush_cache_vunmap(va->va_start, va->va_end);
	unmap_kernel_range_noflush(va->va_start, va->va_end - va->va_start);
	if (debug_pagealloc_enabled_static())
		flush_tlb_kernel_range(va->va_start, va->va_end);

	free_vmap_area_noflush(va);
}

static struct vmap_area *find_vmap_area(unsigned long addr)
{
	struct vmap_area *va;

	spin_lock(&vmap_area_lock);
	va = __find_vmap_area(addr);
	spin_unlock(&vmap_area_lock);

	return va;
}

/*** Per cpu kva allocator ***/

/*
 * vmap space is limited especially on 32 bit architectures. Ensure there is
 * room for at least 16 percpu vmap blocks per CPU.
 */
/*
 * If we had a constant VMALLOC_START and VMALLOC_END, we'd like to be able
 * to #define VMALLOC_SPACE		(VMALLOC_END-VMALLOC_START). Guess
 * instead (we just need a rough idea)
 */
#if BITS_PER_LONG == 32
#define VMALLOC_SPACE		(128UL*1024*1024)
#else
#define VMALLOC_SPACE		(128UL*1024*1024*1024)
#endif

#define VMALLOC_PAGES		(VMALLOC_SPACE / PAGE_SIZE)
#define VMAP_MAX_ALLOC		BITS_PER_LONG	/* 256K with 4K pages */
#define VMAP_BBMAP_BITS_MAX	1024	/* 4MB with 4K pages */
#define VMAP_BBMAP_BITS_MIN	(VMAP_MAX_ALLOC*2)
#define VMAP_MIN(x, y)		((x) < (y) ? (x) : (y)) /* can't use min() */
#define VMAP_MAX(x, y)		((x) > (y) ? (x) : (y)) /* can't use max() */
#define VMAP_BBMAP_BITS		\
		VMAP_MIN(VMAP_BBMAP_BITS_MAX,	\
		VMAP_MAX(VMAP_BBMAP_BITS_MIN,	\
			VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16))

#define VMAP_BLOCK_SIZE		(VMAP_BBMAP_BITS * PAGE_SIZE)

struct vmap_block_queue {
	spinlock_t lock;
	struct list_head free;
};

struct vmap_block {
	spinlock_t lock;
	struct vmap_area *va;
	unsigned long free, dirty;
	unsigned long dirty_min, dirty_max; /*< dirty range */
	struct list_head free_list;
	struct rcu_head rcu_head;
	struct list_head purge;
};

/* Queue of free and dirty vmap blocks, for allocation and flushing purposes */
static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue);

/*
 * XArray of vmap blocks, indexed by address, to quickly find a vmap block
 * in the free path. Could get rid of this if we change the API to return a
 * "cookie" from alloc, to be passed to free. But no big deal yet.
 */
static DEFINE_XARRAY(vmap_blocks);

/*
 * We should probably have a fallback mechanism to allocate virtual memory
 * out of partially filled vmap blocks. However vmap block sizing should be
 * fairly reasonable according to the vmalloc size, so it shouldn't be a
 * big problem.
 */

static unsigned long addr_to_vb_idx(unsigned long addr)
{
	addr -= VMALLOC_START & ~(VMAP_BLOCK_SIZE-1);
	addr /= VMAP_BLOCK_SIZE;
	return addr;
}

static void *vmap_block_vaddr(unsigned long va_start, unsigned long pages_off)
{
	unsigned long addr;

	addr = va_start + (pages_off << PAGE_SHIFT);
	BUG_ON(addr_to_vb_idx(addr) != addr_to_vb_idx(va_start));
	return (void *)addr;
}

/**
 * new_vmap_block - allocates a new vmap_block and occupies 2^order pages in
 *                  this block. Of course the number of pages can't exceed
 *                  VMAP_BBMAP_BITS.
 * @order:    how many 2^order pages should be occupied in newly allocated block
 * @gfp_mask: flags for the page level allocator
 *
 * Return: virtual address in a newly allocated block or ERR_PTR(-errno)
 */
static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
{
	struct vmap_block_queue *vbq;
	struct vmap_block *vb;
	struct vmap_area *va;
	unsigned long vb_idx;
	int node, err;
	void *vaddr;

	node = numa_node_id();

	vb = kmalloc_node(sizeof(struct vmap_block),
			gfp_mask & GFP_RECLAIM_MASK, node);
	if (unlikely(!vb))
		return ERR_PTR(-ENOMEM);

	va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
					VMALLOC_START, VMALLOC_END,
					node, gfp_mask);
	if (IS_ERR(va)) {
		kfree(vb);
		return ERR_CAST(va);
	}

	vaddr = vmap_block_vaddr(va->va_start, 0);
	spin_lock_init(&vb->lock);
	vb->va = va;
	/* At least something should be left free */
	BUG_ON(VMAP_BBMAP_BITS <= (1UL << order));
	vb->free = VMAP_BBMAP_BITS - (1UL << order);
	vb->dirty = 0;
	vb->dirty_min = VMAP_BBMAP_BITS;
	vb->dirty_max = 0;
	INIT_LIST_HEAD(&vb->free_list);

	vb_idx = addr_to_vb_idx(va->va_start);
	err = xa_insert(&vmap_blocks, vb_idx, vb, gfp_mask);
	if (err) {
		kfree(vb);
		free_vmap_area(va);
		return ERR_PTR(err);
	}

	vbq = &get_cpu_var(vmap_block_queue);
	spin_lock(&vbq->lock);
	list_add_tail_rcu(&vb->free_list, &vbq->free);
	spin_unlock(&vbq->lock);
	put_cpu_var(vmap_block_queue);

	return vaddr;
}

static void free_vmap_block(struct vmap_block *vb)
{
	struct vmap_block *tmp;

	tmp = xa_erase(&vmap_blocks, addr_to_vb_idx(vb->va->va_start));
	BUG_ON(tmp != vb);

	free_vmap_area_noflush(vb->va);
	kfree_rcu(vb, rcu_head);
}

static void purge_fragmented_blocks(int cpu)
{
	LIST_HEAD(purge);
	struct vmap_block *vb;
	struct vmap_block *n_vb;
	struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);

	rcu_read_lock();
	list_for_each_entry_rcu(vb, &vbq->free, free_list) {

		if (!(vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS))
			continue;

		spin_lock(&vb->lock);
		if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) {
			vb->free = 0; /* prevent further allocs after releasing lock */
			vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */
			vb->dirty_min = 0;
			vb->dirty_max = VMAP_BBMAP_BITS;
			spin_lock(&vbq->lock);
			list_del_rcu(&vb->free_list);
			spin_unlock(&vbq->lock);
			spin_unlock(&vb->lock);
			list_add_tail(&vb->purge, &purge);
		} else
			spin_unlock(&vb->lock);
	}
	rcu_read_unlock();

	list_for_each_entry_safe(vb, n_vb, &purge, purge) {
		list_del(&vb->purge);
		free_vmap_block(vb);
	}
}

static void purge_fragmented_blocks_allcpus(void)
{
	int cpu;

	for_each_possible_cpu(cpu)
		purge_fragmented_blocks(cpu);
}

static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
{
	struct vmap_block_queue *vbq;
	struct vmap_block *vb;
	void *vaddr = NULL;
	unsigned int order;

	BUG_ON(offset_in_page(size));
	BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
	if (WARN_ON(size == 0)) {
		/*
		 * Allocating 0 bytes isn't what caller wants since
		 * get_order(0) returns funny result. Just warn and terminate
		 * early.
		 */
		return NULL;
	}
	order = get_order(size);

	rcu_read_lock();
	vbq = &get_cpu_var(vmap_block_queue);
	list_for_each_entry_rcu(vb, &vbq->free, free_list) {
		unsigned long pages_off;

		spin_lock(&vb->lock);
		if (vb->free < (1UL << order)) {
			spin_unlock(&vb->lock);
			continue;
		}

		pages_off = VMAP_BBMAP_BITS - vb->free;
		vaddr = vmap_block_vaddr(vb->va->va_start, pages_off);
		vb->free -= 1UL << order;
		if (vb->free == 0) {
			spin_lock(&vbq->lock);
			list_del_rcu(&vb->free_list);
			spin_unlock(&vbq->lock);
		}

		spin_unlock(&vb->lock);
		break;
	}

	put_cpu_var(vmap_block_queue);
	rcu_read_unlock();

	/* Allocate new block if nothing was found */
	if (!vaddr)
		vaddr = new_vmap_block(order, gfp_mask);

	return vaddr;
}

static void vb_free(unsigned long addr, unsigned long size)
{
	unsigned long offset;
	unsigned int order;
	struct vmap_block *vb;

	BUG_ON(offset_in_page(size));
	BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);

	flush_cache_vunmap(addr, addr + size);

	order = get_order(size);
	offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT;
	vb = xa_load(&vmap_blocks, addr_to_vb_idx(addr));

	unmap_kernel_range_noflush(addr, size);

	if (debug_pagealloc_enabled_static())
		flush_tlb_kernel_range(addr, addr + size);

	spin_lock(&vb->lock);

	/* Expand dirty range */
	vb->dirty_min = min(vb->dirty_min, offset);
	vb->dirty_max = max(vb->dirty_max, offset + (1UL << order));

	vb->dirty += 1UL << order;
	if (vb->dirty == VMAP_BBMAP_BITS) {
		BUG_ON(vb->free);
		spin_unlock(&vb->lock);
		free_vmap_block(vb);
	} else
		spin_unlock(&vb->lock);
}

static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush)
{
	int cpu;

	if (unlikely(!vmap_initialized))
		return;

	might_sleep();

	for_each_possible_cpu(cpu) {
		struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
		struct vmap_block *vb;

		rcu_read_lock();
		list_for_each_entry_rcu(vb, &vbq->free, free_list) {
			spin_lock(&vb->lock);
			if (vb->dirty) {
				unsigned long va_start = vb->va->va_start;
				unsigned long s, e;

				s = va_start + (vb->dirty_min << PAGE_SHIFT);
				e = va_start + (vb->dirty_max << PAGE_SHIFT);

				start = min(s, start);
				end   = max(e, end);

				flush = 1;
			}
			spin_unlock(&vb->lock);
		}
		rcu_read_unlock();
	}

	mutex_lock(&vmap_purge_lock);
	purge_fragmented_blocks_allcpus();
	if (!__purge_vmap_area_lazy(start, end) && flush)
		flush_tlb_kernel_range(start, end);
	mutex_unlock(&vmap_purge_lock);
}

/**
 * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer
 *
 * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily
 * to amortize TLB flushing overheads. What this means is that any page you
 * have now, may, in a former life, have been mapped into kernel virtual
 * address by the vmap layer and so there might be some CPUs with TLB entries
 * still referencing that page (additional to the regular 1:1 kernel mapping).
 *
 * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can
 * be sure that none of the pages we have control over will have any aliases
 * from the vmap layer.
 */
void vm_unmap_aliases(void)
{
	unsigned long start = ULONG_MAX, end = 0;
	int flush = 0;

	_vm_unmap_aliases(start, end, flush);
}
EXPORT_SYMBOL_GPL(vm_unmap_aliases);

/**
 * vm_unmap_ram - unmap linear kernel address space set up by vm_map_ram
 * @mem: the pointer returned by vm_map_ram
 * @count: the count passed to that vm_map_ram call (cannot unmap partial)
 */
void vm_unmap_ram(const void *mem, unsigned int count)
{
	unsigned long size = (unsigned long)count << PAGE_SHIFT;
	unsigned long addr = (unsigned long)mem;
	struct vmap_area *va;

	might_sleep();
	BUG_ON(!addr);
	BUG_ON(addr < VMALLOC_START);
	BUG_ON(addr > VMALLOC_END);
	BUG_ON(!PAGE_ALIGNED(addr));

	kasan_poison_vmalloc(mem, size);

	if (likely(count <= VMAP_MAX_ALLOC)) {
		debug_check_no_locks_freed(mem, size);
		vb_free(addr, size);
		return;
	}

	va = find_vmap_area(addr);
	BUG_ON(!va);
	debug_check_no_locks_freed((void *)va->va_start,
				    (va->va_end - va->va_start));
	free_unmap_vmap_area(va);
}
EXPORT_SYMBOL(vm_unmap_ram);

/**
 * vm_map_ram - map pages linearly into kernel virtual address (vmalloc space)
 * @pages: an array of pointers to the pages to be mapped
 * @count: number of pages
 * @node: prefer to allocate data structures on this node
 *
 * If you use this function for less than VMAP_MAX_ALLOC pages, it could be
 * faster than vmap so it's good. But if you mix long-life and short-life
 * objects with vm_map_ram(), it could consume lots of address space through
 * fragmentation (especially on a 32bit machine). You could see failures in
 * the end. Please use this function for short-lived objects.
 *
 * Returns: a pointer to the address that has been mapped, or %NULL on failure
 */
void *vm_map_ram(struct page **pages, unsigned int count, int node)
{
	unsigned long size = (unsigned long)count << PAGE_SHIFT;
	unsigned long addr;
	void *mem;

	if (likely(count <= VMAP_MAX_ALLOC)) {
		mem = vb_alloc(size, GFP_KERNEL);
		if (IS_ERR(mem))
			return NULL;
		addr = (unsigned long)mem;
	} else {
		struct vmap_area *va;
		va = alloc_vmap_area(size, PAGE_SIZE,
				VMALLOC_START, VMALLOC_END, node, GFP_KERNEL);
		if (IS_ERR(va))
			return NULL;

		addr = va->va_start;
		mem = (void *)addr;
	}

	kasan_unpoison_vmalloc(mem, size);

	if (map_kernel_range(addr, size, PAGE_KERNEL, pages) < 0) {
		vm_unmap_ram(mem, count);
		return NULL;
	}
	return mem;
}
EXPORT_SYMBOL(vm_map_ram);
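/*
 * Usage sketch (not part of the original file): typical short-lived use of
 * vm_map_ram()/vm_unmap_ram() on pages the caller already owns. The function
 * name is hypothetical; note that the same count must be passed to both
 * calls, and that this API is intended for mappings torn down soon after.
 */
static void __maybe_unused example_map_two_pages(struct page *p0,
						 struct page *p1)
{
	struct page *pages[2] = { p0, p1 };
	void *va;

	va = vm_map_ram(pages, 2, NUMA_NO_NODE);	/* 2 <= VMAP_MAX_ALLOC */
	if (!va)
		return;

	/* ... access both pages contiguously through va ... */

	vm_unmap_ram(va, 2);	/* must use the same count as vm_map_ram() */
}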
static struct vm_struct *vmlist __initdata;

/**
 * vm_area_add_early - add vmap area early during boot
 * @vm: vm_struct to add
 *
 * This function is used to add fixed kernel vm area to vmlist before
 * vmalloc_init() is called. @vm->addr, @vm->size, and @vm->flags
 * should contain proper values and the other fields should be zero.
 *
 * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
 */
void __init vm_area_add_early(struct vm_struct *vm)
{
	struct vm_struct *tmp, **p;

	BUG_ON(vmap_initialized);
	for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
		if (tmp->addr >= vm->addr) {
			BUG_ON(tmp->addr < vm->addr + vm->size);
			break;
		} else
			BUG_ON(tmp->addr + tmp->size > vm->addr);
	}
	vm->next = *p;
	*p = vm;
}

/**
 * vm_area_register_early - register vmap area early during boot
 * @vm: vm_struct to register
 * @align: requested alignment
 *
 * This function is used to register kernel vm area before
 * vmalloc_init() is called. @vm->size and @vm->flags should contain
 * proper values on entry and other fields should be zero. On return,
 * vm->addr contains the allocated address.
 *
 * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
 */
void __init vm_area_register_early(struct vm_struct *vm, size_t align)
{
	static size_t vm_init_off __initdata;
	unsigned long addr;

	addr = ALIGN(VMALLOC_START + vm_init_off, align);
	vm_init_off = PFN_ALIGN(addr + vm->size) - VMALLOC_START;

	vm->addr = (void *)addr;

	vm_area_add_early(vm);
}
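/*
 * Usage sketch (not part of the original file): how an early-boot caller,
 * such as the percpu first-chunk setup, typically uses the API above. Only
 * size and flags are filled in; vm_area_register_early() picks the address.
 * The static vm_struct and the init function are hypothetical.
 */
static struct vm_struct example_early_vm;

static void __init __maybe_unused example_reserve_early_area(unsigned long size)
{
	example_early_vm.flags = VM_ALLOC;
	example_early_vm.size = PAGE_ALIGN(size);

	vm_area_register_early(&example_early_vm, PAGE_SIZE);
	/* example_early_vm.addr now holds the reserved address. */
}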
2017 */ 2018 void unmap_kernel_range(unsigned long addr, unsigned long size) 2019 { 2020 unsigned long end = addr + size; 2021 2022 flush_cache_vunmap(addr, end); 2023 unmap_kernel_range_noflush(addr, size); 2024 flush_tlb_kernel_range(addr, end); 2025 } 2026 2027 static inline void setup_vmalloc_vm_locked(struct vm_struct *vm, 2028 struct vmap_area *va, unsigned long flags, const void *caller) 2029 { 2030 vm->flags = flags; 2031 vm->addr = (void *)va->va_start; 2032 vm->size = va->va_end - va->va_start; 2033 vm->caller = caller; 2034 va->vm = vm; 2035 } 2036 2037 static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, 2038 unsigned long flags, const void *caller) 2039 { 2040 spin_lock(&vmap_area_lock); 2041 setup_vmalloc_vm_locked(vm, va, flags, caller); 2042 spin_unlock(&vmap_area_lock); 2043 } 2044 2045 static void clear_vm_uninitialized_flag(struct vm_struct *vm) 2046 { 2047 /* 2048 * Before removing VM_UNINITIALIZED, 2049 * we should make sure that vm has proper values. 2050 * Pair with smp_rmb() in show_numa_info(). 2051 */ 2052 smp_wmb(); 2053 vm->flags &= ~VM_UNINITIALIZED; 2054 } 2055 2056 static struct vm_struct *__get_vm_area_node(unsigned long size, 2057 unsigned long align, unsigned long flags, unsigned long start, 2058 unsigned long end, int node, gfp_t gfp_mask, const void *caller) 2059 { 2060 struct vmap_area *va; 2061 struct vm_struct *area; 2062 unsigned long requested_size = size; 2063 2064 BUG_ON(in_interrupt()); 2065 size = PAGE_ALIGN(size); 2066 if (unlikely(!size)) 2067 return NULL; 2068 2069 if (flags & VM_IOREMAP) 2070 align = 1ul << clamp_t(int, get_count_order_long(size), 2071 PAGE_SHIFT, IOREMAP_MAX_ORDER); 2072 2073 area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node); 2074 if (unlikely(!area)) 2075 return NULL; 2076 2077 if (!(flags & VM_NO_GUARD)) 2078 size += PAGE_SIZE; 2079 2080 va = alloc_vmap_area(size, align, start, end, node, gfp_mask); 2081 if (IS_ERR(va)) { 2082 kfree(area); 2083 return NULL; 2084 } 2085 2086 kasan_unpoison_vmalloc((void *)va->va_start, requested_size); 2087 2088 setup_vmalloc_vm(area, va, flags, caller); 2089 2090 return area; 2091 } 2092 2093 struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, 2094 unsigned long start, unsigned long end, 2095 const void *caller) 2096 { 2097 return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE, 2098 GFP_KERNEL, caller); 2099 } 2100 2101 /** 2102 * get_vm_area - reserve a contiguous kernel virtual area 2103 * @size: size of the area 2104 * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC 2105 * 2106 * Search an area of @size in the kernel virtual mapping area, 2107 * and reserved it for out purposes. Returns the area descriptor 2108 * on success or %NULL on failure. 2109 * 2110 * Return: the area descriptor on success or %NULL on failure. 2111 */ 2112 struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) 2113 { 2114 return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, 2115 NUMA_NO_NODE, GFP_KERNEL, 2116 __builtin_return_address(0)); 2117 } 2118 2119 struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, 2120 const void *caller) 2121 { 2122 return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, 2123 NUMA_NO_NODE, GFP_KERNEL, caller); 2124 } 2125 2126 /** 2127 * find_vm_area - find a continuous kernel virtual area 2128 * @addr: base address 2129 * 2130 * Search for the kernel VM area starting at @addr, and return it. 
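 *
 * Illustrative sketch (p is a hypothetical vmalloc() result):
 *
 *        struct vm_struct *area = find_vm_area(p);
 *
 *        if (area)
 *                pr_debug("%p spans %lu bytes\n", area->addr, area->size);
 *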
2131 * It is up to the caller to do all required locking to keep the returned 2132 * pointer valid. 2133 * 2134 * Return: the area descriptor on success or %NULL on failure. 2135 */ 2136 struct vm_struct *find_vm_area(const void *addr) 2137 { 2138 struct vmap_area *va; 2139 2140 va = find_vmap_area((unsigned long)addr); 2141 if (!va) 2142 return NULL; 2143 2144 return va->vm; 2145 } 2146 2147 /** 2148 * remove_vm_area - find and remove a continuous kernel virtual area 2149 * @addr: base address 2150 * 2151 * Search for the kernel VM area starting at @addr, and remove it. 2152 * This function returns the found VM area, but using it is NOT safe 2153 * on SMP machines, except for its size or flags. 2154 * 2155 * Return: the area descriptor on success or %NULL on failure. 2156 */ 2157 struct vm_struct *remove_vm_area(const void *addr) 2158 { 2159 struct vmap_area *va; 2160 2161 might_sleep(); 2162 2163 spin_lock(&vmap_area_lock); 2164 va = __find_vmap_area((unsigned long)addr); 2165 if (va && va->vm) { 2166 struct vm_struct *vm = va->vm; 2167 2168 va->vm = NULL; 2169 spin_unlock(&vmap_area_lock); 2170 2171 kasan_free_shadow(vm); 2172 free_unmap_vmap_area(va); 2173 2174 return vm; 2175 } 2176 2177 spin_unlock(&vmap_area_lock); 2178 return NULL; 2179 } 2180 2181 static inline void set_area_direct_map(const struct vm_struct *area, 2182 int (*set_direct_map)(struct page *page)) 2183 { 2184 int i; 2185 2186 for (i = 0; i < area->nr_pages; i++) 2187 if (page_address(area->pages[i])) 2188 set_direct_map(area->pages[i]); 2189 } 2190 2191 /* Handle removing and resetting vm mappings related to the vm_struct. */ 2192 static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages) 2193 { 2194 unsigned long start = ULONG_MAX, end = 0; 2195 int flush_reset = area->flags & VM_FLUSH_RESET_PERMS; 2196 int flush_dmap = 0; 2197 int i; 2198 2199 remove_vm_area(area->addr); 2200 2201 /* If this is not VM_FLUSH_RESET_PERMS memory, no need for the below. */ 2202 if (!flush_reset) 2203 return; 2204 2205 /* 2206 * If not deallocating pages, just do the flush of the VM area and 2207 * return. 2208 */ 2209 if (!deallocate_pages) { 2210 vm_unmap_aliases(); 2211 return; 2212 } 2213 2214 /* 2215 * If execution gets here, flush the vm mapping and reset the direct 2216 * map. Find the start and end range of the direct mappings to make sure 2217 * the vm_unmap_aliases() flush includes the direct map. 2218 */ 2219 for (i = 0; i < area->nr_pages; i++) { 2220 unsigned long addr = (unsigned long)page_address(area->pages[i]); 2221 if (addr) { 2222 start = min(addr, start); 2223 end = max(addr + PAGE_SIZE, end); 2224 flush_dmap = 1; 2225 } 2226 } 2227 2228 /* 2229 * Set direct map to something invalid so that it won't be cached if 2230 * there are any accesses after the TLB flush, then flush the TLB and 2231 * reset the direct map permissions to the default. 
2232 */ 2233 set_area_direct_map(area, set_direct_map_invalid_noflush); 2234 _vm_unmap_aliases(start, end, flush_dmap); 2235 set_area_direct_map(area, set_direct_map_default_noflush); 2236 } 2237 2238 static void __vunmap(const void *addr, int deallocate_pages) 2239 { 2240 struct vm_struct *area; 2241 2242 if (!addr) 2243 return; 2244 2245 if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n", 2246 addr)) 2247 return; 2248 2249 area = find_vm_area(addr); 2250 if (unlikely(!area)) { 2251 WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", 2252 addr); 2253 return; 2254 } 2255 2256 debug_check_no_locks_freed(area->addr, get_vm_area_size(area)); 2257 debug_check_no_obj_freed(area->addr, get_vm_area_size(area)); 2258 2259 kasan_poison_vmalloc(area->addr, area->size); 2260 2261 vm_remove_mappings(area, deallocate_pages); 2262 2263 if (deallocate_pages) { 2264 int i; 2265 2266 for (i = 0; i < area->nr_pages; i++) { 2267 struct page *page = area->pages[i]; 2268 2269 BUG_ON(!page); 2270 __free_pages(page, 0); 2271 } 2272 atomic_long_sub(area->nr_pages, &nr_vmalloc_pages); 2273 2274 kvfree(area->pages); 2275 } 2276 2277 kfree(area); 2278 return; 2279 } 2280 2281 static inline void __vfree_deferred(const void *addr) 2282 { 2283 /* 2284 * Use raw_cpu_ptr() because this can be called from preemptible 2285 * context. Preemption is absolutely fine here, because the llist_add() 2286 * implementation is lockless, so it works even if we are adding to 2287 * another cpu's list. schedule_work() should be fine with this too. 2288 */ 2289 struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred); 2290 2291 if (llist_add((struct llist_node *)addr, &p->list)) 2292 schedule_work(&p->wq); 2293 } 2294 2295 /** 2296 * vfree_atomic - release memory allocated by vmalloc() 2297 * @addr: memory base address 2298 * 2299 * This one is just like vfree() but can be called in any atomic context 2300 * except NMIs. 2301 */ 2302 void vfree_atomic(const void *addr) 2303 { 2304 BUG_ON(in_nmi()); 2305 2306 kmemleak_free(addr); 2307 2308 if (!addr) 2309 return; 2310 __vfree_deferred(addr); 2311 } 2312 2313 static void __vfree(const void *addr) 2314 { 2315 if (unlikely(in_interrupt())) 2316 __vfree_deferred(addr); 2317 else 2318 __vunmap(addr, 1); 2319 } 2320 2321 /** 2322 * vfree - Release memory allocated by vmalloc() 2323 * @addr: Memory base address 2324 * 2325 * Free the virtually continuous memory area starting at @addr, as obtained 2326 * from one of the vmalloc() family of APIs. This will usually also free the 2327 * physical memory underlying the virtual allocation, but that memory is 2328 * reference counted, so it will not be freed until the last user goes away. 2329 * 2330 * If @addr is NULL, no operation is performed. 2331 * 2332 * Context: 2333 * May sleep if called *not* from interrupt context. 2334 * Must not be called in NMI context (strictly speaking, it could be 2335 * if we have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling 2336 * conventions for vfree() arch-depenedent would be a really bad idea). 2337 */ 2338 void vfree(const void *addr) 2339 { 2340 BUG_ON(in_nmi()); 2341 2342 kmemleak_free(addr); 2343 2344 might_sleep_if(!in_interrupt()); 2345 2346 if (!addr) 2347 return; 2348 2349 __vfree(addr); 2350 } 2351 EXPORT_SYMBOL(vfree); 2352 2353 /** 2354 * vunmap - release virtual mapping obtained by vmap() 2355 * @addr: memory base address 2356 * 2357 * Free the virtually contiguous memory area starting at @addr, 2358 * which was created from the page array passed to vmap(). 
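 *
 * Illustrative pairing (pages[] is a hypothetical array of count page
 * pointers filled in by the caller):
 *
 *        void *va = vmap(pages, count, VM_MAP, PAGE_KERNEL);
 *
 *        if (va) {
 *                ...
 *                vunmap(va);
 *        }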
2359 * 2360 * Must not be called in interrupt context. 2361 */ 2362 void vunmap(const void *addr) 2363 { 2364 BUG_ON(in_interrupt()); 2365 might_sleep(); 2366 if (addr) 2367 __vunmap(addr, 0); 2368 } 2369 EXPORT_SYMBOL(vunmap); 2370 2371 /** 2372 * vmap - map an array of pages into virtually contiguous space 2373 * @pages: array of page pointers 2374 * @count: number of pages to map 2375 * @flags: vm_area->flags 2376 * @prot: page protection for the mapping 2377 * 2378 * Maps @count pages from @pages into contiguous kernel virtual space. 2379 * If @flags contains %VM_MAP_PUT_PAGES the ownership of the pages array itself 2380 * (which must be kmalloc or vmalloc memory) and one reference per pages in it 2381 * are transferred from the caller to vmap(), and will be freed / dropped when 2382 * vfree() is called on the return value. 2383 * 2384 * Return: the address of the area or %NULL on failure 2385 */ 2386 void *vmap(struct page **pages, unsigned int count, 2387 unsigned long flags, pgprot_t prot) 2388 { 2389 struct vm_struct *area; 2390 unsigned long size; /* In bytes */ 2391 2392 might_sleep(); 2393 2394 if (count > totalram_pages()) 2395 return NULL; 2396 2397 size = (unsigned long)count << PAGE_SHIFT; 2398 area = get_vm_area_caller(size, flags, __builtin_return_address(0)); 2399 if (!area) 2400 return NULL; 2401 2402 if (map_kernel_range((unsigned long)area->addr, size, pgprot_nx(prot), 2403 pages) < 0) { 2404 vunmap(area->addr); 2405 return NULL; 2406 } 2407 2408 if (flags & VM_MAP_PUT_PAGES) 2409 area->pages = pages; 2410 return area->addr; 2411 } 2412 EXPORT_SYMBOL(vmap); 2413 2414 #ifdef CONFIG_VMAP_PFN 2415 struct vmap_pfn_data { 2416 unsigned long *pfns; 2417 pgprot_t prot; 2418 unsigned int idx; 2419 }; 2420 2421 static int vmap_pfn_apply(pte_t *pte, unsigned long addr, void *private) 2422 { 2423 struct vmap_pfn_data *data = private; 2424 2425 if (WARN_ON_ONCE(pfn_valid(data->pfns[data->idx]))) 2426 return -EINVAL; 2427 *pte = pte_mkspecial(pfn_pte(data->pfns[data->idx++], data->prot)); 2428 return 0; 2429 } 2430 2431 /** 2432 * vmap_pfn - map an array of PFNs into virtually contiguous space 2433 * @pfns: array of PFNs 2434 * @count: number of pages to map 2435 * @prot: page protection for the mapping 2436 * 2437 * Maps @count PFNs from @pfns into contiguous kernel virtual space and returns 2438 * the start address of the mapping. 2439 */ 2440 void *vmap_pfn(unsigned long *pfns, unsigned int count, pgprot_t prot) 2441 { 2442 struct vmap_pfn_data data = { .pfns = pfns, .prot = pgprot_nx(prot) }; 2443 struct vm_struct *area; 2444 2445 area = get_vm_area_caller(count * PAGE_SIZE, VM_IOREMAP, 2446 __builtin_return_address(0)); 2447 if (!area) 2448 return NULL; 2449 if (apply_to_page_range(&init_mm, (unsigned long)area->addr, 2450 count * PAGE_SIZE, vmap_pfn_apply, &data)) { 2451 free_vm_area(area); 2452 return NULL; 2453 } 2454 return area->addr; 2455 } 2456 EXPORT_SYMBOL_GPL(vmap_pfn); 2457 #endif /* CONFIG_VMAP_PFN */ 2458 2459 static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, 2460 pgprot_t prot, int node) 2461 { 2462 const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; 2463 unsigned int nr_pages = get_vm_area_size(area) >> PAGE_SHIFT; 2464 unsigned int array_size = nr_pages * sizeof(struct page *), i; 2465 struct page **pages; 2466 2467 gfp_mask |= __GFP_NOWARN; 2468 if (!(gfp_mask & (GFP_DMA | GFP_DMA32))) 2469 gfp_mask |= __GFP_HIGHMEM; 2470 2471 /* Please note that the recursion is strictly bounded. 
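 * Each nested __vmalloc_node() call only has to provide the page-pointer
 * array for the request one level up, which shrinks the allocation by a
 * factor of PAGE_SIZE / sizeof(struct page *) per level, so the nesting
 * collapses after a couple of levels at most.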
*/ 2472 if (array_size > PAGE_SIZE) { 2473 pages = __vmalloc_node(array_size, 1, nested_gfp, node, 2474 area->caller); 2475 } else { 2476 pages = kmalloc_node(array_size, nested_gfp, node); 2477 } 2478 2479 if (!pages) { 2480 remove_vm_area(area->addr); 2481 kfree(area); 2482 return NULL; 2483 } 2484 2485 area->pages = pages; 2486 area->nr_pages = nr_pages; 2487 2488 for (i = 0; i < area->nr_pages; i++) { 2489 struct page *page; 2490 2491 if (node == NUMA_NO_NODE) 2492 page = alloc_page(gfp_mask); 2493 else 2494 page = alloc_pages_node(node, gfp_mask, 0); 2495 2496 if (unlikely(!page)) { 2497 /* Successfully allocated i pages, free them in __vfree() */ 2498 area->nr_pages = i; 2499 atomic_long_add(area->nr_pages, &nr_vmalloc_pages); 2500 goto fail; 2501 } 2502 area->pages[i] = page; 2503 if (gfpflags_allow_blocking(gfp_mask)) 2504 cond_resched(); 2505 } 2506 atomic_long_add(area->nr_pages, &nr_vmalloc_pages); 2507 2508 if (map_kernel_range((unsigned long)area->addr, get_vm_area_size(area), 2509 prot, pages) < 0) 2510 goto fail; 2511 2512 return area->addr; 2513 2514 fail: 2515 warn_alloc(gfp_mask, NULL, 2516 "vmalloc: allocation failure, allocated %ld of %ld bytes", 2517 (area->nr_pages*PAGE_SIZE), area->size); 2518 __vfree(area->addr); 2519 return NULL; 2520 } 2521 2522 /** 2523 * __vmalloc_node_range - allocate virtually contiguous memory 2524 * @size: allocation size 2525 * @align: desired alignment 2526 * @start: vm area range start 2527 * @end: vm area range end 2528 * @gfp_mask: flags for the page level allocator 2529 * @prot: protection mask for the allocated pages 2530 * @vm_flags: additional vm area flags (e.g. %VM_NO_GUARD) 2531 * @node: node to use for allocation or NUMA_NO_NODE 2532 * @caller: caller's return address 2533 * 2534 * Allocate enough pages to cover @size from the page level 2535 * allocator with @gfp_mask flags. Map them into contiguous 2536 * kernel virtual space, using a pagetable protection of @prot. 2537 * 2538 * Return: the address of the area or %NULL on failure 2539 */ 2540 void *__vmalloc_node_range(unsigned long size, unsigned long align, 2541 unsigned long start, unsigned long end, gfp_t gfp_mask, 2542 pgprot_t prot, unsigned long vm_flags, int node, 2543 const void *caller) 2544 { 2545 struct vm_struct *area; 2546 void *addr; 2547 unsigned long real_size = size; 2548 2549 size = PAGE_ALIGN(size); 2550 if (!size || (size >> PAGE_SHIFT) > totalram_pages()) 2551 goto fail; 2552 2553 area = __get_vm_area_node(real_size, align, VM_ALLOC | VM_UNINITIALIZED | 2554 vm_flags, start, end, node, gfp_mask, caller); 2555 if (!area) 2556 goto fail; 2557 2558 addr = __vmalloc_area_node(area, gfp_mask, prot, node); 2559 if (!addr) 2560 return NULL; 2561 2562 /* 2563 * In this function, newly allocated vm_struct has VM_UNINITIALIZED 2564 * flag. It means that vm_struct is not fully initialized. 2565 * Now, it is fully initialized, so remove this flag here. 
2566 */ 2567 clear_vm_uninitialized_flag(area); 2568 2569 kmemleak_vmalloc(area, size, gfp_mask); 2570 2571 return addr; 2572 2573 fail: 2574 warn_alloc(gfp_mask, NULL, 2575 "vmalloc: allocation failure: %lu bytes", real_size); 2576 return NULL; 2577 } 2578 2579 /** 2580 * __vmalloc_node - allocate virtually contiguous memory 2581 * @size: allocation size 2582 * @align: desired alignment 2583 * @gfp_mask: flags for the page level allocator 2584 * @node: node to use for allocation or NUMA_NO_NODE 2585 * @caller: caller's return address 2586 * 2587 * Allocate enough pages to cover @size from the page level allocator with 2588 * @gfp_mask flags. Map them into contiguous kernel virtual space. 2589 * 2590 * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL 2591 * and __GFP_NOFAIL are not supported 2592 * 2593 * Any use of gfp flags outside of GFP_KERNEL should be consulted 2594 * with mm people. 2595 * 2596 * Return: pointer to the allocated memory or %NULL on error 2597 */ 2598 void *__vmalloc_node(unsigned long size, unsigned long align, 2599 gfp_t gfp_mask, int node, const void *caller) 2600 { 2601 return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, 2602 gfp_mask, PAGE_KERNEL, 0, node, caller); 2603 } 2604 /* 2605 * This is only for performance analysis of vmalloc and stress purpose. 2606 * It is required by vmalloc test module, therefore do not use it other 2607 * than that. 2608 */ 2609 #ifdef CONFIG_TEST_VMALLOC_MODULE 2610 EXPORT_SYMBOL_GPL(__vmalloc_node); 2611 #endif 2612 2613 void *__vmalloc(unsigned long size, gfp_t gfp_mask) 2614 { 2615 return __vmalloc_node(size, 1, gfp_mask, NUMA_NO_NODE, 2616 __builtin_return_address(0)); 2617 } 2618 EXPORT_SYMBOL(__vmalloc); 2619 2620 /** 2621 * vmalloc - allocate virtually contiguous memory 2622 * @size: allocation size 2623 * 2624 * Allocate enough pages to cover @size from the page level 2625 * allocator and map them into contiguous kernel virtual space. 2626 * 2627 * For tight control over page level allocator and protection flags 2628 * use __vmalloc() instead. 2629 * 2630 * Return: pointer to the allocated memory or %NULL on error 2631 */ 2632 void *vmalloc(unsigned long size) 2633 { 2634 return __vmalloc_node(size, 1, GFP_KERNEL, NUMA_NO_NODE, 2635 __builtin_return_address(0)); 2636 } 2637 EXPORT_SYMBOL(vmalloc); 2638 2639 /** 2640 * vzalloc - allocate virtually contiguous memory with zero fill 2641 * @size: allocation size 2642 * 2643 * Allocate enough pages to cover @size from the page level 2644 * allocator and map them into contiguous kernel virtual space. 2645 * The memory allocated is set to zero. 2646 * 2647 * For tight control over page level allocator and protection flags 2648 * use __vmalloc() instead. 2649 * 2650 * Return: pointer to the allocated memory or %NULL on error 2651 */ 2652 void *vzalloc(unsigned long size) 2653 { 2654 return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE, 2655 __builtin_return_address(0)); 2656 } 2657 EXPORT_SYMBOL(vzalloc); 2658 2659 /** 2660 * vmalloc_user - allocate zeroed virtually contiguous memory for userspace 2661 * @size: allocation size 2662 * 2663 * The resulting memory area is zeroed so it can be mapped to userspace 2664 * without leaking data. 
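 *
 * Illustrative sketch (buf and len are hypothetical; the VM_USERMAP flag
 * set here is what later lets remap_vmalloc_range() accept the area):
 *
 *        void *buf = vmalloc_user(len);
 *
 *        if (!buf)
 *                return -ENOMEM;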
2665 * 2666 * Return: pointer to the allocated memory or %NULL on error 2667 */ 2668 void *vmalloc_user(unsigned long size) 2669 { 2670 return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, 2671 GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL, 2672 VM_USERMAP, NUMA_NO_NODE, 2673 __builtin_return_address(0)); 2674 } 2675 EXPORT_SYMBOL(vmalloc_user); 2676 2677 /** 2678 * vmalloc_node - allocate memory on a specific node 2679 * @size: allocation size 2680 * @node: numa node 2681 * 2682 * Allocate enough pages to cover @size from the page level 2683 * allocator and map them into contiguous kernel virtual space. 2684 * 2685 * For tight control over page level allocator and protection flags 2686 * use __vmalloc() instead. 2687 * 2688 * Return: pointer to the allocated memory or %NULL on error 2689 */ 2690 void *vmalloc_node(unsigned long size, int node) 2691 { 2692 return __vmalloc_node(size, 1, GFP_KERNEL, node, 2693 __builtin_return_address(0)); 2694 } 2695 EXPORT_SYMBOL(vmalloc_node); 2696 2697 /** 2698 * vzalloc_node - allocate memory on a specific node with zero fill 2699 * @size: allocation size 2700 * @node: numa node 2701 * 2702 * Allocate enough pages to cover @size from the page level 2703 * allocator and map them into contiguous kernel virtual space. 2704 * The memory allocated is set to zero. 2705 * 2706 * Return: pointer to the allocated memory or %NULL on error 2707 */ 2708 void *vzalloc_node(unsigned long size, int node) 2709 { 2710 return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, node, 2711 __builtin_return_address(0)); 2712 } 2713 EXPORT_SYMBOL(vzalloc_node); 2714 2715 #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) 2716 #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL) 2717 #elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA) 2718 #define GFP_VMALLOC32 (GFP_DMA | GFP_KERNEL) 2719 #else 2720 /* 2721 * 64b systems should always have either DMA or DMA32 zones. For others 2722 * GFP_DMA32 should do the right thing and use the normal zone. 2723 */ 2724 #define GFP_VMALLOC32 GFP_DMA32 | GFP_KERNEL 2725 #endif 2726 2727 /** 2728 * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) 2729 * @size: allocation size 2730 * 2731 * Allocate enough 32bit PA addressable pages to cover @size from the 2732 * page level allocator and map them into contiguous kernel virtual space. 2733 * 2734 * Return: pointer to the allocated memory or %NULL on error 2735 */ 2736 void *vmalloc_32(unsigned long size) 2737 { 2738 return __vmalloc_node(size, 1, GFP_VMALLOC32, NUMA_NO_NODE, 2739 __builtin_return_address(0)); 2740 } 2741 EXPORT_SYMBOL(vmalloc_32); 2742 2743 /** 2744 * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory 2745 * @size: allocation size 2746 * 2747 * The resulting memory area is 32bit addressable and zeroed so it can be 2748 * mapped to userspace without leaking data. 2749 * 2750 * Return: pointer to the allocated memory or %NULL on error 2751 */ 2752 void *vmalloc_32_user(unsigned long size) 2753 { 2754 return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, 2755 GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, 2756 VM_USERMAP, NUMA_NO_NODE, 2757 __builtin_return_address(0)); 2758 } 2759 EXPORT_SYMBOL(vmalloc_32_user); 2760 2761 /* 2762 * small helper routine , copy contents to buf from addr. 2763 * If the page is not present, fill zero. 
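 * The copy below walks the range one page at a time so that each
 * vmalloc_to_page() lookup and kmap_atomic() window spans a single page.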
2764 */ 2765 2766 static int aligned_vread(char *buf, char *addr, unsigned long count) 2767 { 2768 struct page *p; 2769 int copied = 0; 2770 2771 while (count) { 2772 unsigned long offset, length; 2773 2774 offset = offset_in_page(addr); 2775 length = PAGE_SIZE - offset; 2776 if (length > count) 2777 length = count; 2778 p = vmalloc_to_page(addr); 2779 /* 2780 * To do safe access to this _mapped_ area, we need 2781 * lock. But adding lock here means that we need to add 2782 * overhead of vmalloc()/vfree() calles for this _debug_ 2783 * interface, rarely used. Instead of that, we'll use 2784 * kmap() and get small overhead in this access function. 2785 */ 2786 if (p) { 2787 /* 2788 * we can expect USER0 is not used (see vread/vwrite's 2789 * function description) 2790 */ 2791 void *map = kmap_atomic(p); 2792 memcpy(buf, map + offset, length); 2793 kunmap_atomic(map); 2794 } else 2795 memset(buf, 0, length); 2796 2797 addr += length; 2798 buf += length; 2799 copied += length; 2800 count -= length; 2801 } 2802 return copied; 2803 } 2804 2805 static int aligned_vwrite(char *buf, char *addr, unsigned long count) 2806 { 2807 struct page *p; 2808 int copied = 0; 2809 2810 while (count) { 2811 unsigned long offset, length; 2812 2813 offset = offset_in_page(addr); 2814 length = PAGE_SIZE - offset; 2815 if (length > count) 2816 length = count; 2817 p = vmalloc_to_page(addr); 2818 /* 2819 * To do safe access to this _mapped_ area, we need 2820 * lock. But adding lock here means that we need to add 2821 * overhead of vmalloc()/vfree() calles for this _debug_ 2822 * interface, rarely used. Instead of that, we'll use 2823 * kmap() and get small overhead in this access function. 2824 */ 2825 if (p) { 2826 /* 2827 * we can expect USER0 is not used (see vread/vwrite's 2828 * function description) 2829 */ 2830 void *map = kmap_atomic(p); 2831 memcpy(map + offset, buf, length); 2832 kunmap_atomic(map); 2833 } 2834 addr += length; 2835 buf += length; 2836 copied += length; 2837 count -= length; 2838 } 2839 return copied; 2840 } 2841 2842 /** 2843 * vread() - read vmalloc area in a safe way. 2844 * @buf: buffer for reading data 2845 * @addr: vm address. 2846 * @count: number of bytes to be read. 2847 * 2848 * This function checks that addr is a valid vmalloc'ed area, and 2849 * copy data from that area to a given buffer. If the given memory range 2850 * of [addr...addr+count) includes some valid address, data is copied to 2851 * proper area of @buf. If there are memory holes, they'll be zero-filled. 2852 * IOREMAP area is treated as memory hole and no copy is done. 2853 * 2854 * If [addr...addr+count) doesn't includes any intersects with alive 2855 * vm_struct area, returns 0. @buf should be kernel's buffer. 2856 * 2857 * Note: In usual ops, vread() is never necessary because the caller 2858 * should know vmalloc() area is valid and can use memcpy(). 2859 * This is for routines which have to access vmalloc area without 2860 * any information, as /dev/kmem. 
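 *
 * Illustrative sketch (kbuf is a hypothetical kernel buffer of at least len
 * bytes and target points somewhere in the vmalloc range):
 *
 *        if (vread(kbuf, target, len))
 *                consume(kbuf, len);
 *
 * where consume() stands for whatever the caller does with the snapshot;
 * holes in the range come back zero-filled.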
2861 * 2862 * Return: number of bytes for which addr and buf should be increased 2863 * (same number as @count) or %0 if [addr...addr+count) doesn't 2864 * include any intersection with valid vmalloc area 2865 */ 2866 long vread(char *buf, char *addr, unsigned long count) 2867 { 2868 struct vmap_area *va; 2869 struct vm_struct *vm; 2870 char *vaddr, *buf_start = buf; 2871 unsigned long buflen = count; 2872 unsigned long n; 2873 2874 /* Don't allow overflow */ 2875 if ((unsigned long) addr + count < count) 2876 count = -(unsigned long) addr; 2877 2878 spin_lock(&vmap_area_lock); 2879 list_for_each_entry(va, &vmap_area_list, list) { 2880 if (!count) 2881 break; 2882 2883 if (!va->vm) 2884 continue; 2885 2886 vm = va->vm; 2887 vaddr = (char *) vm->addr; 2888 if (addr >= vaddr + get_vm_area_size(vm)) 2889 continue; 2890 while (addr < vaddr) { 2891 if (count == 0) 2892 goto finished; 2893 *buf = '\0'; 2894 buf++; 2895 addr++; 2896 count--; 2897 } 2898 n = vaddr + get_vm_area_size(vm) - addr; 2899 if (n > count) 2900 n = count; 2901 if (!(vm->flags & VM_IOREMAP)) 2902 aligned_vread(buf, addr, n); 2903 else /* IOREMAP area is treated as memory hole */ 2904 memset(buf, 0, n); 2905 buf += n; 2906 addr += n; 2907 count -= n; 2908 } 2909 finished: 2910 spin_unlock(&vmap_area_lock); 2911 2912 if (buf == buf_start) 2913 return 0; 2914 /* zero-fill memory holes */ 2915 if (buf != buf_start + buflen) 2916 memset(buf, 0, buflen - (buf - buf_start)); 2917 2918 return buflen; 2919 } 2920 2921 /** 2922 * vwrite() - write vmalloc area in a safe way. 2923 * @buf: buffer for source data 2924 * @addr: vm address. 2925 * @count: number of bytes to be written. 2926 * 2927 * This function checks that addr is a valid vmalloc'ed area, and 2928 * copies data from a buffer to the given addr. If the specified range of 2929 * [addr...addr+count) includes some valid address, data is copied from 2930 * the proper area of @buf. If there are memory holes, nothing is copied 2931 * into them. An IOREMAP area is treated as a memory hole and no copy is done. 2932 * 2933 * If [addr...addr+count) doesn't include any intersection with a live 2934 * vm_struct area, this returns 0. @buf should be a kernel buffer. 2935 * 2936 * Note: In usual ops, vwrite() is never necessary because the caller 2937 * should know the vmalloc() area is valid and can use memcpy(). 2938 * This is for routines which have to access the vmalloc area without 2939 * any information, as /dev/kmem.
2940 * 2941 * Return: number of bytes for which addr and buf should be 2942 * increased (same number as @count) or %0 if [addr...addr+count) 2943 * doesn't include any intersection with valid vmalloc area 2944 */ 2945 long vwrite(char *buf, char *addr, unsigned long count) 2946 { 2947 struct vmap_area *va; 2948 struct vm_struct *vm; 2949 char *vaddr; 2950 unsigned long n, buflen; 2951 int copied = 0; 2952 2953 /* Don't allow overflow */ 2954 if ((unsigned long) addr + count < count) 2955 count = -(unsigned long) addr; 2956 buflen = count; 2957 2958 spin_lock(&vmap_area_lock); 2959 list_for_each_entry(va, &vmap_area_list, list) { 2960 if (!count) 2961 break; 2962 2963 if (!va->vm) 2964 continue; 2965 2966 vm = va->vm; 2967 vaddr = (char *) vm->addr; 2968 if (addr >= vaddr + get_vm_area_size(vm)) 2969 continue; 2970 while (addr < vaddr) { 2971 if (count == 0) 2972 goto finished; 2973 buf++; 2974 addr++; 2975 count--; 2976 } 2977 n = vaddr + get_vm_area_size(vm) - addr; 2978 if (n > count) 2979 n = count; 2980 if (!(vm->flags & VM_IOREMAP)) { 2981 aligned_vwrite(buf, addr, n); 2982 copied++; 2983 } 2984 buf += n; 2985 addr += n; 2986 count -= n; 2987 } 2988 finished: 2989 spin_unlock(&vmap_area_lock); 2990 if (!copied) 2991 return 0; 2992 return buflen; 2993 } 2994 2995 /** 2996 * remap_vmalloc_range_partial - map vmalloc pages to userspace 2997 * @vma: vma to cover 2998 * @uaddr: target user address to start at 2999 * @kaddr: virtual address of vmalloc kernel memory 3000 * @pgoff: offset from @kaddr to start at 3001 * @size: size of map area 3002 * 3003 * Returns: 0 for success, -Exxx on failure 3004 * 3005 * This function checks that @kaddr is a valid vmalloc'ed area, 3006 * and that it is big enough to cover the range starting at 3007 * @uaddr in @vma. Will return failure if that criteria isn't 3008 * met. 3009 * 3010 * Similar to remap_pfn_range() (see mm/memory.c) 3011 */ 3012 int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr, 3013 void *kaddr, unsigned long pgoff, 3014 unsigned long size) 3015 { 3016 struct vm_struct *area; 3017 unsigned long off; 3018 unsigned long end_index; 3019 3020 if (check_shl_overflow(pgoff, PAGE_SHIFT, &off)) 3021 return -EINVAL; 3022 3023 size = PAGE_ALIGN(size); 3024 3025 if (!PAGE_ALIGNED(uaddr) || !PAGE_ALIGNED(kaddr)) 3026 return -EINVAL; 3027 3028 area = find_vm_area(kaddr); 3029 if (!area) 3030 return -EINVAL; 3031 3032 if (!(area->flags & (VM_USERMAP | VM_DMA_COHERENT))) 3033 return -EINVAL; 3034 3035 if (check_add_overflow(size, off, &end_index) || 3036 end_index > get_vm_area_size(area)) 3037 return -EINVAL; 3038 kaddr += off; 3039 3040 do { 3041 struct page *page = vmalloc_to_page(kaddr); 3042 int ret; 3043 3044 ret = vm_insert_page(vma, uaddr, page); 3045 if (ret) 3046 return ret; 3047 3048 uaddr += PAGE_SIZE; 3049 kaddr += PAGE_SIZE; 3050 size -= PAGE_SIZE; 3051 } while (size > 0); 3052 3053 vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; 3054 3055 return 0; 3056 } 3057 EXPORT_SYMBOL(remap_vmalloc_range_partial); 3058 3059 /** 3060 * remap_vmalloc_range - map vmalloc pages to userspace 3061 * @vma: vma to cover (map full range of vma) 3062 * @addr: vmalloc memory 3063 * @pgoff: number of pages into addr before first page to map 3064 * 3065 * Returns: 0 for success, -Exxx on failure 3066 * 3067 * This function checks that addr is a valid vmalloc'ed area, and 3068 * that it is big enough to cover the vma. Will return failure if 3069 * that criteria isn't met. 
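 *
 * Illustrative ->mmap() sketch (my_mmap and my_buf are hypothetical; my_buf
 * is assumed to have been obtained earlier from vmalloc_user()):
 *
 *        static int my_mmap(struct file *file, struct vm_area_struct *vma)
 *        {
 *                return remap_vmalloc_range(vma, my_buf, vma->vm_pgoff);
 *        }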
3070 * 3071 * Similar to remap_pfn_range() (see mm/memory.c) 3072 */ 3073 int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, 3074 unsigned long pgoff) 3075 { 3076 return remap_vmalloc_range_partial(vma, vma->vm_start, 3077 addr, pgoff, 3078 vma->vm_end - vma->vm_start); 3079 } 3080 EXPORT_SYMBOL(remap_vmalloc_range); 3081 3082 void free_vm_area(struct vm_struct *area) 3083 { 3084 struct vm_struct *ret; 3085 ret = remove_vm_area(area->addr); 3086 BUG_ON(ret != area); 3087 kfree(area); 3088 } 3089 EXPORT_SYMBOL_GPL(free_vm_area); 3090 3091 #ifdef CONFIG_SMP 3092 static struct vmap_area *node_to_va(struct rb_node *n) 3093 { 3094 return rb_entry_safe(n, struct vmap_area, rb_node); 3095 } 3096 3097 /** 3098 * pvm_find_va_enclose_addr - find the vmap_area @addr belongs to 3099 * @addr: target address 3100 * 3101 * Returns: vmap_area if it is found. If there is no such area 3102 * the first highest(reverse order) vmap_area is returned 3103 * i.e. va->va_start < addr && va->va_end < addr or NULL 3104 * if there are no any areas before @addr. 3105 */ 3106 static struct vmap_area * 3107 pvm_find_va_enclose_addr(unsigned long addr) 3108 { 3109 struct vmap_area *va, *tmp; 3110 struct rb_node *n; 3111 3112 n = free_vmap_area_root.rb_node; 3113 va = NULL; 3114 3115 while (n) { 3116 tmp = rb_entry(n, struct vmap_area, rb_node); 3117 if (tmp->va_start <= addr) { 3118 va = tmp; 3119 if (tmp->va_end >= addr) 3120 break; 3121 3122 n = n->rb_right; 3123 } else { 3124 n = n->rb_left; 3125 } 3126 } 3127 3128 return va; 3129 } 3130 3131 /** 3132 * pvm_determine_end_from_reverse - find the highest aligned address 3133 * of free block below VMALLOC_END 3134 * @va: 3135 * in - the VA we start the search(reverse order); 3136 * out - the VA with the highest aligned end address. 3137 * 3138 * Returns: determined end address within vmap_area 3139 */ 3140 static unsigned long 3141 pvm_determine_end_from_reverse(struct vmap_area **va, unsigned long align) 3142 { 3143 unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); 3144 unsigned long addr; 3145 3146 if (likely(*va)) { 3147 list_for_each_entry_from_reverse((*va), 3148 &free_vmap_area_list, list) { 3149 addr = min((*va)->va_end & ~(align - 1), vmalloc_end); 3150 if ((*va)->va_start < addr) 3151 return addr; 3152 } 3153 } 3154 3155 return 0; 3156 } 3157 3158 /** 3159 * pcpu_get_vm_areas - allocate vmalloc areas for percpu allocator 3160 * @offsets: array containing offset of each area 3161 * @sizes: array containing size of each area 3162 * @nr_vms: the number of areas to allocate 3163 * @align: alignment, all entries in @offsets and @sizes must be aligned to this 3164 * 3165 * Returns: kmalloc'd vm_struct pointer array pointing to allocated 3166 * vm_structs on success, %NULL on failure 3167 * 3168 * Percpu allocator wants to use congruent vm areas so that it can 3169 * maintain the offsets among percpu areas. This function allocates 3170 * congruent vmalloc areas for it with GFP_KERNEL. These areas tend to 3171 * be scattered pretty far, distance between two areas easily going up 3172 * to gigabytes. To avoid interacting with regular vmallocs, these 3173 * areas are allocated from top. 3174 * 3175 * Despite its complicated look, this allocator is rather simple. It 3176 * does everything top-down and scans free blocks from the end looking 3177 * for matching base. While scanning, if any of the areas do not fit the 3178 * base address is pulled down to fit the area. 
Scanning is repeated till 3179 * all the areas fit and then all necessary data structures are inserted 3180 * and the result is returned. 3181 */ 3182 struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, 3183 const size_t *sizes, int nr_vms, 3184 size_t align) 3185 { 3186 const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align); 3187 const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); 3188 struct vmap_area **vas, *va; 3189 struct vm_struct **vms; 3190 int area, area2, last_area, term_area; 3191 unsigned long base, start, size, end, last_end, orig_start, orig_end; 3192 bool purged = false; 3193 enum fit_type type; 3194 3195 /* verify parameters and allocate data structures */ 3196 BUG_ON(offset_in_page(align) || !is_power_of_2(align)); 3197 for (last_area = 0, area = 0; area < nr_vms; area++) { 3198 start = offsets[area]; 3199 end = start + sizes[area]; 3200 3201 /* is everything aligned properly? */ 3202 BUG_ON(!IS_ALIGNED(offsets[area], align)); 3203 BUG_ON(!IS_ALIGNED(sizes[area], align)); 3204 3205 /* detect the area with the highest address */ 3206 if (start > offsets[last_area]) 3207 last_area = area; 3208 3209 for (area2 = area + 1; area2 < nr_vms; area2++) { 3210 unsigned long start2 = offsets[area2]; 3211 unsigned long end2 = start2 + sizes[area2]; 3212 3213 BUG_ON(start2 < end && start < end2); 3214 } 3215 } 3216 last_end = offsets[last_area] + sizes[last_area]; 3217 3218 if (vmalloc_end - vmalloc_start < last_end) { 3219 WARN_ON(true); 3220 return NULL; 3221 } 3222 3223 vms = kcalloc(nr_vms, sizeof(vms[0]), GFP_KERNEL); 3224 vas = kcalloc(nr_vms, sizeof(vas[0]), GFP_KERNEL); 3225 if (!vas || !vms) 3226 goto err_free2; 3227 3228 for (area = 0; area < nr_vms; area++) { 3229 vas[area] = kmem_cache_zalloc(vmap_area_cachep, GFP_KERNEL); 3230 vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL); 3231 if (!vas[area] || !vms[area]) 3232 goto err_free; 3233 } 3234 retry: 3235 spin_lock(&free_vmap_area_lock); 3236 3237 /* start scanning - we scan from the top, begin with the last area */ 3238 area = term_area = last_area; 3239 start = offsets[area]; 3240 end = start + sizes[area]; 3241 3242 va = pvm_find_va_enclose_addr(vmalloc_end); 3243 base = pvm_determine_end_from_reverse(&va, align) - end; 3244 3245 while (true) { 3246 /* 3247 * base might have underflowed, add last_end before 3248 * comparing. 3249 */ 3250 if (base + last_end < vmalloc_start + last_end) 3251 goto overflow; 3252 3253 /* 3254 * Fitting base has not been found. 3255 */ 3256 if (va == NULL) 3257 goto overflow; 3258 3259 /* 3260 * If required width exceeds current VA block, move 3261 * base downwards and then recheck. 3262 */ 3263 if (base + end > va->va_end) { 3264 base = pvm_determine_end_from_reverse(&va, align) - end; 3265 term_area = area; 3266 continue; 3267 } 3268 3269 /* 3270 * If this VA does not fit, move base downwards and recheck. 3271 */ 3272 if (base + start < va->va_start) { 3273 va = node_to_va(rb_prev(&va->rb_node)); 3274 base = pvm_determine_end_from_reverse(&va, align) - end; 3275 term_area = area; 3276 continue; 3277 } 3278 3279 /* 3280 * This area fits, move on to the previous one. If 3281 * the previous one is the terminal one, we're done. 
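 * (The walk below goes last_area, last_area - 1, ... modulo nr_vms, and
 * term_area is reset whenever base has to move, so getting all the way back
 * to term_area means every area fit at the current base.)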
3282 */ 3283 area = (area + nr_vms - 1) % nr_vms; 3284 if (area == term_area) 3285 break; 3286 3287 start = offsets[area]; 3288 end = start + sizes[area]; 3289 va = pvm_find_va_enclose_addr(base + end); 3290 } 3291 3292 /* we've found a fitting base, insert all va's */ 3293 for (area = 0; area < nr_vms; area++) { 3294 int ret; 3295 3296 start = base + offsets[area]; 3297 size = sizes[area]; 3298 3299 va = pvm_find_va_enclose_addr(start); 3300 if (WARN_ON_ONCE(va == NULL)) 3301 /* It is a BUG(), but trigger recovery instead. */ 3302 goto recovery; 3303 3304 type = classify_va_fit_type(va, start, size); 3305 if (WARN_ON_ONCE(type == NOTHING_FIT)) 3306 /* It is a BUG(), but trigger recovery instead. */ 3307 goto recovery; 3308 3309 ret = adjust_va_to_fit_type(va, start, size, type); 3310 if (unlikely(ret)) 3311 goto recovery; 3312 3313 /* Allocated area. */ 3314 va = vas[area]; 3315 va->va_start = start; 3316 va->va_end = start + size; 3317 } 3318 3319 spin_unlock(&free_vmap_area_lock); 3320 3321 /* populate the kasan shadow space */ 3322 for (area = 0; area < nr_vms; area++) { 3323 if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area])) 3324 goto err_free_shadow; 3325 3326 kasan_unpoison_vmalloc((void *)vas[area]->va_start, 3327 sizes[area]); 3328 } 3329 3330 /* insert all vm's */ 3331 spin_lock(&vmap_area_lock); 3332 for (area = 0; area < nr_vms; area++) { 3333 insert_vmap_area(vas[area], &vmap_area_root, &vmap_area_list); 3334 3335 setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC, 3336 pcpu_get_vm_areas); 3337 } 3338 spin_unlock(&vmap_area_lock); 3339 3340 kfree(vas); 3341 return vms; 3342 3343 recovery: 3344 /* 3345 * Remove previously allocated areas. There is no 3346 * need in removing these areas from the busy tree, 3347 * because they are inserted only on the final step 3348 * and when pcpu_get_vm_areas() is success. 3349 */ 3350 while (area--) { 3351 orig_start = vas[area]->va_start; 3352 orig_end = vas[area]->va_end; 3353 va = merge_or_add_vmap_area(vas[area], &free_vmap_area_root, 3354 &free_vmap_area_list); 3355 if (va) 3356 kasan_release_vmalloc(orig_start, orig_end, 3357 va->va_start, va->va_end); 3358 vas[area] = NULL; 3359 } 3360 3361 overflow: 3362 spin_unlock(&free_vmap_area_lock); 3363 if (!purged) { 3364 purge_vmap_area_lazy(); 3365 purged = true; 3366 3367 /* Before "retry", check if we recover. */ 3368 for (area = 0; area < nr_vms; area++) { 3369 if (vas[area]) 3370 continue; 3371 3372 vas[area] = kmem_cache_zalloc( 3373 vmap_area_cachep, GFP_KERNEL); 3374 if (!vas[area]) 3375 goto err_free; 3376 } 3377 3378 goto retry; 3379 } 3380 3381 err_free: 3382 for (area = 0; area < nr_vms; area++) { 3383 if (vas[area]) 3384 kmem_cache_free(vmap_area_cachep, vas[area]); 3385 3386 kfree(vms[area]); 3387 } 3388 err_free2: 3389 kfree(vas); 3390 kfree(vms); 3391 return NULL; 3392 3393 err_free_shadow: 3394 spin_lock(&free_vmap_area_lock); 3395 /* 3396 * We release all the vmalloc shadows, even the ones for regions that 3397 * hadn't been successfully added. This relies on kasan_release_vmalloc 3398 * being able to tolerate this case. 
3399 */ 3400 for (area = 0; area < nr_vms; area++) { 3401 orig_start = vas[area]->va_start; 3402 orig_end = vas[area]->va_end; 3403 va = merge_or_add_vmap_area(vas[area], &free_vmap_area_root, 3404 &free_vmap_area_list); 3405 if (va) 3406 kasan_release_vmalloc(orig_start, orig_end, 3407 va->va_start, va->va_end); 3408 vas[area] = NULL; 3409 kfree(vms[area]); 3410 } 3411 spin_unlock(&free_vmap_area_lock); 3412 kfree(vas); 3413 kfree(vms); 3414 return NULL; 3415 } 3416 3417 /** 3418 * pcpu_free_vm_areas - free vmalloc areas for percpu allocator 3419 * @vms: vm_struct pointer array returned by pcpu_get_vm_areas() 3420 * @nr_vms: the number of allocated areas 3421 * 3422 * Free vm_structs and the array allocated by pcpu_get_vm_areas(). 3423 */ 3424 void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms) 3425 { 3426 int i; 3427 3428 for (i = 0; i < nr_vms; i++) 3429 free_vm_area(vms[i]); 3430 kfree(vms); 3431 } 3432 #endif /* CONFIG_SMP */ 3433 3434 #ifdef CONFIG_PROC_FS 3435 static void *s_start(struct seq_file *m, loff_t *pos) 3436 __acquires(&vmap_purge_lock) 3437 __acquires(&vmap_area_lock) 3438 { 3439 mutex_lock(&vmap_purge_lock); 3440 spin_lock(&vmap_area_lock); 3441 3442 return seq_list_start(&vmap_area_list, *pos); 3443 } 3444 3445 static void *s_next(struct seq_file *m, void *p, loff_t *pos) 3446 { 3447 return seq_list_next(p, &vmap_area_list, pos); 3448 } 3449 3450 static void s_stop(struct seq_file *m, void *p) 3451 __releases(&vmap_purge_lock) 3452 __releases(&vmap_area_lock) 3453 { 3454 mutex_unlock(&vmap_purge_lock); 3455 spin_unlock(&vmap_area_lock); 3456 } 3457 3458 static void show_numa_info(struct seq_file *m, struct vm_struct *v) 3459 { 3460 if (IS_ENABLED(CONFIG_NUMA)) { 3461 unsigned int nr, *counters = m->private; 3462 3463 if (!counters) 3464 return; 3465 3466 if (v->flags & VM_UNINITIALIZED) 3467 return; 3468 /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */ 3469 smp_rmb(); 3470 3471 memset(counters, 0, nr_node_ids * sizeof(unsigned int)); 3472 3473 for (nr = 0; nr < v->nr_pages; nr++) 3474 counters[page_to_nid(v->pages[nr])]++; 3475 3476 for_each_node_state(nr, N_HIGH_MEMORY) 3477 if (counters[nr]) 3478 seq_printf(m, " N%u=%u", nr, counters[nr]); 3479 } 3480 } 3481 3482 static void show_purge_info(struct seq_file *m) 3483 { 3484 struct llist_node *head; 3485 struct vmap_area *va; 3486 3487 head = READ_ONCE(vmap_purge_list.first); 3488 if (head == NULL) 3489 return; 3490 3491 llist_for_each_entry(va, head, purge_list) { 3492 seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n", 3493 (void *)va->va_start, (void *)va->va_end, 3494 va->va_end - va->va_start); 3495 } 3496 } 3497 3498 static int s_show(struct seq_file *m, void *p) 3499 { 3500 struct vmap_area *va; 3501 struct vm_struct *v; 3502 3503 va = list_entry(p, struct vmap_area, list); 3504 3505 /* 3506 * s_show can encounter race with remove_vm_area, !vm on behalf 3507 * of vmap area is being tear down or vm_map_ram allocation. 
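 * Such entries still carry a valid va_start/va_end range, so they are
 * reported below as bare "vm_map_ram" lines without any vm_struct details.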
3508 */ 3509 if (!va->vm) { 3510 seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n", 3511 (void *)va->va_start, (void *)va->va_end, 3512 va->va_end - va->va_start); 3513 3514 return 0; 3515 } 3516 3517 v = va->vm; 3518 3519 seq_printf(m, "0x%pK-0x%pK %7ld", 3520 v->addr, v->addr + v->size, v->size); 3521 3522 if (v->caller) 3523 seq_printf(m, " %pS", v->caller); 3524 3525 if (v->nr_pages) 3526 seq_printf(m, " pages=%d", v->nr_pages); 3527 3528 if (v->phys_addr) 3529 seq_printf(m, " phys=%pa", &v->phys_addr); 3530 3531 if (v->flags & VM_IOREMAP) 3532 seq_puts(m, " ioremap"); 3533 3534 if (v->flags & VM_ALLOC) 3535 seq_puts(m, " vmalloc"); 3536 3537 if (v->flags & VM_MAP) 3538 seq_puts(m, " vmap"); 3539 3540 if (v->flags & VM_USERMAP) 3541 seq_puts(m, " user"); 3542 3543 if (v->flags & VM_DMA_COHERENT) 3544 seq_puts(m, " dma-coherent"); 3545 3546 if (is_vmalloc_addr(v->pages)) 3547 seq_puts(m, " vpages"); 3548 3549 show_numa_info(m, v); 3550 seq_putc(m, '\n'); 3551 3552 /* 3553 * As a final step, dump "unpurged" areas. Note, 3554 * that entire "/proc/vmallocinfo" output will not 3555 * be address sorted, because the purge list is not 3556 * sorted. 3557 */ 3558 if (list_is_last(&va->list, &vmap_area_list)) 3559 show_purge_info(m); 3560 3561 return 0; 3562 } 3563 3564 static const struct seq_operations vmalloc_op = { 3565 .start = s_start, 3566 .next = s_next, 3567 .stop = s_stop, 3568 .show = s_show, 3569 }; 3570 3571 static int __init proc_vmalloc_init(void) 3572 { 3573 if (IS_ENABLED(CONFIG_NUMA)) 3574 proc_create_seq_private("vmallocinfo", 0400, NULL, 3575 &vmalloc_op, 3576 nr_node_ids * sizeof(unsigned int), NULL); 3577 else 3578 proc_create_seq("vmallocinfo", 0400, NULL, &vmalloc_op); 3579 return 0; 3580 } 3581 module_init(proc_vmalloc_init); 3582 3583 #endif 3584
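
/*
 * Record format of /proc/vmallocinfo, reconstructed from the seq_printf()
 * calls in s_show() and show_numa_info() above (illustrative summary only;
 * each optional field appears when the corresponding data is present):
 *
 *        <start>-<end> <size in bytes> [<caller>] [pages=N] [phys=...]
 *        [ioremap] [vmalloc] [vmap] [user] [dma-coherent] [vpages]
 *        [N<node>=count ...]
 *
 * After the last busy area, any still-unpurged lazily freed ranges are
 * appended by show_purge_info() as "unpurged vm_area" lines.
 */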