// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/mm/vmalloc.c
 *
 *  Copyright (C) 1993  Linus Torvalds
 *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
 *  SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
 *  Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002
 *  Numa awareness, Christoph Lameter, SGI, June 2005
 */

#include <linux/vmalloc.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/highmem.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/interrupt.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/set_memory.h>
#include <linux/debugobjects.h>
#include <linux/kallsyms.h>
#include <linux/list.h>
#include <linux/notifier.h>
#include <linux/rbtree.h>
#include <linux/radix-tree.h>
#include <linux/rcupdate.h>
#include <linux/pfn.h>
#include <linux/kmemleak.h>
#include <linux/atomic.h>
#include <linux/compiler.h>
#include <linux/llist.h>
#include <linux/bitops.h>
#include <linux/rbtree_augmented.h>
#include <linux/overflow.h>

#include <linux/uaccess.h>
#include <asm/tlbflush.h>
#include <asm/shmparam.h>

#include "internal.h"

bool is_vmalloc_addr(const void *x)
{
	unsigned long addr = (unsigned long)x;

	return addr >= VMALLOC_START && addr < VMALLOC_END;
}
EXPORT_SYMBOL(is_vmalloc_addr);

struct vfree_deferred {
	struct llist_head list;
	struct work_struct wq;
};
static DEFINE_PER_CPU(struct vfree_deferred, vfree_deferred);

static void __vunmap(const void *, int);

static void free_work(struct work_struct *w)
{
	struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq);
	struct llist_node *t, *llnode;

	llist_for_each_safe(llnode, t, llist_del_all(&p->list))
		__vunmap((void *)llnode, 1);
}

/*** Page table manipulation functions ***/

static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end)
{
	pte_t *pte;

	pte = pte_offset_kernel(pmd, addr);
	do {
		pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte);
		WARN_ON(!pte_none(ptent) && !pte_present(ptent));
	} while (pte++, addr += PAGE_SIZE, addr != end);
}

static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_clear_huge(pmd))
			continue;
		if (pmd_none_or_clear_bad(pmd))
			continue;
		vunmap_pte_range(pmd, addr, next);
	} while (pmd++, addr = next, addr != end);
}

static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_offset(p4d, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_clear_huge(pud))
			continue;
		if (pud_none_or_clear_bad(pud))
			continue;
		vunmap_pmd_range(pud, addr, next);
	} while (pud++, addr = next, addr != end);
}

static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end)
{
	p4d_t *p4d;
	unsigned long next;

	p4d = p4d_offset(pgd, addr);
	do {
		next = p4d_addr_end(addr, end);
		if (p4d_clear_huge(p4d))
			continue;
		if (p4d_none_or_clear_bad(p4d))
			continue;
		vunmap_pud_range(p4d, addr, next);
	} while (p4d++, addr = next, addr != end);
}

static void vunmap_page_range(unsigned long addr, unsigned long end)
{
	pgd_t *pgd;
	unsigned long next;

	BUG_ON(addr >= end);
	pgd = pgd_offset_k(addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		vunmap_p4d_range(pgd, addr, next);
	} while (pgd++, addr = next, addr != end);
}

static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
		unsigned long end, pgprot_t prot, struct page **pages, int *nr)
{
	pte_t *pte;

	/*
	 * nr is a running index into the array which helps higher level
	 * callers keep track of where we're up to.
	 */

	pte = pte_alloc_kernel(pmd, addr);
	if (!pte)
		return -ENOMEM;
	do {
		struct page *page = pages[*nr];

		if (WARN_ON(!pte_none(*pte)))
			return -EBUSY;
		if (WARN_ON(!page))
			return -ENOMEM;
		set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
		(*nr)++;
	} while (pte++, addr += PAGE_SIZE, addr != end);
	return 0;
}

static int vmap_pmd_range(pud_t *pud, unsigned long addr,
		unsigned long end, pgprot_t prot, struct page **pages, int *nr)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_alloc(&init_mm, pud, addr);
	if (!pmd)
		return -ENOMEM;
	do {
		next = pmd_addr_end(addr, end);
		if (vmap_pte_range(pmd, addr, next, prot, pages, nr))
			return -ENOMEM;
	} while (pmd++, addr = next, addr != end);
	return 0;
}

static int vmap_pud_range(p4d_t *p4d, unsigned long addr,
		unsigned long end, pgprot_t prot, struct page **pages, int *nr)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_alloc(&init_mm, p4d, addr);
	if (!pud)
		return -ENOMEM;
	do {
		next = pud_addr_end(addr, end);
		if (vmap_pmd_range(pud, addr, next, prot, pages, nr))
			return -ENOMEM;
	} while (pud++, addr = next, addr != end);
	return 0;
}

static int vmap_p4d_range(pgd_t *pgd, unsigned long addr,
		unsigned long end, pgprot_t prot, struct page **pages, int *nr)
{
	p4d_t *p4d;
	unsigned long next;

	p4d = p4d_alloc(&init_mm, pgd, addr);
	if (!p4d)
		return -ENOMEM;
	do {
		next = p4d_addr_end(addr, end);
		if (vmap_pud_range(p4d, addr, next, prot, pages, nr))
			return -ENOMEM;
	} while (p4d++, addr = next, addr != end);
	return 0;
}

/*
 * Set up page tables in kva (addr, end). The ptes shall have prot "prot", and
 * will have pfns corresponding to the "pages" array.
 *
 * Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N]
 */
static int vmap_page_range_noflush(unsigned long start, unsigned long end,
				   pgprot_t prot, struct page **pages)
{
	pgd_t *pgd;
	unsigned long next;
	unsigned long addr = start;
	int err = 0;
	int nr = 0;

	BUG_ON(addr >= end);
	pgd = pgd_offset_k(addr);
	do {
		next = pgd_addr_end(addr, end);
		err = vmap_p4d_range(pgd, addr, next, prot, pages, &nr);
		if (err)
			return err;
	} while (pgd++, addr = next, addr != end);

	return nr;
}

static int vmap_page_range(unsigned long start, unsigned long end,
			   pgprot_t prot, struct page **pages)
{
	int ret;

	ret = vmap_page_range_noflush(start, end, prot, pages);
	flush_cache_vmap(start, end);
	return ret;
}
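
/*
 * Illustrative sketch (added for this walkthrough, not part of the original
 * file): how the helpers above are meant to be used. Given a KVA range that
 * has already been reserved and an array of pages, vmap_page_range() wires
 * up the page tables so that addr + N * PAGE_SIZE resolves to pages[N].
 * The origin of "addr" and "pages" is an assumption of the example.
 */
static int __maybe_unused example_map_pages(unsigned long addr,
		struct page **pages, unsigned int count)
{
	int ret;

	ret = vmap_page_range(addr, addr + ((unsigned long)count << PAGE_SHIFT),
			      PAGE_KERNEL, pages);

	/* vmap_page_range() returns the number of pages mapped or -errno. */
	return ret < 0 ? ret : 0;
}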

int is_vmalloc_or_module_addr(const void *x)
{
	/*
	 * ARM, x86-64 and sparc64 put modules in a special place,
	 * and fall back on vmalloc() if that fails. Others
	 * just put it in the vmalloc space.
	 */
#if defined(CONFIG_MODULES) && defined(MODULES_VADDR)
	unsigned long addr = (unsigned long)x;
	if (addr >= MODULES_VADDR && addr < MODULES_END)
		return 1;
#endif
	return is_vmalloc_addr(x);
}

/*
 * Walk a vmap address to the struct page it maps.
 */
struct page *vmalloc_to_page(const void *vmalloc_addr)
{
	unsigned long addr = (unsigned long) vmalloc_addr;
	struct page *page = NULL;
	pgd_t *pgd = pgd_offset_k(addr);
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *ptep, pte;

	/*
	 * XXX we might need to change this if we add VIRTUAL_BUG_ON for
	 * architectures that do not vmalloc module space
	 */
	VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr));

	if (pgd_none(*pgd))
		return NULL;
	p4d = p4d_offset(pgd, addr);
	if (p4d_none(*p4d))
		return NULL;
	pud = pud_offset(p4d, addr);

	/*
	 * Don't dereference bad PUD or PMD (below) entries. This will also
	 * identify huge mappings, which we may encounter on architectures
	 * that define CONFIG_HAVE_ARCH_HUGE_VMAP=y. Such regions will be
	 * identified as vmalloc addresses by is_vmalloc_addr(), but are
	 * not [unambiguously] associated with a struct page, so there is
	 * no correct value to return for them.
	 */
	WARN_ON_ONCE(pud_bad(*pud));
	if (pud_none(*pud) || pud_bad(*pud))
		return NULL;
	pmd = pmd_offset(pud, addr);
	WARN_ON_ONCE(pmd_bad(*pmd));
	if (pmd_none(*pmd) || pmd_bad(*pmd))
		return NULL;

	ptep = pte_offset_map(pmd, addr);
	pte = *ptep;
	if (pte_present(pte))
		page = pte_page(pte);
	pte_unmap(ptep);
	return page;
}
EXPORT_SYMBOL(vmalloc_to_page);

/*
 * Map a vmalloc()-space virtual address to the physical page frame number.
 */
unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
{
	return page_to_pfn(vmalloc_to_page(vmalloc_addr));
}
EXPORT_SYMBOL(vmalloc_to_pfn);
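
/*
 * Illustrative sketch (an addition for this walkthrough, not kernel code):
 * translating a vmalloc()'ed address back to its backing struct page and
 * physical frame number with the helpers above. The buffer and its use are
 * assumptions of the example.
 */
static void __maybe_unused example_vmalloc_to_page(void)
{
	void *buf = vmalloc(PAGE_SIZE);
	struct page *page;

	if (!buf)
		return;

	page = vmalloc_to_page(buf);	/* struct page backing buf */
	pr_info("vmalloc buf %p -> page %p, pfn %lu\n",
		buf, page, vmalloc_to_pfn(buf));

	vfree(buf);
}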

/*** Global kva allocator ***/

#define DEBUG_AUGMENT_PROPAGATE_CHECK 0
#define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0


static DEFINE_SPINLOCK(vmap_area_lock);
static DEFINE_SPINLOCK(free_vmap_area_lock);
/* Export for kexec only */
LIST_HEAD(vmap_area_list);
static LLIST_HEAD(vmap_purge_list);
static struct rb_root vmap_area_root = RB_ROOT;
static bool vmap_initialized __read_mostly;

/*
 * This kmem_cache is used for vmap_area objects. Instead of
 * allocating from slab we reuse an object from this cache to
 * make things faster, especially for the "no edge" splitting
 * of a free block.
 */
static struct kmem_cache *vmap_area_cachep;

/*
 * This linked list is used together with free_vmap_area_root.
 * It gives O(1) access to prev/next to perform fast coalescing.
 */
static LIST_HEAD(free_vmap_area_list);

/*
 * This augmented red-black tree represents the free vmap space.
 * All vmap_area objects in this tree are sorted by va->va_start
 * address. It is used for allocation and merging when a vmap
 * object is released.
 *
 * Each vmap_area node contains a maximum available free block
 * of its sub-tree, right or left. Therefore it is possible to
 * find the lowest-address free area that matches a request.
 */
static struct rb_root free_vmap_area_root = RB_ROOT;

/*
 * Preload a CPU with one object for the "no edge" split case. The
 * aim is to avoid allocations from atomic context, so that more
 * permissive allocation masks can be used.
 */
static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node);

static __always_inline unsigned long
va_size(struct vmap_area *va)
{
	return (va->va_end - va->va_start);
}

static __always_inline unsigned long
get_subtree_max_size(struct rb_node *node)
{
	struct vmap_area *va;

	va = rb_entry_safe(node, struct vmap_area, rb_node);
	return va ? va->subtree_max_size : 0;
}

/*
 * Called when a node is removed or the tree is rotated.
 */
static __always_inline unsigned long
compute_subtree_max_size(struct vmap_area *va)
{
	return max3(va_size(va),
		get_subtree_max_size(va->rb_node.rb_left),
		get_subtree_max_size(va->rb_node.rb_right));
}

RB_DECLARE_CALLBACKS_MAX(static, free_vmap_area_rb_augment_cb,
	struct vmap_area, rb_node, unsigned long, subtree_max_size, va_size)

static void purge_vmap_area_lazy(void);
static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
static unsigned long lazy_max_pages(void);

static atomic_long_t nr_vmalloc_pages;

unsigned long vmalloc_nr_pages(void)
{
	return atomic_long_read(&nr_vmalloc_pages);
}

static struct vmap_area *__find_vmap_area(unsigned long addr)
{
	struct rb_node *n = vmap_area_root.rb_node;

	while (n) {
		struct vmap_area *va;

		va = rb_entry(n, struct vmap_area, rb_node);
		if (addr < va->va_start)
			n = n->rb_left;
		else if (addr >= va->va_end)
			n = n->rb_right;
		else
			return va;
	}

	return NULL;
}

/*
 * This function returns the parent node and the address of its
 * left or right link for further processing.
 */
static __always_inline struct rb_node **
find_va_links(struct vmap_area *va,
	struct rb_root *root, struct rb_node *from,
	struct rb_node **parent)
{
	struct vmap_area *tmp_va;
	struct rb_node **link;

	if (root) {
		link = &root->rb_node;
		if (unlikely(!*link)) {
			*parent = NULL;
			return link;
		}
	} else {
		link = &from;
	}

	/*
	 * Go to the bottom of the tree. When we hit the last point
	 * we end up with the parent rb_node and the correct direction,
	 * named "link" here, where the new va->rb_node will be attached.
	 */
	do {
		tmp_va = rb_entry(*link, struct vmap_area, rb_node);

		/*
		 * During the traversal we also do some sanity checks.
		 * Trigger BUG() if the new area partially or fully
		 * overlaps an existing one.
		 */
		if (va->va_start < tmp_va->va_end &&
				va->va_end <= tmp_va->va_start)
			link = &(*link)->rb_left;
		else if (va->va_end > tmp_va->va_start &&
				va->va_start >= tmp_va->va_end)
			link = &(*link)->rb_right;
		else
			BUG();
	} while (*link);

	*parent = &tmp_va->rb_node;
	return link;
}

static __always_inline struct list_head *
get_va_next_sibling(struct rb_node *parent, struct rb_node **link)
{
	struct list_head *list;

	if (unlikely(!parent))
		/*
		 * The red-black tree where we try to find VA neighbors
		 * before merging or inserting is empty, i.e. it means
		 * there is no free vmap space. Normally it does not
		 * happen but we handle this case anyway.
		 */
		return NULL;

	list = &rb_entry(parent, struct vmap_area, rb_node)->list;
	return (&parent->rb_right == link ?
			list->next : list);
}

static __always_inline void
link_va(struct vmap_area *va, struct rb_root *root,
	struct rb_node *parent, struct rb_node **link, struct list_head *head)
{
	/*
	 * VA is still not in the list, but we can
	 * identify its future previous list_head node.
	 */
	if (likely(parent)) {
		head = &rb_entry(parent, struct vmap_area, rb_node)->list;
		if (&parent->rb_right != link)
			head = head->prev;
	}

	/* Insert to the rb-tree */
	rb_link_node(&va->rb_node, parent, link);
	if (root == &free_vmap_area_root) {
		/*
		 * Some explanation here. Just perform simple insertion
		 * to the tree. We do not set va->subtree_max_size to
		 * its current size before calling rb_insert_augmented().
		 * This is because we populate the tree from the bottom
		 * up toward parent levels only when the node _is_ in
		 * the tree.
		 *
		 * Therefore we set subtree_max_size to zero after insertion,
		 * to let __augment_tree_propagate_from() put everything into
		 * the correct order later on.
		 */
		rb_insert_augmented(&va->rb_node,
			root, &free_vmap_area_rb_augment_cb);
		va->subtree_max_size = 0;
	} else {
		rb_insert_color(&va->rb_node, root);
	}

	/* Address-sort this list */
	list_add(&va->list, head);
}

static __always_inline void
unlink_va(struct vmap_area *va, struct rb_root *root)
{
	if (WARN_ON(RB_EMPTY_NODE(&va->rb_node)))
		return;

	if (root == &free_vmap_area_root)
		rb_erase_augmented(&va->rb_node,
			root, &free_vmap_area_rb_augment_cb);
	else
		rb_erase(&va->rb_node, root);

	list_del(&va->list);
	RB_CLEAR_NODE(&va->rb_node);
}

#if DEBUG_AUGMENT_PROPAGATE_CHECK
static void
augment_tree_propagate_check(struct rb_node *n)
{
	struct vmap_area *va;
	struct rb_node *node;
	unsigned long size;
	bool found = false;

	if (n == NULL)
		return;

	va = rb_entry(n, struct vmap_area, rb_node);
	size = va->subtree_max_size;
	node = n;

	while (node) {
		va = rb_entry(node, struct vmap_area, rb_node);

		if (get_subtree_max_size(node->rb_left) == size) {
			node = node->rb_left;
		} else {
			if (va_size(va) == size) {
				found = true;
				break;
			}

			node = node->rb_right;
		}
	}

	if (!found) {
		va = rb_entry(n, struct vmap_area, rb_node);
		pr_emerg("tree is corrupted: %lu, %lu\n",
			va_size(va), va->subtree_max_size);
	}

	augment_tree_propagate_check(n->rb_left);
	augment_tree_propagate_check(n->rb_right);
}
#endif

/*
 * This function populates subtree_max_size from bottom to upper
 * levels starting from VA point. The propagation must be done
 * when VA size is modified by changing its va_start/va_end, or
 * when a new VA is inserted into the tree.
 *
 * It means that __augment_tree_propagate_from() must be called:
 * - After VA has been inserted to the tree(free path);
 * - After VA has been shrunk(allocation path);
 * - After VA has been increased(merging path).
 *
 * Please note that, it does not mean that upper parent nodes
 * and their subtree_max_size are recalculated all the time up
 * to the root node.
 *
 *       4--8
 *        /\
 *       /  \
 *      /    \
 *    2--2  8--8
 *
 * For example if we modify the node 4, shrinking it to 2, then
 * no modification is required at all. If we shrink the node 2
 * to 1, only its subtree_max_size is updated, and set to 1. If
 * we shrink the node 8 to 6, then its subtree_max_size is set
 * to 6 and the parent node becomes 4--6.
 */
static __always_inline void
augment_tree_propagate_from(struct vmap_area *va)
{
	struct rb_node *node = &va->rb_node;
	unsigned long new_va_sub_max_size;

	while (node) {
		va = rb_entry(node, struct vmap_area, rb_node);
		new_va_sub_max_size = compute_subtree_max_size(va);

		/*
		 * If the newly calculated maximum available size of the
		 * subtree is equal to the current one, then it means that
		 * the tree is propagated correctly. So we have to stop at
		 * this point to save cycles.
		 */
		if (va->subtree_max_size == new_va_sub_max_size)
			break;

		va->subtree_max_size = new_va_sub_max_size;
		node = rb_parent(&va->rb_node);
	}

#if DEBUG_AUGMENT_PROPAGATE_CHECK
	augment_tree_propagate_check(free_vmap_area_root.rb_node);
#endif
}

static void
insert_vmap_area(struct vmap_area *va,
	struct rb_root *root, struct list_head *head)
{
	struct rb_node **link;
	struct rb_node *parent;

	link = find_va_links(va, root, NULL, &parent);
	link_va(va, root, parent, link, head);
}

static void
insert_vmap_area_augment(struct vmap_area *va,
	struct rb_node *from, struct rb_root *root,
	struct list_head *head)
{
	struct rb_node **link;
	struct rb_node *parent;

	if (from)
		link = find_va_links(va, NULL, from, &parent);
	else
		link = find_va_links(va, root, NULL, &parent);

	link_va(va, root, parent, link, head);
	augment_tree_propagate_from(va);
}
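
/*
 * Illustrative sketch (not in the original source): the invariant that
 * augment_tree_propagate_from() maintains can be spot-checked by walking
 * the address-sorted free list and comparing each node's cached
 * subtree_max_size with a freshly computed value. The caller is assumed
 * to hold free_vmap_area_lock.
 */
static void __maybe_unused example_check_augment_invariant(void)
{
	struct vmap_area *va;

	list_for_each_entry(va, &free_vmap_area_list, list) {
		if (va->subtree_max_size != compute_subtree_max_size(va))
			pr_warn("augment mismatch at [0x%lx:0x%lx]\n",
				va->va_start, va->va_end);
	}
}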

/*
 * Merge a de-allocated chunk of VA memory with the previous
 * and next free blocks. If coalescing is not possible, a new
 * free area is inserted. If the VA has been merged, it is
 * freed.
 */
static __always_inline struct vmap_area *
merge_or_add_vmap_area(struct vmap_area *va,
	struct rb_root *root, struct list_head *head)
{
	struct vmap_area *sibling;
	struct list_head *next;
	struct rb_node **link;
	struct rb_node *parent;
	bool merged = false;

	/*
	 * Find a place in the tree where VA potentially will be
	 * inserted, unless it is merged with its sibling/siblings.
	 */
	link = find_va_links(va, root, NULL, &parent);

	/*
	 * Get next node of VA to check if merging can be done.
	 */
	next = get_va_next_sibling(parent, link);
	if (unlikely(next == NULL))
		goto insert;

	/*
	 * start            end
	 * |                |
	 * |<------VA------>|<-----Next----->|
	 *                  |                |
	 *                  start            end
	 */
	if (next != head) {
		sibling = list_entry(next, struct vmap_area, list);
		if (sibling->va_start == va->va_end) {
			sibling->va_start = va->va_start;

			/* Check and update the tree if needed. */
			augment_tree_propagate_from(sibling);

			/* Free vmap_area object. */
			kmem_cache_free(vmap_area_cachep, va);

			/* Point to the new merged area. */
			va = sibling;
			merged = true;
		}
	}

	/*
	 * start            end
	 * |                |
	 * |<-----Prev----->|<------VA------>|
	 *                  |                |
	 *                  start            end
	 */
	if (next->prev != head) {
		sibling = list_entry(next->prev, struct vmap_area, list);
		if (sibling->va_end == va->va_start) {
			sibling->va_end = va->va_end;

			/* Check and update the tree if needed. */
			augment_tree_propagate_from(sibling);

			if (merged)
				unlink_va(va, root);

			/* Free vmap_area object. */
			kmem_cache_free(vmap_area_cachep, va);

			/* Point to the new merged area. */
			va = sibling;
			merged = true;
		}
	}

insert:
	if (!merged) {
		link_va(va, root, parent, link, head);
		augment_tree_propagate_from(va);
	}

	return va;
}

static __always_inline bool
is_within_this_va(struct vmap_area *va, unsigned long size,
	unsigned long align, unsigned long vstart)
{
	unsigned long nva_start_addr;

	if (va->va_start > vstart)
		nva_start_addr = ALIGN(va->va_start, align);
	else
		nva_start_addr = ALIGN(vstart, align);

	/* Can be overflowed due to big size or alignment. */
	if (nva_start_addr + size < nva_start_addr ||
			nva_start_addr < vstart)
		return false;

	return (nva_start_addr + size <= va->va_end);
}

/*
 * Find the first free block (lowest start address) in the tree
 * that can satisfy the request described by the passed parameters.
 */
static __always_inline struct vmap_area *
find_vmap_lowest_match(unsigned long size,
	unsigned long align, unsigned long vstart)
{
	struct vmap_area *va;
	struct rb_node *node;
	unsigned long length;

	/* Start from the root. */
	node = free_vmap_area_root.rb_node;

	/* Adjust the search size for alignment overhead. */
	length = size + align - 1;

	while (node) {
		va = rb_entry(node, struct vmap_area, rb_node);

		if (get_subtree_max_size(node->rb_left) >= length &&
				vstart < va->va_start) {
			node = node->rb_left;
		} else {
			if (is_within_this_va(va, size, align, vstart))
				return va;

			/*
			 * Does not make sense to go deeper towards the right
			 * sub-tree if it does not have a free block that is
			 * equal to or bigger than the requested search length.
			 */
			if (get_subtree_max_size(node->rb_right) >= length) {
				node = node->rb_right;
				continue;
			}

			/*
			 * OK. We roll back and find the first right sub-tree
			 * that satisfies the search criteria. It can happen
			 * only once due to the "vstart" restriction.
			 */
			while ((node = rb_parent(node))) {
				va = rb_entry(node, struct vmap_area, rb_node);
				if (is_within_this_va(va, size, align, vstart))
					return va;

				if (get_subtree_max_size(node->rb_right) >= length &&
						vstart <= va->va_start) {
					node = node->rb_right;
					break;
				}
			}
		}
	}

	return NULL;
}

#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
#include <linux/random.h>

static struct vmap_area *
find_vmap_lowest_linear_match(unsigned long size,
	unsigned long align, unsigned long vstart)
{
	struct vmap_area *va;

	list_for_each_entry(va, &free_vmap_area_list, list) {
		if (!is_within_this_va(va, size, align, vstart))
			continue;

		return va;
	}

	return NULL;
}

static void
find_vmap_lowest_match_check(unsigned long size)
{
	struct vmap_area *va_1, *va_2;
	unsigned long vstart;
	unsigned int rnd;

	get_random_bytes(&rnd, sizeof(rnd));
	vstart = VMALLOC_START + rnd;

	va_1 = find_vmap_lowest_match(size, 1, vstart);
	va_2 = find_vmap_lowest_linear_match(size, 1, vstart);

	if (va_1 != va_2)
		pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n",
			va_1, va_2, vstart);
}
#endif

enum fit_type {
	NOTHING_FIT = 0,
	FL_FIT_TYPE = 1,	/* full fit */
	LE_FIT_TYPE = 2,	/* left edge fit */
	RE_FIT_TYPE = 3,	/* right edge fit */
	NE_FIT_TYPE = 4		/* no edge fit */
};

static __always_inline enum fit_type
classify_va_fit_type(struct vmap_area *va,
	unsigned long nva_start_addr, unsigned long size)
{
	enum fit_type type;

	/* Check if it is within VA. */
	if (nva_start_addr < va->va_start ||
			nva_start_addr + size > va->va_end)
		return NOTHING_FIT;

	/* Now classify. */
	if (va->va_start == nva_start_addr) {
		if (va->va_end == nva_start_addr + size)
			type = FL_FIT_TYPE;
		else
			type = LE_FIT_TYPE;
	} else if (va->va_end == nva_start_addr + size) {
		type = RE_FIT_TYPE;
	} else {
		type = NE_FIT_TYPE;
	}

	return type;
}
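
/*
 * Illustrative sketch (added for explanation only): how the fit types map
 * onto concrete numbers. For a free area [0x1000:0x5000], carving out the
 * requests below yields the different enum fit_type values. The addresses
 * are made up for the example.
 */
static void __maybe_unused example_classify_fit(void)
{
	struct vmap_area va = { .va_start = 0x1000, .va_end = 0x5000 };

	/* Entire block is consumed: FL_FIT_TYPE. */
	WARN_ON(classify_va_fit_type(&va, 0x1000, 0x4000) != FL_FIT_TYPE);
	/* Request starts at va_start but leaves a tail: LE_FIT_TYPE. */
	WARN_ON(classify_va_fit_type(&va, 0x1000, 0x1000) != LE_FIT_TYPE);
	/* Request ends at va_end but leaves a head: RE_FIT_TYPE. */
	WARN_ON(classify_va_fit_type(&va, 0x4000, 0x1000) != RE_FIT_TYPE);
	/* Request sits in the middle, leaving both edges: NE_FIT_TYPE. */
	WARN_ON(classify_va_fit_type(&va, 0x2000, 0x1000) != NE_FIT_TYPE);
}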

static __always_inline int
adjust_va_to_fit_type(struct vmap_area *va,
	unsigned long nva_start_addr, unsigned long size,
	enum fit_type type)
{
	struct vmap_area *lva = NULL;

	if (type == FL_FIT_TYPE) {
		/*
		 * No need to split VA, it fully fits.
		 *
		 * |               |
		 * V      NVA      V
		 * |---------------|
		 */
		unlink_va(va, &free_vmap_area_root);
		kmem_cache_free(vmap_area_cachep, va);
	} else if (type == LE_FIT_TYPE) {
		/*
		 * Split left edge of fit VA.
		 *
		 * |       |
		 * V  NVA  V   R
		 * |-------|-------|
		 */
		va->va_start += size;
	} else if (type == RE_FIT_TYPE) {
		/*
		 * Split right edge of fit VA.
		 *
		 *         |       |
		 *     L   V  NVA  V
		 * |-------|-------|
		 */
		va->va_end = nva_start_addr;
	} else if (type == NE_FIT_TYPE) {
		/*
		 * Split no edge of fit VA.
		 *
		 *     |       |
		 *   L V  NVA  V R
		 * |---|-------|---|
		 */
		lva = __this_cpu_xchg(ne_fit_preload_node, NULL);
		if (unlikely(!lva)) {
			/*
			 * For the percpu allocator we do not do any
			 * pre-allocation and leave it as it is. The reason
			 * is that it most likely never ends up with
			 * NE_FIT_TYPE splitting. For percpu allocations,
			 * offsets and sizes are aligned to a fixed align
			 * request, i.e. RE_FIT_TYPE and FL_FIT_TYPE are its
			 * main fitting cases.
			 *
			 * There are a few exceptions though, for example the
			 * first allocation (early boot up), when we have
			 * "one" big free space that has to be split.
			 *
			 * We can also hit this path for regular "vmap"
			 * allocations, if "this" current CPU was not
			 * preloaded. See the comment in alloc_vmap_area()
			 * for why. If so, GFP_NOWAIT is used instead to get
			 * an extra object for split purposes. That is rare
			 * and most of the time does not occur.
			 *
			 * If that allocation fails, an "overflow" path is
			 * triggered to purge lazily freed areas to free some
			 * memory, then the "retry" path is triggered to
			 * repeat one more time. See more details in the
			 * alloc_vmap_area() function.
			 */
			lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT);
			if (!lva)
				return -1;
		}

		/*
		 * Build the remainder.
		 */
		lva->va_start = va->va_start;
		lva->va_end = nva_start_addr;

		/*
		 * Shrink this VA to remaining size.
		 */
		va->va_start = nva_start_addr + size;
	} else {
		return -1;
	}

	if (type != FL_FIT_TYPE) {
		augment_tree_propagate_from(va);

		if (lva)	/* type == NE_FIT_TYPE */
			insert_vmap_area_augment(lva, &va->rb_node,
				&free_vmap_area_root, &free_vmap_area_list);
	}

	return 0;
}

/*
 * Returns the start address of the newly allocated area on success.
 * Otherwise "vend" is returned to indicate failure.
 */
static __always_inline unsigned long
__alloc_vmap_area(unsigned long size, unsigned long align,
	unsigned long vstart, unsigned long vend)
{
	unsigned long nva_start_addr;
	struct vmap_area *va;
	enum fit_type type;
	int ret;

	va = find_vmap_lowest_match(size, align, vstart);
	if (unlikely(!va))
		return vend;

	if (va->va_start > vstart)
		nva_start_addr = ALIGN(va->va_start, align);
	else
		nva_start_addr = ALIGN(vstart, align);

	/* Check the "vend" restriction. */
	if (nva_start_addr + size > vend)
		return vend;

	/* Classify what we have found. */
	type = classify_va_fit_type(va, nva_start_addr, size);
	if (WARN_ON_ONCE(type == NOTHING_FIT))
		return vend;

	/* Update the free vmap_area. */
	ret = adjust_va_to_fit_type(va, nva_start_addr, size, type);
	if (ret)
		return vend;

#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
	find_vmap_lowest_match_check(size);
#endif

	return nva_start_addr;
}

/*
 * Free a region of KVA allocated by alloc_vmap_area
 */
static void free_vmap_area(struct vmap_area *va)
{
	/*
	 * Remove from the busy tree/list.
	 */
	spin_lock(&vmap_area_lock);
	unlink_va(va, &vmap_area_root);
	spin_unlock(&vmap_area_lock);

	/*
	 * Insert/Merge it back to the free tree/list.
	 */
	spin_lock(&free_vmap_area_lock);
	merge_or_add_vmap_area(va, &free_vmap_area_root, &free_vmap_area_list);
	spin_unlock(&free_vmap_area_lock);
}

/*
 * Allocate a region of KVA of the specified size and alignment, within the
 * vstart and vend.
 */
static struct vmap_area *alloc_vmap_area(unsigned long size,
				unsigned long align,
				unsigned long vstart, unsigned long vend,
				int node, gfp_t gfp_mask)
{
	struct vmap_area *va, *pva;
	unsigned long addr;
	int purged = 0;
	int ret;

	BUG_ON(!size);
	BUG_ON(offset_in_page(size));
	BUG_ON(!is_power_of_2(align));

	if (unlikely(!vmap_initialized))
		return ERR_PTR(-EBUSY);

	might_sleep();
	gfp_mask = gfp_mask & GFP_RECLAIM_MASK;

	va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
	if (unlikely(!va))
		return ERR_PTR(-ENOMEM);

	/*
	 * Only scan the relevant parts containing pointers to other objects
	 * to avoid false negatives.
	 */
	kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask);

retry:
	/*
	 * Preload this CPU with one extra vmap_area object. It is used
	 * when the fit type of a free area is NE_FIT_TYPE. Please note
	 * that this does not guarantee that the allocation occurs on a
	 * preloaded CPU; instead we minimize the cases where it does not.
	 * That can happen because of CPU migration, since there is a race
	 * until the spinlock below is taken.
	 *
	 * The preload is done in non-atomic context, thus allowing us to
	 * use more permissive allocation masks and to be more stable under
	 * low memory conditions and high memory pressure. In the rare case
	 * that no object was preloaded, GFP_NOWAIT is used.
	 *
	 * Set "pva" to NULL here, because of the "retry" path.
	 */
	pva = NULL;

	if (!this_cpu_read(ne_fit_preload_node))
		/*
		 * Even if it fails we do not really care. Just proceed
		 * as is. If needed, the "overflow" path will refill the
		 * cache we allocate from.
		 */
		pva = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);

	spin_lock(&free_vmap_area_lock);

	if (pva && __this_cpu_cmpxchg(ne_fit_preload_node, NULL, pva))
		kmem_cache_free(vmap_area_cachep, pva);

	/*
	 * If an allocation fails, the "vend" address is
	 * returned. Therefore trigger the overflow path.
	 */
	addr = __alloc_vmap_area(size, align, vstart, vend);
	spin_unlock(&free_vmap_area_lock);

	if (unlikely(addr == vend))
		goto overflow;

	va->va_start = addr;
	va->va_end = addr + size;
	va->vm = NULL;


	spin_lock(&vmap_area_lock);
	insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
	spin_unlock(&vmap_area_lock);

	BUG_ON(!IS_ALIGNED(va->va_start, align));
	BUG_ON(va->va_start < vstart);
	BUG_ON(va->va_end > vend);

	ret = kasan_populate_vmalloc(addr, size);
	if (ret) {
		free_vmap_area(va);
		return ERR_PTR(ret);
	}

	return va;

overflow:
	if (!purged) {
		purge_vmap_area_lazy();
		purged = 1;
		goto retry;
	}

	if (gfpflags_allow_blocking(gfp_mask)) {
		unsigned long freed = 0;
		blocking_notifier_call_chain(&vmap_notify_list, 0, &freed);
		if (freed > 0) {
			purged = 0;
			goto retry;
		}
	}

	if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit())
		pr_warn("vmap allocation for size %lu failed: use vmalloc=<size> to increase size\n",
			size);

	kmem_cache_free(vmap_area_cachep, va);
	return ERR_PTR(-EBUSY);
}

int register_vmap_purge_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_register(&vmap_notify_list, nb);
}
EXPORT_SYMBOL_GPL(register_vmap_purge_notifier);

int unregister_vmap_purge_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_unregister(&vmap_notify_list, nb);
}
EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier);

/*
 * Clear the pagetable entries of a given vmap_area
 */
static void unmap_vmap_area(struct vmap_area *va)
{
	vunmap_page_range(va->va_start, va->va_end);
}

/*
 * lazy_max_pages is the maximum amount of virtual address space we gather up
 * before attempting to purge with a TLB flush.
 *
 * There is a tradeoff here: a larger number will cover more kernel page tables
 * and take slightly longer to purge, but it will linearly reduce the number of
 * global TLB flushes that must be performed. It would seem natural to scale
 * this number up linearly with the number of CPUs (because vmapping activity
 * could also scale linearly with the number of CPUs), however it is likely
 * that in practice, workloads might be constrained in other ways that mean
 * vmap activity will not scale linearly with CPUs. Also, I want to be
 * conservative and not introduce a big latency on huge systems, so go with
 * a less aggressive log scale. It will still be an improvement over the old
 * code, and it will be simple to change the scale factor if we find that it
 * becomes a problem on bigger systems.
 */
static unsigned long lazy_max_pages(void)
{
	unsigned int log;

	log = fls(num_online_cpus());

	return log * (32UL * 1024 * 1024 / PAGE_SIZE);
}

static atomic_long_t vmap_lazy_nr = ATOMIC_LONG_INIT(0);
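
/*
 * Illustrative sketch (not part of the kernel): reserving and releasing a
 * chunk of vmalloc address space directly with the allocator above. Real
 * users normally go through vmap()/vmalloc() instead of calling these
 * static helpers; the size and alignment are assumptions of the example.
 */
static void __maybe_unused example_alloc_free_va(void)
{
	struct vmap_area *va;

	va = alloc_vmap_area(4 * PAGE_SIZE, PAGE_SIZE,
			     VMALLOC_START, VMALLOC_END,
			     NUMA_NO_NODE, GFP_KERNEL);
	if (IS_ERR(va))
		return;

	pr_info("reserved KVA [0x%lx:0x%lx]\n", va->va_start, va->va_end);

	/* Return the range to the free tree/list. */
	free_vmap_area(va);
}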

/*
 * Serialize vmap purging. There is no actual critical section protected
 * by this lock, but we want to avoid concurrent calls for performance
 * reasons and to make pcpu_get_vm_areas() more deterministic.
 */
static DEFINE_MUTEX(vmap_purge_lock);

/* for per-CPU blocks */
static void purge_fragmented_blocks_allcpus(void);

/*
 * called before a call to iounmap() if the caller wants vm_area_struct's
 * immediately freed.
 */
void set_iounmap_nonlazy(void)
{
	atomic_long_set(&vmap_lazy_nr, lazy_max_pages()+1);
}

/*
 * Purges all lazily-freed vmap areas.
 */
static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
{
	unsigned long resched_threshold;
	struct llist_node *valist;
	struct vmap_area *va;
	struct vmap_area *n_va;

	lockdep_assert_held(&vmap_purge_lock);

	valist = llist_del_all(&vmap_purge_list);
	if (unlikely(valist == NULL))
		return false;

	/*
	 * First make sure the mappings are removed from all page-tables
	 * before they are freed.
	 */
	vmalloc_sync_unmappings();

	/*
	 * TODO: calculate the flush range without looping.
	 * The list can contain up to lazy_max_pages() elements.
	 */
	llist_for_each_entry(va, valist, purge_list) {
		if (va->va_start < start)
			start = va->va_start;
		if (va->va_end > end)
			end = va->va_end;
	}

	flush_tlb_kernel_range(start, end);
	resched_threshold = lazy_max_pages() << 1;

	spin_lock(&free_vmap_area_lock);
	llist_for_each_entry_safe(va, n_va, valist, purge_list) {
		unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
		unsigned long orig_start = va->va_start;
		unsigned long orig_end = va->va_end;

		/*
		 * Finally insert or merge lazily-freed area. It is
		 * detached and there is no need to "unlink" it from
		 * anything.
		 */
		va = merge_or_add_vmap_area(va, &free_vmap_area_root,
					    &free_vmap_area_list);

		if (is_vmalloc_or_module_addr((void *)orig_start))
			kasan_release_vmalloc(orig_start, orig_end,
					      va->va_start, va->va_end);

		atomic_long_sub(nr, &vmap_lazy_nr);

		if (atomic_long_read(&vmap_lazy_nr) < resched_threshold)
			cond_resched_lock(&free_vmap_area_lock);
	}
	spin_unlock(&free_vmap_area_lock);
	return true;
}

/*
 * Kick off a purge of the outstanding lazy areas. Don't bother if somebody
 * is already purging.
 */
static void try_purge_vmap_area_lazy(void)
{
	if (mutex_trylock(&vmap_purge_lock)) {
		__purge_vmap_area_lazy(ULONG_MAX, 0);
		mutex_unlock(&vmap_purge_lock);
	}
}

/*
 * Kick off a purge of the outstanding lazy areas.
 */
static void purge_vmap_area_lazy(void)
{
	mutex_lock(&vmap_purge_lock);
	purge_fragmented_blocks_allcpus();
	__purge_vmap_area_lazy(ULONG_MAX, 0);
	mutex_unlock(&vmap_purge_lock);
}

/*
 * Free a vmap area, with the caller ensuring that the area has been
 * unmapped and that flush_cache_vunmap() has been called for the
 * correct range beforehand.
 */
static void free_vmap_area_noflush(struct vmap_area *va)
{
	unsigned long nr_lazy;

	spin_lock(&vmap_area_lock);
	unlink_va(va, &vmap_area_root);
	spin_unlock(&vmap_area_lock);

	nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >>
				PAGE_SHIFT, &vmap_lazy_nr);

	/* After this point, we may free va at any time */
	llist_add(&va->purge_list, &vmap_purge_list);

	if (unlikely(nr_lazy > lazy_max_pages()))
		try_purge_vmap_area_lazy();
}

/*
 * Free and unmap a vmap area
 */
static void free_unmap_vmap_area(struct vmap_area *va)
{
	flush_cache_vunmap(va->va_start, va->va_end);
	unmap_vmap_area(va);
	if (debug_pagealloc_enabled_static())
		flush_tlb_kernel_range(va->va_start, va->va_end);

	free_vmap_area_noflush(va);
}

static struct vmap_area *find_vmap_area(unsigned long addr)
{
	struct vmap_area *va;

	spin_lock(&vmap_area_lock);
	va = __find_vmap_area(addr);
	spin_unlock(&vmap_area_lock);

	return va;
}

/*** Per cpu kva allocator ***/

/*
 * vmap space is limited especially on 32 bit architectures. Ensure there is
 * room for at least 16 percpu vmap blocks per CPU.
 */
/*
 * If we had a constant VMALLOC_START and VMALLOC_END, we'd like to be able
 * to #define VMALLOC_SPACE		(VMALLOC_END-VMALLOC_START). Guess
 * instead (we just need a rough idea)
 */
#if BITS_PER_LONG == 32
#define VMALLOC_SPACE		(128UL*1024*1024)
#else
#define VMALLOC_SPACE		(128UL*1024*1024*1024)
#endif

#define VMALLOC_PAGES		(VMALLOC_SPACE / PAGE_SIZE)
#define VMAP_MAX_ALLOC		BITS_PER_LONG	/* 256K with 4K pages */
#define VMAP_BBMAP_BITS_MAX	1024	/* 4MB with 4K pages */
#define VMAP_BBMAP_BITS_MIN	(VMAP_MAX_ALLOC*2)
#define VMAP_MIN(x, y)		((x) < (y) ? (x) : (y)) /* can't use min() */
#define VMAP_MAX(x, y)		((x) > (y) ? (x) : (y)) /* can't use max() */
#define VMAP_BBMAP_BITS		\
		VMAP_MIN(VMAP_BBMAP_BITS_MAX,	\
		VMAP_MAX(VMAP_BBMAP_BITS_MIN,	\
			VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16))

#define VMAP_BLOCK_SIZE		(VMAP_BBMAP_BITS * PAGE_SIZE)
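
/*
 * Worked example (added for clarity; the numbers depend on the config):
 * on a 64-bit build with 4K pages and NR_CPUS = 64, VMALLOC_PAGES is
 * 128G / 4K = 32M pages, so VMALLOC_PAGES / 64 / 16 = 32768 bits. That
 * value is clamped to VMAP_BBMAP_BITS_MAX = 1024, giving
 * VMAP_BLOCK_SIZE = 1024 * 4K = 4MB per vmap block.
 */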

struct vmap_block_queue {
	spinlock_t lock;
	struct list_head free;
};

struct vmap_block {
	spinlock_t lock;
	struct vmap_area *va;
	unsigned long free, dirty;
	unsigned long dirty_min, dirty_max; /*< dirty range */
	struct list_head free_list;
	struct rcu_head rcu_head;
	struct list_head purge;
};

/* Queue of free and dirty vmap blocks, for allocation and flushing purposes */
static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue);

/*
 * Radix tree of vmap blocks, indexed by address, to quickly find a vmap block
 * in the free path. Could get rid of this if we change the API to return a
 * "cookie" from alloc, to be passed to free. But no big deal yet.
 */
static DEFINE_SPINLOCK(vmap_block_tree_lock);
static RADIX_TREE(vmap_block_tree, GFP_ATOMIC);

/*
 * We should probably have a fallback mechanism to allocate virtual memory
 * out of partially filled vmap blocks. However vmap block sizing should be
 * fairly reasonable according to the vmalloc size, so it shouldn't be a
 * big problem.
 */

static unsigned long addr_to_vb_idx(unsigned long addr)
{
	addr -= VMALLOC_START & ~(VMAP_BLOCK_SIZE-1);
	addr /= VMAP_BLOCK_SIZE;
	return addr;
}

static void *vmap_block_vaddr(unsigned long va_start, unsigned long pages_off)
{
	unsigned long addr;

	addr = va_start + (pages_off << PAGE_SHIFT);
	BUG_ON(addr_to_vb_idx(addr) != addr_to_vb_idx(va_start));
	return (void *)addr;
}

/**
 * new_vmap_block - allocates a new vmap_block and occupies 2^order pages in
 *                  this block. The number of pages cannot exceed
 *                  VMAP_BBMAP_BITS.
 * @order:    how many 2^order pages should be occupied in newly allocated block
 * @gfp_mask: flags for the page level allocator
 *
 * Return: virtual address in a newly allocated block or ERR_PTR(-errno)
 */
static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
{
	struct vmap_block_queue *vbq;
	struct vmap_block *vb;
	struct vmap_area *va;
	unsigned long vb_idx;
	int node, err;
	void *vaddr;

	node = numa_node_id();

	vb = kmalloc_node(sizeof(struct vmap_block),
			gfp_mask & GFP_RECLAIM_MASK, node);
	if (unlikely(!vb))
		return ERR_PTR(-ENOMEM);

	va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
					VMALLOC_START, VMALLOC_END,
					node, gfp_mask);
	if (IS_ERR(va)) {
		kfree(vb);
		return ERR_CAST(va);
	}

	err = radix_tree_preload(gfp_mask);
	if (unlikely(err)) {
		kfree(vb);
		free_vmap_area(va);
		return ERR_PTR(err);
	}

	vaddr = vmap_block_vaddr(va->va_start, 0);
	spin_lock_init(&vb->lock);
	vb->va = va;
	/* At least something should be left free */
	BUG_ON(VMAP_BBMAP_BITS <= (1UL << order));
	vb->free = VMAP_BBMAP_BITS - (1UL << order);
	vb->dirty = 0;
	vb->dirty_min = VMAP_BBMAP_BITS;
	vb->dirty_max = 0;
	INIT_LIST_HEAD(&vb->free_list);

	vb_idx = addr_to_vb_idx(va->va_start);
	spin_lock(&vmap_block_tree_lock);
	err = radix_tree_insert(&vmap_block_tree, vb_idx, vb);
	spin_unlock(&vmap_block_tree_lock);
	BUG_ON(err);
	radix_tree_preload_end();

	vbq = &get_cpu_var(vmap_block_queue);
	spin_lock(&vbq->lock);
	list_add_tail_rcu(&vb->free_list, &vbq->free);
	spin_unlock(&vbq->lock);
	put_cpu_var(vmap_block_queue);

	return vaddr;
}

static void free_vmap_block(struct vmap_block *vb)
{
	struct vmap_block *tmp;
	unsigned long vb_idx;

	vb_idx = addr_to_vb_idx(vb->va->va_start);
	spin_lock(&vmap_block_tree_lock);
	tmp = radix_tree_delete(&vmap_block_tree, vb_idx);
	spin_unlock(&vmap_block_tree_lock);
	BUG_ON(tmp != vb);

	free_vmap_area_noflush(vb->va);
	kfree_rcu(vb, rcu_head);
}

static void purge_fragmented_blocks(int cpu)
{
	LIST_HEAD(purge);
	struct vmap_block *vb;
	struct vmap_block *n_vb;
	struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);

	rcu_read_lock();
	list_for_each_entry_rcu(vb, &vbq->free, free_list) {

		if (!(vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS))
			continue;

		spin_lock(&vb->lock);
		if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) {
			vb->free = 0; /* prevent further allocs after releasing lock */
			vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */
			vb->dirty_min = 0;
			vb->dirty_max = VMAP_BBMAP_BITS;
			spin_lock(&vbq->lock);
			list_del_rcu(&vb->free_list);
			spin_unlock(&vbq->lock);
			spin_unlock(&vb->lock);
			list_add_tail(&vb->purge, &purge);
		} else
			spin_unlock(&vb->lock);
	}
	rcu_read_unlock();

	list_for_each_entry_safe(vb, n_vb, &purge, purge) {
		list_del(&vb->purge);
		free_vmap_block(vb);
	}
}

static void purge_fragmented_blocks_allcpus(void)
{
	int cpu;

	for_each_possible_cpu(cpu)
		purge_fragmented_blocks(cpu);
}

static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
{
	struct vmap_block_queue *vbq;
	struct vmap_block *vb;
	void *vaddr = NULL;
	unsigned int order;

	BUG_ON(offset_in_page(size));
	BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
	if (WARN_ON(size == 0)) {
		/*
		 * Allocating 0 bytes isn't what the caller wants, since
		 * get_order(0) returns a bogus result. Just warn and
		 * terminate early.
		 */
		return NULL;
	}
	order = get_order(size);

	rcu_read_lock();
	vbq = &get_cpu_var(vmap_block_queue);
	list_for_each_entry_rcu(vb, &vbq->free, free_list) {
		unsigned long pages_off;

		spin_lock(&vb->lock);
		if (vb->free < (1UL << order)) {
			spin_unlock(&vb->lock);
			continue;
		}

		pages_off = VMAP_BBMAP_BITS - vb->free;
		vaddr = vmap_block_vaddr(vb->va->va_start, pages_off);
		vb->free -= 1UL << order;
		if (vb->free == 0) {
			spin_lock(&vbq->lock);
			list_del_rcu(&vb->free_list);
			spin_unlock(&vbq->lock);
		}

		spin_unlock(&vb->lock);
		break;
	}

	put_cpu_var(vmap_block_queue);
	rcu_read_unlock();

	/* Allocate new block if nothing was found */
	if (!vaddr)
		vaddr = new_vmap_block(order, gfp_mask);

	return vaddr;
}

static void vb_free(const void *addr, unsigned long size)
{
	unsigned long offset;
	unsigned long vb_idx;
	unsigned int order;
	struct vmap_block *vb;

	BUG_ON(offset_in_page(size));
	BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);

	flush_cache_vunmap((unsigned long)addr, (unsigned long)addr + size);

	order = get_order(size);

	offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1);
	offset >>= PAGE_SHIFT;

	vb_idx = addr_to_vb_idx((unsigned long)addr);
	rcu_read_lock();
	vb = radix_tree_lookup(&vmap_block_tree, vb_idx);
	rcu_read_unlock();
	BUG_ON(!vb);

	vunmap_page_range((unsigned long)addr, (unsigned long)addr + size);

	if (debug_pagealloc_enabled_static())
		flush_tlb_kernel_range((unsigned long)addr,
					(unsigned long)addr + size);

	spin_lock(&vb->lock);

	/* Expand dirty range */
	vb->dirty_min = min(vb->dirty_min, offset);
	vb->dirty_max = max(vb->dirty_max, offset + (1UL << order));

	vb->dirty += 1UL << order;
	if (vb->dirty == VMAP_BBMAP_BITS) {
		BUG_ON(vb->free);
		spin_unlock(&vb->lock);
		free_vmap_block(vb);
	} else
		spin_unlock(&vb->lock);
}

static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush)
{
	int cpu;

	if (unlikely(!vmap_initialized))
		return;

	might_sleep();

	for_each_possible_cpu(cpu) {
		struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
		struct vmap_block *vb;

		rcu_read_lock();
		list_for_each_entry_rcu(vb, &vbq->free, free_list) {
			spin_lock(&vb->lock);
			if (vb->dirty) {
				unsigned long va_start = vb->va->va_start;
				unsigned long s, e;

				s = va_start + (vb->dirty_min << PAGE_SHIFT);
				e = va_start + (vb->dirty_max << PAGE_SHIFT);

				start = min(s, start);
				end   = max(e, end);

				flush = 1;
			}
			spin_unlock(&vb->lock);
		}
		rcu_read_unlock();
	}

	mutex_lock(&vmap_purge_lock);
	purge_fragmented_blocks_allcpus();
	if (!__purge_vmap_area_lazy(start, end) && flush)
		flush_tlb_kernel_range(start, end);
	mutex_unlock(&vmap_purge_lock);
}

/**
 * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer
 *
 * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily
 * to amortize TLB flushing overheads. What this means is that any page you
 * have now, may, in a former life, have been mapped into a kernel virtual
 * address by the vmap layer and so there might be some CPUs with TLB entries
 * still referencing that page (additional to the regular 1:1 kernel mapping).
 *
 * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can
 * be sure that none of the pages we have control over will have any aliases
 * from the vmap layer.
 */
void vm_unmap_aliases(void)
{
	unsigned long start = ULONG_MAX, end = 0;
	int flush = 0;

	_vm_unmap_aliases(start, end, flush);
}
EXPORT_SYMBOL_GPL(vm_unmap_aliases);

/**
 * vm_unmap_ram - unmap linear kernel address space set up by vm_map_ram
 * @mem: the pointer returned by vm_map_ram
 * @count: the count passed to that vm_map_ram call (cannot unmap partial)
 */
void vm_unmap_ram(const void *mem, unsigned int count)
{
	unsigned long size = (unsigned long)count << PAGE_SHIFT;
	unsigned long addr = (unsigned long)mem;
	struct vmap_area *va;

	might_sleep();
	BUG_ON(!addr);
	BUG_ON(addr < VMALLOC_START);
	BUG_ON(addr > VMALLOC_END);
	BUG_ON(!PAGE_ALIGNED(addr));

	kasan_poison_vmalloc(mem, size);

	if (likely(count <= VMAP_MAX_ALLOC)) {
		debug_check_no_locks_freed(mem, size);
		vb_free(mem, size);
		return;
	}

	va = find_vmap_area(addr);
	BUG_ON(!va);
	debug_check_no_locks_freed((void *)va->va_start,
				    (va->va_end - va->va_start));
	free_unmap_vmap_area(va);
}
EXPORT_SYMBOL(vm_unmap_ram);

/**
 * vm_map_ram - map pages linearly into kernel virtual address (vmalloc space)
 * @pages: an array of pointers to the pages to be mapped
 * @count: number of pages
 * @node: prefer to allocate data structures on this node
 * @prot: memory protection to use. PAGE_KERNEL for regular RAM
 *
 * If you use this function for fewer than VMAP_MAX_ALLOC pages, it can be
 * faster than vmap(). However, if you mix long-lived and short-lived objects
 * with vm_map_ram(), it can consume lots of address space through
 * fragmentation (especially on a 32bit machine), and you could eventually
 * see failures. Please only use this function for short-lived objects.
 *
 * Returns: a pointer to the address that has been mapped, or %NULL on failure
 */
void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot)
{
	unsigned long size = (unsigned long)count << PAGE_SHIFT;
	unsigned long addr;
	void *mem;

	if (likely(count <= VMAP_MAX_ALLOC)) {
		mem = vb_alloc(size, GFP_KERNEL);
		if (IS_ERR(mem))
			return NULL;
		addr = (unsigned long)mem;
	} else {
		struct vmap_area *va;
		va = alloc_vmap_area(size, PAGE_SIZE,
				VMALLOC_START, VMALLOC_END, node, GFP_KERNEL);
		if (IS_ERR(va))
			return NULL;

		addr = va->va_start;
		mem = (void *)addr;
	}

	kasan_unpoison_vmalloc(mem, size);

	if (vmap_page_range(addr, addr + size, prot, pages) < 0) {
		vm_unmap_ram(mem, count);
		return NULL;
	}
	return mem;
}
EXPORT_SYMBOL(vm_map_ram);
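
/*
 * Illustrative sketch (an added example, not from the original file): the
 * short-lived mapping pattern vm_map_ram() is intended for. The "pages"
 * array and "count" are assumed to come from the caller.
 */
static void __maybe_unused example_map_ram(struct page **pages,
					   unsigned int count)
{
	void *mem;

	mem = vm_map_ram(pages, count, NUMA_NO_NODE, PAGE_KERNEL);
	if (!mem)
		return;

	/*
	 * ... use the linear mapping at "mem" ...
	 *
	 * The same "count" must be passed back; partial unmaps are not
	 * supported.
	 */
	vm_unmap_ram(mem, count);
}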

static struct vm_struct *vmlist __initdata;

/**
 * vm_area_add_early - add vmap area early during boot
 * @vm: vm_struct to add
 *
 * This function is used to add fixed kernel vm area to vmlist before
 * vmalloc_init() is called.  @vm->addr, @vm->size, and @vm->flags
 * should contain proper values and the other fields should be zero.
 *
 * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
 */
void __init vm_area_add_early(struct vm_struct *vm)
{
	struct vm_struct *tmp, **p;

	BUG_ON(vmap_initialized);
	for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
		if (tmp->addr >= vm->addr) {
			BUG_ON(tmp->addr < vm->addr + vm->size);
			break;
		} else
			BUG_ON(tmp->addr + tmp->size > vm->addr);
	}
	vm->next = *p;
	*p = vm;
}

/**
 * vm_area_register_early - register vmap area early during boot
 * @vm: vm_struct to register
 * @align: requested alignment
 *
 * This function is used to register kernel vm area before
 * vmalloc_init() is called.  @vm->size and @vm->flags should contain
 * proper values on entry and other fields should be zero.  On return,
 * vm->addr contains the allocated address.
 *
 * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
 */
void __init vm_area_register_early(struct vm_struct *vm, size_t align)
{
	static size_t vm_init_off __initdata;
	unsigned long addr;

	addr = ALIGN(VMALLOC_START + vm_init_off, align);
	vm_init_off = PFN_ALIGN(addr + vm->size) - VMALLOC_START;

	vm->addr = (void *)addr;

	vm_area_add_early(vm);
}

static void vmap_init_free_space(void)
{
	unsigned long vmap_start = 1;
	const unsigned long vmap_end = ULONG_MAX;
	struct vmap_area *busy, *free;

	/*
	 *     B     F     B     B     B     F
	 * -|-----|.....|-----|-----|-----|.....|-
	 *  |           The KVA space           |
	 *  |<--------------------------------->|
	 */
	list_for_each_entry(busy, &vmap_area_list, list) {
		if (busy->va_start - vmap_start > 0) {
			free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
			if (!WARN_ON_ONCE(!free)) {
				free->va_start = vmap_start;
				free->va_end = busy->va_start;

				insert_vmap_area_augment(free, NULL,
					&free_vmap_area_root,
						&free_vmap_area_list);
			}
		}

		vmap_start = busy->va_end;
	}

	if (vmap_end - vmap_start > 0) {
		free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
		if (!WARN_ON_ONCE(!free)) {
			free->va_start = vmap_start;
			free->va_end = vmap_end;

			insert_vmap_area_augment(free, NULL,
				&free_vmap_area_root,
					&free_vmap_area_list);
		}
	}
}

void __init vmalloc_init(void)
{
	struct vmap_area *va;
	struct vm_struct *tmp;
	int i;

	/*
	 * Create the cache for vmap_area objects.
	 */
	vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC);

	for_each_possible_cpu(i) {
		struct vmap_block_queue *vbq;
		struct vfree_deferred *p;

		vbq = &per_cpu(vmap_block_queue, i);
		spin_lock_init(&vbq->lock);
		INIT_LIST_HEAD(&vbq->free);
		p = &per_cpu(vfree_deferred, i);
		init_llist_head(&p->list);
		INIT_WORK(&p->wq, free_work);
	}

	/* Import existing vmlist entries. */
	for (tmp = vmlist; tmp; tmp = tmp->next) {
		va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
		if (WARN_ON_ONCE(!va))
			continue;

		va->va_start = (unsigned long)tmp->addr;
		va->va_end = va->va_start + tmp->size;
		va->vm = tmp;
		insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
	}

	/*
	 * Now we can initialize a free vmap space.
	 */
	vmap_init_free_space();
	vmap_initialized = true;
}
2008 */ 2009 int map_kernel_range_noflush(unsigned long addr, unsigned long size, 2010 pgprot_t prot, struct page **pages) 2011 { 2012 return vmap_page_range_noflush(addr, addr + size, prot, pages); 2013 } 2014 2015 /** 2016 * unmap_kernel_range_noflush - unmap kernel VM area 2017 * @addr: start of the VM area to unmap 2018 * @size: size of the VM area to unmap 2019 * 2020 * Unmap PFN_UP(@size) pages at @addr. The VM area @addr and @size 2021 * specify should have been allocated using get_vm_area() and its 2022 * friends. 2023 * 2024 * NOTE: 2025 * This function does NOT do any cache flushing. The caller is 2026 * responsible for calling flush_cache_vunmap() on to-be-mapped areas 2027 * before calling this function and flush_tlb_kernel_range() after. 2028 */ 2029 void unmap_kernel_range_noflush(unsigned long addr, unsigned long size) 2030 { 2031 vunmap_page_range(addr, addr + size); 2032 } 2033 EXPORT_SYMBOL_GPL(unmap_kernel_range_noflush); 2034 2035 /** 2036 * unmap_kernel_range - unmap kernel VM area and flush cache and TLB 2037 * @addr: start of the VM area to unmap 2038 * @size: size of the VM area to unmap 2039 * 2040 * Similar to unmap_kernel_range_noflush() but flushes vcache before 2041 * the unmapping and tlb after. 2042 */ 2043 void unmap_kernel_range(unsigned long addr, unsigned long size) 2044 { 2045 unsigned long end = addr + size; 2046 2047 flush_cache_vunmap(addr, end); 2048 vunmap_page_range(addr, end); 2049 flush_tlb_kernel_range(addr, end); 2050 } 2051 EXPORT_SYMBOL_GPL(unmap_kernel_range); 2052 2053 int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page **pages) 2054 { 2055 unsigned long addr = (unsigned long)area->addr; 2056 unsigned long end = addr + get_vm_area_size(area); 2057 int err; 2058 2059 err = vmap_page_range(addr, end, prot, pages); 2060 2061 return err > 0 ? 0 : err; 2062 } 2063 EXPORT_SYMBOL_GPL(map_vm_area); 2064 2065 static inline void setup_vmalloc_vm_locked(struct vm_struct *vm, 2066 struct vmap_area *va, unsigned long flags, const void *caller) 2067 { 2068 vm->flags = flags; 2069 vm->addr = (void *)va->va_start; 2070 vm->size = va->va_end - va->va_start; 2071 vm->caller = caller; 2072 va->vm = vm; 2073 } 2074 2075 static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, 2076 unsigned long flags, const void *caller) 2077 { 2078 spin_lock(&vmap_area_lock); 2079 setup_vmalloc_vm_locked(vm, va, flags, caller); 2080 spin_unlock(&vmap_area_lock); 2081 } 2082 2083 static void clear_vm_uninitialized_flag(struct vm_struct *vm) 2084 { 2085 /* 2086 * Before removing VM_UNINITIALIZED, 2087 * we should make sure that vm has proper values. 2088 * Pair with smp_rmb() in show_numa_info(). 
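 *
 * In sketch form the pairing is:
 *
 *	writer (here):             set up vm fields; smp_wmb(); clear VM_UNINITIALIZED
 *	reader (show_numa_info()): check VM_UNINITIALIZED; smp_rmb(); read vm fields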
2089 */ 2090 smp_wmb(); 2091 vm->flags &= ~VM_UNINITIALIZED; 2092 } 2093 2094 static struct vm_struct *__get_vm_area_node(unsigned long size, 2095 unsigned long align, unsigned long flags, unsigned long start, 2096 unsigned long end, int node, gfp_t gfp_mask, const void *caller) 2097 { 2098 struct vmap_area *va; 2099 struct vm_struct *area; 2100 unsigned long requested_size = size; 2101 2102 BUG_ON(in_interrupt()); 2103 size = PAGE_ALIGN(size); 2104 if (unlikely(!size)) 2105 return NULL; 2106 2107 if (flags & VM_IOREMAP) 2108 align = 1ul << clamp_t(int, get_count_order_long(size), 2109 PAGE_SHIFT, IOREMAP_MAX_ORDER); 2110 2111 area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node); 2112 if (unlikely(!area)) 2113 return NULL; 2114 2115 if (!(flags & VM_NO_GUARD)) 2116 size += PAGE_SIZE; 2117 2118 va = alloc_vmap_area(size, align, start, end, node, gfp_mask); 2119 if (IS_ERR(va)) { 2120 kfree(area); 2121 return NULL; 2122 } 2123 2124 kasan_unpoison_vmalloc((void *)va->va_start, requested_size); 2125 2126 setup_vmalloc_vm(area, va, flags, caller); 2127 2128 return area; 2129 } 2130
2131 struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, 2132 unsigned long start, unsigned long end) 2133 { 2134 return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE, 2135 GFP_KERNEL, __builtin_return_address(0)); 2136 } 2137 EXPORT_SYMBOL_GPL(__get_vm_area); 2138 2139 struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, 2140 unsigned long start, unsigned long end, 2141 const void *caller) 2142 { 2143 return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE, 2144 GFP_KERNEL, caller); 2145 } 2146
2147 /** 2148 * get_vm_area - reserve a contiguous kernel virtual area 2149 * @size: size of the area 2150 * @flags: %VM_IOREMAP for I/O mappings or %VM_ALLOC 2151 * 2152 * Search for an area of @size in the kernel virtual mapping area, 2153 * and reserve it for our purposes. Returns the area descriptor 2154 * on success or %NULL on failure. 2155 * 2156 * Return: the area descriptor on success or %NULL on failure. 2157 */ 2158 struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) 2159 { 2160 return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, 2161 NUMA_NO_NODE, GFP_KERNEL, 2162 __builtin_return_address(0)); 2163 } 2164 2165 struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, 2166 const void *caller) 2167 { 2168 return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, 2169 NUMA_NO_NODE, GFP_KERNEL, caller); 2170 } 2171
2172 /** 2173 * find_vm_area - find a contiguous kernel virtual area 2174 * @addr: base address 2175 * 2176 * Search for the kernel VM area starting at @addr, and return it. 2177 * It is up to the caller to do all required locking to keep the returned 2178 * pointer valid. 2179 * 2180 * Return: pointer to the found area or %NULL on failure 2181 */ 2182 struct vm_struct *find_vm_area(const void *addr) 2183 { 2184 struct vmap_area *va; 2185 2186 va = find_vmap_area((unsigned long)addr); 2187 if (!va) 2188 return NULL; 2189 2190 return va->vm; 2191 } 2192
2193 /** 2194 * remove_vm_area - find and remove a contiguous kernel virtual area 2195 * @addr: base address 2196 * 2197 * Search for the kernel VM area starting at @addr, and remove it. 2198 * This function returns the found VM area, but using it is NOT safe 2199 * on SMP machines, except for its size or flags.
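 *
 * For reference, the usual calling pattern is what free_vm_area() later in
 * this file does:
 *
 *	struct vm_struct *vm = remove_vm_area(area->addr);
 *
 *	BUG_ON(vm != area);
 *	kfree(vm);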
2200 * 2201 * Return: pointer to the found area or %NULL on faulure 2202 */ 2203 struct vm_struct *remove_vm_area(const void *addr) 2204 { 2205 struct vmap_area *va; 2206 2207 might_sleep(); 2208 2209 spin_lock(&vmap_area_lock); 2210 va = __find_vmap_area((unsigned long)addr); 2211 if (va && va->vm) { 2212 struct vm_struct *vm = va->vm; 2213 2214 va->vm = NULL; 2215 spin_unlock(&vmap_area_lock); 2216 2217 kasan_free_shadow(vm); 2218 free_unmap_vmap_area(va); 2219 2220 return vm; 2221 } 2222 2223 spin_unlock(&vmap_area_lock); 2224 return NULL; 2225 } 2226 2227 static inline void set_area_direct_map(const struct vm_struct *area, 2228 int (*set_direct_map)(struct page *page)) 2229 { 2230 int i; 2231 2232 for (i = 0; i < area->nr_pages; i++) 2233 if (page_address(area->pages[i])) 2234 set_direct_map(area->pages[i]); 2235 } 2236 2237 /* Handle removing and resetting vm mappings related to the vm_struct. */ 2238 static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages) 2239 { 2240 unsigned long start = ULONG_MAX, end = 0; 2241 int flush_reset = area->flags & VM_FLUSH_RESET_PERMS; 2242 int flush_dmap = 0; 2243 int i; 2244 2245 remove_vm_area(area->addr); 2246 2247 /* If this is not VM_FLUSH_RESET_PERMS memory, no need for the below. */ 2248 if (!flush_reset) 2249 return; 2250 2251 /* 2252 * If not deallocating pages, just do the flush of the VM area and 2253 * return. 2254 */ 2255 if (!deallocate_pages) { 2256 vm_unmap_aliases(); 2257 return; 2258 } 2259 2260 /* 2261 * If execution gets here, flush the vm mapping and reset the direct 2262 * map. Find the start and end range of the direct mappings to make sure 2263 * the vm_unmap_aliases() flush includes the direct map. 2264 */ 2265 for (i = 0; i < area->nr_pages; i++) { 2266 unsigned long addr = (unsigned long)page_address(area->pages[i]); 2267 if (addr) { 2268 start = min(addr, start); 2269 end = max(addr + PAGE_SIZE, end); 2270 flush_dmap = 1; 2271 } 2272 } 2273 2274 /* 2275 * Set direct map to something invalid so that it won't be cached if 2276 * there are any accesses after the TLB flush, then flush the TLB and 2277 * reset the direct map permissions to the default. 2278 */ 2279 set_area_direct_map(area, set_direct_map_invalid_noflush); 2280 _vm_unmap_aliases(start, end, flush_dmap); 2281 set_area_direct_map(area, set_direct_map_default_noflush); 2282 } 2283 2284 static void __vunmap(const void *addr, int deallocate_pages) 2285 { 2286 struct vm_struct *area; 2287 2288 if (!addr) 2289 return; 2290 2291 if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n", 2292 addr)) 2293 return; 2294 2295 area = find_vm_area(addr); 2296 if (unlikely(!area)) { 2297 WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", 2298 addr); 2299 return; 2300 } 2301 2302 debug_check_no_locks_freed(area->addr, get_vm_area_size(area)); 2303 debug_check_no_obj_freed(area->addr, get_vm_area_size(area)); 2304 2305 kasan_poison_vmalloc(area->addr, area->size); 2306 2307 vm_remove_mappings(area, deallocate_pages); 2308 2309 if (deallocate_pages) { 2310 int i; 2311 2312 for (i = 0; i < area->nr_pages; i++) { 2313 struct page *page = area->pages[i]; 2314 2315 BUG_ON(!page); 2316 __free_pages(page, 0); 2317 } 2318 atomic_long_sub(area->nr_pages, &nr_vmalloc_pages); 2319 2320 kvfree(area->pages); 2321 } 2322 2323 kfree(area); 2324 return; 2325 } 2326 2327 static inline void __vfree_deferred(const void *addr) 2328 { 2329 /* 2330 * Use raw_cpu_ptr() because this can be called from preemptible 2331 * context. 
Preemption is absolutely fine here, because the llist_add() 2332 * implementation is lockless, so it works even if we are adding to 2333 * another cpu's list. schedule_work() should be fine with this too. 2334 */ 2335 struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred); 2336 2337 if (llist_add((struct llist_node *)addr, &p->list)) 2338 schedule_work(&p->wq); 2339 } 2340
2341 /** 2342 * vfree_atomic - release memory allocated by vmalloc() 2343 * @addr: memory base address 2344 * 2345 * This one is just like vfree() but can be called in any atomic context 2346 * except NMIs. 2347 */ 2348 void vfree_atomic(const void *addr) 2349 { 2350 BUG_ON(in_nmi()); 2351 2352 kmemleak_free(addr); 2353 2354 if (!addr) 2355 return; 2356 __vfree_deferred(addr); 2357 } 2358 2359 static void __vfree(const void *addr) 2360 { 2361 if (unlikely(in_interrupt())) 2362 __vfree_deferred(addr); 2363 else 2364 __vunmap(addr, 1); 2365 } 2366
2367 /** 2368 * vfree - release memory allocated by vmalloc() 2369 * @addr: memory base address 2370 * 2371 * Free the virtually contiguous memory area starting at @addr, as 2372 * obtained from vmalloc(), vmalloc_32() or __vmalloc(). If @addr is 2373 * NULL, no operation is performed. 2374 * 2375 * Must not be called in NMI context (strictly speaking, only if we don't 2376 * have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling 2377 * conventions for vfree() arch-dependent would be a really bad idea). 2378 * 2379 * May sleep if called *not* from interrupt context. 2380 * 2381 * NOTE: assumes that the object at @addr has a size >= sizeof(llist_node) 2382 */ 2383 void vfree(const void *addr) 2384 { 2385 BUG_ON(in_nmi()); 2386 2387 kmemleak_free(addr); 2388 2389 might_sleep_if(!in_interrupt()); 2390 2391 if (!addr) 2392 return; 2393 2394 __vfree(addr); 2395 } 2396 EXPORT_SYMBOL(vfree); 2397
2398 /** 2399 * vunmap - release virtual mapping obtained by vmap() 2400 * @addr: memory base address 2401 * 2402 * Free the virtually contiguous memory area starting at @addr, 2403 * which was created from the page array passed to vmap(). 2404 * 2405 * Must not be called in interrupt context. 2406 */ 2407 void vunmap(const void *addr) 2408 { 2409 BUG_ON(in_interrupt()); 2410 might_sleep(); 2411 if (addr) 2412 __vunmap(addr, 0); 2413 } 2414 EXPORT_SYMBOL(vunmap); 2415
2416 /** 2417 * vmap - map an array of pages into virtually contiguous space 2418 * @pages: array of page pointers 2419 * @count: number of pages to map 2420 * @flags: vm_area->flags 2421 * @prot: page protection for the mapping 2422 * 2423 * Maps @count pages from @pages into contiguous kernel virtual 2424 * space.
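 *
 * A hedged usage sketch (page allocation and error handling trimmed, names
 * are illustrative):
 *
 *	struct page *pages[NR_DEMO_PAGES];
 *	void *va;
 *
 *	... fill pages[] with alloc_page(GFP_KERNEL) ...
 *	va = vmap(pages, NR_DEMO_PAGES, VM_MAP, PAGE_KERNEL);
 *	if (!va)
 *		return -ENOMEM;
 *	... use the mapping ...
 *	vunmap(va);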
2425 * 2426 * Return: the address of the area or %NULL on failure 2427 */ 2428 void *vmap(struct page **pages, unsigned int count, 2429 unsigned long flags, pgprot_t prot) 2430 { 2431 struct vm_struct *area; 2432 unsigned long size; /* In bytes */ 2433 2434 might_sleep(); 2435 2436 if (count > totalram_pages()) 2437 return NULL; 2438 2439 size = (unsigned long)count << PAGE_SHIFT; 2440 area = get_vm_area_caller(size, flags, __builtin_return_address(0)); 2441 if (!area) 2442 return NULL; 2443 2444 if (map_vm_area(area, prot, pages)) { 2445 vunmap(area->addr); 2446 return NULL; 2447 } 2448 2449 return area->addr; 2450 } 2451 EXPORT_SYMBOL(vmap); 2452 2453 static void *__vmalloc_node(unsigned long size, unsigned long align, 2454 gfp_t gfp_mask, pgprot_t prot, 2455 int node, const void *caller); 2456 static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, 2457 pgprot_t prot, int node) 2458 { 2459 struct page **pages; 2460 unsigned int nr_pages, array_size, i; 2461 const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; 2462 const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN; 2463 const gfp_t highmem_mask = (gfp_mask & (GFP_DMA | GFP_DMA32)) ? 2464 0 : 2465 __GFP_HIGHMEM; 2466 2467 nr_pages = get_vm_area_size(area) >> PAGE_SHIFT; 2468 array_size = (nr_pages * sizeof(struct page *)); 2469 2470 /* Please note that the recursion is strictly bounded. */ 2471 if (array_size > PAGE_SIZE) { 2472 pages = __vmalloc_node(array_size, 1, nested_gfp|highmem_mask, 2473 PAGE_KERNEL, node, area->caller); 2474 } else { 2475 pages = kmalloc_node(array_size, nested_gfp, node); 2476 } 2477 2478 if (!pages) { 2479 remove_vm_area(area->addr); 2480 kfree(area); 2481 return NULL; 2482 } 2483 2484 area->pages = pages; 2485 area->nr_pages = nr_pages; 2486 2487 for (i = 0; i < area->nr_pages; i++) { 2488 struct page *page; 2489 2490 if (node == NUMA_NO_NODE) 2491 page = alloc_page(alloc_mask|highmem_mask); 2492 else 2493 page = alloc_pages_node(node, alloc_mask|highmem_mask, 0); 2494 2495 if (unlikely(!page)) { 2496 /* Successfully allocated i pages, free them in __vunmap() */ 2497 area->nr_pages = i; 2498 atomic_long_add(area->nr_pages, &nr_vmalloc_pages); 2499 goto fail; 2500 } 2501 area->pages[i] = page; 2502 if (gfpflags_allow_blocking(gfp_mask)) 2503 cond_resched(); 2504 } 2505 atomic_long_add(area->nr_pages, &nr_vmalloc_pages); 2506 2507 if (map_vm_area(area, prot, pages)) 2508 goto fail; 2509 return area->addr; 2510 2511 fail: 2512 warn_alloc(gfp_mask, NULL, 2513 "vmalloc: allocation failure, allocated %ld of %ld bytes", 2514 (area->nr_pages*PAGE_SIZE), area->size); 2515 __vfree(area->addr); 2516 return NULL; 2517 } 2518 2519 /** 2520 * __vmalloc_node_range - allocate virtually contiguous memory 2521 * @size: allocation size 2522 * @align: desired alignment 2523 * @start: vm area range start 2524 * @end: vm area range end 2525 * @gfp_mask: flags for the page level allocator 2526 * @prot: protection mask for the allocated pages 2527 * @vm_flags: additional vm area flags (e.g. %VM_NO_GUARD) 2528 * @node: node to use for allocation or NUMA_NO_NODE 2529 * @caller: caller's return address 2530 * 2531 * Allocate enough pages to cover @size from the page level 2532 * allocator with @gfp_mask flags. Map them into contiguous 2533 * kernel virtual space, using a pagetable protection of @prot. 
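 *
 * For a concrete caller in this file, see vmalloc_exec() below, which boils
 * down to:
 *
 *	__vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
 *			GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS,
 *			NUMA_NO_NODE, __builtin_return_address(0));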
2534 * 2535 * Return: the address of the area or %NULL on failure 2536 */ 2537 void *__vmalloc_node_range(unsigned long size, unsigned long align, 2538 unsigned long start, unsigned long end, gfp_t gfp_mask, 2539 pgprot_t prot, unsigned long vm_flags, int node, 2540 const void *caller) 2541 { 2542 struct vm_struct *area; 2543 void *addr; 2544 unsigned long real_size = size; 2545 2546 size = PAGE_ALIGN(size); 2547 if (!size || (size >> PAGE_SHIFT) > totalram_pages()) 2548 goto fail; 2549 2550 area = __get_vm_area_node(real_size, align, VM_ALLOC | VM_UNINITIALIZED | 2551 vm_flags, start, end, node, gfp_mask, caller); 2552 if (!area) 2553 goto fail; 2554 2555 addr = __vmalloc_area_node(area, gfp_mask, prot, node); 2556 if (!addr) 2557 return NULL; 2558 2559 /* 2560 * In this function, newly allocated vm_struct has VM_UNINITIALIZED 2561 * flag. It means that vm_struct is not fully initialized. 2562 * Now, it is fully initialized, so remove this flag here. 2563 */ 2564 clear_vm_uninitialized_flag(area); 2565 2566 kmemleak_vmalloc(area, size, gfp_mask); 2567 2568 return addr; 2569 2570 fail: 2571 warn_alloc(gfp_mask, NULL, 2572 "vmalloc: allocation failure: %lu bytes", real_size); 2573 return NULL; 2574 } 2575 2576 /* 2577 * This is only for performance analysis of vmalloc and stress purpose. 2578 * It is required by vmalloc test module, therefore do not use it other 2579 * than that. 2580 */ 2581 #ifdef CONFIG_TEST_VMALLOC_MODULE 2582 EXPORT_SYMBOL_GPL(__vmalloc_node_range); 2583 #endif 2584 2585 /** 2586 * __vmalloc_node - allocate virtually contiguous memory 2587 * @size: allocation size 2588 * @align: desired alignment 2589 * @gfp_mask: flags for the page level allocator 2590 * @prot: protection mask for the allocated pages 2591 * @node: node to use for allocation or NUMA_NO_NODE 2592 * @caller: caller's return address 2593 * 2594 * Allocate enough pages to cover @size from the page level 2595 * allocator with @gfp_mask flags. Map them into contiguous 2596 * kernel virtual space, using a pagetable protection of @prot. 2597 * 2598 * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL 2599 * and __GFP_NOFAIL are not supported 2600 * 2601 * Any use of gfp flags outside of GFP_KERNEL should be consulted 2602 * with mm people. 2603 * 2604 * Return: pointer to the allocated memory or %NULL on error 2605 */ 2606 static void *__vmalloc_node(unsigned long size, unsigned long align, 2607 gfp_t gfp_mask, pgprot_t prot, 2608 int node, const void *caller) 2609 { 2610 return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, 2611 gfp_mask, prot, 0, node, caller); 2612 } 2613 2614 void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) 2615 { 2616 return __vmalloc_node(size, 1, gfp_mask, prot, NUMA_NO_NODE, 2617 __builtin_return_address(0)); 2618 } 2619 EXPORT_SYMBOL(__vmalloc); 2620 2621 static inline void *__vmalloc_node_flags(unsigned long size, 2622 int node, gfp_t flags) 2623 { 2624 return __vmalloc_node(size, 1, flags, PAGE_KERNEL, 2625 node, __builtin_return_address(0)); 2626 } 2627 2628 2629 void *__vmalloc_node_flags_caller(unsigned long size, int node, gfp_t flags, 2630 void *caller) 2631 { 2632 return __vmalloc_node(size, 1, flags, PAGE_KERNEL, node, caller); 2633 } 2634 2635 /** 2636 * vmalloc - allocate virtually contiguous memory 2637 * @size: allocation size 2638 * 2639 * Allocate enough pages to cover @size from the page level 2640 * allocator and map them into contiguous kernel virtual space. 
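 *
 * A minimal usage sketch (illustrative only):
 *
 *	void *buf = vmalloc(size);
 *
 *	if (!buf)
 *		return -ENOMEM;
 *	... use buf ...
 *	vfree(buf);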
2641 * 2642 * For tight control over page level allocator and protection flags 2643 * use __vmalloc() instead. 2644 * 2645 * Return: pointer to the allocated memory or %NULL on error 2646 */ 2647 void *vmalloc(unsigned long size) 2648 { 2649 return __vmalloc_node_flags(size, NUMA_NO_NODE, 2650 GFP_KERNEL); 2651 } 2652 EXPORT_SYMBOL(vmalloc); 2653 2654 /** 2655 * vzalloc - allocate virtually contiguous memory with zero fill 2656 * @size: allocation size 2657 * 2658 * Allocate enough pages to cover @size from the page level 2659 * allocator and map them into contiguous kernel virtual space. 2660 * The memory allocated is set to zero. 2661 * 2662 * For tight control over page level allocator and protection flags 2663 * use __vmalloc() instead. 2664 * 2665 * Return: pointer to the allocated memory or %NULL on error 2666 */ 2667 void *vzalloc(unsigned long size) 2668 { 2669 return __vmalloc_node_flags(size, NUMA_NO_NODE, 2670 GFP_KERNEL | __GFP_ZERO); 2671 } 2672 EXPORT_SYMBOL(vzalloc); 2673 2674 /** 2675 * vmalloc_user - allocate zeroed virtually contiguous memory for userspace 2676 * @size: allocation size 2677 * 2678 * The resulting memory area is zeroed so it can be mapped to userspace 2679 * without leaking data. 2680 * 2681 * Return: pointer to the allocated memory or %NULL on error 2682 */ 2683 void *vmalloc_user(unsigned long size) 2684 { 2685 return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, 2686 GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL, 2687 VM_USERMAP, NUMA_NO_NODE, 2688 __builtin_return_address(0)); 2689 } 2690 EXPORT_SYMBOL(vmalloc_user); 2691 2692 /** 2693 * vmalloc_node - allocate memory on a specific node 2694 * @size: allocation size 2695 * @node: numa node 2696 * 2697 * Allocate enough pages to cover @size from the page level 2698 * allocator and map them into contiguous kernel virtual space. 2699 * 2700 * For tight control over page level allocator and protection flags 2701 * use __vmalloc() instead. 2702 * 2703 * Return: pointer to the allocated memory or %NULL on error 2704 */ 2705 void *vmalloc_node(unsigned long size, int node) 2706 { 2707 return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL, 2708 node, __builtin_return_address(0)); 2709 } 2710 EXPORT_SYMBOL(vmalloc_node); 2711 2712 /** 2713 * vzalloc_node - allocate memory on a specific node with zero fill 2714 * @size: allocation size 2715 * @node: numa node 2716 * 2717 * Allocate enough pages to cover @size from the page level 2718 * allocator and map them into contiguous kernel virtual space. 2719 * The memory allocated is set to zero. 2720 * 2721 * For tight control over page level allocator and protection flags 2722 * use __vmalloc_node() instead. 2723 * 2724 * Return: pointer to the allocated memory or %NULL on error 2725 */ 2726 void *vzalloc_node(unsigned long size, int node) 2727 { 2728 return __vmalloc_node_flags(size, node, 2729 GFP_KERNEL | __GFP_ZERO); 2730 } 2731 EXPORT_SYMBOL(vzalloc_node); 2732 2733 /** 2734 * vmalloc_user_node_flags - allocate memory for userspace on a specific node 2735 * @size: allocation size 2736 * @node: numa node 2737 * @flags: flags for the page level allocator 2738 * 2739 * The resulting memory area is zeroed so it can be mapped to userspace 2740 * without leaking data. 
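 *
 * A hedged sketch of the usual pairing with remap_vmalloc_range() in a
 * driver's mmap handler (names are illustrative):
 *
 *	buf = vmalloc_user_node_flags(len, nid, GFP_KERNEL);
 *	if (!buf)
 *		return -ENOMEM;
 *	...
 *	err = remap_vmalloc_range(vma, buf, 0);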
2741 * 2742 * Return: pointer to the allocated memory or %NULL on error 2743 */ 2744 void *vmalloc_user_node_flags(unsigned long size, int node, gfp_t flags) 2745 { 2746 return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, 2747 flags | __GFP_ZERO, PAGE_KERNEL, 2748 VM_USERMAP, node, 2749 __builtin_return_address(0)); 2750 } 2751 EXPORT_SYMBOL(vmalloc_user_node_flags); 2752
2753 /** 2754 * vmalloc_exec - allocate virtually contiguous, executable memory 2755 * @size: allocation size 2756 * 2757 * Kernel-internal function to allocate enough pages to cover @size from 2758 * the page level allocator and map them into contiguous and 2759 * executable kernel virtual space. 2760 * 2761 * For tight control over page level allocator and protection flags 2762 * use __vmalloc() instead. 2763 * 2764 * Return: pointer to the allocated memory or %NULL on error 2765 */ 2766 void *vmalloc_exec(unsigned long size) 2767 { 2768 return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, 2769 GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS, 2770 NUMA_NO_NODE, __builtin_return_address(0)); 2771 } 2772
2773 #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) 2774 #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL) 2775 #elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA) 2776 #define GFP_VMALLOC32 (GFP_DMA | GFP_KERNEL) 2777 #else 2778 /* 2779 * 64b systems should always have either DMA or DMA32 zones. For others 2780 * GFP_DMA32 should do the right thing and use the normal zone. 2781 */ 2782 #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL) 2783 #endif 2784
2785 /** 2786 * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) 2787 * @size: allocation size 2788 * 2789 * Allocate enough 32bit PA addressable pages to cover @size from the 2790 * page level allocator and map them into contiguous kernel virtual space. 2791 * 2792 * Return: pointer to the allocated memory or %NULL on error 2793 */ 2794 void *vmalloc_32(unsigned long size) 2795 { 2796 return __vmalloc_node(size, 1, GFP_VMALLOC32, PAGE_KERNEL, 2797 NUMA_NO_NODE, __builtin_return_address(0)); 2798 } 2799 EXPORT_SYMBOL(vmalloc_32); 2800
2801 /** 2802 * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory 2803 * @size: allocation size 2804 * 2805 * The resulting memory area is 32bit addressable and zeroed so it can be 2806 * mapped to userspace without leaking data. 2807 * 2808 * Return: pointer to the allocated memory or %NULL on error 2809 */ 2810 void *vmalloc_32_user(unsigned long size) 2811 { 2812 return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, 2813 GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, 2814 VM_USERMAP, NUMA_NO_NODE, 2815 __builtin_return_address(0)); 2816 } 2817 EXPORT_SYMBOL(vmalloc_32_user); 2818
2819 /* 2820 * Small helper routine: copy contents to buf from addr. 2821 * If the page is not present, fill zero. 2822 */ 2823 2824 static int aligned_vread(char *buf, char *addr, unsigned long count) 2825 { 2826 struct page *p; 2827 int copied = 0; 2828 2829 while (count) { 2830 unsigned long offset, length; 2831 2832 offset = offset_in_page(addr); 2833 length = PAGE_SIZE - offset; 2834 if (length > count) 2835 length = count; 2836 p = vmalloc_to_page(addr); 2837 /* 2838 * To do safe access to this _mapped_ area, we need a 2839 * lock. But adding a lock here means that we need to add the 2840 * overhead of vmalloc()/vfree() calls for this _debug_ 2841 * interface, rarely used. Instead of that, we'll use 2842 * kmap() and get small overhead in this access function.
2843 */ 2844 if (p) { 2845 /* 2846 * we can expect USER0 is not used (see vread/vwrite's 2847 * function description) 2848 */ 2849 void *map = kmap_atomic(p); 2850 memcpy(buf, map + offset, length); 2851 kunmap_atomic(map); 2852 } else 2853 memset(buf, 0, length); 2854 2855 addr += length; 2856 buf += length; 2857 copied += length; 2858 count -= length; 2859 } 2860 return copied; 2861 } 2862
2863 static int aligned_vwrite(char *buf, char *addr, unsigned long count) 2864 { 2865 struct page *p; 2866 int copied = 0; 2867 2868 while (count) { 2869 unsigned long offset, length; 2870 2871 offset = offset_in_page(addr); 2872 length = PAGE_SIZE - offset; 2873 if (length > count) 2874 length = count; 2875 p = vmalloc_to_page(addr); 2876 /* 2877 * To do safe access to this _mapped_ area, we need a 2878 * lock. But adding a lock here means that we need to add the 2879 * overhead of vmalloc()/vfree() calls for this _debug_ 2880 * interface, rarely used. Instead of that, we'll use 2881 * kmap() and get small overhead in this access function. 2882 */ 2883 if (p) { 2884 /* 2885 * we can expect USER0 is not used (see vread/vwrite's 2886 * function description) 2887 */ 2888 void *map = kmap_atomic(p); 2889 memcpy(map + offset, buf, length); 2890 kunmap_atomic(map); 2891 } 2892 addr += length; 2893 buf += length; 2894 copied += length; 2895 count -= length; 2896 } 2897 return copied; 2898 } 2899
2900 /** 2901 * vread() - read vmalloc area in a safe way. 2902 * @buf: buffer for reading data 2903 * @addr: vm address. 2904 * @count: number of bytes to be read. 2905 * 2906 * This function checks that addr is a valid vmalloc'ed area, and 2907 * copies data from that area to a given buffer. If the given memory range 2908 * of [addr...addr+count) includes some valid address, data is copied to 2909 * proper area of @buf. If there are memory holes, they'll be zero-filled. 2910 * IOREMAP area is treated as memory hole and no copy is done. 2911 * 2912 * If [addr...addr+count) doesn't include any intersection with a live 2913 * vm_struct area, returns 0. @buf should be kernel's buffer. 2914 * 2915 * Note: In usual ops, vread() is never necessary because the caller 2916 * should know vmalloc() area is valid and can use memcpy(). 2917 * This is for routines which have to access vmalloc area without 2918 * any information, such as /dev/kmem.
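 *
 * A hedged usage sketch (illustrative; addr is assumed to point into a
 * live vmalloc area):
 *
 *	char kbuf[256];
 *	long n;
 *
 *	n = vread(kbuf, addr, sizeof(kbuf));
 *	if (n == 0)
 *		... the range intersected no vmalloc area at all ...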
2919 * 2920 * Return: number of bytes for which addr and buf should be increased 2921 * (same number as @count) or %0 if [addr...addr+count) doesn't 2922 * include any intersection with valid vmalloc area 2923 */ 2924 long vread(char *buf, char *addr, unsigned long count) 2925 { 2926 struct vmap_area *va; 2927 struct vm_struct *vm; 2928 char *vaddr, *buf_start = buf; 2929 unsigned long buflen = count; 2930 unsigned long n; 2931 2932 /* Don't allow overflow */ 2933 if ((unsigned long) addr + count < count) 2934 count = -(unsigned long) addr; 2935 2936 spin_lock(&vmap_area_lock); 2937 list_for_each_entry(va, &vmap_area_list, list) { 2938 if (!count) 2939 break; 2940 2941 if (!va->vm) 2942 continue; 2943 2944 vm = va->vm; 2945 vaddr = (char *) vm->addr; 2946 if (addr >= vaddr + get_vm_area_size(vm)) 2947 continue; 2948 while (addr < vaddr) { 2949 if (count == 0) 2950 goto finished; 2951 *buf = '\0'; 2952 buf++; 2953 addr++; 2954 count--; 2955 } 2956 n = vaddr + get_vm_area_size(vm) - addr; 2957 if (n > count) 2958 n = count; 2959 if (!(vm->flags & VM_IOREMAP)) 2960 aligned_vread(buf, addr, n); 2961 else /* IOREMAP area is treated as memory hole */ 2962 memset(buf, 0, n); 2963 buf += n; 2964 addr += n; 2965 count -= n; 2966 } 2967 finished: 2968 spin_unlock(&vmap_area_lock); 2969 2970 if (buf == buf_start) 2971 return 0; 2972 /* zero-fill memory holes */ 2973 if (buf != buf_start + buflen) 2974 memset(buf, 0, buflen - (buf - buf_start)); 2975 2976 return buflen; 2977 } 2978
2979 /** 2980 * vwrite() - write vmalloc area in a safe way. 2981 * @buf: buffer for source data 2982 * @addr: vm address. 2983 * @count: number of bytes to be written. 2984 * 2985 * This function checks that addr is a valid vmalloc'ed area, and 2986 * copies data from a buffer to the given addr. If the specified range of 2987 * [addr...addr+count) includes some valid address, data is copied from 2988 * proper area of @buf. If there are memory holes, no data is copied into them. 2989 * IOREMAP area is treated as memory hole and no copy is done. 2990 * 2991 * If [addr...addr+count) doesn't include any intersection with a live 2992 * vm_struct area, returns 0. @buf should be kernel's buffer. 2993 * 2994 * Note: In usual ops, vwrite() is never necessary because the caller 2995 * should know vmalloc() area is valid and can use memcpy(). 2996 * This is for routines which have to access vmalloc area without 2997 * any information, such as /dev/kmem.
2998 * 2999 * Return: number of bytes for which addr and buf should be 3000 * increased (same number as @count) or %0 if [addr...addr+count) 3001 * doesn't include any intersection with valid vmalloc area 3002 */ 3003 long vwrite(char *buf, char *addr, unsigned long count) 3004 { 3005 struct vmap_area *va; 3006 struct vm_struct *vm; 3007 char *vaddr; 3008 unsigned long n, buflen; 3009 int copied = 0; 3010 3011 /* Don't allow overflow */ 3012 if ((unsigned long) addr + count < count) 3013 count = -(unsigned long) addr; 3014 buflen = count; 3015 3016 spin_lock(&vmap_area_lock); 3017 list_for_each_entry(va, &vmap_area_list, list) { 3018 if (!count) 3019 break; 3020 3021 if (!va->vm) 3022 continue; 3023 3024 vm = va->vm; 3025 vaddr = (char *) vm->addr; 3026 if (addr >= vaddr + get_vm_area_size(vm)) 3027 continue; 3028 while (addr < vaddr) { 3029 if (count == 0) 3030 goto finished; 3031 buf++; 3032 addr++; 3033 count--; 3034 } 3035 n = vaddr + get_vm_area_size(vm) - addr; 3036 if (n > count) 3037 n = count; 3038 if (!(vm->flags & VM_IOREMAP)) { 3039 aligned_vwrite(buf, addr, n); 3040 copied++; 3041 } 3042 buf += n; 3043 addr += n; 3044 count -= n; 3045 } 3046 finished: 3047 spin_unlock(&vmap_area_lock); 3048 if (!copied) 3049 return 0; 3050 return buflen; 3051 } 3052 3053 /** 3054 * remap_vmalloc_range_partial - map vmalloc pages to userspace 3055 * @vma: vma to cover 3056 * @uaddr: target user address to start at 3057 * @kaddr: virtual address of vmalloc kernel memory 3058 * @pgoff: offset from @kaddr to start at 3059 * @size: size of map area 3060 * 3061 * Returns: 0 for success, -Exxx on failure 3062 * 3063 * This function checks that @kaddr is a valid vmalloc'ed area, 3064 * and that it is big enough to cover the range starting at 3065 * @uaddr in @vma. Will return failure if that criteria isn't 3066 * met. 3067 * 3068 * Similar to remap_pfn_range() (see mm/memory.c) 3069 */ 3070 int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr, 3071 void *kaddr, unsigned long pgoff, 3072 unsigned long size) 3073 { 3074 struct vm_struct *area; 3075 unsigned long off; 3076 unsigned long end_index; 3077 3078 if (check_shl_overflow(pgoff, PAGE_SHIFT, &off)) 3079 return -EINVAL; 3080 3081 size = PAGE_ALIGN(size); 3082 3083 if (!PAGE_ALIGNED(uaddr) || !PAGE_ALIGNED(kaddr)) 3084 return -EINVAL; 3085 3086 area = find_vm_area(kaddr); 3087 if (!area) 3088 return -EINVAL; 3089 3090 if (!(area->flags & (VM_USERMAP | VM_DMA_COHERENT))) 3091 return -EINVAL; 3092 3093 if (check_add_overflow(size, off, &end_index) || 3094 end_index > get_vm_area_size(area)) 3095 return -EINVAL; 3096 kaddr += off; 3097 3098 do { 3099 struct page *page = vmalloc_to_page(kaddr); 3100 int ret; 3101 3102 ret = vm_insert_page(vma, uaddr, page); 3103 if (ret) 3104 return ret; 3105 3106 uaddr += PAGE_SIZE; 3107 kaddr += PAGE_SIZE; 3108 size -= PAGE_SIZE; 3109 } while (size > 0); 3110 3111 vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; 3112 3113 return 0; 3114 } 3115 EXPORT_SYMBOL(remap_vmalloc_range_partial); 3116 3117 /** 3118 * remap_vmalloc_range - map vmalloc pages to userspace 3119 * @vma: vma to cover (map full range of vma) 3120 * @addr: vmalloc memory 3121 * @pgoff: number of pages into addr before first page to map 3122 * 3123 * Returns: 0 for success, -Exxx on failure 3124 * 3125 * This function checks that addr is a valid vmalloc'ed area, and 3126 * that it is big enough to cover the vma. Will return failure if 3127 * that criteria isn't met. 
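 *
 * A hedged sketch of a driver mmap handler (demo_buf is an illustrative
 * buffer that must have been allocated with vmalloc_user() or another
 * helper that sets VM_USERMAP):
 *
 *	static int demo_mmap(struct file *file, struct vm_area_struct *vma)
 *	{
 *		return remap_vmalloc_range(vma, demo_buf, 0);
 *	}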
3128 * 3129 * Similar to remap_pfn_range() (see mm/memory.c) 3130 */ 3131 int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, 3132 unsigned long pgoff) 3133 { 3134 return remap_vmalloc_range_partial(vma, vma->vm_start, 3135 addr, pgoff, 3136 vma->vm_end - vma->vm_start); 3137 } 3138 EXPORT_SYMBOL(remap_vmalloc_range); 3139 3140 /* 3141 * Implement stubs for vmalloc_sync_[un]mappings () if the architecture chose 3142 * not to have one. 3143 * 3144 * The purpose of this function is to make sure the vmalloc area 3145 * mappings are identical in all page-tables in the system. 3146 */ 3147 void __weak vmalloc_sync_mappings(void) 3148 { 3149 } 3150 3151 void __weak vmalloc_sync_unmappings(void) 3152 { 3153 } 3154 3155 static int f(pte_t *pte, unsigned long addr, void *data) 3156 { 3157 pte_t ***p = data; 3158 3159 if (p) { 3160 *(*p) = pte; 3161 (*p)++; 3162 } 3163 return 0; 3164 } 3165 3166 /** 3167 * alloc_vm_area - allocate a range of kernel address space 3168 * @size: size of the area 3169 * @ptes: returns the PTEs for the address space 3170 * 3171 * Returns: NULL on failure, vm_struct on success 3172 * 3173 * This function reserves a range of kernel address space, and 3174 * allocates pagetables to map that range. No actual mappings 3175 * are created. 3176 * 3177 * If @ptes is non-NULL, pointers to the PTEs (in init_mm) 3178 * allocated for the VM area are returned. 3179 */ 3180 struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes) 3181 { 3182 struct vm_struct *area; 3183 3184 area = get_vm_area_caller(size, VM_IOREMAP, 3185 __builtin_return_address(0)); 3186 if (area == NULL) 3187 return NULL; 3188 3189 /* 3190 * This ensures that page tables are constructed for this region 3191 * of kernel virtual address space and mapped into init_mm. 3192 */ 3193 if (apply_to_page_range(&init_mm, (unsigned long)area->addr, 3194 size, f, ptes ? &ptes : NULL)) { 3195 free_vm_area(area); 3196 return NULL; 3197 } 3198 3199 return area; 3200 } 3201 EXPORT_SYMBOL_GPL(alloc_vm_area); 3202 3203 void free_vm_area(struct vm_struct *area) 3204 { 3205 struct vm_struct *ret; 3206 ret = remove_vm_area(area->addr); 3207 BUG_ON(ret != area); 3208 kfree(area); 3209 } 3210 EXPORT_SYMBOL_GPL(free_vm_area); 3211 3212 #ifdef CONFIG_SMP 3213 static struct vmap_area *node_to_va(struct rb_node *n) 3214 { 3215 return rb_entry_safe(n, struct vmap_area, rb_node); 3216 } 3217 3218 /** 3219 * pvm_find_va_enclose_addr - find the vmap_area @addr belongs to 3220 * @addr: target address 3221 * 3222 * Returns: vmap_area if it is found. If there is no such area 3223 * the first highest(reverse order) vmap_area is returned 3224 * i.e. va->va_start < addr && va->va_end < addr or NULL 3225 * if there are no any areas before @addr. 3226 */ 3227 static struct vmap_area * 3228 pvm_find_va_enclose_addr(unsigned long addr) 3229 { 3230 struct vmap_area *va, *tmp; 3231 struct rb_node *n; 3232 3233 n = free_vmap_area_root.rb_node; 3234 va = NULL; 3235 3236 while (n) { 3237 tmp = rb_entry(n, struct vmap_area, rb_node); 3238 if (tmp->va_start <= addr) { 3239 va = tmp; 3240 if (tmp->va_end >= addr) 3241 break; 3242 3243 n = n->rb_right; 3244 } else { 3245 n = n->rb_left; 3246 } 3247 } 3248 3249 return va; 3250 } 3251 3252 /** 3253 * pvm_determine_end_from_reverse - find the highest aligned address 3254 * of free block below VMALLOC_END 3255 * @va: 3256 * in - the VA we start the search(reverse order); 3257 * out - the VA with the highest aligned end address. 
3258 * 3259 * Returns: determined end address within vmap_area 3260 */ 3261 static unsigned long 3262 pvm_determine_end_from_reverse(struct vmap_area **va, unsigned long align) 3263 { 3264 unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); 3265 unsigned long addr; 3266 3267 if (likely(*va)) { 3268 list_for_each_entry_from_reverse((*va), 3269 &free_vmap_area_list, list) { 3270 addr = min((*va)->va_end & ~(align - 1), vmalloc_end); 3271 if ((*va)->va_start < addr) 3272 return addr; 3273 } 3274 } 3275 3276 return 0; 3277 } 3278 3279 /** 3280 * pcpu_get_vm_areas - allocate vmalloc areas for percpu allocator 3281 * @offsets: array containing offset of each area 3282 * @sizes: array containing size of each area 3283 * @nr_vms: the number of areas to allocate 3284 * @align: alignment, all entries in @offsets and @sizes must be aligned to this 3285 * 3286 * Returns: kmalloc'd vm_struct pointer array pointing to allocated 3287 * vm_structs on success, %NULL on failure 3288 * 3289 * Percpu allocator wants to use congruent vm areas so that it can 3290 * maintain the offsets among percpu areas. This function allocates 3291 * congruent vmalloc areas for it with GFP_KERNEL. These areas tend to 3292 * be scattered pretty far, distance between two areas easily going up 3293 * to gigabytes. To avoid interacting with regular vmallocs, these 3294 * areas are allocated from top. 3295 * 3296 * Despite its complicated look, this allocator is rather simple. It 3297 * does everything top-down and scans free blocks from the end looking 3298 * for matching base. While scanning, if any of the areas do not fit the 3299 * base address is pulled down to fit the area. Scanning is repeated till 3300 * all the areas fit and then all necessary data structures are inserted 3301 * and the result is returned. 3302 */ 3303 struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, 3304 const size_t *sizes, int nr_vms, 3305 size_t align) 3306 { 3307 const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align); 3308 const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); 3309 struct vmap_area **vas, *va; 3310 struct vm_struct **vms; 3311 int area, area2, last_area, term_area; 3312 unsigned long base, start, size, end, last_end, orig_start, orig_end; 3313 bool purged = false; 3314 enum fit_type type; 3315 3316 /* verify parameters and allocate data structures */ 3317 BUG_ON(offset_in_page(align) || !is_power_of_2(align)); 3318 for (last_area = 0, area = 0; area < nr_vms; area++) { 3319 start = offsets[area]; 3320 end = start + sizes[area]; 3321 3322 /* is everything aligned properly? 
*/ 3323 BUG_ON(!IS_ALIGNED(offsets[area], align)); 3324 BUG_ON(!IS_ALIGNED(sizes[area], align)); 3325 3326 /* detect the area with the highest address */ 3327 if (start > offsets[last_area]) 3328 last_area = area; 3329 3330 for (area2 = area + 1; area2 < nr_vms; area2++) { 3331 unsigned long start2 = offsets[area2]; 3332 unsigned long end2 = start2 + sizes[area2]; 3333 3334 BUG_ON(start2 < end && start < end2); 3335 } 3336 } 3337 last_end = offsets[last_area] + sizes[last_area]; 3338 3339 if (vmalloc_end - vmalloc_start < last_end) { 3340 WARN_ON(true); 3341 return NULL; 3342 } 3343 3344 vms = kcalloc(nr_vms, sizeof(vms[0]), GFP_KERNEL); 3345 vas = kcalloc(nr_vms, sizeof(vas[0]), GFP_KERNEL); 3346 if (!vas || !vms) 3347 goto err_free2; 3348 3349 for (area = 0; area < nr_vms; area++) { 3350 vas[area] = kmem_cache_zalloc(vmap_area_cachep, GFP_KERNEL); 3351 vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL); 3352 if (!vas[area] || !vms[area]) 3353 goto err_free; 3354 } 3355 retry: 3356 spin_lock(&free_vmap_area_lock); 3357 3358 /* start scanning - we scan from the top, begin with the last area */ 3359 area = term_area = last_area; 3360 start = offsets[area]; 3361 end = start + sizes[area]; 3362 3363 va = pvm_find_va_enclose_addr(vmalloc_end); 3364 base = pvm_determine_end_from_reverse(&va, align) - end; 3365 3366 while (true) { 3367 /* 3368 * base might have underflowed, add last_end before 3369 * comparing. 3370 */ 3371 if (base + last_end < vmalloc_start + last_end) 3372 goto overflow; 3373 3374 /* 3375 * Fitting base has not been found. 3376 */ 3377 if (va == NULL) 3378 goto overflow; 3379 3380 /* 3381 * If required width exceeds current VA block, move 3382 * base downwards and then recheck. 3383 */ 3384 if (base + end > va->va_end) { 3385 base = pvm_determine_end_from_reverse(&va, align) - end; 3386 term_area = area; 3387 continue; 3388 } 3389 3390 /* 3391 * If this VA does not fit, move base downwards and recheck. 3392 */ 3393 if (base + start < va->va_start) { 3394 va = node_to_va(rb_prev(&va->rb_node)); 3395 base = pvm_determine_end_from_reverse(&va, align) - end; 3396 term_area = area; 3397 continue; 3398 } 3399 3400 /* 3401 * This area fits, move on to the previous one. If 3402 * the previous one is the terminal one, we're done. 3403 */ 3404 area = (area + nr_vms - 1) % nr_vms; 3405 if (area == term_area) 3406 break; 3407 3408 start = offsets[area]; 3409 end = start + sizes[area]; 3410 va = pvm_find_va_enclose_addr(base + end); 3411 } 3412 3413 /* we've found a fitting base, insert all va's */ 3414 for (area = 0; area < nr_vms; area++) { 3415 int ret; 3416 3417 start = base + offsets[area]; 3418 size = sizes[area]; 3419 3420 va = pvm_find_va_enclose_addr(start); 3421 if (WARN_ON_ONCE(va == NULL)) 3422 /* It is a BUG(), but trigger recovery instead. */ 3423 goto recovery; 3424 3425 type = classify_va_fit_type(va, start, size); 3426 if (WARN_ON_ONCE(type == NOTHING_FIT)) 3427 /* It is a BUG(), but trigger recovery instead. */ 3428 goto recovery; 3429 3430 ret = adjust_va_to_fit_type(va, start, size, type); 3431 if (unlikely(ret)) 3432 goto recovery; 3433 3434 /* Allocated area. 
*/ 3435 va = vas[area]; 3436 va->va_start = start; 3437 va->va_end = start + size; 3438 } 3439 3440 spin_unlock(&free_vmap_area_lock); 3441 3442 /* populate the kasan shadow space */ 3443 for (area = 0; area < nr_vms; area++) { 3444 if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area])) 3445 goto err_free_shadow; 3446 3447 kasan_unpoison_vmalloc((void *)vas[area]->va_start, 3448 sizes[area]); 3449 } 3450 3451 /* insert all vm's */ 3452 spin_lock(&vmap_area_lock); 3453 for (area = 0; area < nr_vms; area++) { 3454 insert_vmap_area(vas[area], &vmap_area_root, &vmap_area_list); 3455 3456 setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC, 3457 pcpu_get_vm_areas); 3458 } 3459 spin_unlock(&vmap_area_lock); 3460 3461 kfree(vas); 3462 return vms; 3463 3464 recovery: 3465 /* 3466 * Remove previously allocated areas. There is no 3467 * need in removing these areas from the busy tree, 3468 * because they are inserted only on the final step 3469 * and when pcpu_get_vm_areas() is success. 3470 */ 3471 while (area--) { 3472 orig_start = vas[area]->va_start; 3473 orig_end = vas[area]->va_end; 3474 va = merge_or_add_vmap_area(vas[area], &free_vmap_area_root, 3475 &free_vmap_area_list); 3476 kasan_release_vmalloc(orig_start, orig_end, 3477 va->va_start, va->va_end); 3478 vas[area] = NULL; 3479 } 3480 3481 overflow: 3482 spin_unlock(&free_vmap_area_lock); 3483 if (!purged) { 3484 purge_vmap_area_lazy(); 3485 purged = true; 3486 3487 /* Before "retry", check if we recover. */ 3488 for (area = 0; area < nr_vms; area++) { 3489 if (vas[area]) 3490 continue; 3491 3492 vas[area] = kmem_cache_zalloc( 3493 vmap_area_cachep, GFP_KERNEL); 3494 if (!vas[area]) 3495 goto err_free; 3496 } 3497 3498 goto retry; 3499 } 3500 3501 err_free: 3502 for (area = 0; area < nr_vms; area++) { 3503 if (vas[area]) 3504 kmem_cache_free(vmap_area_cachep, vas[area]); 3505 3506 kfree(vms[area]); 3507 } 3508 err_free2: 3509 kfree(vas); 3510 kfree(vms); 3511 return NULL; 3512 3513 err_free_shadow: 3514 spin_lock(&free_vmap_area_lock); 3515 /* 3516 * We release all the vmalloc shadows, even the ones for regions that 3517 * hadn't been successfully added. This relies on kasan_release_vmalloc 3518 * being able to tolerate this case. 3519 */ 3520 for (area = 0; area < nr_vms; area++) { 3521 orig_start = vas[area]->va_start; 3522 orig_end = vas[area]->va_end; 3523 va = merge_or_add_vmap_area(vas[area], &free_vmap_area_root, 3524 &free_vmap_area_list); 3525 kasan_release_vmalloc(orig_start, orig_end, 3526 va->va_start, va->va_end); 3527 vas[area] = NULL; 3528 kfree(vms[area]); 3529 } 3530 spin_unlock(&free_vmap_area_lock); 3531 kfree(vas); 3532 kfree(vms); 3533 return NULL; 3534 } 3535 3536 /** 3537 * pcpu_free_vm_areas - free vmalloc areas for percpu allocator 3538 * @vms: vm_struct pointer array returned by pcpu_get_vm_areas() 3539 * @nr_vms: the number of allocated areas 3540 * 3541 * Free vm_structs and the array allocated by pcpu_get_vm_areas(). 
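 *
 * In sketch form, the expected pairing with the allocator side is simply:
 *
 *	vms = pcpu_get_vm_areas(offsets, sizes, nr_vms, align);
 *	...
 *	pcpu_free_vm_areas(vms, nr_vms);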
3542 */ 3543 void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms) 3544 { 3545 int i; 3546 3547 for (i = 0; i < nr_vms; i++) 3548 free_vm_area(vms[i]); 3549 kfree(vms); 3550 } 3551 #endif /* CONFIG_SMP */ 3552 3553 #ifdef CONFIG_PROC_FS 3554 static void *s_start(struct seq_file *m, loff_t *pos) 3555 __acquires(&vmap_purge_lock) 3556 __acquires(&vmap_area_lock) 3557 { 3558 mutex_lock(&vmap_purge_lock); 3559 spin_lock(&vmap_area_lock); 3560 3561 return seq_list_start(&vmap_area_list, *pos); 3562 } 3563 3564 static void *s_next(struct seq_file *m, void *p, loff_t *pos) 3565 { 3566 return seq_list_next(p, &vmap_area_list, pos); 3567 } 3568 3569 static void s_stop(struct seq_file *m, void *p) 3570 __releases(&vmap_purge_lock) 3571 __releases(&vmap_area_lock) 3572 { 3573 mutex_unlock(&vmap_purge_lock); 3574 spin_unlock(&vmap_area_lock); 3575 } 3576 3577 static void show_numa_info(struct seq_file *m, struct vm_struct *v) 3578 { 3579 if (IS_ENABLED(CONFIG_NUMA)) { 3580 unsigned int nr, *counters = m->private; 3581 3582 if (!counters) 3583 return; 3584 3585 if (v->flags & VM_UNINITIALIZED) 3586 return; 3587 /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */ 3588 smp_rmb(); 3589 3590 memset(counters, 0, nr_node_ids * sizeof(unsigned int)); 3591 3592 for (nr = 0; nr < v->nr_pages; nr++) 3593 counters[page_to_nid(v->pages[nr])]++; 3594 3595 for_each_node_state(nr, N_HIGH_MEMORY) 3596 if (counters[nr]) 3597 seq_printf(m, " N%u=%u", nr, counters[nr]); 3598 } 3599 } 3600 3601 static void show_purge_info(struct seq_file *m) 3602 { 3603 struct llist_node *head; 3604 struct vmap_area *va; 3605 3606 head = READ_ONCE(vmap_purge_list.first); 3607 if (head == NULL) 3608 return; 3609 3610 llist_for_each_entry(va, head, purge_list) { 3611 seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n", 3612 (void *)va->va_start, (void *)va->va_end, 3613 va->va_end - va->va_start); 3614 } 3615 } 3616 3617 static int s_show(struct seq_file *m, void *p) 3618 { 3619 struct vmap_area *va; 3620 struct vm_struct *v; 3621 3622 va = list_entry(p, struct vmap_area, list); 3623 3624 /* 3625 * s_show can encounter race with remove_vm_area, !vm on behalf 3626 * of vmap area is being tear down or vm_map_ram allocation. 3627 */ 3628 if (!va->vm) { 3629 seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n", 3630 (void *)va->va_start, (void *)va->va_end, 3631 va->va_end - va->va_start); 3632 3633 return 0; 3634 } 3635 3636 v = va->vm; 3637 3638 seq_printf(m, "0x%pK-0x%pK %7ld", 3639 v->addr, v->addr + v->size, v->size); 3640 3641 if (v->caller) 3642 seq_printf(m, " %pS", v->caller); 3643 3644 if (v->nr_pages) 3645 seq_printf(m, " pages=%d", v->nr_pages); 3646 3647 if (v->phys_addr) 3648 seq_printf(m, " phys=%pa", &v->phys_addr); 3649 3650 if (v->flags & VM_IOREMAP) 3651 seq_puts(m, " ioremap"); 3652 3653 if (v->flags & VM_ALLOC) 3654 seq_puts(m, " vmalloc"); 3655 3656 if (v->flags & VM_MAP) 3657 seq_puts(m, " vmap"); 3658 3659 if (v->flags & VM_USERMAP) 3660 seq_puts(m, " user"); 3661 3662 if (v->flags & VM_DMA_COHERENT) 3663 seq_puts(m, " dma-coherent"); 3664 3665 if (is_vmalloc_addr(v->pages)) 3666 seq_puts(m, " vpages"); 3667 3668 show_numa_info(m, v); 3669 seq_putc(m, '\n'); 3670 3671 /* 3672 * As a final step, dump "unpurged" areas. Note, 3673 * that entire "/proc/vmallocinfo" output will not 3674 * be address sorted, because the purge list is not 3675 * sorted. 
3676 */ 3677 if (list_is_last(&va->list, &vmap_area_list)) 3678 show_purge_info(m); 3679 3680 return 0; 3681 } 3682 3683 static const struct seq_operations vmalloc_op = { 3684 .start = s_start, 3685 .next = s_next, 3686 .stop = s_stop, 3687 .show = s_show, 3688 }; 3689 3690 static int __init proc_vmalloc_init(void) 3691 { 3692 if (IS_ENABLED(CONFIG_NUMA)) 3693 proc_create_seq_private("vmallocinfo", 0400, NULL, 3694 &vmalloc_op, 3695 nr_node_ids * sizeof(unsigned int), NULL); 3696 else 3697 proc_create_seq("vmallocinfo", 0400, NULL, &vmalloc_op); 3698 return 0; 3699 } 3700 module_init(proc_vmalloc_init); 3701 3702 #endif 3703