xref: /openbmc/linux/mm/vmalloc.c (revision 8ebc80a25f9d9bf7a8e368b266d5b740c485c362)
1  // SPDX-License-Identifier: GPL-2.0-only
2  /*
3   *  Copyright (C) 1993  Linus Torvalds
4   *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
5   *  SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
6   *  Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002
7   *  Numa awareness, Christoph Lameter, SGI, June 2005
8   *  Improving global KVA allocator, Uladzislau Rezki, Sony, May 2019
9   */
10  
11  #include <linux/vmalloc.h>
12  #include <linux/mm.h>
13  #include <linux/module.h>
14  #include <linux/highmem.h>
15  #include <linux/sched/signal.h>
16  #include <linux/slab.h>
17  #include <linux/spinlock.h>
18  #include <linux/interrupt.h>
19  #include <linux/proc_fs.h>
20  #include <linux/seq_file.h>
21  #include <linux/set_memory.h>
22  #include <linux/debugobjects.h>
23  #include <linux/kallsyms.h>
24  #include <linux/list.h>
25  #include <linux/notifier.h>
26  #include <linux/rbtree.h>
27  #include <linux/xarray.h>
28  #include <linux/io.h>
29  #include <linux/rcupdate.h>
30  #include <linux/pfn.h>
31  #include <linux/kmemleak.h>
32  #include <linux/atomic.h>
33  #include <linux/compiler.h>
34  #include <linux/memcontrol.h>
35  #include <linux/llist.h>
36  #include <linux/uio.h>
37  #include <linux/bitops.h>
38  #include <linux/rbtree_augmented.h>
39  #include <linux/overflow.h>
40  #include <linux/pgtable.h>
41  #include <linux/hugetlb.h>
42  #include <linux/sched/mm.h>
43  #include <asm/tlbflush.h>
44  #include <asm/shmparam.h>
45  
46  #define CREATE_TRACE_POINTS
47  #include <trace/events/vmalloc.h>
48  
49  #include "internal.h"
50  #include "pgalloc-track.h"
51  
52  #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
53  static unsigned int __ro_after_init ioremap_max_page_shift = BITS_PER_LONG - 1;
54  
55  static int __init set_nohugeiomap(char *str)
56  {
57  	ioremap_max_page_shift = PAGE_SHIFT;
58  	return 0;
59  }
60  early_param("nohugeiomap", set_nohugeiomap);
61  #else /* CONFIG_HAVE_ARCH_HUGE_VMAP */
62  static const unsigned int ioremap_max_page_shift = PAGE_SHIFT;
63  #endif	/* CONFIG_HAVE_ARCH_HUGE_VMAP */
64  
65  #ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
66  static bool __ro_after_init vmap_allow_huge = true;
67  
68  static int __init set_nohugevmalloc(char *str)
69  {
70  	vmap_allow_huge = false;
71  	return 0;
72  }
73  early_param("nohugevmalloc", set_nohugevmalloc);
74  #else /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
75  static const bool vmap_allow_huge = false;
76  #endif	/* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
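/*
 * Note on the two knobs above (illustrative summary, not new behaviour):
 * passing "nohugeiomap" or "nohugevmalloc" on the kernel command line
 * forces ioremap and vmalloc, respectively, back to PAGE_SIZE mappings
 * even when the architecture advertises huge vmap support.
 */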
77  
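/*
 * Note: this is purely a range check against [VMALLOC_START, VMALLOC_END);
 * it does not guarantee that the address is currently backed by a mapping.
 */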
78  bool is_vmalloc_addr(const void *x)
79  {
80  	unsigned long addr = (unsigned long)kasan_reset_tag(x);
81  
82  	return addr >= VMALLOC_START && addr < VMALLOC_END;
83  }
84  EXPORT_SYMBOL(is_vmalloc_addr);
85  
86  struct vfree_deferred {
87  	struct llist_head list;
88  	struct work_struct wq;
89  };
90  static DEFINE_PER_CPU(struct vfree_deferred, vfree_deferred);
91  
92  /*** Page table manipulation functions ***/
93  static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
94  			phys_addr_t phys_addr, pgprot_t prot,
95  			unsigned int max_page_shift, pgtbl_mod_mask *mask)
96  {
97  	pte_t *pte;
98  	u64 pfn;
99  	unsigned long size = PAGE_SIZE;
100  
101  	pfn = phys_addr >> PAGE_SHIFT;
102  	pte = pte_alloc_kernel_track(pmd, addr, mask);
103  	if (!pte)
104  		return -ENOMEM;
105  	do {
106  		BUG_ON(!pte_none(ptep_get(pte)));
107  
108  #ifdef CONFIG_HUGETLB_PAGE
109  		size = arch_vmap_pte_range_map_size(addr, end, pfn, max_page_shift);
110  		if (size != PAGE_SIZE) {
111  			pte_t entry = pfn_pte(pfn, prot);
112  
113  			entry = arch_make_huge_pte(entry, ilog2(size), 0);
114  			set_huge_pte_at(&init_mm, addr, pte, entry, size);
115  			pfn += PFN_DOWN(size);
116  			continue;
117  		}
118  #endif
119  		set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot));
120  		pfn++;
121  	} while (pte += PFN_DOWN(size), addr += size, addr != end);
122  	*mask |= PGTBL_PTE_MODIFIED;
123  	return 0;
124  }
125  
126  static int vmap_try_huge_pmd(pmd_t *pmd, unsigned long addr, unsigned long end,
127  			phys_addr_t phys_addr, pgprot_t prot,
128  			unsigned int max_page_shift)
129  {
130  	if (max_page_shift < PMD_SHIFT)
131  		return 0;
132  
133  	if (!arch_vmap_pmd_supported(prot))
134  		return 0;
135  
136  	if ((end - addr) != PMD_SIZE)
137  		return 0;
138  
139  	if (!IS_ALIGNED(addr, PMD_SIZE))
140  		return 0;
141  
142  	if (!IS_ALIGNED(phys_addr, PMD_SIZE))
143  		return 0;
144  
145  	if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr))
146  		return 0;
147  
148  	return pmd_set_huge(pmd, phys_addr, prot);
149  }
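/*
 * A concrete example of the checks above (assuming x86-64 with 4 KiB base
 * pages): PMD_SIZE is 2 MiB, so a PMD-level huge mapping is only attempted
 * when exactly 2 MiB remain and both addr and phys_addr are 2 MiB aligned.
 * The analogous PUD and P4D helpers below apply the same rules at their
 * respective sizes.
 */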
150  
151  static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
152  			phys_addr_t phys_addr, pgprot_t prot,
153  			unsigned int max_page_shift, pgtbl_mod_mask *mask)
154  {
155  	pmd_t *pmd;
156  	unsigned long next;
157  
158  	pmd = pmd_alloc_track(&init_mm, pud, addr, mask);
159  	if (!pmd)
160  		return -ENOMEM;
161  	do {
162  		next = pmd_addr_end(addr, end);
163  
164  		if (vmap_try_huge_pmd(pmd, addr, next, phys_addr, prot,
165  					max_page_shift)) {
166  			*mask |= PGTBL_PMD_MODIFIED;
167  			continue;
168  		}
169  
170  		if (vmap_pte_range(pmd, addr, next, phys_addr, prot, max_page_shift, mask))
171  			return -ENOMEM;
172  	} while (pmd++, phys_addr += (next - addr), addr = next, addr != end);
173  	return 0;
174  }
175  
176  static int vmap_try_huge_pud(pud_t *pud, unsigned long addr, unsigned long end,
177  			phys_addr_t phys_addr, pgprot_t prot,
178  			unsigned int max_page_shift)
179  {
180  	if (max_page_shift < PUD_SHIFT)
181  		return 0;
182  
183  	if (!arch_vmap_pud_supported(prot))
184  		return 0;
185  
186  	if ((end - addr) != PUD_SIZE)
187  		return 0;
188  
189  	if (!IS_ALIGNED(addr, PUD_SIZE))
190  		return 0;
191  
192  	if (!IS_ALIGNED(phys_addr, PUD_SIZE))
193  		return 0;
194  
195  	if (pud_present(*pud) && !pud_free_pmd_page(pud, addr))
196  		return 0;
197  
198  	return pud_set_huge(pud, phys_addr, prot);
199  }
200  
201  static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
202  			phys_addr_t phys_addr, pgprot_t prot,
203  			unsigned int max_page_shift, pgtbl_mod_mask *mask)
204  {
205  	pud_t *pud;
206  	unsigned long next;
207  
208  	pud = pud_alloc_track(&init_mm, p4d, addr, mask);
209  	if (!pud)
210  		return -ENOMEM;
211  	do {
212  		next = pud_addr_end(addr, end);
213  
214  		if (vmap_try_huge_pud(pud, addr, next, phys_addr, prot,
215  					max_page_shift)) {
216  			*mask |= PGTBL_PUD_MODIFIED;
217  			continue;
218  		}
219  
220  		if (vmap_pmd_range(pud, addr, next, phys_addr, prot,
221  					max_page_shift, mask))
222  			return -ENOMEM;
223  	} while (pud++, phys_addr += (next - addr), addr = next, addr != end);
224  	return 0;
225  }
226  
227  static int vmap_try_huge_p4d(p4d_t *p4d, unsigned long addr, unsigned long end,
228  			phys_addr_t phys_addr, pgprot_t prot,
229  			unsigned int max_page_shift)
230  {
231  	if (max_page_shift < P4D_SHIFT)
232  		return 0;
233  
234  	if (!arch_vmap_p4d_supported(prot))
235  		return 0;
236  
237  	if ((end - addr) != P4D_SIZE)
238  		return 0;
239  
240  	if (!IS_ALIGNED(addr, P4D_SIZE))
241  		return 0;
242  
243  	if (!IS_ALIGNED(phys_addr, P4D_SIZE))
244  		return 0;
245  
246  	if (p4d_present(*p4d) && !p4d_free_pud_page(p4d, addr))
247  		return 0;
248  
249  	return p4d_set_huge(p4d, phys_addr, prot);
250  }
251  
252  static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
253  			phys_addr_t phys_addr, pgprot_t prot,
254  			unsigned int max_page_shift, pgtbl_mod_mask *mask)
255  {
256  	p4d_t *p4d;
257  	unsigned long next;
258  
259  	p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);
260  	if (!p4d)
261  		return -ENOMEM;
262  	do {
263  		next = p4d_addr_end(addr, end);
264  
265  		if (vmap_try_huge_p4d(p4d, addr, next, phys_addr, prot,
266  					max_page_shift)) {
267  			*mask |= PGTBL_P4D_MODIFIED;
268  			continue;
269  		}
270  
271  		if (vmap_pud_range(p4d, addr, next, phys_addr, prot,
272  					max_page_shift, mask))
273  			return -ENOMEM;
274  	} while (p4d++, phys_addr += (next - addr), addr = next, addr != end);
275  	return 0;
276  }
277  
278  static int vmap_range_noflush(unsigned long addr, unsigned long end,
279  			phys_addr_t phys_addr, pgprot_t prot,
280  			unsigned int max_page_shift)
281  {
282  	pgd_t *pgd;
283  	unsigned long start;
284  	unsigned long next;
285  	int err;
286  	pgtbl_mod_mask mask = 0;
287  
288  	might_sleep();
289  	BUG_ON(addr >= end);
290  
291  	start = addr;
292  	pgd = pgd_offset_k(addr);
293  	do {
294  		next = pgd_addr_end(addr, end);
295  		err = vmap_p4d_range(pgd, addr, next, phys_addr, prot,
296  					max_page_shift, &mask);
297  		if (err)
298  			break;
299  	} while (pgd++, phys_addr += (next - addr), addr = next, addr != end);
300  
301  	if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
302  		arch_sync_kernel_mappings(start, end);
303  
304  	return err;
305  }
306  
307  int ioremap_page_range(unsigned long addr, unsigned long end,
308  		phys_addr_t phys_addr, pgprot_t prot)
309  {
310  	int err;
311  
312  	err = vmap_range_noflush(addr, end, phys_addr, pgprot_nx(prot),
313  				 ioremap_max_page_shift);
314  	flush_cache_vmap(addr, end);
315  	if (!err)
316  		err = kmsan_ioremap_page_range(addr, end, phys_addr, prot,
317  					       ioremap_max_page_shift);
318  	return err;
319  }
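/*
 * A typical caller is an architecture's ioremap() implementation (e.g. the
 * generic one in mm/ioremap.c): it reserves a chunk of vmalloc space and
 * then uses ioremap_page_range() to map the MMIO physical range into it,
 * with huge mappings used whenever ioremap_max_page_shift and the alignment
 * of the range allow.
 */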
320  
321  static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
322  			     pgtbl_mod_mask *mask)
323  {
324  	pte_t *pte;
325  
326  	pte = pte_offset_kernel(pmd, addr);
327  	do {
328  		pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte);
329  		WARN_ON(!pte_none(ptent) && !pte_present(ptent));
330  	} while (pte++, addr += PAGE_SIZE, addr != end);
331  	*mask |= PGTBL_PTE_MODIFIED;
332  }
333  
334  static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
335  			     pgtbl_mod_mask *mask)
336  {
337  	pmd_t *pmd;
338  	unsigned long next;
339  	int cleared;
340  
341  	pmd = pmd_offset(pud, addr);
342  	do {
343  		next = pmd_addr_end(addr, end);
344  
345  		cleared = pmd_clear_huge(pmd);
346  		if (cleared || pmd_bad(*pmd))
347  			*mask |= PGTBL_PMD_MODIFIED;
348  
349  		if (cleared)
350  			continue;
351  		if (pmd_none_or_clear_bad(pmd))
352  			continue;
353  		vunmap_pte_range(pmd, addr, next, mask);
354  
355  		cond_resched();
356  	} while (pmd++, addr = next, addr != end);
357  }
358  
359  static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
360  			     pgtbl_mod_mask *mask)
361  {
362  	pud_t *pud;
363  	unsigned long next;
364  	int cleared;
365  
366  	pud = pud_offset(p4d, addr);
367  	do {
368  		next = pud_addr_end(addr, end);
369  
370  		cleared = pud_clear_huge(pud);
371  		if (cleared || pud_bad(*pud))
372  			*mask |= PGTBL_PUD_MODIFIED;
373  
374  		if (cleared)
375  			continue;
376  		if (pud_none_or_clear_bad(pud))
377  			continue;
378  		vunmap_pmd_range(pud, addr, next, mask);
379  	} while (pud++, addr = next, addr != end);
380  }
381  
382  static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
383  			     pgtbl_mod_mask *mask)
384  {
385  	p4d_t *p4d;
386  	unsigned long next;
387  
388  	p4d = p4d_offset(pgd, addr);
389  	do {
390  		next = p4d_addr_end(addr, end);
391  
392  		p4d_clear_huge(p4d);
393  		if (p4d_bad(*p4d))
394  			*mask |= PGTBL_P4D_MODIFIED;
395  
396  		if (p4d_none_or_clear_bad(p4d))
397  			continue;
398  		vunmap_pud_range(p4d, addr, next, mask);
399  	} while (p4d++, addr = next, addr != end);
400  }
401  
402  /*
403   * vunmap_range_noflush is similar to vunmap_range, but does not
404   * flush caches or TLBs.
405   *
406   * The caller is responsible for calling flush_cache_vunmap() before calling
407   * this function, and flush_tlb_kernel_range after it has returned
408   * successfully (and before the addresses are expected to cause a page fault
409   * or be re-mapped for something else, if TLB flushes are being delayed or
410   * coalesced).
411   *
412   * This is an internal function only. Do not use outside mm/.
413   */
414  void __vunmap_range_noflush(unsigned long start, unsigned long end)
415  {
416  	unsigned long next;
417  	pgd_t *pgd;
418  	unsigned long addr = start;
419  	pgtbl_mod_mask mask = 0;
420  
421  	BUG_ON(addr >= end);
422  	pgd = pgd_offset_k(addr);
423  	do {
424  		next = pgd_addr_end(addr, end);
425  		if (pgd_bad(*pgd))
426  			mask |= PGTBL_PGD_MODIFIED;
427  		if (pgd_none_or_clear_bad(pgd))
428  			continue;
429  		vunmap_p4d_range(pgd, addr, next, &mask);
430  	} while (pgd++, addr = next, addr != end);
431  
432  	if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
433  		arch_sync_kernel_mappings(start, end);
434  }
435  
436  void vunmap_range_noflush(unsigned long start, unsigned long end)
437  {
438  	kmsan_vunmap_range_noflush(start, end);
439  	__vunmap_range_noflush(start, end);
440  }
441  
442  /**
443   * vunmap_range - unmap kernel virtual addresses
444   * @addr: start of the VM area to unmap
445   * @end: end of the VM area to unmap (non-inclusive)
446   *
447   * Clears any present PTEs in the virtual address range, flushes TLBs and
448   * caches. Any subsequent access to the address before it has been re-mapped
449   * is a kernel bug.
450   */
451  void vunmap_range(unsigned long addr, unsigned long end)
452  {
453  	flush_cache_vunmap(addr, end);
454  	vunmap_range_noflush(addr, end);
455  	flush_tlb_kernel_range(addr, end);
456  }
457  
458  static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
459  		unsigned long end, pgprot_t prot, struct page **pages, int *nr,
460  		pgtbl_mod_mask *mask)
461  {
462  	pte_t *pte;
463  
464  	/*
465  	 * nr is a running index into the array which helps higher level
466  	 * callers keep track of where we're up to.
467  	 */
468  
469  	pte = pte_alloc_kernel_track(pmd, addr, mask);
470  	if (!pte)
471  		return -ENOMEM;
472  	do {
473  		struct page *page = pages[*nr];
474  
475  		if (WARN_ON(!pte_none(ptep_get(pte))))
476  			return -EBUSY;
477  		if (WARN_ON(!page))
478  			return -ENOMEM;
479  		if (WARN_ON(!pfn_valid(page_to_pfn(page))))
480  			return -EINVAL;
481  
482  		set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
483  		(*nr)++;
484  	} while (pte++, addr += PAGE_SIZE, addr != end);
485  	*mask |= PGTBL_PTE_MODIFIED;
486  	return 0;
487  }
488  
489  static int vmap_pages_pmd_range(pud_t *pud, unsigned long addr,
490  		unsigned long end, pgprot_t prot, struct page **pages, int *nr,
491  		pgtbl_mod_mask *mask)
492  {
493  	pmd_t *pmd;
494  	unsigned long next;
495  
496  	pmd = pmd_alloc_track(&init_mm, pud, addr, mask);
497  	if (!pmd)
498  		return -ENOMEM;
499  	do {
500  		next = pmd_addr_end(addr, end);
501  		if (vmap_pages_pte_range(pmd, addr, next, prot, pages, nr, mask))
502  			return -ENOMEM;
503  	} while (pmd++, addr = next, addr != end);
504  	return 0;
505  }
506  
507  static int vmap_pages_pud_range(p4d_t *p4d, unsigned long addr,
508  		unsigned long end, pgprot_t prot, struct page **pages, int *nr,
509  		pgtbl_mod_mask *mask)
510  {
511  	pud_t *pud;
512  	unsigned long next;
513  
514  	pud = pud_alloc_track(&init_mm, p4d, addr, mask);
515  	if (!pud)
516  		return -ENOMEM;
517  	do {
518  		next = pud_addr_end(addr, end);
519  		if (vmap_pages_pmd_range(pud, addr, next, prot, pages, nr, mask))
520  			return -ENOMEM;
521  	} while (pud++, addr = next, addr != end);
522  	return 0;
523  }
524  
525  static int vmap_pages_p4d_range(pgd_t *pgd, unsigned long addr,
526  		unsigned long end, pgprot_t prot, struct page **pages, int *nr,
527  		pgtbl_mod_mask *mask)
528  {
529  	p4d_t *p4d;
530  	unsigned long next;
531  
532  	p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);
533  	if (!p4d)
534  		return -ENOMEM;
535  	do {
536  		next = p4d_addr_end(addr, end);
537  		if (vmap_pages_pud_range(p4d, addr, next, prot, pages, nr, mask))
538  			return -ENOMEM;
539  	} while (p4d++, addr = next, addr != end);
540  	return 0;
541  }
542  
543  static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end,
544  		pgprot_t prot, struct page **pages)
545  {
546  	unsigned long start = addr;
547  	pgd_t *pgd;
548  	unsigned long next;
549  	int err = 0;
550  	int nr = 0;
551  	pgtbl_mod_mask mask = 0;
552  
553  	BUG_ON(addr >= end);
554  	pgd = pgd_offset_k(addr);
555  	do {
556  		next = pgd_addr_end(addr, end);
557  		if (pgd_bad(*pgd))
558  			mask |= PGTBL_PGD_MODIFIED;
559  		err = vmap_pages_p4d_range(pgd, addr, next, prot, pages, &nr, &mask);
560  		if (err)
561  			break;
562  	} while (pgd++, addr = next, addr != end);
563  
564  	if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
565  		arch_sync_kernel_mappings(start, end);
566  
567  	return err;
568  }
569  
570  /*
571   * vmap_pages_range_noflush is similar to vmap_pages_range, but does not
572   * flush caches.
573   *
574   * The caller is responsible for calling flush_cache_vmap() after this
575   * function returns successfully and before the addresses are accessed.
576   *
577   * This is an internal function only. Do not use outside mm/.
578   */
579  int __vmap_pages_range_noflush(unsigned long addr, unsigned long end,
580  		pgprot_t prot, struct page **pages, unsigned int page_shift)
581  {
582  	unsigned int i, nr = (end - addr) >> PAGE_SHIFT;
583  
584  	WARN_ON(page_shift < PAGE_SHIFT);
585  
586  	if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMALLOC) ||
587  			page_shift == PAGE_SHIFT)
588  		return vmap_small_pages_range_noflush(addr, end, prot, pages);
589  
590  	for (i = 0; i < nr; i += 1U << (page_shift - PAGE_SHIFT)) {
591  		int err;
592  
593  		err = vmap_range_noflush(addr, addr + (1UL << page_shift),
594  					page_to_phys(pages[i]), prot,
595  					page_shift);
596  		if (err)
597  			return err;
598  
599  		addr += 1UL << page_shift;
600  	}
601  
602  	return 0;
603  }
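/*
 * Worked example for the huge-page path above: with page_shift == PMD_SHIFT
 * (2 MiB mappings on x86-64 with 4 KiB base pages), the loop steps through
 * the pages array 512 entries at a time and maps each physically contiguous
 * 512-page group with a single call to vmap_range_noflush().
 */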
604  
605  int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
606  		pgprot_t prot, struct page **pages, unsigned int page_shift)
607  {
608  	int ret = kmsan_vmap_pages_range_noflush(addr, end, prot, pages,
609  						 page_shift);
610  
611  	if (ret)
612  		return ret;
613  	return __vmap_pages_range_noflush(addr, end, prot, pages, page_shift);
614  }
615  
616  /**
617   * vmap_pages_range - map pages to a kernel virtual address
618   * @addr: start of the VM area to map
619   * @end: end of the VM area to map (non-inclusive)
620   * @prot: page protection flags to use
621   * @pages: pages to map (always PAGE_SIZE pages)
622   * @page_shift: maximum shift that the pages may be mapped with, @pages must
623   * be aligned and contiguous up to at least this shift.
624   *
625   * RETURNS:
626   * 0 on success, -errno on failure.
627   */
628  static int vmap_pages_range(unsigned long addr, unsigned long end,
629  		pgprot_t prot, struct page **pages, unsigned int page_shift)
630  {
631  	int err;
632  
633  	err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift);
634  	flush_cache_vmap(addr, end);
635  	return err;
636  }
637  
638  int is_vmalloc_or_module_addr(const void *x)
639  {
640  	/*
641  	 * ARM, x86-64 and sparc64 put modules in a special place,
642  	 * and fall back on vmalloc() if that fails. Others
643  	 * just put it in the vmalloc space.
644  	 */
645  #if defined(CONFIG_MODULES) && defined(MODULES_VADDR)
646  	unsigned long addr = (unsigned long)kasan_reset_tag(x);
647  	if (addr >= MODULES_VADDR && addr < MODULES_END)
648  		return 1;
649  #endif
650  	return is_vmalloc_addr(x);
651  }
652  EXPORT_SYMBOL_GPL(is_vmalloc_or_module_addr);
653  
654  /*
655   * Walk a vmap address to the struct page it maps. Huge vmap mappings will
656   * return the tail page that corresponds to the base page address, which
657   * matches small vmap mappings.
658   */
659  struct page *vmalloc_to_page(const void *vmalloc_addr)
660  {
661  	unsigned long addr = (unsigned long) vmalloc_addr;
662  	struct page *page = NULL;
663  	pgd_t *pgd = pgd_offset_k(addr);
664  	p4d_t *p4d;
665  	pud_t *pud;
666  	pmd_t *pmd;
667  	pte_t *ptep, pte;
668  
669  	/*
670  	 * XXX we might need to change this if we add VIRTUAL_BUG_ON for
671  	 * architectures that do not vmalloc module space
672  	 */
673  	VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr));
674  
675  	if (pgd_none(*pgd))
676  		return NULL;
677  	if (WARN_ON_ONCE(pgd_leaf(*pgd)))
678  		return NULL; /* XXX: no allowance for huge pgd */
679  	if (WARN_ON_ONCE(pgd_bad(*pgd)))
680  		return NULL;
681  
682  	p4d = p4d_offset(pgd, addr);
683  	if (p4d_none(*p4d))
684  		return NULL;
685  	if (p4d_leaf(*p4d))
686  		return p4d_page(*p4d) + ((addr & ~P4D_MASK) >> PAGE_SHIFT);
687  	if (WARN_ON_ONCE(p4d_bad(*p4d)))
688  		return NULL;
689  
690  	pud = pud_offset(p4d, addr);
691  	if (pud_none(*pud))
692  		return NULL;
693  	if (pud_leaf(*pud))
694  		return pud_page(*pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
695  	if (WARN_ON_ONCE(pud_bad(*pud)))
696  		return NULL;
697  
698  	pmd = pmd_offset(pud, addr);
699  	if (pmd_none(*pmd))
700  		return NULL;
701  	if (pmd_leaf(*pmd))
702  		return pmd_page(*pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
703  	if (WARN_ON_ONCE(pmd_bad(*pmd)))
704  		return NULL;
705  
706  	ptep = pte_offset_kernel(pmd, addr);
707  	pte = ptep_get(ptep);
708  	if (pte_present(pte))
709  		page = pte_page(pte);
710  
711  	return page;
712  }
713  EXPORT_SYMBOL(vmalloc_to_page);
714  
715  /*
716   * Map a vmalloc()-space virtual address to the physical page frame number.
717   */
718  unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
719  {
720  	return page_to_pfn(vmalloc_to_page(vmalloc_addr));
721  }
722  EXPORT_SYMBOL(vmalloc_to_pfn);
723  
724  
725  /*** Global kva allocator ***/
726  
727  #define DEBUG_AUGMENT_PROPAGATE_CHECK 0
728  #define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0
729  
730  
731  static DEFINE_SPINLOCK(vmap_area_lock);
732  static DEFINE_SPINLOCK(free_vmap_area_lock);
733  /* Export for kexec only */
734  LIST_HEAD(vmap_area_list);
735  static struct rb_root vmap_area_root = RB_ROOT;
736  static bool vmap_initialized __read_mostly;
737  
738  static struct rb_root purge_vmap_area_root = RB_ROOT;
739  static LIST_HEAD(purge_vmap_area_list);
740  static DEFINE_SPINLOCK(purge_vmap_area_lock);
741  
742  /*
743   * This kmem_cache is used for vmap_area objects. Instead of
744   * allocating from slab we reuse an object from this cache to
745   * make things faster, especially in the "no edge" splitting of
746   * a free block.
747   */
748  static struct kmem_cache *vmap_area_cachep;
749  
750  /*
751   * This linked list is used together with free_vmap_area_root.
752   * It gives O(1) access to prev/next to perform fast coalescing.
753   */
754  static LIST_HEAD(free_vmap_area_list);
755  
756  /*
757   * This augmented red-black tree represents the free vmap space.
758   * All vmap_area objects in this tree are sorted by va->va_start
759   * address. It is used for allocation and merging when a vmap
760   * object is released.
761   *
762   * Each vmap_area node contains the maximum available free block
763   * size of its sub-tree, right or left. Therefore it is possible
764   * to find the lowest match of a free area.
765   */
766  static struct rb_root free_vmap_area_root = RB_ROOT;
767  
768  /*
769   * Preload a CPU with one object for the "no edge" split case. The
770   * aim is to get rid of allocations from atomic context and thus
771   * to use more permissive allocation masks.
772   */
773  static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node);
774  
775  static __always_inline unsigned long
776  va_size(struct vmap_area *va)
777  {
778  	return (va->va_end - va->va_start);
779  }
780  
781  static __always_inline unsigned long
782  get_subtree_max_size(struct rb_node *node)
783  {
784  	struct vmap_area *va;
785  
786  	va = rb_entry_safe(node, struct vmap_area, rb_node);
787  	return va ? va->subtree_max_size : 0;
788  }
789  
790  RB_DECLARE_CALLBACKS_MAX(static, free_vmap_area_rb_augment_cb,
791  	struct vmap_area, rb_node, unsigned long, subtree_max_size, va_size)
792  
793  static void reclaim_and_purge_vmap_areas(void);
794  static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
795  static void drain_vmap_area_work(struct work_struct *work);
796  static DECLARE_WORK(drain_vmap_work, drain_vmap_area_work);
797  
798  static atomic_long_t nr_vmalloc_pages;
799  
800  unsigned long vmalloc_nr_pages(void)
801  {
802  	return atomic_long_read(&nr_vmalloc_pages);
803  }
804  
805  /* Look up the first VA which satisfies addr < va_end, NULL if none. */
806  static struct vmap_area *find_vmap_area_exceed_addr(unsigned long addr)
807  {
808  	struct vmap_area *va = NULL;
809  	struct rb_node *n = vmap_area_root.rb_node;
810  
811  	addr = (unsigned long)kasan_reset_tag((void *)addr);
812  
813  	while (n) {
814  		struct vmap_area *tmp;
815  
816  		tmp = rb_entry(n, struct vmap_area, rb_node);
817  		if (tmp->va_end > addr) {
818  			va = tmp;
819  			if (tmp->va_start <= addr)
820  				break;
821  
822  			n = n->rb_left;
823  		} else
824  			n = n->rb_right;
825  	}
826  
827  	return va;
828  }
829  
830  static struct vmap_area *__find_vmap_area(unsigned long addr, struct rb_root *root)
831  {
832  	struct rb_node *n = root->rb_node;
833  
834  	addr = (unsigned long)kasan_reset_tag((void *)addr);
835  
836  	while (n) {
837  		struct vmap_area *va;
838  
839  		va = rb_entry(n, struct vmap_area, rb_node);
840  		if (addr < va->va_start)
841  			n = n->rb_left;
842  		else if (addr >= va->va_end)
843  			n = n->rb_right;
844  		else
845  			return va;
846  	}
847  
848  	return NULL;
849  }
850  
851  /*
852   * This function returns the addresses of the parent node
853   * and its left or right link for further processing.
854   *
855   * Otherwise NULL is returned. In that case all further
856   * steps of inserting the conflicting, overlapping range
857   * have to be declined and this is considered a bug.
858   */
859  static __always_inline struct rb_node **
860  find_va_links(struct vmap_area *va,
861  	struct rb_root *root, struct rb_node *from,
862  	struct rb_node **parent)
863  {
864  	struct vmap_area *tmp_va;
865  	struct rb_node **link;
866  
867  	if (root) {
868  		link = &root->rb_node;
869  		if (unlikely(!*link)) {
870  			*parent = NULL;
871  			return link;
872  		}
873  	} else {
874  		link = &from;
875  	}
876  
877  	/*
878  	 * Go to the bottom of the tree. When we hit the last point
879  	 * we end up with the parent rb_node and the correct direction,
880  	 * named "link" here, to which the new va->rb_node will be attached.
881  	 */
882  	do {
883  		tmp_va = rb_entry(*link, struct vmap_area, rb_node);
884  
885  		/*
886  		 * During the traversal we also do some sanity checking.
887  		 * Trigger a warning if there are partial (left/right)
888  		 * or full overlaps.
889  		 */
890  		if (va->va_end <= tmp_va->va_start)
891  			link = &(*link)->rb_left;
892  		else if (va->va_start >= tmp_va->va_end)
893  			link = &(*link)->rb_right;
894  		else {
895  			WARN(1, "vmalloc bug: 0x%lx-0x%lx overlaps with 0x%lx-0x%lx\n",
896  				va->va_start, va->va_end, tmp_va->va_start, tmp_va->va_end);
897  
898  			return NULL;
899  		}
900  	} while (*link);
901  
902  	*parent = &tmp_va->rb_node;
903  	return link;
904  }
905  
906  static __always_inline struct list_head *
907  get_va_next_sibling(struct rb_node *parent, struct rb_node **link)
908  {
909  	struct list_head *list;
910  
911  	if (unlikely(!parent))
912  		/*
913  		 * The red-black tree where we try to find VA neighbors
914  		 * before merging or inserting is empty, i.e. it means
915  		 * there is no free vmap space. Normally it does not
916  		 * happen but we handle this case anyway.
917  		 */
918  		return NULL;
919  
920  	list = &rb_entry(parent, struct vmap_area, rb_node)->list;
921  	return (&parent->rb_right == link ? list->next : list);
922  }
923  
924  static __always_inline void
925  __link_va(struct vmap_area *va, struct rb_root *root,
926  	struct rb_node *parent, struct rb_node **link,
927  	struct list_head *head, bool augment)
928  {
929  	/*
930  	 * VA is still not in the list, but we can
931  	 * identify its future previous list_head node.
932  	 */
933  	if (likely(parent)) {
934  		head = &rb_entry(parent, struct vmap_area, rb_node)->list;
935  		if (&parent->rb_right != link)
936  			head = head->prev;
937  	}
938  
939  	/* Insert to the rb-tree */
940  	rb_link_node(&va->rb_node, parent, link);
941  	if (augment) {
942  		/*
943  		 * Some explanation here. Just perform simple insertion
944  		 * to the tree. We do not set va->subtree_max_size to
945  		 * its current size before calling rb_insert_augmented().
946  		 * It is because we populate the tree from the bottom
947  		 * to parent levels when the node _is_ in the tree.
948  		 *
949  		 * Therefore we set subtree_max_size to zero after insertion,
950  		 * to let __augment_tree_propagate_from() put everything into
951  		 * the correct order later on.
952  		 */
953  		rb_insert_augmented(&va->rb_node,
954  			root, &free_vmap_area_rb_augment_cb);
955  		va->subtree_max_size = 0;
956  	} else {
957  		rb_insert_color(&va->rb_node, root);
958  	}
959  
960  	/* Address-sort this list */
961  	list_add(&va->list, head);
962  }
963  
964  static __always_inline void
965  link_va(struct vmap_area *va, struct rb_root *root,
966  	struct rb_node *parent, struct rb_node **link,
967  	struct list_head *head)
968  {
969  	__link_va(va, root, parent, link, head, false);
970  }
971  
972  static __always_inline void
973  link_va_augment(struct vmap_area *va, struct rb_root *root,
974  	struct rb_node *parent, struct rb_node **link,
975  	struct list_head *head)
976  {
977  	__link_va(va, root, parent, link, head, true);
978  }
979  
980  static __always_inline void
981  __unlink_va(struct vmap_area *va, struct rb_root *root, bool augment)
982  {
983  	if (WARN_ON(RB_EMPTY_NODE(&va->rb_node)))
984  		return;
985  
986  	if (augment)
987  		rb_erase_augmented(&va->rb_node,
988  			root, &free_vmap_area_rb_augment_cb);
989  	else
990  		rb_erase(&va->rb_node, root);
991  
992  	list_del_init(&va->list);
993  	RB_CLEAR_NODE(&va->rb_node);
994  }
995  
996  static __always_inline void
997  unlink_va(struct vmap_area *va, struct rb_root *root)
998  {
999  	__unlink_va(va, root, false);
1000  }
1001  
1002  static __always_inline void
1003  unlink_va_augment(struct vmap_area *va, struct rb_root *root)
1004  {
1005  	__unlink_va(va, root, true);
1006  }
1007  
1008  #if DEBUG_AUGMENT_PROPAGATE_CHECK
1009  /*
1010   * Gets called when removing a node and rotating.
1011   */
1012  static __always_inline unsigned long
1013  compute_subtree_max_size(struct vmap_area *va)
1014  {
1015  	return max3(va_size(va),
1016  		get_subtree_max_size(va->rb_node.rb_left),
1017  		get_subtree_max_size(va->rb_node.rb_right));
1018  }
1019  
1020  static void
1021  augment_tree_propagate_check(void)
1022  {
1023  	struct vmap_area *va;
1024  	unsigned long computed_size;
1025  
1026  	list_for_each_entry(va, &free_vmap_area_list, list) {
1027  		computed_size = compute_subtree_max_size(va);
1028  		if (computed_size != va->subtree_max_size)
1029  			pr_emerg("tree is corrupted: %lu, %lu\n",
1030  				va_size(va), va->subtree_max_size);
1031  	}
1032  }
1033  #endif
1034  
1035  /*
1036   * This function populates subtree_max_size from the bottom to the
1037   * upper levels starting from the VA point. The propagation must be
1038   * done when the VA size is modified by changing its va_start/va_end,
1039   * or in case a new VA is inserted into the tree.
1040   *
1041   * It means that __augment_tree_propagate_from() must be called:
1042   * - After VA has been inserted to the tree (free path);
1043   * - After VA has been shrunk (allocation path);
1044   * - After VA has been increased (merging path).
1045   *
1046   * Please note that, it does not mean that upper parent nodes
1047   * and their subtree_max_size are recalculated all the time up
1048   * to the root node.
1049   *
1050   *       4--8
1051   *        /\
1052   *       /  \
1053   *      /    \
1054   *    2--2  8--8
1055   *
1056   * For example if we modify the node 4, shrinking it to 2, then
1057   * no modification is required. If we shrink the node 2 to 1
1058   * its subtree_max_size is updated only, and set to 1. If we shrink
1059   * the node 8 to 6, then its subtree_max_size is set to 6 and parent
1060   * node becomes 4--6.
1061   */
1062  static __always_inline void
1063  augment_tree_propagate_from(struct vmap_area *va)
1064  {
1065  	/*
1066  	 * Populate the tree from bottom towards the root until
1067  	 * the calculated maximum available size of checked node
1068  	 * is equal to its current one.
1069  	 */
1070  	free_vmap_area_rb_augment_cb_propagate(&va->rb_node, NULL);
1071  
1072  #if DEBUG_AUGMENT_PROPAGATE_CHECK
1073  	augment_tree_propagate_check();
1074  #endif
1075  }
1076  
1077  static void
1078  insert_vmap_area(struct vmap_area *va,
1079  	struct rb_root *root, struct list_head *head)
1080  {
1081  	struct rb_node **link;
1082  	struct rb_node *parent;
1083  
1084  	link = find_va_links(va, root, NULL, &parent);
1085  	if (link)
1086  		link_va(va, root, parent, link, head);
1087  }
1088  
1089  static void
1090  insert_vmap_area_augment(struct vmap_area *va,
1091  	struct rb_node *from, struct rb_root *root,
1092  	struct list_head *head)
1093  {
1094  	struct rb_node **link;
1095  	struct rb_node *parent;
1096  
1097  	if (from)
1098  		link = find_va_links(va, NULL, from, &parent);
1099  	else
1100  		link = find_va_links(va, root, NULL, &parent);
1101  
1102  	if (link) {
1103  		link_va_augment(va, root, parent, link, head);
1104  		augment_tree_propagate_from(va);
1105  	}
1106  }
1107  
1108  /*
1109   * Merge a de-allocated chunk of VA memory with the previous
1110   * and next free blocks. If no coalescing is done, a new
1111   * free area is inserted. If the VA has been merged, it is
1112   * freed.
1113   *
1114   * Please note, it can return NULL in case of overlapping
1115   * ranges, followed by a WARN() report. Although this is
1116   * buggy behaviour, the system can stay alive and keep
1117   * going.
1118   */
1119  static __always_inline struct vmap_area *
1120  __merge_or_add_vmap_area(struct vmap_area *va,
1121  	struct rb_root *root, struct list_head *head, bool augment)
1122  {
1123  	struct vmap_area *sibling;
1124  	struct list_head *next;
1125  	struct rb_node **link;
1126  	struct rb_node *parent;
1127  	bool merged = false;
1128  
1129  	/*
1130  	 * Find a place in the tree where VA potentially will be
1131  	 * inserted, unless it is merged with its sibling/siblings.
1132  	 */
1133  	link = find_va_links(va, root, NULL, &parent);
1134  	if (!link)
1135  		return NULL;
1136  
1137  	/*
1138  	 * Get next node of VA to check if merging can be done.
1139  	 */
1140  	next = get_va_next_sibling(parent, link);
1141  	if (unlikely(next == NULL))
1142  		goto insert;
1143  
1144  	/*
1145  	 * start            end
1146  	 * |                |
1147  	 * |<------VA------>|<-----Next----->|
1148  	 *                  |                |
1149  	 *                  start            end
1150  	 */
1151  	if (next != head) {
1152  		sibling = list_entry(next, struct vmap_area, list);
1153  		if (sibling->va_start == va->va_end) {
1154  			sibling->va_start = va->va_start;
1155  
1156  			/* Free vmap_area object. */
1157  			kmem_cache_free(vmap_area_cachep, va);
1158  
1159  			/* Point to the new merged area. */
1160  			va = sibling;
1161  			merged = true;
1162  		}
1163  	}
1164  
1165  	/*
1166  	 * start            end
1167  	 * |                |
1168  	 * |<-----Prev----->|<------VA------>|
1169  	 *                  |                |
1170  	 *                  start            end
1171  	 */
1172  	if (next->prev != head) {
1173  		sibling = list_entry(next->prev, struct vmap_area, list);
1174  		if (sibling->va_end == va->va_start) {
1175  			/*
1176  			 * If both neighbors are coalesced, it is important
1177  			 * to unlink the "next" node first, followed by merging
1178  			 * with "previous" one. Otherwise the tree might not be
1179  			 * fully populated if a sibling's augmented value is
1180  			 * "normalized" because of rotation operations.
1181  			 */
1182  			if (merged)
1183  				__unlink_va(va, root, augment);
1184  
1185  			sibling->va_end = va->va_end;
1186  
1187  			/* Free vmap_area object. */
1188  			kmem_cache_free(vmap_area_cachep, va);
1189  
1190  			/* Point to the new merged area. */
1191  			va = sibling;
1192  			merged = true;
1193  		}
1194  	}
1195  
1196  insert:
1197  	if (!merged)
1198  		__link_va(va, root, parent, link, head, augment);
1199  
1200  	return va;
1201  }
1202  
1203  static __always_inline struct vmap_area *
1204  merge_or_add_vmap_area(struct vmap_area *va,
1205  	struct rb_root *root, struct list_head *head)
1206  {
1207  	return __merge_or_add_vmap_area(va, root, head, false);
1208  }
1209  
1210  static __always_inline struct vmap_area *
1211  merge_or_add_vmap_area_augment(struct vmap_area *va,
1212  	struct rb_root *root, struct list_head *head)
1213  {
1214  	va = __merge_or_add_vmap_area(va, root, head, true);
1215  	if (va)
1216  		augment_tree_propagate_from(va);
1217  
1218  	return va;
1219  }
1220  
1221  static __always_inline bool
1222  is_within_this_va(struct vmap_area *va, unsigned long size,
1223  	unsigned long align, unsigned long vstart)
1224  {
1225  	unsigned long nva_start_addr;
1226  
1227  	if (va->va_start > vstart)
1228  		nva_start_addr = ALIGN(va->va_start, align);
1229  	else
1230  		nva_start_addr = ALIGN(vstart, align);
1231  
1232  	/* Can be overflowed due to big size or alignment. */
1233  	if (nva_start_addr + size < nva_start_addr ||
1234  			nva_start_addr < vstart)
1235  		return false;
1236  
1237  	return (nva_start_addr + size <= va->va_end);
1238  }
1239  
1240  /*
1241   * Find the first free block (lowest start address) in the tree
1242   * that satisfies the request described by the passed
1243   * parameters. Please note, with an alignment bigger than PAGE_SIZE,
1244   * the search length is adjusted to account for the worst case
1245   * alignment overhead.
1246   */
1247  static __always_inline struct vmap_area *
1248  find_vmap_lowest_match(struct rb_root *root, unsigned long size,
1249  	unsigned long align, unsigned long vstart, bool adjust_search_size)
1250  {
1251  	struct vmap_area *va;
1252  	struct rb_node *node;
1253  	unsigned long length;
1254  
1255  	/* Start from the root. */
1256  	node = root->rb_node;
1257  
1258  	/* Adjust the search size for alignment overhead. */
1259  	length = adjust_search_size ? size + align - 1 : size;
1260  
1261  	while (node) {
1262  		va = rb_entry(node, struct vmap_area, rb_node);
1263  
1264  		if (get_subtree_max_size(node->rb_left) >= length &&
1265  				vstart < va->va_start) {
1266  			node = node->rb_left;
1267  		} else {
1268  			if (is_within_this_va(va, size, align, vstart))
1269  				return va;
1270  
1271  			/*
1272  			 * Does not make sense to go deeper towards the right
1273  			 * sub-tree if it does not have a free block that is
1274  			 * equal or bigger to the requested search length.
1275  			 */
1276  			if (get_subtree_max_size(node->rb_right) >= length) {
1277  				node = node->rb_right;
1278  				continue;
1279  			}
1280  
1281  			/*
1282  			 * OK. We roll back and find the first right sub-tree,
1283  			 * that will satisfy the search criteria. It can happen
1284  			 * due to "vstart" restriction or an alignment overhead
1285   * that is bigger than PAGE_SIZE.
1286  			 */
1287  			while ((node = rb_parent(node))) {
1288  				va = rb_entry(node, struct vmap_area, rb_node);
1289  				if (is_within_this_va(va, size, align, vstart))
1290  					return va;
1291  
1292  				if (get_subtree_max_size(node->rb_right) >= length &&
1293  						vstart <= va->va_start) {
1294  					/*
1295  					 * Shift the vstart forward. Please note, we update it with
1296  					 * parent's start address adding "1" because we do not want
1297   * to enter the same sub-tree after it has already been checked
1298  					 * and no suitable free block found there.
1299  					 */
1300  					vstart = va->va_start + 1;
1301  					node = node->rb_right;
1302  					break;
1303  				}
1304  			}
1305  		}
1306  	}
1307  
1308  	return NULL;
1309  }
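/*
 * Worked example for the "length" adjustment above: a request for
 * size == 2 * PAGE_SIZE with align == 4 * PAGE_SIZE searches for free
 * blocks of at least 2 * PAGE_SIZE + 4 * PAGE_SIZE - 1 bytes, so that the
 * block is still large enough after its start is rounded up to "align".
 */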
1310  
1311  #if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
1312  #include <linux/random.h>
1313  
1314  static struct vmap_area *
1315  find_vmap_lowest_linear_match(struct list_head *head, unsigned long size,
1316  	unsigned long align, unsigned long vstart)
1317  {
1318  	struct vmap_area *va;
1319  
1320  	list_for_each_entry(va, head, list) {
1321  		if (!is_within_this_va(va, size, align, vstart))
1322  			continue;
1323  
1324  		return va;
1325  	}
1326  
1327  	return NULL;
1328  }
1329  
1330  static void
1331  find_vmap_lowest_match_check(struct rb_root *root, struct list_head *head,
1332  			     unsigned long size, unsigned long align)
1333  {
1334  	struct vmap_area *va_1, *va_2;
1335  	unsigned long vstart;
1336  	unsigned int rnd;
1337  
1338  	get_random_bytes(&rnd, sizeof(rnd));
1339  	vstart = VMALLOC_START + rnd;
1340  
1341  	va_1 = find_vmap_lowest_match(root, size, align, vstart, false);
1342  	va_2 = find_vmap_lowest_linear_match(head, size, align, vstart);
1343  
1344  	if (va_1 != va_2)
1345  		pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n",
1346  			va_1, va_2, vstart);
1347  }
1348  #endif
1349  
1350  enum fit_type {
1351  	NOTHING_FIT = 0,
1352  	FL_FIT_TYPE = 1,	/* full fit */
1353  	LE_FIT_TYPE = 2,	/* left edge fit */
1354  	RE_FIT_TYPE = 3,	/* right edge fit */
1355  	NE_FIT_TYPE = 4		/* no edge fit */
1356  };
1357  
1358  static __always_inline enum fit_type
1359  classify_va_fit_type(struct vmap_area *va,
1360  	unsigned long nva_start_addr, unsigned long size)
1361  {
1362  	enum fit_type type;
1363  
1364  	/* Check if it is within VA. */
1365  	if (nva_start_addr < va->va_start ||
1366  			nva_start_addr + size > va->va_end)
1367  		return NOTHING_FIT;
1368  
1369  	/* Now classify. */
1370  	if (va->va_start == nva_start_addr) {
1371  		if (va->va_end == nva_start_addr + size)
1372  			type = FL_FIT_TYPE;
1373  		else
1374  			type = LE_FIT_TYPE;
1375  	} else if (va->va_end == nva_start_addr + size) {
1376  		type = RE_FIT_TYPE;
1377  	} else {
1378  		type = NE_FIT_TYPE;
1379  	}
1380  
1381  	return type;
1382  }
1383  
1384  static __always_inline int
1385  adjust_va_to_fit_type(struct rb_root *root, struct list_head *head,
1386  		      struct vmap_area *va, unsigned long nva_start_addr,
1387  		      unsigned long size)
1388  {
1389  	struct vmap_area *lva = NULL;
1390  	enum fit_type type = classify_va_fit_type(va, nva_start_addr, size);
1391  
1392  	if (type == FL_FIT_TYPE) {
1393  		/*
1394  		 * No need to split VA, it fully fits.
1395  		 *
1396  		 * |               |
1397  		 * V      NVA      V
1398  		 * |---------------|
1399  		 */
1400  		unlink_va_augment(va, root);
1401  		kmem_cache_free(vmap_area_cachep, va);
1402  	} else if (type == LE_FIT_TYPE) {
1403  		/*
1404  		 * Split left edge of fit VA.
1405  		 *
1406  		 * |       |
1407  		 * V  NVA  V   R
1408  		 * |-------|-------|
1409  		 */
1410  		va->va_start += size;
1411  	} else if (type == RE_FIT_TYPE) {
1412  		/*
1413  		 * Split right edge of fit VA.
1414  		 *
1415  		 *         |       |
1416  		 *     L   V  NVA  V
1417  		 * |-------|-------|
1418  		 */
1419  		va->va_end = nva_start_addr;
1420  	} else if (type == NE_FIT_TYPE) {
1421  		/*
1422  		 * Split no edge of fit VA.
1423  		 *
1424  		 *     |       |
1425  		 *   L V  NVA  V R
1426  		 * |---|-------|---|
1427  		 */
1428  		lva = __this_cpu_xchg(ne_fit_preload_node, NULL);
1429  		if (unlikely(!lva)) {
1430  			/*
1431  			 * For percpu allocator we do not do any pre-allocation
1432  			 * and leave it as it is. The reason is it most likely
1433  			 * never ends up with NE_FIT_TYPE splitting. In case of
1434  			 * percpu allocations offsets and sizes are aligned to
1435  			 * fixed align request, i.e. RE_FIT_TYPE and FL_FIT_TYPE
1436  			 * are its main fitting cases.
1437  			 *
1438  			 * There are a few exceptions though; for example, the
1439  			 * first allocation (early boot up), when we have "one"
1440  			 * big free space that has to be split.
1441  			 *
1442  			 * Also we can hit this path in case of regular "vmap"
1443  			 * allocations, if "this" current CPU was not preloaded.
1444  			 * See the comment in alloc_vmap_area() why. If so, then
1445  			 * GFP_NOWAIT is used instead to get an extra object for
1446  			 * split purpose. That is rare and most time does not
1447  			 * occur.
1448  			 *
1449  			 * What happens if an allocation fails? Basically,
1450  			 * an "overflow" path is triggered to purge lazily freed
1451  			 * areas to free some memory, then, the "retry" path is
1452  			 * triggered to repeat one more time. See more details
1453  			 * in alloc_vmap_area() function.
1454  			 */
1455  			lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT);
1456  			if (!lva)
1457  				return -1;
1458  		}
1459  
1460  		/*
1461  		 * Build the remainder.
1462  		 */
1463  		lva->va_start = va->va_start;
1464  		lva->va_end = nva_start_addr;
1465  
1466  		/*
1467  		 * Shrink this VA to remaining size.
1468  		 */
1469  		va->va_start = nva_start_addr + size;
1470  	} else {
1471  		return -1;
1472  	}
1473  
1474  	if (type != FL_FIT_TYPE) {
1475  		augment_tree_propagate_from(va);
1476  
1477  		if (lva)	/* type == NE_FIT_TYPE */
1478  			insert_vmap_area_augment(lva, &va->rb_node, root, head);
1479  	}
1480  
1481  	return 0;
1482  }
1483  
1484  /*
1485   * Returns the start address of the newly allocated area on success.
1486   * Otherwise, "vend" is returned to indicate failure.
1487   */
1488  static __always_inline unsigned long
1489  __alloc_vmap_area(struct rb_root *root, struct list_head *head,
1490  	unsigned long size, unsigned long align,
1491  	unsigned long vstart, unsigned long vend)
1492  {
1493  	bool adjust_search_size = true;
1494  	unsigned long nva_start_addr;
1495  	struct vmap_area *va;
1496  	int ret;
1497  
1498  	/*
1499  	 * Do not adjust when:
1500  	 *   a) align <= PAGE_SIZE, because it does not make any sense.
1501  	 *      All blocks(their start addresses) are at least PAGE_SIZE
1502  	 *      aligned anyway;
1503  	 *   b) a short range where a requested size corresponds to exactly
1504  	 *      specified [vstart:vend] interval and an alignment > PAGE_SIZE.
1505  	 *      With adjusted search length an allocation would not succeed.
1506  	 */
1507  	if (align <= PAGE_SIZE || (align > PAGE_SIZE && (vend - vstart) == size))
1508  		adjust_search_size = false;
1509  
1510  	va = find_vmap_lowest_match(root, size, align, vstart, adjust_search_size);
1511  	if (unlikely(!va))
1512  		return vend;
1513  
1514  	if (va->va_start > vstart)
1515  		nva_start_addr = ALIGN(va->va_start, align);
1516  	else
1517  		nva_start_addr = ALIGN(vstart, align);
1518  
1519  	/* Check the "vend" restriction. */
1520  	if (nva_start_addr + size > vend)
1521  		return vend;
1522  
1523  	/* Update the free vmap_area. */
1524  	ret = adjust_va_to_fit_type(root, head, va, nva_start_addr, size);
1525  	if (WARN_ON_ONCE(ret))
1526  		return vend;
1527  
1528  #if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
1529  	find_vmap_lowest_match_check(root, head, size, align);
1530  #endif
1531  
1532  	return nva_start_addr;
1533  }
1534  
1535  /*
1536   * Free a region of KVA allocated by alloc_vmap_area
1537   */
1538  static void free_vmap_area(struct vmap_area *va)
1539  {
1540  	/*
1541  	 * Remove from the busy tree/list.
1542  	 */
1543  	spin_lock(&vmap_area_lock);
1544  	unlink_va(va, &vmap_area_root);
1545  	spin_unlock(&vmap_area_lock);
1546  
1547  	/*
1548  	 * Insert/Merge it back to the free tree/list.
1549  	 */
1550  	spin_lock(&free_vmap_area_lock);
1551  	merge_or_add_vmap_area_augment(va, &free_vmap_area_root, &free_vmap_area_list);
1552  	spin_unlock(&free_vmap_area_lock);
1553  }
1554  
1555  static inline void
1556  preload_this_cpu_lock(spinlock_t *lock, gfp_t gfp_mask, int node)
1557  {
1558  	struct vmap_area *va = NULL;
1559  
1560  	/*
1561  	 * Preload this CPU with one extra vmap_area object. It is used
1562  	 * when fit type of free area is NE_FIT_TYPE. It guarantees that
1563  	 * a CPU that does an allocation is preloaded.
1564  	 *
1565  	 * We do it in non-atomic context, thus it allows us to use more
1566  	 * permissive allocation masks to be more stable under low memory
1567  	 * condition and high memory pressure.
1568  	 */
1569  	if (!this_cpu_read(ne_fit_preload_node))
1570  		va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
1571  
1572  	spin_lock(lock);
1573  
1574  	if (va && __this_cpu_cmpxchg(ne_fit_preload_node, NULL, va))
1575  		kmem_cache_free(vmap_area_cachep, va);
1576  }
1577  
1578  /*
1579   * Allocate a region of KVA of the specified size and alignment, within the
1580   * vstart and vend.
1581   */
1582  static struct vmap_area *alloc_vmap_area(unsigned long size,
1583  				unsigned long align,
1584  				unsigned long vstart, unsigned long vend,
1585  				int node, gfp_t gfp_mask,
1586  				unsigned long va_flags)
1587  {
1588  	struct vmap_area *va;
1589  	unsigned long freed;
1590  	unsigned long addr;
1591  	int purged = 0;
1592  	int ret;
1593  
1594  	if (unlikely(!size || offset_in_page(size) || !is_power_of_2(align)))
1595  		return ERR_PTR(-EINVAL);
1596  
1597  	if (unlikely(!vmap_initialized))
1598  		return ERR_PTR(-EBUSY);
1599  
1600  	might_sleep();
1601  	gfp_mask = gfp_mask & GFP_RECLAIM_MASK;
1602  
1603  	va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
1604  	if (unlikely(!va))
1605  		return ERR_PTR(-ENOMEM);
1606  
1607  	/*
1608  	 * Only scan the relevant parts containing pointers to other objects
1609  	 * to avoid false negatives.
1610  	 */
1611  	kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask);
1612  
1613  retry:
1614  	preload_this_cpu_lock(&free_vmap_area_lock, gfp_mask, node);
1615  	addr = __alloc_vmap_area(&free_vmap_area_root, &free_vmap_area_list,
1616  		size, align, vstart, vend);
1617  	spin_unlock(&free_vmap_area_lock);
1618  
1619  	trace_alloc_vmap_area(addr, size, align, vstart, vend, addr == vend);
1620  
1621  	/*
1622  	 * If an allocation fails, the "vend" address is
1623  	 * returned. Therefore trigger the overflow path.
1624  	 */
1625  	if (unlikely(addr == vend))
1626  		goto overflow;
1627  
1628  	va->va_start = addr;
1629  	va->va_end = addr + size;
1630  	va->vm = NULL;
1631  	va->flags = va_flags;
1632  
1633  	spin_lock(&vmap_area_lock);
1634  	insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
1635  	spin_unlock(&vmap_area_lock);
1636  
1637  	BUG_ON(!IS_ALIGNED(va->va_start, align));
1638  	BUG_ON(va->va_start < vstart);
1639  	BUG_ON(va->va_end > vend);
1640  
1641  	ret = kasan_populate_vmalloc(addr, size);
1642  	if (ret) {
1643  		free_vmap_area(va);
1644  		return ERR_PTR(ret);
1645  	}
1646  
1647  	return va;
1648  
1649  overflow:
1650  	if (!purged) {
1651  		reclaim_and_purge_vmap_areas();
1652  		purged = 1;
1653  		goto retry;
1654  	}
1655  
1656  	freed = 0;
1657  	blocking_notifier_call_chain(&vmap_notify_list, 0, &freed);
1658  
1659  	if (freed > 0) {
1660  		purged = 0;
1661  		goto retry;
1662  	}
1663  
1664  	if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit())
1665  		pr_warn("vmap allocation for size %lu failed: use vmalloc=<size> to increase size\n",
1666  			size);
1667  
1668  	kmem_cache_free(vmap_area_cachep, va);
1669  	return ERR_PTR(-EBUSY);
1670  }
1671  
1672  int register_vmap_purge_notifier(struct notifier_block *nb)
1673  {
1674  	return blocking_notifier_chain_register(&vmap_notify_list, nb);
1675  }
1676  EXPORT_SYMBOL_GPL(register_vmap_purge_notifier);
1677  
1678  int unregister_vmap_purge_notifier(struct notifier_block *nb)
1679  {
1680  	return blocking_notifier_chain_unregister(&vmap_notify_list, nb);
1681  }
1682  EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier);
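
/*
 * Editorial sketch (not part of the original source): how a driver that
 * caches vmalloc()-backed buffers might hook into the purge notifier so
 * its cache is dropped when alloc_vmap_area() runs out of KVA. The
 * callback and example_drop_cache() are hypothetical; only the notifier
 * registration API above is real.
 *
 *	static int example_vmap_purge(struct notifier_block *nb,
 *				      unsigned long event, void *ptr)
 *	{
 *		unsigned long *freed = ptr;
 *
 *		*freed += example_drop_cache();
 *		return NOTIFY_OK;
 *	}
 *
 *	static struct notifier_block example_vmap_nb = {
 *		.notifier_call = example_vmap_purge,
 *	};
 *
 *	register_vmap_purge_notifier(&example_vmap_nb);
 */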
1683  
1684  /*
1685   * lazy_max_pages is the maximum amount of virtual address space we gather up
1686   * before attempting to purge with a TLB flush.
1687   *
1688   * There is a tradeoff here: a larger number will cover more kernel page tables
1689   * and take slightly longer to purge, but it will linearly reduce the number of
1690   * global TLB flushes that must be performed. It would seem natural to scale
1691   * this number up linearly with the number of CPUs (because vmapping activity
1692   * could also scale linearly with the number of CPUs), however it is likely
1693   * that in practice, workloads might be constrained in other ways that mean
1694   * vmap activity will not scale linearly with CPUs. Also, I want to be
1695   * conservative and not introduce a big latency on huge systems, so go with
1696   * a less aggressive log scale. It will still be an improvement over the old
1697   * code, and it will be simple to change the scale factor if we find that it
1698   * becomes a problem on bigger systems.
1699   */
1700  static unsigned long lazy_max_pages(void)
1701  {
1702  	unsigned int log;
1703  
1704  	log = fls(num_online_cpus());
1705  
1706  	return log * (32UL * 1024 * 1024 / PAGE_SIZE);
1707  }
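
/*
 * Worked example (editorial addition): with 16 online CPUs and 4K pages,
 * fls(16) == 5, so up to 5 * 32MB / 4KB == 40960 lazily freed pages
 * (160MB of KVA) accumulate before a purge is triggered.
 */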
1708  
1709  static atomic_long_t vmap_lazy_nr = ATOMIC_LONG_INIT(0);
1710  
1711  /*
1712   * Serialize vmap purging.  There is no actual critical section protected
1713   * by this lock, but we want to avoid concurrent calls for performance
1714   * reasons and to make pcpu_get_vm_areas() more deterministic.
1715   */
1716  static DEFINE_MUTEX(vmap_purge_lock);
1717  
1718  /* for per-CPU blocks */
1719  static void purge_fragmented_blocks_allcpus(void);
1720  
1721  /*
1722   * Purges all lazily-freed vmap areas.
1723   */
1724  static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
1725  {
1726  	unsigned long resched_threshold;
1727  	unsigned int num_purged_areas = 0;
1728  	struct list_head local_purge_list;
1729  	struct vmap_area *va, *n_va;
1730  
1731  	lockdep_assert_held(&vmap_purge_lock);
1732  
1733  	spin_lock(&purge_vmap_area_lock);
1734  	purge_vmap_area_root = RB_ROOT;
1735  	list_replace_init(&purge_vmap_area_list, &local_purge_list);
1736  	spin_unlock(&purge_vmap_area_lock);
1737  
1738  	if (unlikely(list_empty(&local_purge_list)))
1739  		goto out;
1740  
1741  	start = min(start,
1742  		list_first_entry(&local_purge_list,
1743  			struct vmap_area, list)->va_start);
1744  
1745  	end = max(end,
1746  		list_last_entry(&local_purge_list,
1747  			struct vmap_area, list)->va_end);
1748  
1749  	flush_tlb_kernel_range(start, end);
1750  	resched_threshold = lazy_max_pages() << 1;
1751  
1752  	spin_lock(&free_vmap_area_lock);
1753  	list_for_each_entry_safe(va, n_va, &local_purge_list, list) {
1754  		unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
1755  		unsigned long orig_start = va->va_start;
1756  		unsigned long orig_end = va->va_end;
1757  
1758  		/*
1759  		 * Finally insert or merge lazily-freed area. It is
1760  		 * detached and there is no need to "unlink" it from
1761  		 * anything.
1762  		 */
1763  		va = merge_or_add_vmap_area_augment(va, &free_vmap_area_root,
1764  				&free_vmap_area_list);
1765  
1766  		if (!va)
1767  			continue;
1768  
1769  		if (is_vmalloc_or_module_addr((void *)orig_start))
1770  			kasan_release_vmalloc(orig_start, orig_end,
1771  					      va->va_start, va->va_end);
1772  
1773  		atomic_long_sub(nr, &vmap_lazy_nr);
1774  		num_purged_areas++;
1775  
1776  		if (atomic_long_read(&vmap_lazy_nr) < resched_threshold)
1777  			cond_resched_lock(&free_vmap_area_lock);
1778  	}
1779  	spin_unlock(&free_vmap_area_lock);
1780  
1781  out:
1782  	trace_purge_vmap_area_lazy(start, end, num_purged_areas);
1783  	return num_purged_areas > 0;
1784  }
1785  
1786  /*
1787   * Reclaim vmap areas by purging fragmented blocks and purge_vmap_area_list.
1788   */
1789  static void reclaim_and_purge_vmap_areas(void)
1790  
1791  {
1792  	mutex_lock(&vmap_purge_lock);
1793  	purge_fragmented_blocks_allcpus();
1794  	__purge_vmap_area_lazy(ULONG_MAX, 0);
1795  	mutex_unlock(&vmap_purge_lock);
1796  }
1797  
1798  static void drain_vmap_area_work(struct work_struct *work)
1799  {
1800  	unsigned long nr_lazy;
1801  
1802  	do {
1803  		mutex_lock(&vmap_purge_lock);
1804  		__purge_vmap_area_lazy(ULONG_MAX, 0);
1805  		mutex_unlock(&vmap_purge_lock);
1806  
1807  		/* Recheck if further work is required. */
1808  		nr_lazy = atomic_long_read(&vmap_lazy_nr);
1809  	} while (nr_lazy > lazy_max_pages());
1810  }
1811  
1812  /*
1813   * Free a vmap area, the caller ensuring that the area has been unmapped,
1814   * unlinked, and that flush_cache_vunmap() has been called for the correct
1815   * range previously.
1816   */
1817  static void free_vmap_area_noflush(struct vmap_area *va)
1818  {
1819  	unsigned long nr_lazy_max = lazy_max_pages();
1820  	unsigned long va_start = va->va_start;
1821  	unsigned long nr_lazy;
1822  
1823  	if (WARN_ON_ONCE(!list_empty(&va->list)))
1824  		return;
1825  
1826  	nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >>
1827  				PAGE_SHIFT, &vmap_lazy_nr);
1828  
1829  	/*
1830  	 * Merge or place it to the purge tree/list.
1831  	 */
1832  	spin_lock(&purge_vmap_area_lock);
1833  	merge_or_add_vmap_area(va,
1834  		&purge_vmap_area_root, &purge_vmap_area_list);
1835  	spin_unlock(&purge_vmap_area_lock);
1836  
1837  	trace_free_vmap_area_noflush(va_start, nr_lazy, nr_lazy_max);
1838  
1839  	/* After this point, we may free va at any time */
1840  	if (unlikely(nr_lazy > nr_lazy_max))
1841  		schedule_work(&drain_vmap_work);
1842  }
1843  
1844  /*
1845   * Free and unmap a vmap area
1846   */
1847  static void free_unmap_vmap_area(struct vmap_area *va)
1848  {
1849  	flush_cache_vunmap(va->va_start, va->va_end);
1850  	vunmap_range_noflush(va->va_start, va->va_end);
1851  	if (debug_pagealloc_enabled_static())
1852  		flush_tlb_kernel_range(va->va_start, va->va_end);
1853  
1854  	free_vmap_area_noflush(va);
1855  }
1856  
1857  struct vmap_area *find_vmap_area(unsigned long addr)
1858  {
1859  	struct vmap_area *va;
1860  
1861  	spin_lock(&vmap_area_lock);
1862  	va = __find_vmap_area(addr, &vmap_area_root);
1863  	spin_unlock(&vmap_area_lock);
1864  
1865  	return va;
1866  }
1867  
1868  static struct vmap_area *find_unlink_vmap_area(unsigned long addr)
1869  {
1870  	struct vmap_area *va;
1871  
1872  	spin_lock(&vmap_area_lock);
1873  	va = __find_vmap_area(addr, &vmap_area_root);
1874  	if (va)
1875  		unlink_va(va, &vmap_area_root);
1876  	spin_unlock(&vmap_area_lock);
1877  
1878  	return va;
1879  }
1880  
1881  /*** Per cpu kva allocator ***/
1882  
1883  /*
1884   * vmap space is limited especially on 32 bit architectures. Ensure there is
1885   * room for at least 16 percpu vmap blocks per CPU.
1886   */
1887  /*
1888   * If we had a constant VMALLOC_START and VMALLOC_END, we'd like to be able
1889   * to #define VMALLOC_SPACE		(VMALLOC_END-VMALLOC_START). Guess
1890   * instead (we just need a rough idea)
1891   */
1892  #if BITS_PER_LONG == 32
1893  #define VMALLOC_SPACE		(128UL*1024*1024)
1894  #else
1895  #define VMALLOC_SPACE		(128UL*1024*1024*1024)
1896  #endif
1897  
1898  #define VMALLOC_PAGES		(VMALLOC_SPACE / PAGE_SIZE)
1899  #define VMAP_MAX_ALLOC		BITS_PER_LONG	/* 256K with 4K pages */
1900  #define VMAP_BBMAP_BITS_MAX	1024	/* 4MB with 4K pages */
1901  #define VMAP_BBMAP_BITS_MIN	(VMAP_MAX_ALLOC*2)
1902  #define VMAP_MIN(x, y)		((x) < (y) ? (x) : (y)) /* can't use min() */
1903  #define VMAP_MAX(x, y)		((x) > (y) ? (x) : (y)) /* can't use max() */
1904  #define VMAP_BBMAP_BITS		\
1905  		VMAP_MIN(VMAP_BBMAP_BITS_MAX,	\
1906  		VMAP_MAX(VMAP_BBMAP_BITS_MIN,	\
1907  			VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16))
1908  
1909  #define VMAP_BLOCK_SIZE		(VMAP_BBMAP_BITS * PAGE_SIZE)
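
/*
 * Worked example (editorial addition): on a 64-bit kernel with 4K pages
 * and NR_CPUS == 64, VMALLOC_PAGES is 32M, so the sizing term is
 * 32M / 64 / 16 == 32768; clamping by VMAP_BBMAP_BITS_MAX then gives
 * VMAP_BBMAP_BITS == 1024 and VMAP_BLOCK_SIZE == 4MB.
 */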
1910  
1911  /*
1912   * Purge threshold to prevent overeager purging of fragmented blocks for
1913   * regular operations: Purge if vb->free is less than 1/4 of the capacity.
1914   */
1915  #define VMAP_PURGE_THRESHOLD	(VMAP_BBMAP_BITS / 4)
1916  
1917  #define VMAP_RAM		0x1 /* indicates vm_map_ram area*/
1918  #define VMAP_BLOCK		0x2 /* mark out the vmap_block sub-type*/
1919  #define VMAP_FLAGS_MASK		0x3
1920  
1921  struct vmap_block_queue {
1922  	spinlock_t lock;
1923  	struct list_head free;
1924  
1925  	/*
1926  	 * An xarray requires extra memory to be allocated dynamically.
1927  	 * If that ever becomes an issue, we can switch to an rb-tree
1928  	 * instead.
1929  	 */
1930  	struct xarray vmap_blocks;
1931  };
1932  
1933  struct vmap_block {
1934  	spinlock_t lock;
1935  	struct vmap_area *va;
1936  	unsigned long free, dirty;
1937  	DECLARE_BITMAP(used_map, VMAP_BBMAP_BITS);
1938  	unsigned long dirty_min, dirty_max; /*< dirty range */
1939  	struct list_head free_list;
1940  	struct rcu_head rcu_head;
1941  	struct list_head purge;
1942  	unsigned int cpu;
1943  };
1944  
1945  /* Queue of free and dirty vmap blocks, for allocation and flushing purposes */
1946  static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue);
1947  
1948  /*
1949   * To get fast access to any "vmap_block" associated with a
1950   * specific address, we use a hash.
1951   *
1952   * A per-cpu vmap_block_queue is used in two ways: to serialize
1953   * access to the free block chains among CPUs (alloc path) and to
1954   * act as a vmap_block hash (alloc/free paths). That is, we overload
1955   * it, since we already have the per-cpu array which can be used
1956   * as a hash table. When used as a hash, the 'cpu' passed to
1957   * per_cpu() is not actually a CPU but rather a hash index.
1958   *
1959   * The hash function is addr_to_vb_xa(), which hashes any address
1960   * to the specific index (in the hash) it belongs to. The per_cpu()
1961   * macro is then used to access the array at the generated index.
1962   *
1963   * An example:
1964   *
1965   *  CPU_1  CPU_2  CPU_0
1966   *    |      |      |
1967   *    V      V      V
1968   * 0     10     20     30     40     50     60
1969   * |------|------|------|------|------|------|...<vmap address space>
1970   *   CPU0   CPU1   CPU2   CPU0   CPU1   CPU2
1971   *
1972   * - CPU_1 invokes vm_unmap_ram(6), 6 belongs to CPU0 zone, thus
1973   *   it accesses CPU0/INDEX0 -> vmap_blocks -> xa_lock;
1974   *
1975   * - CPU_2 invokes vm_unmap_ram(11), 11 belongs to CPU1 zone, thus
1976   *   it accesses CPU1/INDEX1 -> vmap_blocks -> xa_lock;
1977   *
1978   * - CPU_0 invokes vm_unmap_ram(20), 20 belongs to CPU2 zone, thus
1979   *   it accesses CPU2/INDEX2 -> vmap_blocks -> xa_lock.
1980   *
1981   * This technique almost always avoids lock contention on insert/remove,
1982   * however xarray spinlocks protect against any contention that remains.
1983   */
1984  static struct xarray *
1985  addr_to_vb_xa(unsigned long addr)
1986  {
1987  	int index = (addr / VMAP_BLOCK_SIZE) % nr_cpu_ids;
1988  
1989  	/*
1990  	 * Note that nr_cpu_ids - 1 is the highest possible CPU, so
1991  	 * cpumask_next() is never invoked when the index already points
1992  	 * at it; for any other hole a higher possible CPU always exists.
1993  	 */
1994  	if (!cpu_possible(index))
1995  		index = cpumask_next(index, cpu_possible_mask);
1996  
1997  	return &per_cpu(vmap_block_queue, index).vmap_blocks;
1998  }
1999  
2000  /*
2001   * We should probably have a fallback mechanism to allocate virtual memory
2002   * out of partially filled vmap blocks. However vmap block sizing should be
2003   * fairly reasonable according to the vmalloc size, so it shouldn't be a
2004   * big problem.
2005   */
2006  
2007  static unsigned long addr_to_vb_idx(unsigned long addr)
2008  {
2009  	addr -= VMALLOC_START & ~(VMAP_BLOCK_SIZE-1);
2010  	addr /= VMAP_BLOCK_SIZE;
2011  	return addr;
2012  }
2013  
2014  static void *vmap_block_vaddr(unsigned long va_start, unsigned long pages_off)
2015  {
2016  	unsigned long addr;
2017  
2018  	addr = va_start + (pages_off << PAGE_SHIFT);
2019  	BUG_ON(addr_to_vb_idx(addr) != addr_to_vb_idx(va_start));
2020  	return (void *)addr;
2021  }
2022  
2023  /**
2024   * new_vmap_block - allocate a new vmap_block and occupy 2^order pages in
2025   *                  this block. The number of pages cannot exceed VMAP_BBMAP_BITS.
2026   * @order:    how many 2^order pages should be occupied in newly allocated block
2027   * @gfp_mask: flags for the page level allocator
2028   *
2029   * Return: virtual address in a newly allocated block or ERR_PTR(-errno)
2030   */
2031  static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
2032  {
2033  	struct vmap_block_queue *vbq;
2034  	struct vmap_block *vb;
2035  	struct vmap_area *va;
2036  	struct xarray *xa;
2037  	unsigned long vb_idx;
2038  	int node, err;
2039  	void *vaddr;
2040  
2041  	node = numa_node_id();
2042  
2043  	vb = kmalloc_node(sizeof(struct vmap_block),
2044  			gfp_mask & GFP_RECLAIM_MASK, node);
2045  	if (unlikely(!vb))
2046  		return ERR_PTR(-ENOMEM);
2047  
2048  	va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
2049  					VMALLOC_START, VMALLOC_END,
2050  					node, gfp_mask,
2051  					VMAP_RAM|VMAP_BLOCK);
2052  	if (IS_ERR(va)) {
2053  		kfree(vb);
2054  		return ERR_CAST(va);
2055  	}
2056  
2057  	vaddr = vmap_block_vaddr(va->va_start, 0);
2058  	spin_lock_init(&vb->lock);
2059  	vb->va = va;
2060  	/* At least something should be left free */
2061  	BUG_ON(VMAP_BBMAP_BITS <= (1UL << order));
2062  	bitmap_zero(vb->used_map, VMAP_BBMAP_BITS);
2063  	vb->free = VMAP_BBMAP_BITS - (1UL << order);
2064  	vb->dirty = 0;
2065  	vb->dirty_min = VMAP_BBMAP_BITS;
2066  	vb->dirty_max = 0;
2067  	bitmap_set(vb->used_map, 0, (1UL << order));
2068  	INIT_LIST_HEAD(&vb->free_list);
2069  	vb->cpu = raw_smp_processor_id();
2070  
2071  	xa = addr_to_vb_xa(va->va_start);
2072  	vb_idx = addr_to_vb_idx(va->va_start);
2073  	err = xa_insert(xa, vb_idx, vb, gfp_mask);
2074  	if (err) {
2075  		kfree(vb);
2076  		free_vmap_area(va);
2077  		return ERR_PTR(err);
2078  	}
2079  	/*
2080  	 * The list_add_tail_rcu() below may run on another CPU
2081  	 * than vb->cpu due to task migration, which is safe, as
2082  	 * list_add_tail_rcu() ensures the list's integrity
2083  	 * together with list_for_each_entry_rcu() on the read
2084  	 * side.
2085  	 */
2086  	vbq = per_cpu_ptr(&vmap_block_queue, vb->cpu);
2087  	spin_lock(&vbq->lock);
2088  	list_add_tail_rcu(&vb->free_list, &vbq->free);
2089  	spin_unlock(&vbq->lock);
2090  
2091  	return vaddr;
2092  }
2093  
2094  static void free_vmap_block(struct vmap_block *vb)
2095  {
2096  	struct vmap_block *tmp;
2097  	struct xarray *xa;
2098  
2099  	xa = addr_to_vb_xa(vb->va->va_start);
2100  	tmp = xa_erase(xa, addr_to_vb_idx(vb->va->va_start));
2101  	BUG_ON(tmp != vb);
2102  
2103  	spin_lock(&vmap_area_lock);
2104  	unlink_va(vb->va, &vmap_area_root);
2105  	spin_unlock(&vmap_area_lock);
2106  
2107  	free_vmap_area_noflush(vb->va);
2108  	kfree_rcu(vb, rcu_head);
2109  }
2110  
2111  static bool purge_fragmented_block(struct vmap_block *vb,
2112  		struct list_head *purge_list, bool force_purge)
2113  {
2114  	struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, vb->cpu);
2115  
2116  	if (vb->free + vb->dirty != VMAP_BBMAP_BITS ||
2117  	    vb->dirty == VMAP_BBMAP_BITS)
2118  		return false;
2119  
2120  	/* Don't overeagerly purge usable blocks unless requested */
2121  	if (!(force_purge || vb->free < VMAP_PURGE_THRESHOLD))
2122  		return false;
2123  
2124  	/* prevent further allocs after releasing lock */
2125  	WRITE_ONCE(vb->free, 0);
2126  	/* prevent purging it again */
2127  	WRITE_ONCE(vb->dirty, VMAP_BBMAP_BITS);
2128  	vb->dirty_min = 0;
2129  	vb->dirty_max = VMAP_BBMAP_BITS;
2130  	spin_lock(&vbq->lock);
2131  	list_del_rcu(&vb->free_list);
2132  	spin_unlock(&vbq->lock);
2133  	list_add_tail(&vb->purge, purge_list);
2134  	return true;
2135  }
2136  
2137  static void free_purged_blocks(struct list_head *purge_list)
2138  {
2139  	struct vmap_block *vb, *n_vb;
2140  
2141  	list_for_each_entry_safe(vb, n_vb, purge_list, purge) {
2142  		list_del(&vb->purge);
2143  		free_vmap_block(vb);
2144  	}
2145  }
2146  
2147  static void purge_fragmented_blocks(int cpu)
2148  {
2149  	LIST_HEAD(purge);
2150  	struct vmap_block *vb;
2151  	struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
2152  
2153  	rcu_read_lock();
2154  	list_for_each_entry_rcu(vb, &vbq->free, free_list) {
2155  		unsigned long free = READ_ONCE(vb->free);
2156  		unsigned long dirty = READ_ONCE(vb->dirty);
2157  
2158  		if (free + dirty != VMAP_BBMAP_BITS ||
2159  		    dirty == VMAP_BBMAP_BITS)
2160  			continue;
2161  
2162  		spin_lock(&vb->lock);
2163  		purge_fragmented_block(vb, &purge, true);
2164  		spin_unlock(&vb->lock);
2165  	}
2166  	rcu_read_unlock();
2167  	free_purged_blocks(&purge);
2168  }
2169  
2170  static void purge_fragmented_blocks_allcpus(void)
2171  {
2172  	int cpu;
2173  
2174  	for_each_possible_cpu(cpu)
2175  		purge_fragmented_blocks(cpu);
2176  }
2177  
2178  static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
2179  {
2180  	struct vmap_block_queue *vbq;
2181  	struct vmap_block *vb;
2182  	void *vaddr = NULL;
2183  	unsigned int order;
2184  
2185  	BUG_ON(offset_in_page(size));
2186  	BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
2187  	if (WARN_ON(size == 0)) {
2188  		/*
2189  		 * Allocating 0 bytes isn't what the caller wants, since
2190  		 * get_order(0) returns a nonsensical result. Just warn and
2191  		 * bail out early.
2192  		 */
2193  		return NULL;
2194  	}
2195  	order = get_order(size);
2196  
2197  	rcu_read_lock();
2198  	vbq = raw_cpu_ptr(&vmap_block_queue);
2199  	list_for_each_entry_rcu(vb, &vbq->free, free_list) {
2200  		unsigned long pages_off;
2201  
2202  		if (READ_ONCE(vb->free) < (1UL << order))
2203  			continue;
2204  
2205  		spin_lock(&vb->lock);
2206  		if (vb->free < (1UL << order)) {
2207  			spin_unlock(&vb->lock);
2208  			continue;
2209  		}
2210  
2211  		pages_off = VMAP_BBMAP_BITS - vb->free;
2212  		vaddr = vmap_block_vaddr(vb->va->va_start, pages_off);
2213  		WRITE_ONCE(vb->free, vb->free - (1UL << order));
2214  		bitmap_set(vb->used_map, pages_off, (1UL << order));
2215  		if (vb->free == 0) {
2216  			spin_lock(&vbq->lock);
2217  			list_del_rcu(&vb->free_list);
2218  			spin_unlock(&vbq->lock);
2219  		}
2220  
2221  		spin_unlock(&vb->lock);
2222  		break;
2223  	}
2224  
2225  	rcu_read_unlock();
2226  
2227  	/* Allocate new block if nothing was found */
2228  	if (!vaddr)
2229  		vaddr = new_vmap_block(order, gfp_mask);
2230  
2231  	return vaddr;
2232  }
2233  
2234  static void vb_free(unsigned long addr, unsigned long size)
2235  {
2236  	unsigned long offset;
2237  	unsigned int order;
2238  	struct vmap_block *vb;
2239  	struct xarray *xa;
2240  
2241  	BUG_ON(offset_in_page(size));
2242  	BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
2243  
2244  	flush_cache_vunmap(addr, addr + size);
2245  
2246  	order = get_order(size);
2247  	offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT;
2248  
2249  	xa = addr_to_vb_xa(addr);
2250  	vb = xa_load(xa, addr_to_vb_idx(addr));
2251  
2252  	spin_lock(&vb->lock);
2253  	bitmap_clear(vb->used_map, offset, (1UL << order));
2254  	spin_unlock(&vb->lock);
2255  
2256  	vunmap_range_noflush(addr, addr + size);
2257  
2258  	if (debug_pagealloc_enabled_static())
2259  		flush_tlb_kernel_range(addr, addr + size);
2260  
2261  	spin_lock(&vb->lock);
2262  
2263  	/* Expand the not yet TLB flushed dirty range */
2264  	vb->dirty_min = min(vb->dirty_min, offset);
2265  	vb->dirty_max = max(vb->dirty_max, offset + (1UL << order));
2266  
2267  	WRITE_ONCE(vb->dirty, vb->dirty + (1UL << order));
2268  	if (vb->dirty == VMAP_BBMAP_BITS) {
2269  		BUG_ON(vb->free);
2270  		spin_unlock(&vb->lock);
2271  		free_vmap_block(vb);
2272  	} else
2273  		spin_unlock(&vb->lock);
2274  }
2275  
2276  static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush)
2277  {
2278  	LIST_HEAD(purge_list);
2279  	int cpu;
2280  
2281  	if (unlikely(!vmap_initialized))
2282  		return;
2283  
2284  	mutex_lock(&vmap_purge_lock);
2285  
2286  	for_each_possible_cpu(cpu) {
2287  		struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
2288  		struct vmap_block *vb;
2289  		unsigned long idx;
2290  
2291  		rcu_read_lock();
2292  		xa_for_each(&vbq->vmap_blocks, idx, vb) {
2293  			spin_lock(&vb->lock);
2294  
2295  			/*
2296  			 * Try to purge a fragmented block first. If it's
2297  			 * not purgeable, check whether there is dirty
2298  			 * space to be flushed.
2299  			 */
2300  			if (!purge_fragmented_block(vb, &purge_list, false) &&
2301  			    vb->dirty_max && vb->dirty != VMAP_BBMAP_BITS) {
2302  				unsigned long va_start = vb->va->va_start;
2303  				unsigned long s, e;
2304  
2305  				s = va_start + (vb->dirty_min << PAGE_SHIFT);
2306  				e = va_start + (vb->dirty_max << PAGE_SHIFT);
2307  
2308  				start = min(s, start);
2309  				end   = max(e, end);
2310  
2311  				/* Prevent this range from being flushed again */
2312  				vb->dirty_min = VMAP_BBMAP_BITS;
2313  				vb->dirty_max = 0;
2314  
2315  				flush = 1;
2316  			}
2317  			spin_unlock(&vb->lock);
2318  		}
2319  		rcu_read_unlock();
2320  	}
2321  	free_purged_blocks(&purge_list);
2322  
2323  	if (!__purge_vmap_area_lazy(start, end) && flush)
2324  		flush_tlb_kernel_range(start, end);
2325  	mutex_unlock(&vmap_purge_lock);
2326  }
2327  
2328  /**
2329   * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer
2330   *
2331   * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily
2332   * to amortize TLB flushing overheads. What this means is that any page you
2333   * to amortize TLB flushing overheads. What this means is that any page you
2334   * have now may, in a former life, have been mapped into a kernel virtual
2335   * address by the vmap layer, and so there might be some CPUs with TLB entries
2336   * still referencing that page (in addition to the regular 1:1 kernel mapping).
2337   * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can
2338   * be sure that none of the pages we have control over will have any aliases
2339   * from the vmap layer.
2340   */
2341  void vm_unmap_aliases(void)
2342  {
2343  	unsigned long start = ULONG_MAX, end = 0;
2344  	int flush = 0;
2345  
2346  	_vm_unmap_aliases(start, end, flush);
2347  }
2348  EXPORT_SYMBOL_GPL(vm_unmap_aliases);
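
/*
 * Editorial sketch (not part of the original source): a rough example of
 * the kind of sequence where vm_unmap_aliases() matters. Before changing
 * attributes of pages that may have been vmapped earlier, flush any lazy
 * aliases so no stale TLB entry with the old attributes survives. The
 * 'addr' and 'nr' variables are hypothetical.
 *
 *	vm_unmap_aliases();
 *	set_memory_ro(addr, nr);	// arch helper touching the direct map
 */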
2349  
2350  /**
2351   * vm_unmap_ram - unmap linear kernel address space set up by vm_map_ram
2352   * @mem: the pointer returned by vm_map_ram
2353   * @count: the count passed to that vm_map_ram call (cannot unmap partial)
2354   */
2355  void vm_unmap_ram(const void *mem, unsigned int count)
2356  {
2357  	unsigned long size = (unsigned long)count << PAGE_SHIFT;
2358  	unsigned long addr = (unsigned long)kasan_reset_tag(mem);
2359  	struct vmap_area *va;
2360  
2361  	might_sleep();
2362  	BUG_ON(!addr);
2363  	BUG_ON(addr < VMALLOC_START);
2364  	BUG_ON(addr > VMALLOC_END);
2365  	BUG_ON(!PAGE_ALIGNED(addr));
2366  
2367  	kasan_poison_vmalloc(mem, size);
2368  
2369  	if (likely(count <= VMAP_MAX_ALLOC)) {
2370  		debug_check_no_locks_freed(mem, size);
2371  		vb_free(addr, size);
2372  		return;
2373  	}
2374  
2375  	va = find_unlink_vmap_area(addr);
2376  	if (WARN_ON_ONCE(!va))
2377  		return;
2378  
2379  	debug_check_no_locks_freed((void *)va->va_start,
2380  				    (va->va_end - va->va_start));
2381  	free_unmap_vmap_area(va);
2382  }
2383  EXPORT_SYMBOL(vm_unmap_ram);
2384  
2385  /**
2386   * vm_map_ram - map pages linearly into kernel virtual address (vmalloc space)
2387   * @pages: an array of pointers to the pages to be mapped
2388   * @count: number of pages
2389   * @node: prefer to allocate data structures on this node
2390   *
2391   * If you use this function for less than VMAP_MAX_ALLOC pages, it could be
2392   * faster than vmap() so it's good.  But if you mix long-lived and short-lived
2393   * objects with vm_map_ram(), it could consume lots of address space through
2394   * fragmentation (especially on a 32-bit machine), and you could eventually
2395   * see failures.  Please use this function only for short-lived objects.
2396   *
2397   * Returns: a pointer to the address that has been mapped, or %NULL on failure
2398   */
2399  void *vm_map_ram(struct page **pages, unsigned int count, int node)
2400  {
2401  	unsigned long size = (unsigned long)count << PAGE_SHIFT;
2402  	unsigned long addr;
2403  	void *mem;
2404  
2405  	if (likely(count <= VMAP_MAX_ALLOC)) {
2406  		mem = vb_alloc(size, GFP_KERNEL);
2407  		if (IS_ERR(mem))
2408  			return NULL;
2409  		addr = (unsigned long)mem;
2410  	} else {
2411  		struct vmap_area *va;
2412  		va = alloc_vmap_area(size, PAGE_SIZE,
2413  				VMALLOC_START, VMALLOC_END,
2414  				node, GFP_KERNEL, VMAP_RAM);
2415  		if (IS_ERR(va))
2416  			return NULL;
2417  
2418  		addr = va->va_start;
2419  		mem = (void *)addr;
2420  	}
2421  
2422  	if (vmap_pages_range(addr, addr + size, PAGE_KERNEL,
2423  				pages, PAGE_SHIFT) < 0) {
2424  		vm_unmap_ram(mem, count);
2425  		return NULL;
2426  	}
2427  
2428  	/*
2429  	 * Mark the pages as accessible, now that they are mapped.
2430  	 * With hardware tag-based KASAN, marking is skipped for
2431  	 * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc().
2432  	 */
2433  	mem = kasan_unpoison_vmalloc(mem, size, KASAN_VMALLOC_PROT_NORMAL);
2434  
2435  	return mem;
2436  }
2437  EXPORT_SYMBOL(vm_map_ram);
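
/*
 * Editorial sketch (not part of the original source): the usual pairing of
 * vm_map_ram()/vm_unmap_ram() for a short-lived mapping. The 'pages' array,
 * 'nr_pages' and 'src' are hypothetical.
 *
 *	void *va = vm_map_ram(pages, nr_pages, NUMA_NO_NODE);
 *
 *	if (!va)
 *		return -ENOMEM;
 *
 *	memcpy(va, src, nr_pages << PAGE_SHIFT);
 *	vm_unmap_ram(va, nr_pages);	// count must match the vm_map_ram() call
 */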
2438  
2439  static struct vm_struct *vmlist __initdata;
2440  
2441  static inline unsigned int vm_area_page_order(struct vm_struct *vm)
2442  {
2443  #ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
2444  	return vm->page_order;
2445  #else
2446  	return 0;
2447  #endif
2448  }
2449  
2450  static inline void set_vm_area_page_order(struct vm_struct *vm, unsigned int order)
2451  {
2452  #ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
2453  	vm->page_order = order;
2454  #else
2455  	BUG_ON(order != 0);
2456  #endif
2457  }
2458  
2459  /**
2460   * vm_area_add_early - add vmap area early during boot
2461   * @vm: vm_struct to add
2462   *
2463   * This function is used to add a fixed kernel vm area to vmlist before
2464   * vmalloc_init() is called.  @vm->addr, @vm->size, and @vm->flags
2465   * should contain proper values and the other fields should be zero.
2466   *
2467   * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
2468   */
2469  void __init vm_area_add_early(struct vm_struct *vm)
2470  {
2471  	struct vm_struct *tmp, **p;
2472  
2473  	BUG_ON(vmap_initialized);
2474  	for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
2475  		if (tmp->addr >= vm->addr) {
2476  			BUG_ON(tmp->addr < vm->addr + vm->size);
2477  			break;
2478  		} else
2479  			BUG_ON(tmp->addr + tmp->size > vm->addr);
2480  	}
2481  	vm->next = *p;
2482  	*p = vm;
2483  }
2484  
2485  /**
2486   * vm_area_register_early - register vmap area early during boot
2487   * @vm: vm_struct to register
2488   * @align: requested alignment
2489   *
2490   * This function is used to register a kernel vm area before
2491   * vmalloc_init() is called.  @vm->size and @vm->flags should contain
2492   * proper values on entry and other fields should be zero.  On return,
2493   * vm->addr contains the allocated address.
2494   *
2495   * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
2496   */
2497  void __init vm_area_register_early(struct vm_struct *vm, size_t align)
2498  {
2499  	unsigned long addr = ALIGN(VMALLOC_START, align);
2500  	struct vm_struct *cur, **p;
2501  
2502  	BUG_ON(vmap_initialized);
2503  
2504  	for (p = &vmlist; (cur = *p) != NULL; p = &cur->next) {
2505  		if ((unsigned long)cur->addr - addr >= vm->size)
2506  			break;
2507  		addr = ALIGN((unsigned long)cur->addr + cur->size, align);
2508  	}
2509  
2510  	BUG_ON(addr > VMALLOC_END - vm->size);
2511  	vm->addr = (void *)addr;
2512  	vm->next = *p;
2513  	*p = vm;
2514  	kasan_populate_early_vm_area_shadow(vm->addr, vm->size);
2515  }
2516  
2517  static void vmap_init_free_space(void)
2518  {
2519  	unsigned long vmap_start = 1;
2520  	const unsigned long vmap_end = ULONG_MAX;
2521  	struct vmap_area *busy, *free;
2522  
2523  	/*
2524  	 *     B     F     B     B     B     F
2525  	 * -|-----|.....|-----|-----|-----|.....|-
2526  	 *  |           The KVA space           |
2527  	 *  |<--------------------------------->|
2528  	 */
2529  	list_for_each_entry(busy, &vmap_area_list, list) {
2530  		if (busy->va_start - vmap_start > 0) {
2531  			free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
2532  			if (!WARN_ON_ONCE(!free)) {
2533  				free->va_start = vmap_start;
2534  				free->va_end = busy->va_start;
2535  
2536  				insert_vmap_area_augment(free, NULL,
2537  					&free_vmap_area_root,
2538  						&free_vmap_area_list);
2539  			}
2540  		}
2541  
2542  		vmap_start = busy->va_end;
2543  	}
2544  
2545  	if (vmap_end - vmap_start > 0) {
2546  		free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
2547  		if (!WARN_ON_ONCE(!free)) {
2548  			free->va_start = vmap_start;
2549  			free->va_end = vmap_end;
2550  
2551  			insert_vmap_area_augment(free, NULL,
2552  				&free_vmap_area_root,
2553  					&free_vmap_area_list);
2554  		}
2555  	}
2556  }
2557  
2558  static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
2559  	struct vmap_area *va, unsigned long flags, const void *caller)
2560  {
2561  	vm->flags = flags;
2562  	vm->addr = (void *)va->va_start;
2563  	vm->size = va->va_end - va->va_start;
2564  	vm->caller = caller;
2565  	va->vm = vm;
2566  }
2567  
2568  static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
2569  			      unsigned long flags, const void *caller)
2570  {
2571  	spin_lock(&vmap_area_lock);
2572  	setup_vmalloc_vm_locked(vm, va, flags, caller);
2573  	spin_unlock(&vmap_area_lock);
2574  }
2575  
2576  static void clear_vm_uninitialized_flag(struct vm_struct *vm)
2577  {
2578  	/*
2579  	 * Before removing VM_UNINITIALIZED,
2580  	 * we should make sure that vm has proper values.
2581  	 * Pair with smp_rmb() in show_numa_info().
2582  	 */
2583  	smp_wmb();
2584  	vm->flags &= ~VM_UNINITIALIZED;
2585  }
2586  
2587  static struct vm_struct *__get_vm_area_node(unsigned long size,
2588  		unsigned long align, unsigned long shift, unsigned long flags,
2589  		unsigned long start, unsigned long end, int node,
2590  		gfp_t gfp_mask, const void *caller)
2591  {
2592  	struct vmap_area *va;
2593  	struct vm_struct *area;
2594  	unsigned long requested_size = size;
2595  
2596  	BUG_ON(in_interrupt());
2597  	size = ALIGN(size, 1ul << shift);
2598  	if (unlikely(!size))
2599  		return NULL;
2600  
2601  	if (flags & VM_IOREMAP)
2602  		align = 1ul << clamp_t(int, get_count_order_long(size),
2603  				       PAGE_SHIFT, IOREMAP_MAX_ORDER);
2604  
2605  	area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
2606  	if (unlikely(!area))
2607  		return NULL;
2608  
2609  	if (!(flags & VM_NO_GUARD))
2610  		size += PAGE_SIZE;
2611  
2612  	va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0);
2613  	if (IS_ERR(va)) {
2614  		kfree(area);
2615  		return NULL;
2616  	}
2617  
2618  	setup_vmalloc_vm(area, va, flags, caller);
2619  
2620  	/*
2621  	 * Mark pages for non-VM_ALLOC mappings as accessible. Do it now as a
2622  	 * best-effort approach, as they can be mapped outside of vmalloc code.
2623  	 * For VM_ALLOC mappings, the pages are marked as accessible after
2624  	 * getting mapped in __vmalloc_node_range().
2625  	 * With hardware tag-based KASAN, marking is skipped for
2626  	 * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc().
2627  	 */
2628  	if (!(flags & VM_ALLOC))
2629  		area->addr = kasan_unpoison_vmalloc(area->addr, requested_size,
2630  						    KASAN_VMALLOC_PROT_NORMAL);
2631  
2632  	return area;
2633  }
2634  
2635  struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
2636  				       unsigned long start, unsigned long end,
2637  				       const void *caller)
2638  {
2639  	return __get_vm_area_node(size, 1, PAGE_SHIFT, flags, start, end,
2640  				  NUMA_NO_NODE, GFP_KERNEL, caller);
2641  }
2642  
2643  /**
2644   * get_vm_area - reserve a contiguous kernel virtual area
2645   * @size:	 size of the area
2646   * @flags:	 %VM_IOREMAP for I/O mappings or VM_ALLOC
2647   *
2648   * Search an area of @size in the kernel virtual mapping area,
2649   * and reserve it for our purposes.  Returns the area descriptor
2650   * on success or %NULL on failure.
2651   *
2652   * Return: the area descriptor on success or %NULL on failure.
2653   */
2654  struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
2655  {
2656  	return __get_vm_area_node(size, 1, PAGE_SHIFT, flags,
2657  				  VMALLOC_START, VMALLOC_END,
2658  				  NUMA_NO_NODE, GFP_KERNEL,
2659  				  __builtin_return_address(0));
2660  }
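
/*
 * Editorial sketch (not part of the original source): reserving KVA with
 * get_vm_area() and mapping a hypothetical MMIO range into it, roughly what
 * the generic ioremap() path does. 'phys_base' is hypothetical.
 *
 *	struct vm_struct *area = get_vm_area(SZ_64K, VM_IOREMAP);
 *
 *	if (!area)
 *		return -ENOMEM;
 *
 *	if (ioremap_page_range((unsigned long)area->addr,
 *			       (unsigned long)area->addr + SZ_64K,
 *			       phys_base, pgprot_noncached(PAGE_KERNEL))) {
 *		free_vm_area(area);
 *		return -ENOMEM;
 *	}
 */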
2661  
2662  struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
2663  				const void *caller)
2664  {
2665  	return __get_vm_area_node(size, 1, PAGE_SHIFT, flags,
2666  				  VMALLOC_START, VMALLOC_END,
2667  				  NUMA_NO_NODE, GFP_KERNEL, caller);
2668  }
2669  
2670  /**
2671   * find_vm_area - find a continuous kernel virtual area
2672   * @addr:	  base address
2673   *
2674   * Search for the kernel VM area starting at @addr, and return it.
2675   * It is up to the caller to do all required locking to keep the returned
2676   * pointer valid.
2677   *
2678   * Return: the area descriptor on success or %NULL on failure.
2679   */
2680  struct vm_struct *find_vm_area(const void *addr)
2681  {
2682  	struct vmap_area *va;
2683  
2684  	va = find_vmap_area((unsigned long)addr);
2685  	if (!va)
2686  		return NULL;
2687  
2688  	return va->vm;
2689  }
2690  
2691  /**
2692   * remove_vm_area - find and remove a continuous kernel virtual area
2693   * @addr:	    base address
2694   *
2695   * Search for the kernel VM area starting at @addr, and remove it.
2696   * This function returns the found VM area, but using it is NOT safe
2697   * on SMP machines, except for its size or flags.
2698   *
2699   * Return: the area descriptor on success or %NULL on failure.
2700   */
2701  struct vm_struct *remove_vm_area(const void *addr)
2702  {
2703  	struct vmap_area *va;
2704  	struct vm_struct *vm;
2705  
2706  	might_sleep();
2707  
2708  	if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n",
2709  			addr))
2710  		return NULL;
2711  
2712  	va = find_unlink_vmap_area((unsigned long)addr);
2713  	if (!va || !va->vm)
2714  		return NULL;
2715  	vm = va->vm;
2716  
2717  	debug_check_no_locks_freed(vm->addr, get_vm_area_size(vm));
2718  	debug_check_no_obj_freed(vm->addr, get_vm_area_size(vm));
2719  	kasan_free_module_shadow(vm);
2720  	kasan_poison_vmalloc(vm->addr, get_vm_area_size(vm));
2721  
2722  	free_unmap_vmap_area(va);
2723  	return vm;
2724  }
2725  
2726  static inline void set_area_direct_map(const struct vm_struct *area,
2727  				       int (*set_direct_map)(struct page *page))
2728  {
2729  	int i;
2730  
2731  	/* HUGE_VMALLOC passes small pages to set_direct_map */
2732  	for (i = 0; i < area->nr_pages; i++)
2733  		if (page_address(area->pages[i]))
2734  			set_direct_map(area->pages[i]);
2735  }
2736  
2737  /*
2738   * Flush the vm mapping and reset the direct map.
2739   */
2740  static void vm_reset_perms(struct vm_struct *area)
2741  {
2742  	unsigned long start = ULONG_MAX, end = 0;
2743  	unsigned int page_order = vm_area_page_order(area);
2744  	int flush_dmap = 0;
2745  	int i;
2746  
2747  	/*
2748  	 * Find the start and end range of the direct mappings to make sure that
2749  	 * the vm_unmap_aliases() flush includes the direct map.
2750  	 */
2751  	for (i = 0; i < area->nr_pages; i += 1U << page_order) {
2752  		unsigned long addr = (unsigned long)page_address(area->pages[i]);
2753  
2754  		if (addr) {
2755  			unsigned long page_size;
2756  
2757  			page_size = PAGE_SIZE << page_order;
2758  			start = min(addr, start);
2759  			end = max(addr + page_size, end);
2760  			flush_dmap = 1;
2761  		}
2762  	}
2763  
2764  	/*
2765  	 * Set direct map to something invalid so that it won't be cached if
2766  	 * there are any accesses after the TLB flush, then flush the TLB and
2767  	 * reset the direct map permissions to the default.
2768  	 */
2769  	set_area_direct_map(area, set_direct_map_invalid_noflush);
2770  	_vm_unmap_aliases(start, end, flush_dmap);
2771  	set_area_direct_map(area, set_direct_map_default_noflush);
2772  }
2773  
2774  static void delayed_vfree_work(struct work_struct *w)
2775  {
2776  	struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq);
2777  	struct llist_node *t, *llnode;
2778  
2779  	llist_for_each_safe(llnode, t, llist_del_all(&p->list))
2780  		vfree(llnode);
2781  }
2782  
2783  /**
2784   * vfree_atomic - release memory allocated by vmalloc()
2785   * @addr:	  memory base address
2786   *
2787   * This one is just like vfree() but can be called in any atomic context
2788   * except NMIs.
2789   */
2790  void vfree_atomic(const void *addr)
2791  {
2792  	struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred);
2793  
2794  	BUG_ON(in_nmi());
2795  	kmemleak_free(addr);
2796  
2797  	/*
2798  	 * Use raw_cpu_ptr() because this can be called from preemptible
2799  	 * context. Preemption is absolutely fine here, because the llist_add()
2800  	 * implementation is lockless, so it works even if we are adding to
2801  	 * another cpu's list. schedule_work() should be fine with this too.
2802  	 */
2803  	if (addr && llist_add((struct llist_node *)addr, &p->list))
2804  		schedule_work(&p->wq);
2805  }
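
/*
 * Editorial sketch (not part of the original source): vfree_atomic() is the
 * right call when the last reference is dropped in atomic context, e.g.
 * under a spinlock, where vfree() is not allowed because it may sleep.
 * The 'ctx' structure, its lock and 'staging_buf' field are hypothetical.
 *
 *	spin_lock(&ctx->lock);
 *	old = ctx->staging_buf;
 *	ctx->staging_buf = NULL;
 *	vfree_atomic(old);		// vfree() might sleep, not allowed here
 *	spin_unlock(&ctx->lock);
 */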
2806  
2807  /**
2808   * vfree - Release memory allocated by vmalloc()
2809   * @addr:  Memory base address
2810   *
2811   * Free the virtually continuous memory area starting at @addr, as obtained
2812   * from one of the vmalloc() family of APIs.  This will usually also free the
2813   * physical memory underlying the virtual allocation, but that memory is
2814   * reference counted, so it will not be freed until the last user goes away.
2815   *
2816   * If @addr is NULL, no operation is performed.
2817   *
2818   * Context:
2819   * May sleep if called *not* from interrupt context.
2820   * Must not be called in NMI context (strictly speaking, it could be
2821   * if we have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling
2822   * conventions for vfree() arch-dependent would be a really bad idea).
2823   */
2824  void vfree(const void *addr)
2825  {
2826  	struct vm_struct *vm;
2827  	int i;
2828  
2829  	if (unlikely(in_interrupt())) {
2830  		vfree_atomic(addr);
2831  		return;
2832  	}
2833  
2834  	BUG_ON(in_nmi());
2835  	kmemleak_free(addr);
2836  	might_sleep();
2837  
2838  	if (!addr)
2839  		return;
2840  
2841  	vm = remove_vm_area(addr);
2842  	if (unlikely(!vm)) {
2843  		WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
2844  				addr);
2845  		return;
2846  	}
2847  
2848  	if (unlikely(vm->flags & VM_FLUSH_RESET_PERMS))
2849  		vm_reset_perms(vm);
2850  	for (i = 0; i < vm->nr_pages; i++) {
2851  		struct page *page = vm->pages[i];
2852  
2853  		BUG_ON(!page);
2854  		if (!(vm->flags & VM_MAP_PUT_PAGES))
2855  			mod_memcg_page_state(page, MEMCG_VMALLOC, -1);
2856  		/*
2857  		 * High-order allocs for huge vmallocs are split, so they
2858  		 * can be freed as an array of order-0 allocations.
2859  		 */
2860  		__free_page(page);
2861  		cond_resched();
2862  	}
2863  	if (!(vm->flags & VM_MAP_PUT_PAGES))
2864  		atomic_long_sub(vm->nr_pages, &nr_vmalloc_pages);
2865  	kvfree(vm->pages);
2866  	kfree(vm);
2867  }
2868  EXPORT_SYMBOL(vfree);
2869  
2870  /**
2871   * vunmap - release virtual mapping obtained by vmap()
2872   * @addr:   memory base address
2873   *
2874   * Free the virtually contiguous memory area starting at @addr,
2875   * which was created from the page array passed to vmap().
2876   *
2877   * Must not be called in interrupt context.
2878   */
2879  void vunmap(const void *addr)
2880  {
2881  	struct vm_struct *vm;
2882  
2883  	BUG_ON(in_interrupt());
2884  	might_sleep();
2885  
2886  	if (!addr)
2887  		return;
2888  	vm = remove_vm_area(addr);
2889  	if (unlikely(!vm)) {
2890  		WARN(1, KERN_ERR "Trying to vunmap() nonexistent vm area (%p)\n",
2891  				addr);
2892  		return;
2893  	}
2894  	kfree(vm);
2895  }
2896  EXPORT_SYMBOL(vunmap);
2897  
2898  /**
2899   * vmap - map an array of pages into virtually contiguous space
2900   * @pages: array of page pointers
2901   * @count: number of pages to map
2902   * @flags: vm_area->flags
2903   * @prot: page protection for the mapping
2904   *
2905   * Maps @count pages from @pages into contiguous kernel virtual space.
2906   * If @flags contains %VM_MAP_PUT_PAGES the ownership of the pages array itself
2907   * (which must be kmalloc or vmalloc memory) and one reference per page in it
2908   * are transferred from the caller to vmap(), and will be freed / dropped when
2909   * vfree() is called on the return value.
2910   *
2911   * Return: the address of the area or %NULL on failure
2912   */
2913  void *vmap(struct page **pages, unsigned int count,
2914  	   unsigned long flags, pgprot_t prot)
2915  {
2916  	struct vm_struct *area;
2917  	unsigned long addr;
2918  	unsigned long size;		/* In bytes */
2919  
2920  	might_sleep();
2921  
2922  	if (WARN_ON_ONCE(flags & VM_FLUSH_RESET_PERMS))
2923  		return NULL;
2924  
2925  	/*
2926  	 * Your top guard is someone else's bottom guard. Not having a top
2927  	 * guard compromises someone else's mappings too.
2928  	 */
2929  	if (WARN_ON_ONCE(flags & VM_NO_GUARD))
2930  		flags &= ~VM_NO_GUARD;
2931  
2932  	if (count > totalram_pages())
2933  		return NULL;
2934  
2935  	size = (unsigned long)count << PAGE_SHIFT;
2936  	area = get_vm_area_caller(size, flags, __builtin_return_address(0));
2937  	if (!area)
2938  		return NULL;
2939  
2940  	addr = (unsigned long)area->addr;
2941  	if (vmap_pages_range(addr, addr + size, pgprot_nx(prot),
2942  				pages, PAGE_SHIFT) < 0) {
2943  		vunmap(area->addr);
2944  		return NULL;
2945  	}
2946  
2947  	if (flags & VM_MAP_PUT_PAGES) {
2948  		area->pages = pages;
2949  		area->nr_pages = count;
2950  	}
2951  	return area->addr;
2952  }
2953  EXPORT_SYMBOL(vmap);
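
/*
 * Editorial sketch (not part of the original source): building a virtually
 * contiguous view of individually allocated pages with vmap() and tearing
 * it down again. Error handling for the page allocations is omitted.
 *
 *	struct page *pages[4];
 *	void *va;
 *	int i;
 *
 *	for (i = 0; i < ARRAY_SIZE(pages); i++)
 *		pages[i] = alloc_page(GFP_KERNEL);
 *
 *	va = vmap(pages, ARRAY_SIZE(pages), VM_MAP, PAGE_KERNEL);
 *	... use the mapping ...
 *	vunmap(va);
 *	for (i = 0; i < ARRAY_SIZE(pages); i++)
 *		__free_page(pages[i]);
 */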
2954  
2955  #ifdef CONFIG_VMAP_PFN
2956  struct vmap_pfn_data {
2957  	unsigned long	*pfns;
2958  	pgprot_t	prot;
2959  	unsigned int	idx;
2960  };
2961  
2962  static int vmap_pfn_apply(pte_t *pte, unsigned long addr, void *private)
2963  {
2964  	struct vmap_pfn_data *data = private;
2965  	unsigned long pfn = data->pfns[data->idx];
2966  	pte_t ptent;
2967  
2968  	if (WARN_ON_ONCE(pfn_valid(pfn)))
2969  		return -EINVAL;
2970  
2971  	ptent = pte_mkspecial(pfn_pte(pfn, data->prot));
2972  	set_pte_at(&init_mm, addr, pte, ptent);
2973  
2974  	data->idx++;
2975  	return 0;
2976  }
2977  
2978  /**
2979   * vmap_pfn - map an array of PFNs into virtually contiguous space
2980   * @pfns: array of PFNs
2981   * @count: number of pages to map
2982   * @prot: page protection for the mapping
2983   *
2984   * Maps @count PFNs from @pfns into contiguous kernel virtual space and returns
2985   * the start address of the mapping.
2986   */
2987  void *vmap_pfn(unsigned long *pfns, unsigned int count, pgprot_t prot)
2988  {
2989  	struct vmap_pfn_data data = { .pfns = pfns, .prot = pgprot_nx(prot) };
2990  	struct vm_struct *area;
2991  
2992  	area = get_vm_area_caller(count * PAGE_SIZE, VM_IOREMAP,
2993  			__builtin_return_address(0));
2994  	if (!area)
2995  		return NULL;
2996  	if (apply_to_page_range(&init_mm, (unsigned long)area->addr,
2997  			count * PAGE_SIZE, vmap_pfn_apply, &data)) {
2998  		free_vm_area(area);
2999  		return NULL;
3000  	}
3001  
3002  	flush_cache_vmap((unsigned long)area->addr,
3003  			 (unsigned long)area->addr + count * PAGE_SIZE);
3004  
3005  	return area->addr;
3006  }
3007  EXPORT_SYMBOL_GPL(vmap_pfn);
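
/*
 * Editorial sketch (not part of the original source): mapping device memory
 * that has no struct page backing (pfn_valid() must be false for every PFN)
 * with vmap_pfn(). 'bar_phys' is a hypothetical physical base address.
 *
 *	unsigned long pfns[16];
 *	void *va;
 *	int i;
 *
 *	for (i = 0; i < ARRAY_SIZE(pfns); i++)
 *		pfns[i] = PHYS_PFN(bar_phys) + i;
 *
 *	va = vmap_pfn(pfns, ARRAY_SIZE(pfns), pgprot_noncached(PAGE_KERNEL));
 *	if (!va)
 *		return -ENOMEM;
 *	... use the mapping ...
 *	vunmap(va);
 */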
3008  #endif /* CONFIG_VMAP_PFN */
3009  
3010  static inline unsigned int
3011  vm_area_alloc_pages(gfp_t gfp, int nid,
3012  		unsigned int order, unsigned int nr_pages, struct page **pages)
3013  {
3014  	unsigned int nr_allocated = 0;
3015  	gfp_t alloc_gfp = gfp;
3016  	bool nofail = gfp & __GFP_NOFAIL;
3017  	struct page *page;
3018  	int i;
3019  
3020  	/*
3021  	 * For order-0 pages we make use of the bulk allocator. If
3022  	 * the page array ends up only partly populated (or not at
3023  	 * all) due to failures, fall back to the single page
3024  	 * allocator, which is more permissive.
3025  	 */
3026  	if (!order) {
3027  		/* bulk allocator doesn't support nofail req. officially */
3028  		gfp_t bulk_gfp = gfp & ~__GFP_NOFAIL;
3029  
3030  		while (nr_allocated < nr_pages) {
3031  			unsigned int nr, nr_pages_request;
3032  
3033  			/*
3034  			 * The maximum allowed request is hard-coded to 100
3035  			 * pages per call, in order to prevent long
3036  			 * preemption-off sections in the bulk allocator,
3037  			 * so the range is [1:100].
3038  			 */
3039  			nr_pages_request = min(100U, nr_pages - nr_allocated);
3040  
3041  			/* Memory allocation should honour the mempolicy: we must
3042  			 * not wrongly use the nearest node when nid == NUMA_NO_NODE,
3043  			 * otherwise memory may be allocated on only one node even
3044  			 * though the mempolicy wants memory interleaved across nodes.
3045  			 */
3046  			if (IS_ENABLED(CONFIG_NUMA) && nid == NUMA_NO_NODE)
3047  				nr = alloc_pages_bulk_array_mempolicy(bulk_gfp,
3048  							nr_pages_request,
3049  							pages + nr_allocated);
3050  
3051  			else
3052  				nr = alloc_pages_bulk_array_node(bulk_gfp, nid,
3053  							nr_pages_request,
3054  							pages + nr_allocated);
3055  
3056  			nr_allocated += nr;
3057  			cond_resched();
3058  
3059  			/*
3060  			 * If no pages or only some were obtained,
3061  			 * fall back to the single page allocator.
3062  			 */
3063  			if (nr != nr_pages_request)
3064  				break;
3065  		}
3066  	} else if (gfp & __GFP_NOFAIL) {
3067  		/*
3068  		 * Higher order nofail allocations are really expensive and
3069  		 * potentially dangerous (premature OOM, disruptive reclaim,
3070  		 * compaction etc.).
3071  		 */
3072  		alloc_gfp &= ~__GFP_NOFAIL;
3073  	}
3074  
3075  	/* High-order pages or fallback path if "bulk" fails. */
3076  	while (nr_allocated < nr_pages) {
3077  		if (!nofail && fatal_signal_pending(current))
3078  			break;
3079  
3080  		if (nid == NUMA_NO_NODE)
3081  			page = alloc_pages(alloc_gfp, order);
3082  		else
3083  			page = alloc_pages_node(nid, alloc_gfp, order);
3084  		if (unlikely(!page))
3085  			break;
3086  
3087  		/*
3088  		 * Higher order allocations must be able to be treated as
3089  		 * independent small pages by callers (as they can with
3090  		 * small-page vmallocs). Some drivers do their own refcounting
3091  		 * on vmalloc_to_page() pages, some use page->mapping,
3092  		 * page->lru, etc.
3093  		 */
3094  		if (order)
3095  			split_page(page, order);
3096  
3097  		/*
3098  		 * Careful, we allocate and map page-order pages, but
3099  		 * tracking is done per PAGE_SIZE page so as to keep the
3100  		 * vm_struct APIs independent of the physical/mapped size.
3101  		 */
3102  		for (i = 0; i < (1U << order); i++)
3103  			pages[nr_allocated + i] = page + i;
3104  
3105  		cond_resched();
3106  		nr_allocated += 1U << order;
3107  	}
3108  
3109  	return nr_allocated;
3110  }
3111  
3112  static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
3113  				 pgprot_t prot, unsigned int page_shift,
3114  				 int node)
3115  {
3116  	const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
3117  	bool nofail = gfp_mask & __GFP_NOFAIL;
3118  	unsigned long addr = (unsigned long)area->addr;
3119  	unsigned long size = get_vm_area_size(area);
3120  	unsigned long array_size;
3121  	unsigned int nr_small_pages = size >> PAGE_SHIFT;
3122  	unsigned int page_order;
3123  	unsigned int flags;
3124  	int ret;
3125  
3126  	array_size = (unsigned long)nr_small_pages * sizeof(struct page *);
3127  
3128  	if (!(gfp_mask & (GFP_DMA | GFP_DMA32)))
3129  		gfp_mask |= __GFP_HIGHMEM;
3130  
3131  	/* Please note that the recursion is strictly bounded. */
3132  	if (array_size > PAGE_SIZE) {
3133  		area->pages = __vmalloc_node(array_size, 1, nested_gfp, node,
3134  					area->caller);
3135  	} else {
3136  		area->pages = kmalloc_node(array_size, nested_gfp, node);
3137  	}
3138  
3139  	if (!area->pages) {
3140  		warn_alloc(gfp_mask, NULL,
3141  			"vmalloc error: size %lu, failed to allocate page array size %lu",
3142  			nr_small_pages * PAGE_SIZE, array_size);
3143  		free_vm_area(area);
3144  		return NULL;
3145  	}
3146  
3147  	set_vm_area_page_order(area, page_shift - PAGE_SHIFT);
3148  	page_order = vm_area_page_order(area);
3149  
3150  	area->nr_pages = vm_area_alloc_pages(gfp_mask | __GFP_NOWARN,
3151  		node, page_order, nr_small_pages, area->pages);
3152  
3153  	atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
3154  	if (gfp_mask & __GFP_ACCOUNT) {
3155  		int i;
3156  
3157  		for (i = 0; i < area->nr_pages; i++)
3158  			mod_memcg_page_state(area->pages[i], MEMCG_VMALLOC, 1);
3159  	}
3160  
3161  	/*
3162  	 * If not enough pages were obtained to satisfy the
3163  	 * allocation request, free whatever was allocated via vfree().
3164  	 */
3165  	if (area->nr_pages != nr_small_pages) {
3166  		/*
3167  		 * vm_area_alloc_pages() can fail due to insufficient memory but
3168  		 * also due to:
3169  		 *
3170  		 * - a pending fatal signal
3171  		 * - insufficient huge page-order pages
3172  		 *
3173  		 * Since we always retry allocations at order-0 in the huge page
3174  		 * case a warning for either is spurious.
3175  		 */
3176  		if (!fatal_signal_pending(current) && page_order == 0)
3177  			warn_alloc(gfp_mask, NULL,
3178  				"vmalloc error: size %lu, failed to allocate pages",
3179  				area->nr_pages * PAGE_SIZE);
3180  		goto fail;
3181  	}
3182  
3183  	/*
3184  	 * Page table allocations ignore the external gfp mask; enforce it
3185  	 * via the scope API.
3186  	 */
3187  	if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO)
3188  		flags = memalloc_nofs_save();
3189  	else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0)
3190  		flags = memalloc_noio_save();
3191  
3192  	do {
3193  		ret = vmap_pages_range(addr, addr + size, prot, area->pages,
3194  			page_shift);
3195  		if (nofail && (ret < 0))
3196  			schedule_timeout_uninterruptible(1);
3197  	} while (nofail && (ret < 0));
3198  
3199  	if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO)
3200  		memalloc_nofs_restore(flags);
3201  	else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0)
3202  		memalloc_noio_restore(flags);
3203  
3204  	if (ret < 0) {
3205  		warn_alloc(gfp_mask, NULL,
3206  			"vmalloc error: size %lu, failed to map pages",
3207  			area->nr_pages * PAGE_SIZE);
3208  		goto fail;
3209  	}
3210  
3211  	return area->addr;
3212  
3213  fail:
3214  	vfree(area->addr);
3215  	return NULL;
3216  }
3217  
3218  /**
3219   * __vmalloc_node_range - allocate virtually contiguous memory
3220   * @size:		  allocation size
3221   * @align:		  desired alignment
3222   * @start:		  vm area range start
3223   * @end:		  vm area range end
3224   * @gfp_mask:		  flags for the page level allocator
3225   * @prot:		  protection mask for the allocated pages
3226   * @vm_flags:		  additional vm area flags (e.g. %VM_NO_GUARD)
3227   * @node:		  node to use for allocation or NUMA_NO_NODE
3228   * @caller:		  caller's return address
3229   *
3230   * Allocate enough pages to cover @size from the page level
3231   * allocator with @gfp_mask flags. Please note that the full set of gfp
3232   * flags is not supported. GFP_KERNEL, GFP_NOFS and GFP_NOIO are all
3233   * supported.
3234   * Zone modifiers are not supported. From the reclaim modifiers
3235   * __GFP_DIRECT_RECLAIM is required (aka GFP_NOWAIT is not supported)
3236   * and only __GFP_NOFAIL is supported (i.e. __GFP_NORETRY and
3237   * __GFP_RETRY_MAYFAIL are not supported).
3238   *
3239   * __GFP_NOWARN can be used to suppress failure messages.
3240   *
3241   * Map them into contiguous kernel virtual space, using a pagetable
3242   * protection of @prot.
3243   *
3244   * Return: the address of the area or %NULL on failure
3245   */
3246  void *__vmalloc_node_range(unsigned long size, unsigned long align,
3247  			unsigned long start, unsigned long end, gfp_t gfp_mask,
3248  			pgprot_t prot, unsigned long vm_flags, int node,
3249  			const void *caller)
3250  {
3251  	struct vm_struct *area;
3252  	void *ret;
3253  	kasan_vmalloc_flags_t kasan_flags = KASAN_VMALLOC_NONE;
3254  	unsigned long real_size = size;
3255  	unsigned long real_align = align;
3256  	unsigned int shift = PAGE_SHIFT;
3257  
3258  	if (WARN_ON_ONCE(!size))
3259  		return NULL;
3260  
3261  	if ((size >> PAGE_SHIFT) > totalram_pages()) {
3262  		warn_alloc(gfp_mask, NULL,
3263  			"vmalloc error: size %lu, exceeds total pages",
3264  			real_size);
3265  		return NULL;
3266  	}
3267  
3268  	if (vmap_allow_huge && (vm_flags & VM_ALLOW_HUGE_VMAP)) {
3269  		unsigned long size_per_node;
3270  
3271  		/*
3272  		 * Try huge pages. Only try for PAGE_KERNEL allocations;
3273  		 * others, like modules, don't yet expect huge pages in
3274  		 * their allocations because apply_to_page_range() does
3275  		 * not support them.
3276  		 */
3277  
3278  		size_per_node = size;
3279  		if (node == NUMA_NO_NODE)
3280  			size_per_node /= num_online_nodes();
3281  		if (arch_vmap_pmd_supported(prot) && size_per_node >= PMD_SIZE)
3282  			shift = PMD_SHIFT;
3283  		else
3284  			shift = arch_vmap_pte_supported_shift(size_per_node);
3285  
3286  		align = max(real_align, 1UL << shift);
3287  		size = ALIGN(real_size, 1UL << shift);
3288  	}
3289  
3290  again:
3291  	area = __get_vm_area_node(real_size, align, shift, VM_ALLOC |
3292  				  VM_UNINITIALIZED | vm_flags, start, end, node,
3293  				  gfp_mask, caller);
3294  	if (!area) {
3295  		bool nofail = gfp_mask & __GFP_NOFAIL;
3296  		warn_alloc(gfp_mask, NULL,
3297  			"vmalloc error: size %lu, vm_struct allocation failed%s",
3298  			real_size, (nofail) ? ". Retrying." : "");
3299  		if (nofail) {
3300  			schedule_timeout_uninterruptible(1);
3301  			goto again;
3302  		}
3303  		goto fail;
3304  	}
3305  
3306  	/*
3307  	 * Prepare arguments for __vmalloc_area_node() and
3308  	 * kasan_unpoison_vmalloc().
3309  	 */
3310  	if (pgprot_val(prot) == pgprot_val(PAGE_KERNEL)) {
3311  		if (kasan_hw_tags_enabled()) {
3312  			/*
3313  			 * Modify protection bits to allow tagging.
3314  			 * This must be done before mapping.
3315  			 */
3316  			prot = arch_vmap_pgprot_tagged(prot);
3317  
3318  			/*
3319  			 * Skip page_alloc poisoning and zeroing for physical
3320  			 * pages backing VM_ALLOC mapping. Memory is instead
3321  			 * poisoned and zeroed by kasan_unpoison_vmalloc().
3322  			 */
3323  			gfp_mask |= __GFP_SKIP_KASAN | __GFP_SKIP_ZERO;
3324  		}
3325  
3326  		/* Take note that the mapping is PAGE_KERNEL. */
3327  		kasan_flags |= KASAN_VMALLOC_PROT_NORMAL;
3328  	}
3329  
3330  	/* Allocate physical pages and map them into vmalloc space. */
3331  	ret = __vmalloc_area_node(area, gfp_mask, prot, shift, node);
3332  	if (!ret)
3333  		goto fail;
3334  
3335  	/*
3336  	 * Mark the pages as accessible, now that they are mapped.
3337  	 * The condition for setting KASAN_VMALLOC_INIT should complement the
3338  	 * one in post_alloc_hook() with regards to the __GFP_SKIP_ZERO check
3339  	 * to make sure that memory is initialized under the same conditions.
3340  	 * Tag-based KASAN modes only assign tags to normal non-executable
3341  	 * allocations, see __kasan_unpoison_vmalloc().
3342  	 */
3343  	kasan_flags |= KASAN_VMALLOC_VM_ALLOC;
3344  	if (!want_init_on_free() && want_init_on_alloc(gfp_mask) &&
3345  	    (gfp_mask & __GFP_SKIP_ZERO))
3346  		kasan_flags |= KASAN_VMALLOC_INIT;
3347  	/* KASAN_VMALLOC_PROT_NORMAL already set if required. */
3348  	area->addr = kasan_unpoison_vmalloc(area->addr, real_size, kasan_flags);
3349  
3350  	/*
3351  	 * The vm_struct allocated in this function carries the
3352  	 * VM_UNINITIALIZED flag until it is fully initialized.
3353  	 * At this point it is, so clear the flag here.
3354  	 */
3355  	clear_vm_uninitialized_flag(area);
3356  
3357  	size = PAGE_ALIGN(size);
3358  	if (!(vm_flags & VM_DEFER_KMEMLEAK))
3359  		kmemleak_vmalloc(area, size, gfp_mask);
3360  
3361  	return area->addr;
3362  
3363  fail:
3364  	if (shift > PAGE_SHIFT) {
3365  		shift = PAGE_SHIFT;
3366  		align = real_align;
3367  		size = real_size;
3368  		goto again;
3369  	}
3370  
3371  	return NULL;
3372  }
3373  
3374  /**
3375   * __vmalloc_node - allocate virtually contiguous memory
3376   * @size:	    allocation size
3377   * @align:	    desired alignment
3378   * @gfp_mask:	    flags for the page level allocator
3379   * @node:	    node to use for allocation or NUMA_NO_NODE
3380   * @caller:	    caller's return address
3381   *
3382   * Allocate enough pages to cover @size from the page level allocator with
3383   * @gfp_mask flags.  Map them into contiguous kernel virtual space.
3384   *
3385   * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL
3386   * and __GFP_NOFAIL are not supported
3387   *
3388   * Any use of gfp flags outside of GFP_KERNEL should be discussed
3389   * with the mm people.
3390   *
3391   * Return: pointer to the allocated memory or %NULL on error
3392   */
3393  void *__vmalloc_node(unsigned long size, unsigned long align,
3394  			    gfp_t gfp_mask, int node, const void *caller)
3395  {
3396  	return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
3397  				gfp_mask, PAGE_KERNEL, 0, node, caller);
3398  }
3399  /*
3400   * This is only for performance analysis and stress testing of vmalloc.
3401   * It is required by the vmalloc test module; do not use it for anything
3402   * else.
3403   */
3404  #ifdef CONFIG_TEST_VMALLOC_MODULE
3405  EXPORT_SYMBOL_GPL(__vmalloc_node);
3406  #endif
3407  
3408  void *__vmalloc(unsigned long size, gfp_t gfp_mask)
3409  {
3410  	return __vmalloc_node(size, 1, gfp_mask, NUMA_NO_NODE,
3411  				__builtin_return_address(0));
3412  }
3413  EXPORT_SYMBOL(__vmalloc);
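
/*
 * Editor's illustrative sketch (not part of the original file): __vmalloc()
 * is the entry point for callers that need a non-default but supported gfp
 * mask, e.g. GFP_NOFS in filesystem paths. The helper name is hypothetical.
 */
#if 0
static void *example_vmalloc_nofs(unsigned long size)
{
	/* GFP_NOFS is one of the supported masks, see __vmalloc_node_range(). */
	return __vmalloc(size, GFP_NOFS | __GFP_ZERO);
}
#endif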
3414  
3415  /**
3416   * vmalloc - allocate virtually contiguous memory
3417   * @size:    allocation size
3418   *
3419   * Allocate enough pages to cover @size from the page level
3420   * allocator and map them into contiguous kernel virtual space.
3421   *
3422   * For tight control over page level allocator and protection flags
3423   * use __vmalloc() instead.
3424   *
3425   * Return: pointer to the allocated memory or %NULL on error
3426   */
3427  void *vmalloc(unsigned long size)
3428  {
3429  	return __vmalloc_node(size, 1, GFP_KERNEL, NUMA_NO_NODE,
3430  				__builtin_return_address(0));
3431  }
3432  EXPORT_SYMBOL(vmalloc);
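
/*
 * Editor's illustrative sketch (not part of the original file): typical
 * vmalloc()/vfree() usage for a buffer that only needs to be virtually
 * contiguous. The helper name and size are hypothetical.
 */
#if 0
static int example_vmalloc_usage(void)
{
	unsigned long nbytes = 64 * PAGE_SIZE;
	u32 *buf;

	buf = vmalloc(nbytes);		/* may sleep; not for atomic context */
	if (!buf)
		return -ENOMEM;

	buf[0] = 0xdeadbeef;		/* use it like ordinary memory */

	vfree(buf);			/* release with vfree(), never kfree() */
	return 0;
}
#endif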
3433  
3434  /**
3435   * vmalloc_huge - allocate virtually contiguous memory, allow huge pages
3436   * @size:      allocation size
3437   * @gfp_mask:  flags for the page level allocator
3438   *
3439   * Allocate enough pages to cover @size from the page level
3440   * allocator and map them into contiguous kernel virtual space.
3441   * If @size is greater than or equal to PMD_SIZE, allow using
3442   * huge pages for the memory.
3443   *
3444   * Return: pointer to the allocated memory or %NULL on error
3445   */
3446  void *vmalloc_huge(unsigned long size, gfp_t gfp_mask)
3447  {
3448  	return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
3449  				    gfp_mask, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
3450  				    NUMA_NO_NODE, __builtin_return_address(0));
3451  }
3452  EXPORT_SYMBOL_GPL(vmalloc_huge);
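
/*
 * Editor's illustrative sketch (not part of the original file): a large
 * table is a typical vmalloc_huge() user; if the size is at least PMD_SIZE
 * and the architecture supports it, the mapping may use huge pages. The
 * helper name and size are hypothetical.
 */
#if 0
static void *example_vmalloc_huge_table(void)
{
	return vmalloc_huge(16UL << 20, GFP_KERNEL);	/* 16 MiB, maybe PMD-mapped */
}
#endif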
3453  
3454  /**
3455   * vzalloc - allocate virtually contiguous memory with zero fill
3456   * @size:    allocation size
3457   *
3458   * Allocate enough pages to cover @size from the page level
3459   * allocator and map them into contiguous kernel virtual space.
3460   * The memory allocated is set to zero.
3461   *
3462   * For tight control over page level allocator and protection flags
3463   * use __vmalloc() instead.
3464   *
3465   * Return: pointer to the allocated memory or %NULL on error
3466   */
3467  void *vzalloc(unsigned long size)
3468  {
3469  	return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE,
3470  				__builtin_return_address(0));
3471  }
3472  EXPORT_SYMBOL(vzalloc);
3473  
3474  /**
3475   * vmalloc_user - allocate zeroed virtually contiguous memory for userspace
3476   * @size: allocation size
3477   *
3478   * The resulting memory area is zeroed so it can be mapped to userspace
3479   * without leaking data.
3480   *
3481   * Return: pointer to the allocated memory or %NULL on error
3482   */
3483  void *vmalloc_user(unsigned long size)
3484  {
3485  	return __vmalloc_node_range(size, SHMLBA,  VMALLOC_START, VMALLOC_END,
3486  				    GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL,
3487  				    VM_USERMAP, NUMA_NO_NODE,
3488  				    __builtin_return_address(0));
3489  }
3490  EXPORT_SYMBOL(vmalloc_user);
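
/*
 * Editor's illustrative sketch (not part of the original file): buffers that
 * will later be mapped to userspace should come from vmalloc_user(), which
 * zeroes the memory and sets VM_USERMAP so remap_vmalloc_range() accepts it.
 * The helper name is hypothetical; see the mmap sketch after
 * remap_vmalloc_range() below.
 */
#if 0
static void *example_alloc_user_buffer(unsigned long size)
{
	return vmalloc_user(size);
}
#endif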
3491  
3492  /**
3493   * vmalloc_node - allocate memory on a specific node
3494   * @size:	  allocation size
3495   * @node:	  numa node
3496   *
3497   * Allocate enough pages to cover @size from the page level
3498   * allocator and map them into contiguous kernel virtual space.
3499   *
3500   * For tight control over page level allocator and protection flags
3501   * use __vmalloc() instead.
3502   *
3503   * Return: pointer to the allocated memory or %NULL on error
3504   */
3505  void *vmalloc_node(unsigned long size, int node)
3506  {
3507  	return __vmalloc_node(size, 1, GFP_KERNEL, node,
3508  			__builtin_return_address(0));
3509  }
3510  EXPORT_SYMBOL(vmalloc_node);
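
/*
 * Editor's illustrative sketch (not part of the original file): place a
 * buffer on the node that owns @cpu so accesses stay node-local. The helper
 * name is hypothetical.
 */
#if 0
static void *example_vmalloc_for_cpu(unsigned long size, int cpu)
{
	return vmalloc_node(size, cpu_to_node(cpu));
}
#endif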
3511  
3512  /**
3513   * vzalloc_node - allocate memory on a specific node with zero fill
3514   * @size:	allocation size
3515   * @node:	numa node
3516   *
3517   * Allocate enough pages to cover @size from the page level
3518   * allocator and map them into contiguous kernel virtual space.
3519   * The memory allocated is set to zero.
3520   *
3521   * Return: pointer to the allocated memory or %NULL on error
3522   */
3523  void *vzalloc_node(unsigned long size, int node)
3524  {
3525  	return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, node,
3526  				__builtin_return_address(0));
3527  }
3528  EXPORT_SYMBOL(vzalloc_node);
3529  
3530  #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
3531  #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
3532  #elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA)
3533  #define GFP_VMALLOC32 (GFP_DMA | GFP_KERNEL)
3534  #else
3535  /*
3536   * 64-bit systems should always have either a DMA or a DMA32 zone. For
3537   * others, GFP_DMA32 should do the right thing and use the normal zone.
3538   */
3539  #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
3540  #endif
3541  
3542  /**
3543   * vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
3544   * @size:	allocation size
3545   *
3546   * Allocate enough 32bit PA addressable pages to cover @size from the
3547   * page level allocator and map them into contiguous kernel virtual space.
3548   *
3549   * Return: pointer to the allocated memory or %NULL on error
3550   */
3551  void *vmalloc_32(unsigned long size)
3552  {
3553  	return __vmalloc_node(size, 1, GFP_VMALLOC32, NUMA_NO_NODE,
3554  			__builtin_return_address(0));
3555  }
3556  EXPORT_SYMBOL(vmalloc_32);
3557  
3558  /**
3559   * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory
3560   * @size:	     allocation size
3561   *
3562   * The resulting memory area is 32bit addressable and zeroed so it can be
3563   * mapped to userspace without leaking data.
3564   *
3565   * Return: pointer to the allocated memory or %NULL on error
3566   */
3567  void *vmalloc_32_user(unsigned long size)
3568  {
3569  	return __vmalloc_node_range(size, SHMLBA,  VMALLOC_START, VMALLOC_END,
3570  				    GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL,
3571  				    VM_USERMAP, NUMA_NO_NODE,
3572  				    __builtin_return_address(0));
3573  }
3574  EXPORT_SYMBOL(vmalloc_32_user);
3575  
3576  /*
3577   * Atomically zero bytes in the iterator.
3578   *
3579   * Returns the number of zeroed bytes.
3580   */
3581  static size_t zero_iter(struct iov_iter *iter, size_t count)
3582  {
3583  	size_t remains = count;
3584  
3585  	while (remains > 0) {
3586  		size_t num, copied;
3587  
3588  		num = min_t(size_t, remains, PAGE_SIZE);
3589  		copied = copy_page_to_iter_nofault(ZERO_PAGE(0), 0, num, iter);
3590  		remains -= copied;
3591  
3592  		if (copied < num)
3593  			break;
3594  	}
3595  
3596  	return count - remains;
3597  }
3598  
3599  /*
3600   * Small helper routine: copy contents from addr to the iterator.
3601   * If a page is not present, fill with zeroes.
3602   *
3603   * Returns the number of copied bytes.
3604   */
3605  static size_t aligned_vread_iter(struct iov_iter *iter,
3606  				 const char *addr, size_t count)
3607  {
3608  	size_t remains = count;
3609  	struct page *page;
3610  
3611  	while (remains > 0) {
3612  		unsigned long offset, length;
3613  		size_t copied = 0;
3614  
3615  		offset = offset_in_page(addr);
3616  		length = PAGE_SIZE - offset;
3617  		if (length > remains)
3618  			length = remains;
3619  		page = vmalloc_to_page(addr);
3620  		/*
3621  		 * To safely access this _mapped_ area we would need a lock,
3622  		 * but taking one here would add overhead to vmalloc()/vfree()
3623  		 * calls just for this rarely used _debug_ interface.
3624  		 * Instead, we use a local mapping via
3625  		 * copy_page_to_iter_nofault() and accept a small overhead in
3626  		 * this access function.
3627  		 */
3628  		if (page)
3629  			copied = copy_page_to_iter_nofault(page, offset,
3630  							   length, iter);
3631  		else
3632  			copied = zero_iter(iter, length);
3633  
3634  		addr += copied;
3635  		remains -= copied;
3636  
3637  		if (copied != length)
3638  			break;
3639  	}
3640  
3641  	return count - remains;
3642  }
3643  
3644  /*
3645   * Read from a vm_map_ram region of memory.
3646   *
3647   * Returns the number of copied bytes.
3648   */
3649  static size_t vmap_ram_vread_iter(struct iov_iter *iter, const char *addr,
3650  				  size_t count, unsigned long flags)
3651  {
3652  	char *start;
3653  	struct vmap_block *vb;
3654  	struct xarray *xa;
3655  	unsigned long offset;
3656  	unsigned int rs, re;
3657  	size_t remains, n;
3658  
3659  	/*
3660  	 * If the area was created directly by the vm_map_ram() interface,
3661  	 * without further subdividing it and delegating management to
3662  	 * vmap_block, handle it here.
3663  	 */
3664  	if (!(flags & VMAP_BLOCK))
3665  		return aligned_vread_iter(iter, addr, count);
3666  
3667  	remains = count;
3668  
3669  	/*
3670  	 * The area is split into regions and tracked with vmap_block; read
3671  	 * out each region and zero-fill the holes between regions.
3672  	 */
3673  	xa = addr_to_vb_xa((unsigned long) addr);
3674  	vb = xa_load(xa, addr_to_vb_idx((unsigned long)addr));
3675  	if (!vb)
3676  		goto finished_zero;
3677  
3678  	spin_lock(&vb->lock);
3679  	if (bitmap_empty(vb->used_map, VMAP_BBMAP_BITS)) {
3680  		spin_unlock(&vb->lock);
3681  		goto finished_zero;
3682  	}
3683  
3684  	for_each_set_bitrange(rs, re, vb->used_map, VMAP_BBMAP_BITS) {
3685  		size_t copied;
3686  
3687  		if (remains == 0)
3688  			goto finished;
3689  
3690  		start = vmap_block_vaddr(vb->va->va_start, rs);
3691  
3692  		if (addr < start) {
3693  			size_t to_zero = min_t(size_t, start - addr, remains);
3694  			size_t zeroed = zero_iter(iter, to_zero);
3695  
3696  			addr += zeroed;
3697  			remains -= zeroed;
3698  
3699  			if (remains == 0 || zeroed != to_zero)
3700  				goto finished;
3701  		}
3702  
3703  		/* It could start reading from the middle of a used region. */
3704  		offset = offset_in_page(addr);
3705  		n = ((re - rs + 1) << PAGE_SHIFT) - offset;
3706  		if (n > remains)
3707  			n = remains;
3708  
3709  		copied = aligned_vread_iter(iter, start + offset, n);
3710  
3711  		addr += copied;
3712  		remains -= copied;
3713  
3714  		if (copied != n)
3715  			goto finished;
3716  	}
3717  
3718  	spin_unlock(&vb->lock);
3719  
3720  finished_zero:
3721  	/* zero-fill the remaining dirty or free regions */
3722  	return count - remains + zero_iter(iter, remains);
3723  finished:
3724  	/* We couldn't copy/zero everything */
3725  	spin_unlock(&vb->lock);
3726  	return count - remains;
3727  }
3728  
3729  /**
3730   * vread_iter() - read vmalloc area in a safe way to an iterator.
3731   * @iter:         the iterator to which data should be written.
3732   * @addr:         vm address.
3733   * @count:        number of bytes to be read.
3734   *
3735   * This function checks that addr is a valid vmalloc'ed area, and
3736   * This function checks that addr is a valid vmalloc'ed area and
3737   * copies data from that area to the given iterator. If the given memory
3738   * range of [addr...addr+count) includes some valid address, data is
3739   * copied to the proper area of @iter. If there are memory holes, they
3740   * are zero-filled. IOREMAP areas are treated as memory holes; no copy
3741   * is done for them.
3742   *
3743   * If [addr...addr+count) does not intersect any live vm_struct
3744   * area, returns 0. @iter should describe a kernel buffer.
3745   *
3746   * Note: In usual ops, vread_iter() is never necessary because the caller
3747   * should know the vmalloc() area is valid and can use memcpy().
3748   * This is for routines which have to access the vmalloc area without
3749   * any information, such as /proc/kcore.
3750   *
3751   * Return: number of bytes for which addr and @iter should be advanced
3752   * (same number as @count), or %0 if [addr...addr+count) does not
3753   * intersect any valid vmalloc area.
3753  long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
3754  {
3755  	struct vmap_area *va;
3756  	struct vm_struct *vm;
3757  	char *vaddr;
3758  	size_t n, size, flags, remains;
3759  
3760  	addr = kasan_reset_tag(addr);
3761  
3762  	/* Don't allow overflow */
3763  	if ((unsigned long) addr + count < count)
3764  		count = -(unsigned long) addr;
3765  
3766  	remains = count;
3767  
3768  	spin_lock(&vmap_area_lock);
3769  	va = find_vmap_area_exceed_addr((unsigned long)addr);
3770  	if (!va)
3771  		goto finished_zero;
3772  
3773  	/* no intersects with alive vmap_area */
3774  	if ((unsigned long)addr + remains <= va->va_start)
3775  		goto finished_zero;
3776  
3777  	list_for_each_entry_from(va, &vmap_area_list, list) {
3778  		size_t copied;
3779  
3780  		if (remains == 0)
3781  			goto finished;
3782  
3783  		vm = va->vm;
3784  		flags = va->flags & VMAP_FLAGS_MASK;
3785  		/*
3786  		 * VMAP_BLOCK indicates a sub-type of vm_map_ram area; it must
3787  		 * be set together with VMAP_RAM.
3788  		 */
3789  		WARN_ON(flags == VMAP_BLOCK);
3790  
3791  		if (!vm && !flags)
3792  			continue;
3793  
3794  		if (vm && (vm->flags & VM_UNINITIALIZED))
3795  			continue;
3796  
3797  		/* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
3798  		smp_rmb();
3799  
3800  		vaddr = (char *) va->va_start;
3801  		size = vm ? get_vm_area_size(vm) : va_size(va);
3802  
3803  		if (addr >= vaddr + size)
3804  			continue;
3805  
3806  		if (addr < vaddr) {
3807  			size_t to_zero = min_t(size_t, vaddr - addr, remains);
3808  			size_t zeroed = zero_iter(iter, to_zero);
3809  
3810  			addr += zeroed;
3811  			remains -= zeroed;
3812  
3813  			if (remains == 0 || zeroed != to_zero)
3814  				goto finished;
3815  		}
3816  
3817  		n = vaddr + size - addr;
3818  		if (n > remains)
3819  			n = remains;
3820  
3821  		if (flags & VMAP_RAM)
3822  			copied = vmap_ram_vread_iter(iter, addr, n, flags);
3823  		else if (!(vm->flags & VM_IOREMAP))
3824  			copied = aligned_vread_iter(iter, addr, n);
3825  		else /* IOREMAP area is treated as memory hole */
3826  			copied = zero_iter(iter, n);
3827  
3828  		addr += copied;
3829  		remains -= copied;
3830  
3831  		if (copied != n)
3832  			goto finished;
3833  	}
3834  
3835  finished_zero:
3836  	spin_unlock(&vmap_area_lock);
3837  	/* zero-fill memory holes */
3838  	return count - remains + zero_iter(iter, remains);
3839  finished:
3840  	/* Nothing remains, or we couldn't copy/zero everything. */
3841  	spin_unlock(&vmap_area_lock);
3842  
3843  	return count - remains;
3844  }
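
/*
 * Editor's illustrative sketch (not part of the original file): a reader
 * modelled on /proc/kcore that copies @len bytes starting at a vmalloc
 * address into a kernel buffer through an iov_iter. The helper name is
 * hypothetical.
 */
#if 0
static long example_vread(void *dst, const char *vmalloc_addr, size_t len)
{
	struct kvec kvec = { .iov_base = dst, .iov_len = len };
	struct iov_iter iter;

	iov_iter_kvec(&iter, ITER_DEST, &kvec, 1, len);	/* iter is the destination */
	return vread_iter(&iter, vmalloc_addr, len);	/* holes read back as zeroes */
}
#endif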
3845  
3846  /**
3847   * remap_vmalloc_range_partial - map vmalloc pages to userspace
3848   * @vma:		vma to cover
3849   * @uaddr:		target user address to start at
3850   * @kaddr:		virtual address of vmalloc kernel memory
3851   * @pgoff:		offset from @kaddr to start at
3852   * @size:		size of map area
3853   *
3854   * Returns:	0 for success, -Exxx on failure
3855   *
3856   * This function checks that @kaddr is a valid vmalloc'ed area,
3857   * and that it is big enough to cover the range starting at
3858   * @uaddr in @vma. Will return failure if those criteria aren't
3859   * met.
3860   *
3861   * Similar to remap_pfn_range() (see mm/memory.c)
3862   */
3863  int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr,
3864  				void *kaddr, unsigned long pgoff,
3865  				unsigned long size)
3866  {
3867  	struct vm_struct *area;
3868  	unsigned long off;
3869  	unsigned long end_index;
3870  
3871  	if (check_shl_overflow(pgoff, PAGE_SHIFT, &off))
3872  		return -EINVAL;
3873  
3874  	size = PAGE_ALIGN(size);
3875  
3876  	if (!PAGE_ALIGNED(uaddr) || !PAGE_ALIGNED(kaddr))
3877  		return -EINVAL;
3878  
3879  	area = find_vm_area(kaddr);
3880  	if (!area)
3881  		return -EINVAL;
3882  
3883  	if (!(area->flags & (VM_USERMAP | VM_DMA_COHERENT)))
3884  		return -EINVAL;
3885  
3886  	if (check_add_overflow(size, off, &end_index) ||
3887  	    end_index > get_vm_area_size(area))
3888  		return -EINVAL;
3889  	kaddr += off;
3890  
3891  	do {
3892  		struct page *page = vmalloc_to_page(kaddr);
3893  		int ret;
3894  
3895  		ret = vm_insert_page(vma, uaddr, page);
3896  		if (ret)
3897  			return ret;
3898  
3899  		uaddr += PAGE_SIZE;
3900  		kaddr += PAGE_SIZE;
3901  		size -= PAGE_SIZE;
3902  	} while (size > 0);
3903  
3904  	vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP);
3905  
3906  	return 0;
3907  }
3908  
3909  /**
3910   * remap_vmalloc_range - map vmalloc pages to userspace
3911   * @vma:		vma to cover (map full range of vma)
3912   * @addr:		vmalloc memory
3913   * @pgoff:		number of pages into addr before first page to map
3914   *
3915   * Returns:	0 for success, -Exxx on failure
3916   *
3917   * This function checks that addr is a valid vmalloc'ed area, and
3918   * that it is big enough to cover the vma. Will return failure if
3919   * those criteria aren't met.
3920   *
3921   * Similar to remap_pfn_range() (see mm/memory.c)
3922   */
3923  int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
3924  						unsigned long pgoff)
3925  {
3926  	return remap_vmalloc_range_partial(vma, vma->vm_start,
3927  					   addr, pgoff,
3928  					   vma->vm_end - vma->vm_start);
3929  }
3930  EXPORT_SYMBOL(remap_vmalloc_range);
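
/*
 * Editor's illustrative sketch (not part of the original file): a minimal
 * ->mmap() handler exposing a vmalloc_user() buffer to userspace. The
 * "example_buf" variable is hypothetical and assumed to be a vmalloc_user()
 * allocation at least as large as the requested mapping.
 */
#if 0
static void *example_buf;	/* set up elsewhere via vmalloc_user() */

static int example_mmap(struct file *file, struct vm_area_struct *vma)
{
	return remap_vmalloc_range(vma, example_buf, vma->vm_pgoff);
}
#endif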
3931  
3932  void free_vm_area(struct vm_struct *area)
3933  {
3934  	struct vm_struct *ret;
3935  	ret = remove_vm_area(area->addr);
3936  	BUG_ON(ret != area);
3937  	kfree(area);
3938  }
3939  EXPORT_SYMBOL_GPL(free_vm_area);
3940  
3941  #ifdef CONFIG_SMP
3942  static struct vmap_area *node_to_va(struct rb_node *n)
3943  {
3944  	return rb_entry_safe(n, struct vmap_area, rb_node);
3945  }
3946  
3947  /**
3948   * pvm_find_va_enclose_addr - find the vmap_area @addr belongs to
3949   * @addr: target address
3950   *
3951   * Returns: the vmap_area if one encloses @addr. If there is no such
3952   *   area, the nearest lower (in reverse order) vmap_area is returned,
3953   *   i.e. one with va->va_start < addr && va->va_end < addr, or NULL
3954   *   if there are no areas below @addr.
3955   */
3956  static struct vmap_area *
3957  pvm_find_va_enclose_addr(unsigned long addr)
3958  {
3959  	struct vmap_area *va, *tmp;
3960  	struct rb_node *n;
3961  
3962  	n = free_vmap_area_root.rb_node;
3963  	va = NULL;
3964  
3965  	while (n) {
3966  		tmp = rb_entry(n, struct vmap_area, rb_node);
3967  		if (tmp->va_start <= addr) {
3968  			va = tmp;
3969  			if (tmp->va_end >= addr)
3970  				break;
3971  
3972  			n = n->rb_right;
3973  		} else {
3974  			n = n->rb_left;
3975  		}
3976  	}
3977  
3978  	return va;
3979  }
3980  
3981  /**
3982   * pvm_determine_end_from_reverse - find the highest aligned address
3983   * of free block below VMALLOC_END
3984   * @va:
3985   *   in - the VA we start the search from (reverse order);
3986   *   out - the VA with the highest aligned end address.
3987   * @align: alignment for required highest address
3988   *
3989   * Returns: determined end address within vmap_area
3990   */
3991  static unsigned long
3992  pvm_determine_end_from_reverse(struct vmap_area **va, unsigned long align)
3993  {
3994  	unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
3995  	unsigned long addr;
3996  
3997  	if (likely(*va)) {
3998  		list_for_each_entry_from_reverse((*va),
3999  				&free_vmap_area_list, list) {
4000  			addr = min((*va)->va_end & ~(align - 1), vmalloc_end);
4001  			if ((*va)->va_start < addr)
4002  				return addr;
4003  		}
4004  	}
4005  
4006  	return 0;
4007  }
4008  
4009  /**
4010   * pcpu_get_vm_areas - allocate vmalloc areas for percpu allocator
4011   * @offsets: array containing offset of each area
4012   * @sizes: array containing size of each area
4013   * @nr_vms: the number of areas to allocate
4014   * @align: alignment, all entries in @offsets and @sizes must be aligned to this
4015   *
4016   * Returns: kmalloc'd vm_struct pointer array pointing to allocated
4017   *	    vm_structs on success, %NULL on failure
4018   *
4019   * Percpu allocator wants to use congruent vm areas so that it can
4020   * maintain the offsets among percpu areas.  This function allocates
4021   * congruent vmalloc areas for it with GFP_KERNEL.  These areas tend to
4022   * be scattered pretty far, distance between two areas easily going up
4023   * to gigabytes.  To avoid interacting with regular vmallocs, these
4024   * areas are allocated from top.
4025   *
4026   * Despite its complicated look, this allocator is rather simple. It
4027   * does everything top-down and scans free blocks from the end looking
4028   * for a matching base. While scanning, if any of the areas does not
4029   * fit, the base address is pulled down to fit that area. Scanning is
4030   * repeated until all the areas fit, and then all necessary data
4031   * structures are inserted and the result is returned.
4032   */
4033  struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
4034  				     const size_t *sizes, int nr_vms,
4035  				     size_t align)
4036  {
4037  	const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align);
4038  	const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
4039  	struct vmap_area **vas, *va;
4040  	struct vm_struct **vms;
4041  	int area, area2, last_area, term_area;
4042  	unsigned long base, start, size, end, last_end, orig_start, orig_end;
4043  	bool purged = false;
4044  
4045  	/* verify parameters and allocate data structures */
4046  	BUG_ON(offset_in_page(align) || !is_power_of_2(align));
4047  	for (last_area = 0, area = 0; area < nr_vms; area++) {
4048  		start = offsets[area];
4049  		end = start + sizes[area];
4050  
4051  		/* is everything aligned properly? */
4052  		BUG_ON(!IS_ALIGNED(offsets[area], align));
4053  		BUG_ON(!IS_ALIGNED(sizes[area], align));
4054  
4055  		/* detect the area with the highest address */
4056  		if (start > offsets[last_area])
4057  			last_area = area;
4058  
4059  		for (area2 = area + 1; area2 < nr_vms; area2++) {
4060  			unsigned long start2 = offsets[area2];
4061  			unsigned long end2 = start2 + sizes[area2];
4062  
4063  			BUG_ON(start2 < end && start < end2);
4064  		}
4065  	}
4066  	last_end = offsets[last_area] + sizes[last_area];
4067  
4068  	if (vmalloc_end - vmalloc_start < last_end) {
4069  		WARN_ON(true);
4070  		return NULL;
4071  	}
4072  
4073  	vms = kcalloc(nr_vms, sizeof(vms[0]), GFP_KERNEL);
4074  	vas = kcalloc(nr_vms, sizeof(vas[0]), GFP_KERNEL);
4075  	if (!vas || !vms)
4076  		goto err_free2;
4077  
4078  	for (area = 0; area < nr_vms; area++) {
4079  		vas[area] = kmem_cache_zalloc(vmap_area_cachep, GFP_KERNEL);
4080  		vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL);
4081  		if (!vas[area] || !vms[area])
4082  			goto err_free;
4083  	}
4084  retry:
4085  	spin_lock(&free_vmap_area_lock);
4086  
4087  	/* start scanning - we scan from the top, begin with the last area */
4088  	area = term_area = last_area;
4089  	start = offsets[area];
4090  	end = start + sizes[area];
4091  
4092  	va = pvm_find_va_enclose_addr(vmalloc_end);
4093  	base = pvm_determine_end_from_reverse(&va, align) - end;
4094  
4095  	while (true) {
4096  		/*
4097  		 * base might have underflowed, add last_end before
4098  		 * comparing.
4099  		 */
4100  		if (base + last_end < vmalloc_start + last_end)
4101  			goto overflow;
4102  
4103  		/*
4104  		 * Fitting base has not been found.
4105  		 */
4106  		if (va == NULL)
4107  			goto overflow;
4108  
4109  		/*
4110  		 * If required width exceeds current VA block, move
4111  		 * base downwards and then recheck.
4112  		 */
4113  		if (base + end > va->va_end) {
4114  			base = pvm_determine_end_from_reverse(&va, align) - end;
4115  			term_area = area;
4116  			continue;
4117  		}
4118  
4119  		/*
4120  		 * If this VA does not fit, move base downwards and recheck.
4121  		 */
4122  		if (base + start < va->va_start) {
4123  			va = node_to_va(rb_prev(&va->rb_node));
4124  			base = pvm_determine_end_from_reverse(&va, align) - end;
4125  			term_area = area;
4126  			continue;
4127  		}
4128  
4129  		/*
4130  		 * This area fits, move on to the previous one.  If
4131  		 * the previous one is the terminal one, we're done.
4132  		 */
4133  		area = (area + nr_vms - 1) % nr_vms;
4134  		if (area == term_area)
4135  			break;
4136  
4137  		start = offsets[area];
4138  		end = start + sizes[area];
4139  		va = pvm_find_va_enclose_addr(base + end);
4140  	}
4141  
4142  	/* we've found a fitting base, insert all va's */
4143  	for (area = 0; area < nr_vms; area++) {
4144  		int ret;
4145  
4146  		start = base + offsets[area];
4147  		size = sizes[area];
4148  
4149  		va = pvm_find_va_enclose_addr(start);
4150  		if (WARN_ON_ONCE(va == NULL))
4151  			/* It is a BUG(), but trigger recovery instead. */
4152  			goto recovery;
4153  
4154  		ret = adjust_va_to_fit_type(&free_vmap_area_root,
4155  					    &free_vmap_area_list,
4156  					    va, start, size);
4157  		if (WARN_ON_ONCE(unlikely(ret)))
4158  			/* It is a BUG(), but trigger recovery instead. */
4159  			goto recovery;
4160  
4161  		/* Allocated area. */
4162  		va = vas[area];
4163  		va->va_start = start;
4164  		va->va_end = start + size;
4165  	}
4166  
4167  	spin_unlock(&free_vmap_area_lock);
4168  
4169  	/* populate the kasan shadow space */
4170  	for (area = 0; area < nr_vms; area++) {
4171  		if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area]))
4172  			goto err_free_shadow;
4173  	}
4174  
4175  	/* insert all vm's */
4176  	spin_lock(&vmap_area_lock);
4177  	for (area = 0; area < nr_vms; area++) {
4178  		insert_vmap_area(vas[area], &vmap_area_root, &vmap_area_list);
4179  
4180  		setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC,
4181  				 pcpu_get_vm_areas);
4182  	}
4183  	spin_unlock(&vmap_area_lock);
4184  
4185  	/*
4186  	 * Mark allocated areas as accessible. Do it now as a best-effort
4187  	 * approach, as they can be mapped outside of vmalloc code.
4188  	 * With hardware tag-based KASAN, marking is skipped for
4189  	 * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc().
4190  	 */
4191  	for (area = 0; area < nr_vms; area++)
4192  		vms[area]->addr = kasan_unpoison_vmalloc(vms[area]->addr,
4193  				vms[area]->size, KASAN_VMALLOC_PROT_NORMAL);
4194  
4195  	kfree(vas);
4196  	return vms;
4197  
4198  recovery:
4199  	/*
4200  	 * Remove previously allocated areas. There is no
4201  	 * need to remove these areas from the busy tree,
4202  	 * because they are inserted only at the final step,
4203  	 * and only when pcpu_get_vm_areas() succeeds.
4204  	 */
4205  	while (area--) {
4206  		orig_start = vas[area]->va_start;
4207  		orig_end = vas[area]->va_end;
4208  		va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root,
4209  				&free_vmap_area_list);
4210  		if (va)
4211  			kasan_release_vmalloc(orig_start, orig_end,
4212  				va->va_start, va->va_end);
4213  		vas[area] = NULL;
4214  	}
4215  
4216  overflow:
4217  	spin_unlock(&free_vmap_area_lock);
4218  	if (!purged) {
4219  		reclaim_and_purge_vmap_areas();
4220  		purged = true;
4221  
4222  		/* Before "retry", check if we recover. */
4223  		for (area = 0; area < nr_vms; area++) {
4224  			if (vas[area])
4225  				continue;
4226  
4227  			vas[area] = kmem_cache_zalloc(
4228  				vmap_area_cachep, GFP_KERNEL);
4229  			if (!vas[area])
4230  				goto err_free;
4231  		}
4232  
4233  		goto retry;
4234  	}
4235  
4236  err_free:
4237  	for (area = 0; area < nr_vms; area++) {
4238  		if (vas[area])
4239  			kmem_cache_free(vmap_area_cachep, vas[area]);
4240  
4241  		kfree(vms[area]);
4242  	}
4243  err_free2:
4244  	kfree(vas);
4245  	kfree(vms);
4246  	return NULL;
4247  
4248  err_free_shadow:
4249  	spin_lock(&free_vmap_area_lock);
4250  	/*
4251  	 * We release all the vmalloc shadows, even the ones for regions that
4252  	 * hadn't been successfully added. This relies on kasan_release_vmalloc
4253  	 * being able to tolerate this case.
4254  	 */
4255  	for (area = 0; area < nr_vms; area++) {
4256  		orig_start = vas[area]->va_start;
4257  		orig_end = vas[area]->va_end;
4258  		va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root,
4259  				&free_vmap_area_list);
4260  		if (va)
4261  			kasan_release_vmalloc(orig_start, orig_end,
4262  				va->va_start, va->va_end);
4263  		vas[area] = NULL;
4264  		kfree(vms[area]);
4265  	}
4266  	spin_unlock(&free_vmap_area_lock);
4267  	kfree(vas);
4268  	kfree(vms);
4269  	return NULL;
4270  }
4271  
4272  /**
4273   * pcpu_free_vm_areas - free vmalloc areas for percpu allocator
4274   * @vms: vm_struct pointer array returned by pcpu_get_vm_areas()
4275   * @nr_vms: the number of allocated areas
4276   *
4277   * Free vm_structs and the array allocated by pcpu_get_vm_areas().
4278   */
4279  void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
4280  {
4281  	int i;
4282  
4283  	for (i = 0; i < nr_vms; i++)
4284  		free_vm_area(vms[i]);
4285  	kfree(vms);
4286  }
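
/*
 * Editor's illustrative sketch (not part of the original file): the calling
 * convention of pcpu_get_vm_areas()/pcpu_free_vm_areas(). The real (and
 * only) user is the percpu allocator in mm/percpu.c; the offsets and sizes
 * here are arbitrary but properly aligned and non-overlapping.
 */
#if 0
static void example_pcpu_vm_areas(void)
{
	const unsigned long offsets[] = { 0, 4 * PAGE_SIZE };
	const size_t sizes[] = { 2 * PAGE_SIZE, 2 * PAGE_SIZE };
	struct vm_struct **vms;

	vms = pcpu_get_vm_areas(offsets, sizes, 2, PAGE_SIZE);
	if (!vms)
		return;

	/* vms[0]->addr and vms[1]->addr keep their relative offset of 4 pages. */
	pcpu_free_vm_areas(vms, 2);
}
#endif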
4287  #endif	/* CONFIG_SMP */
4288  
4289  #ifdef CONFIG_PRINTK
4290  bool vmalloc_dump_obj(void *object)
4291  {
4292  	void *objp = (void *)PAGE_ALIGN((unsigned long)object);
4293  	const void *caller;
4294  	struct vm_struct *vm;
4295  	struct vmap_area *va;
4296  	unsigned long addr;
4297  	unsigned int nr_pages;
4298  
4299  	if (!spin_trylock(&vmap_area_lock))
4300  		return false;
4301  	va = __find_vmap_area((unsigned long)objp, &vmap_area_root);
4302  	if (!va) {
4303  		spin_unlock(&vmap_area_lock);
4304  		return false;
4305  	}
4306  
4307  	vm = va->vm;
4308  	if (!vm) {
4309  		spin_unlock(&vmap_area_lock);
4310  		return false;
4311  	}
4312  	addr = (unsigned long)vm->addr;
4313  	caller = vm->caller;
4314  	nr_pages = vm->nr_pages;
4315  	spin_unlock(&vmap_area_lock);
4316  	pr_cont(" %u-page vmalloc region starting at %#lx allocated at %pS\n",
4317  		nr_pages, addr, caller);
4318  	return true;
4319  }
4320  #endif
4321  
4322  #ifdef CONFIG_PROC_FS
4323  static void *s_start(struct seq_file *m, loff_t *pos)
4324  	__acquires(&vmap_purge_lock)
4325  	__acquires(&vmap_area_lock)
4326  {
4327  	mutex_lock(&vmap_purge_lock);
4328  	spin_lock(&vmap_area_lock);
4329  
4330  	return seq_list_start(&vmap_area_list, *pos);
4331  }
4332  
4333  static void *s_next(struct seq_file *m, void *p, loff_t *pos)
4334  {
4335  	return seq_list_next(p, &vmap_area_list, pos);
4336  }
4337  
4338  static void s_stop(struct seq_file *m, void *p)
4339  	__releases(&vmap_area_lock)
4340  	__releases(&vmap_purge_lock)
4341  {
4342  	spin_unlock(&vmap_area_lock);
4343  	mutex_unlock(&vmap_purge_lock);
4344  }
4345  
4346  static void show_numa_info(struct seq_file *m, struct vm_struct *v)
4347  {
4348  	if (IS_ENABLED(CONFIG_NUMA)) {
4349  		unsigned int nr, *counters = m->private;
4350  		unsigned int step = 1U << vm_area_page_order(v);
4351  
4352  		if (!counters)
4353  			return;
4354  
4355  		if (v->flags & VM_UNINITIALIZED)
4356  			return;
4357  		/* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
4358  		smp_rmb();
4359  
4360  		memset(counters, 0, nr_node_ids * sizeof(unsigned int));
4361  
4362  		for (nr = 0; nr < v->nr_pages; nr += step)
4363  			counters[page_to_nid(v->pages[nr])] += step;
4364  		for_each_node_state(nr, N_HIGH_MEMORY)
4365  			if (counters[nr])
4366  				seq_printf(m, " N%u=%u", nr, counters[nr]);
4367  	}
4368  }
4369  
4370  static void show_purge_info(struct seq_file *m)
4371  {
4372  	struct vmap_area *va;
4373  
4374  	spin_lock(&purge_vmap_area_lock);
4375  	list_for_each_entry(va, &purge_vmap_area_list, list) {
4376  		seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n",
4377  			(void *)va->va_start, (void *)va->va_end,
4378  			va->va_end - va->va_start);
4379  	}
4380  	spin_unlock(&purge_vmap_area_lock);
4381  }
4382  
4383  static int s_show(struct seq_file *m, void *p)
4384  {
4385  	struct vmap_area *va;
4386  	struct vm_struct *v;
4387  
4388  	va = list_entry(p, struct vmap_area, list);
4389  
4390  	if (!va->vm) {
4391  		if (va->flags & VMAP_RAM)
4392  			seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n",
4393  				(void *)va->va_start, (void *)va->va_end,
4394  				va->va_end - va->va_start);
4395  
4396  		goto final;
4397  	}
4398  
4399  	v = va->vm;
4400  
4401  	seq_printf(m, "0x%pK-0x%pK %7ld",
4402  		v->addr, v->addr + v->size, v->size);
4403  
4404  	if (v->caller)
4405  		seq_printf(m, " %pS", v->caller);
4406  
4407  	if (v->nr_pages)
4408  		seq_printf(m, " pages=%d", v->nr_pages);
4409  
4410  	if (v->phys_addr)
4411  		seq_printf(m, " phys=%pa", &v->phys_addr);
4412  
4413  	if (v->flags & VM_IOREMAP)
4414  		seq_puts(m, " ioremap");
4415  
4416  	if (v->flags & VM_ALLOC)
4417  		seq_puts(m, " vmalloc");
4418  
4419  	if (v->flags & VM_MAP)
4420  		seq_puts(m, " vmap");
4421  
4422  	if (v->flags & VM_USERMAP)
4423  		seq_puts(m, " user");
4424  
4425  	if (v->flags & VM_DMA_COHERENT)
4426  		seq_puts(m, " dma-coherent");
4427  
4428  	if (is_vmalloc_addr(v->pages))
4429  		seq_puts(m, " vpages");
4430  
4431  	show_numa_info(m, v);
4432  	seq_putc(m, '\n');
4433  
4434  	/*
4435  	 * As a final step, dump "unpurged" areas.
4436  	 */
4437  final:
4438  	if (list_is_last(&va->list, &vmap_area_list))
4439  		show_purge_info(m);
4440  
4441  	return 0;
4442  }
4443  
4444  static const struct seq_operations vmalloc_op = {
4445  	.start = s_start,
4446  	.next = s_next,
4447  	.stop = s_stop,
4448  	.show = s_show,
4449  };
4450  
4451  static int __init proc_vmalloc_init(void)
4452  {
4453  	if (IS_ENABLED(CONFIG_NUMA))
4454  		proc_create_seq_private("vmallocinfo", 0400, NULL,
4455  				&vmalloc_op,
4456  				nr_node_ids * sizeof(unsigned int), NULL);
4457  	else
4458  		proc_create_seq("vmallocinfo", 0400, NULL, &vmalloc_op);
4459  	return 0;
4460  }
4461  module_init(proc_vmalloc_init);
4462  
4463  #endif
4464  
4465  void __init vmalloc_init(void)
4466  {
4467  	struct vmap_area *va;
4468  	struct vm_struct *tmp;
4469  	int i;
4470  
4471  	/*
4472  	 * Create the cache for vmap_area objects.
4473  	 */
4474  	vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC);
4475  
4476  	for_each_possible_cpu(i) {
4477  		struct vmap_block_queue *vbq;
4478  		struct vfree_deferred *p;
4479  
4480  		vbq = &per_cpu(vmap_block_queue, i);
4481  		spin_lock_init(&vbq->lock);
4482  		INIT_LIST_HEAD(&vbq->free);
4483  		p = &per_cpu(vfree_deferred, i);
4484  		init_llist_head(&p->list);
4485  		INIT_WORK(&p->wq, delayed_vfree_work);
4486  		xa_init(&vbq->vmap_blocks);
4487  	}
4488  
4489  	/* Import existing vmlist entries. */
4490  	for (tmp = vmlist; tmp; tmp = tmp->next) {
4491  		va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
4492  		if (WARN_ON_ONCE(!va))
4493  			continue;
4494  
4495  		va->va_start = (unsigned long)tmp->addr;
4496  		va->va_end = va->va_start + tmp->size;
4497  		va->vm = tmp;
4498  		insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
4499  	}
4500  
4501  	/*
4502  	 * Now we can initialize a free vmap space.
4503  	 */
4504  	vmap_init_free_space();
4505  	vmap_initialized = true;
4506  }
4507