xref: /openbmc/linux/mm/hugetlb.c (revision 40867d74c374b235e14d839f3a77f26684feefe5)
1  // SPDX-License-Identifier: GPL-2.0-only
2  /*
3   * Generic hugetlb support.
4   * (C) Nadia Yvette Chambers, April 2004
5   */
6  #include <linux/list.h>
7  #include <linux/init.h>
8  #include <linux/mm.h>
9  #include <linux/seq_file.h>
10  #include <linux/sysctl.h>
11  #include <linux/highmem.h>
12  #include <linux/mmu_notifier.h>
13  #include <linux/nodemask.h>
14  #include <linux/pagemap.h>
15  #include <linux/mempolicy.h>
16  #include <linux/compiler.h>
17  #include <linux/cpuset.h>
18  #include <linux/mutex.h>
19  #include <linux/memblock.h>
20  #include <linux/sysfs.h>
21  #include <linux/slab.h>
22  #include <linux/sched/mm.h>
23  #include <linux/mmdebug.h>
24  #include <linux/sched/signal.h>
25  #include <linux/rmap.h>
26  #include <linux/string_helpers.h>
27  #include <linux/swap.h>
28  #include <linux/swapops.h>
29  #include <linux/jhash.h>
30  #include <linux/numa.h>
31  #include <linux/llist.h>
32  #include <linux/cma.h>
33  #include <linux/migrate.h>
34  
35  #include <asm/page.h>
36  #include <asm/pgalloc.h>
37  #include <asm/tlb.h>
38  
39  #include <linux/io.h>
40  #include <linux/hugetlb.h>
41  #include <linux/hugetlb_cgroup.h>
42  #include <linux/node.h>
43  #include <linux/page_owner.h>
44  #include "internal.h"
45  #include "hugetlb_vmemmap.h"
46  
47  int hugetlb_max_hstate __read_mostly;
48  unsigned int default_hstate_idx;
49  struct hstate hstates[HUGE_MAX_HSTATE];
50  
51  #ifdef CONFIG_CMA
52  static struct cma *hugetlb_cma[MAX_NUMNODES];
53  static unsigned long hugetlb_cma_size_in_node[MAX_NUMNODES] __initdata;
54  static bool hugetlb_cma_page(struct page *page, unsigned int order)
55  {
56  	return cma_pages_valid(hugetlb_cma[page_to_nid(page)], page,
57  				1 << order);
58  }
59  #else
60  static bool hugetlb_cma_page(struct page *page, unsigned int order)
61  {
62  	return false;
63  }
64  #endif
65  static unsigned long hugetlb_cma_size __initdata;
66  
67  /*
68   * Minimum page order among possible hugepage sizes, set to a proper value
69   * at boot time.
70   */
71  static unsigned int minimum_order __read_mostly = UINT_MAX;
72  
73  __initdata LIST_HEAD(huge_boot_pages);
74  
75  /* for command line parsing */
76  static struct hstate * __initdata parsed_hstate;
77  static unsigned long __initdata default_hstate_max_huge_pages;
78  static bool __initdata parsed_valid_hugepagesz = true;
79  static bool __initdata parsed_default_hugepagesz;
80  static unsigned int default_hugepages_in_node[MAX_NUMNODES] __initdata;
81  
82  /*
83   * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
84   * free_huge_pages, and surplus_huge_pages.
85   */
86  DEFINE_SPINLOCK(hugetlb_lock);
87  
88  /*
89   * Serializes faults on the same logical page.  This is used to
90   * prevent spurious OOMs when the hugepage pool is fully utilized.
91   */
92  static int num_fault_mutexes;
93  struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;
94  
95  /* Forward declaration */
96  static int hugetlb_acct_memory(struct hstate *h, long delta);
97  
98  static inline bool subpool_is_free(struct hugepage_subpool *spool)
99  {
100  	if (spool->count)
101  		return false;
102  	if (spool->max_hpages != -1)
103  		return spool->used_hpages == 0;
104  	if (spool->min_hpages != -1)
105  		return spool->rsv_hpages == spool->min_hpages;
106  
107  	return true;
108  }
109  
110  static inline void unlock_or_release_subpool(struct hugepage_subpool *spool,
111  						unsigned long irq_flags)
112  {
113  	spin_unlock_irqrestore(&spool->lock, irq_flags);
114  
115  	/* If no pages are used, and no other handles to the subpool
116  	 * remain, give up any reservations based on minimum size and
117  	 * free the subpool */
118  	if (subpool_is_free(spool)) {
119  		if (spool->min_hpages != -1)
120  			hugetlb_acct_memory(spool->hstate,
121  						-spool->min_hpages);
122  		kfree(spool);
123  	}
124  }
125  
126  struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
127  						long min_hpages)
128  {
129  	struct hugepage_subpool *spool;
130  
131  	spool = kzalloc(sizeof(*spool), GFP_KERNEL);
132  	if (!spool)
133  		return NULL;
134  
135  	spin_lock_init(&spool->lock);
136  	spool->count = 1;
137  	spool->max_hpages = max_hpages;
138  	spool->hstate = h;
139  	spool->min_hpages = min_hpages;
140  
141  	if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages)) {
142  		kfree(spool);
143  		return NULL;
144  	}
145  	spool->rsv_hpages = min_hpages;
146  
147  	return spool;
148  }
149  
150  void hugepage_put_subpool(struct hugepage_subpool *spool)
151  {
152  	unsigned long flags;
153  
154  	spin_lock_irqsave(&spool->lock, flags);
155  	BUG_ON(!spool->count);
156  	spool->count--;
157  	unlock_or_release_subpool(spool, flags);
158  }
159  
160  /*
161   * Subpool accounting for allocating and reserving pages.
162   * Return -ENOMEM if there are not enough resources to satisfy the
163   * request.  Otherwise, return the number of pages by which the
164   * global pools must be adjusted (upward).  The returned value may
165   * only be different than the passed value (delta) in the case where
166   * a subpool minimum size must be maintained.
167   */
168  static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
169  				      long delta)
170  {
171  	long ret = delta;
172  
173  	if (!spool)
174  		return ret;
175  
176  	spin_lock_irq(&spool->lock);
177  
178  	if (spool->max_hpages != -1) {		/* maximum size accounting */
179  		if ((spool->used_hpages + delta) <= spool->max_hpages)
180  			spool->used_hpages += delta;
181  		else {
182  			ret = -ENOMEM;
183  			goto unlock_ret;
184  		}
185  	}
186  
187  	/* minimum size accounting */
188  	if (spool->min_hpages != -1 && spool->rsv_hpages) {
189  		if (delta > spool->rsv_hpages) {
190  			/*
191  			 * Asking for more reserves than those already taken on
192  			 * behalf of subpool.  Return difference.
193  			 */
194  			ret = delta - spool->rsv_hpages;
195  			spool->rsv_hpages = 0;
196  		} else {
197  			ret = 0;	/* reserves already accounted for */
198  			spool->rsv_hpages -= delta;
199  		}
200  	}
201  
202  unlock_ret:
203  	spin_unlock_irq(&spool->lock);
204  	return ret;
205  }
206  
207  /*
208   * Subpool accounting for freeing and unreserving pages.
209   * Return the number of global page reservations that must be dropped.
210   * The return value may only be different than the passed value (delta)
211   * in the case where a subpool minimum size must be maintained.
212   */
213  static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
214  				       long delta)
215  {
216  	long ret = delta;
217  	unsigned long flags;
218  
219  	if (!spool)
220  		return delta;
221  
222  	spin_lock_irqsave(&spool->lock, flags);
223  
224  	if (spool->max_hpages != -1)		/* maximum size accounting */
225  		spool->used_hpages -= delta;
226  
227  	 /* minimum size accounting */
228  	if (spool->min_hpages != -1 && spool->used_hpages < spool->min_hpages) {
229  		if (spool->rsv_hpages + delta <= spool->min_hpages)
230  			ret = 0;
231  		else
232  			ret = spool->rsv_hpages + delta - spool->min_hpages;
233  
234  		spool->rsv_hpages += delta;
235  		if (spool->rsv_hpages > spool->min_hpages)
236  			spool->rsv_hpages = spool->min_hpages;
237  	}
238  
239  	/*
240  	 * If hugetlbfs_put_super couldn't free spool due to an outstanding
241  	 * quota reference, free it now.
242  	 */
243  	unlock_or_release_subpool(spool, flags);
244  
245  	return ret;
246  }
247  
248  static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
249  {
250  	return HUGETLBFS_SB(inode->i_sb)->spool;
251  }
252  
253  static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
254  {
255  	return subpool_inode(file_inode(vma->vm_file));
256  }
257  
258  /* Helper that removes a struct file_region from the resv_map cache and returns
259   * it for use.
260   */
261  static struct file_region *
262  get_file_region_entry_from_cache(struct resv_map *resv, long from, long to)
263  {
264  	struct file_region *nrg = NULL;
265  
266  	VM_BUG_ON(resv->region_cache_count <= 0);
267  
268  	resv->region_cache_count--;
269  	nrg = list_first_entry(&resv->region_cache, struct file_region, link);
270  	list_del(&nrg->link);
271  
272  	nrg->from = from;
273  	nrg->to = to;
274  
275  	return nrg;
276  }
277  
278  static void copy_hugetlb_cgroup_uncharge_info(struct file_region *nrg,
279  					      struct file_region *rg)
280  {
281  #ifdef CONFIG_CGROUP_HUGETLB
282  	nrg->reservation_counter = rg->reservation_counter;
283  	nrg->css = rg->css;
284  	if (rg->css)
285  		css_get(rg->css);
286  #endif
287  }
288  
289  /* Helper that records hugetlb_cgroup uncharge info. */
290  static void record_hugetlb_cgroup_uncharge_info(struct hugetlb_cgroup *h_cg,
291  						struct hstate *h,
292  						struct resv_map *resv,
293  						struct file_region *nrg)
294  {
295  #ifdef CONFIG_CGROUP_HUGETLB
296  	if (h_cg) {
297  		nrg->reservation_counter =
298  			&h_cg->rsvd_hugepage[hstate_index(h)];
299  		nrg->css = &h_cg->css;
300  		/*
301  		 * The caller will hold exactly one h_cg->css reference for the
302  		 * whole contiguous reservation region. But this area might be
303  		 * scattered when there are already some file_regions reside in
304  		 * it. As a result, many file_regions may share only one css
305  		 * reference. In order to ensure that one file_region must hold
306  		 * exactly one h_cg->css reference, we should do css_get for
307  		 * each file_region and leave the reference held by caller
308  		 * untouched.
309  		 */
310  		css_get(&h_cg->css);
311  		if (!resv->pages_per_hpage)
312  			resv->pages_per_hpage = pages_per_huge_page(h);
313  		/* pages_per_hpage should be the same for all entries in
314  		 * a resv_map.
315  		 */
316  		VM_BUG_ON(resv->pages_per_hpage != pages_per_huge_page(h));
317  	} else {
318  		nrg->reservation_counter = NULL;
319  		nrg->css = NULL;
320  	}
321  #endif
322  }
323  
324  static void put_uncharge_info(struct file_region *rg)
325  {
326  #ifdef CONFIG_CGROUP_HUGETLB
327  	if (rg->css)
328  		css_put(rg->css);
329  #endif
330  }
331  
332  static bool has_same_uncharge_info(struct file_region *rg,
333  				   struct file_region *org)
334  {
335  #ifdef CONFIG_CGROUP_HUGETLB
336  	return rg->reservation_counter == org->reservation_counter &&
337  	       rg->css == org->css;
338  
339  #else
340  	return true;
341  #endif
342  }
343  
344  static void coalesce_file_region(struct resv_map *resv, struct file_region *rg)
345  {
346  	struct file_region *nrg = NULL, *prg = NULL;
347  
348  	prg = list_prev_entry(rg, link);
349  	if (&prg->link != &resv->regions && prg->to == rg->from &&
350  	    has_same_uncharge_info(prg, rg)) {
351  		prg->to = rg->to;
352  
353  		list_del(&rg->link);
354  		put_uncharge_info(rg);
355  		kfree(rg);
356  
357  		rg = prg;
358  	}
359  
360  	nrg = list_next_entry(rg, link);
361  	if (&nrg->link != &resv->regions && nrg->from == rg->to &&
362  	    has_same_uncharge_info(nrg, rg)) {
363  		nrg->from = rg->from;
364  
365  		list_del(&rg->link);
366  		put_uncharge_info(rg);
367  		kfree(rg);
368  	}
369  }
370  
371  static inline long
372  hugetlb_resv_map_add(struct resv_map *map, struct file_region *rg, long from,
373  		     long to, struct hstate *h, struct hugetlb_cgroup *cg,
374  		     long *regions_needed)
375  {
376  	struct file_region *nrg;
377  
378  	if (!regions_needed) {
379  		nrg = get_file_region_entry_from_cache(map, from, to);
380  		record_hugetlb_cgroup_uncharge_info(cg, h, map, nrg);
381  		list_add(&nrg->link, rg->link.prev);
382  		coalesce_file_region(map, nrg);
383  	} else
384  		*regions_needed += 1;
385  
386  	return to - from;
387  }
388  
389  /*
390   * Must be called with resv->lock held.
391   *
392   * Calling this with regions_needed != NULL will count the number of pages
393   * to be added but will not modify the linked list. And regions_needed will
394   * indicate the number of file_regions needed in the cache to carry out to add
395   * the regions for this range.
396   */
397  static long add_reservation_in_range(struct resv_map *resv, long f, long t,
398  				     struct hugetlb_cgroup *h_cg,
399  				     struct hstate *h, long *regions_needed)
400  {
401  	long add = 0;
402  	struct list_head *head = &resv->regions;
403  	long last_accounted_offset = f;
404  	struct file_region *rg = NULL, *trg = NULL;
405  
406  	if (regions_needed)
407  		*regions_needed = 0;
408  
409  	/* In this loop, we essentially handle an entry for the range
410  	 * [last_accounted_offset, rg->from), at every iteration, with some
411  	 * bounds checking.
412  	 */
413  	list_for_each_entry_safe(rg, trg, head, link) {
414  		/* Skip irrelevant regions that start before our range. */
415  		if (rg->from < f) {
416  			/* If this region ends after the last accounted offset,
417  			 * then we need to update last_accounted_offset.
418  			 */
419  			if (rg->to > last_accounted_offset)
420  				last_accounted_offset = rg->to;
421  			continue;
422  		}
423  
424  		/* When we find a region that starts beyond our range, we've
425  		 * finished.
426  		 */
427  		if (rg->from >= t)
428  			break;
429  
430  		/* Add an entry for last_accounted_offset -> rg->from, and
431  		 * update last_accounted_offset.
432  		 */
433  		if (rg->from > last_accounted_offset)
434  			add += hugetlb_resv_map_add(resv, rg,
435  						    last_accounted_offset,
436  						    rg->from, h, h_cg,
437  						    regions_needed);
438  
439  		last_accounted_offset = rg->to;
440  	}
441  
442  	/* Handle the case where our range extends beyond
443  	 * last_accounted_offset.
444  	 */
445  	if (last_accounted_offset < t)
446  		add += hugetlb_resv_map_add(resv, rg, last_accounted_offset,
447  					    t, h, h_cg, regions_needed);
448  
449  	return add;
450  }
451  
452  /* Must be called with resv->lock acquired. Will drop lock to allocate entries.
453   */
454  static int allocate_file_region_entries(struct resv_map *resv,
455  					int regions_needed)
456  	__must_hold(&resv->lock)
457  {
458  	struct list_head allocated_regions;
459  	int to_allocate = 0, i = 0;
460  	struct file_region *trg = NULL, *rg = NULL;
461  
462  	VM_BUG_ON(regions_needed < 0);
463  
464  	INIT_LIST_HEAD(&allocated_regions);
465  
466  	/*
467  	 * Check for sufficient descriptors in the cache to accommodate
468  	 * the number of in progress add operations plus regions_needed.
469  	 *
470  	 * This is a while loop because when we drop the lock, some other call
471  	 * to region_add or region_del may have consumed some region_entries,
472  	 * so we keep looping here until we finally have enough entries for
473  	 * (adds_in_progress + regions_needed).
474  	 */
475  	while (resv->region_cache_count <
476  	       (resv->adds_in_progress + regions_needed)) {
477  		to_allocate = resv->adds_in_progress + regions_needed -
478  			      resv->region_cache_count;
479  
480  		/* At this point, we should have enough entries in the cache
481  		 * for all the existing adds_in_progress. We should only be
482  		 * needing to allocate for regions_needed.
483  		 */
484  		VM_BUG_ON(resv->region_cache_count < resv->adds_in_progress);
485  
486  		spin_unlock(&resv->lock);
487  		for (i = 0; i < to_allocate; i++) {
488  			trg = kmalloc(sizeof(*trg), GFP_KERNEL);
489  			if (!trg)
490  				goto out_of_memory;
491  			list_add(&trg->link, &allocated_regions);
492  		}
493  
494  		spin_lock(&resv->lock);
495  
496  		list_splice(&allocated_regions, &resv->region_cache);
497  		resv->region_cache_count += to_allocate;
498  	}
499  
500  	return 0;
501  
502  out_of_memory:
503  	list_for_each_entry_safe(rg, trg, &allocated_regions, link) {
504  		list_del(&rg->link);
505  		kfree(rg);
506  	}
507  	return -ENOMEM;
508  }
509  
510  /*
511   * Add the huge page range represented by [f, t) to the reserve
512   * map.  Regions will be taken from the cache to fill in this range.
513   * Sufficient regions should exist in the cache due to the previous
514   * call to region_chg with the same range, but in some cases the cache will not
515   * have sufficient entries due to races with other code doing region_add or
516   * region_del.  The extra needed entries will be allocated.
517   *
518   * regions_needed is the out value provided by a previous call to region_chg.
519   *
520   * Return the number of new huge pages added to the map.  This number is greater
521   * than or equal to zero.  If file_region entries needed to be allocated for
522   * this operation and we were not able to allocate, it returns -ENOMEM.
523   * region_add of regions of length 1 never allocate file_regions and cannot
524   * fail; region_chg will always allocate at least 1 entry and a region_add for
525   * 1 page will only require at most 1 entry.
526   */
527  static long region_add(struct resv_map *resv, long f, long t,
528  		       long in_regions_needed, struct hstate *h,
529  		       struct hugetlb_cgroup *h_cg)
530  {
531  	long add = 0, actual_regions_needed = 0;
532  
533  	spin_lock(&resv->lock);
534  retry:
535  
536  	/* Count how many regions are actually needed to execute this add. */
537  	add_reservation_in_range(resv, f, t, NULL, NULL,
538  				 &actual_regions_needed);
539  
540  	/*
541  	 * Check for sufficient descriptors in the cache to accommodate
542  	 * this add operation. Note that actual_regions_needed may be greater
543  	 * than in_regions_needed, as the resv_map may have been modified since
544  	 * the region_chg call. In this case, we need to make sure that we
545  	 * allocate extra entries, such that we have enough for all the
546  	 * existing adds_in_progress, plus the excess needed for this
547  	 * operation.
548  	 */
549  	if (actual_regions_needed > in_regions_needed &&
550  	    resv->region_cache_count <
551  		    resv->adds_in_progress +
552  			    (actual_regions_needed - in_regions_needed)) {
553  		/* region_add operation of range 1 should never need to
554  		 * allocate file_region entries.
555  		 */
556  		VM_BUG_ON(t - f <= 1);
557  
558  		if (allocate_file_region_entries(
559  			    resv, actual_regions_needed - in_regions_needed)) {
560  			return -ENOMEM;
561  		}
562  
563  		goto retry;
564  	}
565  
566  	add = add_reservation_in_range(resv, f, t, h_cg, h, NULL);
567  
568  	resv->adds_in_progress -= in_regions_needed;
569  
570  	spin_unlock(&resv->lock);
571  	return add;
572  }
573  
574  /*
575   * Examine the existing reserve map and determine how many
576   * huge pages in the specified range [f, t) are NOT currently
577   * represented.  This routine is called before a subsequent
578   * call to region_add that will actually modify the reserve
579   * map to add the specified range [f, t).  region_chg does
580   * not change the number of huge pages represented by the
581   * map.  A number of new file_region structures is added to the cache as a
582   * placeholder, for the subsequent region_add call to use. At least 1
583   * file_region structure is added.
584   *
585   * out_regions_needed is the number of regions added to the
586   * resv->adds_in_progress.  This value needs to be provided to a follow up call
587   * to region_add or region_abort for proper accounting.
588   *
589   * Returns the number of huge pages that need to be added to the existing
590   * reservation map for the range [f, t).  This number is greater or equal to
591   * zero.  -ENOMEM is returned if a new file_region structure or cache entry
592   * is needed and can not be allocated.
593   */
594  static long region_chg(struct resv_map *resv, long f, long t,
595  		       long *out_regions_needed)
596  {
597  	long chg = 0;
598  
599  	spin_lock(&resv->lock);
600  
601  	/* Count how many hugepages in this range are NOT represented. */
602  	chg = add_reservation_in_range(resv, f, t, NULL, NULL,
603  				       out_regions_needed);
604  
605  	if (*out_regions_needed == 0)
606  		*out_regions_needed = 1;
607  
608  	if (allocate_file_region_entries(resv, *out_regions_needed))
609  		return -ENOMEM;
610  
611  	resv->adds_in_progress += *out_regions_needed;
612  
613  	spin_unlock(&resv->lock);
614  	return chg;
615  }
616  
617  /*
618   * Abort the in progress add operation.  The adds_in_progress field
619   * of the resv_map keeps track of the operations in progress between
620   * calls to region_chg and region_add.  Operations are sometimes
621   * aborted after the call to region_chg.  In such cases, region_abort
622   * is called to decrement the adds_in_progress counter. regions_needed
623   * is the value returned by the region_chg call, it is used to decrement
624   * the adds_in_progress counter.
625   *
626   * NOTE: The range arguments [f, t) are not needed or used in this
627   * routine.  They are kept to make reading the calling code easier as
628   * arguments will match the associated region_chg call.
629   */
630  static void region_abort(struct resv_map *resv, long f, long t,
631  			 long regions_needed)
632  {
633  	spin_lock(&resv->lock);
634  	VM_BUG_ON(!resv->region_cache_count);
635  	resv->adds_in_progress -= regions_needed;
636  	spin_unlock(&resv->lock);
637  }
638  
639  /*
640   * Delete the specified range [f, t) from the reserve map.  If the
641   * t parameter is LONG_MAX, this indicates that ALL regions after f
642   * should be deleted.  Locate the regions which intersect [f, t)
643   * and either trim, delete or split the existing regions.
644   *
645   * Returns the number of huge pages deleted from the reserve map.
646   * In the normal case, the return value is zero or more.  In the
647   * case where a region must be split, a new region descriptor must
648   * be allocated.  If the allocation fails, -ENOMEM will be returned.
649   * NOTE: If the parameter t == LONG_MAX, then we will never split
650   * a region and possibly return -ENOMEM.  Callers specifying
651   * t == LONG_MAX do not need to check for -ENOMEM error.
652   */
653  static long region_del(struct resv_map *resv, long f, long t)
654  {
655  	struct list_head *head = &resv->regions;
656  	struct file_region *rg, *trg;
657  	struct file_region *nrg = NULL;
658  	long del = 0;
659  
660  retry:
661  	spin_lock(&resv->lock);
662  	list_for_each_entry_safe(rg, trg, head, link) {
663  		/*
664  		 * Skip regions before the range to be deleted.  file_region
665  		 * ranges are normally of the form [from, to).  However, there
666  		 * may be a "placeholder" entry in the map which is of the form
667  		 * (from, to) with from == to.  Check for placeholder entries
668  		 * at the beginning of the range to be deleted.
669  		 */
670  		if (rg->to <= f && (rg->to != rg->from || rg->to != f))
671  			continue;
672  
673  		if (rg->from >= t)
674  			break;
675  
676  		if (f > rg->from && t < rg->to) { /* Must split region */
677  			/*
678  			 * Check for an entry in the cache before dropping
679  			 * lock and attempting allocation.
680  			 */
681  			if (!nrg &&
682  			    resv->region_cache_count > resv->adds_in_progress) {
683  				nrg = list_first_entry(&resv->region_cache,
684  							struct file_region,
685  							link);
686  				list_del(&nrg->link);
687  				resv->region_cache_count--;
688  			}
689  
690  			if (!nrg) {
691  				spin_unlock(&resv->lock);
692  				nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
693  				if (!nrg)
694  					return -ENOMEM;
695  				goto retry;
696  			}
697  
698  			del += t - f;
699  			hugetlb_cgroup_uncharge_file_region(
700  				resv, rg, t - f, false);
701  
702  			/* New entry for end of split region */
703  			nrg->from = t;
704  			nrg->to = rg->to;
705  
706  			copy_hugetlb_cgroup_uncharge_info(nrg, rg);
707  
708  			INIT_LIST_HEAD(&nrg->link);
709  
710  			/* Original entry is trimmed */
711  			rg->to = f;
712  
713  			list_add(&nrg->link, &rg->link);
714  			nrg = NULL;
715  			break;
716  		}
717  
718  		if (f <= rg->from && t >= rg->to) { /* Remove entire region */
719  			del += rg->to - rg->from;
720  			hugetlb_cgroup_uncharge_file_region(resv, rg,
721  							    rg->to - rg->from, true);
722  			list_del(&rg->link);
723  			kfree(rg);
724  			continue;
725  		}
726  
727  		if (f <= rg->from) {	/* Trim beginning of region */
728  			hugetlb_cgroup_uncharge_file_region(resv, rg,
729  							    t - rg->from, false);
730  
731  			del += t - rg->from;
732  			rg->from = t;
733  		} else {		/* Trim end of region */
734  			hugetlb_cgroup_uncharge_file_region(resv, rg,
735  							    rg->to - f, false);
736  
737  			del += rg->to - f;
738  			rg->to = f;
739  		}
740  	}
741  
742  	spin_unlock(&resv->lock);
743  	kfree(nrg);
744  	return del;
745  }
746  
747  /*
748   * A rare out of memory error was encountered which prevented removal of
749   * the reserve map region for a page.  The huge page itself was free'ed
750   * and removed from the page cache.  This routine will adjust the subpool
751   * usage count, and the global reserve count if needed.  By incrementing
752   * these counts, the reserve map entry which could not be deleted will
753   * appear as a "reserved" entry instead of simply dangling with incorrect
754   * counts.
755   */
756  void hugetlb_fix_reserve_counts(struct inode *inode)
757  {
758  	struct hugepage_subpool *spool = subpool_inode(inode);
759  	long rsv_adjust;
760  	bool reserved = false;
761  
762  	rsv_adjust = hugepage_subpool_get_pages(spool, 1);
763  	if (rsv_adjust > 0) {
764  		struct hstate *h = hstate_inode(inode);
765  
766  		if (!hugetlb_acct_memory(h, 1))
767  			reserved = true;
768  	} else if (!rsv_adjust) {
769  		reserved = true;
770  	}
771  
772  	if (!reserved)
773  		pr_warn("hugetlb: Huge Page Reserved count may go negative.\n");
774  }
775  
776  /*
777   * Count and return the number of huge pages in the reserve map
778   * that intersect with the range [f, t).
779   */
780  static long region_count(struct resv_map *resv, long f, long t)
781  {
782  	struct list_head *head = &resv->regions;
783  	struct file_region *rg;
784  	long chg = 0;
785  
786  	spin_lock(&resv->lock);
787  	/* Locate each segment we overlap with, and count that overlap. */
788  	list_for_each_entry(rg, head, link) {
789  		long seg_from;
790  		long seg_to;
791  
792  		if (rg->to <= f)
793  			continue;
794  		if (rg->from >= t)
795  			break;
796  
797  		seg_from = max(rg->from, f);
798  		seg_to = min(rg->to, t);
799  
800  		chg += seg_to - seg_from;
801  	}
802  	spin_unlock(&resv->lock);
803  
804  	return chg;
805  }
806  
807  /*
808   * Convert the address within this vma to the page offset within
809   * the mapping, in pagecache page units; huge pages here.
810   */
811  static pgoff_t vma_hugecache_offset(struct hstate *h,
812  			struct vm_area_struct *vma, unsigned long address)
813  {
814  	return ((address - vma->vm_start) >> huge_page_shift(h)) +
815  			(vma->vm_pgoff >> huge_page_order(h));
816  }
817  
818  pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
819  				     unsigned long address)
820  {
821  	return vma_hugecache_offset(hstate_vma(vma), vma, address);
822  }
823  EXPORT_SYMBOL_GPL(linear_hugepage_index);
824  
825  /*
826   * Return the size of the pages allocated when backing a VMA. In the majority
827   * cases this will be same size as used by the page table entries.
828   */
829  unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
830  {
831  	if (vma->vm_ops && vma->vm_ops->pagesize)
832  		return vma->vm_ops->pagesize(vma);
833  	return PAGE_SIZE;
834  }
835  EXPORT_SYMBOL_GPL(vma_kernel_pagesize);
836  
837  /*
838   * Return the page size being used by the MMU to back a VMA. In the majority
839   * of cases, the page size used by the kernel matches the MMU size. On
840   * architectures where it differs, an architecture-specific 'strong'
841   * version of this symbol is required.
842   */
843  __weak unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
844  {
845  	return vma_kernel_pagesize(vma);
846  }
847  
848  /*
849   * Flags for MAP_PRIVATE reservations.  These are stored in the bottom
850   * bits of the reservation map pointer, which are always clear due to
851   * alignment.
852   */
853  #define HPAGE_RESV_OWNER    (1UL << 0)
854  #define HPAGE_RESV_UNMAPPED (1UL << 1)
855  #define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)
856  
857  /*
858   * These helpers are used to track how many pages are reserved for
859   * faults in a MAP_PRIVATE mapping. Only the process that called mmap()
860   * is guaranteed to have their future faults succeed.
861   *
862   * With the exception of reset_vma_resv_huge_pages() which is called at fork(),
863   * the reserve counters are updated with the hugetlb_lock held. It is safe
864   * to reset the VMA at fork() time as it is not in use yet and there is no
865   * chance of the global counters getting corrupted as a result of the values.
866   *
867   * The private mapping reservation is represented in a subtly different
868   * manner to a shared mapping.  A shared mapping has a region map associated
869   * with the underlying file, this region map represents the backing file
870   * pages which have ever had a reservation assigned which this persists even
871   * after the page is instantiated.  A private mapping has a region map
872   * associated with the original mmap which is attached to all VMAs which
873   * reference it, this region map represents those offsets which have consumed
874   * reservation ie. where pages have been instantiated.
875   */
876  static unsigned long get_vma_private_data(struct vm_area_struct *vma)
877  {
878  	return (unsigned long)vma->vm_private_data;
879  }
880  
881  static void set_vma_private_data(struct vm_area_struct *vma,
882  							unsigned long value)
883  {
884  	vma->vm_private_data = (void *)value;
885  }
886  
887  static void
888  resv_map_set_hugetlb_cgroup_uncharge_info(struct resv_map *resv_map,
889  					  struct hugetlb_cgroup *h_cg,
890  					  struct hstate *h)
891  {
892  #ifdef CONFIG_CGROUP_HUGETLB
893  	if (!h_cg || !h) {
894  		resv_map->reservation_counter = NULL;
895  		resv_map->pages_per_hpage = 0;
896  		resv_map->css = NULL;
897  	} else {
898  		resv_map->reservation_counter =
899  			&h_cg->rsvd_hugepage[hstate_index(h)];
900  		resv_map->pages_per_hpage = pages_per_huge_page(h);
901  		resv_map->css = &h_cg->css;
902  	}
903  #endif
904  }
905  
906  struct resv_map *resv_map_alloc(void)
907  {
908  	struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
909  	struct file_region *rg = kmalloc(sizeof(*rg), GFP_KERNEL);
910  
911  	if (!resv_map || !rg) {
912  		kfree(resv_map);
913  		kfree(rg);
914  		return NULL;
915  	}
916  
917  	kref_init(&resv_map->refs);
918  	spin_lock_init(&resv_map->lock);
919  	INIT_LIST_HEAD(&resv_map->regions);
920  
921  	resv_map->adds_in_progress = 0;
922  	/*
923  	 * Initialize these to 0. On shared mappings, 0's here indicate these
924  	 * fields don't do cgroup accounting. On private mappings, these will be
925  	 * re-initialized to the proper values, to indicate that hugetlb cgroup
926  	 * reservations are to be un-charged from here.
927  	 */
928  	resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, NULL, NULL);
929  
930  	INIT_LIST_HEAD(&resv_map->region_cache);
931  	list_add(&rg->link, &resv_map->region_cache);
932  	resv_map->region_cache_count = 1;
933  
934  	return resv_map;
935  }
936  
937  void resv_map_release(struct kref *ref)
938  {
939  	struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
940  	struct list_head *head = &resv_map->region_cache;
941  	struct file_region *rg, *trg;
942  
943  	/* Clear out any active regions before we release the map. */
944  	region_del(resv_map, 0, LONG_MAX);
945  
946  	/* ... and any entries left in the cache */
947  	list_for_each_entry_safe(rg, trg, head, link) {
948  		list_del(&rg->link);
949  		kfree(rg);
950  	}
951  
952  	VM_BUG_ON(resv_map->adds_in_progress);
953  
954  	kfree(resv_map);
955  }
956  
957  static inline struct resv_map *inode_resv_map(struct inode *inode)
958  {
959  	/*
960  	 * At inode evict time, i_mapping may not point to the original
961  	 * address space within the inode.  This original address space
962  	 * contains the pointer to the resv_map.  So, always use the
963  	 * address space embedded within the inode.
964  	 * The VERY common case is inode->mapping == &inode->i_data but,
965  	 * this may not be true for device special inodes.
966  	 */
967  	return (struct resv_map *)(&inode->i_data)->private_data;
968  }
969  
970  static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
971  {
972  	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
973  	if (vma->vm_flags & VM_MAYSHARE) {
974  		struct address_space *mapping = vma->vm_file->f_mapping;
975  		struct inode *inode = mapping->host;
976  
977  		return inode_resv_map(inode);
978  
979  	} else {
980  		return (struct resv_map *)(get_vma_private_data(vma) &
981  							~HPAGE_RESV_MASK);
982  	}
983  }
984  
985  static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
986  {
987  	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
988  	VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
989  
990  	set_vma_private_data(vma, (get_vma_private_data(vma) &
991  				HPAGE_RESV_MASK) | (unsigned long)map);
992  }
993  
994  static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
995  {
996  	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
997  	VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
998  
999  	set_vma_private_data(vma, get_vma_private_data(vma) | flags);
1000  }
1001  
1002  static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
1003  {
1004  	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
1005  
1006  	return (get_vma_private_data(vma) & flag) != 0;
1007  }
1008  
1009  /* Reset counters to 0 and clear all HPAGE_RESV_* flags */
1010  void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
1011  {
1012  	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
1013  	if (!(vma->vm_flags & VM_MAYSHARE))
1014  		vma->vm_private_data = (void *)0;
1015  }
1016  
1017  /*
1018   * Reset and decrement one ref on hugepage private reservation.
1019   * Called with mm->mmap_sem writer semaphore held.
1020   * This function should be only used by move_vma() and operate on
1021   * same sized vma. It should never come here with last ref on the
1022   * reservation.
1023   */
1024  void clear_vma_resv_huge_pages(struct vm_area_struct *vma)
1025  {
1026  	/*
1027  	 * Clear the old hugetlb private page reservation.
1028  	 * It has already been transferred to new_vma.
1029  	 *
1030  	 * During a mremap() operation of a hugetlb vma we call move_vma()
1031  	 * which copies vma into new_vma and unmaps vma. After the copy
1032  	 * operation both new_vma and vma share a reference to the resv_map
1033  	 * struct, and at that point vma is about to be unmapped. We don't
1034  	 * want to return the reservation to the pool at unmap of vma because
1035  	 * the reservation still lives on in new_vma, so simply decrement the
1036  	 * ref here and remove the resv_map reference from this vma.
1037  	 */
1038  	struct resv_map *reservations = vma_resv_map(vma);
1039  
1040  	if (reservations && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
1041  		resv_map_put_hugetlb_cgroup_uncharge_info(reservations);
1042  		kref_put(&reservations->refs, resv_map_release);
1043  	}
1044  
1045  	reset_vma_resv_huge_pages(vma);
1046  }
1047  
1048  /* Returns true if the VMA has associated reserve pages */
1049  static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
1050  {
1051  	if (vma->vm_flags & VM_NORESERVE) {
1052  		/*
1053  		 * This address is already reserved by other process(chg == 0),
1054  		 * so, we should decrement reserved count. Without decrementing,
1055  		 * reserve count remains after releasing inode, because this
1056  		 * allocated page will go into page cache and is regarded as
1057  		 * coming from reserved pool in releasing step.  Currently, we
1058  		 * don't have any other solution to deal with this situation
1059  		 * properly, so add work-around here.
1060  		 */
1061  		if (vma->vm_flags & VM_MAYSHARE && chg == 0)
1062  			return true;
1063  		else
1064  			return false;
1065  	}
1066  
1067  	/* Shared mappings always use reserves */
1068  	if (vma->vm_flags & VM_MAYSHARE) {
1069  		/*
1070  		 * We know VM_NORESERVE is not set.  Therefore, there SHOULD
1071  		 * be a region map for all pages.  The only situation where
1072  		 * there is no region map is if a hole was punched via
1073  		 * fallocate.  In this case, there really are no reserves to
1074  		 * use.  This situation is indicated if chg != 0.
1075  		 */
1076  		if (chg)
1077  			return false;
1078  		else
1079  			return true;
1080  	}
1081  
1082  	/*
1083  	 * Only the process that called mmap() has reserves for
1084  	 * private mappings.
1085  	 */
1086  	if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
1087  		/*
1088  		 * Like the shared case above, a hole punch or truncate
1089  		 * could have been performed on the private mapping.
1090  		 * Examine the value of chg to determine if reserves
1091  		 * actually exist or were previously consumed.
1092  		 * Very Subtle - The value of chg comes from a previous
1093  		 * call to vma_needs_reserves().  The reserve map for
1094  		 * private mappings has different (opposite) semantics
1095  		 * than that of shared mappings.  vma_needs_reserves()
1096  		 * has already taken this difference in semantics into
1097  		 * account.  Therefore, the meaning of chg is the same
1098  		 * as in the shared case above.  Code could easily be
1099  		 * combined, but keeping it separate draws attention to
1100  		 * subtle differences.
1101  		 */
1102  		if (chg)
1103  			return false;
1104  		else
1105  			return true;
1106  	}
1107  
1108  	return false;
1109  }
1110  
1111  static void enqueue_huge_page(struct hstate *h, struct page *page)
1112  {
1113  	int nid = page_to_nid(page);
1114  
1115  	lockdep_assert_held(&hugetlb_lock);
1116  	VM_BUG_ON_PAGE(page_count(page), page);
1117  
1118  	list_move(&page->lru, &h->hugepage_freelists[nid]);
1119  	h->free_huge_pages++;
1120  	h->free_huge_pages_node[nid]++;
1121  	SetHPageFreed(page);
1122  }
1123  
1124  static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
1125  {
1126  	struct page *page;
1127  	bool pin = !!(current->flags & PF_MEMALLOC_PIN);
1128  
1129  	lockdep_assert_held(&hugetlb_lock);
1130  	list_for_each_entry(page, &h->hugepage_freelists[nid], lru) {
1131  		if (pin && !is_pinnable_page(page))
1132  			continue;
1133  
1134  		if (PageHWPoison(page))
1135  			continue;
1136  
1137  		list_move(&page->lru, &h->hugepage_activelist);
1138  		set_page_refcounted(page);
1139  		ClearHPageFreed(page);
1140  		h->free_huge_pages--;
1141  		h->free_huge_pages_node[nid]--;
1142  		return page;
1143  	}
1144  
1145  	return NULL;
1146  }
1147  
1148  static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask, int nid,
1149  		nodemask_t *nmask)
1150  {
1151  	unsigned int cpuset_mems_cookie;
1152  	struct zonelist *zonelist;
1153  	struct zone *zone;
1154  	struct zoneref *z;
1155  	int node = NUMA_NO_NODE;
1156  
1157  	zonelist = node_zonelist(nid, gfp_mask);
1158  
1159  retry_cpuset:
1160  	cpuset_mems_cookie = read_mems_allowed_begin();
1161  	for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nmask) {
1162  		struct page *page;
1163  
1164  		if (!cpuset_zone_allowed(zone, gfp_mask))
1165  			continue;
1166  		/*
1167  		 * no need to ask again on the same node. Pool is node rather than
1168  		 * zone aware
1169  		 */
1170  		if (zone_to_nid(zone) == node)
1171  			continue;
1172  		node = zone_to_nid(zone);
1173  
1174  		page = dequeue_huge_page_node_exact(h, node);
1175  		if (page)
1176  			return page;
1177  	}
1178  	if (unlikely(read_mems_allowed_retry(cpuset_mems_cookie)))
1179  		goto retry_cpuset;
1180  
1181  	return NULL;
1182  }
1183  
1184  static struct page *dequeue_huge_page_vma(struct hstate *h,
1185  				struct vm_area_struct *vma,
1186  				unsigned long address, int avoid_reserve,
1187  				long chg)
1188  {
1189  	struct page *page = NULL;
1190  	struct mempolicy *mpol;
1191  	gfp_t gfp_mask;
1192  	nodemask_t *nodemask;
1193  	int nid;
1194  
1195  	/*
1196  	 * A child process with MAP_PRIVATE mappings created by their parent
1197  	 * have no page reserves. This check ensures that reservations are
1198  	 * not "stolen". The child may still get SIGKILLed
1199  	 */
1200  	if (!vma_has_reserves(vma, chg) &&
1201  			h->free_huge_pages - h->resv_huge_pages == 0)
1202  		goto err;
1203  
1204  	/* If reserves cannot be used, ensure enough pages are in the pool */
1205  	if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
1206  		goto err;
1207  
1208  	gfp_mask = htlb_alloc_mask(h);
1209  	nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
1210  
1211  	if (mpol_is_preferred_many(mpol)) {
1212  		page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask);
1213  
1214  		/* Fallback to all nodes if page==NULL */
1215  		nodemask = NULL;
1216  	}
1217  
1218  	if (!page)
1219  		page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask);
1220  
1221  	if (page && !avoid_reserve && vma_has_reserves(vma, chg)) {
1222  		SetHPageRestoreReserve(page);
1223  		h->resv_huge_pages--;
1224  	}
1225  
1226  	mpol_cond_put(mpol);
1227  	return page;
1228  
1229  err:
1230  	return NULL;
1231  }
1232  
1233  /*
1234   * common helper functions for hstate_next_node_to_{alloc|free}.
1235   * We may have allocated or freed a huge page based on a different
1236   * nodes_allowed previously, so h->next_node_to_{alloc|free} might
1237   * be outside of *nodes_allowed.  Ensure that we use an allowed
1238   * node for alloc or free.
1239   */
1240  static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
1241  {
1242  	nid = next_node_in(nid, *nodes_allowed);
1243  	VM_BUG_ON(nid >= MAX_NUMNODES);
1244  
1245  	return nid;
1246  }
1247  
1248  static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
1249  {
1250  	if (!node_isset(nid, *nodes_allowed))
1251  		nid = next_node_allowed(nid, nodes_allowed);
1252  	return nid;
1253  }
1254  
1255  /*
1256   * returns the previously saved node ["this node"] from which to
1257   * allocate a persistent huge page for the pool and advance the
1258   * next node from which to allocate, handling wrap at end of node
1259   * mask.
1260   */
1261  static int hstate_next_node_to_alloc(struct hstate *h,
1262  					nodemask_t *nodes_allowed)
1263  {
1264  	int nid;
1265  
1266  	VM_BUG_ON(!nodes_allowed);
1267  
1268  	nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed);
1269  	h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed);
1270  
1271  	return nid;
1272  }
1273  
1274  /*
1275   * helper for remove_pool_huge_page() - return the previously saved
1276   * node ["this node"] from which to free a huge page.  Advance the
1277   * next node id whether or not we find a free huge page to free so
1278   * that the next attempt to free addresses the next node.
1279   */
1280  static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
1281  {
1282  	int nid;
1283  
1284  	VM_BUG_ON(!nodes_allowed);
1285  
1286  	nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
1287  	h->next_nid_to_free = next_node_allowed(nid, nodes_allowed);
1288  
1289  	return nid;
1290  }
1291  
1292  #define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask)		\
1293  	for (nr_nodes = nodes_weight(*mask);				\
1294  		nr_nodes > 0 &&						\
1295  		((node = hstate_next_node_to_alloc(hs, mask)) || 1);	\
1296  		nr_nodes--)
1297  
1298  #define for_each_node_mask_to_free(hs, nr_nodes, node, mask)		\
1299  	for (nr_nodes = nodes_weight(*mask);				\
1300  		nr_nodes > 0 &&						\
1301  		((node = hstate_next_node_to_free(hs, mask)) || 1);	\
1302  		nr_nodes--)
1303  
1304  /* used to demote non-gigantic_huge pages as well */
1305  static void __destroy_compound_gigantic_page(struct page *page,
1306  					unsigned int order, bool demote)
1307  {
1308  	int i;
1309  	int nr_pages = 1 << order;
1310  	struct page *p = page + 1;
1311  
1312  	atomic_set(compound_mapcount_ptr(page), 0);
1313  	atomic_set(compound_pincount_ptr(page), 0);
1314  
1315  	for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
1316  		p->mapping = NULL;
1317  		clear_compound_head(p);
1318  		if (!demote)
1319  			set_page_refcounted(p);
1320  	}
1321  
1322  	set_compound_order(page, 0);
1323  	page[1].compound_nr = 0;
1324  	__ClearPageHead(page);
1325  }
1326  
1327  static void destroy_compound_hugetlb_page_for_demote(struct page *page,
1328  					unsigned int order)
1329  {
1330  	__destroy_compound_gigantic_page(page, order, true);
1331  }
1332  
1333  #ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
1334  static void destroy_compound_gigantic_page(struct page *page,
1335  					unsigned int order)
1336  {
1337  	__destroy_compound_gigantic_page(page, order, false);
1338  }
1339  
1340  static void free_gigantic_page(struct page *page, unsigned int order)
1341  {
1342  	/*
1343  	 * If the page isn't allocated using the cma allocator,
1344  	 * cma_release() returns false.
1345  	 */
1346  #ifdef CONFIG_CMA
1347  	if (cma_release(hugetlb_cma[page_to_nid(page)], page, 1 << order))
1348  		return;
1349  #endif
1350  
1351  	free_contig_range(page_to_pfn(page), 1 << order);
1352  }
1353  
1354  #ifdef CONFIG_CONTIG_ALLOC
1355  static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
1356  		int nid, nodemask_t *nodemask)
1357  {
1358  	unsigned long nr_pages = pages_per_huge_page(h);
1359  	if (nid == NUMA_NO_NODE)
1360  		nid = numa_mem_id();
1361  
1362  #ifdef CONFIG_CMA
1363  	{
1364  		struct page *page;
1365  		int node;
1366  
1367  		if (hugetlb_cma[nid]) {
1368  			page = cma_alloc(hugetlb_cma[nid], nr_pages,
1369  					huge_page_order(h), true);
1370  			if (page)
1371  				return page;
1372  		}
1373  
1374  		if (!(gfp_mask & __GFP_THISNODE)) {
1375  			for_each_node_mask(node, *nodemask) {
1376  				if (node == nid || !hugetlb_cma[node])
1377  					continue;
1378  
1379  				page = cma_alloc(hugetlb_cma[node], nr_pages,
1380  						huge_page_order(h), true);
1381  				if (page)
1382  					return page;
1383  			}
1384  		}
1385  	}
1386  #endif
1387  
1388  	return alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask);
1389  }
1390  
1391  #else /* !CONFIG_CONTIG_ALLOC */
1392  static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
1393  					int nid, nodemask_t *nodemask)
1394  {
1395  	return NULL;
1396  }
1397  #endif /* CONFIG_CONTIG_ALLOC */
1398  
1399  #else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */
1400  static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
1401  					int nid, nodemask_t *nodemask)
1402  {
1403  	return NULL;
1404  }
1405  static inline void free_gigantic_page(struct page *page, unsigned int order) { }
1406  static inline void destroy_compound_gigantic_page(struct page *page,
1407  						unsigned int order) { }
1408  #endif
1409  
1410  /*
1411   * Remove hugetlb page from lists, and update dtor so that page appears
1412   * as just a compound page.
1413   *
1414   * A reference is held on the page, except in the case of demote.
1415   *
1416   * Must be called with hugetlb lock held.
1417   */
1418  static void __remove_hugetlb_page(struct hstate *h, struct page *page,
1419  							bool adjust_surplus,
1420  							bool demote)
1421  {
1422  	int nid = page_to_nid(page);
1423  
1424  	VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
1425  	VM_BUG_ON_PAGE(hugetlb_cgroup_from_page_rsvd(page), page);
1426  
1427  	lockdep_assert_held(&hugetlb_lock);
1428  	if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
1429  		return;
1430  
1431  	list_del(&page->lru);
1432  
1433  	if (HPageFreed(page)) {
1434  		h->free_huge_pages--;
1435  		h->free_huge_pages_node[nid]--;
1436  	}
1437  	if (adjust_surplus) {
1438  		h->surplus_huge_pages--;
1439  		h->surplus_huge_pages_node[nid]--;
1440  	}
1441  
1442  	/*
1443  	 * Very subtle
1444  	 *
1445  	 * For non-gigantic pages set the destructor to the normal compound
1446  	 * page dtor.  This is needed in case someone takes an additional
1447  	 * temporary ref to the page, and freeing is delayed until they drop
1448  	 * their reference.
1449  	 *
1450  	 * For gigantic pages set the destructor to the null dtor.  This
1451  	 * destructor will never be called.  Before freeing the gigantic
1452  	 * page destroy_compound_gigantic_page will turn the compound page
1453  	 * into a simple group of pages.  After this the destructor does not
1454  	 * apply.
1455  	 *
1456  	 * This handles the case where more than one ref is held when and
1457  	 * after update_and_free_page is called.
1458  	 *
1459  	 * In the case of demote we do not ref count the page as it will soon
1460  	 * be turned into a page of smaller size.
1461  	 */
1462  	if (!demote)
1463  		set_page_refcounted(page);
1464  	if (hstate_is_gigantic(h))
1465  		set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
1466  	else
1467  		set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
1468  
1469  	h->nr_huge_pages--;
1470  	h->nr_huge_pages_node[nid]--;
1471  }
1472  
1473  static void remove_hugetlb_page(struct hstate *h, struct page *page,
1474  							bool adjust_surplus)
1475  {
1476  	__remove_hugetlb_page(h, page, adjust_surplus, false);
1477  }
1478  
1479  static void remove_hugetlb_page_for_demote(struct hstate *h, struct page *page,
1480  							bool adjust_surplus)
1481  {
1482  	__remove_hugetlb_page(h, page, adjust_surplus, true);
1483  }
1484  
1485  static void add_hugetlb_page(struct hstate *h, struct page *page,
1486  			     bool adjust_surplus)
1487  {
1488  	int zeroed;
1489  	int nid = page_to_nid(page);
1490  
1491  	VM_BUG_ON_PAGE(!HPageVmemmapOptimized(page), page);
1492  
1493  	lockdep_assert_held(&hugetlb_lock);
1494  
1495  	INIT_LIST_HEAD(&page->lru);
1496  	h->nr_huge_pages++;
1497  	h->nr_huge_pages_node[nid]++;
1498  
1499  	if (adjust_surplus) {
1500  		h->surplus_huge_pages++;
1501  		h->surplus_huge_pages_node[nid]++;
1502  	}
1503  
1504  	set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
1505  	set_page_private(page, 0);
1506  	SetHPageVmemmapOptimized(page);
1507  
1508  	/*
1509  	 * This page is about to be managed by the hugetlb allocator and
1510  	 * should have no users.  Drop our reference, and check for others
1511  	 * just in case.
1512  	 */
1513  	zeroed = put_page_testzero(page);
1514  	if (!zeroed)
1515  		/*
1516  		 * It is VERY unlikely soneone else has taken a ref on
1517  		 * the page.  In this case, we simply return as the
1518  		 * hugetlb destructor (free_huge_page) will be called
1519  		 * when this other ref is dropped.
1520  		 */
1521  		return;
1522  
1523  	arch_clear_hugepage_flags(page);
1524  	enqueue_huge_page(h, page);
1525  }
1526  
1527  static void __update_and_free_page(struct hstate *h, struct page *page)
1528  {
1529  	int i;
1530  	struct page *subpage = page;
1531  
1532  	if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
1533  		return;
1534  
1535  	if (alloc_huge_page_vmemmap(h, page)) {
1536  		spin_lock_irq(&hugetlb_lock);
1537  		/*
1538  		 * If we cannot allocate vmemmap pages, just refuse to free the
1539  		 * page and put the page back on the hugetlb free list and treat
1540  		 * as a surplus page.
1541  		 */
1542  		add_hugetlb_page(h, page, true);
1543  		spin_unlock_irq(&hugetlb_lock);
1544  		return;
1545  	}
1546  
1547  	for (i = 0; i < pages_per_huge_page(h);
1548  	     i++, subpage = mem_map_next(subpage, page, i)) {
1549  		subpage->flags &= ~(1 << PG_locked | 1 << PG_error |
1550  				1 << PG_referenced | 1 << PG_dirty |
1551  				1 << PG_active | 1 << PG_private |
1552  				1 << PG_writeback);
1553  	}
1554  
1555  	/*
1556  	 * Non-gigantic pages demoted from CMA allocated gigantic pages
1557  	 * need to be given back to CMA in free_gigantic_page.
1558  	 */
1559  	if (hstate_is_gigantic(h) ||
1560  	    hugetlb_cma_page(page, huge_page_order(h))) {
1561  		destroy_compound_gigantic_page(page, huge_page_order(h));
1562  		free_gigantic_page(page, huge_page_order(h));
1563  	} else {
1564  		__free_pages(page, huge_page_order(h));
1565  	}
1566  }
1567  
1568  /*
1569   * As update_and_free_page() can be called under any context, so we cannot
1570   * use GFP_KERNEL to allocate vmemmap pages. However, we can defer the
1571   * actual freeing in a workqueue to prevent from using GFP_ATOMIC to allocate
1572   * the vmemmap pages.
1573   *
1574   * free_hpage_workfn() locklessly retrieves the linked list of pages to be
1575   * freed and frees them one-by-one. As the page->mapping pointer is going
1576   * to be cleared in free_hpage_workfn() anyway, it is reused as the llist_node
1577   * structure of a lockless linked list of huge pages to be freed.
1578   */
1579  static LLIST_HEAD(hpage_freelist);
1580  
1581  static void free_hpage_workfn(struct work_struct *work)
1582  {
1583  	struct llist_node *node;
1584  
1585  	node = llist_del_all(&hpage_freelist);
1586  
1587  	while (node) {
1588  		struct page *page;
1589  		struct hstate *h;
1590  
1591  		page = container_of((struct address_space **)node,
1592  				     struct page, mapping);
1593  		node = node->next;
1594  		page->mapping = NULL;
1595  		/*
1596  		 * The VM_BUG_ON_PAGE(!PageHuge(page), page) in page_hstate()
1597  		 * is going to trigger because a previous call to
1598  		 * remove_hugetlb_page() will set_compound_page_dtor(page,
1599  		 * NULL_COMPOUND_DTOR), so do not use page_hstate() directly.
1600  		 */
1601  		h = size_to_hstate(page_size(page));
1602  
1603  		__update_and_free_page(h, page);
1604  
1605  		cond_resched();
1606  	}
1607  }
1608  static DECLARE_WORK(free_hpage_work, free_hpage_workfn);
1609  
1610  static inline void flush_free_hpage_work(struct hstate *h)
1611  {
1612  	if (free_vmemmap_pages_per_hpage(h))
1613  		flush_work(&free_hpage_work);
1614  }
1615  
1616  static void update_and_free_page(struct hstate *h, struct page *page,
1617  				 bool atomic)
1618  {
1619  	if (!HPageVmemmapOptimized(page) || !atomic) {
1620  		__update_and_free_page(h, page);
1621  		return;
1622  	}
1623  
1624  	/*
1625  	 * Defer freeing to avoid using GFP_ATOMIC to allocate vmemmap pages.
1626  	 *
1627  	 * Only call schedule_work() if hpage_freelist is previously
1628  	 * empty. Otherwise, schedule_work() had been called but the workfn
1629  	 * hasn't retrieved the list yet.
1630  	 */
1631  	if (llist_add((struct llist_node *)&page->mapping, &hpage_freelist))
1632  		schedule_work(&free_hpage_work);
1633  }
1634  
1635  static void update_and_free_pages_bulk(struct hstate *h, struct list_head *list)
1636  {
1637  	struct page *page, *t_page;
1638  
1639  	list_for_each_entry_safe(page, t_page, list, lru) {
1640  		update_and_free_page(h, page, false);
1641  		cond_resched();
1642  	}
1643  }
1644  
1645  struct hstate *size_to_hstate(unsigned long size)
1646  {
1647  	struct hstate *h;
1648  
1649  	for_each_hstate(h) {
1650  		if (huge_page_size(h) == size)
1651  			return h;
1652  	}
1653  	return NULL;
1654  }
1655  
1656  void free_huge_page(struct page *page)
1657  {
1658  	/*
1659  	 * Can't pass hstate in here because it is called from the
1660  	 * compound page destructor.
1661  	 */
1662  	struct hstate *h = page_hstate(page);
1663  	int nid = page_to_nid(page);
1664  	struct hugepage_subpool *spool = hugetlb_page_subpool(page);
1665  	bool restore_reserve;
1666  	unsigned long flags;
1667  
1668  	VM_BUG_ON_PAGE(page_count(page), page);
1669  	VM_BUG_ON_PAGE(page_mapcount(page), page);
1670  
1671  	hugetlb_set_page_subpool(page, NULL);
1672  	page->mapping = NULL;
1673  	restore_reserve = HPageRestoreReserve(page);
1674  	ClearHPageRestoreReserve(page);
1675  
1676  	/*
1677  	 * If HPageRestoreReserve was set on page, page allocation consumed a
1678  	 * reservation.  If the page was associated with a subpool, there
1679  	 * would have been a page reserved in the subpool before allocation
1680  	 * via hugepage_subpool_get_pages().  Since we are 'restoring' the
1681  	 * reservation, do not call hugepage_subpool_put_pages() as this will
1682  	 * remove the reserved page from the subpool.
1683  	 */
1684  	if (!restore_reserve) {
1685  		/*
1686  		 * A return code of zero implies that the subpool will be
1687  		 * under its minimum size if the reservation is not restored
1688  		 * after page is free.  Therefore, force restore_reserve
1689  		 * operation.
1690  		 */
1691  		if (hugepage_subpool_put_pages(spool, 1) == 0)
1692  			restore_reserve = true;
1693  	}
1694  
1695  	spin_lock_irqsave(&hugetlb_lock, flags);
1696  	ClearHPageMigratable(page);
1697  	hugetlb_cgroup_uncharge_page(hstate_index(h),
1698  				     pages_per_huge_page(h), page);
1699  	hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h),
1700  					  pages_per_huge_page(h), page);
1701  	if (restore_reserve)
1702  		h->resv_huge_pages++;
1703  
1704  	if (HPageTemporary(page)) {
1705  		remove_hugetlb_page(h, page, false);
1706  		spin_unlock_irqrestore(&hugetlb_lock, flags);
1707  		update_and_free_page(h, page, true);
1708  	} else if (h->surplus_huge_pages_node[nid]) {
1709  		/* remove the page from active list */
1710  		remove_hugetlb_page(h, page, true);
1711  		spin_unlock_irqrestore(&hugetlb_lock, flags);
1712  		update_and_free_page(h, page, true);
1713  	} else {
1714  		arch_clear_hugepage_flags(page);
1715  		enqueue_huge_page(h, page);
1716  		spin_unlock_irqrestore(&hugetlb_lock, flags);
1717  	}
1718  }
1719  
1720  /*
1721   * Must be called with the hugetlb lock held
1722   */
1723  static void __prep_account_new_huge_page(struct hstate *h, int nid)
1724  {
1725  	lockdep_assert_held(&hugetlb_lock);
1726  	h->nr_huge_pages++;
1727  	h->nr_huge_pages_node[nid]++;
1728  }
1729  
1730  static void __prep_new_huge_page(struct hstate *h, struct page *page)
1731  {
1732  	free_huge_page_vmemmap(h, page);
1733  	INIT_LIST_HEAD(&page->lru);
1734  	set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
1735  	hugetlb_set_page_subpool(page, NULL);
1736  	set_hugetlb_cgroup(page, NULL);
1737  	set_hugetlb_cgroup_rsvd(page, NULL);
1738  }
1739  
1740  static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
1741  {
1742  	__prep_new_huge_page(h, page);
1743  	spin_lock_irq(&hugetlb_lock);
1744  	__prep_account_new_huge_page(h, nid);
1745  	spin_unlock_irq(&hugetlb_lock);
1746  }
1747  
1748  static bool __prep_compound_gigantic_page(struct page *page, unsigned int order,
1749  								bool demote)
1750  {
1751  	int i, j;
1752  	int nr_pages = 1 << order;
1753  	struct page *p = page + 1;
1754  
1755  	/* we rely on prep_new_huge_page to set the destructor */
1756  	set_compound_order(page, order);
1757  	__ClearPageReserved(page);
1758  	__SetPageHead(page);
1759  	for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
1760  		/*
1761  		 * For gigantic hugepages allocated through bootmem at
1762  		 * boot, it's safer to be consistent with the not-gigantic
1763  		 * hugepages and clear the PG_reserved bit from all tail pages
1764  		 * too.  Otherwise drivers using get_user_pages() to access tail
1765  		 * pages may get the reference counting wrong if they see
1766  		 * PG_reserved set on a tail page (despite the head page not
1767  		 * having PG_reserved set).  Enforcing this consistency between
1768  		 * head and tail pages allows drivers to optimize away a check
1769  		 * on the head page when they need know if put_page() is needed
1770  		 * after get_user_pages().
1771  		 */
1772  		__ClearPageReserved(p);
1773  		/*
1774  		 * Subtle and very unlikely
1775  		 *
1776  		 * Gigantic 'page allocators' such as memblock or cma will
1777  		 * return a set of pages with each page ref counted.  We need
1778  		 * to turn this set of pages into a compound page with tail
1779  		 * page ref counts set to zero.  Code such as speculative page
1780  		 * cache adding could take a ref on a 'to be' tail page.
1781  		 * We need to respect any increased ref count, and only set
1782  		 * the ref count to zero if count is currently 1.  If count
1783  		 * is not 1, we return an error.  An error return indicates
1784  		 * the set of pages can not be converted to a gigantic page.
1785  		 * The caller who allocated the pages should then discard the
1786  		 * pages using the appropriate free interface.
1787  		 *
1788  		 * In the case of demote, the ref count will be zero.
1789  		 */
1790  		if (!demote) {
1791  			if (!page_ref_freeze(p, 1)) {
1792  				pr_warn("HugeTLB page can not be used due to unexpected inflated ref count\n");
1793  				goto out_error;
1794  			}
1795  		} else {
1796  			VM_BUG_ON_PAGE(page_count(p), p);
1797  		}
1798  		set_compound_head(p, page);
1799  	}
1800  	atomic_set(compound_mapcount_ptr(page), -1);
1801  	atomic_set(compound_pincount_ptr(page), 0);
1802  	return true;
1803  
1804  out_error:
1805  	/* undo tail page modifications made above */
1806  	p = page + 1;
1807  	for (j = 1; j < i; j++, p = mem_map_next(p, page, j)) {
1808  		clear_compound_head(p);
1809  		set_page_refcounted(p);
1810  	}
1811  	/* need to clear PG_reserved on remaining tail pages  */
1812  	for (; j < nr_pages; j++, p = mem_map_next(p, page, j))
1813  		__ClearPageReserved(p);
1814  	set_compound_order(page, 0);
1815  	page[1].compound_nr = 0;
1816  	__ClearPageHead(page);
1817  	return false;
1818  }
1819  
1820  static bool prep_compound_gigantic_page(struct page *page, unsigned int order)
1821  {
1822  	return __prep_compound_gigantic_page(page, order, false);
1823  }
1824  
1825  static bool prep_compound_gigantic_page_for_demote(struct page *page,
1826  							unsigned int order)
1827  {
1828  	return __prep_compound_gigantic_page(page, order, true);
1829  }
1830  
1831  /*
1832   * PageHuge() only returns true for hugetlbfs pages, but not for normal or
1833   * transparent huge pages.  See the PageTransHuge() documentation for more
1834   * details.
1835   */
1836  int PageHuge(struct page *page)
1837  {
1838  	if (!PageCompound(page))
1839  		return 0;
1840  
1841  	page = compound_head(page);
1842  	return page[1].compound_dtor == HUGETLB_PAGE_DTOR;
1843  }
1844  EXPORT_SYMBOL_GPL(PageHuge);
1845  
1846  /*
1847   * PageHeadHuge() only returns true for hugetlbfs head page, but not for
1848   * normal or transparent huge pages.
1849   */
1850  int PageHeadHuge(struct page *page_head)
1851  {
1852  	if (!PageHead(page_head))
1853  		return 0;
1854  
1855  	return page_head[1].compound_dtor == HUGETLB_PAGE_DTOR;
1856  }
1857  
1858  /*
1859   * Find and lock address space (mapping) in write mode.
1860   *
1861   * Upon entry, the page is locked which means that page_mapping() is
1862   * stable.  Due to locking order, we can only trylock_write.  If we can
1863   * not get the lock, simply return NULL to caller.
1864   */
1865  struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage)
1866  {
1867  	struct address_space *mapping = page_mapping(hpage);
1868  
1869  	if (!mapping)
1870  		return mapping;
1871  
1872  	if (i_mmap_trylock_write(mapping))
1873  		return mapping;
1874  
1875  	return NULL;
1876  }
1877  
1878  pgoff_t hugetlb_basepage_index(struct page *page)
1879  {
1880  	struct page *page_head = compound_head(page);
1881  	pgoff_t index = page_index(page_head);
1882  	unsigned long compound_idx;
1883  
1884  	if (compound_order(page_head) >= MAX_ORDER)
1885  		compound_idx = page_to_pfn(page) - page_to_pfn(page_head);
1886  	else
1887  		compound_idx = page - page_head;
1888  
1889  	return (index << compound_order(page_head)) + compound_idx;
1890  }
1891  
1892  static struct page *alloc_buddy_huge_page(struct hstate *h,
1893  		gfp_t gfp_mask, int nid, nodemask_t *nmask,
1894  		nodemask_t *node_alloc_noretry)
1895  {
1896  	int order = huge_page_order(h);
1897  	struct page *page;
1898  	bool alloc_try_hard = true;
1899  
1900  	/*
1901  	 * By default we always try hard to allocate the page with
1902  	 * __GFP_RETRY_MAYFAIL flag.  However, if we are allocating pages in
1903  	 * a loop (to adjust global huge page counts) and previous allocation
1904  	 * failed, do not continue to try hard on the same node.  Use the
1905  	 * node_alloc_noretry bitmap to manage this state information.
1906  	 */
1907  	if (node_alloc_noretry && node_isset(nid, *node_alloc_noretry))
1908  		alloc_try_hard = false;
1909  	gfp_mask |= __GFP_COMP|__GFP_NOWARN;
1910  	if (alloc_try_hard)
1911  		gfp_mask |= __GFP_RETRY_MAYFAIL;
1912  	if (nid == NUMA_NO_NODE)
1913  		nid = numa_mem_id();
1914  	page = __alloc_pages(gfp_mask, order, nid, nmask);
1915  	if (page)
1916  		__count_vm_event(HTLB_BUDDY_PGALLOC);
1917  	else
1918  		__count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
1919  
1920  	/*
1921  	 * If we did not specify __GFP_RETRY_MAYFAIL, but still got a page this
1922  	 * indicates an overall state change.  Clear bit so that we resume
1923  	 * normal 'try hard' allocations.
1924  	 */
1925  	if (node_alloc_noretry && page && !alloc_try_hard)
1926  		node_clear(nid, *node_alloc_noretry);
1927  
1928  	/*
1929  	 * If we tried hard to get a page but failed, set bit so that
1930  	 * subsequent attempts will not try as hard until there is an
1931  	 * overall state change.
1932  	 */
1933  	if (node_alloc_noretry && !page && alloc_try_hard)
1934  		node_set(nid, *node_alloc_noretry);
1935  
1936  	return page;
1937  }
1938  
1939  /*
1940   * Common helper to allocate a fresh hugetlb page. All specific allocators
1941   * should use this function to get new hugetlb pages
1942   */
1943  static struct page *alloc_fresh_huge_page(struct hstate *h,
1944  		gfp_t gfp_mask, int nid, nodemask_t *nmask,
1945  		nodemask_t *node_alloc_noretry)
1946  {
1947  	struct page *page;
1948  	bool retry = false;
1949  
1950  retry:
1951  	if (hstate_is_gigantic(h))
1952  		page = alloc_gigantic_page(h, gfp_mask, nid, nmask);
1953  	else
1954  		page = alloc_buddy_huge_page(h, gfp_mask,
1955  				nid, nmask, node_alloc_noretry);
1956  	if (!page)
1957  		return NULL;
1958  
1959  	if (hstate_is_gigantic(h)) {
1960  		if (!prep_compound_gigantic_page(page, huge_page_order(h))) {
1961  			/*
1962  			 * Rare failure to convert pages to compound page.
1963  			 * Free pages and try again - ONCE!
1964  			 */
1965  			free_gigantic_page(page, huge_page_order(h));
1966  			if (!retry) {
1967  				retry = true;
1968  				goto retry;
1969  			}
1970  			return NULL;
1971  		}
1972  	}
1973  	prep_new_huge_page(h, page, page_to_nid(page));
1974  
1975  	return page;
1976  }
1977  
1978  /*
1979   * Allocates a fresh page to the hugetlb allocator pool in the node interleaved
1980   * manner.
1981   */
1982  static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
1983  				nodemask_t *node_alloc_noretry)
1984  {
1985  	struct page *page;
1986  	int nr_nodes, node;
1987  	gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
1988  
1989  	for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
1990  		page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed,
1991  						node_alloc_noretry);
1992  		if (page)
1993  			break;
1994  	}
1995  
1996  	if (!page)
1997  		return 0;
1998  
1999  	put_page(page); /* free it into the hugepage allocator */
2000  
2001  	return 1;
2002  }
2003  
2004  /*
2005   * Remove huge page from pool from next node to free.  Attempt to keep
2006   * persistent huge pages more or less balanced over allowed nodes.
2007   * This routine only 'removes' the hugetlb page.  The caller must make
2008   * an additional call to free the page to low level allocators.
2009   * Called with hugetlb_lock locked.
2010   */
2011  static struct page *remove_pool_huge_page(struct hstate *h,
2012  						nodemask_t *nodes_allowed,
2013  						 bool acct_surplus)
2014  {
2015  	int nr_nodes, node;
2016  	struct page *page = NULL;
2017  
2018  	lockdep_assert_held(&hugetlb_lock);
2019  	for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
2020  		/*
2021  		 * If we're returning unused surplus pages, only examine
2022  		 * nodes with surplus pages.
2023  		 */
2024  		if ((!acct_surplus || h->surplus_huge_pages_node[node]) &&
2025  		    !list_empty(&h->hugepage_freelists[node])) {
2026  			page = list_entry(h->hugepage_freelists[node].next,
2027  					  struct page, lru);
2028  			remove_hugetlb_page(h, page, acct_surplus);
2029  			break;
2030  		}
2031  	}
2032  
2033  	return page;
2034  }
2035  
2036  /*
2037   * Dissolve a given free hugepage into free buddy pages. This function does
2038   * nothing for in-use hugepages and non-hugepages.
2039   * This function returns values like below:
2040   *
2041   *  -ENOMEM: failed to allocate vmemmap pages to free the freed hugepages
2042   *           when the system is under memory pressure and the feature of
2043   *           freeing unused vmemmap pages associated with each hugetlb page
2044   *           is enabled.
2045   *  -EBUSY:  failed to dissolved free hugepages or the hugepage is in-use
2046   *           (allocated or reserved.)
2047   *       0:  successfully dissolved free hugepages or the page is not a
2048   *           hugepage (considered as already dissolved)
2049   */
2050  int dissolve_free_huge_page(struct page *page)
2051  {
2052  	int rc = -EBUSY;
2053  
2054  retry:
2055  	/* Not to disrupt normal path by vainly holding hugetlb_lock */
2056  	if (!PageHuge(page))
2057  		return 0;
2058  
2059  	spin_lock_irq(&hugetlb_lock);
2060  	if (!PageHuge(page)) {
2061  		rc = 0;
2062  		goto out;
2063  	}
2064  
2065  	if (!page_count(page)) {
2066  		struct page *head = compound_head(page);
2067  		struct hstate *h = page_hstate(head);
2068  		if (h->free_huge_pages - h->resv_huge_pages == 0)
2069  			goto out;
2070  
2071  		/*
2072  		 * We should make sure that the page is already on the free list
2073  		 * when it is dissolved.
2074  		 */
2075  		if (unlikely(!HPageFreed(head))) {
2076  			spin_unlock_irq(&hugetlb_lock);
2077  			cond_resched();
2078  
2079  			/*
2080  			 * Theoretically, we should return -EBUSY when we
2081  			 * encounter this race. In fact, we have a chance
2082  			 * to successfully dissolve the page if we do a
2083  			 * retry. Because the race window is quite small.
2084  			 * If we seize this opportunity, it is an optimization
2085  			 * for increasing the success rate of dissolving page.
2086  			 */
2087  			goto retry;
2088  		}
2089  
2090  		remove_hugetlb_page(h, head, false);
2091  		h->max_huge_pages--;
2092  		spin_unlock_irq(&hugetlb_lock);
2093  
2094  		/*
2095  		 * Normally update_and_free_page will allocate required vmemmmap
2096  		 * before freeing the page.  update_and_free_page will fail to
2097  		 * free the page if it can not allocate required vmemmap.  We
2098  		 * need to adjust max_huge_pages if the page is not freed.
2099  		 * Attempt to allocate vmemmmap here so that we can take
2100  		 * appropriate action on failure.
2101  		 */
2102  		rc = alloc_huge_page_vmemmap(h, head);
2103  		if (!rc) {
2104  			/*
2105  			 * Move PageHWPoison flag from head page to the raw
2106  			 * error page, which makes any subpages rather than
2107  			 * the error page reusable.
2108  			 */
2109  			if (PageHWPoison(head) && page != head) {
2110  				SetPageHWPoison(page);
2111  				ClearPageHWPoison(head);
2112  			}
2113  			update_and_free_page(h, head, false);
2114  		} else {
2115  			spin_lock_irq(&hugetlb_lock);
2116  			add_hugetlb_page(h, head, false);
2117  			h->max_huge_pages++;
2118  			spin_unlock_irq(&hugetlb_lock);
2119  		}
2120  
2121  		return rc;
2122  	}
2123  out:
2124  	spin_unlock_irq(&hugetlb_lock);
2125  	return rc;
2126  }
2127  
2128  /*
2129   * Dissolve free hugepages in a given pfn range. Used by memory hotplug to
2130   * make specified memory blocks removable from the system.
2131   * Note that this will dissolve a free gigantic hugepage completely, if any
2132   * part of it lies within the given range.
2133   * Also note that if dissolve_free_huge_page() returns with an error, all
2134   * free hugepages that were dissolved before that error are lost.
2135   */
2136  int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
2137  {
2138  	unsigned long pfn;
2139  	struct page *page;
2140  	int rc = 0;
2141  
2142  	if (!hugepages_supported())
2143  		return rc;
2144  
2145  	for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order) {
2146  		page = pfn_to_page(pfn);
2147  		rc = dissolve_free_huge_page(page);
2148  		if (rc)
2149  			break;
2150  	}
2151  
2152  	return rc;
2153  }
2154  
2155  /*
2156   * Allocates a fresh surplus page from the page allocator.
2157   */
2158  static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
2159  		int nid, nodemask_t *nmask, bool zero_ref)
2160  {
2161  	struct page *page = NULL;
2162  	bool retry = false;
2163  
2164  	if (hstate_is_gigantic(h))
2165  		return NULL;
2166  
2167  	spin_lock_irq(&hugetlb_lock);
2168  	if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages)
2169  		goto out_unlock;
2170  	spin_unlock_irq(&hugetlb_lock);
2171  
2172  retry:
2173  	page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
2174  	if (!page)
2175  		return NULL;
2176  
2177  	spin_lock_irq(&hugetlb_lock);
2178  	/*
2179  	 * We could have raced with the pool size change.
2180  	 * Double check that and simply deallocate the new page
2181  	 * if we would end up overcommiting the surpluses. Abuse
2182  	 * temporary page to workaround the nasty free_huge_page
2183  	 * codeflow
2184  	 */
2185  	if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
2186  		SetHPageTemporary(page);
2187  		spin_unlock_irq(&hugetlb_lock);
2188  		put_page(page);
2189  		return NULL;
2190  	}
2191  
2192  	if (zero_ref) {
2193  		/*
2194  		 * Caller requires a page with zero ref count.
2195  		 * We will drop ref count here.  If someone else is holding
2196  		 * a ref, the page will be freed when they drop it.  Abuse
2197  		 * temporary page flag to accomplish this.
2198  		 */
2199  		SetHPageTemporary(page);
2200  		if (!put_page_testzero(page)) {
2201  			/*
2202  			 * Unexpected inflated ref count on freshly allocated
2203  			 * huge.  Retry once.
2204  			 */
2205  			pr_info("HugeTLB unexpected inflated ref count on freshly allocated page\n");
2206  			spin_unlock_irq(&hugetlb_lock);
2207  			if (retry)
2208  				return NULL;
2209  
2210  			retry = true;
2211  			goto retry;
2212  		}
2213  		ClearHPageTemporary(page);
2214  	}
2215  
2216  	h->surplus_huge_pages++;
2217  	h->surplus_huge_pages_node[page_to_nid(page)]++;
2218  
2219  out_unlock:
2220  	spin_unlock_irq(&hugetlb_lock);
2221  
2222  	return page;
2223  }
2224  
2225  static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
2226  				     int nid, nodemask_t *nmask)
2227  {
2228  	struct page *page;
2229  
2230  	if (hstate_is_gigantic(h))
2231  		return NULL;
2232  
2233  	page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
2234  	if (!page)
2235  		return NULL;
2236  
2237  	/*
2238  	 * We do not account these pages as surplus because they are only
2239  	 * temporary and will be released properly on the last reference
2240  	 */
2241  	SetHPageTemporary(page);
2242  
2243  	return page;
2244  }
2245  
2246  /*
2247   * Use the VMA's mpolicy to allocate a huge page from the buddy.
2248   */
2249  static
2250  struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h,
2251  		struct vm_area_struct *vma, unsigned long addr)
2252  {
2253  	struct page *page = NULL;
2254  	struct mempolicy *mpol;
2255  	gfp_t gfp_mask = htlb_alloc_mask(h);
2256  	int nid;
2257  	nodemask_t *nodemask;
2258  
2259  	nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask);
2260  	if (mpol_is_preferred_many(mpol)) {
2261  		gfp_t gfp = gfp_mask | __GFP_NOWARN;
2262  
2263  		gfp &=  ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
2264  		page = alloc_surplus_huge_page(h, gfp, nid, nodemask, false);
2265  
2266  		/* Fallback to all nodes if page==NULL */
2267  		nodemask = NULL;
2268  	}
2269  
2270  	if (!page)
2271  		page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask, false);
2272  	mpol_cond_put(mpol);
2273  	return page;
2274  }
2275  
2276  /* page migration callback function */
2277  struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
2278  		nodemask_t *nmask, gfp_t gfp_mask)
2279  {
2280  	spin_lock_irq(&hugetlb_lock);
2281  	if (h->free_huge_pages - h->resv_huge_pages > 0) {
2282  		struct page *page;
2283  
2284  		page = dequeue_huge_page_nodemask(h, gfp_mask, preferred_nid, nmask);
2285  		if (page) {
2286  			spin_unlock_irq(&hugetlb_lock);
2287  			return page;
2288  		}
2289  	}
2290  	spin_unlock_irq(&hugetlb_lock);
2291  
2292  	return alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask);
2293  }
2294  
2295  /* mempolicy aware migration callback */
2296  struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma,
2297  		unsigned long address)
2298  {
2299  	struct mempolicy *mpol;
2300  	nodemask_t *nodemask;
2301  	struct page *page;
2302  	gfp_t gfp_mask;
2303  	int node;
2304  
2305  	gfp_mask = htlb_alloc_mask(h);
2306  	node = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
2307  	page = alloc_huge_page_nodemask(h, node, nodemask, gfp_mask);
2308  	mpol_cond_put(mpol);
2309  
2310  	return page;
2311  }
2312  
2313  /*
2314   * Increase the hugetlb pool such that it can accommodate a reservation
2315   * of size 'delta'.
2316   */
2317  static int gather_surplus_pages(struct hstate *h, long delta)
2318  	__must_hold(&hugetlb_lock)
2319  {
2320  	struct list_head surplus_list;
2321  	struct page *page, *tmp;
2322  	int ret;
2323  	long i;
2324  	long needed, allocated;
2325  	bool alloc_ok = true;
2326  
2327  	lockdep_assert_held(&hugetlb_lock);
2328  	needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
2329  	if (needed <= 0) {
2330  		h->resv_huge_pages += delta;
2331  		return 0;
2332  	}
2333  
2334  	allocated = 0;
2335  	INIT_LIST_HEAD(&surplus_list);
2336  
2337  	ret = -ENOMEM;
2338  retry:
2339  	spin_unlock_irq(&hugetlb_lock);
2340  	for (i = 0; i < needed; i++) {
2341  		page = alloc_surplus_huge_page(h, htlb_alloc_mask(h),
2342  				NUMA_NO_NODE, NULL, true);
2343  		if (!page) {
2344  			alloc_ok = false;
2345  			break;
2346  		}
2347  		list_add(&page->lru, &surplus_list);
2348  		cond_resched();
2349  	}
2350  	allocated += i;
2351  
2352  	/*
2353  	 * After retaking hugetlb_lock, we need to recalculate 'needed'
2354  	 * because either resv_huge_pages or free_huge_pages may have changed.
2355  	 */
2356  	spin_lock_irq(&hugetlb_lock);
2357  	needed = (h->resv_huge_pages + delta) -
2358  			(h->free_huge_pages + allocated);
2359  	if (needed > 0) {
2360  		if (alloc_ok)
2361  			goto retry;
2362  		/*
2363  		 * We were not able to allocate enough pages to
2364  		 * satisfy the entire reservation so we free what
2365  		 * we've allocated so far.
2366  		 */
2367  		goto free;
2368  	}
2369  	/*
2370  	 * The surplus_list now contains _at_least_ the number of extra pages
2371  	 * needed to accommodate the reservation.  Add the appropriate number
2372  	 * of pages to the hugetlb pool and free the extras back to the buddy
2373  	 * allocator.  Commit the entire reservation here to prevent another
2374  	 * process from stealing the pages as they are added to the pool but
2375  	 * before they are reserved.
2376  	 */
2377  	needed += allocated;
2378  	h->resv_huge_pages += delta;
2379  	ret = 0;
2380  
2381  	/* Free the needed pages to the hugetlb pool */
2382  	list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
2383  		if ((--needed) < 0)
2384  			break;
2385  		/* Add the page to the hugetlb allocator */
2386  		enqueue_huge_page(h, page);
2387  	}
2388  free:
2389  	spin_unlock_irq(&hugetlb_lock);
2390  
2391  	/*
2392  	 * Free unnecessary surplus pages to the buddy allocator.
2393  	 * Pages have no ref count, call free_huge_page directly.
2394  	 */
2395  	list_for_each_entry_safe(page, tmp, &surplus_list, lru)
2396  		free_huge_page(page);
2397  	spin_lock_irq(&hugetlb_lock);
2398  
2399  	return ret;
2400  }
2401  
2402  /*
2403   * This routine has two main purposes:
2404   * 1) Decrement the reservation count (resv_huge_pages) by the value passed
2405   *    in unused_resv_pages.  This corresponds to the prior adjustments made
2406   *    to the associated reservation map.
2407   * 2) Free any unused surplus pages that may have been allocated to satisfy
2408   *    the reservation.  As many as unused_resv_pages may be freed.
2409   */
2410  static void return_unused_surplus_pages(struct hstate *h,
2411  					unsigned long unused_resv_pages)
2412  {
2413  	unsigned long nr_pages;
2414  	struct page *page;
2415  	LIST_HEAD(page_list);
2416  
2417  	lockdep_assert_held(&hugetlb_lock);
2418  	/* Uncommit the reservation */
2419  	h->resv_huge_pages -= unused_resv_pages;
2420  
2421  	/* Cannot return gigantic pages currently */
2422  	if (hstate_is_gigantic(h))
2423  		goto out;
2424  
2425  	/*
2426  	 * Part (or even all) of the reservation could have been backed
2427  	 * by pre-allocated pages. Only free surplus pages.
2428  	 */
2429  	nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
2430  
2431  	/*
2432  	 * We want to release as many surplus pages as possible, spread
2433  	 * evenly across all nodes with memory. Iterate across these nodes
2434  	 * until we can no longer free unreserved surplus pages. This occurs
2435  	 * when the nodes with surplus pages have no free pages.
2436  	 * remove_pool_huge_page() will balance the freed pages across the
2437  	 * on-line nodes with memory and will handle the hstate accounting.
2438  	 */
2439  	while (nr_pages--) {
2440  		page = remove_pool_huge_page(h, &node_states[N_MEMORY], 1);
2441  		if (!page)
2442  			goto out;
2443  
2444  		list_add(&page->lru, &page_list);
2445  	}
2446  
2447  out:
2448  	spin_unlock_irq(&hugetlb_lock);
2449  	update_and_free_pages_bulk(h, &page_list);
2450  	spin_lock_irq(&hugetlb_lock);
2451  }
2452  
2453  
2454  /*
2455   * vma_needs_reservation, vma_commit_reservation and vma_end_reservation
2456   * are used by the huge page allocation routines to manage reservations.
2457   *
2458   * vma_needs_reservation is called to determine if the huge page at addr
2459   * within the vma has an associated reservation.  If a reservation is
2460   * needed, the value 1 is returned.  The caller is then responsible for
2461   * managing the global reservation and subpool usage counts.  After
2462   * the huge page has been allocated, vma_commit_reservation is called
2463   * to add the page to the reservation map.  If the page allocation fails,
2464   * the reservation must be ended instead of committed.  vma_end_reservation
2465   * is called in such cases.
2466   *
2467   * In the normal case, vma_commit_reservation returns the same value
2468   * as the preceding vma_needs_reservation call.  The only time this
2469   * is not the case is if a reserve map was changed between calls.  It
2470   * is the responsibility of the caller to notice the difference and
2471   * take appropriate action.
2472   *
2473   * vma_add_reservation is used in error paths where a reservation must
2474   * be restored when a newly allocated huge page must be freed.  It is
2475   * to be called after calling vma_needs_reservation to determine if a
2476   * reservation exists.
2477   *
2478   * vma_del_reservation is used in error paths where an entry in the reserve
2479   * map was created during huge page allocation and must be removed.  It is to
2480   * be called after calling vma_needs_reservation to determine if a reservation
2481   * exists.
2482   */
2483  enum vma_resv_mode {
2484  	VMA_NEEDS_RESV,
2485  	VMA_COMMIT_RESV,
2486  	VMA_END_RESV,
2487  	VMA_ADD_RESV,
2488  	VMA_DEL_RESV,
2489  };
2490  static long __vma_reservation_common(struct hstate *h,
2491  				struct vm_area_struct *vma, unsigned long addr,
2492  				enum vma_resv_mode mode)
2493  {
2494  	struct resv_map *resv;
2495  	pgoff_t idx;
2496  	long ret;
2497  	long dummy_out_regions_needed;
2498  
2499  	resv = vma_resv_map(vma);
2500  	if (!resv)
2501  		return 1;
2502  
2503  	idx = vma_hugecache_offset(h, vma, addr);
2504  	switch (mode) {
2505  	case VMA_NEEDS_RESV:
2506  		ret = region_chg(resv, idx, idx + 1, &dummy_out_regions_needed);
2507  		/* We assume that vma_reservation_* routines always operate on
2508  		 * 1 page, and that adding to resv map a 1 page entry can only
2509  		 * ever require 1 region.
2510  		 */
2511  		VM_BUG_ON(dummy_out_regions_needed != 1);
2512  		break;
2513  	case VMA_COMMIT_RESV:
2514  		ret = region_add(resv, idx, idx + 1, 1, NULL, NULL);
2515  		/* region_add calls of range 1 should never fail. */
2516  		VM_BUG_ON(ret < 0);
2517  		break;
2518  	case VMA_END_RESV:
2519  		region_abort(resv, idx, idx + 1, 1);
2520  		ret = 0;
2521  		break;
2522  	case VMA_ADD_RESV:
2523  		if (vma->vm_flags & VM_MAYSHARE) {
2524  			ret = region_add(resv, idx, idx + 1, 1, NULL, NULL);
2525  			/* region_add calls of range 1 should never fail. */
2526  			VM_BUG_ON(ret < 0);
2527  		} else {
2528  			region_abort(resv, idx, idx + 1, 1);
2529  			ret = region_del(resv, idx, idx + 1);
2530  		}
2531  		break;
2532  	case VMA_DEL_RESV:
2533  		if (vma->vm_flags & VM_MAYSHARE) {
2534  			region_abort(resv, idx, idx + 1, 1);
2535  			ret = region_del(resv, idx, idx + 1);
2536  		} else {
2537  			ret = region_add(resv, idx, idx + 1, 1, NULL, NULL);
2538  			/* region_add calls of range 1 should never fail. */
2539  			VM_BUG_ON(ret < 0);
2540  		}
2541  		break;
2542  	default:
2543  		BUG();
2544  	}
2545  
2546  	if (vma->vm_flags & VM_MAYSHARE || mode == VMA_DEL_RESV)
2547  		return ret;
2548  	/*
2549  	 * We know private mapping must have HPAGE_RESV_OWNER set.
2550  	 *
2551  	 * In most cases, reserves always exist for private mappings.
2552  	 * However, a file associated with mapping could have been
2553  	 * hole punched or truncated after reserves were consumed.
2554  	 * As subsequent fault on such a range will not use reserves.
2555  	 * Subtle - The reserve map for private mappings has the
2556  	 * opposite meaning than that of shared mappings.  If NO
2557  	 * entry is in the reserve map, it means a reservation exists.
2558  	 * If an entry exists in the reserve map, it means the
2559  	 * reservation has already been consumed.  As a result, the
2560  	 * return value of this routine is the opposite of the
2561  	 * value returned from reserve map manipulation routines above.
2562  	 */
2563  	if (ret > 0)
2564  		return 0;
2565  	if (ret == 0)
2566  		return 1;
2567  	return ret;
2568  }
2569  
2570  static long vma_needs_reservation(struct hstate *h,
2571  			struct vm_area_struct *vma, unsigned long addr)
2572  {
2573  	return __vma_reservation_common(h, vma, addr, VMA_NEEDS_RESV);
2574  }
2575  
2576  static long vma_commit_reservation(struct hstate *h,
2577  			struct vm_area_struct *vma, unsigned long addr)
2578  {
2579  	return __vma_reservation_common(h, vma, addr, VMA_COMMIT_RESV);
2580  }
2581  
2582  static void vma_end_reservation(struct hstate *h,
2583  			struct vm_area_struct *vma, unsigned long addr)
2584  {
2585  	(void)__vma_reservation_common(h, vma, addr, VMA_END_RESV);
2586  }
2587  
2588  static long vma_add_reservation(struct hstate *h,
2589  			struct vm_area_struct *vma, unsigned long addr)
2590  {
2591  	return __vma_reservation_common(h, vma, addr, VMA_ADD_RESV);
2592  }
2593  
2594  static long vma_del_reservation(struct hstate *h,
2595  			struct vm_area_struct *vma, unsigned long addr)
2596  {
2597  	return __vma_reservation_common(h, vma, addr, VMA_DEL_RESV);
2598  }
2599  
2600  /*
2601   * This routine is called to restore reservation information on error paths.
2602   * It should ONLY be called for pages allocated via alloc_huge_page(), and
2603   * the hugetlb mutex should remain held when calling this routine.
2604   *
2605   * It handles two specific cases:
2606   * 1) A reservation was in place and the page consumed the reservation.
2607   *    HPageRestoreReserve is set in the page.
2608   * 2) No reservation was in place for the page, so HPageRestoreReserve is
2609   *    not set.  However, alloc_huge_page always updates the reserve map.
2610   *
2611   * In case 1, free_huge_page later in the error path will increment the
2612   * global reserve count.  But, free_huge_page does not have enough context
2613   * to adjust the reservation map.  This case deals primarily with private
2614   * mappings.  Adjust the reserve map here to be consistent with global
2615   * reserve count adjustments to be made by free_huge_page.  Make sure the
2616   * reserve map indicates there is a reservation present.
2617   *
2618   * In case 2, simply undo reserve map modifications done by alloc_huge_page.
2619   */
2620  void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
2621  			unsigned long address, struct page *page)
2622  {
2623  	long rc = vma_needs_reservation(h, vma, address);
2624  
2625  	if (HPageRestoreReserve(page)) {
2626  		if (unlikely(rc < 0))
2627  			/*
2628  			 * Rare out of memory condition in reserve map
2629  			 * manipulation.  Clear HPageRestoreReserve so that
2630  			 * global reserve count will not be incremented
2631  			 * by free_huge_page.  This will make it appear
2632  			 * as though the reservation for this page was
2633  			 * consumed.  This may prevent the task from
2634  			 * faulting in the page at a later time.  This
2635  			 * is better than inconsistent global huge page
2636  			 * accounting of reserve counts.
2637  			 */
2638  			ClearHPageRestoreReserve(page);
2639  		else if (rc)
2640  			(void)vma_add_reservation(h, vma, address);
2641  		else
2642  			vma_end_reservation(h, vma, address);
2643  	} else {
2644  		if (!rc) {
2645  			/*
2646  			 * This indicates there is an entry in the reserve map
2647  			 * not added by alloc_huge_page.  We know it was added
2648  			 * before the alloc_huge_page call, otherwise
2649  			 * HPageRestoreReserve would be set on the page.
2650  			 * Remove the entry so that a subsequent allocation
2651  			 * does not consume a reservation.
2652  			 */
2653  			rc = vma_del_reservation(h, vma, address);
2654  			if (rc < 0)
2655  				/*
2656  				 * VERY rare out of memory condition.  Since
2657  				 * we can not delete the entry, set
2658  				 * HPageRestoreReserve so that the reserve
2659  				 * count will be incremented when the page
2660  				 * is freed.  This reserve will be consumed
2661  				 * on a subsequent allocation.
2662  				 */
2663  				SetHPageRestoreReserve(page);
2664  		} else if (rc < 0) {
2665  			/*
2666  			 * Rare out of memory condition from
2667  			 * vma_needs_reservation call.  Memory allocation is
2668  			 * only attempted if a new entry is needed.  Therefore,
2669  			 * this implies there is not an entry in the
2670  			 * reserve map.
2671  			 *
2672  			 * For shared mappings, no entry in the map indicates
2673  			 * no reservation.  We are done.
2674  			 */
2675  			if (!(vma->vm_flags & VM_MAYSHARE))
2676  				/*
2677  				 * For private mappings, no entry indicates
2678  				 * a reservation is present.  Since we can
2679  				 * not add an entry, set SetHPageRestoreReserve
2680  				 * on the page so reserve count will be
2681  				 * incremented when freed.  This reserve will
2682  				 * be consumed on a subsequent allocation.
2683  				 */
2684  				SetHPageRestoreReserve(page);
2685  		} else
2686  			/*
2687  			 * No reservation present, do nothing
2688  			 */
2689  			 vma_end_reservation(h, vma, address);
2690  	}
2691  }
2692  
2693  /*
2694   * alloc_and_dissolve_huge_page - Allocate a new page and dissolve the old one
2695   * @h: struct hstate old page belongs to
2696   * @old_page: Old page to dissolve
2697   * @list: List to isolate the page in case we need to
2698   * Returns 0 on success, otherwise negated error.
2699   */
2700  static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page,
2701  					struct list_head *list)
2702  {
2703  	gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
2704  	int nid = page_to_nid(old_page);
2705  	bool alloc_retry = false;
2706  	struct page *new_page;
2707  	int ret = 0;
2708  
2709  	/*
2710  	 * Before dissolving the page, we need to allocate a new one for the
2711  	 * pool to remain stable.  Here, we allocate the page and 'prep' it
2712  	 * by doing everything but actually updating counters and adding to
2713  	 * the pool.  This simplifies and let us do most of the processing
2714  	 * under the lock.
2715  	 */
2716  alloc_retry:
2717  	new_page = alloc_buddy_huge_page(h, gfp_mask, nid, NULL, NULL);
2718  	if (!new_page)
2719  		return -ENOMEM;
2720  	/*
2721  	 * If all goes well, this page will be directly added to the free
2722  	 * list in the pool.  For this the ref count needs to be zero.
2723  	 * Attempt to drop now, and retry once if needed.  It is VERY
2724  	 * unlikely there is another ref on the page.
2725  	 *
2726  	 * If someone else has a reference to the page, it will be freed
2727  	 * when they drop their ref.  Abuse temporary page flag to accomplish
2728  	 * this.  Retry once if there is an inflated ref count.
2729  	 */
2730  	SetHPageTemporary(new_page);
2731  	if (!put_page_testzero(new_page)) {
2732  		if (alloc_retry)
2733  			return -EBUSY;
2734  
2735  		alloc_retry = true;
2736  		goto alloc_retry;
2737  	}
2738  	ClearHPageTemporary(new_page);
2739  
2740  	__prep_new_huge_page(h, new_page);
2741  
2742  retry:
2743  	spin_lock_irq(&hugetlb_lock);
2744  	if (!PageHuge(old_page)) {
2745  		/*
2746  		 * Freed from under us. Drop new_page too.
2747  		 */
2748  		goto free_new;
2749  	} else if (page_count(old_page)) {
2750  		/*
2751  		 * Someone has grabbed the page, try to isolate it here.
2752  		 * Fail with -EBUSY if not possible.
2753  		 */
2754  		spin_unlock_irq(&hugetlb_lock);
2755  		if (!isolate_huge_page(old_page, list))
2756  			ret = -EBUSY;
2757  		spin_lock_irq(&hugetlb_lock);
2758  		goto free_new;
2759  	} else if (!HPageFreed(old_page)) {
2760  		/*
2761  		 * Page's refcount is 0 but it has not been enqueued in the
2762  		 * freelist yet. Race window is small, so we can succeed here if
2763  		 * we retry.
2764  		 */
2765  		spin_unlock_irq(&hugetlb_lock);
2766  		cond_resched();
2767  		goto retry;
2768  	} else {
2769  		/*
2770  		 * Ok, old_page is still a genuine free hugepage. Remove it from
2771  		 * the freelist and decrease the counters. These will be
2772  		 * incremented again when calling __prep_account_new_huge_page()
2773  		 * and enqueue_huge_page() for new_page. The counters will remain
2774  		 * stable since this happens under the lock.
2775  		 */
2776  		remove_hugetlb_page(h, old_page, false);
2777  
2778  		/*
2779  		 * Ref count on new page is already zero as it was dropped
2780  		 * earlier.  It can be directly added to the pool free list.
2781  		 */
2782  		__prep_account_new_huge_page(h, nid);
2783  		enqueue_huge_page(h, new_page);
2784  
2785  		/*
2786  		 * Pages have been replaced, we can safely free the old one.
2787  		 */
2788  		spin_unlock_irq(&hugetlb_lock);
2789  		update_and_free_page(h, old_page, false);
2790  	}
2791  
2792  	return ret;
2793  
2794  free_new:
2795  	spin_unlock_irq(&hugetlb_lock);
2796  	/* Page has a zero ref count, but needs a ref to be freed */
2797  	set_page_refcounted(new_page);
2798  	update_and_free_page(h, new_page, false);
2799  
2800  	return ret;
2801  }
2802  
2803  int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list)
2804  {
2805  	struct hstate *h;
2806  	struct page *head;
2807  	int ret = -EBUSY;
2808  
2809  	/*
2810  	 * The page might have been dissolved from under our feet, so make sure
2811  	 * to carefully check the state under the lock.
2812  	 * Return success when racing as if we dissolved the page ourselves.
2813  	 */
2814  	spin_lock_irq(&hugetlb_lock);
2815  	if (PageHuge(page)) {
2816  		head = compound_head(page);
2817  		h = page_hstate(head);
2818  	} else {
2819  		spin_unlock_irq(&hugetlb_lock);
2820  		return 0;
2821  	}
2822  	spin_unlock_irq(&hugetlb_lock);
2823  
2824  	/*
2825  	 * Fence off gigantic pages as there is a cyclic dependency between
2826  	 * alloc_contig_range and them. Return -ENOMEM as this has the effect
2827  	 * of bailing out right away without further retrying.
2828  	 */
2829  	if (hstate_is_gigantic(h))
2830  		return -ENOMEM;
2831  
2832  	if (page_count(head) && isolate_huge_page(head, list))
2833  		ret = 0;
2834  	else if (!page_count(head))
2835  		ret = alloc_and_dissolve_huge_page(h, head, list);
2836  
2837  	return ret;
2838  }
2839  
2840  struct page *alloc_huge_page(struct vm_area_struct *vma,
2841  				    unsigned long addr, int avoid_reserve)
2842  {
2843  	struct hugepage_subpool *spool = subpool_vma(vma);
2844  	struct hstate *h = hstate_vma(vma);
2845  	struct page *page;
2846  	long map_chg, map_commit;
2847  	long gbl_chg;
2848  	int ret, idx;
2849  	struct hugetlb_cgroup *h_cg;
2850  	bool deferred_reserve;
2851  
2852  	idx = hstate_index(h);
2853  	/*
2854  	 * Examine the region/reserve map to determine if the process
2855  	 * has a reservation for the page to be allocated.  A return
2856  	 * code of zero indicates a reservation exists (no change).
2857  	 */
2858  	map_chg = gbl_chg = vma_needs_reservation(h, vma, addr);
2859  	if (map_chg < 0)
2860  		return ERR_PTR(-ENOMEM);
2861  
2862  	/*
2863  	 * Processes that did not create the mapping will have no
2864  	 * reserves as indicated by the region/reserve map. Check
2865  	 * that the allocation will not exceed the subpool limit.
2866  	 * Allocations for MAP_NORESERVE mappings also need to be
2867  	 * checked against any subpool limit.
2868  	 */
2869  	if (map_chg || avoid_reserve) {
2870  		gbl_chg = hugepage_subpool_get_pages(spool, 1);
2871  		if (gbl_chg < 0) {
2872  			vma_end_reservation(h, vma, addr);
2873  			return ERR_PTR(-ENOSPC);
2874  		}
2875  
2876  		/*
2877  		 * Even though there was no reservation in the region/reserve
2878  		 * map, there could be reservations associated with the
2879  		 * subpool that can be used.  This would be indicated if the
2880  		 * return value of hugepage_subpool_get_pages() is zero.
2881  		 * However, if avoid_reserve is specified we still avoid even
2882  		 * the subpool reservations.
2883  		 */
2884  		if (avoid_reserve)
2885  			gbl_chg = 1;
2886  	}
2887  
2888  	/* If this allocation is not consuming a reservation, charge it now.
2889  	 */
2890  	deferred_reserve = map_chg || avoid_reserve;
2891  	if (deferred_reserve) {
2892  		ret = hugetlb_cgroup_charge_cgroup_rsvd(
2893  			idx, pages_per_huge_page(h), &h_cg);
2894  		if (ret)
2895  			goto out_subpool_put;
2896  	}
2897  
2898  	ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
2899  	if (ret)
2900  		goto out_uncharge_cgroup_reservation;
2901  
2902  	spin_lock_irq(&hugetlb_lock);
2903  	/*
2904  	 * glb_chg is passed to indicate whether or not a page must be taken
2905  	 * from the global free pool (global change).  gbl_chg == 0 indicates
2906  	 * a reservation exists for the allocation.
2907  	 */
2908  	page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg);
2909  	if (!page) {
2910  		spin_unlock_irq(&hugetlb_lock);
2911  		page = alloc_buddy_huge_page_with_mpol(h, vma, addr);
2912  		if (!page)
2913  			goto out_uncharge_cgroup;
2914  		if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) {
2915  			SetHPageRestoreReserve(page);
2916  			h->resv_huge_pages--;
2917  		}
2918  		spin_lock_irq(&hugetlb_lock);
2919  		list_add(&page->lru, &h->hugepage_activelist);
2920  		/* Fall through */
2921  	}
2922  	hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page);
2923  	/* If allocation is not consuming a reservation, also store the
2924  	 * hugetlb_cgroup pointer on the page.
2925  	 */
2926  	if (deferred_reserve) {
2927  		hugetlb_cgroup_commit_charge_rsvd(idx, pages_per_huge_page(h),
2928  						  h_cg, page);
2929  	}
2930  
2931  	spin_unlock_irq(&hugetlb_lock);
2932  
2933  	hugetlb_set_page_subpool(page, spool);
2934  
2935  	map_commit = vma_commit_reservation(h, vma, addr);
2936  	if (unlikely(map_chg > map_commit)) {
2937  		/*
2938  		 * The page was added to the reservation map between
2939  		 * vma_needs_reservation and vma_commit_reservation.
2940  		 * This indicates a race with hugetlb_reserve_pages.
2941  		 * Adjust for the subpool count incremented above AND
2942  		 * in hugetlb_reserve_pages for the same page.  Also,
2943  		 * the reservation count added in hugetlb_reserve_pages
2944  		 * no longer applies.
2945  		 */
2946  		long rsv_adjust;
2947  
2948  		rsv_adjust = hugepage_subpool_put_pages(spool, 1);
2949  		hugetlb_acct_memory(h, -rsv_adjust);
2950  		if (deferred_reserve)
2951  			hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h),
2952  					pages_per_huge_page(h), page);
2953  	}
2954  	return page;
2955  
2956  out_uncharge_cgroup:
2957  	hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg);
2958  out_uncharge_cgroup_reservation:
2959  	if (deferred_reserve)
2960  		hugetlb_cgroup_uncharge_cgroup_rsvd(idx, pages_per_huge_page(h),
2961  						    h_cg);
2962  out_subpool_put:
2963  	if (map_chg || avoid_reserve)
2964  		hugepage_subpool_put_pages(spool, 1);
2965  	vma_end_reservation(h, vma, addr);
2966  	return ERR_PTR(-ENOSPC);
2967  }
2968  
2969  int alloc_bootmem_huge_page(struct hstate *h, int nid)
2970  	__attribute__ ((weak, alias("__alloc_bootmem_huge_page")));
2971  int __alloc_bootmem_huge_page(struct hstate *h, int nid)
2972  {
2973  	struct huge_bootmem_page *m = NULL; /* initialize for clang */
2974  	int nr_nodes, node;
2975  
2976  	if (nid != NUMA_NO_NODE && nid >= nr_online_nodes)
2977  		return 0;
2978  	/* do node specific alloc */
2979  	if (nid != NUMA_NO_NODE) {
2980  		m = memblock_alloc_try_nid_raw(huge_page_size(h), huge_page_size(h),
2981  				0, MEMBLOCK_ALLOC_ACCESSIBLE, nid);
2982  		if (!m)
2983  			return 0;
2984  		goto found;
2985  	}
2986  	/* allocate from next node when distributing huge pages */
2987  	for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) {
2988  		m = memblock_alloc_try_nid_raw(
2989  				huge_page_size(h), huge_page_size(h),
2990  				0, MEMBLOCK_ALLOC_ACCESSIBLE, node);
2991  		/*
2992  		 * Use the beginning of the huge page to store the
2993  		 * huge_bootmem_page struct (until gather_bootmem
2994  		 * puts them into the mem_map).
2995  		 */
2996  		if (!m)
2997  			return 0;
2998  		goto found;
2999  	}
3000  
3001  found:
3002  	/* Put them into a private list first because mem_map is not up yet */
3003  	INIT_LIST_HEAD(&m->list);
3004  	list_add(&m->list, &huge_boot_pages);
3005  	m->hstate = h;
3006  	return 1;
3007  }
3008  
3009  /*
3010   * Put bootmem huge pages into the standard lists after mem_map is up.
3011   * Note: This only applies to gigantic (order > MAX_ORDER) pages.
3012   */
3013  static void __init gather_bootmem_prealloc(void)
3014  {
3015  	struct huge_bootmem_page *m;
3016  
3017  	list_for_each_entry(m, &huge_boot_pages, list) {
3018  		struct page *page = virt_to_page(m);
3019  		struct hstate *h = m->hstate;
3020  
3021  		VM_BUG_ON(!hstate_is_gigantic(h));
3022  		WARN_ON(page_count(page) != 1);
3023  		if (prep_compound_gigantic_page(page, huge_page_order(h))) {
3024  			WARN_ON(PageReserved(page));
3025  			prep_new_huge_page(h, page, page_to_nid(page));
3026  			put_page(page); /* add to the hugepage allocator */
3027  		} else {
3028  			/* VERY unlikely inflated ref count on a tail page */
3029  			free_gigantic_page(page, huge_page_order(h));
3030  		}
3031  
3032  		/*
3033  		 * We need to restore the 'stolen' pages to totalram_pages
3034  		 * in order to fix confusing memory reports from free(1) and
3035  		 * other side-effects, like CommitLimit going negative.
3036  		 */
3037  		adjust_managed_page_count(page, pages_per_huge_page(h));
3038  		cond_resched();
3039  	}
3040  }
3041  static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid)
3042  {
3043  	unsigned long i;
3044  	char buf[32];
3045  
3046  	for (i = 0; i < h->max_huge_pages_node[nid]; ++i) {
3047  		if (hstate_is_gigantic(h)) {
3048  			if (!alloc_bootmem_huge_page(h, nid))
3049  				break;
3050  		} else {
3051  			struct page *page;
3052  			gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
3053  
3054  			page = alloc_fresh_huge_page(h, gfp_mask, nid,
3055  					&node_states[N_MEMORY], NULL);
3056  			if (!page)
3057  				break;
3058  			put_page(page); /* free it into the hugepage allocator */
3059  		}
3060  		cond_resched();
3061  	}
3062  	if (i == h->max_huge_pages_node[nid])
3063  		return;
3064  
3065  	string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
3066  	pr_warn("HugeTLB: allocating %u of page size %s failed node%d.  Only allocated %lu hugepages.\n",
3067  		h->max_huge_pages_node[nid], buf, nid, i);
3068  	h->max_huge_pages -= (h->max_huge_pages_node[nid] - i);
3069  	h->max_huge_pages_node[nid] = i;
3070  }
3071  
3072  static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
3073  {
3074  	unsigned long i;
3075  	nodemask_t *node_alloc_noretry;
3076  	bool node_specific_alloc = false;
3077  
3078  	/* skip gigantic hugepages allocation if hugetlb_cma enabled */
3079  	if (hstate_is_gigantic(h) && hugetlb_cma_size) {
3080  		pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n");
3081  		return;
3082  	}
3083  
3084  	/* do node specific alloc */
3085  	for (i = 0; i < nr_online_nodes; i++) {
3086  		if (h->max_huge_pages_node[i] > 0) {
3087  			hugetlb_hstate_alloc_pages_onenode(h, i);
3088  			node_specific_alloc = true;
3089  		}
3090  	}
3091  
3092  	if (node_specific_alloc)
3093  		return;
3094  
3095  	/* below will do all node balanced alloc */
3096  	if (!hstate_is_gigantic(h)) {
3097  		/*
3098  		 * Bit mask controlling how hard we retry per-node allocations.
3099  		 * Ignore errors as lower level routines can deal with
3100  		 * node_alloc_noretry == NULL.  If this kmalloc fails at boot
3101  		 * time, we are likely in bigger trouble.
3102  		 */
3103  		node_alloc_noretry = kmalloc(sizeof(*node_alloc_noretry),
3104  						GFP_KERNEL);
3105  	} else {
3106  		/* allocations done at boot time */
3107  		node_alloc_noretry = NULL;
3108  	}
3109  
3110  	/* bit mask controlling how hard we retry per-node allocations */
3111  	if (node_alloc_noretry)
3112  		nodes_clear(*node_alloc_noretry);
3113  
3114  	for (i = 0; i < h->max_huge_pages; ++i) {
3115  		if (hstate_is_gigantic(h)) {
3116  			if (!alloc_bootmem_huge_page(h, NUMA_NO_NODE))
3117  				break;
3118  		} else if (!alloc_pool_huge_page(h,
3119  					 &node_states[N_MEMORY],
3120  					 node_alloc_noretry))
3121  			break;
3122  		cond_resched();
3123  	}
3124  	if (i < h->max_huge_pages) {
3125  		char buf[32];
3126  
3127  		string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
3128  		pr_warn("HugeTLB: allocating %lu of page size %s failed.  Only allocated %lu hugepages.\n",
3129  			h->max_huge_pages, buf, i);
3130  		h->max_huge_pages = i;
3131  	}
3132  	kfree(node_alloc_noretry);
3133  }
3134  
3135  static void __init hugetlb_init_hstates(void)
3136  {
3137  	struct hstate *h, *h2;
3138  
3139  	for_each_hstate(h) {
3140  		if (minimum_order > huge_page_order(h))
3141  			minimum_order = huge_page_order(h);
3142  
3143  		/* oversize hugepages were init'ed in early boot */
3144  		if (!hstate_is_gigantic(h))
3145  			hugetlb_hstate_alloc_pages(h);
3146  
3147  		/*
3148  		 * Set demote order for each hstate.  Note that
3149  		 * h->demote_order is initially 0.
3150  		 * - We can not demote gigantic pages if runtime freeing
3151  		 *   is not supported, so skip this.
3152  		 * - If CMA allocation is possible, we can not demote
3153  		 *   HUGETLB_PAGE_ORDER or smaller size pages.
3154  		 */
3155  		if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
3156  			continue;
3157  		if (hugetlb_cma_size && h->order <= HUGETLB_PAGE_ORDER)
3158  			continue;
3159  		for_each_hstate(h2) {
3160  			if (h2 == h)
3161  				continue;
3162  			if (h2->order < h->order &&
3163  			    h2->order > h->demote_order)
3164  				h->demote_order = h2->order;
3165  		}
3166  	}
3167  	VM_BUG_ON(minimum_order == UINT_MAX);
3168  }
3169  
3170  static void __init report_hugepages(void)
3171  {
3172  	struct hstate *h;
3173  
3174  	for_each_hstate(h) {
3175  		char buf[32];
3176  
3177  		string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
3178  		pr_info("HugeTLB registered %s page size, pre-allocated %ld pages\n",
3179  			buf, h->free_huge_pages);
3180  	}
3181  }
3182  
3183  #ifdef CONFIG_HIGHMEM
3184  static void try_to_free_low(struct hstate *h, unsigned long count,
3185  						nodemask_t *nodes_allowed)
3186  {
3187  	int i;
3188  	LIST_HEAD(page_list);
3189  
3190  	lockdep_assert_held(&hugetlb_lock);
3191  	if (hstate_is_gigantic(h))
3192  		return;
3193  
3194  	/*
3195  	 * Collect pages to be freed on a list, and free after dropping lock
3196  	 */
3197  	for_each_node_mask(i, *nodes_allowed) {
3198  		struct page *page, *next;
3199  		struct list_head *freel = &h->hugepage_freelists[i];
3200  		list_for_each_entry_safe(page, next, freel, lru) {
3201  			if (count >= h->nr_huge_pages)
3202  				goto out;
3203  			if (PageHighMem(page))
3204  				continue;
3205  			remove_hugetlb_page(h, page, false);
3206  			list_add(&page->lru, &page_list);
3207  		}
3208  	}
3209  
3210  out:
3211  	spin_unlock_irq(&hugetlb_lock);
3212  	update_and_free_pages_bulk(h, &page_list);
3213  	spin_lock_irq(&hugetlb_lock);
3214  }
3215  #else
3216  static inline void try_to_free_low(struct hstate *h, unsigned long count,
3217  						nodemask_t *nodes_allowed)
3218  {
3219  }
3220  #endif
3221  
3222  /*
3223   * Increment or decrement surplus_huge_pages.  Keep node-specific counters
3224   * balanced by operating on them in a round-robin fashion.
3225   * Returns 1 if an adjustment was made.
3226   */
3227  static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed,
3228  				int delta)
3229  {
3230  	int nr_nodes, node;
3231  
3232  	lockdep_assert_held(&hugetlb_lock);
3233  	VM_BUG_ON(delta != -1 && delta != 1);
3234  
3235  	if (delta < 0) {
3236  		for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
3237  			if (h->surplus_huge_pages_node[node])
3238  				goto found;
3239  		}
3240  	} else {
3241  		for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
3242  			if (h->surplus_huge_pages_node[node] <
3243  					h->nr_huge_pages_node[node])
3244  				goto found;
3245  		}
3246  	}
3247  	return 0;
3248  
3249  found:
3250  	h->surplus_huge_pages += delta;
3251  	h->surplus_huge_pages_node[node] += delta;
3252  	return 1;
3253  }
3254  
3255  #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
3256  static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
3257  			      nodemask_t *nodes_allowed)
3258  {
3259  	unsigned long min_count, ret;
3260  	struct page *page;
3261  	LIST_HEAD(page_list);
3262  	NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL);
3263  
3264  	/*
3265  	 * Bit mask controlling how hard we retry per-node allocations.
3266  	 * If we can not allocate the bit mask, do not attempt to allocate
3267  	 * the requested huge pages.
3268  	 */
3269  	if (node_alloc_noretry)
3270  		nodes_clear(*node_alloc_noretry);
3271  	else
3272  		return -ENOMEM;
3273  
3274  	/*
3275  	 * resize_lock mutex prevents concurrent adjustments to number of
3276  	 * pages in hstate via the proc/sysfs interfaces.
3277  	 */
3278  	mutex_lock(&h->resize_lock);
3279  	flush_free_hpage_work(h);
3280  	spin_lock_irq(&hugetlb_lock);
3281  
3282  	/*
3283  	 * Check for a node specific request.
3284  	 * Changing node specific huge page count may require a corresponding
3285  	 * change to the global count.  In any case, the passed node mask
3286  	 * (nodes_allowed) will restrict alloc/free to the specified node.
3287  	 */
3288  	if (nid != NUMA_NO_NODE) {
3289  		unsigned long old_count = count;
3290  
3291  		count += h->nr_huge_pages - h->nr_huge_pages_node[nid];
3292  		/*
3293  		 * User may have specified a large count value which caused the
3294  		 * above calculation to overflow.  In this case, they wanted
3295  		 * to allocate as many huge pages as possible.  Set count to
3296  		 * largest possible value to align with their intention.
3297  		 */
3298  		if (count < old_count)
3299  			count = ULONG_MAX;
3300  	}
3301  
3302  	/*
3303  	 * Gigantic pages runtime allocation depend on the capability for large
3304  	 * page range allocation.
3305  	 * If the system does not provide this feature, return an error when
3306  	 * the user tries to allocate gigantic pages but let the user free the
3307  	 * boottime allocated gigantic pages.
3308  	 */
3309  	if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) {
3310  		if (count > persistent_huge_pages(h)) {
3311  			spin_unlock_irq(&hugetlb_lock);
3312  			mutex_unlock(&h->resize_lock);
3313  			NODEMASK_FREE(node_alloc_noretry);
3314  			return -EINVAL;
3315  		}
3316  		/* Fall through to decrease pool */
3317  	}
3318  
3319  	/*
3320  	 * Increase the pool size
3321  	 * First take pages out of surplus state.  Then make up the
3322  	 * remaining difference by allocating fresh huge pages.
3323  	 *
3324  	 * We might race with alloc_surplus_huge_page() here and be unable
3325  	 * to convert a surplus huge page to a normal huge page. That is
3326  	 * not critical, though, it just means the overall size of the
3327  	 * pool might be one hugepage larger than it needs to be, but
3328  	 * within all the constraints specified by the sysctls.
3329  	 */
3330  	while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
3331  		if (!adjust_pool_surplus(h, nodes_allowed, -1))
3332  			break;
3333  	}
3334  
3335  	while (count > persistent_huge_pages(h)) {
3336  		/*
3337  		 * If this allocation races such that we no longer need the
3338  		 * page, free_huge_page will handle it by freeing the page
3339  		 * and reducing the surplus.
3340  		 */
3341  		spin_unlock_irq(&hugetlb_lock);
3342  
3343  		/* yield cpu to avoid soft lockup */
3344  		cond_resched();
3345  
3346  		ret = alloc_pool_huge_page(h, nodes_allowed,
3347  						node_alloc_noretry);
3348  		spin_lock_irq(&hugetlb_lock);
3349  		if (!ret)
3350  			goto out;
3351  
3352  		/* Bail for signals. Probably ctrl-c from user */
3353  		if (signal_pending(current))
3354  			goto out;
3355  	}
3356  
3357  	/*
3358  	 * Decrease the pool size
3359  	 * First return free pages to the buddy allocator (being careful
3360  	 * to keep enough around to satisfy reservations).  Then place
3361  	 * pages into surplus state as needed so the pool will shrink
3362  	 * to the desired size as pages become free.
3363  	 *
3364  	 * By placing pages into the surplus state independent of the
3365  	 * overcommit value, we are allowing the surplus pool size to
3366  	 * exceed overcommit. There are few sane options here. Since
3367  	 * alloc_surplus_huge_page() is checking the global counter,
3368  	 * though, we'll note that we're not allowed to exceed surplus
3369  	 * and won't grow the pool anywhere else. Not until one of the
3370  	 * sysctls are changed, or the surplus pages go out of use.
3371  	 */
3372  	min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
3373  	min_count = max(count, min_count);
3374  	try_to_free_low(h, min_count, nodes_allowed);
3375  
3376  	/*
3377  	 * Collect pages to be removed on list without dropping lock
3378  	 */
3379  	while (min_count < persistent_huge_pages(h)) {
3380  		page = remove_pool_huge_page(h, nodes_allowed, 0);
3381  		if (!page)
3382  			break;
3383  
3384  		list_add(&page->lru, &page_list);
3385  	}
3386  	/* free the pages after dropping lock */
3387  	spin_unlock_irq(&hugetlb_lock);
3388  	update_and_free_pages_bulk(h, &page_list);
3389  	flush_free_hpage_work(h);
3390  	spin_lock_irq(&hugetlb_lock);
3391  
3392  	while (count < persistent_huge_pages(h)) {
3393  		if (!adjust_pool_surplus(h, nodes_allowed, 1))
3394  			break;
3395  	}
3396  out:
3397  	h->max_huge_pages = persistent_huge_pages(h);
3398  	spin_unlock_irq(&hugetlb_lock);
3399  	mutex_unlock(&h->resize_lock);
3400  
3401  	NODEMASK_FREE(node_alloc_noretry);
3402  
3403  	return 0;
3404  }
3405  
3406  static int demote_free_huge_page(struct hstate *h, struct page *page)
3407  {
3408  	int i, nid = page_to_nid(page);
3409  	struct hstate *target_hstate;
3410  	int rc = 0;
3411  
3412  	target_hstate = size_to_hstate(PAGE_SIZE << h->demote_order);
3413  
3414  	remove_hugetlb_page_for_demote(h, page, false);
3415  	spin_unlock_irq(&hugetlb_lock);
3416  
3417  	rc = alloc_huge_page_vmemmap(h, page);
3418  	if (rc) {
3419  		/* Allocation of vmemmmap failed, we can not demote page */
3420  		spin_lock_irq(&hugetlb_lock);
3421  		set_page_refcounted(page);
3422  		add_hugetlb_page(h, page, false);
3423  		return rc;
3424  	}
3425  
3426  	/*
3427  	 * Use destroy_compound_hugetlb_page_for_demote for all huge page
3428  	 * sizes as it will not ref count pages.
3429  	 */
3430  	destroy_compound_hugetlb_page_for_demote(page, huge_page_order(h));
3431  
3432  	/*
3433  	 * Taking target hstate mutex synchronizes with set_max_huge_pages.
3434  	 * Without the mutex, pages added to target hstate could be marked
3435  	 * as surplus.
3436  	 *
3437  	 * Note that we already hold h->resize_lock.  To prevent deadlock,
3438  	 * use the convention of always taking larger size hstate mutex first.
3439  	 */
3440  	mutex_lock(&target_hstate->resize_lock);
3441  	for (i = 0; i < pages_per_huge_page(h);
3442  				i += pages_per_huge_page(target_hstate)) {
3443  		if (hstate_is_gigantic(target_hstate))
3444  			prep_compound_gigantic_page_for_demote(page + i,
3445  							target_hstate->order);
3446  		else
3447  			prep_compound_page(page + i, target_hstate->order);
3448  		set_page_private(page + i, 0);
3449  		set_page_refcounted(page + i);
3450  		prep_new_huge_page(target_hstate, page + i, nid);
3451  		put_page(page + i);
3452  	}
3453  	mutex_unlock(&target_hstate->resize_lock);
3454  
3455  	spin_lock_irq(&hugetlb_lock);
3456  
3457  	/*
3458  	 * Not absolutely necessary, but for consistency update max_huge_pages
3459  	 * based on pool changes for the demoted page.
3460  	 */
3461  	h->max_huge_pages--;
3462  	target_hstate->max_huge_pages += pages_per_huge_page(h);
3463  
3464  	return rc;
3465  }
3466  
3467  static int demote_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
3468  	__must_hold(&hugetlb_lock)
3469  {
3470  	int nr_nodes, node;
3471  	struct page *page;
3472  	int rc = 0;
3473  
3474  	lockdep_assert_held(&hugetlb_lock);
3475  
3476  	/* We should never get here if no demote order */
3477  	if (!h->demote_order) {
3478  		pr_warn("HugeTLB: NULL demote order passed to demote_pool_huge_page.\n");
3479  		return -EINVAL;		/* internal error */
3480  	}
3481  
3482  	for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
3483  		if (!list_empty(&h->hugepage_freelists[node])) {
3484  			page = list_entry(h->hugepage_freelists[node].next,
3485  					struct page, lru);
3486  			rc = demote_free_huge_page(h, page);
3487  			break;
3488  		}
3489  	}
3490  
3491  	return rc;
3492  }
3493  
3494  #define HSTATE_ATTR_RO(_name) \
3495  	static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
3496  
3497  #define HSTATE_ATTR_WO(_name) \
3498  	static struct kobj_attribute _name##_attr = __ATTR_WO(_name)
3499  
3500  #define HSTATE_ATTR(_name) \
3501  	static struct kobj_attribute _name##_attr = \
3502  		__ATTR(_name, 0644, _name##_show, _name##_store)
3503  
3504  static struct kobject *hugepages_kobj;
3505  static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
3506  
3507  static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp);
3508  
3509  static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp)
3510  {
3511  	int i;
3512  
3513  	for (i = 0; i < HUGE_MAX_HSTATE; i++)
3514  		if (hstate_kobjs[i] == kobj) {
3515  			if (nidp)
3516  				*nidp = NUMA_NO_NODE;
3517  			return &hstates[i];
3518  		}
3519  
3520  	return kobj_to_node_hstate(kobj, nidp);
3521  }
3522  
3523  static ssize_t nr_hugepages_show_common(struct kobject *kobj,
3524  					struct kobj_attribute *attr, char *buf)
3525  {
3526  	struct hstate *h;
3527  	unsigned long nr_huge_pages;
3528  	int nid;
3529  
3530  	h = kobj_to_hstate(kobj, &nid);
3531  	if (nid == NUMA_NO_NODE)
3532  		nr_huge_pages = h->nr_huge_pages;
3533  	else
3534  		nr_huge_pages = h->nr_huge_pages_node[nid];
3535  
3536  	return sysfs_emit(buf, "%lu\n", nr_huge_pages);
3537  }
3538  
3539  static ssize_t __nr_hugepages_store_common(bool obey_mempolicy,
3540  					   struct hstate *h, int nid,
3541  					   unsigned long count, size_t len)
3542  {
3543  	int err;
3544  	nodemask_t nodes_allowed, *n_mask;
3545  
3546  	if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
3547  		return -EINVAL;
3548  
3549  	if (nid == NUMA_NO_NODE) {
3550  		/*
3551  		 * global hstate attribute
3552  		 */
3553  		if (!(obey_mempolicy &&
3554  				init_nodemask_of_mempolicy(&nodes_allowed)))
3555  			n_mask = &node_states[N_MEMORY];
3556  		else
3557  			n_mask = &nodes_allowed;
3558  	} else {
3559  		/*
3560  		 * Node specific request.  count adjustment happens in
3561  		 * set_max_huge_pages() after acquiring hugetlb_lock.
3562  		 */
3563  		init_nodemask_of_node(&nodes_allowed, nid);
3564  		n_mask = &nodes_allowed;
3565  	}
3566  
3567  	err = set_max_huge_pages(h, count, nid, n_mask);
3568  
3569  	return err ? err : len;
3570  }
3571  
3572  static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
3573  					 struct kobject *kobj, const char *buf,
3574  					 size_t len)
3575  {
3576  	struct hstate *h;
3577  	unsigned long count;
3578  	int nid;
3579  	int err;
3580  
3581  	err = kstrtoul(buf, 10, &count);
3582  	if (err)
3583  		return err;
3584  
3585  	h = kobj_to_hstate(kobj, &nid);
3586  	return __nr_hugepages_store_common(obey_mempolicy, h, nid, count, len);
3587  }
3588  
3589  static ssize_t nr_hugepages_show(struct kobject *kobj,
3590  				       struct kobj_attribute *attr, char *buf)
3591  {
3592  	return nr_hugepages_show_common(kobj, attr, buf);
3593  }
3594  
3595  static ssize_t nr_hugepages_store(struct kobject *kobj,
3596  	       struct kobj_attribute *attr, const char *buf, size_t len)
3597  {
3598  	return nr_hugepages_store_common(false, kobj, buf, len);
3599  }
3600  HSTATE_ATTR(nr_hugepages);
3601  
3602  #ifdef CONFIG_NUMA
3603  
3604  /*
3605   * hstate attribute for optionally mempolicy-based constraint on persistent
3606   * huge page alloc/free.
3607   */
3608  static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj,
3609  					   struct kobj_attribute *attr,
3610  					   char *buf)
3611  {
3612  	return nr_hugepages_show_common(kobj, attr, buf);
3613  }
3614  
3615  static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj,
3616  	       struct kobj_attribute *attr, const char *buf, size_t len)
3617  {
3618  	return nr_hugepages_store_common(true, kobj, buf, len);
3619  }
3620  HSTATE_ATTR(nr_hugepages_mempolicy);
3621  #endif
3622  
3623  
3624  static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
3625  					struct kobj_attribute *attr, char *buf)
3626  {
3627  	struct hstate *h = kobj_to_hstate(kobj, NULL);
3628  	return sysfs_emit(buf, "%lu\n", h->nr_overcommit_huge_pages);
3629  }
3630  
3631  static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
3632  		struct kobj_attribute *attr, const char *buf, size_t count)
3633  {
3634  	int err;
3635  	unsigned long input;
3636  	struct hstate *h = kobj_to_hstate(kobj, NULL);
3637  
3638  	if (hstate_is_gigantic(h))
3639  		return -EINVAL;
3640  
3641  	err = kstrtoul(buf, 10, &input);
3642  	if (err)
3643  		return err;
3644  
3645  	spin_lock_irq(&hugetlb_lock);
3646  	h->nr_overcommit_huge_pages = input;
3647  	spin_unlock_irq(&hugetlb_lock);
3648  
3649  	return count;
3650  }
3651  HSTATE_ATTR(nr_overcommit_hugepages);
3652  
3653  static ssize_t free_hugepages_show(struct kobject *kobj,
3654  					struct kobj_attribute *attr, char *buf)
3655  {
3656  	struct hstate *h;
3657  	unsigned long free_huge_pages;
3658  	int nid;
3659  
3660  	h = kobj_to_hstate(kobj, &nid);
3661  	if (nid == NUMA_NO_NODE)
3662  		free_huge_pages = h->free_huge_pages;
3663  	else
3664  		free_huge_pages = h->free_huge_pages_node[nid];
3665  
3666  	return sysfs_emit(buf, "%lu\n", free_huge_pages);
3667  }
3668  HSTATE_ATTR_RO(free_hugepages);
3669  
3670  static ssize_t resv_hugepages_show(struct kobject *kobj,
3671  					struct kobj_attribute *attr, char *buf)
3672  {
3673  	struct hstate *h = kobj_to_hstate(kobj, NULL);
3674  	return sysfs_emit(buf, "%lu\n", h->resv_huge_pages);
3675  }
3676  HSTATE_ATTR_RO(resv_hugepages);
3677  
3678  static ssize_t surplus_hugepages_show(struct kobject *kobj,
3679  					struct kobj_attribute *attr, char *buf)
3680  {
3681  	struct hstate *h;
3682  	unsigned long surplus_huge_pages;
3683  	int nid;
3684  
3685  	h = kobj_to_hstate(kobj, &nid);
3686  	if (nid == NUMA_NO_NODE)
3687  		surplus_huge_pages = h->surplus_huge_pages;
3688  	else
3689  		surplus_huge_pages = h->surplus_huge_pages_node[nid];
3690  
3691  	return sysfs_emit(buf, "%lu\n", surplus_huge_pages);
3692  }
3693  HSTATE_ATTR_RO(surplus_hugepages);
3694  
3695  static ssize_t demote_store(struct kobject *kobj,
3696  	       struct kobj_attribute *attr, const char *buf, size_t len)
3697  {
3698  	unsigned long nr_demote;
3699  	unsigned long nr_available;
3700  	nodemask_t nodes_allowed, *n_mask;
3701  	struct hstate *h;
3702  	int err = 0;
3703  	int nid;
3704  
3705  	err = kstrtoul(buf, 10, &nr_demote);
3706  	if (err)
3707  		return err;
3708  	h = kobj_to_hstate(kobj, &nid);
3709  
3710  	if (nid != NUMA_NO_NODE) {
3711  		init_nodemask_of_node(&nodes_allowed, nid);
3712  		n_mask = &nodes_allowed;
3713  	} else {
3714  		n_mask = &node_states[N_MEMORY];
3715  	}
3716  
3717  	/* Synchronize with other sysfs operations modifying huge pages */
3718  	mutex_lock(&h->resize_lock);
3719  	spin_lock_irq(&hugetlb_lock);
3720  
3721  	while (nr_demote) {
3722  		/*
3723  		 * Check for available pages to demote each time thorough the
3724  		 * loop as demote_pool_huge_page will drop hugetlb_lock.
3725  		 */
3726  		if (nid != NUMA_NO_NODE)
3727  			nr_available = h->free_huge_pages_node[nid];
3728  		else
3729  			nr_available = h->free_huge_pages;
3730  		nr_available -= h->resv_huge_pages;
3731  		if (!nr_available)
3732  			break;
3733  
3734  		err = demote_pool_huge_page(h, n_mask);
3735  		if (err)
3736  			break;
3737  
3738  		nr_demote--;
3739  	}
3740  
3741  	spin_unlock_irq(&hugetlb_lock);
3742  	mutex_unlock(&h->resize_lock);
3743  
3744  	if (err)
3745  		return err;
3746  	return len;
3747  }
3748  HSTATE_ATTR_WO(demote);
3749  
3750  static ssize_t demote_size_show(struct kobject *kobj,
3751  					struct kobj_attribute *attr, char *buf)
3752  {
3753  	int nid;
3754  	struct hstate *h = kobj_to_hstate(kobj, &nid);
3755  	unsigned long demote_size = (PAGE_SIZE << h->demote_order) / SZ_1K;
3756  
3757  	return sysfs_emit(buf, "%lukB\n", demote_size);
3758  }
3759  
3760  static ssize_t demote_size_store(struct kobject *kobj,
3761  					struct kobj_attribute *attr,
3762  					const char *buf, size_t count)
3763  {
3764  	struct hstate *h, *demote_hstate;
3765  	unsigned long demote_size;
3766  	unsigned int demote_order;
3767  	int nid;
3768  
3769  	demote_size = (unsigned long)memparse(buf, NULL);
3770  
3771  	demote_hstate = size_to_hstate(demote_size);
3772  	if (!demote_hstate)
3773  		return -EINVAL;
3774  	demote_order = demote_hstate->order;
3775  	if (demote_order < HUGETLB_PAGE_ORDER)
3776  		return -EINVAL;
3777  
3778  	/* demote order must be smaller than hstate order */
3779  	h = kobj_to_hstate(kobj, &nid);
3780  	if (demote_order >= h->order)
3781  		return -EINVAL;
3782  
3783  	/* resize_lock synchronizes access to demote size and writes */
3784  	mutex_lock(&h->resize_lock);
3785  	h->demote_order = demote_order;
3786  	mutex_unlock(&h->resize_lock);
3787  
3788  	return count;
3789  }
3790  HSTATE_ATTR(demote_size);
3791  
3792  static struct attribute *hstate_attrs[] = {
3793  	&nr_hugepages_attr.attr,
3794  	&nr_overcommit_hugepages_attr.attr,
3795  	&free_hugepages_attr.attr,
3796  	&resv_hugepages_attr.attr,
3797  	&surplus_hugepages_attr.attr,
3798  #ifdef CONFIG_NUMA
3799  	&nr_hugepages_mempolicy_attr.attr,
3800  #endif
3801  	NULL,
3802  };
3803  
3804  static const struct attribute_group hstate_attr_group = {
3805  	.attrs = hstate_attrs,
3806  };
3807  
3808  static struct attribute *hstate_demote_attrs[] = {
3809  	&demote_size_attr.attr,
3810  	&demote_attr.attr,
3811  	NULL,
3812  };
3813  
3814  static const struct attribute_group hstate_demote_attr_group = {
3815  	.attrs = hstate_demote_attrs,
3816  };
3817  
3818  static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
3819  				    struct kobject **hstate_kobjs,
3820  				    const struct attribute_group *hstate_attr_group)
3821  {
3822  	int retval;
3823  	int hi = hstate_index(h);
3824  
3825  	hstate_kobjs[hi] = kobject_create_and_add(h->name, parent);
3826  	if (!hstate_kobjs[hi])
3827  		return -ENOMEM;
3828  
3829  	retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group);
3830  	if (retval) {
3831  		kobject_put(hstate_kobjs[hi]);
3832  		hstate_kobjs[hi] = NULL;
3833  	}
3834  
3835  	if (h->demote_order) {
3836  		if (sysfs_create_group(hstate_kobjs[hi],
3837  					&hstate_demote_attr_group))
3838  			pr_warn("HugeTLB unable to create demote interfaces for %s\n", h->name);
3839  	}
3840  
3841  	return retval;
3842  }
3843  
3844  static void __init hugetlb_sysfs_init(void)
3845  {
3846  	struct hstate *h;
3847  	int err;
3848  
3849  	hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj);
3850  	if (!hugepages_kobj)
3851  		return;
3852  
3853  	for_each_hstate(h) {
3854  		err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
3855  					 hstate_kobjs, &hstate_attr_group);
3856  		if (err)
3857  			pr_err("HugeTLB: Unable to add hstate %s", h->name);
3858  	}
3859  }
3860  
3861  #ifdef CONFIG_NUMA
3862  
3863  /*
3864   * node_hstate/s - associate per node hstate attributes, via their kobjects,
3865   * with node devices in node_devices[] using a parallel array.  The array
3866   * index of a node device or _hstate == node id.
3867   * This is here to avoid any static dependency of the node device driver, in
3868   * the base kernel, on the hugetlb module.
3869   */
3870  struct node_hstate {
3871  	struct kobject		*hugepages_kobj;
3872  	struct kobject		*hstate_kobjs[HUGE_MAX_HSTATE];
3873  };
3874  static struct node_hstate node_hstates[MAX_NUMNODES];
3875  
3876  /*
3877   * A subset of global hstate attributes for node devices
3878   */
3879  static struct attribute *per_node_hstate_attrs[] = {
3880  	&nr_hugepages_attr.attr,
3881  	&free_hugepages_attr.attr,
3882  	&surplus_hugepages_attr.attr,
3883  	NULL,
3884  };
3885  
3886  static const struct attribute_group per_node_hstate_attr_group = {
3887  	.attrs = per_node_hstate_attrs,
3888  };
3889  
3890  /*
3891   * kobj_to_node_hstate - lookup global hstate for node device hstate attr kobj.
3892   * Returns node id via non-NULL nidp.
3893   */
3894  static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
3895  {
3896  	int nid;
3897  
3898  	for (nid = 0; nid < nr_node_ids; nid++) {
3899  		struct node_hstate *nhs = &node_hstates[nid];
3900  		int i;
3901  		for (i = 0; i < HUGE_MAX_HSTATE; i++)
3902  			if (nhs->hstate_kobjs[i] == kobj) {
3903  				if (nidp)
3904  					*nidp = nid;
3905  				return &hstates[i];
3906  			}
3907  	}
3908  
3909  	BUG();
3910  	return NULL;
3911  }
3912  
3913  /*
3914   * Unregister hstate attributes from a single node device.
3915   * No-op if no hstate attributes attached.
3916   */
3917  static void hugetlb_unregister_node(struct node *node)
3918  {
3919  	struct hstate *h;
3920  	struct node_hstate *nhs = &node_hstates[node->dev.id];
3921  
3922  	if (!nhs->hugepages_kobj)
3923  		return;		/* no hstate attributes */
3924  
3925  	for_each_hstate(h) {
3926  		int idx = hstate_index(h);
3927  		if (nhs->hstate_kobjs[idx]) {
3928  			kobject_put(nhs->hstate_kobjs[idx]);
3929  			nhs->hstate_kobjs[idx] = NULL;
3930  		}
3931  	}
3932  
3933  	kobject_put(nhs->hugepages_kobj);
3934  	nhs->hugepages_kobj = NULL;
3935  }
3936  
3937  
3938  /*
3939   * Register hstate attributes for a single node device.
3940   * No-op if attributes already registered.
3941   */
3942  static void hugetlb_register_node(struct node *node)
3943  {
3944  	struct hstate *h;
3945  	struct node_hstate *nhs = &node_hstates[node->dev.id];
3946  	int err;
3947  
3948  	if (nhs->hugepages_kobj)
3949  		return;		/* already allocated */
3950  
3951  	nhs->hugepages_kobj = kobject_create_and_add("hugepages",
3952  							&node->dev.kobj);
3953  	if (!nhs->hugepages_kobj)
3954  		return;
3955  
3956  	for_each_hstate(h) {
3957  		err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj,
3958  						nhs->hstate_kobjs,
3959  						&per_node_hstate_attr_group);
3960  		if (err) {
3961  			pr_err("HugeTLB: Unable to add hstate %s for node %d\n",
3962  				h->name, node->dev.id);
3963  			hugetlb_unregister_node(node);
3964  			break;
3965  		}
3966  	}
3967  }
3968  
3969  /*
3970   * hugetlb init time:  register hstate attributes for all registered node
3971   * devices of nodes that have memory.  All on-line nodes should have
3972   * registered their associated device by this time.
3973   */
3974  static void __init hugetlb_register_all_nodes(void)
3975  {
3976  	int nid;
3977  
3978  	for_each_node_state(nid, N_MEMORY) {
3979  		struct node *node = node_devices[nid];
3980  		if (node->dev.id == nid)
3981  			hugetlb_register_node(node);
3982  	}
3983  
3984  	/*
3985  	 * Let the node device driver know we're here so it can
3986  	 * [un]register hstate attributes on node hotplug.
3987  	 */
3988  	register_hugetlbfs_with_node(hugetlb_register_node,
3989  				     hugetlb_unregister_node);
3990  }
3991  #else	/* !CONFIG_NUMA */
3992  
3993  static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
3994  {
3995  	BUG();
3996  	if (nidp)
3997  		*nidp = -1;
3998  	return NULL;
3999  }
4000  
4001  static void hugetlb_register_all_nodes(void) { }
4002  
4003  #endif
4004  
4005  static int __init hugetlb_init(void)
4006  {
4007  	int i;
4008  
4009  	BUILD_BUG_ON(sizeof_field(struct page, private) * BITS_PER_BYTE <
4010  			__NR_HPAGEFLAGS);
4011  
4012  	if (!hugepages_supported()) {
4013  		if (hugetlb_max_hstate || default_hstate_max_huge_pages)
4014  			pr_warn("HugeTLB: huge pages not supported, ignoring associated command-line parameters\n");
4015  		return 0;
4016  	}
4017  
4018  	/*
4019  	 * Make sure HPAGE_SIZE (HUGETLB_PAGE_ORDER) hstate exists.  Some
4020  	 * architectures depend on setup being done here.
4021  	 */
4022  	hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
4023  	if (!parsed_default_hugepagesz) {
4024  		/*
4025  		 * If we did not parse a default huge page size, set
4026  		 * default_hstate_idx to HPAGE_SIZE hstate. And, if the
4027  		 * number of huge pages for this default size was implicitly
4028  		 * specified, set that here as well.
4029  		 * Note that the implicit setting will overwrite an explicit
4030  		 * setting.  A warning will be printed in this case.
4031  		 */
4032  		default_hstate_idx = hstate_index(size_to_hstate(HPAGE_SIZE));
4033  		if (default_hstate_max_huge_pages) {
4034  			if (default_hstate.max_huge_pages) {
4035  				char buf[32];
4036  
4037  				string_get_size(huge_page_size(&default_hstate),
4038  					1, STRING_UNITS_2, buf, 32);
4039  				pr_warn("HugeTLB: Ignoring hugepages=%lu associated with %s page size\n",
4040  					default_hstate.max_huge_pages, buf);
4041  				pr_warn("HugeTLB: Using hugepages=%lu for number of default huge pages\n",
4042  					default_hstate_max_huge_pages);
4043  			}
4044  			default_hstate.max_huge_pages =
4045  				default_hstate_max_huge_pages;
4046  
4047  			for (i = 0; i < nr_online_nodes; i++)
4048  				default_hstate.max_huge_pages_node[i] =
4049  					default_hugepages_in_node[i];
4050  		}
4051  	}
4052  
4053  	hugetlb_cma_check();
4054  	hugetlb_init_hstates();
4055  	gather_bootmem_prealloc();
4056  	report_hugepages();
4057  
4058  	hugetlb_sysfs_init();
4059  	hugetlb_register_all_nodes();
4060  	hugetlb_cgroup_file_init();
4061  
4062  #ifdef CONFIG_SMP
4063  	num_fault_mutexes = roundup_pow_of_two(8 * num_possible_cpus());
4064  #else
4065  	num_fault_mutexes = 1;
4066  #endif
4067  	hugetlb_fault_mutex_table =
4068  		kmalloc_array(num_fault_mutexes, sizeof(struct mutex),
4069  			      GFP_KERNEL);
4070  	BUG_ON(!hugetlb_fault_mutex_table);
4071  
4072  	for (i = 0; i < num_fault_mutexes; i++)
4073  		mutex_init(&hugetlb_fault_mutex_table[i]);
4074  	return 0;
4075  }
4076  subsys_initcall(hugetlb_init);
4077  
4078  /* Overwritten by architectures with more huge page sizes */
4079  bool __init __attribute((weak)) arch_hugetlb_valid_size(unsigned long size)
4080  {
4081  	return size == HPAGE_SIZE;
4082  }
4083  
4084  void __init hugetlb_add_hstate(unsigned int order)
4085  {
4086  	struct hstate *h;
4087  	unsigned long i;
4088  
4089  	if (size_to_hstate(PAGE_SIZE << order)) {
4090  		return;
4091  	}
4092  	BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
4093  	BUG_ON(order == 0);
4094  	h = &hstates[hugetlb_max_hstate++];
4095  	mutex_init(&h->resize_lock);
4096  	h->order = order;
4097  	h->mask = ~(huge_page_size(h) - 1);
4098  	for (i = 0; i < MAX_NUMNODES; ++i)
4099  		INIT_LIST_HEAD(&h->hugepage_freelists[i]);
4100  	INIT_LIST_HEAD(&h->hugepage_activelist);
4101  	h->next_nid_to_alloc = first_memory_node;
4102  	h->next_nid_to_free = first_memory_node;
4103  	snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
4104  					huge_page_size(h)/1024);
4105  	hugetlb_vmemmap_init(h);
4106  
4107  	parsed_hstate = h;
4108  }
4109  
4110  bool __init __weak hugetlb_node_alloc_supported(void)
4111  {
4112  	return true;
4113  }
4114  /*
4115   * hugepages command line processing
4116   * hugepages normally follows a valid hugepagsz or default_hugepagsz
4117   * specification.  If not, ignore the hugepages value.  hugepages can also
4118   * be the first huge page command line  option in which case it implicitly
4119   * specifies the number of huge pages for the default size.
4120   */
4121  static int __init hugepages_setup(char *s)
4122  {
4123  	unsigned long *mhp;
4124  	static unsigned long *last_mhp;
4125  	int node = NUMA_NO_NODE;
4126  	int count;
4127  	unsigned long tmp;
4128  	char *p = s;
4129  
4130  	if (!parsed_valid_hugepagesz) {
4131  		pr_warn("HugeTLB: hugepages=%s does not follow a valid hugepagesz, ignoring\n", s);
4132  		parsed_valid_hugepagesz = true;
4133  		return 0;
4134  	}
4135  
4136  	/*
4137  	 * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter
4138  	 * yet, so this hugepages= parameter goes to the "default hstate".
4139  	 * Otherwise, it goes with the previously parsed hugepagesz or
4140  	 * default_hugepagesz.
4141  	 */
4142  	else if (!hugetlb_max_hstate)
4143  		mhp = &default_hstate_max_huge_pages;
4144  	else
4145  		mhp = &parsed_hstate->max_huge_pages;
4146  
4147  	if (mhp == last_mhp) {
4148  		pr_warn("HugeTLB: hugepages= specified twice without interleaving hugepagesz=, ignoring hugepages=%s\n", s);
4149  		return 0;
4150  	}
4151  
4152  	while (*p) {
4153  		count = 0;
4154  		if (sscanf(p, "%lu%n", &tmp, &count) != 1)
4155  			goto invalid;
4156  		/* Parameter is node format */
4157  		if (p[count] == ':') {
4158  			if (!hugetlb_node_alloc_supported()) {
4159  				pr_warn("HugeTLB: architecture can't support node specific alloc, ignoring!\n");
4160  				return 0;
4161  			}
4162  			if (tmp >= nr_online_nodes)
4163  				goto invalid;
4164  			node = tmp;
4165  			p += count + 1;
4166  			/* Parse hugepages */
4167  			if (sscanf(p, "%lu%n", &tmp, &count) != 1)
4168  				goto invalid;
4169  			if (!hugetlb_max_hstate)
4170  				default_hugepages_in_node[node] = tmp;
4171  			else
4172  				parsed_hstate->max_huge_pages_node[node] = tmp;
4173  			*mhp += tmp;
4174  			/* Go to parse next node*/
4175  			if (p[count] == ',')
4176  				p += count + 1;
4177  			else
4178  				break;
4179  		} else {
4180  			if (p != s)
4181  				goto invalid;
4182  			*mhp = tmp;
4183  			break;
4184  		}
4185  	}
4186  
4187  	/*
4188  	 * Global state is always initialized later in hugetlb_init.
4189  	 * But we need to allocate gigantic hstates here early to still
4190  	 * use the bootmem allocator.
4191  	 */
4192  	if (hugetlb_max_hstate && hstate_is_gigantic(parsed_hstate))
4193  		hugetlb_hstate_alloc_pages(parsed_hstate);
4194  
4195  	last_mhp = mhp;
4196  
4197  	return 1;
4198  
4199  invalid:
4200  	pr_warn("HugeTLB: Invalid hugepages parameter %s\n", p);
4201  	return 0;
4202  }
4203  __setup("hugepages=", hugepages_setup);
4204  
4205  /*
4206   * hugepagesz command line processing
4207   * A specific huge page size can only be specified once with hugepagesz.
4208   * hugepagesz is followed by hugepages on the command line.  The global
4209   * variable 'parsed_valid_hugepagesz' is used to determine if prior
4210   * hugepagesz argument was valid.
4211   */
4212  static int __init hugepagesz_setup(char *s)
4213  {
4214  	unsigned long size;
4215  	struct hstate *h;
4216  
4217  	parsed_valid_hugepagesz = false;
4218  	size = (unsigned long)memparse(s, NULL);
4219  
4220  	if (!arch_hugetlb_valid_size(size)) {
4221  		pr_err("HugeTLB: unsupported hugepagesz=%s\n", s);
4222  		return 0;
4223  	}
4224  
4225  	h = size_to_hstate(size);
4226  	if (h) {
4227  		/*
4228  		 * hstate for this size already exists.  This is normally
4229  		 * an error, but is allowed if the existing hstate is the
4230  		 * default hstate.  More specifically, it is only allowed if
4231  		 * the number of huge pages for the default hstate was not
4232  		 * previously specified.
4233  		 */
4234  		if (!parsed_default_hugepagesz ||  h != &default_hstate ||
4235  		    default_hstate.max_huge_pages) {
4236  			pr_warn("HugeTLB: hugepagesz=%s specified twice, ignoring\n", s);
4237  			return 0;
4238  		}
4239  
4240  		/*
4241  		 * No need to call hugetlb_add_hstate() as hstate already
4242  		 * exists.  But, do set parsed_hstate so that a following
4243  		 * hugepages= parameter will be applied to this hstate.
4244  		 */
4245  		parsed_hstate = h;
4246  		parsed_valid_hugepagesz = true;
4247  		return 1;
4248  	}
4249  
4250  	hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
4251  	parsed_valid_hugepagesz = true;
4252  	return 1;
4253  }
4254  __setup("hugepagesz=", hugepagesz_setup);
4255  
4256  /*
4257   * default_hugepagesz command line input
4258   * Only one instance of default_hugepagesz allowed on command line.
4259   */
4260  static int __init default_hugepagesz_setup(char *s)
4261  {
4262  	unsigned long size;
4263  	int i;
4264  
4265  	parsed_valid_hugepagesz = false;
4266  	if (parsed_default_hugepagesz) {
4267  		pr_err("HugeTLB: default_hugepagesz previously specified, ignoring %s\n", s);
4268  		return 0;
4269  	}
4270  
4271  	size = (unsigned long)memparse(s, NULL);
4272  
4273  	if (!arch_hugetlb_valid_size(size)) {
4274  		pr_err("HugeTLB: unsupported default_hugepagesz=%s\n", s);
4275  		return 0;
4276  	}
4277  
4278  	hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
4279  	parsed_valid_hugepagesz = true;
4280  	parsed_default_hugepagesz = true;
4281  	default_hstate_idx = hstate_index(size_to_hstate(size));
4282  
4283  	/*
4284  	 * The number of default huge pages (for this size) could have been
4285  	 * specified as the first hugetlb parameter: hugepages=X.  If so,
4286  	 * then default_hstate_max_huge_pages is set.  If the default huge
4287  	 * page size is gigantic (>= MAX_ORDER), then the pages must be
4288  	 * allocated here from bootmem allocator.
4289  	 */
4290  	if (default_hstate_max_huge_pages) {
4291  		default_hstate.max_huge_pages = default_hstate_max_huge_pages;
4292  		for (i = 0; i < nr_online_nodes; i++)
4293  			default_hstate.max_huge_pages_node[i] =
4294  				default_hugepages_in_node[i];
4295  		if (hstate_is_gigantic(&default_hstate))
4296  			hugetlb_hstate_alloc_pages(&default_hstate);
4297  		default_hstate_max_huge_pages = 0;
4298  	}
4299  
4300  	return 1;
4301  }
4302  __setup("default_hugepagesz=", default_hugepagesz_setup);
4303  
4304  static unsigned int allowed_mems_nr(struct hstate *h)
4305  {
4306  	int node;
4307  	unsigned int nr = 0;
4308  	nodemask_t *mpol_allowed;
4309  	unsigned int *array = h->free_huge_pages_node;
4310  	gfp_t gfp_mask = htlb_alloc_mask(h);
4311  
4312  	mpol_allowed = policy_nodemask_current(gfp_mask);
4313  
4314  	for_each_node_mask(node, cpuset_current_mems_allowed) {
4315  		if (!mpol_allowed || node_isset(node, *mpol_allowed))
4316  			nr += array[node];
4317  	}
4318  
4319  	return nr;
4320  }
4321  
4322  #ifdef CONFIG_SYSCTL
4323  static int proc_hugetlb_doulongvec_minmax(struct ctl_table *table, int write,
4324  					  void *buffer, size_t *length,
4325  					  loff_t *ppos, unsigned long *out)
4326  {
4327  	struct ctl_table dup_table;
4328  
4329  	/*
4330  	 * In order to avoid races with __do_proc_doulongvec_minmax(), we
4331  	 * can duplicate the @table and alter the duplicate of it.
4332  	 */
4333  	dup_table = *table;
4334  	dup_table.data = out;
4335  
4336  	return proc_doulongvec_minmax(&dup_table, write, buffer, length, ppos);
4337  }
4338  
4339  static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
4340  			 struct ctl_table *table, int write,
4341  			 void *buffer, size_t *length, loff_t *ppos)
4342  {
4343  	struct hstate *h = &default_hstate;
4344  	unsigned long tmp = h->max_huge_pages;
4345  	int ret;
4346  
4347  	if (!hugepages_supported())
4348  		return -EOPNOTSUPP;
4349  
4350  	ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos,
4351  					     &tmp);
4352  	if (ret)
4353  		goto out;
4354  
4355  	if (write)
4356  		ret = __nr_hugepages_store_common(obey_mempolicy, h,
4357  						  NUMA_NO_NODE, tmp, *length);
4358  out:
4359  	return ret;
4360  }
4361  
4362  int hugetlb_sysctl_handler(struct ctl_table *table, int write,
4363  			  void *buffer, size_t *length, loff_t *ppos)
4364  {
4365  
4366  	return hugetlb_sysctl_handler_common(false, table, write,
4367  							buffer, length, ppos);
4368  }
4369  
4370  #ifdef CONFIG_NUMA
4371  int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write,
4372  			  void *buffer, size_t *length, loff_t *ppos)
4373  {
4374  	return hugetlb_sysctl_handler_common(true, table, write,
4375  							buffer, length, ppos);
4376  }
4377  #endif /* CONFIG_NUMA */
4378  
4379  int hugetlb_overcommit_handler(struct ctl_table *table, int write,
4380  		void *buffer, size_t *length, loff_t *ppos)
4381  {
4382  	struct hstate *h = &default_hstate;
4383  	unsigned long tmp;
4384  	int ret;
4385  
4386  	if (!hugepages_supported())
4387  		return -EOPNOTSUPP;
4388  
4389  	tmp = h->nr_overcommit_huge_pages;
4390  
4391  	if (write && hstate_is_gigantic(h))
4392  		return -EINVAL;
4393  
4394  	ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos,
4395  					     &tmp);
4396  	if (ret)
4397  		goto out;
4398  
4399  	if (write) {
4400  		spin_lock_irq(&hugetlb_lock);
4401  		h->nr_overcommit_huge_pages = tmp;
4402  		spin_unlock_irq(&hugetlb_lock);
4403  	}
4404  out:
4405  	return ret;
4406  }
4407  
4408  #endif /* CONFIG_SYSCTL */
4409  
4410  void hugetlb_report_meminfo(struct seq_file *m)
4411  {
4412  	struct hstate *h;
4413  	unsigned long total = 0;
4414  
4415  	if (!hugepages_supported())
4416  		return;
4417  
4418  	for_each_hstate(h) {
4419  		unsigned long count = h->nr_huge_pages;
4420  
4421  		total += huge_page_size(h) * count;
4422  
4423  		if (h == &default_hstate)
4424  			seq_printf(m,
4425  				   "HugePages_Total:   %5lu\n"
4426  				   "HugePages_Free:    %5lu\n"
4427  				   "HugePages_Rsvd:    %5lu\n"
4428  				   "HugePages_Surp:    %5lu\n"
4429  				   "Hugepagesize:   %8lu kB\n",
4430  				   count,
4431  				   h->free_huge_pages,
4432  				   h->resv_huge_pages,
4433  				   h->surplus_huge_pages,
4434  				   huge_page_size(h) / SZ_1K);
4435  	}
4436  
4437  	seq_printf(m, "Hugetlb:        %8lu kB\n", total / SZ_1K);
4438  }
4439  
4440  int hugetlb_report_node_meminfo(char *buf, int len, int nid)
4441  {
4442  	struct hstate *h = &default_hstate;
4443  
4444  	if (!hugepages_supported())
4445  		return 0;
4446  
4447  	return sysfs_emit_at(buf, len,
4448  			     "Node %d HugePages_Total: %5u\n"
4449  			     "Node %d HugePages_Free:  %5u\n"
4450  			     "Node %d HugePages_Surp:  %5u\n",
4451  			     nid, h->nr_huge_pages_node[nid],
4452  			     nid, h->free_huge_pages_node[nid],
4453  			     nid, h->surplus_huge_pages_node[nid]);
4454  }
4455  
4456  void hugetlb_show_meminfo(void)
4457  {
4458  	struct hstate *h;
4459  	int nid;
4460  
4461  	if (!hugepages_supported())
4462  		return;
4463  
4464  	for_each_node_state(nid, N_MEMORY)
4465  		for_each_hstate(h)
4466  			pr_info("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n",
4467  				nid,
4468  				h->nr_huge_pages_node[nid],
4469  				h->free_huge_pages_node[nid],
4470  				h->surplus_huge_pages_node[nid],
4471  				huge_page_size(h) / SZ_1K);
4472  }
4473  
4474  void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm)
4475  {
4476  	seq_printf(m, "HugetlbPages:\t%8lu kB\n",
4477  		   atomic_long_read(&mm->hugetlb_usage) << (PAGE_SHIFT - 10));
4478  }
4479  
4480  /* Return the number pages of memory we physically have, in PAGE_SIZE units. */
4481  unsigned long hugetlb_total_pages(void)
4482  {
4483  	struct hstate *h;
4484  	unsigned long nr_total_pages = 0;
4485  
4486  	for_each_hstate(h)
4487  		nr_total_pages += h->nr_huge_pages * pages_per_huge_page(h);
4488  	return nr_total_pages;
4489  }
4490  
4491  static int hugetlb_acct_memory(struct hstate *h, long delta)
4492  {
4493  	int ret = -ENOMEM;
4494  
4495  	if (!delta)
4496  		return 0;
4497  
4498  	spin_lock_irq(&hugetlb_lock);
4499  	/*
4500  	 * When cpuset is configured, it breaks the strict hugetlb page
4501  	 * reservation as the accounting is done on a global variable. Such
4502  	 * reservation is completely rubbish in the presence of cpuset because
4503  	 * the reservation is not checked against page availability for the
4504  	 * current cpuset. Application can still potentially OOM'ed by kernel
4505  	 * with lack of free htlb page in cpuset that the task is in.
4506  	 * Attempt to enforce strict accounting with cpuset is almost
4507  	 * impossible (or too ugly) because cpuset is too fluid that
4508  	 * task or memory node can be dynamically moved between cpusets.
4509  	 *
4510  	 * The change of semantics for shared hugetlb mapping with cpuset is
4511  	 * undesirable. However, in order to preserve some of the semantics,
4512  	 * we fall back to check against current free page availability as
4513  	 * a best attempt and hopefully to minimize the impact of changing
4514  	 * semantics that cpuset has.
4515  	 *
4516  	 * Apart from cpuset, we also have memory policy mechanism that
4517  	 * also determines from which node the kernel will allocate memory
4518  	 * in a NUMA system. So similar to cpuset, we also should consider
4519  	 * the memory policy of the current task. Similar to the description
4520  	 * above.
4521  	 */
4522  	if (delta > 0) {
4523  		if (gather_surplus_pages(h, delta) < 0)
4524  			goto out;
4525  
4526  		if (delta > allowed_mems_nr(h)) {
4527  			return_unused_surplus_pages(h, delta);
4528  			goto out;
4529  		}
4530  	}
4531  
4532  	ret = 0;
4533  	if (delta < 0)
4534  		return_unused_surplus_pages(h, (unsigned long) -delta);
4535  
4536  out:
4537  	spin_unlock_irq(&hugetlb_lock);
4538  	return ret;
4539  }
4540  
4541  static void hugetlb_vm_op_open(struct vm_area_struct *vma)
4542  {
4543  	struct resv_map *resv = vma_resv_map(vma);
4544  
4545  	/*
4546  	 * This new VMA should share its siblings reservation map if present.
4547  	 * The VMA will only ever have a valid reservation map pointer where
4548  	 * it is being copied for another still existing VMA.  As that VMA
4549  	 * has a reference to the reservation map it cannot disappear until
4550  	 * after this open call completes.  It is therefore safe to take a
4551  	 * new reference here without additional locking.
4552  	 */
4553  	if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
4554  		resv_map_dup_hugetlb_cgroup_uncharge_info(resv);
4555  		kref_get(&resv->refs);
4556  	}
4557  }
4558  
4559  static void hugetlb_vm_op_close(struct vm_area_struct *vma)
4560  {
4561  	struct hstate *h = hstate_vma(vma);
4562  	struct resv_map *resv = vma_resv_map(vma);
4563  	struct hugepage_subpool *spool = subpool_vma(vma);
4564  	unsigned long reserve, start, end;
4565  	long gbl_reserve;
4566  
4567  	if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER))
4568  		return;
4569  
4570  	start = vma_hugecache_offset(h, vma, vma->vm_start);
4571  	end = vma_hugecache_offset(h, vma, vma->vm_end);
4572  
4573  	reserve = (end - start) - region_count(resv, start, end);
4574  	hugetlb_cgroup_uncharge_counter(resv, start, end);
4575  	if (reserve) {
4576  		/*
4577  		 * Decrement reserve counts.  The global reserve count may be
4578  		 * adjusted if the subpool has a minimum size.
4579  		 */
4580  		gbl_reserve = hugepage_subpool_put_pages(spool, reserve);
4581  		hugetlb_acct_memory(h, -gbl_reserve);
4582  	}
4583  
4584  	kref_put(&resv->refs, resv_map_release);
4585  }
4586  
4587  static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr)
4588  {
4589  	if (addr & ~(huge_page_mask(hstate_vma(vma))))
4590  		return -EINVAL;
4591  	return 0;
4592  }
4593  
4594  static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma)
4595  {
4596  	return huge_page_size(hstate_vma(vma));
4597  }
4598  
4599  /*
4600   * We cannot handle pagefaults against hugetlb pages at all.  They cause
4601   * handle_mm_fault() to try to instantiate regular-sized pages in the
4602   * hugepage VMA.  do_page_fault() is supposed to trap this, so BUG is we get
4603   * this far.
4604   */
4605  static vm_fault_t hugetlb_vm_op_fault(struct vm_fault *vmf)
4606  {
4607  	BUG();
4608  	return 0;
4609  }
4610  
4611  /*
4612   * When a new function is introduced to vm_operations_struct and added
4613   * to hugetlb_vm_ops, please consider adding the function to shm_vm_ops.
4614   * This is because under System V memory model, mappings created via
4615   * shmget/shmat with "huge page" specified are backed by hugetlbfs files,
4616   * their original vm_ops are overwritten with shm_vm_ops.
4617   */
4618  const struct vm_operations_struct hugetlb_vm_ops = {
4619  	.fault = hugetlb_vm_op_fault,
4620  	.open = hugetlb_vm_op_open,
4621  	.close = hugetlb_vm_op_close,
4622  	.may_split = hugetlb_vm_op_split,
4623  	.pagesize = hugetlb_vm_op_pagesize,
4624  };
4625  
4626  static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
4627  				int writable)
4628  {
4629  	pte_t entry;
4630  	unsigned int shift = huge_page_shift(hstate_vma(vma));
4631  
4632  	if (writable) {
4633  		entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page,
4634  					 vma->vm_page_prot)));
4635  	} else {
4636  		entry = huge_pte_wrprotect(mk_huge_pte(page,
4637  					   vma->vm_page_prot));
4638  	}
4639  	entry = pte_mkyoung(entry);
4640  	entry = pte_mkhuge(entry);
4641  	entry = arch_make_huge_pte(entry, shift, vma->vm_flags);
4642  
4643  	return entry;
4644  }
4645  
4646  static void set_huge_ptep_writable(struct vm_area_struct *vma,
4647  				   unsigned long address, pte_t *ptep)
4648  {
4649  	pte_t entry;
4650  
4651  	entry = huge_pte_mkwrite(huge_pte_mkdirty(huge_ptep_get(ptep)));
4652  	if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1))
4653  		update_mmu_cache(vma, address, ptep);
4654  }
4655  
4656  bool is_hugetlb_entry_migration(pte_t pte)
4657  {
4658  	swp_entry_t swp;
4659  
4660  	if (huge_pte_none(pte) || pte_present(pte))
4661  		return false;
4662  	swp = pte_to_swp_entry(pte);
4663  	if (is_migration_entry(swp))
4664  		return true;
4665  	else
4666  		return false;
4667  }
4668  
4669  static bool is_hugetlb_entry_hwpoisoned(pte_t pte)
4670  {
4671  	swp_entry_t swp;
4672  
4673  	if (huge_pte_none(pte) || pte_present(pte))
4674  		return false;
4675  	swp = pte_to_swp_entry(pte);
4676  	if (is_hwpoison_entry(swp))
4677  		return true;
4678  	else
4679  		return false;
4680  }
4681  
4682  static void
4683  hugetlb_install_page(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr,
4684  		     struct page *new_page)
4685  {
4686  	__SetPageUptodate(new_page);
4687  	hugepage_add_new_anon_rmap(new_page, vma, addr);
4688  	set_huge_pte_at(vma->vm_mm, addr, ptep, make_huge_pte(vma, new_page, 1));
4689  	hugetlb_count_add(pages_per_huge_page(hstate_vma(vma)), vma->vm_mm);
4690  	ClearHPageRestoreReserve(new_page);
4691  	SetHPageMigratable(new_page);
4692  }
4693  
4694  int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
4695  			    struct vm_area_struct *vma)
4696  {
4697  	pte_t *src_pte, *dst_pte, entry, dst_entry;
4698  	struct page *ptepage;
4699  	unsigned long addr;
4700  	bool cow = is_cow_mapping(vma->vm_flags);
4701  	struct hstate *h = hstate_vma(vma);
4702  	unsigned long sz = huge_page_size(h);
4703  	unsigned long npages = pages_per_huge_page(h);
4704  	struct address_space *mapping = vma->vm_file->f_mapping;
4705  	struct mmu_notifier_range range;
4706  	int ret = 0;
4707  
4708  	if (cow) {
4709  		mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, src,
4710  					vma->vm_start,
4711  					vma->vm_end);
4712  		mmu_notifier_invalidate_range_start(&range);
4713  	} else {
4714  		/*
4715  		 * For shared mappings i_mmap_rwsem must be held to call
4716  		 * huge_pte_alloc, otherwise the returned ptep could go
4717  		 * away if part of a shared pmd and another thread calls
4718  		 * huge_pmd_unshare.
4719  		 */
4720  		i_mmap_lock_read(mapping);
4721  	}
4722  
4723  	for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
4724  		spinlock_t *src_ptl, *dst_ptl;
4725  		src_pte = huge_pte_offset(src, addr, sz);
4726  		if (!src_pte)
4727  			continue;
4728  		dst_pte = huge_pte_alloc(dst, vma, addr, sz);
4729  		if (!dst_pte) {
4730  			ret = -ENOMEM;
4731  			break;
4732  		}
4733  
4734  		/*
4735  		 * If the pagetables are shared don't copy or take references.
4736  		 * dst_pte == src_pte is the common case of src/dest sharing.
4737  		 *
4738  		 * However, src could have 'unshared' and dst shares with
4739  		 * another vma.  If dst_pte !none, this implies sharing.
4740  		 * Check here before taking page table lock, and once again
4741  		 * after taking the lock below.
4742  		 */
4743  		dst_entry = huge_ptep_get(dst_pte);
4744  		if ((dst_pte == src_pte) || !huge_pte_none(dst_entry))
4745  			continue;
4746  
4747  		dst_ptl = huge_pte_lock(h, dst, dst_pte);
4748  		src_ptl = huge_pte_lockptr(h, src, src_pte);
4749  		spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
4750  		entry = huge_ptep_get(src_pte);
4751  		dst_entry = huge_ptep_get(dst_pte);
4752  again:
4753  		if (huge_pte_none(entry) || !huge_pte_none(dst_entry)) {
4754  			/*
4755  			 * Skip if src entry none.  Also, skip in the
4756  			 * unlikely case dst entry !none as this implies
4757  			 * sharing with another vma.
4758  			 */
4759  			;
4760  		} else if (unlikely(is_hugetlb_entry_migration(entry) ||
4761  				    is_hugetlb_entry_hwpoisoned(entry))) {
4762  			swp_entry_t swp_entry = pte_to_swp_entry(entry);
4763  
4764  			if (is_writable_migration_entry(swp_entry) && cow) {
4765  				/*
4766  				 * COW mappings require pages in both
4767  				 * parent and child to be set to read.
4768  				 */
4769  				swp_entry = make_readable_migration_entry(
4770  							swp_offset(swp_entry));
4771  				entry = swp_entry_to_pte(swp_entry);
4772  				set_huge_swap_pte_at(src, addr, src_pte,
4773  						     entry, sz);
4774  			}
4775  			set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz);
4776  		} else {
4777  			entry = huge_ptep_get(src_pte);
4778  			ptepage = pte_page(entry);
4779  			get_page(ptepage);
4780  
4781  			/*
4782  			 * This is a rare case where we see pinned hugetlb
4783  			 * pages while they're prone to COW.  We need to do the
4784  			 * COW earlier during fork.
4785  			 *
4786  			 * When pre-allocating the page or copying data, we
4787  			 * need to be without the pgtable locks since we could
4788  			 * sleep during the process.
4789  			 */
4790  			if (unlikely(page_needs_cow_for_dma(vma, ptepage))) {
4791  				pte_t src_pte_old = entry;
4792  				struct page *new;
4793  
4794  				spin_unlock(src_ptl);
4795  				spin_unlock(dst_ptl);
4796  				/* Do not use reserve as it's private owned */
4797  				new = alloc_huge_page(vma, addr, 1);
4798  				if (IS_ERR(new)) {
4799  					put_page(ptepage);
4800  					ret = PTR_ERR(new);
4801  					break;
4802  				}
4803  				copy_user_huge_page(new, ptepage, addr, vma,
4804  						    npages);
4805  				put_page(ptepage);
4806  
4807  				/* Install the new huge page if src pte stable */
4808  				dst_ptl = huge_pte_lock(h, dst, dst_pte);
4809  				src_ptl = huge_pte_lockptr(h, src, src_pte);
4810  				spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
4811  				entry = huge_ptep_get(src_pte);
4812  				if (!pte_same(src_pte_old, entry)) {
4813  					restore_reserve_on_error(h, vma, addr,
4814  								new);
4815  					put_page(new);
4816  					/* dst_entry won't change as in child */
4817  					goto again;
4818  				}
4819  				hugetlb_install_page(vma, dst_pte, addr, new);
4820  				spin_unlock(src_ptl);
4821  				spin_unlock(dst_ptl);
4822  				continue;
4823  			}
4824  
4825  			if (cow) {
4826  				/*
4827  				 * No need to notify as we are downgrading page
4828  				 * table protection not changing it to point
4829  				 * to a new page.
4830  				 *
4831  				 * See Documentation/vm/mmu_notifier.rst
4832  				 */
4833  				huge_ptep_set_wrprotect(src, addr, src_pte);
4834  				entry = huge_pte_wrprotect(entry);
4835  			}
4836  
4837  			page_dup_rmap(ptepage, true);
4838  			set_huge_pte_at(dst, addr, dst_pte, entry);
4839  			hugetlb_count_add(npages, dst);
4840  		}
4841  		spin_unlock(src_ptl);
4842  		spin_unlock(dst_ptl);
4843  	}
4844  
4845  	if (cow)
4846  		mmu_notifier_invalidate_range_end(&range);
4847  	else
4848  		i_mmap_unlock_read(mapping);
4849  
4850  	return ret;
4851  }
4852  
4853  static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr,
4854  			  unsigned long new_addr, pte_t *src_pte, pte_t *dst_pte)
4855  {
4856  	struct hstate *h = hstate_vma(vma);
4857  	struct mm_struct *mm = vma->vm_mm;
4858  	spinlock_t *src_ptl, *dst_ptl;
4859  	pte_t pte;
4860  
4861  	dst_ptl = huge_pte_lock(h, mm, dst_pte);
4862  	src_ptl = huge_pte_lockptr(h, mm, src_pte);
4863  
4864  	/*
4865  	 * We don't have to worry about the ordering of src and dst ptlocks
4866  	 * because exclusive mmap_sem (or the i_mmap_lock) prevents deadlock.
4867  	 */
4868  	if (src_ptl != dst_ptl)
4869  		spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
4870  
4871  	pte = huge_ptep_get_and_clear(mm, old_addr, src_pte);
4872  	set_huge_pte_at(mm, new_addr, dst_pte, pte);
4873  
4874  	if (src_ptl != dst_ptl)
4875  		spin_unlock(src_ptl);
4876  	spin_unlock(dst_ptl);
4877  }
4878  
4879  int move_hugetlb_page_tables(struct vm_area_struct *vma,
4880  			     struct vm_area_struct *new_vma,
4881  			     unsigned long old_addr, unsigned long new_addr,
4882  			     unsigned long len)
4883  {
4884  	struct hstate *h = hstate_vma(vma);
4885  	struct address_space *mapping = vma->vm_file->f_mapping;
4886  	unsigned long sz = huge_page_size(h);
4887  	struct mm_struct *mm = vma->vm_mm;
4888  	unsigned long old_end = old_addr + len;
4889  	unsigned long old_addr_copy;
4890  	pte_t *src_pte, *dst_pte;
4891  	struct mmu_notifier_range range;
4892  
4893  	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, old_addr,
4894  				old_end);
4895  	adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
4896  	mmu_notifier_invalidate_range_start(&range);
4897  	/* Prevent race with file truncation */
4898  	i_mmap_lock_write(mapping);
4899  	for (; old_addr < old_end; old_addr += sz, new_addr += sz) {
4900  		src_pte = huge_pte_offset(mm, old_addr, sz);
4901  		if (!src_pte)
4902  			continue;
4903  		if (huge_pte_none(huge_ptep_get(src_pte)))
4904  			continue;
4905  
4906  		/* old_addr arg to huge_pmd_unshare() is a pointer and so the
4907  		 * arg may be modified. Pass a copy instead to preserve the
4908  		 * value in old_addr.
4909  		 */
4910  		old_addr_copy = old_addr;
4911  
4912  		if (huge_pmd_unshare(mm, vma, &old_addr_copy, src_pte))
4913  			continue;
4914  
4915  		dst_pte = huge_pte_alloc(mm, new_vma, new_addr, sz);
4916  		if (!dst_pte)
4917  			break;
4918  
4919  		move_huge_pte(vma, old_addr, new_addr, src_pte, dst_pte);
4920  	}
4921  	flush_tlb_range(vma, old_end - len, old_end);
4922  	mmu_notifier_invalidate_range_end(&range);
4923  	i_mmap_unlock_write(mapping);
4924  
4925  	return len + old_addr - old_end;
4926  }
4927  
4928  static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
4929  				   unsigned long start, unsigned long end,
4930  				   struct page *ref_page)
4931  {
4932  	struct mm_struct *mm = vma->vm_mm;
4933  	unsigned long address;
4934  	pte_t *ptep;
4935  	pte_t pte;
4936  	spinlock_t *ptl;
4937  	struct page *page;
4938  	struct hstate *h = hstate_vma(vma);
4939  	unsigned long sz = huge_page_size(h);
4940  	struct mmu_notifier_range range;
4941  	bool force_flush = false;
4942  
4943  	WARN_ON(!is_vm_hugetlb_page(vma));
4944  	BUG_ON(start & ~huge_page_mask(h));
4945  	BUG_ON(end & ~huge_page_mask(h));
4946  
4947  	/*
4948  	 * This is a hugetlb vma, all the pte entries should point
4949  	 * to huge page.
4950  	 */
4951  	tlb_change_page_size(tlb, sz);
4952  	tlb_start_vma(tlb, vma);
4953  
4954  	/*
4955  	 * If sharing possible, alert mmu notifiers of worst case.
4956  	 */
4957  	mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, start,
4958  				end);
4959  	adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
4960  	mmu_notifier_invalidate_range_start(&range);
4961  	address = start;
4962  	for (; address < end; address += sz) {
4963  		ptep = huge_pte_offset(mm, address, sz);
4964  		if (!ptep)
4965  			continue;
4966  
4967  		ptl = huge_pte_lock(h, mm, ptep);
4968  		if (huge_pmd_unshare(mm, vma, &address, ptep)) {
4969  			spin_unlock(ptl);
4970  			tlb_flush_pmd_range(tlb, address & PUD_MASK, PUD_SIZE);
4971  			force_flush = true;
4972  			continue;
4973  		}
4974  
4975  		pte = huge_ptep_get(ptep);
4976  		if (huge_pte_none(pte)) {
4977  			spin_unlock(ptl);
4978  			continue;
4979  		}
4980  
4981  		/*
4982  		 * Migrating hugepage or HWPoisoned hugepage is already
4983  		 * unmapped and its refcount is dropped, so just clear pte here.
4984  		 */
4985  		if (unlikely(!pte_present(pte))) {
4986  			huge_pte_clear(mm, address, ptep, sz);
4987  			spin_unlock(ptl);
4988  			continue;
4989  		}
4990  
4991  		page = pte_page(pte);
4992  		/*
4993  		 * If a reference page is supplied, it is because a specific
4994  		 * page is being unmapped, not a range. Ensure the page we
4995  		 * are about to unmap is the actual page of interest.
4996  		 */
4997  		if (ref_page) {
4998  			if (page != ref_page) {
4999  				spin_unlock(ptl);
5000  				continue;
5001  			}
5002  			/*
5003  			 * Mark the VMA as having unmapped its page so that
5004  			 * future faults in this VMA will fail rather than
5005  			 * looking like data was lost
5006  			 */
5007  			set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED);
5008  		}
5009  
5010  		pte = huge_ptep_get_and_clear(mm, address, ptep);
5011  		tlb_remove_huge_tlb_entry(h, tlb, ptep, address);
5012  		if (huge_pte_dirty(pte))
5013  			set_page_dirty(page);
5014  
5015  		hugetlb_count_sub(pages_per_huge_page(h), mm);
5016  		page_remove_rmap(page, true);
5017  
5018  		spin_unlock(ptl);
5019  		tlb_remove_page_size(tlb, page, huge_page_size(h));
5020  		/*
5021  		 * Bail out after unmapping reference page if supplied
5022  		 */
5023  		if (ref_page)
5024  			break;
5025  	}
5026  	mmu_notifier_invalidate_range_end(&range);
5027  	tlb_end_vma(tlb, vma);
5028  
5029  	/*
5030  	 * If we unshared PMDs, the TLB flush was not recorded in mmu_gather. We
5031  	 * could defer the flush until now, since by holding i_mmap_rwsem we
5032  	 * guaranteed that the last refernece would not be dropped. But we must
5033  	 * do the flushing before we return, as otherwise i_mmap_rwsem will be
5034  	 * dropped and the last reference to the shared PMDs page might be
5035  	 * dropped as well.
5036  	 *
5037  	 * In theory we could defer the freeing of the PMD pages as well, but
5038  	 * huge_pmd_unshare() relies on the exact page_count for the PMD page to
5039  	 * detect sharing, so we cannot defer the release of the page either.
5040  	 * Instead, do flush now.
5041  	 */
5042  	if (force_flush)
5043  		tlb_flush_mmu_tlbonly(tlb);
5044  }
5045  
5046  void __unmap_hugepage_range_final(struct mmu_gather *tlb,
5047  			  struct vm_area_struct *vma, unsigned long start,
5048  			  unsigned long end, struct page *ref_page)
5049  {
5050  	__unmap_hugepage_range(tlb, vma, start, end, ref_page);
5051  
5052  	/*
5053  	 * Clear this flag so that x86's huge_pmd_share page_table_shareable
5054  	 * test will fail on a vma being torn down, and not grab a page table
5055  	 * on its way out.  We're lucky that the flag has such an appropriate
5056  	 * name, and can in fact be safely cleared here. We could clear it
5057  	 * before the __unmap_hugepage_range above, but all that's necessary
5058  	 * is to clear it before releasing the i_mmap_rwsem. This works
5059  	 * because in the context this is called, the VMA is about to be
5060  	 * destroyed and the i_mmap_rwsem is held.
5061  	 */
5062  	vma->vm_flags &= ~VM_MAYSHARE;
5063  }
5064  
5065  void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
5066  			  unsigned long end, struct page *ref_page)
5067  {
5068  	struct mmu_gather tlb;
5069  
5070  	tlb_gather_mmu(&tlb, vma->vm_mm);
5071  	__unmap_hugepage_range(&tlb, vma, start, end, ref_page);
5072  	tlb_finish_mmu(&tlb);
5073  }
5074  
5075  /*
5076   * This is called when the original mapper is failing to COW a MAP_PRIVATE
5077   * mapping it owns the reserve page for. The intention is to unmap the page
5078   * from other VMAs and let the children be SIGKILLed if they are faulting the
5079   * same region.
5080   */
5081  static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
5082  			      struct page *page, unsigned long address)
5083  {
5084  	struct hstate *h = hstate_vma(vma);
5085  	struct vm_area_struct *iter_vma;
5086  	struct address_space *mapping;
5087  	pgoff_t pgoff;
5088  
5089  	/*
5090  	 * vm_pgoff is in PAGE_SIZE units, hence the different calculation
5091  	 * from page cache lookup which is in HPAGE_SIZE units.
5092  	 */
5093  	address = address & huge_page_mask(h);
5094  	pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) +
5095  			vma->vm_pgoff;
5096  	mapping = vma->vm_file->f_mapping;
5097  
5098  	/*
5099  	 * Take the mapping lock for the duration of the table walk. As
5100  	 * this mapping should be shared between all the VMAs,
5101  	 * __unmap_hugepage_range() is called as the lock is already held
5102  	 */
5103  	i_mmap_lock_write(mapping);
5104  	vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) {
5105  		/* Do not unmap the current VMA */
5106  		if (iter_vma == vma)
5107  			continue;
5108  
5109  		/*
5110  		 * Shared VMAs have their own reserves and do not affect
5111  		 * MAP_PRIVATE accounting but it is possible that a shared
5112  		 * VMA is using the same page so check and skip such VMAs.
5113  		 */
5114  		if (iter_vma->vm_flags & VM_MAYSHARE)
5115  			continue;
5116  
5117  		/*
5118  		 * Unmap the page from other VMAs without their own reserves.
5119  		 * They get marked to be SIGKILLed if they fault in these
5120  		 * areas. This is because a future no-page fault on this VMA
5121  		 * could insert a zeroed page instead of the data existing
5122  		 * from the time of fork. This would look like data corruption
5123  		 */
5124  		if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
5125  			unmap_hugepage_range(iter_vma, address,
5126  					     address + huge_page_size(h), page);
5127  	}
5128  	i_mmap_unlock_write(mapping);
5129  }
5130  
5131  /*
5132   * Hugetlb_cow() should be called with page lock of the original hugepage held.
5133   * Called with hugetlb_fault_mutex_table held and pte_page locked so we
5134   * cannot race with other handlers or page migration.
5135   * Keep the pte_same checks anyway to make transition from the mutex easier.
5136   */
5137  static vm_fault_t hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
5138  		       unsigned long address, pte_t *ptep,
5139  		       struct page *pagecache_page, spinlock_t *ptl)
5140  {
5141  	pte_t pte;
5142  	struct hstate *h = hstate_vma(vma);
5143  	struct page *old_page, *new_page;
5144  	int outside_reserve = 0;
5145  	vm_fault_t ret = 0;
5146  	unsigned long haddr = address & huge_page_mask(h);
5147  	struct mmu_notifier_range range;
5148  
5149  	pte = huge_ptep_get(ptep);
5150  	old_page = pte_page(pte);
5151  
5152  retry_avoidcopy:
5153  	/* If no-one else is actually using this page, avoid the copy
5154  	 * and just make the page writable */
5155  	if (page_mapcount(old_page) == 1 && PageAnon(old_page)) {
5156  		page_move_anon_rmap(old_page, vma);
5157  		set_huge_ptep_writable(vma, haddr, ptep);
5158  		return 0;
5159  	}
5160  
5161  	/*
5162  	 * If the process that created a MAP_PRIVATE mapping is about to
5163  	 * perform a COW due to a shared page count, attempt to satisfy
5164  	 * the allocation without using the existing reserves. The pagecache
5165  	 * page is used to determine if the reserve at this address was
5166  	 * consumed or not. If reserves were used, a partial faulted mapping
5167  	 * at the time of fork() could consume its reserves on COW instead
5168  	 * of the full address range.
5169  	 */
5170  	if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
5171  			old_page != pagecache_page)
5172  		outside_reserve = 1;
5173  
5174  	get_page(old_page);
5175  
5176  	/*
5177  	 * Drop page table lock as buddy allocator may be called. It will
5178  	 * be acquired again before returning to the caller, as expected.
5179  	 */
5180  	spin_unlock(ptl);
5181  	new_page = alloc_huge_page(vma, haddr, outside_reserve);
5182  
5183  	if (IS_ERR(new_page)) {
5184  		/*
5185  		 * If a process owning a MAP_PRIVATE mapping fails to COW,
5186  		 * it is due to references held by a child and an insufficient
5187  		 * huge page pool. To guarantee the original mappers
5188  		 * reliability, unmap the page from child processes. The child
5189  		 * may get SIGKILLed if it later faults.
5190  		 */
5191  		if (outside_reserve) {
5192  			struct address_space *mapping = vma->vm_file->f_mapping;
5193  			pgoff_t idx;
5194  			u32 hash;
5195  
5196  			put_page(old_page);
5197  			BUG_ON(huge_pte_none(pte));
5198  			/*
5199  			 * Drop hugetlb_fault_mutex and i_mmap_rwsem before
5200  			 * unmapping.  unmapping needs to hold i_mmap_rwsem
5201  			 * in write mode.  Dropping i_mmap_rwsem in read mode
5202  			 * here is OK as COW mappings do not interact with
5203  			 * PMD sharing.
5204  			 *
5205  			 * Reacquire both after unmap operation.
5206  			 */
5207  			idx = vma_hugecache_offset(h, vma, haddr);
5208  			hash = hugetlb_fault_mutex_hash(mapping, idx);
5209  			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
5210  			i_mmap_unlock_read(mapping);
5211  
5212  			unmap_ref_private(mm, vma, old_page, haddr);
5213  
5214  			i_mmap_lock_read(mapping);
5215  			mutex_lock(&hugetlb_fault_mutex_table[hash]);
5216  			spin_lock(ptl);
5217  			ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
5218  			if (likely(ptep &&
5219  				   pte_same(huge_ptep_get(ptep), pte)))
5220  				goto retry_avoidcopy;
5221  			/*
5222  			 * race occurs while re-acquiring page table
5223  			 * lock, and our job is done.
5224  			 */
5225  			return 0;
5226  		}
5227  
5228  		ret = vmf_error(PTR_ERR(new_page));
5229  		goto out_release_old;
5230  	}
5231  
5232  	/*
5233  	 * When the original hugepage is shared one, it does not have
5234  	 * anon_vma prepared.
5235  	 */
5236  	if (unlikely(anon_vma_prepare(vma))) {
5237  		ret = VM_FAULT_OOM;
5238  		goto out_release_all;
5239  	}
5240  
5241  	copy_user_huge_page(new_page, old_page, address, vma,
5242  			    pages_per_huge_page(h));
5243  	__SetPageUptodate(new_page);
5244  
5245  	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, haddr,
5246  				haddr + huge_page_size(h));
5247  	mmu_notifier_invalidate_range_start(&range);
5248  
5249  	/*
5250  	 * Retake the page table lock to check for racing updates
5251  	 * before the page tables are altered
5252  	 */
5253  	spin_lock(ptl);
5254  	ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
5255  	if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
5256  		ClearHPageRestoreReserve(new_page);
5257  
5258  		/* Break COW */
5259  		huge_ptep_clear_flush(vma, haddr, ptep);
5260  		mmu_notifier_invalidate_range(mm, range.start, range.end);
5261  		page_remove_rmap(old_page, true);
5262  		hugepage_add_new_anon_rmap(new_page, vma, haddr);
5263  		set_huge_pte_at(mm, haddr, ptep,
5264  				make_huge_pte(vma, new_page, 1));
5265  		SetHPageMigratable(new_page);
5266  		/* Make the old page be freed below */
5267  		new_page = old_page;
5268  	}
5269  	spin_unlock(ptl);
5270  	mmu_notifier_invalidate_range_end(&range);
5271  out_release_all:
5272  	/* No restore in case of successful pagetable update (Break COW) */
5273  	if (new_page != old_page)
5274  		restore_reserve_on_error(h, vma, haddr, new_page);
5275  	put_page(new_page);
5276  out_release_old:
5277  	put_page(old_page);
5278  
5279  	spin_lock(ptl); /* Caller expects lock to be held */
5280  	return ret;
5281  }
5282  
5283  /* Return the pagecache page at a given address within a VMA */
5284  static struct page *hugetlbfs_pagecache_page(struct hstate *h,
5285  			struct vm_area_struct *vma, unsigned long address)
5286  {
5287  	struct address_space *mapping;
5288  	pgoff_t idx;
5289  
5290  	mapping = vma->vm_file->f_mapping;
5291  	idx = vma_hugecache_offset(h, vma, address);
5292  
5293  	return find_lock_page(mapping, idx);
5294  }
5295  
5296  /*
5297   * Return whether there is a pagecache page to back given address within VMA.
5298   * Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page.
5299   */
5300  static bool hugetlbfs_pagecache_present(struct hstate *h,
5301  			struct vm_area_struct *vma, unsigned long address)
5302  {
5303  	struct address_space *mapping;
5304  	pgoff_t idx;
5305  	struct page *page;
5306  
5307  	mapping = vma->vm_file->f_mapping;
5308  	idx = vma_hugecache_offset(h, vma, address);
5309  
5310  	page = find_get_page(mapping, idx);
5311  	if (page)
5312  		put_page(page);
5313  	return page != NULL;
5314  }
5315  
5316  int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
5317  			   pgoff_t idx)
5318  {
5319  	struct inode *inode = mapping->host;
5320  	struct hstate *h = hstate_inode(inode);
5321  	int err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
5322  
5323  	if (err)
5324  		return err;
5325  	ClearHPageRestoreReserve(page);
5326  
5327  	/*
5328  	 * set page dirty so that it will not be removed from cache/file
5329  	 * by non-hugetlbfs specific code paths.
5330  	 */
5331  	set_page_dirty(page);
5332  
5333  	spin_lock(&inode->i_lock);
5334  	inode->i_blocks += blocks_per_huge_page(h);
5335  	spin_unlock(&inode->i_lock);
5336  	return 0;
5337  }
5338  
5339  static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma,
5340  						  struct address_space *mapping,
5341  						  pgoff_t idx,
5342  						  unsigned int flags,
5343  						  unsigned long haddr,
5344  						  unsigned long reason)
5345  {
5346  	vm_fault_t ret;
5347  	u32 hash;
5348  	struct vm_fault vmf = {
5349  		.vma = vma,
5350  		.address = haddr,
5351  		.flags = flags,
5352  
5353  		/*
5354  		 * Hard to debug if it ends up being
5355  		 * used by a callee that assumes
5356  		 * something about the other
5357  		 * uninitialized fields... same as in
5358  		 * memory.c
5359  		 */
5360  	};
5361  
5362  	/*
5363  	 * hugetlb_fault_mutex and i_mmap_rwsem must be
5364  	 * dropped before handling userfault.  Reacquire
5365  	 * after handling fault to make calling code simpler.
5366  	 */
5367  	hash = hugetlb_fault_mutex_hash(mapping, idx);
5368  	mutex_unlock(&hugetlb_fault_mutex_table[hash]);
5369  	i_mmap_unlock_read(mapping);
5370  	ret = handle_userfault(&vmf, reason);
5371  	i_mmap_lock_read(mapping);
5372  	mutex_lock(&hugetlb_fault_mutex_table[hash]);
5373  
5374  	return ret;
5375  }
5376  
5377  static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
5378  			struct vm_area_struct *vma,
5379  			struct address_space *mapping, pgoff_t idx,
5380  			unsigned long address, pte_t *ptep, unsigned int flags)
5381  {
5382  	struct hstate *h = hstate_vma(vma);
5383  	vm_fault_t ret = VM_FAULT_SIGBUS;
5384  	int anon_rmap = 0;
5385  	unsigned long size;
5386  	struct page *page;
5387  	pte_t new_pte;
5388  	spinlock_t *ptl;
5389  	unsigned long haddr = address & huge_page_mask(h);
5390  	bool new_page, new_pagecache_page = false;
5391  
5392  	/*
5393  	 * Currently, we are forced to kill the process in the event the
5394  	 * original mapper has unmapped pages from the child due to a failed
5395  	 * COW. Warn that such a situation has occurred as it may not be obvious
5396  	 */
5397  	if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
5398  		pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n",
5399  			   current->pid);
5400  		return ret;
5401  	}
5402  
5403  	/*
5404  	 * We can not race with truncation due to holding i_mmap_rwsem.
5405  	 * i_size is modified when holding i_mmap_rwsem, so check here
5406  	 * once for faults beyond end of file.
5407  	 */
5408  	size = i_size_read(mapping->host) >> huge_page_shift(h);
5409  	if (idx >= size)
5410  		goto out;
5411  
5412  retry:
5413  	new_page = false;
5414  	page = find_lock_page(mapping, idx);
5415  	if (!page) {
5416  		/* Check for page in userfault range */
5417  		if (userfaultfd_missing(vma)) {
5418  			ret = hugetlb_handle_userfault(vma, mapping, idx,
5419  						       flags, haddr,
5420  						       VM_UFFD_MISSING);
5421  			goto out;
5422  		}
5423  
5424  		page = alloc_huge_page(vma, haddr, 0);
5425  		if (IS_ERR(page)) {
5426  			/*
5427  			 * Returning error will result in faulting task being
5428  			 * sent SIGBUS.  The hugetlb fault mutex prevents two
5429  			 * tasks from racing to fault in the same page which
5430  			 * could result in false unable to allocate errors.
5431  			 * Page migration does not take the fault mutex, but
5432  			 * does a clear then write of pte's under page table
5433  			 * lock.  Page fault code could race with migration,
5434  			 * notice the clear pte and try to allocate a page
5435  			 * here.  Before returning error, get ptl and make
5436  			 * sure there really is no pte entry.
5437  			 */
5438  			ptl = huge_pte_lock(h, mm, ptep);
5439  			ret = 0;
5440  			if (huge_pte_none(huge_ptep_get(ptep)))
5441  				ret = vmf_error(PTR_ERR(page));
5442  			spin_unlock(ptl);
5443  			goto out;
5444  		}
5445  		clear_huge_page(page, address, pages_per_huge_page(h));
5446  		__SetPageUptodate(page);
5447  		new_page = true;
5448  
5449  		if (vma->vm_flags & VM_MAYSHARE) {
5450  			int err = huge_add_to_page_cache(page, mapping, idx);
5451  			if (err) {
5452  				put_page(page);
5453  				if (err == -EEXIST)
5454  					goto retry;
5455  				goto out;
5456  			}
5457  			new_pagecache_page = true;
5458  		} else {
5459  			lock_page(page);
5460  			if (unlikely(anon_vma_prepare(vma))) {
5461  				ret = VM_FAULT_OOM;
5462  				goto backout_unlocked;
5463  			}
5464  			anon_rmap = 1;
5465  		}
5466  	} else {
5467  		/*
5468  		 * If memory error occurs between mmap() and fault, some process
5469  		 * don't have hwpoisoned swap entry for errored virtual address.
5470  		 * So we need to block hugepage fault by PG_hwpoison bit check.
5471  		 */
5472  		if (unlikely(PageHWPoison(page))) {
5473  			ret = VM_FAULT_HWPOISON_LARGE |
5474  				VM_FAULT_SET_HINDEX(hstate_index(h));
5475  			goto backout_unlocked;
5476  		}
5477  
5478  		/* Check for page in userfault range. */
5479  		if (userfaultfd_minor(vma)) {
5480  			unlock_page(page);
5481  			put_page(page);
5482  			ret = hugetlb_handle_userfault(vma, mapping, idx,
5483  						       flags, haddr,
5484  						       VM_UFFD_MINOR);
5485  			goto out;
5486  		}
5487  	}
5488  
5489  	/*
5490  	 * If we are going to COW a private mapping later, we examine the
5491  	 * pending reservations for this page now. This will ensure that
5492  	 * any allocations necessary to record that reservation occur outside
5493  	 * the spinlock.
5494  	 */
5495  	if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
5496  		if (vma_needs_reservation(h, vma, haddr) < 0) {
5497  			ret = VM_FAULT_OOM;
5498  			goto backout_unlocked;
5499  		}
5500  		/* Just decrements count, does not deallocate */
5501  		vma_end_reservation(h, vma, haddr);
5502  	}
5503  
5504  	ptl = huge_pte_lock(h, mm, ptep);
5505  	ret = 0;
5506  	if (!huge_pte_none(huge_ptep_get(ptep)))
5507  		goto backout;
5508  
5509  	if (anon_rmap) {
5510  		ClearHPageRestoreReserve(page);
5511  		hugepage_add_new_anon_rmap(page, vma, haddr);
5512  	} else
5513  		page_dup_rmap(page, true);
5514  	new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
5515  				&& (vma->vm_flags & VM_SHARED)));
5516  	set_huge_pte_at(mm, haddr, ptep, new_pte);
5517  
5518  	hugetlb_count_add(pages_per_huge_page(h), mm);
5519  	if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
5520  		/* Optimization, do the COW without a second fault */
5521  		ret = hugetlb_cow(mm, vma, address, ptep, page, ptl);
5522  	}
5523  
5524  	spin_unlock(ptl);
5525  
5526  	/*
5527  	 * Only set HPageMigratable in newly allocated pages.  Existing pages
5528  	 * found in the pagecache may not have HPageMigratableset if they have
5529  	 * been isolated for migration.
5530  	 */
5531  	if (new_page)
5532  		SetHPageMigratable(page);
5533  
5534  	unlock_page(page);
5535  out:
5536  	return ret;
5537  
5538  backout:
5539  	spin_unlock(ptl);
5540  backout_unlocked:
5541  	unlock_page(page);
5542  	/* restore reserve for newly allocated pages not in page cache */
5543  	if (new_page && !new_pagecache_page)
5544  		restore_reserve_on_error(h, vma, haddr, page);
5545  	put_page(page);
5546  	goto out;
5547  }
5548  
5549  #ifdef CONFIG_SMP
5550  u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)
5551  {
5552  	unsigned long key[2];
5553  	u32 hash;
5554  
5555  	key[0] = (unsigned long) mapping;
5556  	key[1] = idx;
5557  
5558  	hash = jhash2((u32 *)&key, sizeof(key)/(sizeof(u32)), 0);
5559  
5560  	return hash & (num_fault_mutexes - 1);
5561  }
5562  #else
5563  /*
5564   * For uniprocessor systems we always use a single mutex, so just
5565   * return 0 and avoid the hashing overhead.
5566   */
5567  u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)
5568  {
5569  	return 0;
5570  }
5571  #endif
5572  
5573  vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
5574  			unsigned long address, unsigned int flags)
5575  {
5576  	pte_t *ptep, entry;
5577  	spinlock_t *ptl;
5578  	vm_fault_t ret;
5579  	u32 hash;
5580  	pgoff_t idx;
5581  	struct page *page = NULL;
5582  	struct page *pagecache_page = NULL;
5583  	struct hstate *h = hstate_vma(vma);
5584  	struct address_space *mapping;
5585  	int need_wait_lock = 0;
5586  	unsigned long haddr = address & huge_page_mask(h);
5587  
5588  	ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
5589  	if (ptep) {
5590  		/*
5591  		 * Since we hold no locks, ptep could be stale.  That is
5592  		 * OK as we are only making decisions based on content and
5593  		 * not actually modifying content here.
5594  		 */
5595  		entry = huge_ptep_get(ptep);
5596  		if (unlikely(is_hugetlb_entry_migration(entry))) {
5597  			migration_entry_wait_huge(vma, mm, ptep);
5598  			return 0;
5599  		} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
5600  			return VM_FAULT_HWPOISON_LARGE |
5601  				VM_FAULT_SET_HINDEX(hstate_index(h));
5602  	}
5603  
5604  	/*
5605  	 * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold
5606  	 * until finished with ptep.  This serves two purposes:
5607  	 * 1) It prevents huge_pmd_unshare from being called elsewhere
5608  	 *    and making the ptep no longer valid.
5609  	 * 2) It synchronizes us with i_size modifications during truncation.
5610  	 *
5611  	 * ptep could have already be assigned via huge_pte_offset.  That
5612  	 * is OK, as huge_pte_alloc will return the same value unless
5613  	 * something has changed.
5614  	 */
5615  	mapping = vma->vm_file->f_mapping;
5616  	i_mmap_lock_read(mapping);
5617  	ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h));
5618  	if (!ptep) {
5619  		i_mmap_unlock_read(mapping);
5620  		return VM_FAULT_OOM;
5621  	}
5622  
5623  	/*
5624  	 * Serialize hugepage allocation and instantiation, so that we don't
5625  	 * get spurious allocation failures if two CPUs race to instantiate
5626  	 * the same page in the page cache.
5627  	 */
5628  	idx = vma_hugecache_offset(h, vma, haddr);
5629  	hash = hugetlb_fault_mutex_hash(mapping, idx);
5630  	mutex_lock(&hugetlb_fault_mutex_table[hash]);
5631  
5632  	entry = huge_ptep_get(ptep);
5633  	if (huge_pte_none(entry)) {
5634  		ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep, flags);
5635  		goto out_mutex;
5636  	}
5637  
5638  	ret = 0;
5639  
5640  	/*
5641  	 * entry could be a migration/hwpoison entry at this point, so this
5642  	 * check prevents the kernel from going below assuming that we have
5643  	 * an active hugepage in pagecache. This goto expects the 2nd page
5644  	 * fault, and is_hugetlb_entry_(migration|hwpoisoned) check will
5645  	 * properly handle it.
5646  	 */
5647  	if (!pte_present(entry))
5648  		goto out_mutex;
5649  
5650  	/*
5651  	 * If we are going to COW the mapping later, we examine the pending
5652  	 * reservations for this page now. This will ensure that any
5653  	 * allocations necessary to record that reservation occur outside the
5654  	 * spinlock. For private mappings, we also lookup the pagecache
5655  	 * page now as it is used to determine if a reservation has been
5656  	 * consumed.
5657  	 */
5658  	if ((flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) {
5659  		if (vma_needs_reservation(h, vma, haddr) < 0) {
5660  			ret = VM_FAULT_OOM;
5661  			goto out_mutex;
5662  		}
5663  		/* Just decrements count, does not deallocate */
5664  		vma_end_reservation(h, vma, haddr);
5665  
5666  		if (!(vma->vm_flags & VM_MAYSHARE))
5667  			pagecache_page = hugetlbfs_pagecache_page(h,
5668  								vma, haddr);
5669  	}
5670  
5671  	ptl = huge_pte_lock(h, mm, ptep);
5672  
5673  	/* Check for a racing update before calling hugetlb_cow */
5674  	if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
5675  		goto out_ptl;
5676  
5677  	/*
5678  	 * hugetlb_cow() requires page locks of pte_page(entry) and
5679  	 * pagecache_page, so here we need take the former one
5680  	 * when page != pagecache_page or !pagecache_page.
5681  	 */
5682  	page = pte_page(entry);
5683  	if (page != pagecache_page)
5684  		if (!trylock_page(page)) {
5685  			need_wait_lock = 1;
5686  			goto out_ptl;
5687  		}
5688  
5689  	get_page(page);
5690  
5691  	if (flags & FAULT_FLAG_WRITE) {
5692  		if (!huge_pte_write(entry)) {
5693  			ret = hugetlb_cow(mm, vma, address, ptep,
5694  					  pagecache_page, ptl);
5695  			goto out_put_page;
5696  		}
5697  		entry = huge_pte_mkdirty(entry);
5698  	}
5699  	entry = pte_mkyoung(entry);
5700  	if (huge_ptep_set_access_flags(vma, haddr, ptep, entry,
5701  						flags & FAULT_FLAG_WRITE))
5702  		update_mmu_cache(vma, haddr, ptep);
5703  out_put_page:
5704  	if (page != pagecache_page)
5705  		unlock_page(page);
5706  	put_page(page);
5707  out_ptl:
5708  	spin_unlock(ptl);
5709  
5710  	if (pagecache_page) {
5711  		unlock_page(pagecache_page);
5712  		put_page(pagecache_page);
5713  	}
5714  out_mutex:
5715  	mutex_unlock(&hugetlb_fault_mutex_table[hash]);
5716  	i_mmap_unlock_read(mapping);
5717  	/*
5718  	 * Generally it's safe to hold refcount during waiting page lock. But
5719  	 * here we just wait to defer the next page fault to avoid busy loop and
5720  	 * the page is not used after unlocked before returning from the current
5721  	 * page fault. So we are safe from accessing freed page, even if we wait
5722  	 * here without taking refcount.
5723  	 */
5724  	if (need_wait_lock)
5725  		wait_on_page_locked(page);
5726  	return ret;
5727  }
5728  
5729  #ifdef CONFIG_USERFAULTFD
5730  /*
5731   * Used by userfaultfd UFFDIO_COPY.  Based on mcopy_atomic_pte with
5732   * modifications for huge pages.
5733   */
5734  int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
5735  			    pte_t *dst_pte,
5736  			    struct vm_area_struct *dst_vma,
5737  			    unsigned long dst_addr,
5738  			    unsigned long src_addr,
5739  			    enum mcopy_atomic_mode mode,
5740  			    struct page **pagep)
5741  {
5742  	bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE);
5743  	struct hstate *h = hstate_vma(dst_vma);
5744  	struct address_space *mapping = dst_vma->vm_file->f_mapping;
5745  	pgoff_t idx = vma_hugecache_offset(h, dst_vma, dst_addr);
5746  	unsigned long size;
5747  	int vm_shared = dst_vma->vm_flags & VM_SHARED;
5748  	pte_t _dst_pte;
5749  	spinlock_t *ptl;
5750  	int ret = -ENOMEM;
5751  	struct page *page;
5752  	int writable;
5753  	bool page_in_pagecache = false;
5754  
5755  	if (is_continue) {
5756  		ret = -EFAULT;
5757  		page = find_lock_page(mapping, idx);
5758  		if (!page)
5759  			goto out;
5760  		page_in_pagecache = true;
5761  	} else if (!*pagep) {
5762  		/* If a page already exists, then it's UFFDIO_COPY for
5763  		 * a non-missing case. Return -EEXIST.
5764  		 */
5765  		if (vm_shared &&
5766  		    hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) {
5767  			ret = -EEXIST;
5768  			goto out;
5769  		}
5770  
5771  		page = alloc_huge_page(dst_vma, dst_addr, 0);
5772  		if (IS_ERR(page)) {
5773  			ret = -ENOMEM;
5774  			goto out;
5775  		}
5776  
5777  		ret = copy_huge_page_from_user(page,
5778  						(const void __user *) src_addr,
5779  						pages_per_huge_page(h), false);
5780  
5781  		/* fallback to copy_from_user outside mmap_lock */
5782  		if (unlikely(ret)) {
5783  			ret = -ENOENT;
5784  			/* Free the allocated page which may have
5785  			 * consumed a reservation.
5786  			 */
5787  			restore_reserve_on_error(h, dst_vma, dst_addr, page);
5788  			put_page(page);
5789  
5790  			/* Allocate a temporary page to hold the copied
5791  			 * contents.
5792  			 */
5793  			page = alloc_huge_page_vma(h, dst_vma, dst_addr);
5794  			if (!page) {
5795  				ret = -ENOMEM;
5796  				goto out;
5797  			}
5798  			*pagep = page;
5799  			/* Set the outparam pagep and return to the caller to
5800  			 * copy the contents outside the lock. Don't free the
5801  			 * page.
5802  			 */
5803  			goto out;
5804  		}
5805  	} else {
5806  		if (vm_shared &&
5807  		    hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) {
5808  			put_page(*pagep);
5809  			ret = -EEXIST;
5810  			*pagep = NULL;
5811  			goto out;
5812  		}
5813  
5814  		page = alloc_huge_page(dst_vma, dst_addr, 0);
5815  		if (IS_ERR(page)) {
5816  			ret = -ENOMEM;
5817  			*pagep = NULL;
5818  			goto out;
5819  		}
5820  		folio_copy(page_folio(page), page_folio(*pagep));
5821  		put_page(*pagep);
5822  		*pagep = NULL;
5823  	}
5824  
5825  	/*
5826  	 * The memory barrier inside __SetPageUptodate makes sure that
5827  	 * preceding stores to the page contents become visible before
5828  	 * the set_pte_at() write.
5829  	 */
5830  	__SetPageUptodate(page);
5831  
5832  	/* Add shared, newly allocated pages to the page cache. */
5833  	if (vm_shared && !is_continue) {
5834  		size = i_size_read(mapping->host) >> huge_page_shift(h);
5835  		ret = -EFAULT;
5836  		if (idx >= size)
5837  			goto out_release_nounlock;
5838  
5839  		/*
5840  		 * Serialization between remove_inode_hugepages() and
5841  		 * huge_add_to_page_cache() below happens through the
5842  		 * hugetlb_fault_mutex_table that here must be hold by
5843  		 * the caller.
5844  		 */
5845  		ret = huge_add_to_page_cache(page, mapping, idx);
5846  		if (ret)
5847  			goto out_release_nounlock;
5848  		page_in_pagecache = true;
5849  	}
5850  
5851  	ptl = huge_pte_lockptr(h, dst_mm, dst_pte);
5852  	spin_lock(ptl);
5853  
5854  	/*
5855  	 * Recheck the i_size after holding PT lock to make sure not
5856  	 * to leave any page mapped (as page_mapped()) beyond the end
5857  	 * of the i_size (remove_inode_hugepages() is strict about
5858  	 * enforcing that). If we bail out here, we'll also leave a
5859  	 * page in the radix tree in the vm_shared case beyond the end
5860  	 * of the i_size, but remove_inode_hugepages() will take care
5861  	 * of it as soon as we drop the hugetlb_fault_mutex_table.
5862  	 */
5863  	size = i_size_read(mapping->host) >> huge_page_shift(h);
5864  	ret = -EFAULT;
5865  	if (idx >= size)
5866  		goto out_release_unlock;
5867  
5868  	ret = -EEXIST;
5869  	if (!huge_pte_none(huge_ptep_get(dst_pte)))
5870  		goto out_release_unlock;
5871  
5872  	if (vm_shared) {
5873  		page_dup_rmap(page, true);
5874  	} else {
5875  		ClearHPageRestoreReserve(page);
5876  		hugepage_add_new_anon_rmap(page, dst_vma, dst_addr);
5877  	}
5878  
5879  	/* For CONTINUE on a non-shared VMA, don't set VM_WRITE for CoW. */
5880  	if (is_continue && !vm_shared)
5881  		writable = 0;
5882  	else
5883  		writable = dst_vma->vm_flags & VM_WRITE;
5884  
5885  	_dst_pte = make_huge_pte(dst_vma, page, writable);
5886  	if (writable)
5887  		_dst_pte = huge_pte_mkdirty(_dst_pte);
5888  	_dst_pte = pte_mkyoung(_dst_pte);
5889  
5890  	set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
5891  
5892  	(void)huge_ptep_set_access_flags(dst_vma, dst_addr, dst_pte, _dst_pte,
5893  					dst_vma->vm_flags & VM_WRITE);
5894  	hugetlb_count_add(pages_per_huge_page(h), dst_mm);
5895  
5896  	/* No need to invalidate - it was non-present before */
5897  	update_mmu_cache(dst_vma, dst_addr, dst_pte);
5898  
5899  	spin_unlock(ptl);
5900  	if (!is_continue)
5901  		SetHPageMigratable(page);
5902  	if (vm_shared || is_continue)
5903  		unlock_page(page);
5904  	ret = 0;
5905  out:
5906  	return ret;
5907  out_release_unlock:
5908  	spin_unlock(ptl);
5909  	if (vm_shared || is_continue)
5910  		unlock_page(page);
5911  out_release_nounlock:
5912  	if (!page_in_pagecache)
5913  		restore_reserve_on_error(h, dst_vma, dst_addr, page);
5914  	put_page(page);
5915  	goto out;
5916  }
5917  #endif /* CONFIG_USERFAULTFD */
5918  
5919  static void record_subpages_vmas(struct page *page, struct vm_area_struct *vma,
5920  				 int refs, struct page **pages,
5921  				 struct vm_area_struct **vmas)
5922  {
5923  	int nr;
5924  
5925  	for (nr = 0; nr < refs; nr++) {
5926  		if (likely(pages))
5927  			pages[nr] = mem_map_offset(page, nr);
5928  		if (vmas)
5929  			vmas[nr] = vma;
5930  	}
5931  }
5932  
5933  long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
5934  			 struct page **pages, struct vm_area_struct **vmas,
5935  			 unsigned long *position, unsigned long *nr_pages,
5936  			 long i, unsigned int flags, int *locked)
5937  {
5938  	unsigned long pfn_offset;
5939  	unsigned long vaddr = *position;
5940  	unsigned long remainder = *nr_pages;
5941  	struct hstate *h = hstate_vma(vma);
5942  	int err = -EFAULT, refs;
5943  
5944  	while (vaddr < vma->vm_end && remainder) {
5945  		pte_t *pte;
5946  		spinlock_t *ptl = NULL;
5947  		int absent;
5948  		struct page *page;
5949  
5950  		/*
5951  		 * If we have a pending SIGKILL, don't keep faulting pages and
5952  		 * potentially allocating memory.
5953  		 */
5954  		if (fatal_signal_pending(current)) {
5955  			remainder = 0;
5956  			break;
5957  		}
5958  
5959  		/*
5960  		 * Some archs (sparc64, sh*) have multiple pte_ts to
5961  		 * each hugepage.  We have to make sure we get the
5962  		 * first, for the page indexing below to work.
5963  		 *
5964  		 * Note that page table lock is not held when pte is null.
5965  		 */
5966  		pte = huge_pte_offset(mm, vaddr & huge_page_mask(h),
5967  				      huge_page_size(h));
5968  		if (pte)
5969  			ptl = huge_pte_lock(h, mm, pte);
5970  		absent = !pte || huge_pte_none(huge_ptep_get(pte));
5971  
5972  		/*
5973  		 * When coredumping, it suits get_dump_page if we just return
5974  		 * an error where there's an empty slot with no huge pagecache
5975  		 * to back it.  This way, we avoid allocating a hugepage, and
5976  		 * the sparse dumpfile avoids allocating disk blocks, but its
5977  		 * huge holes still show up with zeroes where they need to be.
5978  		 */
5979  		if (absent && (flags & FOLL_DUMP) &&
5980  		    !hugetlbfs_pagecache_present(h, vma, vaddr)) {
5981  			if (pte)
5982  				spin_unlock(ptl);
5983  			remainder = 0;
5984  			break;
5985  		}
5986  
5987  		/*
5988  		 * We need call hugetlb_fault for both hugepages under migration
5989  		 * (in which case hugetlb_fault waits for the migration,) and
5990  		 * hwpoisoned hugepages (in which case we need to prevent the
5991  		 * caller from accessing to them.) In order to do this, we use
5992  		 * here is_swap_pte instead of is_hugetlb_entry_migration and
5993  		 * is_hugetlb_entry_hwpoisoned. This is because it simply covers
5994  		 * both cases, and because we can't follow correct pages
5995  		 * directly from any kind of swap entries.
5996  		 */
5997  		if (absent || is_swap_pte(huge_ptep_get(pte)) ||
5998  		    ((flags & FOLL_WRITE) &&
5999  		      !huge_pte_write(huge_ptep_get(pte)))) {
6000  			vm_fault_t ret;
6001  			unsigned int fault_flags = 0;
6002  
6003  			if (pte)
6004  				spin_unlock(ptl);
6005  			if (flags & FOLL_WRITE)
6006  				fault_flags |= FAULT_FLAG_WRITE;
6007  			if (locked)
6008  				fault_flags |= FAULT_FLAG_ALLOW_RETRY |
6009  					FAULT_FLAG_KILLABLE;
6010  			if (flags & FOLL_NOWAIT)
6011  				fault_flags |= FAULT_FLAG_ALLOW_RETRY |
6012  					FAULT_FLAG_RETRY_NOWAIT;
6013  			if (flags & FOLL_TRIED) {
6014  				/*
6015  				 * Note: FAULT_FLAG_ALLOW_RETRY and
6016  				 * FAULT_FLAG_TRIED can co-exist
6017  				 */
6018  				fault_flags |= FAULT_FLAG_TRIED;
6019  			}
6020  			ret = hugetlb_fault(mm, vma, vaddr, fault_flags);
6021  			if (ret & VM_FAULT_ERROR) {
6022  				err = vm_fault_to_errno(ret, flags);
6023  				remainder = 0;
6024  				break;
6025  			}
6026  			if (ret & VM_FAULT_RETRY) {
6027  				if (locked &&
6028  				    !(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
6029  					*locked = 0;
6030  				*nr_pages = 0;
6031  				/*
6032  				 * VM_FAULT_RETRY must not return an
6033  				 * error, it will return zero
6034  				 * instead.
6035  				 *
6036  				 * No need to update "position" as the
6037  				 * caller will not check it after
6038  				 * *nr_pages is set to 0.
6039  				 */
6040  				return i;
6041  			}
6042  			continue;
6043  		}
6044  
6045  		pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
6046  		page = pte_page(huge_ptep_get(pte));
6047  
6048  		/*
6049  		 * If subpage information not requested, update counters
6050  		 * and skip the same_page loop below.
6051  		 */
6052  		if (!pages && !vmas && !pfn_offset &&
6053  		    (vaddr + huge_page_size(h) < vma->vm_end) &&
6054  		    (remainder >= pages_per_huge_page(h))) {
6055  			vaddr += huge_page_size(h);
6056  			remainder -= pages_per_huge_page(h);
6057  			i += pages_per_huge_page(h);
6058  			spin_unlock(ptl);
6059  			continue;
6060  		}
6061  
6062  		/* vaddr may not be aligned to PAGE_SIZE */
6063  		refs = min3(pages_per_huge_page(h) - pfn_offset, remainder,
6064  		    (vma->vm_end - ALIGN_DOWN(vaddr, PAGE_SIZE)) >> PAGE_SHIFT);
6065  
6066  		if (pages || vmas)
6067  			record_subpages_vmas(mem_map_offset(page, pfn_offset),
6068  					     vma, refs,
6069  					     likely(pages) ? pages + i : NULL,
6070  					     vmas ? vmas + i : NULL);
6071  
6072  		if (pages) {
6073  			/*
6074  			 * try_grab_compound_head() should always succeed here,
6075  			 * because: a) we hold the ptl lock, and b) we've just
6076  			 * checked that the huge page is present in the page
6077  			 * tables. If the huge page is present, then the tail
6078  			 * pages must also be present. The ptl prevents the
6079  			 * head page and tail pages from being rearranged in
6080  			 * any way. So this page must be available at this
6081  			 * point, unless the page refcount overflowed:
6082  			 */
6083  			if (WARN_ON_ONCE(!try_grab_compound_head(pages[i],
6084  								 refs,
6085  								 flags))) {
6086  				spin_unlock(ptl);
6087  				remainder = 0;
6088  				err = -ENOMEM;
6089  				break;
6090  			}
6091  		}
6092  
6093  		vaddr += (refs << PAGE_SHIFT);
6094  		remainder -= refs;
6095  		i += refs;
6096  
6097  		spin_unlock(ptl);
6098  	}
6099  	*nr_pages = remainder;
6100  	/*
6101  	 * setting position is actually required only if remainder is
6102  	 * not zero but it's faster not to add a "if (remainder)"
6103  	 * branch.
6104  	 */
6105  	*position = vaddr;
6106  
6107  	return i ? i : err;
6108  }
6109  
6110  unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
6111  		unsigned long address, unsigned long end, pgprot_t newprot)
6112  {
6113  	struct mm_struct *mm = vma->vm_mm;
6114  	unsigned long start = address;
6115  	pte_t *ptep;
6116  	pte_t pte;
6117  	struct hstate *h = hstate_vma(vma);
6118  	unsigned long pages = 0;
6119  	bool shared_pmd = false;
6120  	struct mmu_notifier_range range;
6121  
6122  	/*
6123  	 * In the case of shared PMDs, the area to flush could be beyond
6124  	 * start/end.  Set range.start/range.end to cover the maximum possible
6125  	 * range if PMD sharing is possible.
6126  	 */
6127  	mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA,
6128  				0, vma, mm, start, end);
6129  	adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
6130  
6131  	BUG_ON(address >= end);
6132  	flush_cache_range(vma, range.start, range.end);
6133  
6134  	mmu_notifier_invalidate_range_start(&range);
6135  	i_mmap_lock_write(vma->vm_file->f_mapping);
6136  	for (; address < end; address += huge_page_size(h)) {
6137  		spinlock_t *ptl;
6138  		ptep = huge_pte_offset(mm, address, huge_page_size(h));
6139  		if (!ptep)
6140  			continue;
6141  		ptl = huge_pte_lock(h, mm, ptep);
6142  		if (huge_pmd_unshare(mm, vma, &address, ptep)) {
6143  			pages++;
6144  			spin_unlock(ptl);
6145  			shared_pmd = true;
6146  			continue;
6147  		}
6148  		pte = huge_ptep_get(ptep);
6149  		if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
6150  			spin_unlock(ptl);
6151  			continue;
6152  		}
6153  		if (unlikely(is_hugetlb_entry_migration(pte))) {
6154  			swp_entry_t entry = pte_to_swp_entry(pte);
6155  
6156  			if (is_writable_migration_entry(entry)) {
6157  				pte_t newpte;
6158  
6159  				entry = make_readable_migration_entry(
6160  							swp_offset(entry));
6161  				newpte = swp_entry_to_pte(entry);
6162  				set_huge_swap_pte_at(mm, address, ptep,
6163  						     newpte, huge_page_size(h));
6164  				pages++;
6165  			}
6166  			spin_unlock(ptl);
6167  			continue;
6168  		}
6169  		if (!huge_pte_none(pte)) {
6170  			pte_t old_pte;
6171  			unsigned int shift = huge_page_shift(hstate_vma(vma));
6172  
6173  			old_pte = huge_ptep_modify_prot_start(vma, address, ptep);
6174  			pte = pte_mkhuge(huge_pte_modify(old_pte, newprot));
6175  			pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
6176  			huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte);
6177  			pages++;
6178  		}
6179  		spin_unlock(ptl);
6180  	}
6181  	/*
6182  	 * Must flush TLB before releasing i_mmap_rwsem: x86's huge_pmd_unshare
6183  	 * may have cleared our pud entry and done put_page on the page table:
6184  	 * once we release i_mmap_rwsem, another task can do the final put_page
6185  	 * and that page table be reused and filled with junk.  If we actually
6186  	 * did unshare a page of pmds, flush the range corresponding to the pud.
6187  	 */
6188  	if (shared_pmd)
6189  		flush_hugetlb_tlb_range(vma, range.start, range.end);
6190  	else
6191  		flush_hugetlb_tlb_range(vma, start, end);
6192  	/*
6193  	 * No need to call mmu_notifier_invalidate_range() we are downgrading
6194  	 * page table protection not changing it to point to a new page.
6195  	 *
6196  	 * See Documentation/vm/mmu_notifier.rst
6197  	 */
6198  	i_mmap_unlock_write(vma->vm_file->f_mapping);
6199  	mmu_notifier_invalidate_range_end(&range);
6200  
6201  	return pages << h->order;
6202  }
6203  
6204  /* Return true if reservation was successful, false otherwise.  */
6205  bool hugetlb_reserve_pages(struct inode *inode,
6206  					long from, long to,
6207  					struct vm_area_struct *vma,
6208  					vm_flags_t vm_flags)
6209  {
6210  	long chg, add = -1;
6211  	struct hstate *h = hstate_inode(inode);
6212  	struct hugepage_subpool *spool = subpool_inode(inode);
6213  	struct resv_map *resv_map;
6214  	struct hugetlb_cgroup *h_cg = NULL;
6215  	long gbl_reserve, regions_needed = 0;
6216  
6217  	/* This should never happen */
6218  	if (from > to) {
6219  		VM_WARN(1, "%s called with a negative range\n", __func__);
6220  		return false;
6221  	}
6222  
6223  	/*
6224  	 * Only apply hugepage reservation if asked. At fault time, an
6225  	 * attempt will be made for VM_NORESERVE to allocate a page
6226  	 * without using reserves
6227  	 */
6228  	if (vm_flags & VM_NORESERVE)
6229  		return true;
6230  
6231  	/*
6232  	 * Shared mappings base their reservation on the number of pages that
6233  	 * are already allocated on behalf of the file. Private mappings need
6234  	 * to reserve the full area even if read-only as mprotect() may be
6235  	 * called to make the mapping read-write. Assume !vma is a shm mapping
6236  	 */
6237  	if (!vma || vma->vm_flags & VM_MAYSHARE) {
6238  		/*
6239  		 * resv_map can not be NULL as hugetlb_reserve_pages is only
6240  		 * called for inodes for which resv_maps were created (see
6241  		 * hugetlbfs_get_inode).
6242  		 */
6243  		resv_map = inode_resv_map(inode);
6244  
6245  		chg = region_chg(resv_map, from, to, &regions_needed);
6246  
6247  	} else {
6248  		/* Private mapping. */
6249  		resv_map = resv_map_alloc();
6250  		if (!resv_map)
6251  			return false;
6252  
6253  		chg = to - from;
6254  
6255  		set_vma_resv_map(vma, resv_map);
6256  		set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
6257  	}
6258  
6259  	if (chg < 0)
6260  		goto out_err;
6261  
6262  	if (hugetlb_cgroup_charge_cgroup_rsvd(hstate_index(h),
6263  				chg * pages_per_huge_page(h), &h_cg) < 0)
6264  		goto out_err;
6265  
6266  	if (vma && !(vma->vm_flags & VM_MAYSHARE) && h_cg) {
6267  		/* For private mappings, the hugetlb_cgroup uncharge info hangs
6268  		 * of the resv_map.
6269  		 */
6270  		resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, h_cg, h);
6271  	}
6272  
6273  	/*
6274  	 * There must be enough pages in the subpool for the mapping. If
6275  	 * the subpool has a minimum size, there may be some global
6276  	 * reservations already in place (gbl_reserve).
6277  	 */
6278  	gbl_reserve = hugepage_subpool_get_pages(spool, chg);
6279  	if (gbl_reserve < 0)
6280  		goto out_uncharge_cgroup;
6281  
6282  	/*
6283  	 * Check enough hugepages are available for the reservation.
6284  	 * Hand the pages back to the subpool if there are not
6285  	 */
6286  	if (hugetlb_acct_memory(h, gbl_reserve) < 0)
6287  		goto out_put_pages;
6288  
6289  	/*
6290  	 * Account for the reservations made. Shared mappings record regions
6291  	 * that have reservations as they are shared by multiple VMAs.
6292  	 * When the last VMA disappears, the region map says how much
6293  	 * the reservation was and the page cache tells how much of
6294  	 * the reservation was consumed. Private mappings are per-VMA and
6295  	 * only the consumed reservations are tracked. When the VMA
6296  	 * disappears, the original reservation is the VMA size and the
6297  	 * consumed reservations are stored in the map. Hence, nothing
6298  	 * else has to be done for private mappings here
6299  	 */
6300  	if (!vma || vma->vm_flags & VM_MAYSHARE) {
6301  		add = region_add(resv_map, from, to, regions_needed, h, h_cg);
6302  
6303  		if (unlikely(add < 0)) {
6304  			hugetlb_acct_memory(h, -gbl_reserve);
6305  			goto out_put_pages;
6306  		} else if (unlikely(chg > add)) {
6307  			/*
6308  			 * pages in this range were added to the reserve
6309  			 * map between region_chg and region_add.  This
6310  			 * indicates a race with alloc_huge_page.  Adjust
6311  			 * the subpool and reserve counts modified above
6312  			 * based on the difference.
6313  			 */
6314  			long rsv_adjust;
6315  
6316  			/*
6317  			 * hugetlb_cgroup_uncharge_cgroup_rsvd() will put the
6318  			 * reference to h_cg->css. See comment below for detail.
6319  			 */
6320  			hugetlb_cgroup_uncharge_cgroup_rsvd(
6321  				hstate_index(h),
6322  				(chg - add) * pages_per_huge_page(h), h_cg);
6323  
6324  			rsv_adjust = hugepage_subpool_put_pages(spool,
6325  								chg - add);
6326  			hugetlb_acct_memory(h, -rsv_adjust);
6327  		} else if (h_cg) {
6328  			/*
6329  			 * The file_regions will hold their own reference to
6330  			 * h_cg->css. So we should release the reference held
6331  			 * via hugetlb_cgroup_charge_cgroup_rsvd() when we are
6332  			 * done.
6333  			 */
6334  			hugetlb_cgroup_put_rsvd_cgroup(h_cg);
6335  		}
6336  	}
6337  	return true;
6338  
6339  out_put_pages:
6340  	/* put back original number of pages, chg */
6341  	(void)hugepage_subpool_put_pages(spool, chg);
6342  out_uncharge_cgroup:
6343  	hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h),
6344  					    chg * pages_per_huge_page(h), h_cg);
6345  out_err:
6346  	if (!vma || vma->vm_flags & VM_MAYSHARE)
6347  		/* Only call region_abort if the region_chg succeeded but the
6348  		 * region_add failed or didn't run.
6349  		 */
6350  		if (chg >= 0 && add < 0)
6351  			region_abort(resv_map, from, to, regions_needed);
6352  	if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
6353  		kref_put(&resv_map->refs, resv_map_release);
6354  	return false;
6355  }
6356  
6357  long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
6358  								long freed)
6359  {
6360  	struct hstate *h = hstate_inode(inode);
6361  	struct resv_map *resv_map = inode_resv_map(inode);
6362  	long chg = 0;
6363  	struct hugepage_subpool *spool = subpool_inode(inode);
6364  	long gbl_reserve;
6365  
6366  	/*
6367  	 * Since this routine can be called in the evict inode path for all
6368  	 * hugetlbfs inodes, resv_map could be NULL.
6369  	 */
6370  	if (resv_map) {
6371  		chg = region_del(resv_map, start, end);
6372  		/*
6373  		 * region_del() can fail in the rare case where a region
6374  		 * must be split and another region descriptor can not be
6375  		 * allocated.  If end == LONG_MAX, it will not fail.
6376  		 */
6377  		if (chg < 0)
6378  			return chg;
6379  	}
6380  
6381  	spin_lock(&inode->i_lock);
6382  	inode->i_blocks -= (blocks_per_huge_page(h) * freed);
6383  	spin_unlock(&inode->i_lock);
6384  
6385  	/*
6386  	 * If the subpool has a minimum size, the number of global
6387  	 * reservations to be released may be adjusted.
6388  	 *
6389  	 * Note that !resv_map implies freed == 0. So (chg - freed)
6390  	 * won't go negative.
6391  	 */
6392  	gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed));
6393  	hugetlb_acct_memory(h, -gbl_reserve);
6394  
6395  	return 0;
6396  }
6397  
6398  #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
6399  static unsigned long page_table_shareable(struct vm_area_struct *svma,
6400  				struct vm_area_struct *vma,
6401  				unsigned long addr, pgoff_t idx)
6402  {
6403  	unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) +
6404  				svma->vm_start;
6405  	unsigned long sbase = saddr & PUD_MASK;
6406  	unsigned long s_end = sbase + PUD_SIZE;
6407  
6408  	/* Allow segments to share if only one is marked locked */
6409  	unsigned long vm_flags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
6410  	unsigned long svm_flags = svma->vm_flags & VM_LOCKED_CLEAR_MASK;
6411  
6412  	/*
6413  	 * match the virtual addresses, permission and the alignment of the
6414  	 * page table page.
6415  	 */
6416  	if (pmd_index(addr) != pmd_index(saddr) ||
6417  	    vm_flags != svm_flags ||
6418  	    !range_in_vma(svma, sbase, s_end))
6419  		return 0;
6420  
6421  	return saddr;
6422  }
6423  
6424  static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
6425  {
6426  	unsigned long base = addr & PUD_MASK;
6427  	unsigned long end = base + PUD_SIZE;
6428  
6429  	/*
6430  	 * check on proper vm_flags and page table alignment
6431  	 */
6432  	if (vma->vm_flags & VM_MAYSHARE && range_in_vma(vma, base, end))
6433  		return true;
6434  	return false;
6435  }
6436  
6437  bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
6438  {
6439  #ifdef CONFIG_USERFAULTFD
6440  	if (uffd_disable_huge_pmd_share(vma))
6441  		return false;
6442  #endif
6443  	return vma_shareable(vma, addr);
6444  }
6445  
6446  /*
6447   * Determine if start,end range within vma could be mapped by shared pmd.
6448   * If yes, adjust start and end to cover range associated with possible
6449   * shared pmd mappings.
6450   */
6451  void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
6452  				unsigned long *start, unsigned long *end)
6453  {
6454  	unsigned long v_start = ALIGN(vma->vm_start, PUD_SIZE),
6455  		v_end = ALIGN_DOWN(vma->vm_end, PUD_SIZE);
6456  
6457  	/*
6458  	 * vma needs to span at least one aligned PUD size, and the range
6459  	 * must be at least partially within in.
6460  	 */
6461  	if (!(vma->vm_flags & VM_MAYSHARE) || !(v_end > v_start) ||
6462  		(*end <= v_start) || (*start >= v_end))
6463  		return;
6464  
6465  	/* Extend the range to be PUD aligned for a worst case scenario */
6466  	if (*start > v_start)
6467  		*start = ALIGN_DOWN(*start, PUD_SIZE);
6468  
6469  	if (*end < v_end)
6470  		*end = ALIGN(*end, PUD_SIZE);
6471  }
6472  
6473  /*
6474   * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
6475   * and returns the corresponding pte. While this is not necessary for the
6476   * !shared pmd case because we can allocate the pmd later as well, it makes the
6477   * code much cleaner.
6478   *
6479   * This routine must be called with i_mmap_rwsem held in at least read mode if
6480   * sharing is possible.  For hugetlbfs, this prevents removal of any page
6481   * table entries associated with the address space.  This is important as we
6482   * are setting up sharing based on existing page table entries (mappings).
6483   */
6484  pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
6485  		      unsigned long addr, pud_t *pud)
6486  {
6487  	struct address_space *mapping = vma->vm_file->f_mapping;
6488  	pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
6489  			vma->vm_pgoff;
6490  	struct vm_area_struct *svma;
6491  	unsigned long saddr;
6492  	pte_t *spte = NULL;
6493  	pte_t *pte;
6494  	spinlock_t *ptl;
6495  
6496  	i_mmap_assert_locked(mapping);
6497  	vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
6498  		if (svma == vma)
6499  			continue;
6500  
6501  		saddr = page_table_shareable(svma, vma, addr, idx);
6502  		if (saddr) {
6503  			spte = huge_pte_offset(svma->vm_mm, saddr,
6504  					       vma_mmu_pagesize(svma));
6505  			if (spte) {
6506  				get_page(virt_to_page(spte));
6507  				break;
6508  			}
6509  		}
6510  	}
6511  
6512  	if (!spte)
6513  		goto out;
6514  
6515  	ptl = huge_pte_lock(hstate_vma(vma), mm, spte);
6516  	if (pud_none(*pud)) {
6517  		pud_populate(mm, pud,
6518  				(pmd_t *)((unsigned long)spte & PAGE_MASK));
6519  		mm_inc_nr_pmds(mm);
6520  	} else {
6521  		put_page(virt_to_page(spte));
6522  	}
6523  	spin_unlock(ptl);
6524  out:
6525  	pte = (pte_t *)pmd_alloc(mm, pud, addr);
6526  	return pte;
6527  }
6528  
6529  /*
6530   * unmap huge page backed by shared pte.
6531   *
6532   * Hugetlb pte page is ref counted at the time of mapping.  If pte is shared
6533   * indicated by page_count > 1, unmap is achieved by clearing pud and
6534   * decrementing the ref count. If count == 1, the pte page is not shared.
6535   *
6536   * Called with page table lock held and i_mmap_rwsem held in write mode.
6537   *
6538   * returns: 1 successfully unmapped a shared pte page
6539   *	    0 the underlying pte page is not shared, or it is the last user
6540   */
6541  int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
6542  					unsigned long *addr, pte_t *ptep)
6543  {
6544  	pgd_t *pgd = pgd_offset(mm, *addr);
6545  	p4d_t *p4d = p4d_offset(pgd, *addr);
6546  	pud_t *pud = pud_offset(p4d, *addr);
6547  
6548  	i_mmap_assert_write_locked(vma->vm_file->f_mapping);
6549  	BUG_ON(page_count(virt_to_page(ptep)) == 0);
6550  	if (page_count(virt_to_page(ptep)) == 1)
6551  		return 0;
6552  
6553  	pud_clear(pud);
6554  	put_page(virt_to_page(ptep));
6555  	mm_dec_nr_pmds(mm);
6556  	*addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
6557  	return 1;
6558  }
6559  
6560  #else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
6561  pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
6562  		      unsigned long addr, pud_t *pud)
6563  {
6564  	return NULL;
6565  }
6566  
6567  int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
6568  				unsigned long *addr, pte_t *ptep)
6569  {
6570  	return 0;
6571  }
6572  
6573  void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
6574  				unsigned long *start, unsigned long *end)
6575  {
6576  }
6577  
6578  bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
6579  {
6580  	return false;
6581  }
6582  #endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
6583  
6584  #ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
6585  pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
6586  			unsigned long addr, unsigned long sz)
6587  {
6588  	pgd_t *pgd;
6589  	p4d_t *p4d;
6590  	pud_t *pud;
6591  	pte_t *pte = NULL;
6592  
6593  	pgd = pgd_offset(mm, addr);
6594  	p4d = p4d_alloc(mm, pgd, addr);
6595  	if (!p4d)
6596  		return NULL;
6597  	pud = pud_alloc(mm, p4d, addr);
6598  	if (pud) {
6599  		if (sz == PUD_SIZE) {
6600  			pte = (pte_t *)pud;
6601  		} else {
6602  			BUG_ON(sz != PMD_SIZE);
6603  			if (want_pmd_share(vma, addr) && pud_none(*pud))
6604  				pte = huge_pmd_share(mm, vma, addr, pud);
6605  			else
6606  				pte = (pte_t *)pmd_alloc(mm, pud, addr);
6607  		}
6608  	}
6609  	BUG_ON(pte && pte_present(*pte) && !pte_huge(*pte));
6610  
6611  	return pte;
6612  }
6613  
6614  /*
6615   * huge_pte_offset() - Walk the page table to resolve the hugepage
6616   * entry at address @addr
6617   *
6618   * Return: Pointer to page table entry (PUD or PMD) for
6619   * address @addr, or NULL if a !p*d_present() entry is encountered and the
6620   * size @sz doesn't match the hugepage size at this level of the page
6621   * table.
6622   */
6623  pte_t *huge_pte_offset(struct mm_struct *mm,
6624  		       unsigned long addr, unsigned long sz)
6625  {
6626  	pgd_t *pgd;
6627  	p4d_t *p4d;
6628  	pud_t *pud;
6629  	pmd_t *pmd;
6630  
6631  	pgd = pgd_offset(mm, addr);
6632  	if (!pgd_present(*pgd))
6633  		return NULL;
6634  	p4d = p4d_offset(pgd, addr);
6635  	if (!p4d_present(*p4d))
6636  		return NULL;
6637  
6638  	pud = pud_offset(p4d, addr);
6639  	if (sz == PUD_SIZE)
6640  		/* must be pud huge, non-present or none */
6641  		return (pte_t *)pud;
6642  	if (!pud_present(*pud))
6643  		return NULL;
6644  	/* must have a valid entry and size to go further */
6645  
6646  	pmd = pmd_offset(pud, addr);
6647  	/* must be pmd huge, non-present or none */
6648  	return (pte_t *)pmd;
6649  }
6650  
6651  #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
6652  
6653  /*
6654   * These functions are overwritable if your architecture needs its own
6655   * behavior.
6656   */
6657  struct page * __weak
6658  follow_huge_addr(struct mm_struct *mm, unsigned long address,
6659  			      int write)
6660  {
6661  	return ERR_PTR(-EINVAL);
6662  }
6663  
6664  struct page * __weak
6665  follow_huge_pd(struct vm_area_struct *vma,
6666  	       unsigned long address, hugepd_t hpd, int flags, int pdshift)
6667  {
6668  	WARN(1, "hugepd follow called with no support for hugepage directory format\n");
6669  	return NULL;
6670  }
6671  
6672  struct page * __weak
6673  follow_huge_pmd(struct mm_struct *mm, unsigned long address,
6674  		pmd_t *pmd, int flags)
6675  {
6676  	struct page *page = NULL;
6677  	spinlock_t *ptl;
6678  	pte_t pte;
6679  
6680  	/* FOLL_GET and FOLL_PIN are mutually exclusive. */
6681  	if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
6682  			 (FOLL_PIN | FOLL_GET)))
6683  		return NULL;
6684  
6685  retry:
6686  	ptl = pmd_lockptr(mm, pmd);
6687  	spin_lock(ptl);
6688  	/*
6689  	 * make sure that the address range covered by this pmd is not
6690  	 * unmapped from other threads.
6691  	 */
6692  	if (!pmd_huge(*pmd))
6693  		goto out;
6694  	pte = huge_ptep_get((pte_t *)pmd);
6695  	if (pte_present(pte)) {
6696  		page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT);
6697  		/*
6698  		 * try_grab_page() should always succeed here, because: a) we
6699  		 * hold the pmd (ptl) lock, and b) we've just checked that the
6700  		 * huge pmd (head) page is present in the page tables. The ptl
6701  		 * prevents the head page and tail pages from being rearranged
6702  		 * in any way. So this page must be available at this point,
6703  		 * unless the page refcount overflowed:
6704  		 */
6705  		if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
6706  			page = NULL;
6707  			goto out;
6708  		}
6709  	} else {
6710  		if (is_hugetlb_entry_migration(pte)) {
6711  			spin_unlock(ptl);
6712  			__migration_entry_wait(mm, (pte_t *)pmd, ptl);
6713  			goto retry;
6714  		}
6715  		/*
6716  		 * hwpoisoned entry is treated as no_page_table in
6717  		 * follow_page_mask().
6718  		 */
6719  	}
6720  out:
6721  	spin_unlock(ptl);
6722  	return page;
6723  }
6724  
6725  struct page * __weak
6726  follow_huge_pud(struct mm_struct *mm, unsigned long address,
6727  		pud_t *pud, int flags)
6728  {
6729  	if (flags & (FOLL_GET | FOLL_PIN))
6730  		return NULL;
6731  
6732  	return pte_page(*(pte_t *)pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
6733  }
6734  
6735  struct page * __weak
6736  follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int flags)
6737  {
6738  	if (flags & (FOLL_GET | FOLL_PIN))
6739  		return NULL;
6740  
6741  	return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT);
6742  }
6743  
6744  bool isolate_huge_page(struct page *page, struct list_head *list)
6745  {
6746  	bool ret = true;
6747  
6748  	spin_lock_irq(&hugetlb_lock);
6749  	if (!PageHeadHuge(page) ||
6750  	    !HPageMigratable(page) ||
6751  	    !get_page_unless_zero(page)) {
6752  		ret = false;
6753  		goto unlock;
6754  	}
6755  	ClearHPageMigratable(page);
6756  	list_move_tail(&page->lru, list);
6757  unlock:
6758  	spin_unlock_irq(&hugetlb_lock);
6759  	return ret;
6760  }
6761  
6762  int get_hwpoison_huge_page(struct page *page, bool *hugetlb)
6763  {
6764  	int ret = 0;
6765  
6766  	*hugetlb = false;
6767  	spin_lock_irq(&hugetlb_lock);
6768  	if (PageHeadHuge(page)) {
6769  		*hugetlb = true;
6770  		if (HPageFreed(page) || HPageMigratable(page))
6771  			ret = get_page_unless_zero(page);
6772  		else
6773  			ret = -EBUSY;
6774  	}
6775  	spin_unlock_irq(&hugetlb_lock);
6776  	return ret;
6777  }
6778  
6779  void putback_active_hugepage(struct page *page)
6780  {
6781  	spin_lock_irq(&hugetlb_lock);
6782  	SetHPageMigratable(page);
6783  	list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist);
6784  	spin_unlock_irq(&hugetlb_lock);
6785  	put_page(page);
6786  }
6787  
6788  void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason)
6789  {
6790  	struct hstate *h = page_hstate(oldpage);
6791  
6792  	hugetlb_cgroup_migrate(oldpage, newpage);
6793  	set_page_owner_migrate_reason(newpage, reason);
6794  
6795  	/*
6796  	 * transfer temporary state of the new huge page. This is
6797  	 * reverse to other transitions because the newpage is going to
6798  	 * be final while the old one will be freed so it takes over
6799  	 * the temporary status.
6800  	 *
6801  	 * Also note that we have to transfer the per-node surplus state
6802  	 * here as well otherwise the global surplus count will not match
6803  	 * the per-node's.
6804  	 */
6805  	if (HPageTemporary(newpage)) {
6806  		int old_nid = page_to_nid(oldpage);
6807  		int new_nid = page_to_nid(newpage);
6808  
6809  		SetHPageTemporary(oldpage);
6810  		ClearHPageTemporary(newpage);
6811  
6812  		/*
6813  		 * There is no need to transfer the per-node surplus state
6814  		 * when we do not cross the node.
6815  		 */
6816  		if (new_nid == old_nid)
6817  			return;
6818  		spin_lock_irq(&hugetlb_lock);
6819  		if (h->surplus_huge_pages_node[old_nid]) {
6820  			h->surplus_huge_pages_node[old_nid]--;
6821  			h->surplus_huge_pages_node[new_nid]++;
6822  		}
6823  		spin_unlock_irq(&hugetlb_lock);
6824  	}
6825  }
6826  
6827  /*
6828   * This function will unconditionally remove all the shared pmd pgtable entries
6829   * within the specific vma for a hugetlbfs memory range.
6830   */
6831  void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
6832  {
6833  	struct hstate *h = hstate_vma(vma);
6834  	unsigned long sz = huge_page_size(h);
6835  	struct mm_struct *mm = vma->vm_mm;
6836  	struct mmu_notifier_range range;
6837  	unsigned long address, start, end;
6838  	spinlock_t *ptl;
6839  	pte_t *ptep;
6840  
6841  	if (!(vma->vm_flags & VM_MAYSHARE))
6842  		return;
6843  
6844  	start = ALIGN(vma->vm_start, PUD_SIZE);
6845  	end = ALIGN_DOWN(vma->vm_end, PUD_SIZE);
6846  
6847  	if (start >= end)
6848  		return;
6849  
6850  	/*
6851  	 * No need to call adjust_range_if_pmd_sharing_possible(), because
6852  	 * we have already done the PUD_SIZE alignment.
6853  	 */
6854  	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
6855  				start, end);
6856  	mmu_notifier_invalidate_range_start(&range);
6857  	i_mmap_lock_write(vma->vm_file->f_mapping);
6858  	for (address = start; address < end; address += PUD_SIZE) {
6859  		unsigned long tmp = address;
6860  
6861  		ptep = huge_pte_offset(mm, address, sz);
6862  		if (!ptep)
6863  			continue;
6864  		ptl = huge_pte_lock(h, mm, ptep);
6865  		/* We don't want 'address' to be changed */
6866  		huge_pmd_unshare(mm, vma, &tmp, ptep);
6867  		spin_unlock(ptl);
6868  	}
6869  	flush_hugetlb_tlb_range(vma, start, end);
6870  	i_mmap_unlock_write(vma->vm_file->f_mapping);
6871  	/*
6872  	 * No need to call mmu_notifier_invalidate_range(), see
6873  	 * Documentation/vm/mmu_notifier.rst.
6874  	 */
6875  	mmu_notifier_invalidate_range_end(&range);
6876  }
6877  
6878  #ifdef CONFIG_CMA
6879  static bool cma_reserve_called __initdata;
6880  
6881  static int __init cmdline_parse_hugetlb_cma(char *p)
6882  {
6883  	int nid, count = 0;
6884  	unsigned long tmp;
6885  	char *s = p;
6886  
6887  	while (*s) {
6888  		if (sscanf(s, "%lu%n", &tmp, &count) != 1)
6889  			break;
6890  
6891  		if (s[count] == ':') {
6892  			nid = tmp;
6893  			if (nid < 0 || nid >= MAX_NUMNODES)
6894  				break;
6895  
6896  			s += count + 1;
6897  			tmp = memparse(s, &s);
6898  			hugetlb_cma_size_in_node[nid] = tmp;
6899  			hugetlb_cma_size += tmp;
6900  
6901  			/*
6902  			 * Skip the separator if have one, otherwise
6903  			 * break the parsing.
6904  			 */
6905  			if (*s == ',')
6906  				s++;
6907  			else
6908  				break;
6909  		} else {
6910  			hugetlb_cma_size = memparse(p, &p);
6911  			break;
6912  		}
6913  	}
6914  
6915  	return 0;
6916  }
6917  
6918  early_param("hugetlb_cma", cmdline_parse_hugetlb_cma);
6919  
6920  void __init hugetlb_cma_reserve(int order)
6921  {
6922  	unsigned long size, reserved, per_node;
6923  	bool node_specific_cma_alloc = false;
6924  	int nid;
6925  
6926  	cma_reserve_called = true;
6927  
6928  	if (!hugetlb_cma_size)
6929  		return;
6930  
6931  	for (nid = 0; nid < MAX_NUMNODES; nid++) {
6932  		if (hugetlb_cma_size_in_node[nid] == 0)
6933  			continue;
6934  
6935  		if (!node_state(nid, N_ONLINE)) {
6936  			pr_warn("hugetlb_cma: invalid node %d specified\n", nid);
6937  			hugetlb_cma_size -= hugetlb_cma_size_in_node[nid];
6938  			hugetlb_cma_size_in_node[nid] = 0;
6939  			continue;
6940  		}
6941  
6942  		if (hugetlb_cma_size_in_node[nid] < (PAGE_SIZE << order)) {
6943  			pr_warn("hugetlb_cma: cma area of node %d should be at least %lu MiB\n",
6944  				nid, (PAGE_SIZE << order) / SZ_1M);
6945  			hugetlb_cma_size -= hugetlb_cma_size_in_node[nid];
6946  			hugetlb_cma_size_in_node[nid] = 0;
6947  		} else {
6948  			node_specific_cma_alloc = true;
6949  		}
6950  	}
6951  
6952  	/* Validate the CMA size again in case some invalid nodes specified. */
6953  	if (!hugetlb_cma_size)
6954  		return;
6955  
6956  	if (hugetlb_cma_size < (PAGE_SIZE << order)) {
6957  		pr_warn("hugetlb_cma: cma area should be at least %lu MiB\n",
6958  			(PAGE_SIZE << order) / SZ_1M);
6959  		hugetlb_cma_size = 0;
6960  		return;
6961  	}
6962  
6963  	if (!node_specific_cma_alloc) {
6964  		/*
6965  		 * If 3 GB area is requested on a machine with 4 numa nodes,
6966  		 * let's allocate 1 GB on first three nodes and ignore the last one.
6967  		 */
6968  		per_node = DIV_ROUND_UP(hugetlb_cma_size, nr_online_nodes);
6969  		pr_info("hugetlb_cma: reserve %lu MiB, up to %lu MiB per node\n",
6970  			hugetlb_cma_size / SZ_1M, per_node / SZ_1M);
6971  	}
6972  
6973  	reserved = 0;
6974  	for_each_node_state(nid, N_ONLINE) {
6975  		int res;
6976  		char name[CMA_MAX_NAME];
6977  
6978  		if (node_specific_cma_alloc) {
6979  			if (hugetlb_cma_size_in_node[nid] == 0)
6980  				continue;
6981  
6982  			size = hugetlb_cma_size_in_node[nid];
6983  		} else {
6984  			size = min(per_node, hugetlb_cma_size - reserved);
6985  		}
6986  
6987  		size = round_up(size, PAGE_SIZE << order);
6988  
6989  		snprintf(name, sizeof(name), "hugetlb%d", nid);
6990  		/*
6991  		 * Note that 'order per bit' is based on smallest size that
6992  		 * may be returned to CMA allocator in the case of
6993  		 * huge page demotion.
6994  		 */
6995  		res = cma_declare_contiguous_nid(0, size, 0,
6996  						PAGE_SIZE << HUGETLB_PAGE_ORDER,
6997  						 0, false, name,
6998  						 &hugetlb_cma[nid], nid);
6999  		if (res) {
7000  			pr_warn("hugetlb_cma: reservation failed: err %d, node %d",
7001  				res, nid);
7002  			continue;
7003  		}
7004  
7005  		reserved += size;
7006  		pr_info("hugetlb_cma: reserved %lu MiB on node %d\n",
7007  			size / SZ_1M, nid);
7008  
7009  		if (reserved >= hugetlb_cma_size)
7010  			break;
7011  	}
7012  
7013  	if (!reserved)
7014  		/*
7015  		 * hugetlb_cma_size is used to determine if allocations from
7016  		 * cma are possible.  Set to zero if no cma regions are set up.
7017  		 */
7018  		hugetlb_cma_size = 0;
7019  }
7020  
7021  void __init hugetlb_cma_check(void)
7022  {
7023  	if (!hugetlb_cma_size || cma_reserve_called)
7024  		return;
7025  
7026  	pr_warn("hugetlb_cma: the option isn't supported by current arch\n");
7027  }
7028  
7029  #endif /* CONFIG_CMA */
7030