xref: /openbmc/linux/mm/hugetlb.c (revision e8e0929d)
1 /*
2  * Generic hugetlb support.
3  * (C) William Irwin, April 2004
4  */
5 #include <linux/gfp.h>
6 #include <linux/list.h>
7 #include <linux/init.h>
8 #include <linux/module.h>
9 #include <linux/mm.h>
10 #include <linux/seq_file.h>
11 #include <linux/sysctl.h>
12 #include <linux/highmem.h>
13 #include <linux/mmu_notifier.h>
14 #include <linux/nodemask.h>
15 #include <linux/pagemap.h>
16 #include <linux/mempolicy.h>
17 #include <linux/cpuset.h>
18 #include <linux/mutex.h>
19 #include <linux/bootmem.h>
20 #include <linux/sysfs.h>
21 
22 #include <asm/page.h>
23 #include <asm/pgtable.h>
24 #include <asm/io.h>
25 
26 #include <linux/hugetlb.h>
27 #include "internal.h"
28 
29 const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
30 static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
31 unsigned long hugepages_treat_as_movable;
32 
33 static int max_hstate;
34 unsigned int default_hstate_idx;
35 struct hstate hstates[HUGE_MAX_HSTATE];
36 
37 __initdata LIST_HEAD(huge_boot_pages);
38 
39 /* for command line parsing */
40 static struct hstate * __initdata parsed_hstate;
41 static unsigned long __initdata default_hstate_max_huge_pages;
42 static unsigned long __initdata default_hstate_size;
43 
44 #define for_each_hstate(h) \
45 	for ((h) = hstates; (h) < &hstates[max_hstate]; (h)++)
46 
47 /*
48  * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
49  */
50 static DEFINE_SPINLOCK(hugetlb_lock);
51 
52 /*
53  * Region tracking -- allows tracking of reservations and instantiated pages
54  *                    across the pages in a mapping.
55  *
56  * The region data structures are protected by a combination of the mmap_sem
57  * and the hugetlb_instantiation_mutex.  To access or modify a region the caller
58  * must either hold the mmap_sem for write, or the mmap_sem for read and
59  * the hugetlb_instantiation mutex:
60  *
61  * 	down_write(&mm->mmap_sem);
62  * or
63  * 	down_read(&mm->mmap_sem);
64  * 	mutex_lock(&hugetlb_instantiation_mutex);
65  */
66 struct file_region {
67 	struct list_head link;
68 	long from;
69 	long to;
70 };
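/*
 * Illustrative sketch (not part of the original source): a mapping with
 * reservations covering huge-page offsets [0, 3) and [10, 12) would be
 * tracked as two file_region entries on the list:
 *
 *	{ .from = 0,  .to = 3  }
 *	{ .from = 10, .to = 12 }
 *
 * The helpers below add, merge and truncate such entries.
 */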
71 
72 static long region_add(struct list_head *head, long f, long t)
73 {
74 	struct file_region *rg, *nrg, *trg;
75 
76 	/* Locate the region we are either in or before. */
77 	list_for_each_entry(rg, head, link)
78 		if (f <= rg->to)
79 			break;
80 
81 	/* Round our left edge to the current segment if it encloses us. */
82 	if (f > rg->from)
83 		f = rg->from;
84 
85 	/* Check for and consume any regions we now overlap with. */
86 	nrg = rg;
87 	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
88 		if (&rg->link == head)
89 			break;
90 		if (rg->from > t)
91 			break;
92 
93 		/* If this area reaches higher, then extend our area to
94 		 * include it completely.  If this is not the first area
95 		 * which we intend to reuse, free it. */
96 		if (rg->to > t)
97 			t = rg->to;
98 		if (rg != nrg) {
99 			list_del(&rg->link);
100 			kfree(rg);
101 		}
102 	}
103 	nrg->from = f;
104 	nrg->to = t;
105 	return 0;
106 }
107 
108 static long region_chg(struct list_head *head, long f, long t)
109 {
110 	struct file_region *rg, *nrg;
111 	long chg = 0;
112 
113 	/* Locate the region we are before or in. */
114 	list_for_each_entry(rg, head, link)
115 		if (f <= rg->to)
116 			break;
117 
118 	/* If we are below the current region then a new region is required.
119 	 * Subtle: allocate a new region at this position, but make it zero
120 	 * size so that we are guaranteed to be able to record the reservation. */
121 	if (&rg->link == head || t < rg->from) {
122 		nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
123 		if (!nrg)
124 			return -ENOMEM;
125 		nrg->from = f;
126 		nrg->to   = f;
127 		INIT_LIST_HEAD(&nrg->link);
128 		list_add(&nrg->link, rg->link.prev);
129 
130 		return t - f;
131 	}
132 
133 	/* Round our left edge to the current segment if it encloses us. */
134 	if (f > rg->from)
135 		f = rg->from;
136 	chg = t - f;
137 
138 	/* Check for and consume any regions we now overlap with. */
139 	list_for_each_entry(rg, rg->link.prev, link) {
140 		if (&rg->link == head)
141 			break;
142 		if (rg->from > t)
143 			return chg;
144 
145 		/* We overlap with this area; if it extends further than
146 		 * us then we must extend ourselves.  Account for its
147 		 * existing reservation. */
148 		if (rg->to > t) {
149 			chg += rg->to - t;
150 			t = rg->to;
151 		}
152 		chg -= rg->to - rg->from;
153 	}
154 	return chg;
155 }
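/*
 * Illustrative two-phase usage, a sketch mirroring vma_needs_reservation()
 * and vma_commit_reservation() later in this file (resv and idx are
 * placeholder names):
 *
 *	chg = region_chg(&resv->regions, idx, idx + 1);
 *	if (chg < 0)
 *		return chg;
 *	... charge quota and allocate the huge page ...
 *	region_add(&resv->regions, idx, idx + 1);
 *
 * region_chg() prepares and accounts the change; region_add() commits it
 * once the page has actually been instantiated.
 */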
156 
157 static long region_truncate(struct list_head *head, long end)
158 {
159 	struct file_region *rg, *trg;
160 	long chg = 0;
161 
162 	/* Locate the region we are either in or before. */
163 	list_for_each_entry(rg, head, link)
164 		if (end <= rg->to)
165 			break;
166 	if (&rg->link == head)
167 		return 0;
168 
169 	/* If we are in the middle of a region then adjust it. */
170 	if (end > rg->from) {
171 		chg = rg->to - end;
172 		rg->to = end;
173 		rg = list_entry(rg->link.next, typeof(*rg), link);
174 	}
175 
176 	/* Drop any remaining regions. */
177 	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
178 		if (&rg->link == head)
179 			break;
180 		chg += rg->to - rg->from;
181 		list_del(&rg->link);
182 		kfree(rg);
183 	}
184 	return chg;
185 }
186 
187 static long region_count(struct list_head *head, long f, long t)
188 {
189 	struct file_region *rg;
190 	long chg = 0;
191 
192 	/* Locate each segment we overlap with, and count that overlap. */
193 	list_for_each_entry(rg, head, link) {
194 		long seg_from;
195 		long seg_to;
196 
197 		if (rg->to <= f)
198 			continue;
199 		if (rg->from >= t)
200 			break;
201 
202 		seg_from = max(rg->from, f);
203 		seg_to = min(rg->to, t);
204 
205 		chg += seg_to - seg_from;
206 	}
207 
208 	return chg;
209 }
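/*
 * Worked example (illustrative only): with regions [0, 2) and [5, 8) on the
 * list, region_count(head, 1, 6) counts the overlaps [1, 2) and [5, 6) and
 * returns 1 + 1 = 2.
 */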
210 
211 /*
212  * Convert the address within this vma to the page offset within
213  * the mapping, in pagecache page units; huge pages here.
214  */
215 static pgoff_t vma_hugecache_offset(struct hstate *h,
216 			struct vm_area_struct *vma, unsigned long address)
217 {
218 	return ((address - vma->vm_start) >> huge_page_shift(h)) +
219 			(vma->vm_pgoff >> huge_page_order(h));
220 }
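/*
 * Example (illustrative, assuming 2MB huge pages over 4kB base pages, i.e.
 * huge_page_shift() == 21): for a VMA with vm_start == 0x2000000 and
 * vm_pgoff == 0, address 0x2400000 yields ((0x400000 >> 21) + 0) == 2,
 * i.e. the third huge page of the mapping.
 */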
221 
222 /*
223  * Return the size of the pages allocated when backing a VMA. In the majority
224  * of cases this will be the same size as used by the page table entries.
225  */
226 unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
227 {
228 	struct hstate *hstate;
229 
230 	if (!is_vm_hugetlb_page(vma))
231 		return PAGE_SIZE;
232 
233 	hstate = hstate_vma(vma);
234 
235 	return 1UL << (hstate->order + PAGE_SHIFT);
236 }
237 EXPORT_SYMBOL_GPL(vma_kernel_pagesize);
238 
239 /*
240  * Return the page size being used by the MMU to back a VMA. In the majority
241  * of cases, the page size used by the kernel matches the MMU size. On
242  * architectures where it differs, an architecture-specific version of this
243  * function is required.
244  */
245 #ifndef vma_mmu_pagesize
246 unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
247 {
248 	return vma_kernel_pagesize(vma);
249 }
250 #endif
251 
252 /*
253  * Flags for MAP_PRIVATE reservations.  These are stored in the bottom
254  * bits of the reservation map pointer, which are always clear due to
255  * alignment.
256  */
257 #define HPAGE_RESV_OWNER    (1UL << 0)
258 #define HPAGE_RESV_UNMAPPED (1UL << 1)
259 #define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)
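/*
 * Illustrative note: for a MAP_PRIVATE mapping, vm_private_data packs the
 * resv_map pointer together with the flags above in a single word, e.g.
 *
 *	vma->vm_private_data == (void *)((unsigned long)map | HPAGE_RESV_OWNER)
 *
 * set_vma_resv_map() and set_vma_resv_flags() below maintain this encoding.
 */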
260 
261 /*
262  * These helpers are used to track how many pages are reserved for
263  * faults in a MAP_PRIVATE mapping. Only the process that called mmap()
264  * is guaranteed to have its future faults succeed.
265  *
266  * With the exception of reset_vma_resv_huge_pages() which is called at fork(),
267  * the reserve counters are updated with the hugetlb_lock held. It is safe
268  * to reset the VMA at fork() time as it is not in use yet and there is no
269  * chance of the global counters getting corrupted as a result of the reset.
270  *
271  * The private mapping reservation is represented in a subtly different
272  * manner to a shared mapping.  A shared mapping has a region map associated
273  * with the underlying file; this region map represents the backing file
274  * pages which have ever had a reservation assigned, and it persists even
275  * after the page is instantiated.  A private mapping has a region map
276  * associated with the original mmap which is attached to all VMAs that
277  * reference it; this region map represents those offsets which have consumed
278  * a reservation, i.e. where pages have been instantiated.
279  */
280 static unsigned long get_vma_private_data(struct vm_area_struct *vma)
281 {
282 	return (unsigned long)vma->vm_private_data;
283 }
284 
285 static void set_vma_private_data(struct vm_area_struct *vma,
286 							unsigned long value)
287 {
288 	vma->vm_private_data = (void *)value;
289 }
290 
291 struct resv_map {
292 	struct kref refs;
293 	struct list_head regions;
294 };
295 
296 static struct resv_map *resv_map_alloc(void)
297 {
298 	struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
299 	if (!resv_map)
300 		return NULL;
301 
302 	kref_init(&resv_map->refs);
303 	INIT_LIST_HEAD(&resv_map->regions);
304 
305 	return resv_map;
306 }
307 
308 static void resv_map_release(struct kref *ref)
309 {
310 	struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
311 
312 	/* Clear out any active regions before we release the map. */
313 	region_truncate(&resv_map->regions, 0);
314 	kfree(resv_map);
315 }
316 
317 static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
318 {
319 	VM_BUG_ON(!is_vm_hugetlb_page(vma));
320 	if (!(vma->vm_flags & VM_MAYSHARE))
321 		return (struct resv_map *)(get_vma_private_data(vma) &
322 							~HPAGE_RESV_MASK);
323 	return NULL;
324 }
325 
326 static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
327 {
328 	VM_BUG_ON(!is_vm_hugetlb_page(vma));
329 	VM_BUG_ON(vma->vm_flags & VM_MAYSHARE);
330 
331 	set_vma_private_data(vma, (get_vma_private_data(vma) &
332 				HPAGE_RESV_MASK) | (unsigned long)map);
333 }
334 
335 static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
336 {
337 	VM_BUG_ON(!is_vm_hugetlb_page(vma));
338 	VM_BUG_ON(vma->vm_flags & VM_MAYSHARE);
339 
340 	set_vma_private_data(vma, get_vma_private_data(vma) | flags);
341 }
342 
343 static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
344 {
345 	VM_BUG_ON(!is_vm_hugetlb_page(vma));
346 
347 	return (get_vma_private_data(vma) & flag) != 0;
348 }
349 
350 /* Decrement the reserved pages in the hugepage pool by one */
351 static void decrement_hugepage_resv_vma(struct hstate *h,
352 			struct vm_area_struct *vma)
353 {
354 	if (vma->vm_flags & VM_NORESERVE)
355 		return;
356 
357 	if (vma->vm_flags & VM_MAYSHARE) {
358 		/* Shared mappings always use reserves */
359 		h->resv_huge_pages--;
360 	} else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
361 		/*
362 		 * Only the process that called mmap() has reserves for
363 		 * private mappings.
364 		 */
365 		h->resv_huge_pages--;
366 	}
367 }
368 
369 /* Reset counters to 0 and clear all HPAGE_RESV_* flags */
370 void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
371 {
372 	VM_BUG_ON(!is_vm_hugetlb_page(vma));
373 	if (!(vma->vm_flags & VM_MAYSHARE))
374 		vma->vm_private_data = (void *)0;
375 }
376 
377 /* Returns true if the VMA has associated reserve pages */
378 static int vma_has_reserves(struct vm_area_struct *vma)
379 {
380 	if (vma->vm_flags & VM_MAYSHARE)
381 		return 1;
382 	if (is_vma_resv_set(vma, HPAGE_RESV_OWNER))
383 		return 1;
384 	return 0;
385 }
386 
387 static void clear_gigantic_page(struct page *page,
388 			unsigned long addr, unsigned long sz)
389 {
390 	int i;
391 	struct page *p = page;
392 
393 	might_sleep();
394 	for (i = 0; i < sz/PAGE_SIZE; i++, p = mem_map_next(p, page, i)) {
395 		cond_resched();
396 		clear_user_highpage(p, addr + i * PAGE_SIZE);
397 	}
398 }
399 static void clear_huge_page(struct page *page,
400 			unsigned long addr, unsigned long sz)
401 {
402 	int i;
403 
404 	if (unlikely(sz / PAGE_SIZE > MAX_ORDER_NR_PAGES)) {
405 		clear_gigantic_page(page, addr, sz);
406 		return;
407 	}
408 
409 	might_sleep();
410 	for (i = 0; i < sz/PAGE_SIZE; i++) {
411 		cond_resched();
412 		clear_user_highpage(page + i, addr + i * PAGE_SIZE);
413 	}
414 }
415 
416 static void copy_gigantic_page(struct page *dst, struct page *src,
417 			   unsigned long addr, struct vm_area_struct *vma)
418 {
419 	int i;
420 	struct hstate *h = hstate_vma(vma);
421 	struct page *dst_base = dst;
422 	struct page *src_base = src;
423 	might_sleep();
424 	for (i = 0; i < pages_per_huge_page(h); ) {
425 		cond_resched();
426 		copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
427 
428 		i++;
429 		dst = mem_map_next(dst, dst_base, i);
430 		src = mem_map_next(src, src_base, i);
431 	}
432 }
433 static void copy_huge_page(struct page *dst, struct page *src,
434 			   unsigned long addr, struct vm_area_struct *vma)
435 {
436 	int i;
437 	struct hstate *h = hstate_vma(vma);
438 
439 	if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) {
440 		copy_gigantic_page(dst, src, addr, vma);
441 		return;
442 	}
443 
444 	might_sleep();
445 	for (i = 0; i < pages_per_huge_page(h); i++) {
446 		cond_resched();
447 		copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
448 	}
449 }
450 
451 static void enqueue_huge_page(struct hstate *h, struct page *page)
452 {
453 	int nid = page_to_nid(page);
454 	list_add(&page->lru, &h->hugepage_freelists[nid]);
455 	h->free_huge_pages++;
456 	h->free_huge_pages_node[nid]++;
457 }
458 
459 static struct page *dequeue_huge_page_vma(struct hstate *h,
460 				struct vm_area_struct *vma,
461 				unsigned long address, int avoid_reserve)
462 {
463 	int nid;
464 	struct page *page = NULL;
465 	struct mempolicy *mpol;
466 	nodemask_t *nodemask;
467 	struct zonelist *zonelist = huge_zonelist(vma, address,
468 					htlb_alloc_mask, &mpol, &nodemask);
469 	struct zone *zone;
470 	struct zoneref *z;
471 
472 	/*
473 	 * A child process with MAP_PRIVATE mappings created by its parent
474 	 * has no page reserves. This check ensures that reservations are
475 	 * not "stolen". The child may still get SIGKILLed.
476 	 */
477 	if (!vma_has_reserves(vma) &&
478 			h->free_huge_pages - h->resv_huge_pages == 0)
479 		return NULL;
480 
481 	/* If reserves cannot be used, ensure enough pages are in the pool */
482 	if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
483 		return NULL;
484 
485 	for_each_zone_zonelist_nodemask(zone, z, zonelist,
486 						MAX_NR_ZONES - 1, nodemask) {
487 		nid = zone_to_nid(zone);
488 		if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) &&
489 		    !list_empty(&h->hugepage_freelists[nid])) {
490 			page = list_entry(h->hugepage_freelists[nid].next,
491 					  struct page, lru);
492 			list_del(&page->lru);
493 			h->free_huge_pages--;
494 			h->free_huge_pages_node[nid]--;
495 
496 			if (!avoid_reserve)
497 				decrement_hugepage_resv_vma(h, vma);
498 
499 			break;
500 		}
501 	}
502 	mpol_cond_put(mpol);
503 	return page;
504 }
505 
506 static void update_and_free_page(struct hstate *h, struct page *page)
507 {
508 	int i;
509 
510 	VM_BUG_ON(h->order >= MAX_ORDER);
511 
512 	h->nr_huge_pages--;
513 	h->nr_huge_pages_node[page_to_nid(page)]--;
514 	for (i = 0; i < pages_per_huge_page(h); i++) {
515 		page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
516 				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
517 				1 << PG_private | 1<< PG_writeback);
518 	}
519 	set_compound_page_dtor(page, NULL);
520 	set_page_refcounted(page);
521 	arch_release_hugepage(page);
522 	__free_pages(page, huge_page_order(h));
523 }
524 
525 struct hstate *size_to_hstate(unsigned long size)
526 {
527 	struct hstate *h;
528 
529 	for_each_hstate(h) {
530 		if (huge_page_size(h) == size)
531 			return h;
532 	}
533 	return NULL;
534 }
535 
536 static void free_huge_page(struct page *page)
537 {
538 	/*
539 	 * Can't pass hstate in here because it is called from the
540 	 * compound page destructor.
541 	 */
542 	struct hstate *h = page_hstate(page);
543 	int nid = page_to_nid(page);
544 	struct address_space *mapping;
545 
546 	mapping = (struct address_space *) page_private(page);
547 	set_page_private(page, 0);
548 	BUG_ON(page_count(page));
549 	INIT_LIST_HEAD(&page->lru);
550 
551 	spin_lock(&hugetlb_lock);
552 	if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) {
553 		update_and_free_page(h, page);
554 		h->surplus_huge_pages--;
555 		h->surplus_huge_pages_node[nid]--;
556 	} else {
557 		enqueue_huge_page(h, page);
558 	}
559 	spin_unlock(&hugetlb_lock);
560 	if (mapping)
561 		hugetlb_put_quota(mapping, 1);
562 }
563 
564 static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
565 {
566 	set_compound_page_dtor(page, free_huge_page);
567 	spin_lock(&hugetlb_lock);
568 	h->nr_huge_pages++;
569 	h->nr_huge_pages_node[nid]++;
570 	spin_unlock(&hugetlb_lock);
571 	put_page(page); /* free it into the hugepage allocator */
572 }
573 
574 static void prep_compound_gigantic_page(struct page *page, unsigned long order)
575 {
576 	int i;
577 	int nr_pages = 1 << order;
578 	struct page *p = page + 1;
579 
580 	/* we rely on prep_new_huge_page to set the destructor */
581 	set_compound_order(page, order);
582 	__SetPageHead(page);
583 	for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
584 		__SetPageTail(p);
585 		p->first_page = page;
586 	}
587 }
588 
589 int PageHuge(struct page *page)
590 {
591 	compound_page_dtor *dtor;
592 
593 	if (!PageCompound(page))
594 		return 0;
595 
596 	page = compound_head(page);
597 	dtor = get_compound_page_dtor(page);
598 
599 	return dtor == free_huge_page;
600 }
601 
602 static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
603 {
604 	struct page *page;
605 
606 	if (h->order >= MAX_ORDER)
607 		return NULL;
608 
609 	page = alloc_pages_exact_node(nid,
610 		htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
611 						__GFP_REPEAT|__GFP_NOWARN,
612 		huge_page_order(h));
613 	if (page) {
614 		if (arch_prepare_hugepage(page)) {
615 			__free_pages(page, huge_page_order(h));
616 			return NULL;
617 		}
618 		prep_new_huge_page(h, page, nid);
619 	}
620 
621 	return page;
622 }
623 
624 /*
625  * Use a helper variable to find the next node and then
626  * copy it back to next_nid_to_alloc afterwards:
627  * otherwise there's a window in which a racer might
628  * pass invalid nid MAX_NUMNODES to alloc_pages_exact_node.
629  * But we don't need to use a spin_lock here: it really
630  * doesn't matter if occasionally a racer chooses the
631  * same nid as we do.  Move nid forward in the mask even
632  * if we just successfully allocated a hugepage so that
633  * the next caller gets hugepages on the next node.
634  */
635 static int hstate_next_node_to_alloc(struct hstate *h)
636 {
637 	int next_nid;
638 	next_nid = next_node(h->next_nid_to_alloc, node_online_map);
639 	if (next_nid == MAX_NUMNODES)
640 		next_nid = first_node(node_online_map);
641 	h->next_nid_to_alloc = next_nid;
642 	return next_nid;
643 }
644 
645 static int alloc_fresh_huge_page(struct hstate *h)
646 {
647 	struct page *page;
648 	int start_nid;
649 	int next_nid;
650 	int ret = 0;
651 
652 	start_nid = h->next_nid_to_alloc;
653 	next_nid = start_nid;
654 
655 	do {
656 		page = alloc_fresh_huge_page_node(h, next_nid);
657 		if (page)
658 			ret = 1;
659 		next_nid = hstate_next_node_to_alloc(h);
660 	} while (!page && next_nid != start_nid);
661 
662 	if (ret)
663 		count_vm_event(HTLB_BUDDY_PGALLOC);
664 	else
665 		count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
666 
667 	return ret;
668 }
669 
670 /*
671  * helper for free_pool_huge_page() - find next node
672  * from which to free a huge page
673  */
674 static int hstate_next_node_to_free(struct hstate *h)
675 {
676 	int next_nid;
677 	next_nid = next_node(h->next_nid_to_free, node_online_map);
678 	if (next_nid == MAX_NUMNODES)
679 		next_nid = first_node(node_online_map);
680 	h->next_nid_to_free = next_nid;
681 	return next_nid;
682 }
683 
684 /*
685  * Free a huge page from the pool, starting at the next node to free.
686  * Attempt to keep persistent huge pages more or less
687  * balanced over allowed nodes.
688  * Called with hugetlb_lock locked.
689  */
690 static int free_pool_huge_page(struct hstate *h, bool acct_surplus)
691 {
692 	int start_nid;
693 	int next_nid;
694 	int ret = 0;
695 
696 	start_nid = h->next_nid_to_free;
697 	next_nid = start_nid;
698 
699 	do {
700 		/*
701 		 * If we're returning unused surplus pages, only examine
702 		 * nodes with surplus pages.
703 		 */
704 		if ((!acct_surplus || h->surplus_huge_pages_node[next_nid]) &&
705 		    !list_empty(&h->hugepage_freelists[next_nid])) {
706 			struct page *page =
707 				list_entry(h->hugepage_freelists[next_nid].next,
708 					  struct page, lru);
709 			list_del(&page->lru);
710 			h->free_huge_pages--;
711 			h->free_huge_pages_node[next_nid]--;
712 			if (acct_surplus) {
713 				h->surplus_huge_pages--;
714 				h->surplus_huge_pages_node[next_nid]--;
715 			}
716 			update_and_free_page(h, page);
717 			ret = 1;
718 		}
719 		next_nid = hstate_next_node_to_free(h);
720 	} while (!ret && next_nid != start_nid);
721 
722 	return ret;
723 }
724 
725 static struct page *alloc_buddy_huge_page(struct hstate *h,
726 			struct vm_area_struct *vma, unsigned long address)
727 {
728 	struct page *page;
729 	unsigned int nid;
730 
731 	if (h->order >= MAX_ORDER)
732 		return NULL;
733 
734 	/*
735 	 * Assume we will successfully allocate the surplus page to
736 	 * prevent racing processes from causing the surplus to exceed
737 	 * overcommit
738 	 *
739 	 * This however introduces a different race, where a process B
740 	 * tries to grow the static hugepage pool while alloc_pages() is
741 	 * called by process A. B will only examine the per-node
742 	 * counters in determining if surplus huge pages can be
743 	 * converted to normal huge pages in adjust_pool_surplus(). A
744 	 * won't be able to increment the per-node counter, until the
745 	 * lock is dropped by B, but B doesn't drop hugetlb_lock until
746 	 * no more huge pages can be converted from surplus to normal
747 	 * state (and doesn't try to convert again). Thus, we have a
748 	 * case where a surplus huge page exists, the pool is grown, and
749 	 * the surplus huge page still exists after, even though it
750 	 * should just have been converted to a normal huge page. This
751 	 * does not leak memory, though, as the hugepage will be freed
752 	 * once it is out of use. It also does not allow the counters to
753 	 * go out of whack in adjust_pool_surplus() as we don't modify
754 	 * the node values until we've gotten the hugepage and only the
755 	 * per-node value is checked there.
756 	 */
757 	spin_lock(&hugetlb_lock);
758 	if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
759 		spin_unlock(&hugetlb_lock);
760 		return NULL;
761 	} else {
762 		h->nr_huge_pages++;
763 		h->surplus_huge_pages++;
764 	}
765 	spin_unlock(&hugetlb_lock);
766 
767 	page = alloc_pages(htlb_alloc_mask|__GFP_COMP|
768 					__GFP_REPEAT|__GFP_NOWARN,
769 					huge_page_order(h));
770 
771 	if (page && arch_prepare_hugepage(page)) {
772 		__free_pages(page, huge_page_order(h));
773 		return NULL;
774 	}
775 
776 	spin_lock(&hugetlb_lock);
777 	if (page) {
778 		/*
779 		 * This page is now managed by the hugetlb allocator and has
780 		 * no users -- drop the buddy allocator's reference.
781 		 */
782 		put_page_testzero(page);
783 		VM_BUG_ON(page_count(page));
784 		nid = page_to_nid(page);
785 		set_compound_page_dtor(page, free_huge_page);
786 		/*
787 		 * We incremented the global counters already
788 		 */
789 		h->nr_huge_pages_node[nid]++;
790 		h->surplus_huge_pages_node[nid]++;
791 		__count_vm_event(HTLB_BUDDY_PGALLOC);
792 	} else {
793 		h->nr_huge_pages--;
794 		h->surplus_huge_pages--;
795 		__count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
796 	}
797 	spin_unlock(&hugetlb_lock);
798 
799 	return page;
800 }
801 
802 /*
803  * Increase the hugetlb pool such that it can accommodate a reservation
804  * of size 'delta'.
805  */
806 static int gather_surplus_pages(struct hstate *h, int delta)
807 {
808 	struct list_head surplus_list;
809 	struct page *page, *tmp;
810 	int ret, i;
811 	int needed, allocated;
812 
813 	needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
814 	if (needed <= 0) {
815 		h->resv_huge_pages += delta;
816 		return 0;
817 	}
818 
819 	allocated = 0;
820 	INIT_LIST_HEAD(&surplus_list);
821 
822 	ret = -ENOMEM;
823 retry:
824 	spin_unlock(&hugetlb_lock);
825 	for (i = 0; i < needed; i++) {
826 		page = alloc_buddy_huge_page(h, NULL, 0);
827 		if (!page) {
828 			/*
829 			 * We were not able to allocate enough pages to
830 			 * satisfy the entire reservation so we free what
831 			 * we've allocated so far.
832 			 */
833 			spin_lock(&hugetlb_lock);
834 			needed = 0;
835 			goto free;
836 		}
837 
838 		list_add(&page->lru, &surplus_list);
839 	}
840 	allocated += needed;
841 
842 	/*
843 	 * After retaking hugetlb_lock, we need to recalculate 'needed'
844 	 * because either resv_huge_pages or free_huge_pages may have changed.
845 	 */
846 	spin_lock(&hugetlb_lock);
847 	needed = (h->resv_huge_pages + delta) -
848 			(h->free_huge_pages + allocated);
849 	if (needed > 0)
850 		goto retry;
851 
852 	/*
853 	 * The surplus_list now contains _at_least_ the number of extra pages
854  * needed to accommodate the reservation.  Add the appropriate number
855 	 * of pages to the hugetlb pool and free the extras back to the buddy
856 	 * allocator.  Commit the entire reservation here to prevent another
857 	 * process from stealing the pages as they are added to the pool but
858 	 * before they are reserved.
859 	 */
860 	needed += allocated;
861 	h->resv_huge_pages += delta;
862 	ret = 0;
863 free:
864 	/* Free the needed pages to the hugetlb pool */
865 	list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
866 		if ((--needed) < 0)
867 			break;
868 		list_del(&page->lru);
869 		enqueue_huge_page(h, page);
870 	}
871 
872 	/* Free unnecessary surplus pages to the buddy allocator */
873 	if (!list_empty(&surplus_list)) {
874 		spin_unlock(&hugetlb_lock);
875 		list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
876 			list_del(&page->lru);
877 			/*
878 			 * The page has a reference count of zero already, so
879 			 * call free_huge_page directly instead of using
880 			 * put_page.  This must be done with hugetlb_lock
881 			 * unlocked which is safe because free_huge_page takes
882 			 * hugetlb_lock before deciding how to free the page.
883 			 */
884 			free_huge_page(page);
885 		}
886 		spin_lock(&hugetlb_lock);
887 	}
888 
889 	return ret;
890 }
891 
892 /*
893  * When releasing a hugetlb pool reservation, any surplus pages that were
894  * allocated to satisfy the reservation must be explicitly freed if they were
895  * never used.
896  * Called with hugetlb_lock held.
897  */
898 static void return_unused_surplus_pages(struct hstate *h,
899 					unsigned long unused_resv_pages)
900 {
901 	unsigned long nr_pages;
902 
903 	/* Uncommit the reservation */
904 	h->resv_huge_pages -= unused_resv_pages;
905 
906 	/* Cannot return gigantic pages currently */
907 	if (h->order >= MAX_ORDER)
908 		return;
909 
910 	nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
911 
912 	/*
913 	 * We want to release as many surplus pages as possible, spread
914 	 * evenly across all nodes. Iterate across all nodes until we
915 	 * can no longer free unreserved surplus pages. This occurs when
916 	 * the nodes with surplus pages have no free pages.
917 	 * free_pool_huge_page() will balance the frees across the
918 	 * on-line nodes for us and will handle the hstate accounting.
919 	 */
920 	while (nr_pages--) {
921 		if (!free_pool_huge_page(h, 1))
922 			break;
923 	}
924 }
925 
926 /*
927  * Determine if the huge page at addr within the vma has an associated
928  * reservation.  Where it does not we will need to logically increase
929  * reservation and actually increase quota before an allocation can occur.
930  * Where any new reservation would be required the reservation change is
931  * prepared, but not committed.  Once the page has been quota'd, allocated
932  * and instantiated, the change should be committed via vma_commit_reservation().
933  * No action is required on failure.
934  */
935 static long vma_needs_reservation(struct hstate *h,
936 			struct vm_area_struct *vma, unsigned long addr)
937 {
938 	struct address_space *mapping = vma->vm_file->f_mapping;
939 	struct inode *inode = mapping->host;
940 
941 	if (vma->vm_flags & VM_MAYSHARE) {
942 		pgoff_t idx = vma_hugecache_offset(h, vma, addr);
943 		return region_chg(&inode->i_mapping->private_list,
944 							idx, idx + 1);
945 
946 	} else if (!is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
947 		return 1;
948 
949 	} else  {
950 		long err;
951 		pgoff_t idx = vma_hugecache_offset(h, vma, addr);
952 		struct resv_map *reservations = vma_resv_map(vma);
953 
954 		err = region_chg(&reservations->regions, idx, idx + 1);
955 		if (err < 0)
956 			return err;
957 		return 0;
958 	}
959 }
960 static void vma_commit_reservation(struct hstate *h,
961 			struct vm_area_struct *vma, unsigned long addr)
962 {
963 	struct address_space *mapping = vma->vm_file->f_mapping;
964 	struct inode *inode = mapping->host;
965 
966 	if (vma->vm_flags & VM_MAYSHARE) {
967 		pgoff_t idx = vma_hugecache_offset(h, vma, addr);
968 		region_add(&inode->i_mapping->private_list, idx, idx + 1);
969 
970 	} else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
971 		pgoff_t idx = vma_hugecache_offset(h, vma, addr);
972 		struct resv_map *reservations = vma_resv_map(vma);
973 
974 		/* Mark this page used in the map. */
975 		region_add(&reservations->regions, idx, idx + 1);
976 	}
977 }
978 
979 static struct page *alloc_huge_page(struct vm_area_struct *vma,
980 				    unsigned long addr, int avoid_reserve)
981 {
982 	struct hstate *h = hstate_vma(vma);
983 	struct page *page;
984 	struct address_space *mapping = vma->vm_file->f_mapping;
985 	struct inode *inode = mapping->host;
986 	long chg;
987 
988 	/*
989 	 * Processes that did not create the mapping will have no reserves and
990 	 * will not have accounted against quota. Check that the quota can be
991 	 * made before satisfying the allocation.
992 	 * MAP_NORESERVE mappings may also need pages and quota allocated
993 	 * if no reserve mapping overlaps.
994 	 */
995 	chg = vma_needs_reservation(h, vma, addr);
996 	if (chg < 0)
997 		return ERR_PTR(chg);
998 	if (chg)
999 		if (hugetlb_get_quota(inode->i_mapping, chg))
1000 			return ERR_PTR(-ENOSPC);
1001 
1002 	spin_lock(&hugetlb_lock);
1003 	page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve);
1004 	spin_unlock(&hugetlb_lock);
1005 
1006 	if (!page) {
1007 		page = alloc_buddy_huge_page(h, vma, addr);
1008 		if (!page) {
1009 			hugetlb_put_quota(inode->i_mapping, chg);
1010 			return ERR_PTR(-VM_FAULT_OOM);
1011 		}
1012 	}
1013 
1014 	set_page_refcounted(page);
1015 	set_page_private(page, (unsigned long) mapping);
1016 
1017 	vma_commit_reservation(h, vma, addr);
1018 
1019 	return page;
1020 }
1021 
1022 int __weak alloc_bootmem_huge_page(struct hstate *h)
1023 {
1024 	struct huge_bootmem_page *m;
1025 	int nr_nodes = nodes_weight(node_online_map);
1026 
1027 	while (nr_nodes) {
1028 		void *addr;
1029 
1030 		addr = __alloc_bootmem_node_nopanic(
1031 				NODE_DATA(h->next_nid_to_alloc),
1032 				huge_page_size(h), huge_page_size(h), 0);
1033 
1034 		hstate_next_node_to_alloc(h);
1035 		if (addr) {
1036 			/*
1037 			 * Use the beginning of the huge page to store the
1038 			 * huge_bootmem_page struct (until gather_bootmem
1039 			 * puts them into the mem_map).
1040 			 */
1041 			m = addr;
1042 			goto found;
1043 		}
1044 		nr_nodes--;
1045 	}
1046 	return 0;
1047 
1048 found:
1049 	BUG_ON((unsigned long)virt_to_phys(m) & (huge_page_size(h) - 1));
1050 	/* Put them into a private list first because mem_map is not up yet */
1051 	list_add(&m->list, &huge_boot_pages);
1052 	m->hstate = h;
1053 	return 1;
1054 }
1055 
1056 static void prep_compound_huge_page(struct page *page, int order)
1057 {
1058 	if (unlikely(order > (MAX_ORDER - 1)))
1059 		prep_compound_gigantic_page(page, order);
1060 	else
1061 		prep_compound_page(page, order);
1062 }
1063 
1064 /* Put bootmem huge pages into the standard lists after mem_map is up */
1065 static void __init gather_bootmem_prealloc(void)
1066 {
1067 	struct huge_bootmem_page *m;
1068 
1069 	list_for_each_entry(m, &huge_boot_pages, list) {
1070 		struct page *page = virt_to_page(m);
1071 		struct hstate *h = m->hstate;
1072 		__ClearPageReserved(page);
1073 		WARN_ON(page_count(page) != 1);
1074 		prep_compound_huge_page(page, h->order);
1075 		prep_new_huge_page(h, page, page_to_nid(page));
1076 	}
1077 }
1078 
1079 static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
1080 {
1081 	unsigned long i;
1082 
1083 	for (i = 0; i < h->max_huge_pages; ++i) {
1084 		if (h->order >= MAX_ORDER) {
1085 			if (!alloc_bootmem_huge_page(h))
1086 				break;
1087 		} else if (!alloc_fresh_huge_page(h))
1088 			break;
1089 	}
1090 	h->max_huge_pages = i;
1091 }
1092 
1093 static void __init hugetlb_init_hstates(void)
1094 {
1095 	struct hstate *h;
1096 
1097 	for_each_hstate(h) {
1098 		/* oversize hugepages were init'ed in early boot */
1099 		if (h->order < MAX_ORDER)
1100 			hugetlb_hstate_alloc_pages(h);
1101 	}
1102 }
1103 
1104 static char * __init memfmt(char *buf, unsigned long n)
1105 {
1106 	if (n >= (1UL << 30))
1107 		sprintf(buf, "%lu GB", n >> 30);
1108 	else if (n >= (1UL << 20))
1109 		sprintf(buf, "%lu MB", n >> 20);
1110 	else
1111 		sprintf(buf, "%lu KB", n >> 10);
1112 	return buf;
1113 }
1114 
1115 static void __init report_hugepages(void)
1116 {
1117 	struct hstate *h;
1118 
1119 	for_each_hstate(h) {
1120 		char buf[32];
1121 		printk(KERN_INFO "HugeTLB registered %s page size, "
1122 				 "pre-allocated %ld pages\n",
1123 			memfmt(buf, huge_page_size(h)),
1124 			h->free_huge_pages);
1125 	}
1126 }
1127 
1128 #ifdef CONFIG_HIGHMEM
1129 static void try_to_free_low(struct hstate *h, unsigned long count)
1130 {
1131 	int i;
1132 
1133 	if (h->order >= MAX_ORDER)
1134 		return;
1135 
1136 	for (i = 0; i < MAX_NUMNODES; ++i) {
1137 		struct page *page, *next;
1138 		struct list_head *freel = &h->hugepage_freelists[i];
1139 		list_for_each_entry_safe(page, next, freel, lru) {
1140 			if (count >= h->nr_huge_pages)
1141 				return;
1142 			if (PageHighMem(page))
1143 				continue;
1144 			list_del(&page->lru);
1145 			update_and_free_page(h, page);
1146 			h->free_huge_pages--;
1147 			h->free_huge_pages_node[page_to_nid(page)]--;
1148 		}
1149 	}
1150 }
1151 #else
1152 static inline void try_to_free_low(struct hstate *h, unsigned long count)
1153 {
1154 }
1155 #endif
1156 
1157 /*
1158  * Increment or decrement surplus_huge_pages.  Keep node-specific counters
1159  * balanced by operating on them in a round-robin fashion.
1160  * Returns 1 if an adjustment was made.
1161  */
1162 static int adjust_pool_surplus(struct hstate *h, int delta)
1163 {
1164 	int start_nid, next_nid;
1165 	int ret = 0;
1166 
1167 	VM_BUG_ON(delta != -1 && delta != 1);
1168 
1169 	if (delta < 0)
1170 		start_nid = h->next_nid_to_alloc;
1171 	else
1172 		start_nid = h->next_nid_to_free;
1173 	next_nid = start_nid;
1174 
1175 	do {
1176 		int nid = next_nid;
1177 		if (delta < 0)  {
1178 			next_nid = hstate_next_node_to_alloc(h);
1179 			/*
1180 			 * To shrink on this node, there must be a surplus page
1181 			 */
1182 			if (!h->surplus_huge_pages_node[nid])
1183 				continue;
1184 		}
1185 		if (delta > 0) {
1186 			next_nid = hstate_next_node_to_free(h);
1187 			/*
1188 			 * Surplus cannot exceed the total number of pages
1189 			 */
1190 			if (h->surplus_huge_pages_node[nid] >=
1191 						h->nr_huge_pages_node[nid])
1192 				continue;
1193 		}
1194 
1195 		h->surplus_huge_pages += delta;
1196 		h->surplus_huge_pages_node[nid] += delta;
1197 		ret = 1;
1198 		break;
1199 	} while (next_nid != start_nid);
1200 
1201 	return ret;
1202 }
1203 
1204 #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
1205 static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
1206 {
1207 	unsigned long min_count, ret;
1208 
1209 	if (h->order >= MAX_ORDER)
1210 		return h->max_huge_pages;
1211 
1212 	/*
1213 	 * Increase the pool size
1214 	 * First take pages out of surplus state.  Then make up the
1215 	 * remaining difference by allocating fresh huge pages.
1216 	 *
1217 	 * We might race with alloc_buddy_huge_page() here and be unable
1218 	 * to convert a surplus huge page to a normal huge page. That is
1219 	 * not critical, though, it just means the overall size of the
1220 	 * pool might be one hugepage larger than it needs to be, but
1221 	 * within all the constraints specified by the sysctls.
1222 	 */
1223 	spin_lock(&hugetlb_lock);
1224 	while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
1225 		if (!adjust_pool_surplus(h, -1))
1226 			break;
1227 	}
1228 
1229 	while (count > persistent_huge_pages(h)) {
1230 		/*
1231 		 * If this allocation races such that we no longer need the
1232 		 * page, free_huge_page will handle it by freeing the page
1233 		 * and reducing the surplus.
1234 		 */
1235 		spin_unlock(&hugetlb_lock);
1236 		ret = alloc_fresh_huge_page(h);
1237 		spin_lock(&hugetlb_lock);
1238 		if (!ret)
1239 			goto out;
1240 
1241 	}
1242 
1243 	/*
1244 	 * Decrease the pool size
1245 	 * First return free pages to the buddy allocator (being careful
1246 	 * to keep enough around to satisfy reservations).  Then place
1247 	 * pages into surplus state as needed so the pool will shrink
1248 	 * to the desired size as pages become free.
1249 	 *
1250 	 * By placing pages into the surplus state independent of the
1251 	 * overcommit value, we are allowing the surplus pool size to
1252 	 * exceed overcommit. There are few sane options here. Since
1253 	 * alloc_buddy_huge_page() is checking the global counter,
1254 	 * though, we'll note that we're not allowed to exceed surplus
1255 	 * and won't grow the pool anywhere else. Not until one of the
1256 	 * sysctls are changed, or the surplus pages go out of use.
1257 	 */
1258 	min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
1259 	min_count = max(count, min_count);
1260 	try_to_free_low(h, min_count);
1261 	while (min_count < persistent_huge_pages(h)) {
1262 		if (!free_pool_huge_page(h, 0))
1263 			break;
1264 	}
1265 	while (count < persistent_huge_pages(h)) {
1266 		if (!adjust_pool_surplus(h, 1))
1267 			break;
1268 	}
1269 out:
1270 	ret = persistent_huge_pages(h);
1271 	spin_unlock(&hugetlb_lock);
1272 	return ret;
1273 }
1274 
1275 #define HSTATE_ATTR_RO(_name) \
1276 	static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
1277 
1278 #define HSTATE_ATTR(_name) \
1279 	static struct kobj_attribute _name##_attr = \
1280 		__ATTR(_name, 0644, _name##_show, _name##_store)
1281 
1282 static struct kobject *hugepages_kobj;
1283 static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
1284 
1285 static struct hstate *kobj_to_hstate(struct kobject *kobj)
1286 {
1287 	int i;
1288 	for (i = 0; i < HUGE_MAX_HSTATE; i++)
1289 		if (hstate_kobjs[i] == kobj)
1290 			return &hstates[i];
1291 	BUG();
1292 	return NULL;
1293 }
1294 
1295 static ssize_t nr_hugepages_show(struct kobject *kobj,
1296 					struct kobj_attribute *attr, char *buf)
1297 {
1298 	struct hstate *h = kobj_to_hstate(kobj);
1299 	return sprintf(buf, "%lu\n", h->nr_huge_pages);
1300 }
1301 static ssize_t nr_hugepages_store(struct kobject *kobj,
1302 		struct kobj_attribute *attr, const char *buf, size_t count)
1303 {
1304 	int err;
1305 	unsigned long input;
1306 	struct hstate *h = kobj_to_hstate(kobj);
1307 
1308 	err = strict_strtoul(buf, 10, &input);
1309 	if (err)
1310 		return 0;
1311 
1312 	h->max_huge_pages = set_max_huge_pages(h, input);
1313 
1314 	return count;
1315 }
1316 HSTATE_ATTR(nr_hugepages);
1317 
1318 static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
1319 					struct kobj_attribute *attr, char *buf)
1320 {
1321 	struct hstate *h = kobj_to_hstate(kobj);
1322 	return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages);
1323 }
1324 static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
1325 		struct kobj_attribute *attr, const char *buf, size_t count)
1326 {
1327 	int err;
1328 	unsigned long input;
1329 	struct hstate *h = kobj_to_hstate(kobj);
1330 
1331 	err = strict_strtoul(buf, 10, &input);
1332 	if (err)
1333 		return 0;
1334 
1335 	spin_lock(&hugetlb_lock);
1336 	h->nr_overcommit_huge_pages = input;
1337 	spin_unlock(&hugetlb_lock);
1338 
1339 	return count;
1340 }
1341 HSTATE_ATTR(nr_overcommit_hugepages);
1342 
1343 static ssize_t free_hugepages_show(struct kobject *kobj,
1344 					struct kobj_attribute *attr, char *buf)
1345 {
1346 	struct hstate *h = kobj_to_hstate(kobj);
1347 	return sprintf(buf, "%lu\n", h->free_huge_pages);
1348 }
1349 HSTATE_ATTR_RO(free_hugepages);
1350 
1351 static ssize_t resv_hugepages_show(struct kobject *kobj,
1352 					struct kobj_attribute *attr, char *buf)
1353 {
1354 	struct hstate *h = kobj_to_hstate(kobj);
1355 	return sprintf(buf, "%lu\n", h->resv_huge_pages);
1356 }
1357 HSTATE_ATTR_RO(resv_hugepages);
1358 
1359 static ssize_t surplus_hugepages_show(struct kobject *kobj,
1360 					struct kobj_attribute *attr, char *buf)
1361 {
1362 	struct hstate *h = kobj_to_hstate(kobj);
1363 	return sprintf(buf, "%lu\n", h->surplus_huge_pages);
1364 }
1365 HSTATE_ATTR_RO(surplus_hugepages);
1366 
1367 static struct attribute *hstate_attrs[] = {
1368 	&nr_hugepages_attr.attr,
1369 	&nr_overcommit_hugepages_attr.attr,
1370 	&free_hugepages_attr.attr,
1371 	&resv_hugepages_attr.attr,
1372 	&surplus_hugepages_attr.attr,
1373 	NULL,
1374 };
1375 
1376 static struct attribute_group hstate_attr_group = {
1377 	.attrs = hstate_attrs,
1378 };
1379 
1380 static int __init hugetlb_sysfs_add_hstate(struct hstate *h)
1381 {
1382 	int retval;
1383 
1384 	hstate_kobjs[h - hstates] = kobject_create_and_add(h->name,
1385 							hugepages_kobj);
1386 	if (!hstate_kobjs[h - hstates])
1387 		return -ENOMEM;
1388 
1389 	retval = sysfs_create_group(hstate_kobjs[h - hstates],
1390 							&hstate_attr_group);
1391 	if (retval)
1392 		kobject_put(hstate_kobjs[h - hstates]);
1393 
1394 	return retval;
1395 }
1396 
1397 static void __init hugetlb_sysfs_init(void)
1398 {
1399 	struct hstate *h;
1400 	int err;
1401 
1402 	hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj);
1403 	if (!hugepages_kobj)
1404 		return;
1405 
1406 	for_each_hstate(h) {
1407 		err = hugetlb_sysfs_add_hstate(h);
1408 		if (err)
1409 			printk(KERN_ERR "Hugetlb: Unable to add hstate %s",
1410 								h->name);
1411 	}
1412 }
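/*
 * Resulting sysfs layout (illustrative, for a 2MB hstate, assuming mm_kobj
 * is /sys/kernel/mm):
 *
 *	/sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
 *	/sys/kernel/mm/hugepages/hugepages-2048kB/nr_overcommit_hugepages
 *	/sys/kernel/mm/hugepages/hugepages-2048kB/free_hugepages
 *	/sys/kernel/mm/hugepages/hugepages-2048kB/resv_hugepages
 *	/sys/kernel/mm/hugepages/hugepages-2048kB/surplus_hugepages
 */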
1413 
1414 static void __exit hugetlb_exit(void)
1415 {
1416 	struct hstate *h;
1417 
1418 	for_each_hstate(h) {
1419 		kobject_put(hstate_kobjs[h - hstates]);
1420 	}
1421 
1422 	kobject_put(hugepages_kobj);
1423 }
1424 module_exit(hugetlb_exit);
1425 
1426 static int __init hugetlb_init(void)
1427 {
1428 	/* Some platforms decide whether they support huge pages at boot
1429 	 * time. On these, such as powerpc, HPAGE_SHIFT is set to 0 when
1430 	 * there is no such support.
1431 	 */
1432 	if (HPAGE_SHIFT == 0)
1433 		return 0;
1434 
1435 	if (!size_to_hstate(default_hstate_size)) {
1436 		default_hstate_size = HPAGE_SIZE;
1437 		if (!size_to_hstate(default_hstate_size))
1438 			hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
1439 	}
1440 	default_hstate_idx = size_to_hstate(default_hstate_size) - hstates;
1441 	if (default_hstate_max_huge_pages)
1442 		default_hstate.max_huge_pages = default_hstate_max_huge_pages;
1443 
1444 	hugetlb_init_hstates();
1445 
1446 	gather_bootmem_prealloc();
1447 
1448 	report_hugepages();
1449 
1450 	hugetlb_sysfs_init();
1451 
1452 	return 0;
1453 }
1454 module_init(hugetlb_init);
1455 
1456 /* Should be called on processing a hugepagesz=... option */
1457 void __init hugetlb_add_hstate(unsigned order)
1458 {
1459 	struct hstate *h;
1460 	unsigned long i;
1461 
1462 	if (size_to_hstate(PAGE_SIZE << order)) {
1463 		printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n");
1464 		return;
1465 	}
1466 	BUG_ON(max_hstate >= HUGE_MAX_HSTATE);
1467 	BUG_ON(order == 0);
1468 	h = &hstates[max_hstate++];
1469 	h->order = order;
1470 	h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1);
1471 	h->nr_huge_pages = 0;
1472 	h->free_huge_pages = 0;
1473 	for (i = 0; i < MAX_NUMNODES; ++i)
1474 		INIT_LIST_HEAD(&h->hugepage_freelists[i]);
1475 	h->next_nid_to_alloc = first_node(node_online_map);
1476 	h->next_nid_to_free = first_node(node_online_map);
1477 	snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
1478 					huge_page_size(h)/1024);
1479 
1480 	parsed_hstate = h;
1481 }
1482 
1483 static int __init hugetlb_nrpages_setup(char *s)
1484 {
1485 	unsigned long *mhp;
1486 	static unsigned long *last_mhp;
1487 
1488 	/*
1489 	 * !max_hstate means we haven't parsed a hugepagesz= parameter yet,
1490 	 * so this hugepages= parameter goes to the "default hstate".
1491 	 */
1492 	if (!max_hstate)
1493 		mhp = &default_hstate_max_huge_pages;
1494 	else
1495 		mhp = &parsed_hstate->max_huge_pages;
1496 
1497 	if (mhp == last_mhp) {
1498 		printk(KERN_WARNING "hugepages= specified twice without "
1499 			"interleaving hugepagesz=, ignoring\n");
1500 		return 1;
1501 	}
1502 
1503 	if (sscanf(s, "%lu", mhp) <= 0)
1504 		*mhp = 0;
1505 
1506 	/*
1507 	 * Global state is always initialized later in hugetlb_init.
1508 	 * But we need to allocate >= MAX_ORDER hstates here early to still
1509 	 * use the bootmem allocator.
1510 	 */
1511 	if (max_hstate && parsed_hstate->order >= MAX_ORDER)
1512 		hugetlb_hstate_alloc_pages(parsed_hstate);
1513 
1514 	last_mhp = mhp;
1515 
1516 	return 1;
1517 }
1518 __setup("hugepages=", hugetlb_nrpages_setup);
1519 
1520 static int __init hugetlb_default_setup(char *s)
1521 {
1522 	default_hstate_size = memparse(s, &s);
1523 	return 1;
1524 }
1525 __setup("default_hugepagesz=", hugetlb_default_setup);
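/*
 * Example boot command line (illustrative): each hugepages= applies to the
 * most recently parsed hugepagesz=, while a hugepages= given before any
 * hugepagesz= applies to the default hstate:
 *
 *	default_hugepagesz=2M hugepagesz=2M hugepages=512 hugepagesz=1G hugepages=4
 */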
1526 
1527 static unsigned int cpuset_mems_nr(unsigned int *array)
1528 {
1529 	int node;
1530 	unsigned int nr = 0;
1531 
1532 	for_each_node_mask(node, cpuset_current_mems_allowed)
1533 		nr += array[node];
1534 
1535 	return nr;
1536 }
1537 
1538 #ifdef CONFIG_SYSCTL
1539 int hugetlb_sysctl_handler(struct ctl_table *table, int write,
1540 			   void __user *buffer,
1541 			   size_t *length, loff_t *ppos)
1542 {
1543 	struct hstate *h = &default_hstate;
1544 	unsigned long tmp;
1545 
1546 	if (!write)
1547 		tmp = h->max_huge_pages;
1548 
1549 	table->data = &tmp;
1550 	table->maxlen = sizeof(unsigned long);
1551 	proc_doulongvec_minmax(table, write, buffer, length, ppos);
1552 
1553 	if (write)
1554 		h->max_huge_pages = set_max_huge_pages(h, tmp);
1555 
1556 	return 0;
1557 }
1558 
1559 int hugetlb_treat_movable_handler(struct ctl_table *table, int write,
1560 			void __user *buffer,
1561 			size_t *length, loff_t *ppos)
1562 {
1563 	proc_dointvec(table, write, buffer, length, ppos);
1564 	if (hugepages_treat_as_movable)
1565 		htlb_alloc_mask = GFP_HIGHUSER_MOVABLE;
1566 	else
1567 		htlb_alloc_mask = GFP_HIGHUSER;
1568 	return 0;
1569 }
1570 
1571 int hugetlb_overcommit_handler(struct ctl_table *table, int write,
1572 			void __user *buffer,
1573 			size_t *length, loff_t *ppos)
1574 {
1575 	struct hstate *h = &default_hstate;
1576 	unsigned long tmp;
1577 
1578 	if (!write)
1579 		tmp = h->nr_overcommit_huge_pages;
1580 
1581 	table->data = &tmp;
1582 	table->maxlen = sizeof(unsigned long);
1583 	proc_doulongvec_minmax(table, write, buffer, length, ppos);
1584 
1585 	if (write) {
1586 		spin_lock(&hugetlb_lock);
1587 		h->nr_overcommit_huge_pages = tmp;
1588 		spin_unlock(&hugetlb_lock);
1589 	}
1590 
1591 	return 0;
1592 }
1593 
1594 #endif /* CONFIG_SYSCTL */
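/*
 * The handlers above are typically wired up in kernel/sysctl.c to the
 * vm.nr_hugepages, vm.hugepages_treat_as_movable and
 * vm.nr_overcommit_hugepages sysctls, e.g. (illustrative):
 *
 *	echo 512 > /proc/sys/vm/nr_hugepages
 *	echo 64 > /proc/sys/vm/nr_overcommit_hugepages
 */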
1595 
1596 void hugetlb_report_meminfo(struct seq_file *m)
1597 {
1598 	struct hstate *h = &default_hstate;
1599 	seq_printf(m,
1600 			"HugePages_Total:   %5lu\n"
1601 			"HugePages_Free:    %5lu\n"
1602 			"HugePages_Rsvd:    %5lu\n"
1603 			"HugePages_Surp:    %5lu\n"
1604 			"Hugepagesize:   %8lu kB\n",
1605 			h->nr_huge_pages,
1606 			h->free_huge_pages,
1607 			h->resv_huge_pages,
1608 			h->surplus_huge_pages,
1609 			1UL << (huge_page_order(h) + PAGE_SHIFT - 10));
1610 }
1611 
1612 int hugetlb_report_node_meminfo(int nid, char *buf)
1613 {
1614 	struct hstate *h = &default_hstate;
1615 	return sprintf(buf,
1616 		"Node %d HugePages_Total: %5u\n"
1617 		"Node %d HugePages_Free:  %5u\n"
1618 		"Node %d HugePages_Surp:  %5u\n",
1619 		nid, h->nr_huge_pages_node[nid],
1620 		nid, h->free_huge_pages_node[nid],
1621 		nid, h->surplus_huge_pages_node[nid]);
1622 }
1623 
1624 /* Return the number of pages of memory we physically have, in PAGE_SIZE units. */
1625 unsigned long hugetlb_total_pages(void)
1626 {
1627 	struct hstate *h = &default_hstate;
1628 	return h->nr_huge_pages * pages_per_huge_page(h);
1629 }
1630 
1631 static int hugetlb_acct_memory(struct hstate *h, long delta)
1632 {
1633 	int ret = -ENOMEM;
1634 
1635 	spin_lock(&hugetlb_lock);
1636 	/*
1637 	 * When cpuset is configured, it breaks the strict hugetlb page
1638 	 * reservation as the accounting is done on a global variable. Such
1639 	 * reservation is completely rubbish in the presence of cpuset because
1640 	 * the reservation is not checked against page availability for the
1641 	 * current cpuset. An application can still potentially be OOM'ed by the
1642 	 * kernel due to the lack of free hugetlb pages in the cpuset that the
1643 	 * task is in. Attempting to enforce strict accounting with cpuset is
1644 	 * almost impossible (or too ugly) because cpusets are so fluid that
1645 	 * tasks or memory nodes can be dynamically moved between cpusets.
1646 	 *
1647 	 * The change of semantics for shared hugetlb mapping with cpuset is
1648 	 * undesirable. However, in order to preserve some of the semantics,
1649 	 * we fall back to check against current free page availability as
1650 	 * a best attempt and hopefully to minimize the impact of changing
1651 	 * semantics that cpuset has.
1652 	 */
1653 	if (delta > 0) {
1654 		if (gather_surplus_pages(h, delta) < 0)
1655 			goto out;
1656 
1657 		if (delta > cpuset_mems_nr(h->free_huge_pages_node)) {
1658 			return_unused_surplus_pages(h, delta);
1659 			goto out;
1660 		}
1661 	}
1662 
1663 	ret = 0;
1664 	if (delta < 0)
1665 		return_unused_surplus_pages(h, (unsigned long) -delta);
1666 
1667 out:
1668 	spin_unlock(&hugetlb_lock);
1669 	return ret;
1670 }
1671 
1672 static void hugetlb_vm_op_open(struct vm_area_struct *vma)
1673 {
1674 	struct resv_map *reservations = vma_resv_map(vma);
1675 
1676 	/*
1677 	 * This new VMA should share its sibling's reservation map if present.
1678 	 * The VMA will only ever have a valid reservation map pointer where
1679 	 * it is being copied for another still existing VMA.  As that VMA
1680 	 * has a reference to the reservation map it cannot disappear until
1681 	 * after this open call completes.  It is therefore safe to take a
1682 	 * new reference here without additional locking.
1683 	 */
1684 	if (reservations)
1685 		kref_get(&reservations->refs);
1686 }
1687 
1688 static void hugetlb_vm_op_close(struct vm_area_struct *vma)
1689 {
1690 	struct hstate *h = hstate_vma(vma);
1691 	struct resv_map *reservations = vma_resv_map(vma);
1692 	unsigned long reserve;
1693 	unsigned long start;
1694 	unsigned long end;
1695 
1696 	if (reservations) {
1697 		start = vma_hugecache_offset(h, vma, vma->vm_start);
1698 		end = vma_hugecache_offset(h, vma, vma->vm_end);
1699 
1700 		reserve = (end - start) -
1701 			region_count(&reservations->regions, start, end);
1702 
1703 		kref_put(&reservations->refs, resv_map_release);
1704 
1705 		if (reserve) {
1706 			hugetlb_acct_memory(h, -reserve);
1707 			hugetlb_put_quota(vma->vm_file->f_mapping, reserve);
1708 		}
1709 	}
1710 }
1711 
1712 /*
1713  * We cannot handle pagefaults against hugetlb pages at all.  They cause
1714  * handle_mm_fault() to try to instantiate regular-sized pages in the
1715  * hugepage VMA.  do_page_fault() is supposed to trap this, so BUG if we get
1716  * this far.
1717  */
1718 static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1719 {
1720 	BUG();
1721 	return 0;
1722 }
1723 
1724 const struct vm_operations_struct hugetlb_vm_ops = {
1725 	.fault = hugetlb_vm_op_fault,
1726 	.open = hugetlb_vm_op_open,
1727 	.close = hugetlb_vm_op_close,
1728 };
1729 
1730 static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
1731 				int writable)
1732 {
1733 	pte_t entry;
1734 
1735 	if (writable) {
1736 		entry =
1737 		    pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
1738 	} else {
1739 		entry = huge_pte_wrprotect(mk_pte(page, vma->vm_page_prot));
1740 	}
1741 	entry = pte_mkyoung(entry);
1742 	entry = pte_mkhuge(entry);
1743 
1744 	return entry;
1745 }
1746 
1747 static void set_huge_ptep_writable(struct vm_area_struct *vma,
1748 				   unsigned long address, pte_t *ptep)
1749 {
1750 	pte_t entry;
1751 
1752 	entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep)));
1753 	if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) {
1754 		update_mmu_cache(vma, address, entry);
1755 	}
1756 }
1757 
1758 
1759 int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
1760 			    struct vm_area_struct *vma)
1761 {
1762 	pte_t *src_pte, *dst_pte, entry;
1763 	struct page *ptepage;
1764 	unsigned long addr;
1765 	int cow;
1766 	struct hstate *h = hstate_vma(vma);
1767 	unsigned long sz = huge_page_size(h);
1768 
1769 	cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
1770 
1771 	for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
1772 		src_pte = huge_pte_offset(src, addr);
1773 		if (!src_pte)
1774 			continue;
1775 		dst_pte = huge_pte_alloc(dst, addr, sz);
1776 		if (!dst_pte)
1777 			goto nomem;
1778 
1779 		/* If the pagetables are shared don't copy or take references */
1780 		if (dst_pte == src_pte)
1781 			continue;
1782 
1783 		spin_lock(&dst->page_table_lock);
1784 		spin_lock_nested(&src->page_table_lock, SINGLE_DEPTH_NESTING);
1785 		if (!huge_pte_none(huge_ptep_get(src_pte))) {
1786 			if (cow)
1787 				huge_ptep_set_wrprotect(src, addr, src_pte);
1788 			entry = huge_ptep_get(src_pte);
1789 			ptepage = pte_page(entry);
1790 			get_page(ptepage);
1791 			set_huge_pte_at(dst, addr, dst_pte, entry);
1792 		}
1793 		spin_unlock(&src->page_table_lock);
1794 		spin_unlock(&dst->page_table_lock);
1795 	}
1796 	return 0;
1797 
1798 nomem:
1799 	return -ENOMEM;
1800 }
1801 
1802 void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
1803 			    unsigned long end, struct page *ref_page)
1804 {
1805 	struct mm_struct *mm = vma->vm_mm;
1806 	unsigned long address;
1807 	pte_t *ptep;
1808 	pte_t pte;
1809 	struct page *page;
1810 	struct page *tmp;
1811 	struct hstate *h = hstate_vma(vma);
1812 	unsigned long sz = huge_page_size(h);
1813 
1814 	/*
1815 	 * A page gathering list, protected by per file i_mmap_lock. The
1816 	 * lock is used to avoid list corruption from multiple unmapping
1817 	 * of the same page since we are using page->lru.
1818 	 */
1819 	LIST_HEAD(page_list);
1820 
1821 	WARN_ON(!is_vm_hugetlb_page(vma));
1822 	BUG_ON(start & ~huge_page_mask(h));
1823 	BUG_ON(end & ~huge_page_mask(h));
1824 
1825 	mmu_notifier_invalidate_range_start(mm, start, end);
1826 	spin_lock(&mm->page_table_lock);
1827 	for (address = start; address < end; address += sz) {
1828 		ptep = huge_pte_offset(mm, address);
1829 		if (!ptep)
1830 			continue;
1831 
1832 		if (huge_pmd_unshare(mm, &address, ptep))
1833 			continue;
1834 
1835 		/*
1836 		 * If a reference page is supplied, it is because a specific
1837 		 * page is being unmapped, not a range. Ensure the page we
1838 		 * are about to unmap is the actual page of interest.
1839 		 */
1840 		if (ref_page) {
1841 			pte = huge_ptep_get(ptep);
1842 			if (huge_pte_none(pte))
1843 				continue;
1844 			page = pte_page(pte);
1845 			if (page != ref_page)
1846 				continue;
1847 
1848 			/*
1849 			 * Mark the VMA as having unmapped its page so that
1850 			 * future faults in this VMA will fail rather than
1851 			 * looking like data was lost
1852 			 */
1853 			set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED);
1854 		}
1855 
1856 		pte = huge_ptep_get_and_clear(mm, address, ptep);
1857 		if (huge_pte_none(pte))
1858 			continue;
1859 
1860 		page = pte_page(pte);
1861 		if (pte_dirty(pte))
1862 			set_page_dirty(page);
1863 		list_add(&page->lru, &page_list);
1864 	}
1865 	spin_unlock(&mm->page_table_lock);
1866 	flush_tlb_range(vma, start, end);
1867 	mmu_notifier_invalidate_range_end(mm, start, end);
1868 	list_for_each_entry_safe(page, tmp, &page_list, lru) {
1869 		list_del(&page->lru);
1870 		put_page(page);
1871 	}
1872 }
1873 
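/*
 * Locking wrapper around __unmap_hugepage_range(): takes the per-file
 * i_mmap_lock so that concurrent unmaps of the same pages cannot corrupt
 * the page->lru gathering list used above.
 */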
1874 void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
1875 			  unsigned long end, struct page *ref_page)
1876 {
1877 	spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
1878 	__unmap_hugepage_range(vma, start, end, ref_page);
1879 	spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
1880 }
1881 
1882 /*
1883  * This is called when the original mapper is failing to COW a MAP_PRIVATE
1884  * mappping it owns the reserve page for. The intention is to unmap the page
1885  * from other VMAs and let the children be SIGKILLed if they are faulting the
1886  * same region.
1887  */
1888 static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
1889 				struct page *page, unsigned long address)
1890 {
1891 	struct hstate *h = hstate_vma(vma);
1892 	struct vm_area_struct *iter_vma;
1893 	struct address_space *mapping;
1894 	struct prio_tree_iter iter;
1895 	pgoff_t pgoff;
1896 
1897 	/*
1898 	 * vm_pgoff is in PAGE_SIZE units, hence the different calculation
1899 	 * from page cache lookup which is in HPAGE_SIZE units.
1900 	 */
1901 	address = address & huge_page_mask(h);
1902 	pgoff = ((address - vma->vm_start) >> PAGE_SHIFT)
1903 		+ vma->vm_pgoff;
1904 	mapping = (struct address_space *)page_private(page);
1905 
1906 	vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
1907 		/* Do not unmap the current VMA */
1908 		if (iter_vma == vma)
1909 			continue;
1910 
1911 		/*
1912 		 * Unmap the page from other VMAs without their own reserves.
1913 		 * They get marked to be SIGKILLed if they fault in these
1914 		 * areas. This is because a future no-page fault on this VMA
1915 		 * could insert a zeroed page instead of the data existing
1916 		 * from the time of fork. This would look like data corruption
1917 		 */
1918 		if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
1919 			unmap_hugepage_range(iter_vma,
1920 				address, address + huge_page_size(h),
1921 				page);
1922 	}
1923 
1924 	return 1;
1925 }
1926 
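/*
 * Handle a copy-on-write fault on a huge page.  A rough sketch of the flow
 * implemented below (not additional behaviour):
 *
 *	if only this mapping uses old_page:
 *		make the PTE writable and return;	(fast path)
 *	new_page = alloc_huge_page();	allocated outside the reserve when
 *					the pagecache page shows the reserve
 *					was already consumed
 *	if the allocation fails and we were outside the reserve:
 *		unmap_ref_private() evicts the page from the children and
 *		we retry the fast path;
 *	otherwise copy old_page into new_page and, if the PTE is still
 *	unchanged, install the new writable mapping.
 *
 * Called with mm->page_table_lock held; it is dropped around the copy.
 */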
1927 static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
1928 			unsigned long address, pte_t *ptep, pte_t pte,
1929 			struct page *pagecache_page)
1930 {
1931 	struct hstate *h = hstate_vma(vma);
1932 	struct page *old_page, *new_page;
1933 	int avoidcopy;
1934 	int outside_reserve = 0;
1935 
1936 	old_page = pte_page(pte);
1937 
1938 retry_avoidcopy:
1939 	/* If no-one else is actually using this page, avoid the copy
1940 	 * and just make the page writable */
1941 	avoidcopy = (page_count(old_page) == 1);
1942 	if (avoidcopy) {
1943 		set_huge_ptep_writable(vma, address, ptep);
1944 		return 0;
1945 	}
1946 
1947 	/*
1948 	 * If the process that created a MAP_PRIVATE mapping is about to
1949 	 * perform a COW due to a shared page count, attempt to satisfy
1950 	 * the allocation without using the existing reserves. The pagecache
1951 	 * page is used to determine if the reserve at this address was
1952 	 * consumed or not. If reserves were used, a partial faulted mapping
1953 	 * at the time of fork() could consume its reserves on COW instead
1954 	 * of the full address range.
1955 	 */
1956 	if (!(vma->vm_flags & VM_MAYSHARE) &&
1957 			is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
1958 			old_page != pagecache_page)
1959 		outside_reserve = 1;
1960 
1961 	page_cache_get(old_page);
1962 	new_page = alloc_huge_page(vma, address, outside_reserve);
1963 
1964 	if (IS_ERR(new_page)) {
1965 		page_cache_release(old_page);
1966 
1967 		/*
1968 		 * If a process owning a MAP_PRIVATE mapping fails to COW,
1969 		 * it is due to references held by a child and an insufficient
1970 		 * huge page pool. To guarantee the original mapper's
1971 		 * reliability, unmap the page from child processes. The child
1972 		 * may get SIGKILLed if it later faults.
1973 		 */
1974 		if (outside_reserve) {
1975 			BUG_ON(huge_pte_none(pte));
1976 			if (unmap_ref_private(mm, vma, old_page, address)) {
1977 				BUG_ON(page_count(old_page) != 1);
1978 				BUG_ON(huge_pte_none(pte));
1979 				goto retry_avoidcopy;
1980 			}
1981 			WARN_ON_ONCE(1);
1982 		}
1983 
1984 		return -PTR_ERR(new_page);
1985 	}
1986 
1987 	spin_unlock(&mm->page_table_lock);
1988 	copy_huge_page(new_page, old_page, address, vma);
1989 	__SetPageUptodate(new_page);
1990 	spin_lock(&mm->page_table_lock);
1991 
1992 	ptep = huge_pte_offset(mm, address & huge_page_mask(h));
1993 	if (likely(pte_same(huge_ptep_get(ptep), pte))) {
1994 		/* Break COW */
1995 		huge_ptep_clear_flush(vma, address, ptep);
1996 		set_huge_pte_at(mm, address, ptep,
1997 				make_huge_pte(vma, new_page, 1));
1998 		/* Make the old page be freed below */
1999 		new_page = old_page;
2000 	}
2001 	page_cache_release(new_page);
2002 	page_cache_release(old_page);
2003 	return 0;
2004 }
2005 
2006 /* Return the pagecache page at a given address within a VMA */
2007 static struct page *hugetlbfs_pagecache_page(struct hstate *h,
2008 			struct vm_area_struct *vma, unsigned long address)
2009 {
2010 	struct address_space *mapping;
2011 	pgoff_t idx;
2012 
2013 	mapping = vma->vm_file->f_mapping;
2014 	idx = vma_hugecache_offset(h, vma, address);
2015 
2016 	return find_lock_page(mapping, idx);
2017 }
2018 
2019 /*
2020  * Return whether there is a pagecache page backing the given address in the VMA.
2021  * Caller follow_hugetlb_page() holds page_table_lock, so we cannot lock_page.
2022  */
2023 static bool hugetlbfs_pagecache_present(struct hstate *h,
2024 			struct vm_area_struct *vma, unsigned long address)
2025 {
2026 	struct address_space *mapping;
2027 	pgoff_t idx;
2028 	struct page *page;
2029 
2030 	mapping = vma->vm_file->f_mapping;
2031 	idx = vma_hugecache_offset(h, vma, address);
2032 
2033 	page = find_get_page(mapping, idx);
2034 	if (page)
2035 		put_page(page);
2036 	return page != NULL;
2037 }
2038 
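/*
 * Handle a fault on a huge PTE that is currently empty: look the page up
 * in the hugetlbfs page cache, or allocate and (for VM_MAYSHARE) insert a
 * freshly zeroed one.  The i_size and PTE checks are repeated under
 * mm->page_table_lock to guard against racing truncation and racing
 * faults.  For a private write fault the COW is performed immediately so
 * a second fault is not needed.
 */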
2039 static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
2040 			unsigned long address, pte_t *ptep, unsigned int flags)
2041 {
2042 	struct hstate *h = hstate_vma(vma);
2043 	int ret = VM_FAULT_SIGBUS;
2044 	pgoff_t idx;
2045 	unsigned long size;
2046 	struct page *page;
2047 	struct address_space *mapping;
2048 	pte_t new_pte;
2049 
2050 	/*
2051 	 * Currently, we are forced to kill the process in the event the
2052 	 * original mapper has unmapped pages from the child due to a failed
2053 	 * COW. Warn that such a situation has occurred, as it may not be obvious.
2054 	 */
2055 	if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
2056 		printk(KERN_WARNING
2057 			"PID %d killed due to inadequate hugepage pool\n",
2058 			current->pid);
2059 		return ret;
2060 	}
2061 
2062 	mapping = vma->vm_file->f_mapping;
2063 	idx = vma_hugecache_offset(h, vma, address);
2064 
2065 	/*
2066 	 * Use page lock to guard against racing truncation
2067 	 * before we get page_table_lock.
2068 	 */
2069 retry:
2070 	page = find_lock_page(mapping, idx);
2071 	if (!page) {
2072 		size = i_size_read(mapping->host) >> huge_page_shift(h);
2073 		if (idx >= size)
2074 			goto out;
2075 		page = alloc_huge_page(vma, address, 0);
2076 		if (IS_ERR(page)) {
2077 			ret = -PTR_ERR(page);
2078 			goto out;
2079 		}
2080 		clear_huge_page(page, address, huge_page_size(h));
2081 		__SetPageUptodate(page);
2082 
2083 		if (vma->vm_flags & VM_MAYSHARE) {
2084 			int err;
2085 			struct inode *inode = mapping->host;
2086 
2087 			err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
2088 			if (err) {
2089 				put_page(page);
2090 				if (err == -EEXIST)
2091 					goto retry;
2092 				goto out;
2093 			}
2094 
2095 			spin_lock(&inode->i_lock);
2096 			inode->i_blocks += blocks_per_huge_page(h);
2097 			spin_unlock(&inode->i_lock);
2098 		} else
2099 			lock_page(page);
2100 	}
2101 
2102 	/*
2103 	 * If we are going to COW a private mapping later, we examine the
2104 	 * pending reservations for this page now. This will ensure that
2105 	 * any allocations necessary to record that reservation occur outside
2106 	 * the spinlock.
2107 	 */
2108 	if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED))
2109 		if (vma_needs_reservation(h, vma, address) < 0) {
2110 			ret = VM_FAULT_OOM;
2111 			goto backout_unlocked;
2112 		}
2113 
2114 	spin_lock(&mm->page_table_lock);
2115 	size = i_size_read(mapping->host) >> huge_page_shift(h);
2116 	if (idx >= size)
2117 		goto backout;
2118 
2119 	ret = 0;
2120 	if (!huge_pte_none(huge_ptep_get(ptep)))
2121 		goto backout;
2122 
2123 	new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
2124 				&& (vma->vm_flags & VM_SHARED)));
2125 	set_huge_pte_at(mm, address, ptep, new_pte);
2126 
2127 	if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
2128 		/* Optimization, do the COW without a second fault */
2129 		ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page);
2130 	}
2131 
2132 	spin_unlock(&mm->page_table_lock);
2133 	unlock_page(page);
2134 out:
2135 	return ret;
2136 
2137 backout:
2138 	spin_unlock(&mm->page_table_lock);
2139 backout_unlocked:
2140 	unlock_page(page);
2141 	put_page(page);
2142 	goto out;
2143 }
2144 
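/*
 * Top-level hugetlb fault handler.  The ordering below matters: anything
 * that can sleep or allocate (reservation bookkeeping, the pagecache
 * lookup for private COW) is done under hugetlb_instantiation_mutex but
 * before mm->page_table_lock is taken, roughly:
 *
 *	mutex_lock(&hugetlb_instantiation_mutex);
 *	if the PTE is empty -> hugetlb_no_page();
 *	vma_needs_reservation(), hugetlbfs_pagecache_page();
 *	spin_lock(&mm->page_table_lock);
 *	re-check the PTE, then hugetlb_cow() or just update access flags;
 *	spin_unlock(&mm->page_table_lock);
 *	mutex_unlock(&hugetlb_instantiation_mutex);
 */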
2145 int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2146 			unsigned long address, unsigned int flags)
2147 {
2148 	pte_t *ptep;
2149 	pte_t entry;
2150 	int ret;
2151 	struct page *pagecache_page = NULL;
2152 	static DEFINE_MUTEX(hugetlb_instantiation_mutex);
2153 	struct hstate *h = hstate_vma(vma);
2154 
2155 	ptep = huge_pte_alloc(mm, address, huge_page_size(h));
2156 	if (!ptep)
2157 		return VM_FAULT_OOM;
2158 
2159 	/*
2160 	 * Serialize hugepage allocation and instantiation, so that we don't
2161 	 * get spurious allocation failures if two CPUs race to instantiate
2162 	 * the same page in the page cache.
2163 	 */
2164 	mutex_lock(&hugetlb_instantiation_mutex);
2165 	entry = huge_ptep_get(ptep);
2166 	if (huge_pte_none(entry)) {
2167 		ret = hugetlb_no_page(mm, vma, address, ptep, flags);
2168 		goto out_mutex;
2169 	}
2170 
2171 	ret = 0;
2172 
2173 	/*
2174 	 * If we are going to COW the mapping later, we examine the pending
2175 	 * reservations for this page now. This will ensure that any
2176 	 * allocations necessary to record that reservation occur outside the
2177 	 * spinlock. For private mappings, we also lookup the pagecache
2178 	 * page now as it is used to determine if a reservation has been
2179 	 * consumed.
2180 	 */
2181 	if ((flags & FAULT_FLAG_WRITE) && !pte_write(entry)) {
2182 		if (vma_needs_reservation(h, vma, address) < 0) {
2183 			ret = VM_FAULT_OOM;
2184 			goto out_mutex;
2185 		}
2186 
2187 		if (!(vma->vm_flags & VM_MAYSHARE))
2188 			pagecache_page = hugetlbfs_pagecache_page(h,
2189 								vma, address);
2190 	}
2191 
2192 	spin_lock(&mm->page_table_lock);
2193 	/* Check for a racing update before calling hugetlb_cow */
2194 	if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
2195 		goto out_page_table_lock;
2196 
2197 
2198 	if (flags & FAULT_FLAG_WRITE) {
2199 		if (!pte_write(entry)) {
2200 			ret = hugetlb_cow(mm, vma, address, ptep, entry,
2201 							pagecache_page);
2202 			goto out_page_table_lock;
2203 		}
2204 		entry = pte_mkdirty(entry);
2205 	}
2206 	entry = pte_mkyoung(entry);
2207 	if (huge_ptep_set_access_flags(vma, address, ptep, entry,
2208 						flags & FAULT_FLAG_WRITE))
2209 		update_mmu_cache(vma, address, entry);
2210 
2211 out_page_table_lock:
2212 	spin_unlock(&mm->page_table_lock);
2213 
2214 	if (pagecache_page) {
2215 		unlock_page(pagecache_page);
2216 		put_page(pagecache_page);
2217 	}
2218 
2219 out_mutex:
2220 	mutex_unlock(&hugetlb_instantiation_mutex);
2221 
2222 	return ret;
2223 }
2224 
2225 /* Can be overridden by architectures */
2226 __attribute__((weak)) struct page *
2227 follow_huge_pud(struct mm_struct *mm, unsigned long address,
2228 	       pud_t *pud, int write)
2229 {
2230 	BUG();
2231 	return NULL;
2232 }
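/*
 * Architectures that map huge pages at the PUD level are expected to
 * provide their own follow_huge_pud(); the weak stub above BUG()s because
 * it should be unreachable everywhere else.
 */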
2233 
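/*
 * get_user_pages() worker for hugetlb VMAs.  Missing or write-protected
 * entries are filled by calling hugetlb_fault() with the lock dropped;
 * FOLL_DUMP callers instead get -EFAULT for holes with no pagecache, so
 * core dumps stay sparse.  Within one huge page the loop advances in
 * PAGE_SIZE steps, using pfn_offset to index the compound page.
 */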
2234 int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
2235 			struct page **pages, struct vm_area_struct **vmas,
2236 			unsigned long *position, int *length, int i,
2237 			unsigned int flags)
2238 {
2239 	unsigned long pfn_offset;
2240 	unsigned long vaddr = *position;
2241 	int remainder = *length;
2242 	struct hstate *h = hstate_vma(vma);
2243 
2244 	spin_lock(&mm->page_table_lock);
2245 	while (vaddr < vma->vm_end && remainder) {
2246 		pte_t *pte;
2247 		int absent;
2248 		struct page *page;
2249 
2250 		/*
2251 		 * Some archs (sparc64, sh*) have multiple pte_ts for
2252 		 * each hugepage.  We have to make sure we get the
2253 		 * first one, so the page indexing below works.
2254 		 */
2255 		pte = huge_pte_offset(mm, vaddr & huge_page_mask(h));
2256 		absent = !pte || huge_pte_none(huge_ptep_get(pte));
2257 
2258 		/*
2259 		 * When coredumping, it suits get_dump_page if we just return
2260 		 * an error where there's an empty slot with no huge pagecache
2261 		 * to back it.  This way, we avoid allocating a hugepage, and
2262 		 * the sparse dumpfile avoids allocating disk blocks, but its
2263 		 * huge holes still show up with zeroes where they need to be.
2264 		 */
2265 		if (absent && (flags & FOLL_DUMP) &&
2266 		    !hugetlbfs_pagecache_present(h, vma, vaddr)) {
2267 			remainder = 0;
2268 			break;
2269 		}
2270 
2271 		if (absent ||
2272 		    ((flags & FOLL_WRITE) && !pte_write(huge_ptep_get(pte)))) {
2273 			int ret;
2274 
2275 			spin_unlock(&mm->page_table_lock);
2276 			ret = hugetlb_fault(mm, vma, vaddr,
2277 				(flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0);
2278 			spin_lock(&mm->page_table_lock);
2279 			if (!(ret & VM_FAULT_ERROR))
2280 				continue;
2281 
2282 			remainder = 0;
2283 			break;
2284 		}
2285 
2286 		pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
2287 		page = pte_page(huge_ptep_get(pte));
2288 same_page:
2289 		if (pages) {
2290 			pages[i] = mem_map_offset(page, pfn_offset);
2291 			get_page(pages[i]);
2292 		}
2293 
2294 		if (vmas)
2295 			vmas[i] = vma;
2296 
2297 		vaddr += PAGE_SIZE;
2298 		++pfn_offset;
2299 		--remainder;
2300 		++i;
2301 		if (vaddr < vma->vm_end && remainder &&
2302 				pfn_offset < pages_per_huge_page(h)) {
2303 			/*
2304 			 * We use pfn_offset to avoid touching the pageframes
2305 			 * of this compound page.
2306 			 */
2307 			goto same_page;
2308 		}
2309 	}
2310 	spin_unlock(&mm->page_table_lock);
2311 	*length = remainder;
2312 	*position = vaddr;
2313 
2314 	return i ? i : -EFAULT;
2315 }
2316 
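/*
 * Apply a new protection to every huge PTE in [address, end), e.g. on
 * behalf of mprotect().  PTEs behind a shared PMD are skipped via
 * huge_pmd_unshare(); the TLB is flushed once for the whole range at the
 * end.
 */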
2317 void hugetlb_change_protection(struct vm_area_struct *vma,
2318 		unsigned long address, unsigned long end, pgprot_t newprot)
2319 {
2320 	struct mm_struct *mm = vma->vm_mm;
2321 	unsigned long start = address;
2322 	pte_t *ptep;
2323 	pte_t pte;
2324 	struct hstate *h = hstate_vma(vma);
2325 
2326 	BUG_ON(address >= end);
2327 	flush_cache_range(vma, address, end);
2328 
2329 	spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
2330 	spin_lock(&mm->page_table_lock);
2331 	for (; address < end; address += huge_page_size(h)) {
2332 		ptep = huge_pte_offset(mm, address);
2333 		if (!ptep)
2334 			continue;
2335 		if (huge_pmd_unshare(mm, &address, ptep))
2336 			continue;
2337 		if (!huge_pte_none(huge_ptep_get(ptep))) {
2338 			pte = huge_ptep_get_and_clear(mm, address, ptep);
2339 			pte = pte_mkhuge(pte_modify(pte, newprot));
2340 			set_huge_pte_at(mm, address, ptep, pte);
2341 		}
2342 	}
2343 	spin_unlock(&mm->page_table_lock);
2344 	spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
2345 
2346 	flush_tlb_range(vma, start, end);
2347 }
2348 
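/*
 * Reserve huge pages for the range [from, to) at map/setup time (typically
 * reached from the hugetlbfs mmap and file-setup paths).  A rough sketch
 * of the decision made below:
 *
 *	shared (or !vma):  chg = region_chg(inode list, from, to);
 *	private:           chg = to - from; attach a resv_map and mark the
 *	                   VMA as HPAGE_RESV_OWNER;
 *	then charge the filesystem quota and hugetlb_acct_memory() for chg
 *	pages, and record the region for the shared case.
 */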
2349 int hugetlb_reserve_pages(struct inode *inode,
2350 					long from, long to,
2351 					struct vm_area_struct *vma,
2352 					int acctflag)
2353 {
2354 	long ret, chg;
2355 	struct hstate *h = hstate_inode(inode);
2356 
2357 	/*
2358 	 * Only apply hugepage reservation if asked. At fault time, a
2359 	 * VM_NORESERVE mapping will attempt to allocate the page and
2360 	 * filesystem quota without using reserves.
2361 	 */
2362 	if (acctflag & VM_NORESERVE)
2363 		return 0;
2364 
2365 	/*
2366 	 * Shared mappings base their reservation on the number of pages that
2367 	 * are already allocated on behalf of the file. Private mappings need
2368 	 * to reserve the full area even if read-only as mprotect() may be
2369 	 * called to make the mapping read-write. Assume !vma is a shm mapping
2370 	 */
2371 	if (!vma || vma->vm_flags & VM_MAYSHARE)
2372 		chg = region_chg(&inode->i_mapping->private_list, from, to);
2373 	else {
2374 		struct resv_map *resv_map = resv_map_alloc();
2375 		if (!resv_map)
2376 			return -ENOMEM;
2377 
2378 		chg = to - from;
2379 
2380 		set_vma_resv_map(vma, resv_map);
2381 		set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
2382 	}
2383 
2384 	if (chg < 0)
2385 		return chg;
2386 
2387 	/* There must be enough filesystem quota for the mapping */
2388 	if (hugetlb_get_quota(inode->i_mapping, chg))
2389 		return -ENOSPC;
2390 
2391 	/*
2392 	 * Check that enough hugepages are available for the reservation.
2393 	 * Hand back the quota if there are not enough.
2394 	 */
2395 	ret = hugetlb_acct_memory(h, chg);
2396 	if (ret < 0) {
2397 		hugetlb_put_quota(inode->i_mapping, chg);
2398 		return ret;
2399 	}
2400 
2401 	/*
2402 	 * Account for the reservations made. Shared mappings record regions
2403 	 * that have reservations as they are shared by multiple VMAs.
2404 	 * When the last VMA disappears, the region map says how much
2405 	 * the reservation was and the page cache tells how much of
2406 	 * the reservation was consumed. Private mappings are per-VMA and
2407 	 * only the consumed reservations are tracked. When the VMA
2408 	 * disappears, the original reservation is the VMA size and the
2409 	 * consumed reservations are stored in the map. Hence, nothing
2410 	 * else has to be done for private mappings here
2411 	 */
2412 	if (!vma || vma->vm_flags & VM_MAYSHARE)
2413 		region_add(&inode->i_mapping->private_list, from, to);
2414 	return 0;
2415 }
2416 
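/*
 * Release the reservations beyond 'offset', e.g. when hugetlbfs truncates
 * the file.  region_truncate() returns how many pages were still reserved
 * past that point ('chg'); 'freed' of them were actually instantiated, so
 * quota and hugepage accounting are handed back only for the unconsumed
 * remainder (chg - freed).
 */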
2417 void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
2418 {
2419 	struct hstate *h = hstate_inode(inode);
2420 	long chg = region_truncate(&inode->i_mapping->private_list, offset);
2421 
2422 	spin_lock(&inode->i_lock);
2423 	inode->i_blocks -= (blocks_per_huge_page(h) * freed);
2424 	spin_unlock(&inode->i_lock);
2425 
2426 	hugetlb_put_quota(inode->i_mapping, (chg - freed));
2427 	hugetlb_acct_memory(h, -(chg - freed));
2428 }
2429