xref: /openbmc/linux/mm/hugetlb.c (revision 7835e98b2e3c66dba79cb0ff8ebb90a2fe030c29)
/*
 * Generic hugetlb support.
 * (C) William Irwin, April 2004
 */
#include <linux/gfp.h>
#include <linux/list.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/sysctl.h>
#include <linux/highmem.h>
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
#include <linux/cpuset.h>

#include <asm/page.h>
#include <asm/pgtable.h>

#include <linux/hugetlb.h>
#include "internal.h"

const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
static unsigned long nr_huge_pages, free_huge_pages;
unsigned long max_huge_pages;
static struct list_head hugepage_freelists[MAX_NUMNODES];
static unsigned int nr_huge_pages_node[MAX_NUMNODES];
static unsigned int free_huge_pages_node[MAX_NUMNODES];

/*
 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
 */
static DEFINE_SPINLOCK(hugetlb_lock);

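/*
 * Add a hugepage to the free list of the node it belongs to and update
 * the free page counters.  The caller must hold hugetlb_lock.
 */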
static void enqueue_huge_page(struct page *page)
{
	int nid = page_to_nid(page);
	list_add(&page->lru, &hugepage_freelists[nid]);
	free_huge_pages++;
	free_huge_pages_node[nid]++;
}

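/*
 * Take a free hugepage off the first node in the zonelist returned by
 * huge_zonelist() (mempolicy-aware) whose zone is allowed by the cpuset
 * and has a page available.  Returns NULL if every allowed node's free
 * list is empty.  The caller must hold hugetlb_lock.
 */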
static struct page *dequeue_huge_page(struct vm_area_struct *vma,
				unsigned long address)
{
	int nid = numa_node_id();
	struct page *page = NULL;
	struct zonelist *zonelist = huge_zonelist(vma, address);
	struct zone **z;

	for (z = zonelist->zones; *z; z++) {
		nid = (*z)->zone_pgdat->node_id;
		if (cpuset_zone_allowed(*z, GFP_HIGHUSER) &&
		    !list_empty(&hugepage_freelists[nid]))
			break;
	}

	if (*z) {
		page = list_entry(hugepage_freelists[nid].next,
				  struct page, lru);
		list_del(&page->lru);
		free_huge_pages--;
		free_huge_pages_node[nid]--;
	}
	return page;
}

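/*
 * Grow the pool by one hugepage: allocate a HUGETLB_PAGE_ORDER compound
 * page from the buddy allocator, round-robin over the online nodes.
 * free_huge_page() is installed as the compound page destructor, so the
 * final put_page() hands the new page straight to the free pool.
 * Returns 1 on success, 0 if the allocation failed.
 */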
static int alloc_fresh_huge_page(void)
{
	static int nid = 0;
	struct page *page;
	page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN,
					HUGETLB_PAGE_ORDER);
	nid = (nid + 1) % num_online_nodes();
	if (page) {
		page[1].lru.next = (void *)free_huge_page;	/* dtor */
		spin_lock(&hugetlb_lock);
		nr_huge_pages++;
		nr_huge_pages_node[page_to_nid(page)]++;
		spin_unlock(&hugetlb_lock);
		put_page(page); /* free it into the hugepage allocator */
		return 1;
	}
	return 0;
}

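/*
 * Compound page destructor for hugepages: called when the last reference
 * to a hugepage is dropped, it returns the page to the free pool rather
 * than to the buddy allocator.
 */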
void free_huge_page(struct page *page)
{
	BUG_ON(page_count(page));

	INIT_LIST_HEAD(&page->lru);

	spin_lock(&hugetlb_lock);
	enqueue_huge_page(page);
	spin_unlock(&hugetlb_lock);
}

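/*
 * Allocate a hugepage from the pool for a fault in @vma at @addr: take a
 * page off a suitable free list, give it an initial reference and zero it
 * one PAGE_SIZE chunk at a time.  Returns NULL if the pool is exhausted.
 */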
struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)
{
	struct page *page;
	int i;

	spin_lock(&hugetlb_lock);
	page = dequeue_huge_page(vma, addr);
	if (!page) {
		spin_unlock(&hugetlb_lock);
		return NULL;
	}
	spin_unlock(&hugetlb_lock);
	set_page_refcounted(page);
	for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i)
		clear_user_highpage(&page[i], addr);
	return page;
}

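/*
 * Boot-time initialization: set up the per-node free lists and try to
 * preallocate the number of hugepages requested with "hugepages=" on the
 * kernel command line.  HPAGE_SHIFT == 0 means no hugepage size was set
 * up by the architecture, so there is nothing to do.
 */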
static int __init hugetlb_init(void)
{
	unsigned long i;

	if (HPAGE_SHIFT == 0)
		return 0;

	for (i = 0; i < MAX_NUMNODES; ++i)
		INIT_LIST_HEAD(&hugepage_freelists[i]);

	for (i = 0; i < max_huge_pages; ++i) {
		if (!alloc_fresh_huge_page())
			break;
	}
	max_huge_pages = free_huge_pages = nr_huge_pages = i;
	printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages);
	return 0;
}
module_init(hugetlb_init);

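/* Parse the "hugepages=" boot option: how many hugepages to preallocate. */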
static int __init hugetlb_setup(char *s)
{
	if (sscanf(s, "%lu", &max_huge_pages) <= 0)
		max_huge_pages = 0;
	return 1;
}
__setup("hugepages=", hugetlb_setup);

#ifdef CONFIG_SYSCTL
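/*
 * Shrink the pool by one page: clear the page flags left over from its
 * time in use, remove the compound page destructor and give the page
 * back to the buddy allocator.  The caller must hold hugetlb_lock.
 */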
static void update_and_free_page(struct page *page)
{
	int i;
	nr_huge_pages--;
	nr_huge_pages_node[page_zone(page)->zone_pgdat->node_id]--;
	for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
		page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
				1 << PG_private | 1 << PG_writeback);
	}
	page[1].lru.next = NULL;
	set_page_refcounted(page);
	__free_pages(page, HUGETLB_PAGE_ORDER);
}

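/*
 * When shrinking the pool, free hugepages that sit in lowmem first so
 * that scarce low memory is returned before any highmem hugepages are
 * touched.  Without CONFIG_HIGHMEM every page is lowmem and the normal
 * shrink path handles them, so this is a no-op stub.
 */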
#ifdef CONFIG_HIGHMEM
static void try_to_free_low(unsigned long count)
{
	int i, nid;
	for (i = 0; i < MAX_NUMNODES; ++i) {
		struct page *page, *next;
		list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) {
			if (PageHighMem(page))
				continue;
			list_del(&page->lru);
			update_and_free_page(page);
			nid = page_zone(page)->zone_pgdat->node_id;
			free_huge_pages--;
			free_huge_pages_node[nid]--;
			if (count >= nr_huge_pages)
				return;
		}
	}
}
#else
static inline void try_to_free_low(unsigned long count)
{
}
#endif

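/*
 * Resize the pool to @count hugepages: grow it with fresh allocations
 * from the buddy allocator, or shrink it by freeing surplus pages from
 * the free lists.  Returns the size the pool actually ended up with,
 * which may be smaller than requested if allocations fail.
 */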
static unsigned long set_max_huge_pages(unsigned long count)
{
	while (count > nr_huge_pages) {
		if (!alloc_fresh_huge_page())
			return nr_huge_pages;
	}
	if (count >= nr_huge_pages)
		return nr_huge_pages;

	spin_lock(&hugetlb_lock);
	try_to_free_low(count);
	while (count < nr_huge_pages) {
		struct page *page = dequeue_huge_page(NULL, 0);
		if (!page)
			break;
		update_and_free_page(page);
	}
	spin_unlock(&hugetlb_lock);
	return nr_huge_pages;
}

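/*
 * Handler for the nr_hugepages sysctl: after the new value has been
 * parsed into max_huge_pages, resize the pool to match it.
 */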
int hugetlb_sysctl_handler(struct ctl_table *table, int write,
			   struct file *file, void __user *buffer,
			   size_t *length, loff_t *ppos)
{
	proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
	max_huge_pages = set_max_huge_pages(max_huge_pages);
	return 0;
}
#endif /* CONFIG_SYSCTL */

int hugetlb_report_meminfo(char *buf)
{
	return sprintf(buf,
			"HugePages_Total: %5lu\n"
			"HugePages_Free:  %5lu\n"
			"Hugepagesize:    %5lu kB\n",
			nr_huge_pages,
			free_huge_pages,
			HPAGE_SIZE/1024);
}

int hugetlb_report_node_meminfo(int nid, char *buf)
{
	return sprintf(buf,
		"Node %d HugePages_Total: %5u\n"
		"Node %d HugePages_Free:  %5u\n",
		nid, nr_huge_pages_node[nid],
		nid, free_huge_pages_node[nid]);
}

int is_hugepage_mem_enough(size_t size)
{
	return (size + ~HPAGE_MASK)/HPAGE_SIZE <= free_huge_pages;
}

/* Return the number of pages of memory we physically have, in PAGE_SIZE units. */
unsigned long hugetlb_total_pages(void)
{
	return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE);
}

/*
 * We cannot handle pagefaults against hugetlb pages at all.  They cause
 * handle_mm_fault() to try to instantiate regular-sized pages in the
 * hugepage VMA.  do_page_fault() is supposed to trap this, so BUG if we get
 * this far.
 */
static struct page *hugetlb_nopage(struct vm_area_struct *vma,
				unsigned long address, int *unused)
{
	BUG();
	return NULL;
}

struct vm_operations_struct hugetlb_vm_ops = {
	.nopage = hugetlb_nopage,
};

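/*
 * Build a huge pte for @page with @vma's protection bits, marked young
 * and huge, and made writable and dirty when @writable is set.
 */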
static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
				int writable)
{
	pte_t entry;

	if (writable) {
		entry =
		    pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
	} else {
		entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
	}
	entry = pte_mkyoung(entry);
	entry = pte_mkhuge(entry);

	return entry;
}

static void set_huge_ptep_writable(struct vm_area_struct *vma,
				   unsigned long address, pte_t *ptep)
{
	pte_t entry;

	entry = pte_mkwrite(pte_mkdirty(*ptep));
	ptep_set_access_flags(vma, address, ptep, entry, 1);
	update_mmu_cache(vma, address, entry);
	lazy_mmu_prot_update(entry);
}


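/*
 * Fork-time copy of a hugetlb mapping from @src to @dst.  For private
 * mappings the parent's ptes are write-protected so that both processes
 * will COW on their next write; each mapped page gains a reference and
 * is charged to the child's file_rss.
 */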
int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
			    struct vm_area_struct *vma)
{
	pte_t *src_pte, *dst_pte, entry;
	struct page *ptepage;
	unsigned long addr;
	int cow;

	cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;

	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
		src_pte = huge_pte_offset(src, addr);
		if (!src_pte)
			continue;
		dst_pte = huge_pte_alloc(dst, addr);
		if (!dst_pte)
			goto nomem;
		spin_lock(&dst->page_table_lock);
		spin_lock(&src->page_table_lock);
		if (!pte_none(*src_pte)) {
			if (cow)
				ptep_set_wrprotect(src, addr, src_pte);
			entry = *src_pte;
			ptepage = pte_page(entry);
			get_page(ptepage);
			add_mm_counter(dst, file_rss, HPAGE_SIZE / PAGE_SIZE);
			set_huge_pte_at(dst, addr, dst_pte, entry);
		}
		spin_unlock(&src->page_table_lock);
		spin_unlock(&dst->page_table_lock);
	}
	return 0;

nomem:
	return -ENOMEM;
}

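/*
 * Tear down the huge ptes in [start, end): clear each pte, drop the page
 * reference and the rss charge, then flush the TLB for the range.  Both
 * start and end must be hugepage aligned.
 */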
void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
			  unsigned long end)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long address;
	pte_t *ptep;
	pte_t pte;
	struct page *page;

	WARN_ON(!is_vm_hugetlb_page(vma));
	BUG_ON(start & ~HPAGE_MASK);
	BUG_ON(end & ~HPAGE_MASK);

	spin_lock(&mm->page_table_lock);

	/* Update high watermark before we lower rss */
	update_hiwater_rss(mm);

	for (address = start; address < end; address += HPAGE_SIZE) {
		ptep = huge_pte_offset(mm, address);
		if (!ptep)
			continue;

		pte = huge_ptep_get_and_clear(mm, address, ptep);
		if (pte_none(pte))
			continue;

		page = pte_page(pte);
		put_page(page);
		add_mm_counter(mm, file_rss, (int) -(HPAGE_SIZE / PAGE_SIZE));
	}

	spin_unlock(&mm->page_table_lock);
	flush_tlb_range(vma, start, end);
}

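/*
 * Handle a write fault against a huge pte in a private mapping.  If this
 * mm holds the only reference to the page the pte is simply made
 * writable; otherwise a new hugepage is allocated, the contents copied,
 * and the new page swapped into the pte.  Called with, and returns
 * under, mm->page_table_lock, which is dropped around the copy.
 */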
static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, pte_t *ptep, pte_t pte)
{
	struct page *old_page, *new_page;
	int i, avoidcopy;

	old_page = pte_page(pte);

	/* If no-one else is actually using this page, avoid the copy
	 * and just make the page writable */
	avoidcopy = (page_count(old_page) == 1);
	if (avoidcopy) {
		set_huge_ptep_writable(vma, address, ptep);
		return VM_FAULT_MINOR;
	}

	page_cache_get(old_page);
	new_page = alloc_huge_page(vma, address);

	if (!new_page) {
		page_cache_release(old_page);
		return VM_FAULT_OOM;
	}

	spin_unlock(&mm->page_table_lock);
	for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++)
		copy_user_highpage(new_page + i, old_page + i,
				   address + i*PAGE_SIZE);
	spin_lock(&mm->page_table_lock);

	ptep = huge_pte_offset(mm, address & HPAGE_MASK);
	if (likely(pte_same(*ptep, pte))) {
		/* Break COW */
		set_huge_pte_at(mm, address, ptep,
				make_huge_pte(vma, new_page, 1));
		/* Make the old page be freed below */
		new_page = old_page;
	}
	page_cache_release(new_page);
	page_cache_release(old_page);
	return VM_FAULT_MINOR;
}

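/*
 * Fault in a hugepage that has no pte yet: look it up in the backing
 * file's page cache, allocating (and, for shared mappings, inserting) a
 * fresh page if none exists.  The page lock guards against a racing
 * truncate until page_table_lock is taken and i_size is rechecked.  For
 * a private write fault the COW is done immediately to avoid a second
 * fault.
 */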
int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, pte_t *ptep, int write_access)
{
	int ret = VM_FAULT_SIGBUS;
	unsigned long idx;
	unsigned long size;
	struct page *page;
	struct address_space *mapping;
	pte_t new_pte;

	mapping = vma->vm_file->f_mapping;
	idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
		+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));

	/*
	 * Use page lock to guard against racing truncation
	 * before we get page_table_lock.
	 */
retry:
	page = find_lock_page(mapping, idx);
	if (!page) {
		if (hugetlb_get_quota(mapping))
			goto out;
		page = alloc_huge_page(vma, address);
		if (!page) {
			hugetlb_put_quota(mapping);
			ret = VM_FAULT_OOM;
			goto out;
		}

		if (vma->vm_flags & VM_SHARED) {
			int err;

			err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
			if (err) {
				put_page(page);
				hugetlb_put_quota(mapping);
				if (err == -EEXIST)
					goto retry;
				goto out;
			}
		} else
			lock_page(page);
	}

	spin_lock(&mm->page_table_lock);
	size = i_size_read(mapping->host) >> HPAGE_SHIFT;
	if (idx >= size)
		goto backout;

	ret = VM_FAULT_MINOR;
	if (!pte_none(*ptep))
		goto backout;

	add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
	new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
				&& (vma->vm_flags & VM_SHARED)));
	set_huge_pte_at(mm, address, ptep, new_pte);

	if (write_access && !(vma->vm_flags & VM_SHARED)) {
		/* Optimization, do the COW without a second fault */
		ret = hugetlb_cow(mm, vma, address, ptep, new_pte);
	}

	spin_unlock(&mm->page_table_lock);
	unlock_page(page);
out:
	return ret;

backout:
	spin_unlock(&mm->page_table_lock);
	hugetlb_put_quota(mapping);
	unlock_page(page);
	put_page(page);
	goto out;
}

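/*
 * Top-level hugetlb fault handler: allocate the huge pte if necessary,
 * send missing pages to hugetlb_no_page(), and write faults on
 * write-protected ptes to hugetlb_cow().
 */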
int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, int write_access)
{
	pte_t *ptep;
	pte_t entry;
	int ret;

	ptep = huge_pte_alloc(mm, address);
	if (!ptep)
		return VM_FAULT_OOM;

	entry = *ptep;
	if (pte_none(entry))
		return hugetlb_no_page(mm, vma, address, ptep, write_access);

	ret = VM_FAULT_MINOR;

	spin_lock(&mm->page_table_lock);
	/* Check for a racing update before calling hugetlb_cow */
	if (likely(pte_same(entry, *ptep)))
		if (write_access && !pte_write(entry))
			ret = hugetlb_cow(mm, vma, address, ptep, entry);
	spin_unlock(&mm->page_table_lock);

	return ret;
}

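/*
 * get_user_pages() back end for hugetlb vmas: walk the request in
 * PAGE_SIZE steps, faulting hugepages in as needed, and record the
 * corresponding subpages in @pages and the vma in @vmas.  On return,
 * *position and *length describe whatever part of the range was not
 * covered; the return value is the updated count @i, or -EFAULT if a
 * fault fails before any page has been returned.
 */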
int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
			struct page **pages, struct vm_area_struct **vmas,
			unsigned long *position, int *length, int i)
{
	unsigned long vpfn, vaddr = *position;
	int remainder = *length;

	vpfn = vaddr/PAGE_SIZE;
	spin_lock(&mm->page_table_lock);
	while (vaddr < vma->vm_end && remainder) {
		pte_t *pte;
		struct page *page;

		/*
		 * Some archs (sparc64, sh*) have multiple pte_ts to
		 * each hugepage.  We have to make sure we get the
		 * first, for the page indexing below to work.
		 */
		pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);

		if (!pte || pte_none(*pte)) {
			int ret;

			spin_unlock(&mm->page_table_lock);
			ret = hugetlb_fault(mm, vma, vaddr, 0);
			spin_lock(&mm->page_table_lock);
			if (ret == VM_FAULT_MINOR)
				continue;

			remainder = 0;
			if (!i)
				i = -EFAULT;
			break;
		}

		if (pages) {
			page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
			get_page(page);
			pages[i] = page;
		}

		if (vmas)
			vmas[i] = vma;

		vaddr += PAGE_SIZE;
		++vpfn;
		--remainder;
		++i;
	}
	spin_unlock(&mm->page_table_lock);
	*length = remainder;
	*position = vaddr;

	return i;
}