xref: /openbmc/linux/mm/hugetlb.c (revision a482289d46587ffcda4c85aab109fb74910d7a48)
/*
 * Generic hugetlb support.
 * (C) William Irwin, April 2004
 */
#include <linux/gfp.h>
#include <linux/list.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/sysctl.h>
#include <linux/highmem.h>
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
#include <linux/cpuset.h>

#include <asm/page.h>
#include <asm/pgtable.h>

#include <linux/hugetlb.h>

const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
static unsigned long nr_huge_pages, free_huge_pages;
unsigned long max_huge_pages;
static struct list_head hugepage_freelists[MAX_NUMNODES];
static unsigned int nr_huge_pages_node[MAX_NUMNODES];
static unsigned int free_huge_pages_node[MAX_NUMNODES];

/*
 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
 */
static DEFINE_SPINLOCK(hugetlb_lock);

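/*
 * Put a huge page back on its node's free list and update the free-page
 * counters.  Caller must hold hugetlb_lock.
 */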
static void enqueue_huge_page(struct page *page)
{
	int nid = page_to_nid(page);
	list_add(&page->lru, &hugepage_freelists[nid]);
	free_huge_pages++;
	free_huge_pages_node[nid]++;
}

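/*
 * Take a free huge page from the first node in the vma's policy zonelist
 * that is allowed by the cpuset and has pages available.  Caller must hold
 * hugetlb_lock.  Returns NULL if no page is free.
 */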
static struct page *dequeue_huge_page(struct vm_area_struct *vma,
				unsigned long address)
{
	int nid = numa_node_id();
	struct page *page = NULL;
	struct zonelist *zonelist = huge_zonelist(vma, address);
	struct zone **z;

	for (z = zonelist->zones; *z; z++) {
		nid = (*z)->zone_pgdat->node_id;
		if (cpuset_zone_allowed(*z, GFP_HIGHUSER) &&
		    !list_empty(&hugepage_freelists[nid]))
			break;
	}

	if (*z) {
		page = list_entry(hugepage_freelists[nid].next,
				  struct page, lru);
		list_del(&page->lru);
		free_huge_pages--;
		free_huge_pages_node[nid]--;
	}
	return page;
}

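/*
 * Allocate a fresh huge page from the buddy allocator, round-robining the
 * starting node.  The compound-page destructor is pointed at
 * free_huge_page() and the new page is fed into the pool via put_page().
 * Returns 1 on success, 0 if the allocation failed.
 */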
static int alloc_fresh_huge_page(void)
{
	static int nid = 0;
	struct page *page;
	page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN,
					HUGETLB_PAGE_ORDER);
	nid = (nid + 1) % num_online_nodes();
	if (page) {
		page[1].lru.next = (void *)free_huge_page;	/* dtor */
		spin_lock(&hugetlb_lock);
		nr_huge_pages++;
		nr_huge_pages_node[page_to_nid(page)]++;
		spin_unlock(&hugetlb_lock);
		put_page(page); /* free it into the hugepage allocator */
		return 1;
	}
	return 0;
}

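/*
 * Compound-page destructor, installed by alloc_fresh_huge_page(): called
 * when the last reference to a huge page is dropped, it returns the page
 * to the free lists.
 */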
void free_huge_page(struct page *page)
{
	BUG_ON(page_count(page));

	INIT_LIST_HEAD(&page->lru);

	spin_lock(&hugetlb_lock);
	enqueue_huge_page(page);
	spin_unlock(&hugetlb_lock);
}

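/*
 * Hand out a free huge page for the given vma/address, with an initial
 * reference taken and its contents cleared.  Returns NULL if the pool is
 * empty.
 */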
struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)
{
	struct page *page;
	int i;

	spin_lock(&hugetlb_lock);
	page = dequeue_huge_page(vma, addr);
	if (!page) {
		spin_unlock(&hugetlb_lock);
		return NULL;
	}
	spin_unlock(&hugetlb_lock);
	set_page_count(page, 1);
	for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i)
		clear_user_highpage(&page[i], addr);
	return page;
}

static int __init hugetlb_init(void)
{
	unsigned long i;

	if (HPAGE_SHIFT == 0)
		return 0;

	for (i = 0; i < MAX_NUMNODES; ++i)
		INIT_LIST_HEAD(&hugepage_freelists[i]);

	for (i = 0; i < max_huge_pages; ++i) {
		if (!alloc_fresh_huge_page())
			break;
	}
	max_huge_pages = free_huge_pages = nr_huge_pages = i;
	printk(KERN_INFO "Total HugeTLB memory allocated, %lu\n", free_huge_pages);
	return 0;
}
module_init(hugetlb_init);

static int __init hugetlb_setup(char *s)
{
	if (sscanf(s, "%lu", &max_huge_pages) <= 0)
		max_huge_pages = 0;
	return 1;
}
__setup("hugepages=", hugetlb_setup);

#ifdef CONFIG_SYSCTL
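/*
 * Give a huge page back to the buddy allocator: drop the pool counters,
 * clear the per-page flags, remove the compound destructor and reset the
 * refcount before calling __free_pages().  Caller must hold hugetlb_lock.
 */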
static void update_and_free_page(struct page *page)
{
	int i;
	nr_huge_pages--;
	nr_huge_pages_node[page_zone(page)->zone_pgdat->node_id]--;
	for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
		page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
				1 << PG_private | 1 << PG_writeback);
	}
	page[1].lru.next = NULL;
	set_page_count(page, 1);
	__free_pages(page, HUGETLB_PAGE_ORDER);
}

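/*
 * When shrinking the pool, free huge pages that reside in lowmem first,
 * stopping once the pool has shrunk to 'count'; highmem pages are only
 * freed afterwards by the caller's generic loop.  Called with hugetlb_lock
 * held.
 */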
#ifdef CONFIG_HIGHMEM
static void try_to_free_low(unsigned long count)
{
	int i, nid;
	for (i = 0; i < MAX_NUMNODES; ++i) {
		struct page *page, *next;
		list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) {
			if (PageHighMem(page))
				continue;
			list_del(&page->lru);
			update_and_free_page(page);
			nid = page_zone(page)->zone_pgdat->node_id;
			free_huge_pages--;
			free_huge_pages_node[nid]--;
			if (count >= nr_huge_pages)
				return;
		}
	}
}
#else
static inline void try_to_free_low(unsigned long count)
{
}
#endif

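/*
 * Resize the huge page pool to 'count' pages: grow it with fresh
 * allocations, or shrink it by returning unused pages to the buddy
 * allocator.  Returns the resulting pool size.
 */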
static unsigned long set_max_huge_pages(unsigned long count)
{
	while (count > nr_huge_pages) {
		if (!alloc_fresh_huge_page())
			return nr_huge_pages;
	}
	if (count >= nr_huge_pages)
		return nr_huge_pages;

	spin_lock(&hugetlb_lock);
	try_to_free_low(count);
	while (count < nr_huge_pages) {
		struct page *page = dequeue_huge_page(NULL, 0);
		if (!page)
			break;
		update_and_free_page(page);
	}
	spin_unlock(&hugetlb_lock);
	return nr_huge_pages;
}

int hugetlb_sysctl_handler(struct ctl_table *table, int write,
			   struct file *file, void __user *buffer,
			   size_t *length, loff_t *ppos)
{
	proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
	max_huge_pages = set_max_huge_pages(max_huge_pages);
	return 0;
}
#endif /* CONFIG_SYSCTL */

int hugetlb_report_meminfo(char *buf)
{
	return sprintf(buf,
			"HugePages_Total: %5lu\n"
			"HugePages_Free:  %5lu\n"
			"Hugepagesize:    %5lu kB\n",
			nr_huge_pages,
			free_huge_pages,
			HPAGE_SIZE/1024);
}

int hugetlb_report_node_meminfo(int nid, char *buf)
{
	return sprintf(buf,
		"Node %d HugePages_Total: %5u\n"
		"Node %d HugePages_Free:  %5u\n",
		nid, nr_huge_pages_node[nid],
		nid, free_huge_pages_node[nid]);
}

int is_hugepage_mem_enough(size_t size)
{
	return (size + ~HPAGE_MASK)/HPAGE_SIZE <= free_huge_pages;
}

/* Return the number of pages of memory we physically have, in PAGE_SIZE units. */
unsigned long hugetlb_total_pages(void)
{
	return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE);
}

/*
 * We cannot handle pagefaults against hugetlb pages at all.  They cause
 * handle_mm_fault() to try to instantiate regular-sized pages in the
 * hugepage VMA.  do_page_fault() is supposed to trap this, so BUG if we get
 * this far.
 */
static struct page *hugetlb_nopage(struct vm_area_struct *vma,
				unsigned long address, int *unused)
{
	BUG();
	return NULL;
}

struct vm_operations_struct hugetlb_vm_ops = {
	.nopage = hugetlb_nopage,
};

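/*
 * Build the huge PTE for a page in this vma: marked young and huge,
 * writable and dirty when 'writable' is set, write-protected otherwise.
 */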
static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
				int writable)
{
	pte_t entry;

	if (writable) {
		entry =
		    pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
	} else {
		entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
	}
	entry = pte_mkyoung(entry);
	entry = pte_mkhuge(entry);

	return entry;
}

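/*
 * Upgrade an existing huge PTE in place to writable and dirty, and let the
 * architecture update its MMU state for the new permissions.
 */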
static void set_huge_ptep_writable(struct vm_area_struct *vma,
				   unsigned long address, pte_t *ptep)
{
	pte_t entry;

	entry = pte_mkwrite(pte_mkdirty(*ptep));
	ptep_set_access_flags(vma, address, ptep, entry, 1);
	update_mmu_cache(vma, address, entry);
	lazy_mmu_prot_update(entry);
}

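/*
 * Copy the huge page mappings of a vma from the parent mm to the child at
 * fork time.  For private mappings that may be written, the source PTEs
 * are write-protected so that later writes on either side fault and COW.
 */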
int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
			    struct vm_area_struct *vma)
{
	pte_t *src_pte, *dst_pte, entry;
	struct page *ptepage;
	unsigned long addr;
	int cow;

	cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;

	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
		src_pte = huge_pte_offset(src, addr);
		if (!src_pte)
			continue;
		dst_pte = huge_pte_alloc(dst, addr);
		if (!dst_pte)
			goto nomem;
		spin_lock(&dst->page_table_lock);
		spin_lock(&src->page_table_lock);
		if (!pte_none(*src_pte)) {
			if (cow)
				ptep_set_wrprotect(src, addr, src_pte);
			entry = *src_pte;
			ptepage = pte_page(entry);
			get_page(ptepage);
			add_mm_counter(dst, file_rss, HPAGE_SIZE / PAGE_SIZE);
			set_huge_pte_at(dst, addr, dst_pte, entry);
		}
		spin_unlock(&src->page_table_lock);
		spin_unlock(&dst->page_table_lock);
	}
	return 0;

nomem:
	return -ENOMEM;
}

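/*
 * Tear down the huge PTEs in [start, end), dropping the page references
 * and the file RSS they accounted for, then flush the TLB.  The range must
 * be aligned to huge page boundaries.
 */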
void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
			  unsigned long end)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long address;
	pte_t *ptep;
	pte_t pte;
	struct page *page;

	WARN_ON(!is_vm_hugetlb_page(vma));
	BUG_ON(start & ~HPAGE_MASK);
	BUG_ON(end & ~HPAGE_MASK);

	spin_lock(&mm->page_table_lock);

	/* Update high watermark before we lower rss */
	update_hiwater_rss(mm);

	for (address = start; address < end; address += HPAGE_SIZE) {
		ptep = huge_pte_offset(mm, address);
		if (!ptep)
			continue;

		pte = huge_ptep_get_and_clear(mm, address, ptep);
		if (pte_none(pte))
			continue;

		page = pte_page(pte);
		put_page(page);
		add_mm_counter(mm, file_rss, (int) -(HPAGE_SIZE / PAGE_SIZE));
	}

	spin_unlock(&mm->page_table_lock);
	flush_tlb_range(vma, start, end);
}

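/*
 * Break copy-on-write for a huge page: if we hold the only reference the
 * PTE is simply made writable, otherwise the data is copied into a freshly
 * allocated huge page and the mapping is switched over to it.  Called with
 * page_table_lock held; the lock is dropped while copying the data.
 */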
static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, pte_t *ptep, pte_t pte)
{
	struct page *old_page, *new_page;
	int i, avoidcopy;

	old_page = pte_page(pte);

	/* If no-one else is actually using this page, avoid the copy
	 * and just make the page writable */
	avoidcopy = (page_count(old_page) == 1);
	if (avoidcopy) {
		set_huge_ptep_writable(vma, address, ptep);
		return VM_FAULT_MINOR;
	}

	page_cache_get(old_page);
	new_page = alloc_huge_page(vma, address);

	if (!new_page) {
		page_cache_release(old_page);
		return VM_FAULT_OOM;
	}

	spin_unlock(&mm->page_table_lock);
	for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++)
		copy_user_highpage(new_page + i, old_page + i,
				   address + i*PAGE_SIZE);
	spin_lock(&mm->page_table_lock);

	ptep = huge_pte_offset(mm, address & HPAGE_MASK);
	if (likely(pte_same(*ptep, pte))) {
		/* Break COW */
		set_huge_pte_at(mm, address, ptep,
				make_huge_pte(vma, new_page, 1));
		/* Make the old page be freed below */
		new_page = old_page;
	}
	page_cache_release(new_page);
	page_cache_release(old_page);
	return VM_FAULT_MINOR;
}

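/*
 * Fault in a huge page that has no PTE yet: look it up in the page cache,
 * or allocate a fresh one and (for shared mappings) insert it, then install
 * the PTE.  A private write fault does the COW immediately to avoid a
 * second fault.
 */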
int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, pte_t *ptep, int write_access)
{
	int ret = VM_FAULT_SIGBUS;
	unsigned long idx;
	unsigned long size;
	struct page *page;
	struct address_space *mapping;
	pte_t new_pte;

	mapping = vma->vm_file->f_mapping;
	idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
		+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));

	/*
	 * Use page lock to guard against racing truncation
	 * before we get page_table_lock.
	 */
retry:
	page = find_lock_page(mapping, idx);
	if (!page) {
		if (hugetlb_get_quota(mapping))
			goto out;
		page = alloc_huge_page(vma, address);
		if (!page) {
			hugetlb_put_quota(mapping);
			ret = VM_FAULT_OOM;
			goto out;
		}

		if (vma->vm_flags & VM_SHARED) {
			int err;

			err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
			if (err) {
				put_page(page);
				hugetlb_put_quota(mapping);
				if (err == -EEXIST)
					goto retry;
				goto out;
			}
		} else
			lock_page(page);
	}

	spin_lock(&mm->page_table_lock);
	size = i_size_read(mapping->host) >> HPAGE_SHIFT;
	if (idx >= size)
		goto backout;

	ret = VM_FAULT_MINOR;
	if (!pte_none(*ptep))
		goto backout;

	add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
	new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
				&& (vma->vm_flags & VM_SHARED)));
	set_huge_pte_at(mm, address, ptep, new_pte);

	if (write_access && !(vma->vm_flags & VM_SHARED)) {
		/* Optimization, do the COW without a second fault */
		ret = hugetlb_cow(mm, vma, address, ptep, new_pte);
	}

	spin_unlock(&mm->page_table_lock);
	unlock_page(page);
out:
	return ret;

backout:
	spin_unlock(&mm->page_table_lock);
	hugetlb_put_quota(mapping);
	unlock_page(page);
	put_page(page);
	goto out;
}

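/*
 * Top-level fault handler for hugetlb vmas: allocates the PTE slot, lets
 * hugetlb_no_page() handle missing pages and breaks COW for write faults
 * on read-only PTEs.
 */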
int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, int write_access)
{
	pte_t *ptep;
	pte_t entry;
	int ret;

	ptep = huge_pte_alloc(mm, address);
	if (!ptep)
		return VM_FAULT_OOM;

	entry = *ptep;
	if (pte_none(entry))
		return hugetlb_no_page(mm, vma, address, ptep, write_access);

	ret = VM_FAULT_MINOR;

	spin_lock(&mm->page_table_lock);
	/* Check for a racing update before calling hugetlb_cow */
	if (likely(pte_same(entry, *ptep)))
		if (write_access && !pte_write(entry))
			ret = hugetlb_cow(mm, vma, address, ptep, entry);
	spin_unlock(&mm->page_table_lock);

	return ret;
}

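/*
 * get_user_pages() back end for hugetlb vmas: walk the requested range,
 * faulting in huge pages as needed, and fill in pages[] and vmas[] for
 * each small page within them.
 */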
int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
			struct page **pages, struct vm_area_struct **vmas,
			unsigned long *position, int *length, int i)
{
	unsigned long vpfn, vaddr = *position;
	int remainder = *length;

	vpfn = vaddr/PAGE_SIZE;
	spin_lock(&mm->page_table_lock);
	while (vaddr < vma->vm_end && remainder) {
		pte_t *pte;
		struct page *page;

		/*
		 * Some archs (sparc64, sh*) have multiple pte_ts to
		 * each hugepage.  We have to make sure we get the
		 * first, for the page indexing below to work.
		 */
		pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);

		if (!pte || pte_none(*pte)) {
			int ret;

			spin_unlock(&mm->page_table_lock);
			ret = hugetlb_fault(mm, vma, vaddr, 0);
			spin_lock(&mm->page_table_lock);
			if (ret == VM_FAULT_MINOR)
				continue;

			remainder = 0;
			if (!i)
				i = -EFAULT;
			break;
		}

		if (pages) {
			page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
			get_page(page);
			pages[i] = page;
		}

		if (vmas)
			vmas[i] = vma;

		vaddr += PAGE_SIZE;
		++vpfn;
		--remainder;
		++i;
	}
	spin_unlock(&mm->page_table_lock);
	*length = remainder;
	*position = vaddr;

	return i;
}