xref: /openbmc/linux/mm/hugetlb_vmemmap.c (revision d35ac6ac)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * HugeTLB Vmemmap Optimization (HVO)
4  *
5  * Copyright (c) 2020, ByteDance. All rights reserved.
6  *
7  *     Author: Muchun Song <songmuchun@bytedance.com>
8  *
9  * See Documentation/mm/vmemmap_dedup.rst
10  */
11 #define pr_fmt(fmt)	"HugeTLB: " fmt
12 
13 #include <linux/pgtable.h>
14 #include <linux/moduleparam.h>
15 #include <linux/bootmem_info.h>
16 #include <asm/pgalloc.h>
17 #include <asm/tlbflush.h>
18 #include "hugetlb_vmemmap.h"
19 
20 /**
21  * struct vmemmap_remap_walk - walk vmemmap page table
22  *
23  * @remap_pte:		called for each lowest-level entry (PTE).
24  * @nr_walked:		the number of walked pte.
25  * @reuse_page:		the page which is reused for the tail vmemmap pages.
26  * @reuse_addr:		the virtual address of the @reuse_page page.
27  * @vmemmap_pages:	the list head of the vmemmap pages that can be freed
28  *			or is mapped from.
29  */
30 struct vmemmap_remap_walk {
31 	void			(*remap_pte)(pte_t *pte, unsigned long addr,
32 					     struct vmemmap_remap_walk *walk);
33 	unsigned long		nr_walked;
34 	struct page		*reuse_page;
35 	unsigned long		reuse_addr;
36 	struct list_head	*vmemmap_pages;
37 };
38 
39 static int __split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
40 {
41 	pmd_t __pmd;
42 	int i;
43 	unsigned long addr = start;
44 	struct page *page = pmd_page(*pmd);
45 	pte_t *pgtable = pte_alloc_one_kernel(&init_mm);
46 
47 	if (!pgtable)
48 		return -ENOMEM;
49 
50 	pmd_populate_kernel(&init_mm, &__pmd, pgtable);
51 
52 	for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
53 		pte_t entry, *pte;
54 		pgprot_t pgprot = PAGE_KERNEL;
55 
56 		entry = mk_pte(page + i, pgprot);
57 		pte = pte_offset_kernel(&__pmd, addr);
58 		set_pte_at(&init_mm, addr, pte, entry);
59 	}
60 
61 	spin_lock(&init_mm.page_table_lock);
62 	if (likely(pmd_leaf(*pmd))) {
63 		/*
64 		 * Higher order allocations from buddy allocator must be able to
65 		 * be treated as indepdenent small pages (as they can be freed
66 		 * individually).
67 		 */
68 		if (!PageReserved(page))
69 			split_page(page, get_order(PMD_SIZE));
70 
71 		/* Make pte visible before pmd. See comment in pmd_install(). */
72 		smp_wmb();
73 		pmd_populate_kernel(&init_mm, pmd, pgtable);
74 		flush_tlb_kernel_range(start, start + PMD_SIZE);
75 	} else {
76 		pte_free_kernel(&init_mm, pgtable);
77 	}
78 	spin_unlock(&init_mm.page_table_lock);
79 
80 	return 0;
81 }
82 
83 static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
84 {
85 	int leaf;
86 
87 	spin_lock(&init_mm.page_table_lock);
88 	leaf = pmd_leaf(*pmd);
89 	spin_unlock(&init_mm.page_table_lock);
90 
91 	if (!leaf)
92 		return 0;
93 
94 	return __split_vmemmap_huge_pmd(pmd, start);
95 }
96 
97 static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr,
98 			      unsigned long end,
99 			      struct vmemmap_remap_walk *walk)
100 {
101 	pte_t *pte = pte_offset_kernel(pmd, addr);
102 
103 	/*
104 	 * The reuse_page is found 'first' in table walk before we start
105 	 * remapping (which is calling @walk->remap_pte).
106 	 */
107 	if (!walk->reuse_page) {
108 		walk->reuse_page = pte_page(ptep_get(pte));
109 		/*
110 		 * Because the reuse address is part of the range that we are
111 		 * walking, skip the reuse address range.
112 		 */
113 		addr += PAGE_SIZE;
114 		pte++;
115 		walk->nr_walked++;
116 	}
117 
118 	for (; addr != end; addr += PAGE_SIZE, pte++) {
119 		walk->remap_pte(pte, addr, walk);
120 		walk->nr_walked++;
121 	}
122 }
123 
124 static int vmemmap_pmd_range(pud_t *pud, unsigned long addr,
125 			     unsigned long end,
126 			     struct vmemmap_remap_walk *walk)
127 {
128 	pmd_t *pmd;
129 	unsigned long next;
130 
131 	pmd = pmd_offset(pud, addr);
132 	do {
133 		int ret;
134 
135 		ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK);
136 		if (ret)
137 			return ret;
138 
139 		next = pmd_addr_end(addr, end);
140 		vmemmap_pte_range(pmd, addr, next, walk);
141 	} while (pmd++, addr = next, addr != end);
142 
143 	return 0;
144 }
145 
146 static int vmemmap_pud_range(p4d_t *p4d, unsigned long addr,
147 			     unsigned long end,
148 			     struct vmemmap_remap_walk *walk)
149 {
150 	pud_t *pud;
151 	unsigned long next;
152 
153 	pud = pud_offset(p4d, addr);
154 	do {
155 		int ret;
156 
157 		next = pud_addr_end(addr, end);
158 		ret = vmemmap_pmd_range(pud, addr, next, walk);
159 		if (ret)
160 			return ret;
161 	} while (pud++, addr = next, addr != end);
162 
163 	return 0;
164 }
165 
166 static int vmemmap_p4d_range(pgd_t *pgd, unsigned long addr,
167 			     unsigned long end,
168 			     struct vmemmap_remap_walk *walk)
169 {
170 	p4d_t *p4d;
171 	unsigned long next;
172 
173 	p4d = p4d_offset(pgd, addr);
174 	do {
175 		int ret;
176 
177 		next = p4d_addr_end(addr, end);
178 		ret = vmemmap_pud_range(p4d, addr, next, walk);
179 		if (ret)
180 			return ret;
181 	} while (p4d++, addr = next, addr != end);
182 
183 	return 0;
184 }
185 
186 static int vmemmap_remap_range(unsigned long start, unsigned long end,
187 			       struct vmemmap_remap_walk *walk)
188 {
189 	unsigned long addr = start;
190 	unsigned long next;
191 	pgd_t *pgd;
192 
193 	VM_BUG_ON(!PAGE_ALIGNED(start));
194 	VM_BUG_ON(!PAGE_ALIGNED(end));
195 
196 	pgd = pgd_offset_k(addr);
197 	do {
198 		int ret;
199 
200 		next = pgd_addr_end(addr, end);
201 		ret = vmemmap_p4d_range(pgd, addr, next, walk);
202 		if (ret)
203 			return ret;
204 	} while (pgd++, addr = next, addr != end);
205 
206 	flush_tlb_kernel_range(start, end);
207 
208 	return 0;
209 }
210 
/*
 * Release a single vmemmap page. Such a page comes either from the memblock
 * allocator or from the buddy allocator. PG_reserved being set means it came
 * from memblock, so it goes back through free_bootmem_page(); any other page
 * is handed to __free_page().
 */
static inline void free_vmemmap_page(struct page *page)
{
	if (!PageReserved(page)) {
		__free_page(page);
		return;
	}

	free_bootmem_page(page);
}
224 
225 /* Free a list of the vmemmap pages */
226 static void free_vmemmap_page_list(struct list_head *list)
227 {
228 	struct page *page, *next;
229 
230 	list_for_each_entry_safe(page, next, list, lru)
231 		free_vmemmap_page(page);
232 }
233 
234 static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
235 			      struct vmemmap_remap_walk *walk)
236 {
237 	/*
238 	 * Remap the tail pages as read-only to catch illegal write operation
239 	 * to the tail pages.
240 	 */
241 	pgprot_t pgprot = PAGE_KERNEL_RO;
242 	struct page *page = pte_page(ptep_get(pte));
243 	pte_t entry;
244 
245 	/* Remapping the head page requires r/w */
246 	if (unlikely(addr == walk->reuse_addr)) {
247 		pgprot = PAGE_KERNEL;
248 		list_del(&walk->reuse_page->lru);
249 
250 		/*
251 		 * Makes sure that preceding stores to the page contents from
252 		 * vmemmap_remap_free() become visible before the set_pte_at()
253 		 * write.
254 		 */
255 		smp_wmb();
256 	}
257 
258 	entry = mk_pte(walk->reuse_page, pgprot);
259 	list_add_tail(&page->lru, walk->vmemmap_pages);
260 	set_pte_at(&init_mm, addr, pte, entry);
261 }
262 
263 /*
264  * How many struct page structs need to be reset. When we reuse the head
265  * struct page, the special metadata (e.g. page->flags or page->mapping)
266  * cannot copy to the tail struct page structs. The invalid value will be
267  * checked in the free_tail_page_prepare(). In order to avoid the message
268  * of "corrupted mapping in tail page". We need to reset at least 3 (one
269  * head struct page struct and two tail struct page structs) struct page
270  * structs.
271  */
272 #define NR_RESET_STRUCT_PAGE		3
273 
274 static inline void reset_struct_pages(struct page *start)
275 {
276 	struct page *from = start + NR_RESET_STRUCT_PAGE;
277 
278 	BUILD_BUG_ON(NR_RESET_STRUCT_PAGE * 2 > PAGE_SIZE / sizeof(struct page));
279 	memcpy(start, from, sizeof(*from) * NR_RESET_STRUCT_PAGE);
280 }
281 
282 static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
283 				struct vmemmap_remap_walk *walk)
284 {
285 	pgprot_t pgprot = PAGE_KERNEL;
286 	struct page *page;
287 	void *to;
288 
289 	BUG_ON(pte_page(ptep_get(pte)) != walk->reuse_page);
290 
291 	page = list_first_entry(walk->vmemmap_pages, struct page, lru);
292 	list_del(&page->lru);
293 	to = page_to_virt(page);
294 	copy_page(to, (void *)walk->reuse_addr);
295 	reset_struct_pages(to);
296 
297 	/*
298 	 * Makes sure that preceding stores to the page contents become visible
299 	 * before the set_pte_at() write.
300 	 */
301 	smp_wmb();
302 	set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
303 }
304 
305 /**
306  * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
307  *			to the page which @reuse is mapped to, then free vmemmap
308  *			which the range are mapped to.
309  * @start:	start address of the vmemmap virtual address range that we want
310  *		to remap.
311  * @end:	end address of the vmemmap virtual address range that we want to
312  *		remap.
313  * @reuse:	reuse address.
314  *
315  * Return: %0 on success, negative error code otherwise.
316  */
317 static int vmemmap_remap_free(unsigned long start, unsigned long end,
318 			      unsigned long reuse)
319 {
320 	int ret;
321 	LIST_HEAD(vmemmap_pages);
322 	struct vmemmap_remap_walk walk = {
323 		.remap_pte	= vmemmap_remap_pte,
324 		.reuse_addr	= reuse,
325 		.vmemmap_pages	= &vmemmap_pages,
326 	};
327 	int nid = page_to_nid((struct page *)start);
328 	gfp_t gfp_mask = GFP_KERNEL | __GFP_THISNODE | __GFP_NORETRY |
329 			__GFP_NOWARN;
330 
331 	/*
332 	 * Allocate a new head vmemmap page to avoid breaking a contiguous
333 	 * block of struct page memory when freeing it back to page allocator
334 	 * in free_vmemmap_page_list(). This will allow the likely contiguous
335 	 * struct page backing memory to be kept contiguous and allowing for
336 	 * more allocations of hugepages. Fallback to the currently
337 	 * mapped head page in case should it fail to allocate.
338 	 */
339 	walk.reuse_page = alloc_pages_node(nid, gfp_mask, 0);
340 	if (walk.reuse_page) {
341 		copy_page(page_to_virt(walk.reuse_page),
342 			  (void *)walk.reuse_addr);
343 		list_add(&walk.reuse_page->lru, &vmemmap_pages);
344 	}
345 
346 	/*
347 	 * In order to make remapping routine most efficient for the huge pages,
348 	 * the routine of vmemmap page table walking has the following rules
349 	 * (see more details from the vmemmap_pte_range()):
350 	 *
351 	 * - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE)
352 	 *   should be continuous.
353 	 * - The @reuse address is part of the range [@reuse, @end) that we are
354 	 *   walking which is passed to vmemmap_remap_range().
355 	 * - The @reuse address is the first in the complete range.
356 	 *
357 	 * So we need to make sure that @start and @reuse meet the above rules.
358 	 */
359 	BUG_ON(start - reuse != PAGE_SIZE);
360 
361 	mmap_read_lock(&init_mm);
362 	ret = vmemmap_remap_range(reuse, end, &walk);
363 	if (ret && walk.nr_walked) {
364 		end = reuse + walk.nr_walked * PAGE_SIZE;
365 		/*
366 		 * vmemmap_pages contains pages from the previous
367 		 * vmemmap_remap_range call which failed.  These
368 		 * are pages which were removed from the vmemmap.
369 		 * They will be restored in the following call.
370 		 */
371 		walk = (struct vmemmap_remap_walk) {
372 			.remap_pte	= vmemmap_restore_pte,
373 			.reuse_addr	= reuse,
374 			.vmemmap_pages	= &vmemmap_pages,
375 		};
376 
377 		vmemmap_remap_range(reuse, end, &walk);
378 	}
379 	mmap_read_unlock(&init_mm);
380 
381 	free_vmemmap_page_list(&vmemmap_pages);
382 
383 	return ret;
384 }
385 
386 static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
387 				   struct list_head *list)
388 {
389 	gfp_t gfp_mask = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_THISNODE;
390 	unsigned long nr_pages = (end - start) >> PAGE_SHIFT;
391 	int nid = page_to_nid((struct page *)start);
392 	struct page *page, *next;
393 
394 	while (nr_pages--) {
395 		page = alloc_pages_node(nid, gfp_mask, 0);
396 		if (!page)
397 			goto out;
398 		list_add_tail(&page->lru, list);
399 	}
400 
401 	return 0;
402 out:
403 	list_for_each_entry_safe(page, next, list, lru)
404 		__free_page(page);
405 	return -ENOMEM;
406 }
407 
408 /**
409  * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, end)
410  *			 to the page which is from the @vmemmap_pages
411  *			 respectively.
412  * @start:	start address of the vmemmap virtual address range that we want
413  *		to remap.
414  * @end:	end address of the vmemmap virtual address range that we want to
415  *		remap.
416  * @reuse:	reuse address.
417  *
418  * Return: %0 on success, negative error code otherwise.
419  */
420 static int vmemmap_remap_alloc(unsigned long start, unsigned long end,
421 			       unsigned long reuse)
422 {
423 	LIST_HEAD(vmemmap_pages);
424 	struct vmemmap_remap_walk walk = {
425 		.remap_pte	= vmemmap_restore_pte,
426 		.reuse_addr	= reuse,
427 		.vmemmap_pages	= &vmemmap_pages,
428 	};
429 
430 	/* See the comment in the vmemmap_remap_free(). */
431 	BUG_ON(start - reuse != PAGE_SIZE);
432 
433 	if (alloc_vmemmap_page_list(start, end, &vmemmap_pages))
434 		return -ENOMEM;
435 
436 	mmap_read_lock(&init_mm);
437 	vmemmap_remap_range(reuse, end, &walk);
438 	mmap_read_unlock(&init_mm);
439 
440 	return 0;
441 }
442 
443 DEFINE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key);
444 EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key);
445 
446 static bool vmemmap_optimize_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON);
447 core_param(hugetlb_free_vmemmap, vmemmap_optimize_enabled, bool, 0);
448 
449 /**
450  * hugetlb_vmemmap_restore - restore previously optimized (by
451  *			     hugetlb_vmemmap_optimize()) vmemmap pages which
452  *			     will be reallocated and remapped.
453  * @h:		struct hstate.
454  * @head:	the head page whose vmemmap pages will be restored.
455  *
456  * Return: %0 if @head's vmemmap pages have been reallocated and remapped,
457  * negative error code otherwise.
458  */
459 int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head)
460 {
461 	int ret;
462 	unsigned long vmemmap_start = (unsigned long)head, vmemmap_end;
463 	unsigned long vmemmap_reuse;
464 
465 	if (!HPageVmemmapOptimized(head))
466 		return 0;
467 
468 	vmemmap_end	= vmemmap_start + hugetlb_vmemmap_size(h);
469 	vmemmap_reuse	= vmemmap_start;
470 	vmemmap_start	+= HUGETLB_VMEMMAP_RESERVE_SIZE;
471 
472 	/*
473 	 * The pages which the vmemmap virtual address range [@vmemmap_start,
474 	 * @vmemmap_end) are mapped to are freed to the buddy allocator, and
475 	 * the range is mapped to the page which @vmemmap_reuse is mapped to.
476 	 * When a HugeTLB page is freed to the buddy allocator, previously
477 	 * discarded vmemmap pages must be allocated and remapping.
478 	 */
479 	ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, vmemmap_reuse);
480 	if (!ret) {
481 		ClearHPageVmemmapOptimized(head);
482 		static_branch_dec(&hugetlb_optimize_vmemmap_key);
483 	}
484 
485 	return ret;
486 }
487 
488 /* Return true iff a HugeTLB whose vmemmap should and can be optimized. */
489 static bool vmemmap_should_optimize(const struct hstate *h, const struct page *head)
490 {
491 	if (!READ_ONCE(vmemmap_optimize_enabled))
492 		return false;
493 
494 	if (!hugetlb_vmemmap_optimizable(h))
495 		return false;
496 
497 	if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG)) {
498 		pmd_t *pmdp, pmd;
499 		struct page *vmemmap_page;
500 		unsigned long vaddr = (unsigned long)head;
501 
502 		/*
503 		 * Only the vmemmap page's vmemmap page can be self-hosted.
504 		 * Walking the page tables to find the backing page of the
505 		 * vmemmap page.
506 		 */
507 		pmdp = pmd_off_k(vaddr);
508 		/*
509 		 * The READ_ONCE() is used to stabilize *pmdp in a register or
510 		 * on the stack so that it will stop changing under the code.
511 		 * The only concurrent operation where it can be changed is
512 		 * split_vmemmap_huge_pmd() (*pmdp will be stable after this
513 		 * operation).
514 		 */
515 		pmd = READ_ONCE(*pmdp);
516 		if (pmd_leaf(pmd))
517 			vmemmap_page = pmd_page(pmd) + pte_index(vaddr);
518 		else
519 			vmemmap_page = pte_page(*pte_offset_kernel(pmdp, vaddr));
520 		/*
521 		 * Due to HugeTLB alignment requirements and the vmemmap pages
522 		 * being at the start of the hotplugged memory region in
523 		 * memory_hotplug.memmap_on_memory case. Checking any vmemmap
524 		 * page's vmemmap page if it is marked as VmemmapSelfHosted is
525 		 * sufficient.
526 		 *
527 		 * [                  hotplugged memory                  ]
528 		 * [        section        ][...][        section        ]
529 		 * [ vmemmap ][              usable memory               ]
530 		 *   ^   |     |                                        |
531 		 *   +---+     |                                        |
532 		 *     ^       |                                        |
533 		 *     +-------+                                        |
534 		 *          ^                                           |
535 		 *          +-------------------------------------------+
536 		 */
537 		if (PageVmemmapSelfHosted(vmemmap_page))
538 			return false;
539 	}
540 
541 	return true;
542 }
543 
544 /**
545  * hugetlb_vmemmap_optimize - optimize @head page's vmemmap pages.
546  * @h:		struct hstate.
547  * @head:	the head page whose vmemmap pages will be optimized.
548  *
549  * This function only tries to optimize @head's vmemmap pages and does not
550  * guarantee that the optimization will succeed after it returns. The caller
551  * can use HPageVmemmapOptimized(@head) to detect if @head's vmemmap pages
552  * have been optimized.
553  */
554 void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head)
555 {
556 	unsigned long vmemmap_start = (unsigned long)head, vmemmap_end;
557 	unsigned long vmemmap_reuse;
558 
559 	if (!vmemmap_should_optimize(h, head))
560 		return;
561 
562 	static_branch_inc(&hugetlb_optimize_vmemmap_key);
563 
564 	vmemmap_end	= vmemmap_start + hugetlb_vmemmap_size(h);
565 	vmemmap_reuse	= vmemmap_start;
566 	vmemmap_start	+= HUGETLB_VMEMMAP_RESERVE_SIZE;
567 
568 	/*
569 	 * Remap the vmemmap virtual address range [@vmemmap_start, @vmemmap_end)
570 	 * to the page which @vmemmap_reuse is mapped to, then free the pages
571 	 * which the range [@vmemmap_start, @vmemmap_end] is mapped to.
572 	 */
573 	if (vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse))
574 		static_branch_dec(&hugetlb_optimize_vmemmap_key);
575 	else
576 		SetHPageVmemmapOptimized(head);
577 }
578 
579 static struct ctl_table hugetlb_vmemmap_sysctls[] = {
580 	{
581 		.procname	= "hugetlb_optimize_vmemmap",
582 		.data		= &vmemmap_optimize_enabled,
583 		.maxlen		= sizeof(vmemmap_optimize_enabled),
584 		.mode		= 0644,
585 		.proc_handler	= proc_dobool,
586 	},
587 	{ }
588 };
589 
590 static int __init hugetlb_vmemmap_init(void)
591 {
592 	const struct hstate *h;
593 
594 	/* HUGETLB_VMEMMAP_RESERVE_SIZE should cover all used struct pages */
595 	BUILD_BUG_ON(__NR_USED_SUBPAGE * sizeof(struct page) > HUGETLB_VMEMMAP_RESERVE_SIZE);
596 
597 	for_each_hstate(h) {
598 		if (hugetlb_vmemmap_optimizable(h)) {
599 			register_sysctl_init("vm", hugetlb_vmemmap_sysctls);
600 			break;
601 		}
602 	}
603 	return 0;
604 }
605 late_initcall(hugetlb_vmemmap_init);
606