xref: /openbmc/linux/mm/hugetlb_vmemmap.c (revision 747f7a29)
// SPDX-License-Identifier: GPL-2.0
/*
 * Optimize vmemmap pages associated with HugeTLB
 *
 * Copyright (c) 2020, Bytedance. All rights reserved.
 *
 *     Author: Muchun Song <songmuchun@bytedance.com>
 *
 * See Documentation/mm/vmemmap_dedup.rst
 */
#define pr_fmt(fmt)	"HugeTLB: " fmt

#include <linux/memory.h>
#include "hugetlb_vmemmap.h"

/*
 * There are a lot of struct page structures associated with each HugeTLB page.
 * For tail pages, the value of compound_head is the same, so the vmemmap pages
 * that hold only tail page structures are redundant: their virtual addresses
 * can all be remapped to the first vmemmap page (the one holding the head page
 * structure), and the page frames that used to back them can then be freed.
 * Therefore, only one page needs to be reserved as the vmemmap area.
 */
#define RESERVE_VMEMMAP_NR		1U
#define RESERVE_VMEMMAP_SIZE		(RESERVE_VMEMMAP_NR << PAGE_SHIFT)
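
/*
 * A worked example (assuming 4 KiB base pages and a 64-byte struct page, as
 * on x86_64): a 2 MiB HugeTLB page has 512 struct pages, i.e. 32 KiB of
 * vmemmap spread over 8 pages, of which 1 is reserved and up to 7 can be
 * freed. A 1 GiB HugeTLB page has 4096 vmemmap pages, of which up to 4095
 * can be freed. See Documentation/mm/vmemmap_dedup.rst.
 */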

enum vmemmap_optimize_mode {
	VMEMMAP_OPTIMIZE_OFF,
	VMEMMAP_OPTIMIZE_ON,
};

DEFINE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON,
			hugetlb_optimize_vmemmap_key);
EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key);

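/*
 * IS_ENABLED() evaluates to 1 or 0, matching the enum above
 * (VMEMMAP_OPTIMIZE_OFF == 0, VMEMMAP_OPTIMIZE_ON == 1), so the initializer
 * below selects the compile-time default mode.
 */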
static enum vmemmap_optimize_mode vmemmap_optimize_mode =
	IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON);

static void vmemmap_optimize_mode_switch(enum vmemmap_optimize_mode to)
{
	if (vmemmap_optimize_mode == to)
		return;

	if (to == VMEMMAP_OPTIMIZE_OFF)
		static_branch_dec(&hugetlb_optimize_vmemmap_key);
	else
		static_branch_inc(&hugetlb_optimize_vmemmap_key);
	WRITE_ONCE(vmemmap_optimize_mode, to);
}

static int __init hugetlb_vmemmap_early_param(char *buf)
{
	bool enable;
	enum vmemmap_optimize_mode mode;

	if (kstrtobool(buf, &enable))
		return -EINVAL;

	mode = enable ? VMEMMAP_OPTIMIZE_ON : VMEMMAP_OPTIMIZE_OFF;
	vmemmap_optimize_mode_switch(mode);

	return 0;
}
early_param("hugetlb_free_vmemmap", hugetlb_vmemmap_early_param);
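
/*
 * Example: the compile-time default can be overridden on the kernel command
 * line with "hugetlb_free_vmemmap=on" or "hugetlb_free_vmemmap=off"
 * (kstrtobool() also accepts 1/0 and y/n).
 */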

/*
 * Previously discarded vmemmap pages will be allocated and remapped
 * after this function returns zero.
 */
int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head)
{
	int ret;
	unsigned long vmemmap_addr = (unsigned long)head;
	unsigned long vmemmap_end, vmemmap_reuse, vmemmap_pages;

	if (!HPageVmemmapOptimized(head))
		return 0;

	vmemmap_addr	+= RESERVE_VMEMMAP_SIZE;
	vmemmap_pages	= hugetlb_optimize_vmemmap_pages(h);
	vmemmap_end	= vmemmap_addr + (vmemmap_pages << PAGE_SHIFT);
	vmemmap_reuse	= vmemmap_addr - PAGE_SIZE;

	/*
	 * The pages which the vmemmap virtual address range [@vmemmap_addr,
	 * @vmemmap_end) was mapped to were freed to the buddy allocator, and
	 * the range currently maps to the page which @vmemmap_reuse maps to.
	 * Before a HugeTLB page can be freed to the buddy allocator, the
	 * previously discarded vmemmap pages must be allocated and remapped.
	 */
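	/*
	 * About the gfp mask below: __GFP_NORETRY keeps this a best-effort
	 * allocation (on failure the caller simply keeps the HugeTLB page
	 * instead of dissolving it), and __GFP_THISNODE confines the new
	 * vmemmap pages to the node the HugeTLB page sits on.
	 */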
	ret = vmemmap_remap_alloc(vmemmap_addr, vmemmap_end, vmemmap_reuse,
				  GFP_KERNEL | __GFP_NORETRY | __GFP_THISNODE);
	if (!ret) {
		ClearHPageVmemmapOptimized(head);
		static_branch_dec(&hugetlb_optimize_vmemmap_key);
	}

	return ret;
}

static unsigned int vmemmap_optimizable_pages(struct hstate *h,
					      struct page *head)
{
	if (READ_ONCE(vmemmap_optimize_mode) == VMEMMAP_OPTIMIZE_OFF)
		return 0;

	if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG)) {
		pmd_t *pmdp, pmd;
		struct page *vmemmap_page;
		unsigned long vaddr = (unsigned long)head;

		/*
		 * Only a vmemmap page's own vmemmap page can be self-hosted.
		 * Walk the kernel page tables to find the page that backs the
		 * vmemmap page of @head.
		 */
		pmdp = pmd_off_k(vaddr);
		/*
		 * The READ_ONCE() is used to stabilize *pmdp in a register or
		 * on the stack so that it will stop changing under the code.
		 * The only concurrent operation where it can be changed is
		 * split_vmemmap_huge_pmd() (*pmdp will be stable after this
		 * operation).
		 */
		pmd = READ_ONCE(*pmdp);
		if (pmd_leaf(pmd))
			vmemmap_page = pmd_page(pmd) + pte_index(vaddr);
		else
			vmemmap_page = pte_page(*pte_offset_kernel(pmdp, vaddr));
		/*
		 * Because of HugeTLB alignment requirements, and because the
		 * vmemmap pages sit at the start of the hotplugged memory
		 * region in the memory_hotplug.memmap_on_memory case, checking
		 * whether any one vmemmap page's vmemmap page is marked
		 * VmemmapSelfHosted is sufficient.
		 *
		 * [                  hotplugged memory                  ]
		 * [        section        ][...][        section        ]
		 * [ vmemmap ][              usable memory               ]
		 *   ^   |     |                                        |
		 *   +---+     |                                        |
		 *     ^       |                                        |
		 *     +-------+                                        |
		 *          ^                                           |
		 *          +-------------------------------------------+
		 */
		if (PageVmemmapSelfHosted(vmemmap_page))
			return 0;
	}

	return hugetlb_optimize_vmemmap_pages(h);
}

void hugetlb_vmemmap_free(struct hstate *h, struct page *head)
{
	unsigned long vmemmap_addr = (unsigned long)head;
	unsigned long vmemmap_end, vmemmap_reuse, vmemmap_pages;

	vmemmap_pages = vmemmap_optimizable_pages(h, head);
	if (!vmemmap_pages)
		return;

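	/*
	 * page_fixed_fake_head() only performs its fake-head check while
	 * hugetlb_optimize_vmemmap_key is enabled, so the key is bumped
	 * before the remap below and dropped again if the remap fails.
	 */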
	static_branch_inc(&hugetlb_optimize_vmemmap_key);

	vmemmap_addr	+= RESERVE_VMEMMAP_SIZE;
	vmemmap_end	= vmemmap_addr + (vmemmap_pages << PAGE_SHIFT);
	vmemmap_reuse	= vmemmap_addr - PAGE_SIZE;

	/*
	 * Remap the vmemmap virtual address range [@vmemmap_addr, @vmemmap_end)
	 * to the page which @vmemmap_reuse is mapped to, then free the pages
	 * which the range [@vmemmap_addr, @vmemmap_end) was mapped to.
	 */
	if (vmemmap_remap_free(vmemmap_addr, vmemmap_end, vmemmap_reuse))
		static_branch_dec(&hugetlb_optimize_vmemmap_key);
	else
		SetHPageVmemmapOptimized(head);
}

void __init hugetlb_vmemmap_init(struct hstate *h)
{
	unsigned int nr_pages = pages_per_huge_page(h);
	unsigned int vmemmap_pages;

	/*
	 * Only (RESERVE_VMEMMAP_SIZE / sizeof(struct page)) struct page
	 * structs remain usable when CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP is
	 * enabled, so add a BUILD_BUG_ON to catch invalid usage of the tail
	 * struct pages.
	 */
	BUILD_BUG_ON(__NR_USED_SUBPAGE >=
		     RESERVE_VMEMMAP_SIZE / sizeof(struct page));

	if (!is_power_of_2(sizeof(struct page))) {
		pr_warn_once("cannot optimize vmemmap pages because \"struct page\" crosses page boundaries\n");
		static_branch_disable(&hugetlb_optimize_vmemmap_key);
		return;
	}

	vmemmap_pages = (nr_pages * sizeof(struct page)) >> PAGE_SHIFT;
	/*
	 * The head vmemmap page is not freed to the buddy allocator; the
	 * other vmemmap pages are remapped to it and can therefore be freed.
	 *
	 * Could RESERVE_VMEMMAP_NR be greater than @vmemmap_pages? Yes, on
	 * some architectures (e.g. aarch64). See Documentation/arm64/
	 * hugetlbpage.rst for more details.
	 */
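	/*
	 * For example (assuming a 64-byte struct page): with 64 KiB base
	 * pages on arm64, a 2 MiB CONT-PTE HugeTLB page spans only 32 base
	 * pages, i.e. 2 KiB of struct pages, so @vmemmap_pages computes to 0
	 * and nothing is optimized.
	 */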
	if (likely(vmemmap_pages > RESERVE_VMEMMAP_NR))
		h->optimize_vmemmap_pages = vmemmap_pages - RESERVE_VMEMMAP_NR;

	pr_info("can optimize %d vmemmap pages for %s\n",
		h->optimize_vmemmap_pages, h->name);
}

#ifdef CONFIG_PROC_SYSCTL
static int hugetlb_optimize_vmemmap_handler(struct ctl_table *table, int write,
					    void *buffer, size_t *length,
					    loff_t *ppos)
{
	int ret;
	enum vmemmap_optimize_mode mode;
	static DEFINE_MUTEX(sysctl_mutex);

	if (write && !capable(CAP_SYS_ADMIN))
		return -EPERM;

	mutex_lock(&sysctl_mutex);
	mode = vmemmap_optimize_mode;
	table->data = &mode;
	ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
	if (write && !ret)
		vmemmap_optimize_mode_switch(mode);
	mutex_unlock(&sysctl_mutex);

	return ret;
}

static struct ctl_table hugetlb_vmemmap_sysctls[] = {
	{
		.procname	= "hugetlb_optimize_vmemmap",
		.maxlen		= sizeof(enum vmemmap_optimize_mode),
		.mode		= 0644,
		.proc_handler	= hugetlb_optimize_vmemmap_handler,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	},
	{ }
};
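
/*
 * Example (runtime toggle, writing requires CAP_SYS_ADMIN): the mode above is
 * registered under the "vm" directory, so it can be flipped with e.g.
 *
 *	sysctl vm.hugetlb_optimize_vmemmap=1
 *	echo 0 > /proc/sys/vm/hugetlb_optimize_vmemmap
 */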

static __init int hugetlb_vmemmap_sysctls_init(void)
{
	/*
	 * If "struct page" crosses page boundaries, the vmemmap pages cannot
	 * be optimized.
	 */
	if (is_power_of_2(sizeof(struct page)))
		register_sysctl_init("vm", hugetlb_vmemmap_sysctls);

	return 0;
}
late_initcall(hugetlb_vmemmap_sysctls_init);
#endif /* CONFIG_PROC_SYSCTL */