// SPDX-License-Identifier: GPL-2.0
/*
 * Optimize vmemmap pages associated with HugeTLB
 *
 * Copyright (c) 2020, Bytedance. All rights reserved.
 *
 * Author: Muchun Song <songmuchun@bytedance.com>
 *
 * See Documentation/mm/vmemmap_dedup.rst
 */
#define pr_fmt(fmt)	"HugeTLB: " fmt

#include <linux/memory.h>
#include "hugetlb_vmemmap.h"

/*
 * There are a lot of struct page structures associated with each HugeTLB page.
 * For tail pages, the value of compound_head is the same, so we can reuse the
 * first page of the head page structures. We map the virtual addresses of all
 * the pages of tail page structures to the head page struct, and then free
 * these page frames. Therefore, we need to reserve one page as the vmemmap
 * area.
 */
#define RESERVE_VMEMMAP_NR		1U
#define RESERVE_VMEMMAP_SIZE		(RESERVE_VMEMMAP_NR << PAGE_SHIFT)

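/*
 * Illustrative example (an assumed configuration, not relied on by the code
 * below): with 4 KiB base pages and a 64-byte struct page, a 2 MiB HugeTLB
 * page is described by 512 struct pages, i.e. 512 * 64 bytes = 32 KiB =
 * 8 vmemmap pages.  One page is reserved per the defines above, so the
 * remaining 7 vmemmap pages can be remapped to it and freed, saving 28 KiB
 * for every 2 MiB HugeTLB page.  See Documentation/mm/vmemmap_dedup.rst for
 * the full layout.
 */
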
enum vmemmap_optimize_mode {
	VMEMMAP_OPTIMIZE_OFF,
	VMEMMAP_OPTIMIZE_ON,
};

DEFINE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON,
			hugetlb_optimize_vmemmap_key);
EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key);

static enum vmemmap_optimize_mode vmemmap_optimize_mode =
	IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON);

static void vmemmap_optimize_mode_switch(enum vmemmap_optimize_mode to)
{
	if (vmemmap_optimize_mode == to)
		return;

	if (to == VMEMMAP_OPTIMIZE_OFF)
		static_branch_dec(&hugetlb_optimize_vmemmap_key);
	else
		static_branch_inc(&hugetlb_optimize_vmemmap_key);
	WRITE_ONCE(vmemmap_optimize_mode, to);
}

static int __init hugetlb_vmemmap_early_param(char *buf)
{
	bool enable;
	enum vmemmap_optimize_mode mode;

	if (kstrtobool(buf, &enable))
		return -EINVAL;

	mode = enable ? VMEMMAP_OPTIMIZE_ON : VMEMMAP_OPTIMIZE_OFF;
	vmemmap_optimize_mode_switch(mode);

	return 0;
}
early_param("hugetlb_free_vmemmap", hugetlb_vmemmap_early_param);

/*
 * Previously discarded vmemmap pages will have been allocated and remapped
 * after this function returns zero.
 */
int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head)
{
	int ret;
	unsigned long vmemmap_addr = (unsigned long)head;
	unsigned long vmemmap_end, vmemmap_reuse, vmemmap_pages;

	if (!HPageVmemmapOptimized(head))
		return 0;

	vmemmap_addr	+= RESERVE_VMEMMAP_SIZE;
	vmemmap_pages	= hugetlb_optimize_vmemmap_pages(h);
	vmemmap_end	= vmemmap_addr + (vmemmap_pages << PAGE_SHIFT);
	vmemmap_reuse	= vmemmap_addr - PAGE_SIZE;

	/*
	 * The pages which the vmemmap virtual address range [@vmemmap_addr,
	 * @vmemmap_end) are mapped to are freed to the buddy allocator, and
	 * the range is mapped to the page which @vmemmap_reuse is mapped to.
	 * When a HugeTLB page is freed to the buddy allocator, previously
	 * discarded vmemmap pages must be allocated and remapped.
	 */
	ret = vmemmap_remap_alloc(vmemmap_addr, vmemmap_end, vmemmap_reuse,
				  GFP_KERNEL | __GFP_NORETRY | __GFP_THISNODE);
	if (!ret) {
		ClearHPageVmemmapOptimized(head);
		static_branch_dec(&hugetlb_optimize_vmemmap_key);
	}

	return ret;
}

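/*
 * Illustrative walk-through of the arithmetic in hugetlb_vmemmap_alloc()
 * (assuming the 2 MiB / 4 KiB / 64-byte struct page example above): @head is
 * the vmemmap address of the head struct page, i.e. the start of 8 vmemmap
 * pages.  @vmemmap_reuse stays on that first (reserved) page, @vmemmap_addr
 * is advanced past it by RESERVE_VMEMMAP_SIZE, and @vmemmap_end spans the
 * 7 pages reported by hugetlb_optimize_vmemmap_pages().  vmemmap_remap_alloc()
 * therefore has to allocate 7 fresh pages and restore the full mapping before
 * the HugeTLB page can be returned to the buddy allocator.
 */
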
static unsigned int vmemmap_optimizable_pages(struct hstate *h,
					      struct page *head)
{
	if (READ_ONCE(vmemmap_optimize_mode) == VMEMMAP_OPTIMIZE_OFF)
		return 0;

	if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG)) {
		pmd_t *pmdp, pmd;
		struct page *vmemmap_page;
		unsigned long vaddr = (unsigned long)head;

		/*
		 * Only the vmemmap page's vmemmap page can be self-hosted.
		 * Walk the page tables to find the backing page of the
		 * vmemmap page.
		 */
		pmdp = pmd_off_k(vaddr);
		/*
		 * The READ_ONCE() is used to stabilize *pmdp in a register or
		 * on the stack so that it will stop changing under the code.
		 * The only concurrent operation where it can be changed is
		 * split_vmemmap_huge_pmd() (*pmdp will be stable after this
		 * operation).
		 */
		pmd = READ_ONCE(*pmdp);
		if (pmd_leaf(pmd))
			vmemmap_page = pmd_page(pmd) + pte_index(vaddr);
		else
			vmemmap_page = pte_page(*pte_offset_kernel(pmdp, vaddr));
		/*
		 * Due to HugeTLB alignment requirements and the vmemmap pages
		 * being at the start of the hotplugged memory region in the
		 * memory_hotplug.memmap_on_memory case, checking whether any
		 * vmemmap page's vmemmap page is marked as VmemmapSelfHosted
		 * is sufficient.
		 *
		 * [                hotplugged memory                ]
		 * [      section      ][...][      section      ]
		 * [ vmemmap ][            usable memory          ]
		 *   ^   |     |                                   |
		 *   +---+     |                                   |
		 *     ^       |                                   |
		 *     +-------+                                   |
		 *          ^                                      |
		 *          +--------------------------------------+
		 */
		if (PageVmemmapSelfHosted(vmemmap_page))
			return 0;
	}

	return hugetlb_optimize_vmemmap_pages(h);
}

void hugetlb_vmemmap_free(struct hstate *h, struct page *head)
{
	unsigned long vmemmap_addr = (unsigned long)head;
	unsigned long vmemmap_end, vmemmap_reuse, vmemmap_pages;

	vmemmap_pages = vmemmap_optimizable_pages(h, head);
	if (!vmemmap_pages)
		return;

	static_branch_inc(&hugetlb_optimize_vmemmap_key);

	vmemmap_addr	+= RESERVE_VMEMMAP_SIZE;
	vmemmap_end	= vmemmap_addr + (vmemmap_pages << PAGE_SHIFT);
	vmemmap_reuse	= vmemmap_addr - PAGE_SIZE;

	/*
	 * Remap the vmemmap virtual address range [@vmemmap_addr, @vmemmap_end)
	 * to the page which @vmemmap_reuse is mapped to, then free the pages
	 * which the range [@vmemmap_addr, @vmemmap_end) is mapped to.
	 */
	if (vmemmap_remap_free(vmemmap_addr, vmemmap_end, vmemmap_reuse))
		static_branch_dec(&hugetlb_optimize_vmemmap_key);
	else
		SetHPageVmemmapOptimized(head);
}

void __init hugetlb_vmemmap_init(struct hstate *h)
{
	unsigned int nr_pages = pages_per_huge_page(h);
	unsigned int vmemmap_pages;

	/*
	 * There are only (RESERVE_VMEMMAP_SIZE / sizeof(struct page)) struct
	 * page structs that can be used when CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
	 * is enabled, so add a BUILD_BUG_ON to catch invalid usage of the tail
	 * struct page.
	 */
	BUILD_BUG_ON(__NR_USED_SUBPAGE >=
		     RESERVE_VMEMMAP_SIZE / sizeof(struct page));

	if (!is_power_of_2(sizeof(struct page))) {
		pr_warn_once("cannot optimize vmemmap pages because \"struct page\" crosses page boundaries\n");
		static_branch_disable(&hugetlb_optimize_vmemmap_key);
		return;
	}

	vmemmap_pages = (nr_pages * sizeof(struct page)) >> PAGE_SHIFT;
	/*
	 * The head page is not freed to the buddy allocator; the other tail
	 * pages are remapped to the head page, so they can be freed.
	 *
	 * Could RESERVE_VMEMMAP_NR be greater than @vmemmap_pages? It is true
	 * on some architectures (e.g. aarch64). See Documentation/arm64/
	 * hugetlbpage.rst for more details.
	 */
	if (likely(vmemmap_pages > RESERVE_VMEMMAP_NR))
		h->optimize_vmemmap_pages = vmemmap_pages - RESERVE_VMEMMAP_NR;

	pr_info("can optimize %d vmemmap pages for %s\n",
		h->optimize_vmemmap_pages, h->name);
}

#ifdef CONFIG_PROC_SYSCTL
static int hugetlb_optimize_vmemmap_handler(struct ctl_table *table, int write,
					    void *buffer, size_t *length,
					    loff_t *ppos)
{
	int ret;
	enum vmemmap_optimize_mode mode;
	static DEFINE_MUTEX(sysctl_mutex);

	if (write && !capable(CAP_SYS_ADMIN))
		return -EPERM;

	mutex_lock(&sysctl_mutex);
	mode = vmemmap_optimize_mode;
	table->data = &mode;
	ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
	if (write && !ret)
		vmemmap_optimize_mode_switch(mode);
	mutex_unlock(&sysctl_mutex);

	return ret;
}

static struct ctl_table hugetlb_vmemmap_sysctls[] = {
	{
		.procname	= "hugetlb_optimize_vmemmap",
		.maxlen		= sizeof(enum vmemmap_optimize_mode),
		.mode		= 0644,
		.proc_handler	= hugetlb_optimize_vmemmap_handler,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	},
	{ }
};

static __init int hugetlb_vmemmap_sysctls_init(void)
{
	/*
	 * If "struct page" crosses page boundaries, the vmemmap pages cannot
	 * be optimized.
	 */
	if (is_power_of_2(sizeof(struct page)))
		register_sysctl_init("vm", hugetlb_vmemmap_sysctls);

	return 0;
}
late_initcall(hugetlb_vmemmap_sysctls_init);
#endif /* CONFIG_PROC_SYSCTL */
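
/*
 * Usage sketch (illustrative): the optimization can be enabled at boot with
 * "hugetlb_free_vmemmap=on" on the kernel command line (handled by
 * hugetlb_vmemmap_early_param() above) or toggled at runtime through the
 * "vm.hugetlb_optimize_vmemmap" sysctl registered here.  Switching the mode
 * is expected to affect only HugeTLB pages allocated afterwards; pages
 * already in the pool keep their current vmemmap layout until they are freed
 * and reallocated.
 */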