/*
 * IA-32 Huge TLB Page Support for Kernel.
 *
 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
 */

#include <linux/init.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/err.h>
#include <linux/sysctl.h>
#include <asm/mman.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/pgalloc.h>

static unsigned long page_table_shareable(struct vm_area_struct *svma,
				struct vm_area_struct *vma,
				unsigned long addr, pgoff_t idx)
{
	unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) +
				svma->vm_start;
	unsigned long sbase = saddr & PUD_MASK;
	unsigned long s_end = sbase + PUD_SIZE;

	/* Allow segments to share if only one is marked locked */
	unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED;
	unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED;

	/*
	 * match the virtual addresses, permission and the alignment of the
	 * page table page.
	 */
	if (pmd_index(addr) != pmd_index(saddr) ||
	    vm_flags != svm_flags ||
	    sbase < svma->vm_start || svma->vm_end < s_end)
		return 0;

	return saddr;
}

static int vma_shareable(struct vm_area_struct *vma, unsigned long addr)
{
	unsigned long base = addr & PUD_MASK;
	unsigned long end = base + PUD_SIZE;

	/*
	 * check on proper vm_flags and page table alignment
	 */
	if (vma->vm_flags & VM_MAYSHARE &&
	    vma->vm_start <= base && end <= vma->vm_end)
		return 1;
	return 0;
}

/*
 * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
 * and returns the corresponding pte. While this is not necessary for the
 * !shared pmd case because we can allocate the pmd later as well, it makes the
 * code much cleaner. pmd allocation is essential for the shared case because
 * pud has to be populated inside the same i_mmap_mutex section - otherwise
 * racing tasks could either miss the sharing (see huge_pte_offset) or select a
 * bad pmd for sharing.
 */
static pte_t *
huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
{
	struct vm_area_struct *vma = find_vma(mm, addr);
	struct address_space *mapping = vma->vm_file->f_mapping;
	pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
			vma->vm_pgoff;
	struct prio_tree_iter iter;
	struct vm_area_struct *svma;
	unsigned long saddr;
	pte_t *spte = NULL;
	pte_t *pte;

	if (!vma_shareable(vma, addr))
		return (pte_t *)pmd_alloc(mm, pud, addr);

	mutex_lock(&mapping->i_mmap_mutex);
	vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) {
		if (svma == vma)
			continue;

		saddr = page_table_shareable(svma, vma, addr, idx);
		if (saddr) {
			spte = huge_pte_offset(svma->vm_mm, saddr);
			if (spte) {
				get_page(virt_to_page(spte));
				break;
			}
		}
	}

	if (!spte)
		goto out;

	spin_lock(&mm->page_table_lock);
	if (pud_none(*pud))
		pud_populate(mm, pud, (pmd_t *)((unsigned long)spte & PAGE_MASK));
	else
		put_page(virt_to_page(spte));
	spin_unlock(&mm->page_table_lock);
out:
	pte = (pte_t *)pmd_alloc(mm, pud, addr);
	mutex_unlock(&mapping->i_mmap_mutex);
	return pte;
}
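
/*
 * For illustration: when two MAP_SHARED hugetlb mappings of the same file
 * both span a fully PUD_SIZE-aligned range with matching permissions,
 * huge_pmd_share() above finds the other mapping's pmd page through the
 * file's prio tree, takes an extra reference on that page (get_page) and
 * points this mm's pud at it (pud_populate).  That reference is dropped
 * again by huge_pmd_unshare() below, which detaches this mm's pud
 * (pud_clear) while leaving the pmd page intact for the remaining users.
 */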

/*
 * Unmap a huge page backed by a shared pte.
 *
 * The hugetlb pte page is ref counted at the time of mapping.  If the pte is
 * shared (indicated by page_count > 1), unmapping is achieved by clearing the
 * pud and decrementing the ref count.  If count == 1, the pte page is not
 * shared.
 *
 * Called with vma->vm_mm->page_table_lock held.
 *
 * returns: 1	successfully unmapped a shared pte page
 *	    0	the underlying pte page is not shared, or it is the last user
 */
int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
{
	pgd_t *pgd = pgd_offset(mm, *addr);
	pud_t *pud = pud_offset(pgd, *addr);

	BUG_ON(page_count(virt_to_page(ptep)) == 0);
	if (page_count(virt_to_page(ptep)) == 1)
		return 0;

	pud_clear(pud);
	put_page(virt_to_page(ptep));
	*addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
	return 1;
}

pte_t *huge_pte_alloc(struct mm_struct *mm,
			unsigned long addr, unsigned long sz)
{
	pgd_t *pgd;
	pud_t *pud;
	pte_t *pte = NULL;

	pgd = pgd_offset(mm, addr);
	pud = pud_alloc(mm, pgd, addr);
	if (pud) {
		if (sz == PUD_SIZE) {
			pte = (pte_t *)pud;
		} else {
			BUG_ON(sz != PMD_SIZE);
			if (pud_none(*pud))
				pte = huge_pmd_share(mm, addr, pud);
			else
				pte = (pte_t *)pmd_alloc(mm, pud, addr);
		}
	}
	BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));

	return pte;
}

pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd = NULL;

	pgd = pgd_offset(mm, addr);
	if (pgd_present(*pgd)) {
		pud = pud_offset(pgd, addr);
		if (pud_present(*pud)) {
			if (pud_large(*pud))
				return (pte_t *)pud;
			pmd = pmd_offset(pud, addr);
		}
	}
	return (pte_t *)pmd;
}
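
/*
 * The follow_huge_addr()/pmd_huge()/pud_huge()/follow_huge_pmd() variants in
 * the "#if 0" branch below exist only for testing and are normally compiled
 * out; the live implementations in the #else branch recognise huge mappings
 * by the PSE bit in the pmd/pud entry.
 */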

#if 0	/* This is just for testing */
struct page *
follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
{
	unsigned long vpfn = address >> PAGE_SHIFT;
	struct vm_area_struct *vma;
	struct page *page;
	pte_t *pte;

	vma = find_vma(mm, address);
	if (!vma || !is_vm_hugetlb_page(vma))
		return ERR_PTR(-EINVAL);

	pte = huge_pte_offset(mm, address);

	/* hugetlb should be locked, and hence, prefaulted */
	WARN_ON(!pte || pte_none(*pte));

	page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];

	WARN_ON(!PageHead(page));

	return page;
}

int pmd_huge(pmd_t pmd)
{
	return 0;
}

int pud_huge(pud_t pud)
{
	return 0;
}

struct page *
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
		pmd_t *pmd, int write)
{
	return NULL;
}

#else

struct page *
follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
{
	return ERR_PTR(-EINVAL);
}

int pmd_huge(pmd_t pmd)
{
	return !!(pmd_val(pmd) & _PAGE_PSE);
}

int pud_huge(pud_t pud)
{
	return !!(pud_val(pud) & _PAGE_PSE);
}

struct page *
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
		pmd_t *pmd, int write)
{
	struct page *page;

	page = pte_page(*(pte_t *)pmd);
	if (page)
		page += ((address & ~PMD_MASK) >> PAGE_SHIFT);
	return page;
}

struct page *
follow_huge_pud(struct mm_struct *mm, unsigned long address,
		pud_t *pud, int write)
{
	struct page *page;

	page = pte_page(*(pte_t *)pud);
	if (page)
		page += ((address & ~PUD_MASK) >> PAGE_SHIFT);
	return page;
}

#endif

/* x86_64 also uses this file */

#ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
		unsigned long addr, unsigned long len,
		unsigned long pgoff, unsigned long flags)
{
	struct hstate *h = hstate_file(file);
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	unsigned long start_addr;

	if (len > mm->cached_hole_size) {
		start_addr = mm->free_area_cache;
	} else {
		start_addr = TASK_UNMAPPED_BASE;
		mm->cached_hole_size = 0;
	}

full_search:
	addr = ALIGN(start_addr, huge_page_size(h));

	for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
		/* At this point:  (!vma || addr < vma->vm_end). */
		if (TASK_SIZE - len < addr) {
			/*
			 * Start a new search - just in case we missed
			 * some holes.
			 */
			if (start_addr != TASK_UNMAPPED_BASE) {
				start_addr = TASK_UNMAPPED_BASE;
				mm->cached_hole_size = 0;
				goto full_search;
			}
			return -ENOMEM;
		}
		if (!vma || addr + len <= vma->vm_start) {
			mm->free_area_cache = addr + len;
			return addr;
		}
		if (addr + mm->cached_hole_size < vma->vm_start)
			mm->cached_hole_size = vma->vm_start - addr;
		addr = ALIGN(vma->vm_end, huge_page_size(h));
	}
}
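
/*
 * Top-down variant: search downwards from mm->mmap_base.  free_area_cache
 * remembers where the previous search left off and cached_hole_size the
 * largest hole seen below it, so a request that fits in that hole restarts
 * from the base instead of walking down from the cache again.  If nothing
 * fits below the base, fall back to the bottom-up search above.
 */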

static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
		unsigned long addr0, unsigned long len,
		unsigned long pgoff, unsigned long flags)
{
	struct hstate *h = hstate_file(file);
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	unsigned long base = mm->mmap_base;
	unsigned long addr = addr0;
	unsigned long largest_hole = mm->cached_hole_size;
	unsigned long start_addr;

	/* don't allow allocations above current base */
	if (mm->free_area_cache > base)
		mm->free_area_cache = base;

	if (len <= largest_hole) {
		largest_hole = 0;
		mm->free_area_cache = base;
	}
try_again:
	start_addr = mm->free_area_cache;

	/* make sure it can fit in the remaining address space */
	if (mm->free_area_cache < len)
		goto fail;

	/* either no address requested or can't fit in requested address hole */
	addr = (mm->free_area_cache - len) & huge_page_mask(h);
	do {
		/*
		 * Lookup failure means no vma is above this address,
		 * i.e. return with success:
		 */
		vma = find_vma(mm, addr);
		if (!vma)
			return addr;

		if (addr + len <= vma->vm_start) {
			/* remember the address as a hint for next time */
			mm->cached_hole_size = largest_hole;
			return (mm->free_area_cache = addr);
		} else if (mm->free_area_cache == vma->vm_end) {
			/* pull free_area_cache down to the first hole */
			mm->free_area_cache = vma->vm_start;
			mm->cached_hole_size = largest_hole;
		}

		/* remember the largest hole we saw so far */
		if (addr + largest_hole < vma->vm_start)
			largest_hole = vma->vm_start - addr;

		/* try just below the current vma->vm_start */
		addr = (vma->vm_start - len) & huge_page_mask(h);
	} while (len <= vma->vm_start);

fail:
	/*
	 * if hint left us with no space for the requested
	 * mapping then try again:
	 */
	if (start_addr != base) {
		mm->free_area_cache = base;
		largest_hole = 0;
		goto try_again;
	}
	/*
	 * A failed mmap() very likely causes application failure,
	 * so fall back to the bottom-up function here. This scenario
	 * can happen with large stack limits and large mmap()
	 * allocations.
	 */
	mm->free_area_cache = TASK_UNMAPPED_BASE;
	mm->cached_hole_size = ~0UL;
	addr = hugetlb_get_unmapped_area_bottomup(file, addr0,
			len, pgoff, flags);

	/*
	 * Restore the topdown base:
	 */
	mm->free_area_cache = base;
	mm->cached_hole_size = ~0UL;

	return addr;
}

unsigned long
hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
		unsigned long len, unsigned long pgoff, unsigned long flags)
{
	struct hstate *h = hstate_file(file);
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;

	if (len & ~huge_page_mask(h))
		return -EINVAL;
	if (len > TASK_SIZE)
		return -ENOMEM;

	if (flags & MAP_FIXED) {
		if (prepare_hugepage_range(file, addr, len))
			return -EINVAL;
		return addr;
	}

	if (addr) {
		addr = ALIGN(addr, huge_page_size(h));
		vma = find_vma(mm, addr);
		if (TASK_SIZE - len >= addr &&
		    (!vma || addr + len <= vma->vm_start))
			return addr;
	}
	if (mm->get_unmapped_area == arch_get_unmapped_area)
		return hugetlb_get_unmapped_area_bottomup(file, addr, len,
				pgoff, flags);
	else
		return hugetlb_get_unmapped_area_topdown(file, addr, len,
				pgoff, flags);
}

#endif /* HAVE_ARCH_HUGETLB_UNMAPPED_AREA */

#ifdef CONFIG_X86_64
static __init int setup_hugepagesz(char *opt)
{
	unsigned long ps = memparse(opt, &opt);

	if (ps == PMD_SIZE) {
		hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT);
	} else if (ps == PUD_SIZE && cpu_has_gbpages) {
		hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
	} else {
		printk(KERN_ERR "hugepagesz: Unsupported page size %lu M\n",
			ps >> 20);
		return 0;
	}
	return 1;
}
__setup("hugepagesz=", setup_hugepagesz);
#endif
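
/*
 * Illustrative example (not taken from this file): booting an x86_64 kernel
 * with
 *
 *	hugepagesz=2M hugepages=512 hugepagesz=1G hugepages=4
 *
 * registers both huge page sizes via setup_hugepagesz() above; each
 * hugepagesz= option selects the hstate that a following hugepages= count
 * applies to.  1G pages are only accepted when the CPU supports gbpages.
 */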