/*
 * Copyright IBM Corp. 2007, 2011
 * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
 */

#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/spinlock.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/swapops.h>
#include <linux/sysctl.h>
#include <linux/ksm.h>
#include <linux/mman.h>

#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>

unsigned long *crst_table_alloc(struct mm_struct *mm)
{
	struct page *page = alloc_pages(GFP_KERNEL, 2);

	if (!page)
		return NULL;
	return (unsigned long *) page_to_phys(page);
}

void crst_table_free(struct mm_struct *mm, unsigned long *table)
{
	free_pages((unsigned long) table, 2);
}

static void __crst_table_upgrade(void *arg)
{
	struct mm_struct *mm = arg;

	if (current->active_mm == mm) {
		clear_user_asce();
		set_user_asce(mm);
	}
	__tlb_flush_local();
}

int crst_table_upgrade(struct mm_struct *mm, unsigned long limit)
{
	unsigned long *table, *pgd;
	unsigned long entry;
	int flush;

	BUG_ON(limit > (1UL << 53));
	flush = 0;
repeat:
	table = crst_table_alloc(mm);
	if (!table)
		return -ENOMEM;
	spin_lock_bh(&mm->page_table_lock);
	if (mm->context.asce_limit < limit) {
		pgd = (unsigned long *) mm->pgd;
		if (mm->context.asce_limit <= (1UL << 31)) {
			entry = _REGION3_ENTRY_EMPTY;
			mm->context.asce_limit = 1UL << 42;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION3;
		} else {
			entry = _REGION2_ENTRY_EMPTY;
			mm->context.asce_limit = 1UL << 53;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION2;
		}
		crst_table_init(table, entry);
		pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd);
		mm->pgd = (pgd_t *) table;
		mm->task_size = mm->context.asce_limit;
		table = NULL;
		flush = 1;
	}
	spin_unlock_bh(&mm->page_table_lock);
	if (table)
		crst_table_free(mm, table);
	if (mm->context.asce_limit < limit)
		goto repeat;
	if (flush)
		on_each_cpu(__crst_table_upgrade, mm, 0);
	return 0;
}

void crst_table_downgrade(struct mm_struct *mm, unsigned long limit)
{
	pgd_t *pgd;

	if (current->active_mm == mm) {
		clear_user_asce();
		__tlb_flush_mm(mm);
	}
	while (mm->context.asce_limit > limit) {
		pgd = mm->pgd;
		switch (pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) {
		case _REGION_ENTRY_TYPE_R2:
			mm->context.asce_limit = 1UL << 42;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION3;
			break;
		case _REGION_ENTRY_TYPE_R3:
			mm->context.asce_limit = 1UL << 31;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_SEGMENT;
			break;
		default:
			BUG();
		}
		mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
		mm->task_size = mm->context.asce_limit;
		crst_table_free(mm, (unsigned long *) pgd);
	}
	if (current->active_mm == mm)
		set_user_asce(mm);
}

#ifdef CONFIG_PGSTE

/**
 * gmap_alloc - allocate a guest address space
 * @mm: pointer to the parent mm_struct
 * @limit: maximum size of the gmap address space
 *
 * Returns a guest address space structure.
 */
struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit)
{
	struct gmap *gmap;
	struct page *page;
	unsigned long *table;
	unsigned long etype, atype;

	if (limit < (1UL << 31)) {
		limit = (1UL << 31) - 1;
		atype = _ASCE_TYPE_SEGMENT;
		etype = _SEGMENT_ENTRY_EMPTY;
	} else if (limit < (1UL << 42)) {
		limit = (1UL << 42) - 1;
		atype = _ASCE_TYPE_REGION3;
		etype = _REGION3_ENTRY_EMPTY;
	} else if (limit < (1UL << 53)) {
		limit = (1UL << 53) - 1;
		atype = _ASCE_TYPE_REGION2;
		etype = _REGION2_ENTRY_EMPTY;
	} else {
		limit = -1UL;
		atype = _ASCE_TYPE_REGION1;
		etype = _REGION1_ENTRY_EMPTY;
	}
	gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL);
	if (!gmap)
		goto out;
	INIT_LIST_HEAD(&gmap->crst_list);
	INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL);
	INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC);
	spin_lock_init(&gmap->guest_table_lock);
	gmap->mm = mm;
	page = alloc_pages(GFP_KERNEL, 2);
	if (!page)
		goto out_free;
	page->index = 0;
	list_add(&page->lru, &gmap->crst_list);
	table = (unsigned long *) page_to_phys(page);
	crst_table_init(table, etype);
	gmap->table = table;
	gmap->asce = atype | _ASCE_TABLE_LENGTH |
		_ASCE_USER_BITS | __pa(table);
	gmap->asce_end = limit;
	down_write(&mm->mmap_sem);
	list_add(&gmap->list, &mm->context.gmap_list);
	up_write(&mm->mmap_sem);
	return gmap;

out_free:
	kfree(gmap);
out:
	return NULL;
}
EXPORT_SYMBOL_GPL(gmap_alloc);
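
/*
 * Illustrative usage sketch (not built here, names are examples only): a
 * hypervisor-style caller such as KVM would typically pair gmap_alloc() with
 * gmap_free() around the lifetime of a guest. gmap_alloc() returns NULL on
 * failure, it does not return an ERR_PTR.
 *
 *	struct gmap *guest_gmap;
 *
 *	guest_gmap = gmap_alloc(current->mm, 1UL << 44);
 *	if (!guest_gmap)
 *		return -ENOMEM;
 *	...
 *	gmap_free(guest_gmap);
 */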

static void gmap_flush_tlb(struct gmap *gmap)
{
	if (MACHINE_HAS_IDTE)
		__tlb_flush_asce(gmap->mm, gmap->asce);
	else
		__tlb_flush_global();
}

static void gmap_radix_tree_free(struct radix_tree_root *root)
{
	struct radix_tree_iter iter;
	unsigned long indices[16];
	unsigned long index;
	void **slot;
	int i, nr;

	/* A radix tree is freed by deleting all of its entries */
	index = 0;
	do {
		nr = 0;
		radix_tree_for_each_slot(slot, root, &iter, index) {
			indices[nr] = iter.index;
			if (++nr == 16)
				break;
		}
		for (i = 0; i < nr; i++) {
			index = indices[i];
			radix_tree_delete(root, index);
		}
	} while (nr > 0);
}

/**
 * gmap_free - free a guest address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_free(struct gmap *gmap)
{
	struct page *page, *next;

	/* Flush tlb. */
	if (MACHINE_HAS_IDTE)
		__tlb_flush_asce(gmap->mm, gmap->asce);
	else
		__tlb_flush_global();

	/* Free all segment & region tables. */
	list_for_each_entry_safe(page, next, &gmap->crst_list, lru)
		__free_pages(page, 2);
	gmap_radix_tree_free(&gmap->guest_to_host);
	gmap_radix_tree_free(&gmap->host_to_guest);
	down_write(&gmap->mm->mmap_sem);
	list_del(&gmap->list);
	up_write(&gmap->mm->mmap_sem);
	kfree(gmap);
}
EXPORT_SYMBOL_GPL(gmap_free);

/**
 * gmap_enable - switch primary space to the guest address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_enable(struct gmap *gmap)
{
	S390_lowcore.gmap = (unsigned long) gmap;
}
EXPORT_SYMBOL_GPL(gmap_enable);

/**
 * gmap_disable - switch back to the standard primary address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_disable(struct gmap *gmap)
{
	S390_lowcore.gmap = 0UL;
}
EXPORT_SYMBOL_GPL(gmap_disable);

/*
 * gmap_alloc_table is assumed to be called with mmap_sem held
 */
static int gmap_alloc_table(struct gmap *gmap, unsigned long *table,
			    unsigned long init, unsigned long gaddr)
{
	struct page *page;
	unsigned long *new;

	/* since we don't free the gmap table until gmap_free we can unlock */
	page = alloc_pages(GFP_KERNEL, 2);
	if (!page)
		return -ENOMEM;
	new = (unsigned long *) page_to_phys(page);
	crst_table_init(new, init);
	spin_lock(&gmap->mm->page_table_lock);
	if (*table & _REGION_ENTRY_INVALID) {
		list_add(&page->lru, &gmap->crst_list);
		*table = (unsigned long) new | _REGION_ENTRY_LENGTH |
			(*table & _REGION_ENTRY_TYPE_MASK);
		page->index = gaddr;
		page = NULL;
	}
	spin_unlock(&gmap->mm->page_table_lock);
	if (page)
		__free_pages(page, 2);
	return 0;
}

/**
 * __gmap_segment_gaddr - find virtual address from segment pointer
 * @entry: pointer to a segment table entry in the guest address space
 *
 * Returns the virtual address in the guest address space for the segment
 */
static unsigned long __gmap_segment_gaddr(unsigned long *entry)
{
	struct page *page;
	unsigned long offset, mask;

	offset = (unsigned long) entry / sizeof(unsigned long);
	offset = (offset & (PTRS_PER_PMD - 1)) * PMD_SIZE;
	mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1);
	page = virt_to_page((void *)((unsigned long) entry & mask));
	return page->index + offset;
}

/**
 * __gmap_unlink_by_vmaddr - unlink a single segment via a host address
 * @gmap: pointer to the guest address space structure
 * @vmaddr: address in the host process address space
 *
 * Returns 1 if a TLB flush is required
 */
static int __gmap_unlink_by_vmaddr(struct gmap *gmap, unsigned long vmaddr)
{
	unsigned long *entry;
	int flush = 0;

	spin_lock(&gmap->guest_table_lock);
	entry = radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT);
	if (entry) {
		flush = (*entry != _SEGMENT_ENTRY_INVALID);
		*entry = _SEGMENT_ENTRY_INVALID;
	}
	spin_unlock(&gmap->guest_table_lock);
	return flush;
}

/**
 * __gmap_unmap_by_gaddr - unmap a single segment via a guest address
 * @gmap: pointer to the guest address space structure
 * @gaddr: address in the guest address space
 *
 * Returns 1 if a TLB flush is required
 */
static int __gmap_unmap_by_gaddr(struct gmap *gmap, unsigned long gaddr)
{
	unsigned long vmaddr;

	vmaddr = (unsigned long) radix_tree_delete(&gmap->guest_to_host,
						   gaddr >> PMD_SHIFT);
	return vmaddr ? __gmap_unlink_by_vmaddr(gmap, vmaddr) : 0;
}

/**
 * gmap_unmap_segment - unmap segment from the guest address space
 * @gmap: pointer to the guest address space structure
 * @to: address in the guest address space
 * @len: length of the memory area to unmap
 *
 * Returns 0 if the unmap succeeded, -EINVAL if not.
 */
int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len)
{
	unsigned long off;
	int flush;

	if ((to | len) & (PMD_SIZE - 1))
		return -EINVAL;
	if (len == 0 || to + len < to)
		return -EINVAL;

	flush = 0;
	down_write(&gmap->mm->mmap_sem);
	for (off = 0; off < len; off += PMD_SIZE)
		flush |= __gmap_unmap_by_gaddr(gmap, to + off);
	up_write(&gmap->mm->mmap_sem);
	if (flush)
		gmap_flush_tlb(gmap);
	return 0;
}
EXPORT_SYMBOL_GPL(gmap_unmap_segment);

/**
 * gmap_map_segment - map a segment to the guest address space
 * @gmap: pointer to the guest address space structure
 * @from: source address in the parent address space
 * @to: target address in the guest address space
 * @len: length of the memory area to map
 *
 * Returns 0 if the mmap succeeded, -EINVAL or -ENOMEM if not.
 */
int gmap_map_segment(struct gmap *gmap, unsigned long from,
		     unsigned long to, unsigned long len)
{
	unsigned long off;
	int flush;

	if ((from | to | len) & (PMD_SIZE - 1))
		return -EINVAL;
	if (len == 0 || from + len < from || to + len < to ||
	    from + len > TASK_MAX_SIZE || to + len > gmap->asce_end)
		return -EINVAL;

	flush = 0;
	down_write(&gmap->mm->mmap_sem);
	for (off = 0; off < len; off += PMD_SIZE) {
		/* Remove old translation */
		flush |= __gmap_unmap_by_gaddr(gmap, to + off);
		/* Store new translation */
		if (radix_tree_insert(&gmap->guest_to_host,
				      (to + off) >> PMD_SHIFT,
				      (void *) from + off))
			break;
	}
	up_write(&gmap->mm->mmap_sem);
	if (flush)
		gmap_flush_tlb(gmap);
	if (off >= len)
		return 0;
	gmap_unmap_segment(gmap, to, len);
	return -ENOMEM;
}
EXPORT_SYMBOL_GPL(gmap_map_segment);
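
/*
 * Illustrative usage sketch (assumed caller, names are examples only): map a
 * chunk of the parent address space into the guest and tear it down again.
 * All addresses and lengths must be segment (PMD_SIZE) aligned, otherwise
 * both calls fail with -EINVAL.
 *
 *	int rc;
 *
 *	rc = gmap_map_segment(guest_gmap, userspace_addr, guest_addr, size);
 *	if (rc)
 *		return rc;
 *	...
 *	gmap_unmap_segment(guest_gmap, guest_addr, size);
 */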

/**
 * __gmap_translate - translate a guest address to a user space address
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: guest address
 *
 * Returns user space address which corresponds to the guest address or
 * -EFAULT if no such mapping exists.
 * This function does not establish potentially missing page table entries.
 * The mmap_sem of the mm that belongs to the address space must be held
 * when this function gets called.
 */
unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr)
{
	unsigned long vmaddr;

	vmaddr = (unsigned long)
		radix_tree_lookup(&gmap->guest_to_host, gaddr >> PMD_SHIFT);
	return vmaddr ? (vmaddr | (gaddr & ~PMD_MASK)) : -EFAULT;
}
EXPORT_SYMBOL_GPL(__gmap_translate);

/**
 * gmap_translate - translate a guest address to a user space address
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: guest address
 *
 * Returns user space address which corresponds to the guest address or
 * -EFAULT if no such mapping exists.
 * This function does not establish potentially missing page table entries.
 */
unsigned long gmap_translate(struct gmap *gmap, unsigned long gaddr)
{
	unsigned long rc;

	down_read(&gmap->mm->mmap_sem);
	rc = __gmap_translate(gmap, gaddr);
	up_read(&gmap->mm->mmap_sem);
	return rc;
}
EXPORT_SYMBOL_GPL(gmap_translate);

/**
 * gmap_unlink - disconnect a page table from the gmap shadow tables
 * @mm: pointer to the parent mm_struct
 * @table: pointer to the host page table
 * @vmaddr: vm address associated with the host page table
 */
static void gmap_unlink(struct mm_struct *mm, unsigned long *table,
			unsigned long vmaddr)
{
	struct gmap *gmap;
	int flush;

	list_for_each_entry(gmap, &mm->context.gmap_list, list) {
		flush = __gmap_unlink_by_vmaddr(gmap, vmaddr);
		if (flush)
			gmap_flush_tlb(gmap);
	}
}

/**
 * __gmap_link - set up shadow page tables to connect a host to a guest address
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: guest address
 * @vmaddr: vm address
 *
 * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT
 * if the vm address is already mapped to a different guest segment.
 * The mmap_sem of the mm that belongs to the address space must be held
 * when this function gets called.
 */
int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
{
	struct mm_struct *mm;
	unsigned long *table;
	spinlock_t *ptl;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	int rc;

	/* Create higher level tables in the gmap page table */
	table = gmap->table;
	if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION1) {
		table += (gaddr >> 53) & 0x7ff;
		if ((*table & _REGION_ENTRY_INVALID) &&
		    gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY,
				     gaddr & 0xffe0000000000000UL))
			return -ENOMEM;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	}
	if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION2) {
		table += (gaddr >> 42) & 0x7ff;
		if ((*table & _REGION_ENTRY_INVALID) &&
		    gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY,
				     gaddr & 0xfffffc0000000000UL))
			return -ENOMEM;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	}
	if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION3) {
		table += (gaddr >> 31) & 0x7ff;
		if ((*table & _REGION_ENTRY_INVALID) &&
		    gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY,
				     gaddr & 0xffffffff80000000UL))
			return -ENOMEM;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	}
	table += (gaddr >> 20) & 0x7ff;
	/* Walk the parent mm page table */
	mm = gmap->mm;
	pgd = pgd_offset(mm, vmaddr);
	VM_BUG_ON(pgd_none(*pgd));
	pud = pud_offset(pgd, vmaddr);
	VM_BUG_ON(pud_none(*pud));
	pmd = pmd_offset(pud, vmaddr);
	VM_BUG_ON(pmd_none(*pmd));
	/* large pmds cannot yet be handled */
	if (pmd_large(*pmd))
		return -EFAULT;
	/* Link gmap segment table entry location to page table. */
	rc = radix_tree_preload(GFP_KERNEL);
	if (rc)
		return rc;
	ptl = pmd_lock(mm, pmd);
	spin_lock(&gmap->guest_table_lock);
	if (*table == _SEGMENT_ENTRY_INVALID) {
		rc = radix_tree_insert(&gmap->host_to_guest,
				       vmaddr >> PMD_SHIFT, table);
		if (!rc)
			*table = pmd_val(*pmd);
	} else
		rc = 0;
	spin_unlock(&gmap->guest_table_lock);
	spin_unlock(ptl);
	radix_tree_preload_end();
	return rc;
}

/**
 * gmap_fault - resolve a fault on a guest address
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: guest address
 * @fault_flags: flags to pass down to handle_mm_fault()
 *
 * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT
 * if the vm address is already mapped to a different guest segment.
 */
int gmap_fault(struct gmap *gmap, unsigned long gaddr,
	       unsigned int fault_flags)
{
	unsigned long vmaddr;
	int rc;

	down_read(&gmap->mm->mmap_sem);
	vmaddr = __gmap_translate(gmap, gaddr);
	if (IS_ERR_VALUE(vmaddr)) {
		rc = vmaddr;
		goto out_up;
	}
	if (fixup_user_fault(current, gmap->mm, vmaddr, fault_flags)) {
		rc = -EFAULT;
		goto out_up;
	}
	rc = __gmap_link(gmap, gaddr, vmaddr);
out_up:
	up_read(&gmap->mm->mmap_sem);
	return rc;
}
EXPORT_SYMBOL_GPL(gmap_fault);
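
/*
 * Illustrative usage sketch (assumed caller, names are examples only): a
 * guest access that failed because no host mapping is linked yet is
 * typically resolved like this before the guest is resumed. Any error
 * (-EFAULT or -ENOMEM) is simply passed back to the caller.
 *
 *	rc = gmap_fault(guest_gmap, guest_addr, FAULT_FLAG_WRITE);
 *	if (rc)
 *		return rc;
 */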

static void gmap_zap_swap_entry(swp_entry_t entry, struct mm_struct *mm)
{
	if (!non_swap_entry(entry))
		dec_mm_counter(mm, MM_SWAPENTS);
	else if (is_migration_entry(entry)) {
		struct page *page = migration_entry_to_page(entry);

		if (PageAnon(page))
			dec_mm_counter(mm, MM_ANONPAGES);
		else
			dec_mm_counter(mm, MM_FILEPAGES);
	}
	free_swap_and_cache(entry);
}

/*
 * this function is assumed to be called with mmap_sem held
 */
void __gmap_zap(struct gmap *gmap, unsigned long gaddr)
{
	unsigned long vmaddr, ptev, pgstev;
	pte_t *ptep, pte;
	spinlock_t *ptl;
	pgste_t pgste;

	/* Find the vm address for the guest address */
	vmaddr = (unsigned long) radix_tree_lookup(&gmap->guest_to_host,
						   gaddr >> PMD_SHIFT);
	if (!vmaddr)
		return;
	vmaddr |= gaddr & ~PMD_MASK;
	/* Get pointer to the page table entry */
	ptep = get_locked_pte(gmap->mm, vmaddr, &ptl);
	if (unlikely(!ptep))
		return;
	pte = *ptep;
	if (!pte_swap(pte))
		goto out_pte;
	/* Zap unused and logically-zero pages */
	pgste = pgste_get_lock(ptep);
	pgstev = pgste_val(pgste);
	ptev = pte_val(pte);
	if (((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED) ||
	    ((pgstev & _PGSTE_GPS_ZERO) && (ptev & _PAGE_INVALID))) {
		gmap_zap_swap_entry(pte_to_swp_entry(pte), gmap->mm);
		pte_clear(gmap->mm, vmaddr, ptep);
	}
	pgste_set_unlock(ptep, pgste);
out_pte:
	pte_unmap_unlock(ptep, ptl);
}
EXPORT_SYMBOL_GPL(__gmap_zap);

void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to)
{
	unsigned long gaddr, vmaddr, size;
	struct vm_area_struct *vma;

	down_read(&gmap->mm->mmap_sem);
	for (gaddr = from; gaddr < to;
	     gaddr = (gaddr + PMD_SIZE) & PMD_MASK) {
		/* Find the vm address for the guest address */
		vmaddr = (unsigned long)
			radix_tree_lookup(&gmap->guest_to_host,
					  gaddr >> PMD_SHIFT);
		if (!vmaddr)
			continue;
		vmaddr |= gaddr & ~PMD_MASK;
		/* Find vma in the parent mm */
		vma = find_vma(gmap->mm, vmaddr);
		size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK));
		zap_page_range(vma, vmaddr, size, NULL);
	}
	up_read(&gmap->mm->mmap_sem);
}
EXPORT_SYMBOL_GPL(gmap_discard);

static LIST_HEAD(gmap_notifier_list);
static DEFINE_SPINLOCK(gmap_notifier_lock);

/**
 * gmap_register_ipte_notifier - register a pte invalidation callback
 * @nb: pointer to the gmap notifier block
 */
void gmap_register_ipte_notifier(struct gmap_notifier *nb)
{
	spin_lock(&gmap_notifier_lock);
	list_add(&nb->list, &gmap_notifier_list);
	spin_unlock(&gmap_notifier_lock);
}
EXPORT_SYMBOL_GPL(gmap_register_ipte_notifier);

/**
 * gmap_unregister_ipte_notifier - remove a pte invalidation callback
 * @nb: pointer to the gmap notifier block
 */
void gmap_unregister_ipte_notifier(struct gmap_notifier *nb)
{
	spin_lock(&gmap_notifier_lock);
	list_del_init(&nb->list);
	spin_unlock(&gmap_notifier_lock);
}
EXPORT_SYMBOL_GPL(gmap_unregister_ipte_notifier);

/**
 * gmap_ipte_notify - mark a range of ptes for invalidation notification
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: virtual address in the guest address space
 * @len: size of area
 *
 * Returns 0 if for each page in the given range a gmap mapping exists and
 * the invalidation notification could be set. If the gmap mapping is missing
 * for one or more pages -EFAULT is returned. If no memory could be allocated
 * -ENOMEM is returned. This function establishes missing page table entries.
 */
int gmap_ipte_notify(struct gmap *gmap, unsigned long gaddr, unsigned long len)
{
	unsigned long addr;
	spinlock_t *ptl;
	pte_t *ptep, entry;
	pgste_t pgste;
	int rc = 0;

	if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK))
		return -EINVAL;
	down_read(&gmap->mm->mmap_sem);
	while (len) {
		/* Convert gmap address and connect the page tables */
		addr = __gmap_translate(gmap, gaddr);
		if (IS_ERR_VALUE(addr)) {
			rc = addr;
			break;
		}
		/* Get the page mapped */
		if (fixup_user_fault(current, gmap->mm, addr, FAULT_FLAG_WRITE)) {
			rc = -EFAULT;
			break;
		}
		rc = __gmap_link(gmap, gaddr, addr);
		if (rc)
			break;
		/* Walk the process page table, lock and get pte pointer */
		ptep = get_locked_pte(gmap->mm, addr, &ptl);
		VM_BUG_ON(!ptep);
		/* Set notification bit in the pgste of the pte */
		entry = *ptep;
		if ((pte_val(entry) & (_PAGE_INVALID | _PAGE_PROTECT)) == 0) {
			pgste = pgste_get_lock(ptep);
			pgste_val(pgste) |= PGSTE_IN_BIT;
			pgste_set_unlock(ptep, pgste);
			gaddr += PAGE_SIZE;
			len -= PAGE_SIZE;
		}
		pte_unmap_unlock(ptep, ptl);
	}
	up_read(&gmap->mm->mmap_sem);
	return rc;
}
EXPORT_SYMBOL_GPL(gmap_ipte_notify);
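
/*
 * Illustrative usage sketch (assumed consumer, names are examples only): a
 * user such as KVM registers one notifier block and then requests
 * notification for guest pages it wants to watch.
 *
 *	static void example_notifier_call(struct gmap *gmap, unsigned long gaddr)
 *	{
 *		... react to the invalidation of the guest page at gaddr ...
 *	}
 *
 *	static struct gmap_notifier example_notifier = {
 *		.notifier_call = example_notifier_call,
 *	};
 *
 *	gmap_register_ipte_notifier(&example_notifier);
 *	rc = gmap_ipte_notify(guest_gmap, guest_addr, PAGE_SIZE);
 */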

/**
 * gmap_do_ipte_notify - call all invalidation callbacks for a specific pte.
 * @mm: pointer to the process mm_struct
 * @vmaddr: virtual address in the process address space
 * @pte: pointer to the page table entry
 *
 * This function is assumed to be called with the page table lock held
 * for the pte to notify.
 */
void gmap_do_ipte_notify(struct mm_struct *mm, unsigned long vmaddr, pte_t *pte)
{
	unsigned long offset, gaddr;
	unsigned long *table;
	struct gmap_notifier *nb;
	struct gmap *gmap;

	offset = ((unsigned long) pte) & (255 * sizeof(pte_t));
	offset = offset * (4096 / sizeof(pte_t));
	spin_lock(&gmap_notifier_lock);
	list_for_each_entry(gmap, &mm->context.gmap_list, list) {
		table = radix_tree_lookup(&gmap->host_to_guest,
					  vmaddr >> PMD_SHIFT);
		if (!table)
			continue;
		gaddr = __gmap_segment_gaddr(table) + offset;
		list_for_each_entry(nb, &gmap_notifier_list, list)
			nb->notifier_call(gmap, gaddr);
	}
	spin_unlock(&gmap_notifier_lock);
}
EXPORT_SYMBOL_GPL(gmap_do_ipte_notify);

int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
			  unsigned long key, bool nq)
{
	spinlock_t *ptl;
	pgste_t old, new;
	pte_t *ptep;

	down_read(&mm->mmap_sem);
retry:
	ptep = get_locked_pte(mm, addr, &ptl);
	if (unlikely(!ptep)) {
		up_read(&mm->mmap_sem);
		return -EFAULT;
	}
	if (!(pte_val(*ptep) & _PAGE_INVALID) &&
	     (pte_val(*ptep) & _PAGE_PROTECT)) {
		pte_unmap_unlock(ptep, ptl);
		if (fixup_user_fault(current, mm, addr, FAULT_FLAG_WRITE)) {
			up_read(&mm->mmap_sem);
			return -EFAULT;
		}
		goto retry;
	}

	new = old = pgste_get_lock(ptep);
	pgste_val(new) &= ~(PGSTE_GR_BIT | PGSTE_GC_BIT |
			    PGSTE_ACC_BITS | PGSTE_FP_BIT);
	pgste_val(new) |= (key & (_PAGE_CHANGED | _PAGE_REFERENCED)) << 48;
	pgste_val(new) |= (key & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56;
	if (!(pte_val(*ptep) & _PAGE_INVALID)) {
		unsigned long address, bits, skey;

		address = pte_val(*ptep) & PAGE_MASK;
		skey = (unsigned long) page_get_storage_key(address);
		bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED);
		skey = key & (_PAGE_ACC_BITS | _PAGE_FP_BIT);
		/* Set storage key ACC and FP */
		page_set_storage_key(address, skey, !nq);
		/* Merge host changed & referenced into pgste */
		pgste_val(new) |= bits << 52;
	}
	/* changing the guest storage key is considered a change of the page */
	if ((pgste_val(new) ^ pgste_val(old)) &
	    (PGSTE_ACC_BITS | PGSTE_FP_BIT | PGSTE_GR_BIT | PGSTE_GC_BIT))
		pgste_val(new) |= PGSTE_UC_BIT;

	pgste_set_unlock(ptep, new);
	pte_unmap_unlock(ptep, ptl);
	up_read(&mm->mmap_sem);
	return 0;
}
EXPORT_SYMBOL(set_guest_storage_key);

unsigned long get_guest_storage_key(struct mm_struct *mm, unsigned long addr)
{
	spinlock_t *ptl;
	pgste_t pgste;
	pte_t *ptep;
	uint64_t physaddr;
	unsigned long key = 0;

	down_read(&mm->mmap_sem);
	ptep = get_locked_pte(mm, addr, &ptl);
	if (unlikely(!ptep)) {
		up_read(&mm->mmap_sem);
		return -EFAULT;
	}
	pgste = pgste_get_lock(ptep);

	if (pte_val(*ptep) & _PAGE_INVALID) {
		key |= (pgste_val(pgste) & PGSTE_ACC_BITS) >> 56;
		key |= (pgste_val(pgste) & PGSTE_FP_BIT) >> 56;
		key |= (pgste_val(pgste) & PGSTE_GR_BIT) >> 48;
		key |= (pgste_val(pgste) & PGSTE_GC_BIT) >> 48;
	} else {
		physaddr = pte_val(*ptep) & PAGE_MASK;
		key = page_get_storage_key(physaddr);

		/* Reflect guest's logical view, not physical */
		if (pgste_val(pgste) & PGSTE_GR_BIT)
			key |= _PAGE_REFERENCED;
		if (pgste_val(pgste) & PGSTE_GC_BIT)
			key |= _PAGE_CHANGED;
	}

	pgste_set_unlock(ptep, pgste);
	pte_unmap_unlock(ptep, ptl);
	up_read(&mm->mmap_sem);
	return key;
}
EXPORT_SYMBOL(get_guest_storage_key);
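
/*
 * Illustrative usage sketch (assumed caller, names are examples only): a
 * hypervisor that emulates the guest's storage key instructions could
 * forward them roughly like this, where vmaddr is the host address that
 * backs the guest page and key/nq come from the intercepted instruction.
 *
 *	rc = set_guest_storage_key(current->mm, vmaddr, key, nq);
 *	...
 *	key = get_guest_storage_key(current->mm, vmaddr);
 */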

static int page_table_allocate_pgste_min = 0;
static int page_table_allocate_pgste_max = 1;
int page_table_allocate_pgste = 0;
EXPORT_SYMBOL(page_table_allocate_pgste);

static struct ctl_table page_table_sysctl[] = {
	{
		.procname	= "allocate_pgste",
		.data		= &page_table_allocate_pgste,
		.maxlen		= sizeof(int),
		.mode		= S_IRUGO | S_IWUSR,
		.proc_handler	= proc_dointvec,
		.extra1		= &page_table_allocate_pgste_min,
		.extra2		= &page_table_allocate_pgste_max,
	},
	{ }
};

static struct ctl_table page_table_sysctl_dir[] = {
	{
		.procname	= "vm",
		.maxlen		= 0,
		.mode		= 0555,
		.child		= page_table_sysctl,
	},
	{ }
};

static int __init page_table_register_sysctl(void)
{
	return register_sysctl_table(page_table_sysctl_dir) ? 0 : -ENOMEM;
}
__initcall(page_table_register_sysctl);

#else /* CONFIG_PGSTE */

static inline void gmap_unlink(struct mm_struct *mm, unsigned long *table,
			       unsigned long vmaddr)
{
}

#endif /* CONFIG_PGSTE */

static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
{
	unsigned int old, new;

	do {
		old = atomic_read(v);
		new = old ^ bits;
	} while (atomic_cmpxchg(v, old, new) != old);
	return new;
}

/*
 * page table entry allocation/free routines.
 */
unsigned long *page_table_alloc(struct mm_struct *mm)
{
	unsigned long *table;
	struct page *page;
	unsigned int mask, bit;

	/* Try to get a fragment of a 4K page as a 2K page table */
	if (!mm_alloc_pgste(mm)) {
		table = NULL;
		spin_lock_bh(&mm->context.list_lock);
		if (!list_empty(&mm->context.pgtable_list)) {
			page = list_first_entry(&mm->context.pgtable_list,
						struct page, lru);
			mask = atomic_read(&page->_mapcount);
			mask = (mask | (mask >> 4)) & 3;
			if (mask != 3) {
				table = (unsigned long *) page_to_phys(page);
				bit = mask & 1;		/* =1 -> second 2K */
				if (bit)
					table += PTRS_PER_PTE;
				atomic_xor_bits(&page->_mapcount, 1U << bit);
				list_del(&page->lru);
			}
		}
		spin_unlock_bh(&mm->context.list_lock);
		if (table)
			return table;
	}
	/* Allocate a fresh page */
	page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
	if (!page)
		return NULL;
	if (!pgtable_page_ctor(page)) {
		__free_page(page);
		return NULL;
	}
	/* Initialize page table */
	table = (unsigned long *) page_to_phys(page);
	if (mm_alloc_pgste(mm)) {
		/* Return 4K page table with PGSTEs */
		atomic_set(&page->_mapcount, 3);
		clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);
		clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2);
	} else {
		/* Return the first 2K fragment of the page */
		atomic_set(&page->_mapcount, 1);
		clear_table(table, _PAGE_INVALID, PAGE_SIZE);
		spin_lock_bh(&mm->context.list_lock);
		list_add(&page->lru, &mm->context.pgtable_list);
		spin_unlock_bh(&mm->context.list_lock);
	}
	return table;
}
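
/*
 * Note on the fragment bookkeeping (as derived from the alloc/free paths in
 * this file): for 2K page tables the two low bits of page->_mapcount record
 * which halves of the 4K page are handed out (bit 0 = first 2K, bit 1 =
 * second 2K). page_table_free_rcu() clears the allocation bit and sets the
 * same bit shifted by four to mark that half as pending RCU removal until
 * __tlb_remove_table() clears it again. A low-bit value of 3 means the page
 * is fully used; 4K page tables with PGSTEs always use the value 3 and never
 * take the fragment path.
 */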

void page_table_free(struct mm_struct *mm, unsigned long *table)
{
	struct page *page;
	unsigned int bit, mask;

	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	if (!mm_alloc_pgste(mm)) {
		/* Free 2K page table fragment of a 4K page */
		bit = (__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t));
		spin_lock_bh(&mm->context.list_lock);
		mask = atomic_xor_bits(&page->_mapcount, 1U << bit);
		if (mask & 3)
			list_add(&page->lru, &mm->context.pgtable_list);
		else
			list_del(&page->lru);
		spin_unlock_bh(&mm->context.list_lock);
		if (mask != 0)
			return;
	}

	pgtable_page_dtor(page);
	atomic_set(&page->_mapcount, -1);
	__free_page(page);
}

void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
			 unsigned long vmaddr)
{
	struct mm_struct *mm;
	struct page *page;
	unsigned int bit, mask;

	mm = tlb->mm;
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	if (mm_alloc_pgste(mm)) {
		gmap_unlink(mm, table, vmaddr);
		table = (unsigned long *) (__pa(table) | 3);
		tlb_remove_table(tlb, table);
		return;
	}
	bit = (__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t));
	spin_lock_bh(&mm->context.list_lock);
	mask = atomic_xor_bits(&page->_mapcount, 0x11U << bit);
	if (mask & 3)
		list_add_tail(&page->lru, &mm->context.pgtable_list);
	else
		list_del(&page->lru);
	spin_unlock_bh(&mm->context.list_lock);
	table = (unsigned long *) (__pa(table) | (1U << bit));
	tlb_remove_table(tlb, table);
}

static void __tlb_remove_table(void *_table)
{
	unsigned int mask = (unsigned long) _table & 3;
	void *table = (void *)((unsigned long) _table ^ mask);
	struct page *page = pfn_to_page(__pa(table) >> PAGE_SHIFT);

	switch (mask) {
	case 0:		/* pmd or pud */
		free_pages((unsigned long) table, 2);
		break;
	case 1:		/* lower 2K of a 4K page table */
	case 2:		/* higher 2K of a 4K page table */
		if (atomic_xor_bits(&page->_mapcount, mask << 4) != 0)
			break;
		/* fallthrough */
	case 3:		/* 4K page table with pgstes */
		pgtable_page_dtor(page);
		atomic_set(&page->_mapcount, -1);
		__free_page(page);
		break;
	}
}

static void tlb_remove_table_smp_sync(void *arg)
{
	/* Simply deliver the interrupt */
}

static void tlb_remove_table_one(void *table)
{
	/*
	 * This isn't an RCU grace period and hence the page-tables cannot be
	 * assumed to be actually RCU-freed.
	 *
	 * It is however sufficient for software page-table walkers that rely
	 * on IRQ disabling. See the comment near struct mmu_table_batch.
	 */
	smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
	__tlb_remove_table(table);
}

static void tlb_remove_table_rcu(struct rcu_head *head)
{
	struct mmu_table_batch *batch;
	int i;

	batch = container_of(head, struct mmu_table_batch, rcu);

	for (i = 0; i < batch->nr; i++)
		__tlb_remove_table(batch->tables[i]);

	free_page((unsigned long)batch);
}

void tlb_table_flush(struct mmu_gather *tlb)
{
	struct mmu_table_batch **batch = &tlb->batch;

	if (*batch) {
		call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
		*batch = NULL;
	}
}

void tlb_remove_table(struct mmu_gather *tlb, void *table)
{
	struct mmu_table_batch **batch = &tlb->batch;

	tlb->mm->context.flush_mm = 1;
	if (*batch == NULL) {
		*batch = (struct mmu_table_batch *)
			__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
		if (*batch == NULL) {
			__tlb_flush_mm_lazy(tlb->mm);
			tlb_remove_table_one(table);
			return;
		}
		(*batch)->nr = 0;
	}
	(*batch)->tables[(*batch)->nr++] = table;
	if ((*batch)->nr == MAX_TABLE_BATCH)
		tlb_flush_mmu(tlb);
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static inline void thp_split_vma(struct vm_area_struct *vma)
{
	unsigned long addr;

	for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE)
		follow_page(vma, addr, FOLL_SPLIT);
}

static inline void thp_split_mm(struct mm_struct *mm)
{
	struct vm_area_struct *vma;

	for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
		thp_split_vma(vma);
		vma->vm_flags &= ~VM_HUGEPAGE;
		vma->vm_flags |= VM_NOHUGEPAGE;
	}
	mm->def_flags |= VM_NOHUGEPAGE;
}
#else
static inline void thp_split_mm(struct mm_struct *mm)
{
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

/*
 * Switch on pgstes for the userspace process (for KVM).
 */
int s390_enable_sie(void)
{
	struct mm_struct *mm = current->mm;

	/* Do we have pgstes? If yes, we are done */
	if (mm_has_pgste(mm))
		return 0;
	/* Fail if the page tables are 2K */
	if (!mm_alloc_pgste(mm))
		return -EINVAL;
	down_write(&mm->mmap_sem);
	mm->context.has_pgste = 1;
	/* split thp mappings and disable thp for future mappings */
	thp_split_mm(mm);
	up_write(&mm->mmap_sem);
	return 0;
}
EXPORT_SYMBOL_GPL(s390_enable_sie);
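
/*
 * Illustrative usage sketch (assumed caller, names are examples only): a
 * hypervisor module enables SIE/pgste support for its own process once,
 * before it creates guest mappings, e.g. during VM creation:
 *
 *	rc = s390_enable_sie();
 *	if (rc)
 *		return rc;
 *	guest_gmap = gmap_alloc(current->mm, mem_limit);
 */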

/*
 * Enable storage key handling from now on and initialize the storage
 * keys with the default key.
 */
static int __s390_enable_skey(pte_t *pte, unsigned long addr,
			      unsigned long next, struct mm_walk *walk)
{
	unsigned long ptev;
	pgste_t pgste;

	pgste = pgste_get_lock(pte);
	/*
	 * Remove all zero page mappings; once a policy to forbid zero page
	 * mappings is established, subsequent faults for such a page will
	 * get fresh anonymous pages.
	 */
	if (is_zero_pfn(pte_pfn(*pte))) {
		ptep_flush_direct(walk->mm, addr, pte);
		pte_val(*pte) = _PAGE_INVALID;
	}
	/* Clear storage key */
	pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT |
			      PGSTE_GR_BIT | PGSTE_GC_BIT);
	ptev = pte_val(*pte);
	if (!(ptev & _PAGE_INVALID) && (ptev & _PAGE_WRITE))
		page_set_storage_key(ptev & PAGE_MASK, PAGE_DEFAULT_KEY, 1);
	pgste_set_unlock(pte, pgste);
	return 0;
}

int s390_enable_skey(void)
{
	struct mm_walk walk = { .pte_entry = __s390_enable_skey };
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	int rc = 0;

	down_write(&mm->mmap_sem);
	if (mm_use_skey(mm))
		goto out_up;

	mm->context.use_skey = 1;
	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		if (ksm_madvise(vma, vma->vm_start, vma->vm_end,
				MADV_UNMERGEABLE, &vma->vm_flags)) {
			mm->context.use_skey = 0;
			rc = -ENOMEM;
			goto out_up;
		}
	}
	mm->def_flags &= ~VM_MERGEABLE;

	walk.mm = mm;
	walk_page_range(0, TASK_SIZE, &walk);

out_up:
	up_write(&mm->mmap_sem);
	return rc;
}
EXPORT_SYMBOL_GPL(s390_enable_skey);

/*
 * Reset CMMA state, make all pages stable again.
 */
static int __s390_reset_cmma(pte_t *pte, unsigned long addr,
			     unsigned long next, struct mm_walk *walk)
{
	pgste_t pgste;

	pgste = pgste_get_lock(pte);
	pgste_val(pgste) &= ~_PGSTE_GPS_USAGE_MASK;
	pgste_set_unlock(pte, pgste);
	return 0;
}

void s390_reset_cmma(struct mm_struct *mm)
{
	struct mm_walk walk = { .pte_entry = __s390_reset_cmma };

	down_write(&mm->mmap_sem);
	walk.mm = mm;
	walk_page_range(0, TASK_SIZE, &walk);
	up_write(&mm->mmap_sem);
}
EXPORT_SYMBOL_GPL(s390_reset_cmma);

/*
 * Test and reset if a guest page is dirty
 */
bool gmap_test_and_clear_dirty(unsigned long address, struct gmap *gmap)
{
	pte_t *pte;
	spinlock_t *ptl;
	bool dirty = false;

	pte = get_locked_pte(gmap->mm, address, &ptl);
	if (unlikely(!pte))
		return false;

	if (ptep_test_and_clear_user_dirty(gmap->mm, address, pte))
		dirty = true;

	spin_unlock(ptl);
	return dirty;
}
EXPORT_SYMBOL_GPL(gmap_test_and_clear_dirty);

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address,
			   pmd_t *pmdp)
{
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	/* No need to flush the TLB:
	 * on s390 reference bits are in the storage key and never in the TLB */
	return pmdp_test_and_clear_young(vma, address, pmdp);
}

int pmdp_set_access_flags(struct vm_area_struct *vma,
			  unsigned long address, pmd_t *pmdp,
			  pmd_t entry, int dirty)
{
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);

	entry = pmd_mkyoung(entry);
	if (dirty)
		entry = pmd_mkdirty(entry);
	if (pmd_same(*pmdp, entry))
		return 0;
	pmdp_invalidate(vma, address, pmdp);
	set_pmd_at(vma->vm_mm, address, pmdp, entry);
	return 1;
}

static void pmdp_splitting_flush_sync(void *arg)
{
	/* Simply deliver the interrupt */
}

void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
			  pmd_t *pmdp)
{
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	if (!test_and_set_bit(_SEGMENT_ENTRY_SPLIT_BIT,
			      (unsigned long *) pmdp)) {
		/* need to serialize against gup-fast (IRQ disabled) */
		smp_call_function(pmdp_splitting_flush_sync, NULL, 1);
	}
}

void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
				pgtable_t pgtable)
{
	struct list_head *lh = (struct list_head *) pgtable;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	if (!pmd_huge_pte(mm, pmdp))
		INIT_LIST_HEAD(lh);
	else
		list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
	pmd_huge_pte(mm, pmdp) = pgtable;
}

pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
	struct list_head *lh;
	pgtable_t pgtable;
	pte_t *ptep;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	pgtable = pmd_huge_pte(mm, pmdp);
	lh = (struct list_head *) pgtable;
	if (list_empty(lh))
		pmd_huge_pte(mm, pmdp) = NULL;
	else {
		pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
		list_del(lh);
	}
	ptep = (pte_t *) pgtable;
	pte_val(*ptep) = _PAGE_INVALID;
	ptep++;
	pte_val(*ptep) = _PAGE_INVALID;
	return pgtable;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */