/*
 * Copyright IBM Corp. 2007, 2011
 * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
 */

#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/spinlock.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/swapops.h>
#include <linux/sysctl.h>
#include <linux/ksm.h>
#include <linux/mman.h>

#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>

unsigned long *crst_table_alloc(struct mm_struct *mm)
{
	struct page *page = alloc_pages(GFP_KERNEL, 2);

	if (!page)
		return NULL;
	return (unsigned long *) page_to_phys(page);
}

void crst_table_free(struct mm_struct *mm, unsigned long *table)
{
	free_pages((unsigned long) table, 2);
}

static void __crst_table_upgrade(void *arg)
{
	struct mm_struct *mm = arg;

	if (current->active_mm == mm) {
		clear_user_asce();
		set_user_asce(mm);
	}
	__tlb_flush_local();
}

int crst_table_upgrade(struct mm_struct *mm, unsigned long limit)
{
	unsigned long *table, *pgd;
	unsigned long entry;
	int flush;

	BUG_ON(limit > TASK_MAX_SIZE);
	flush = 0;
repeat:
	table = crst_table_alloc(mm);
	if (!table)
		return -ENOMEM;
	spin_lock_bh(&mm->page_table_lock);
	if (mm->context.asce_limit < limit) {
		pgd = (unsigned long *) mm->pgd;
		if (mm->context.asce_limit <= (1UL << 31)) {
			entry = _REGION3_ENTRY_EMPTY;
			mm->context.asce_limit = 1UL << 42;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION3;
		} else {
			entry = _REGION2_ENTRY_EMPTY;
			mm->context.asce_limit = 1UL << 53;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION2;
		}
		crst_table_init(table, entry);
		pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd);
		mm->pgd = (pgd_t *) table;
		mm->task_size = mm->context.asce_limit;
		table = NULL;
		flush = 1;
	}
	spin_unlock_bh(&mm->page_table_lock);
	if (table)
		crst_table_free(mm, table);
	if (mm->context.asce_limit < limit)
		goto repeat;
	if (flush)
		on_each_cpu(__crst_table_upgrade, mm, 0);
	return 0;
}

void crst_table_downgrade(struct mm_struct *mm, unsigned long limit)
{
	pgd_t *pgd;

	if (current->active_mm == mm) {
		clear_user_asce();
		__tlb_flush_mm(mm);
	}
	while (mm->context.asce_limit > limit) {
		pgd = mm->pgd;
		switch (pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) {
		case _REGION_ENTRY_TYPE_R2:
			mm->context.asce_limit = 1UL << 42;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION3;
			break;
		case _REGION_ENTRY_TYPE_R3:
			mm->context.asce_limit = 1UL << 31;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_SEGMENT;
			break;
		default:
			BUG();
		}
		mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
		mm->task_size = mm->context.asce_limit;
		crst_table_free(mm, (unsigned long *) pgd);
	}
	if (current->active_mm == mm)
		set_user_asce(mm);
}
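
/*
 * Sketch of the address space limits handled by crst_table_upgrade() and
 * crst_table_downgrade() above; the asce_limit values map to the top-level
 * table types as follows:
 *
 *	asce_limit	top-level table		_ASCE_TYPE_*
 *	1UL << 31	segment table		_ASCE_TYPE_SEGMENT
 *	1UL << 42	region-third table	_ASCE_TYPE_REGION3
 *	1UL << 53	region-second table	_ASCE_TYPE_REGION2
 *
 * Hypothetical caller sketch (the mmap path is assumed, not shown in this
 * file): a mapping request above the current limit would trigger an upgrade:
 *
 *	if (addr + len > mm->context.asce_limit &&
 *	    crst_table_upgrade(mm, addr + len))
 *		return -ENOMEM;
 */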

#ifdef CONFIG_PGSTE

/**
 * gmap_alloc - allocate a guest address space
 * @mm: pointer to the parent mm_struct
 * @limit: maximum address of the gmap address space
 *
 * Returns a guest address space structure.
 */
struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit)
{
	struct gmap *gmap;
	struct page *page;
	unsigned long *table;
	unsigned long etype, atype;

	if (limit < (1UL << 31)) {
		limit = (1UL << 31) - 1;
		atype = _ASCE_TYPE_SEGMENT;
		etype = _SEGMENT_ENTRY_EMPTY;
	} else if (limit < (1UL << 42)) {
		limit = (1UL << 42) - 1;
		atype = _ASCE_TYPE_REGION3;
		etype = _REGION3_ENTRY_EMPTY;
	} else if (limit < (1UL << 53)) {
		limit = (1UL << 53) - 1;
		atype = _ASCE_TYPE_REGION2;
		etype = _REGION2_ENTRY_EMPTY;
	} else {
		limit = -1UL;
		atype = _ASCE_TYPE_REGION1;
		etype = _REGION1_ENTRY_EMPTY;
	}
	gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL);
	if (!gmap)
		goto out;
	INIT_LIST_HEAD(&gmap->crst_list);
	INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL);
	INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC);
	spin_lock_init(&gmap->guest_table_lock);
	gmap->mm = mm;
	page = alloc_pages(GFP_KERNEL, 2);
	if (!page)
		goto out_free;
	page->index = 0;
	list_add(&page->lru, &gmap->crst_list);
	table = (unsigned long *) page_to_phys(page);
	crst_table_init(table, etype);
	gmap->table = table;
	gmap->asce = atype | _ASCE_TABLE_LENGTH |
		_ASCE_USER_BITS | __pa(table);
	gmap->asce_end = limit;
	down_write(&mm->mmap_sem);
	list_add(&gmap->list, &mm->context.gmap_list);
	up_write(&mm->mmap_sem);
	return gmap;

out_free:
	kfree(gmap);
out:
	return NULL;
}
EXPORT_SYMBOL_GPL(gmap_alloc);

static void gmap_flush_tlb(struct gmap *gmap)
{
	if (MACHINE_HAS_IDTE)
		__tlb_flush_asce(gmap->mm, gmap->asce);
	else
		__tlb_flush_global();
}

static void gmap_radix_tree_free(struct radix_tree_root *root)
{
	struct radix_tree_iter iter;
	unsigned long indices[16];
	unsigned long index;
	void **slot;
	int i, nr;

	/* A radix tree is freed by deleting all of its entries */
	index = 0;
	do {
		nr = 0;
		radix_tree_for_each_slot(slot, root, &iter, index) {
			indices[nr] = iter.index;
			if (++nr == 16)
				break;
		}
		for (i = 0; i < nr; i++) {
			index = indices[i];
			radix_tree_delete(root, index);
		}
	} while (nr > 0);
}

/**
 * gmap_free - free a guest address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_free(struct gmap *gmap)
{
	struct page *page, *next;

	/* Flush tlb. */
	if (MACHINE_HAS_IDTE)
		__tlb_flush_asce(gmap->mm, gmap->asce);
	else
		__tlb_flush_global();

	/* Free all segment & region tables. */
	list_for_each_entry_safe(page, next, &gmap->crst_list, lru)
		__free_pages(page, 2);
	gmap_radix_tree_free(&gmap->guest_to_host);
	gmap_radix_tree_free(&gmap->host_to_guest);
	down_write(&gmap->mm->mmap_sem);
	list_del(&gmap->list);
	up_write(&gmap->mm->mmap_sem);
	kfree(gmap);
}
EXPORT_SYMBOL_GPL(gmap_free);

/**
 * gmap_enable - switch primary space to the guest address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_enable(struct gmap *gmap)
{
	S390_lowcore.gmap = (unsigned long) gmap;
}
EXPORT_SYMBOL_GPL(gmap_enable);

/**
 * gmap_disable - switch back to the standard primary address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_disable(struct gmap *gmap)
{
	S390_lowcore.gmap = 0UL;
}
EXPORT_SYMBOL_GPL(gmap_disable);
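
/*
 * Hypothetical usage sketch: a consumer such as a KVM-like hypervisor
 * backend creates a gmap for the current process, makes it the active guest
 * address space around guest execution, and tears it down again. The limit
 * below is illustrative only.
 *
 *	struct gmap *gmap;
 *
 *	gmap = gmap_alloc(current->mm, (1UL << 44) - 1);
 *	if (!gmap)
 *		return -ENOMEM;
 *	gmap_enable(gmap);
 *	... run guest code with this address space ...
 *	gmap_disable(gmap);
 *	gmap_free(gmap);
 */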

/*
 * gmap_alloc_table is assumed to be called with mmap_sem held
 */
static int gmap_alloc_table(struct gmap *gmap, unsigned long *table,
			    unsigned long init, unsigned long gaddr)
{
	struct page *page;
	unsigned long *new;

	/* since we don't free the gmap table until gmap_free we can unlock */
	page = alloc_pages(GFP_KERNEL, 2);
	if (!page)
		return -ENOMEM;
	new = (unsigned long *) page_to_phys(page);
	crst_table_init(new, init);
	spin_lock(&gmap->mm->page_table_lock);
	if (*table & _REGION_ENTRY_INVALID) {
		list_add(&page->lru, &gmap->crst_list);
		*table = (unsigned long) new | _REGION_ENTRY_LENGTH |
			(*table & _REGION_ENTRY_TYPE_MASK);
		page->index = gaddr;
		page = NULL;
	}
	spin_unlock(&gmap->mm->page_table_lock);
	if (page)
		__free_pages(page, 2);
	return 0;
}

/**
 * __gmap_segment_gaddr - find virtual address from segment pointer
 * @entry: pointer to a segment table entry in the guest address space
 *
 * Returns the virtual address in the guest address space for the segment
 */
static unsigned long __gmap_segment_gaddr(unsigned long *entry)
{
	struct page *page;
	unsigned long offset, mask;

	offset = (unsigned long) entry / sizeof(unsigned long);
	offset = (offset & (PTRS_PER_PMD - 1)) * PMD_SIZE;
	mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1);
	page = virt_to_page((void *)((unsigned long) entry & mask));
	return page->index + offset;
}

/**
 * __gmap_unlink_by_vmaddr - unlink a single segment via a host address
 * @gmap: pointer to the guest address space structure
 * @vmaddr: address in the host process address space
 *
 * Returns 1 if a TLB flush is required
 */
static int __gmap_unlink_by_vmaddr(struct gmap *gmap, unsigned long vmaddr)
{
	unsigned long *entry;
	int flush = 0;

	spin_lock(&gmap->guest_table_lock);
	entry = radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT);
	if (entry) {
		flush = (*entry != _SEGMENT_ENTRY_INVALID);
		*entry = _SEGMENT_ENTRY_INVALID;
	}
	spin_unlock(&gmap->guest_table_lock);
	return flush;
}

/**
 * __gmap_unmap_by_gaddr - unmap a single segment via a guest address
 * @gmap: pointer to the guest address space structure
 * @gaddr: address in the guest address space
 *
 * Returns 1 if a TLB flush is required
 */
static int __gmap_unmap_by_gaddr(struct gmap *gmap, unsigned long gaddr)
{
	unsigned long vmaddr;

	vmaddr = (unsigned long) radix_tree_delete(&gmap->guest_to_host,
						   gaddr >> PMD_SHIFT);
	return vmaddr ? __gmap_unlink_by_vmaddr(gmap, vmaddr) : 0;
}

/**
 * gmap_unmap_segment - unmap segment from the guest address space
 * @gmap: pointer to the guest address space structure
 * @to: address in the guest address space
 * @len: length of the memory area to unmap
 *
 * Returns 0 if the unmap succeeded, -EINVAL if not.
 */
int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len)
{
	unsigned long off;
	int flush;

	if ((to | len) & (PMD_SIZE - 1))
		return -EINVAL;
	if (len == 0 || to + len < to)
		return -EINVAL;

	flush = 0;
	down_write(&gmap->mm->mmap_sem);
	for (off = 0; off < len; off += PMD_SIZE)
		flush |= __gmap_unmap_by_gaddr(gmap, to + off);
	up_write(&gmap->mm->mmap_sem);
	if (flush)
		gmap_flush_tlb(gmap);
	return 0;
}
EXPORT_SYMBOL_GPL(gmap_unmap_segment);
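
/*
 * Sketch of the bookkeeping used by the helpers above: both radix trees work
 * at segment granularity (PMD_SIZE, 1 MB on s390), i.e. keys are addresses
 * shifted right by PMD_SHIFT.
 *
 *	guest_to_host:	gaddr >> PMD_SHIFT  -> host segment address (vmaddr)
 *	host_to_guest:	vmaddr >> PMD_SHIFT -> pointer to the gmap segment
 *					       table entry for that vmaddr
 *
 * __gmap_segment_gaddr() recovers the guest address from such a segment
 * table entry pointer via page->index of the crst page, which gmap_alloc()
 * and gmap_alloc_table() set to the guest address covered by that table.
 */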

/**
 * gmap_map_segment - map a segment to the guest address space
 * @gmap: pointer to the guest address space structure
 * @from: source address in the parent address space
 * @to: target address in the guest address space
 * @len: length of the memory area to map
 *
 * Returns 0 if the mmap succeeded, -EINVAL or -ENOMEM if not.
 */
int gmap_map_segment(struct gmap *gmap, unsigned long from,
		     unsigned long to, unsigned long len)
{
	unsigned long off;
	int flush;

	if ((from | to | len) & (PMD_SIZE - 1))
		return -EINVAL;
	if (len == 0 || from + len < from || to + len < to ||
	    from + len - 1 > TASK_MAX_SIZE || to + len - 1 > gmap->asce_end)
		return -EINVAL;

	flush = 0;
	down_write(&gmap->mm->mmap_sem);
	for (off = 0; off < len; off += PMD_SIZE) {
		/* Remove old translation */
		flush |= __gmap_unmap_by_gaddr(gmap, to + off);
		/* Store new translation */
		if (radix_tree_insert(&gmap->guest_to_host,
				      (to + off) >> PMD_SHIFT,
				      (void *) from + off))
			break;
	}
	up_write(&gmap->mm->mmap_sem);
	if (flush)
		gmap_flush_tlb(gmap);
	if (off >= len)
		return 0;
	gmap_unmap_segment(gmap, to, len);
	return -ENOMEM;
}
EXPORT_SYMBOL_GPL(gmap_map_segment);

/**
 * __gmap_translate - translate a guest address to a user space address
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: guest address
 *
 * Returns user space address which corresponds to the guest address or
 * -EFAULT if no such mapping exists.
 * This function does not establish potentially missing page table entries.
 * The mmap_sem of the mm that belongs to the address space must be held
 * when this function gets called.
 */
unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr)
{
	unsigned long vmaddr;

	vmaddr = (unsigned long)
		radix_tree_lookup(&gmap->guest_to_host, gaddr >> PMD_SHIFT);
	return vmaddr ? (vmaddr | (gaddr & ~PMD_MASK)) : -EFAULT;
}
EXPORT_SYMBOL_GPL(__gmap_translate);

/**
 * gmap_translate - translate a guest address to a user space address
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: guest address
 *
 * Returns user space address which corresponds to the guest address or
 * -EFAULT if no such mapping exists.
 * This function does not establish potentially missing page table entries.
 */
unsigned long gmap_translate(struct gmap *gmap, unsigned long gaddr)
{
	unsigned long rc;

	down_read(&gmap->mm->mmap_sem);
	rc = __gmap_translate(gmap, gaddr);
	up_read(&gmap->mm->mmap_sem);
	return rc;
}
EXPORT_SYMBOL_GPL(gmap_translate);
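
/*
 * Hypothetical usage sketch: map one segment of the parent process at guest
 * address 0 and translate a guest address back to user space. All addresses
 * passed to gmap_map_segment() must be PMD_SIZE aligned; the values below
 * are illustrative only.
 *
 *	unsigned long host_area, user_addr;
 *
 *	host_area = ...;	PMD_SIZE aligned area in the parent process
 *	rc = gmap_map_segment(gmap, host_area, 0x0UL, PMD_SIZE);
 *	if (rc)
 *		goto fail;
 *	user_addr = gmap_translate(gmap, 0x1000UL);
 *	if (IS_ERR_VALUE(user_addr))
 *		goto fail;	no mapping for that guest address
 */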

/**
 * gmap_unlink - disconnect a page table from the gmap shadow tables
 * @mm: pointer to the parent mm_struct
 * @table: pointer to the host page table
 * @vmaddr: vm address associated with the host page table
 */
static void gmap_unlink(struct mm_struct *mm, unsigned long *table,
			unsigned long vmaddr)
{
	struct gmap *gmap;
	int flush;

	list_for_each_entry(gmap, &mm->context.gmap_list, list) {
		flush = __gmap_unlink_by_vmaddr(gmap, vmaddr);
		if (flush)
			gmap_flush_tlb(gmap);
	}
}

/**
 * __gmap_link - set up shadow page tables to connect a host to a guest address
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: guest address
 * @vmaddr: vm address
 *
 * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT
 * if the vm address is already mapped to a different guest segment.
 * The mmap_sem of the mm that belongs to the address space must be held
 * when this function gets called.
 */
int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
{
	struct mm_struct *mm;
	unsigned long *table;
	spinlock_t *ptl;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	int rc;

	/* Create higher level tables in the gmap page table */
	table = gmap->table;
	if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION1) {
		table += (gaddr >> 53) & 0x7ff;
		if ((*table & _REGION_ENTRY_INVALID) &&
		    gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY,
				     gaddr & 0xffe0000000000000UL))
			return -ENOMEM;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	}
	if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION2) {
		table += (gaddr >> 42) & 0x7ff;
		if ((*table & _REGION_ENTRY_INVALID) &&
		    gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY,
				     gaddr & 0xfffffc0000000000UL))
			return -ENOMEM;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	}
	if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION3) {
		table += (gaddr >> 31) & 0x7ff;
		if ((*table & _REGION_ENTRY_INVALID) &&
		    gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY,
				     gaddr & 0xffffffff80000000UL))
			return -ENOMEM;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	}
	table += (gaddr >> 20) & 0x7ff;
	/* Walk the parent mm page table */
	mm = gmap->mm;
	pgd = pgd_offset(mm, vmaddr);
	VM_BUG_ON(pgd_none(*pgd));
	pud = pud_offset(pgd, vmaddr);
	VM_BUG_ON(pud_none(*pud));
	pmd = pmd_offset(pud, vmaddr);
	VM_BUG_ON(pmd_none(*pmd));
	/* large pmds cannot yet be handled */
	if (pmd_large(*pmd))
		return -EFAULT;
	/* Link gmap segment table entry location to page table. */
	rc = radix_tree_preload(GFP_KERNEL);
	if (rc)
		return rc;
	ptl = pmd_lock(mm, pmd);
	spin_lock(&gmap->guest_table_lock);
	if (*table == _SEGMENT_ENTRY_INVALID) {
		rc = radix_tree_insert(&gmap->host_to_guest,
				       vmaddr >> PMD_SHIFT, table);
		if (!rc)
			*table = pmd_val(*pmd);
	} else
		rc = 0;
	spin_unlock(&gmap->guest_table_lock);
	spin_unlock(ptl);
	radix_tree_preload_end();
	return rc;
}
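
/*
 * Worked example for the table walk in __gmap_link() above: with a
 * region-first table as top level, a guest address is decomposed into four
 * 11-bit table indexes plus the byte offset within the segment:
 *
 *	gaddr bits 63..53 -> region-first index:  (gaddr >> 53) & 0x7ff
 *	gaddr bits 52..42 -> region-second index: (gaddr >> 42) & 0x7ff
 *	gaddr bits 41..31 -> region-third index:  (gaddr >> 31) & 0x7ff
 *	gaddr bits 30..20 -> segment index:       (gaddr >> 20) & 0x7ff
 *
 * For a gmap created with a smaller limit the walk starts at the
 * corresponding lower table level, which is what the _ASCE_TYPE_MASK
 * comparisons select.
 */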

/**
 * gmap_fault - resolve a fault on a guest address
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: guest address
 * @fault_flags: flags to pass down to handle_mm_fault()
 *
 * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT
 * if the vm address is already mapped to a different guest segment.
 */
int gmap_fault(struct gmap *gmap, unsigned long gaddr,
	       unsigned int fault_flags)
{
	unsigned long vmaddr;
	int rc;
	bool unlocked;

	down_read(&gmap->mm->mmap_sem);

retry:
	unlocked = false;
	vmaddr = __gmap_translate(gmap, gaddr);
	if (IS_ERR_VALUE(vmaddr)) {
		rc = vmaddr;
		goto out_up;
	}
	if (fixup_user_fault(current, gmap->mm, vmaddr, fault_flags,
			     &unlocked)) {
		rc = -EFAULT;
		goto out_up;
	}
	/*
	 * In case fixup_user_fault unlocked the mmap_sem during the fault-in,
	 * redo __gmap_translate to not race with a map/unmap_segment.
	 */
	if (unlocked)
		goto retry;

	rc = __gmap_link(gmap, gaddr, vmaddr);
out_up:
	up_read(&gmap->mm->mmap_sem);
	return rc;
}
EXPORT_SYMBOL_GPL(gmap_fault);
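
/*
 * Hypothetical usage sketch: a guest access fault handler could resolve a
 * guest write fault like this (the flag choice is illustrative):
 *
 *	rc = gmap_fault(gmap, gaddr, FAULT_FLAG_WRITE);
 *	if (rc == -EFAULT)
 *		...	report an addressing exception to the guest
 *	else if (rc)
 *		...	propagate -ENOMEM or other errors
 */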

static void gmap_zap_swap_entry(swp_entry_t entry, struct mm_struct *mm)
{
	if (!non_swap_entry(entry))
		dec_mm_counter(mm, MM_SWAPENTS);
	else if (is_migration_entry(entry)) {
		struct page *page = migration_entry_to_page(entry);

		dec_mm_counter(mm, mm_counter(page));
	}
	free_swap_and_cache(entry);
}

/*
 * this function is assumed to be called with mmap_sem held
 */
void __gmap_zap(struct gmap *gmap, unsigned long gaddr)
{
	unsigned long vmaddr, ptev, pgstev;
	pte_t *ptep, pte;
	spinlock_t *ptl;
	pgste_t pgste;

	/* Find the vm address for the guest address */
	vmaddr = (unsigned long) radix_tree_lookup(&gmap->guest_to_host,
						   gaddr >> PMD_SHIFT);
	if (!vmaddr)
		return;
	vmaddr |= gaddr & ~PMD_MASK;
	/* Get pointer to the page table entry */
	ptep = get_locked_pte(gmap->mm, vmaddr, &ptl);
	if (unlikely(!ptep))
		return;
	pte = *ptep;
	if (!pte_swap(pte))
		goto out_pte;
	/* Zap unused and logically-zero pages */
	pgste = pgste_get_lock(ptep);
	pgstev = pgste_val(pgste);
	ptev = pte_val(pte);
	if (((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED) ||
	    ((pgstev & _PGSTE_GPS_ZERO) && (ptev & _PAGE_INVALID))) {
		gmap_zap_swap_entry(pte_to_swp_entry(pte), gmap->mm);
		pte_clear(gmap->mm, vmaddr, ptep);
	}
	pgste_set_unlock(ptep, pgste);
out_pte:
	pte_unmap_unlock(ptep, ptl);
}
EXPORT_SYMBOL_GPL(__gmap_zap);

void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to)
{
	unsigned long gaddr, vmaddr, size;
	struct vm_area_struct *vma;

	down_read(&gmap->mm->mmap_sem);
	for (gaddr = from; gaddr < to;
	     gaddr = (gaddr + PMD_SIZE) & PMD_MASK) {
		/* Find the vm address for the guest address */
		vmaddr = (unsigned long)
			radix_tree_lookup(&gmap->guest_to_host,
					  gaddr >> PMD_SHIFT);
		if (!vmaddr)
			continue;
		vmaddr |= gaddr & ~PMD_MASK;
		/* Find vma in the parent mm */
		vma = find_vma(gmap->mm, vmaddr);
		size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK));
		zap_page_range(vma, vmaddr, size, NULL);
	}
	up_read(&gmap->mm->mmap_sem);
}
EXPORT_SYMBOL_GPL(gmap_discard);

static LIST_HEAD(gmap_notifier_list);
static DEFINE_SPINLOCK(gmap_notifier_lock);

/**
 * gmap_register_ipte_notifier - register a pte invalidation callback
 * @nb: pointer to the gmap notifier block
 */
void gmap_register_ipte_notifier(struct gmap_notifier *nb)
{
	spin_lock(&gmap_notifier_lock);
	list_add(&nb->list, &gmap_notifier_list);
	spin_unlock(&gmap_notifier_lock);
}
EXPORT_SYMBOL_GPL(gmap_register_ipte_notifier);

/**
 * gmap_unregister_ipte_notifier - remove a pte invalidation callback
 * @nb: pointer to the gmap notifier block
 */
void gmap_unregister_ipte_notifier(struct gmap_notifier *nb)
{
	spin_lock(&gmap_notifier_lock);
	list_del_init(&nb->list);
	spin_unlock(&gmap_notifier_lock);
}
EXPORT_SYMBOL_GPL(gmap_unregister_ipte_notifier);

/**
 * gmap_ipte_notify - mark a range of ptes for invalidation notification
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: virtual address in the guest address space
 * @len: size of area
 *
 * Returns 0 if for each page in the given range a gmap mapping exists and
 * the invalidation notification could be set. If the gmap mapping is missing
 * for one or more pages -EFAULT is returned. If no memory could be allocated
 * -ENOMEM is returned. This function establishes missing page table entries.
 */
int gmap_ipte_notify(struct gmap *gmap, unsigned long gaddr, unsigned long len)
{
	unsigned long addr;
	spinlock_t *ptl;
	pte_t *ptep, entry;
	pgste_t pgste;
	bool unlocked;
	int rc = 0;

	if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK))
		return -EINVAL;
	down_read(&gmap->mm->mmap_sem);
	while (len) {
		unlocked = false;
		/* Convert gmap address and connect the page tables */
		addr = __gmap_translate(gmap, gaddr);
		if (IS_ERR_VALUE(addr)) {
			rc = addr;
			break;
		}
		/* Get the page mapped */
		if (fixup_user_fault(current, gmap->mm, addr, FAULT_FLAG_WRITE,
				     &unlocked)) {
			rc = -EFAULT;
			break;
		}
		/* While trying to map mmap_sem got unlocked. Let us retry */
		if (unlocked)
			continue;
		rc = __gmap_link(gmap, gaddr, addr);
		if (rc)
			break;
		/* Walk the process page table, lock and get pte pointer */
		ptep = get_locked_pte(gmap->mm, addr, &ptl);
		VM_BUG_ON(!ptep);
		/* Set notification bit in the pgste of the pte */
		entry = *ptep;
		if ((pte_val(entry) & (_PAGE_INVALID | _PAGE_PROTECT)) == 0) {
			pgste = pgste_get_lock(ptep);
			pgste_val(pgste) |= PGSTE_IN_BIT;
			pgste_set_unlock(ptep, pgste);
			gaddr += PAGE_SIZE;
			len -= PAGE_SIZE;
		}
		pte_unmap_unlock(ptep, ptl);
	}
	up_read(&gmap->mm->mmap_sem);
	return rc;
}
EXPORT_SYMBOL_GPL(gmap_ipte_notify);

/**
 * gmap_do_ipte_notify - call all invalidation callbacks for a specific pte.
 * @mm: pointer to the process mm_struct
 * @vmaddr: virtual address in the process address space
 * @pte: pointer to the page table entry
 *
 * This function is assumed to be called with the page table lock held
 * for the pte to notify.
 */
void gmap_do_ipte_notify(struct mm_struct *mm, unsigned long vmaddr, pte_t *pte)
{
	unsigned long offset, gaddr;
	unsigned long *table;
	struct gmap_notifier *nb;
	struct gmap *gmap;

	offset = ((unsigned long) pte) & (255 * sizeof(pte_t));
	offset = offset * (4096 / sizeof(pte_t));
	spin_lock(&gmap_notifier_lock);
	list_for_each_entry(gmap, &mm->context.gmap_list, list) {
		table = radix_tree_lookup(&gmap->host_to_guest,
					  vmaddr >> PMD_SHIFT);
		if (!table)
			continue;
		gaddr = __gmap_segment_gaddr(table) + offset;
		list_for_each_entry(nb, &gmap_notifier_list, list)
			nb->notifier_call(gmap, gaddr);
	}
	spin_unlock(&gmap_notifier_lock);
}
EXPORT_SYMBOL_GPL(gmap_do_ipte_notify);
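
/*
 * Hypothetical usage sketch: a consumer that wants to be told when the host
 * invalidates guest-mapped ptes registers a notifier and arms the pages it
 * cares about. The callback name below is illustrative only.
 *
 *	static void my_gmap_notifier(struct gmap *gmap, unsigned long gaddr)
 *	{
 *		... react to the invalidation of the page at gaddr ...
 *	}
 *
 *	static struct gmap_notifier my_nb = {
 *		.notifier_call = my_gmap_notifier,
 *	};
 *
 *	gmap_register_ipte_notifier(&my_nb);
 *	rc = gmap_ipte_notify(gmap, gaddr & PAGE_MASK, PAGE_SIZE);
 *
 * gmap_do_ipte_notify() is then invoked from the pte invalidation paths and
 * calls every registered notifier with the affected guest address.
 */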

int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
			  unsigned long key, bool nq)
{
	spinlock_t *ptl;
	pgste_t old, new;
	pte_t *ptep;
	bool unlocked;

	down_read(&mm->mmap_sem);
retry:
	unlocked = false;
	ptep = get_locked_pte(mm, addr, &ptl);
	if (unlikely(!ptep)) {
		up_read(&mm->mmap_sem);
		return -EFAULT;
	}
	if (!(pte_val(*ptep) & _PAGE_INVALID) &&
	     (pte_val(*ptep) & _PAGE_PROTECT)) {
		pte_unmap_unlock(ptep, ptl);
		/*
		 * We do not really care about unlocked. We will retry either
		 * way. But this allows fixup_user_fault to enable userfaultfd.
		 */
		if (fixup_user_fault(current, mm, addr, FAULT_FLAG_WRITE,
				     &unlocked)) {
			up_read(&mm->mmap_sem);
			return -EFAULT;
		}
		goto retry;
	}

	new = old = pgste_get_lock(ptep);
	pgste_val(new) &= ~(PGSTE_GR_BIT | PGSTE_GC_BIT |
			    PGSTE_ACC_BITS | PGSTE_FP_BIT);
	pgste_val(new) |= (key & (_PAGE_CHANGED | _PAGE_REFERENCED)) << 48;
	pgste_val(new) |= (key & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56;
	if (!(pte_val(*ptep) & _PAGE_INVALID)) {
		unsigned long address, bits, skey;

		address = pte_val(*ptep) & PAGE_MASK;
		skey = (unsigned long) page_get_storage_key(address);
		bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED);
		skey = key & (_PAGE_ACC_BITS | _PAGE_FP_BIT);
		/* Set storage key ACC and FP */
		page_set_storage_key(address, skey, !nq);
		/* Merge host changed & referenced into pgste */
		pgste_val(new) |= bits << 52;
	}
	/* changing the guest storage key is considered a change of the page */
	if ((pgste_val(new) ^ pgste_val(old)) &
	    (PGSTE_ACC_BITS | PGSTE_FP_BIT | PGSTE_GR_BIT | PGSTE_GC_BIT))
		pgste_val(new) |= PGSTE_UC_BIT;

	pgste_set_unlock(ptep, new);
	pte_unmap_unlock(ptep, ptl);
	up_read(&mm->mmap_sem);
	return 0;
}
EXPORT_SYMBOL(set_guest_storage_key);

unsigned long get_guest_storage_key(struct mm_struct *mm, unsigned long addr)
{
	spinlock_t *ptl;
	pgste_t pgste;
	pte_t *ptep;
	uint64_t physaddr;
	unsigned long key = 0;

	down_read(&mm->mmap_sem);
	ptep = get_locked_pte(mm, addr, &ptl);
	if (unlikely(!ptep)) {
		up_read(&mm->mmap_sem);
		return -EFAULT;
	}
	pgste = pgste_get_lock(ptep);

	if (pte_val(*ptep) & _PAGE_INVALID) {
		key |= (pgste_val(pgste) & PGSTE_ACC_BITS) >> 56;
		key |= (pgste_val(pgste) & PGSTE_FP_BIT) >> 56;
		key |= (pgste_val(pgste) & PGSTE_GR_BIT) >> 48;
		key |= (pgste_val(pgste) & PGSTE_GC_BIT) >> 48;
	} else {
		physaddr = pte_val(*ptep) & PAGE_MASK;
		key = page_get_storage_key(physaddr);

		/* Reflect guest's logical view, not physical */
		if (pgste_val(pgste) & PGSTE_GR_BIT)
			key |= _PAGE_REFERENCED;
		if (pgste_val(pgste) & PGSTE_GC_BIT)
			key |= _PAGE_CHANGED;
	}

	pgste_set_unlock(ptep, pgste);
	pte_unmap_unlock(ptep, ptl);
	up_read(&mm->mmap_sem);
	return key;
}
EXPORT_SYMBOL(get_guest_storage_key);

static int page_table_allocate_pgste_min = 0;
static int page_table_allocate_pgste_max = 1;
int page_table_allocate_pgste = 0;
EXPORT_SYMBOL(page_table_allocate_pgste);

static struct ctl_table page_table_sysctl[] = {
	{
		.procname	= "allocate_pgste",
		.data		= &page_table_allocate_pgste,
		.maxlen		= sizeof(int),
		.mode		= S_IRUGO | S_IWUSR,
		.proc_handler	= proc_dointvec,
		.extra1		= &page_table_allocate_pgste_min,
		.extra2		= &page_table_allocate_pgste_max,
	},
	{ }
};

static struct ctl_table page_table_sysctl_dir[] = {
	{
		.procname	= "vm",
		.maxlen		= 0,
		.mode		= 0555,
		.child		= page_table_sysctl,
	},
	{ }
};

static int __init page_table_register_sysctl(void)
{
	return register_sysctl_table(page_table_sysctl_dir) ? 0 : -ENOMEM;
}
__initcall(page_table_register_sysctl);

#else /* CONFIG_PGSTE */

static inline void gmap_unlink(struct mm_struct *mm, unsigned long *table,
			       unsigned long vmaddr)
{
}

#endif /* CONFIG_PGSTE */
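
/*
 * Note on the allocate_pgste sysctl registered above: it shows up as
 * /proc/sys/vm/allocate_pgste and accepts 0 or 1. Presumably a VM launcher
 * sets it before starting the guest process so that page_table_alloc()
 * below hands out full 4K page tables with PGSTEs from the start, e.g.
 * (hypothetical user space sketch):
 *
 *	echo 1 > /proc/sys/vm/allocate_pgste
 */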

static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
{
	unsigned int old, new;

	do {
		old = atomic_read(v);
		new = old ^ bits;
	} while (atomic_cmpxchg(v, old, new) != old);
	return new;
}

/*
 * page table entry allocation/free routines.
 */
unsigned long *page_table_alloc(struct mm_struct *mm)
{
	unsigned long *table;
	struct page *page;
	unsigned int mask, bit;

	/* Try to get a fragment of a 4K page as a 2K page table */
	if (!mm_alloc_pgste(mm)) {
		table = NULL;
		spin_lock_bh(&mm->context.list_lock);
		if (!list_empty(&mm->context.pgtable_list)) {
			page = list_first_entry(&mm->context.pgtable_list,
						struct page, lru);
			mask = atomic_read(&page->_mapcount);
			mask = (mask | (mask >> 4)) & 3;
			if (mask != 3) {
				table = (unsigned long *) page_to_phys(page);
				bit = mask & 1;		/* =1 -> second 2K */
				if (bit)
					table += PTRS_PER_PTE;
				atomic_xor_bits(&page->_mapcount, 1U << bit);
				list_del(&page->lru);
			}
		}
		spin_unlock_bh(&mm->context.list_lock);
		if (table)
			return table;
	}
	/* Allocate a fresh page */
	page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
	if (!page)
		return NULL;
	if (!pgtable_page_ctor(page)) {
		__free_page(page);
		return NULL;
	}
	/* Initialize page table */
	table = (unsigned long *) page_to_phys(page);
	if (mm_alloc_pgste(mm)) {
		/* Return 4K page table with PGSTEs */
		atomic_set(&page->_mapcount, 3);
		clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);
		clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2);
	} else {
		/* Return the first 2K fragment of the page */
		atomic_set(&page->_mapcount, 1);
		clear_table(table, _PAGE_INVALID, PAGE_SIZE);
		spin_lock_bh(&mm->context.list_lock);
		list_add(&page->lru, &mm->context.pgtable_list);
		spin_unlock_bh(&mm->context.list_lock);
	}
	return table;
}

void page_table_free(struct mm_struct *mm, unsigned long *table)
{
	struct page *page;
	unsigned int bit, mask;

	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	if (!mm_alloc_pgste(mm)) {
		/* Free 2K page table fragment of a 4K page */
		bit = (__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t));
		spin_lock_bh(&mm->context.list_lock);
		mask = atomic_xor_bits(&page->_mapcount, 1U << bit);
		if (mask & 3)
			list_add(&page->lru, &mm->context.pgtable_list);
		else
			list_del(&page->lru);
		spin_unlock_bh(&mm->context.list_lock);
		if (mask != 0)
			return;
	}

	pgtable_page_dtor(page);
	atomic_set(&page->_mapcount, -1);
	__free_page(page);
}
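
/*
 * Sketch of the page->_mapcount encoding used by the 2K fragment handling
 * above and by page_table_free_rcu()/__tlb_remove_table() below:
 *
 *	bit 0		lower 2K half is in use as a page table
 *	bit 1		upper 2K half is in use as a page table
 *	bits 4-5	the corresponding half has been handed to
 *			tlb_remove_table() and awaits the grace period
 *
 * A value of 3 marks a full 4K page table with PGSTEs. The page is only
 * really freed once all of these bits have been cleared again.
 */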

void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
			 unsigned long vmaddr)
{
	struct mm_struct *mm;
	struct page *page;
	unsigned int bit, mask;

	mm = tlb->mm;
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	if (mm_alloc_pgste(mm)) {
		gmap_unlink(mm, table, vmaddr);
		table = (unsigned long *) (__pa(table) | 3);
		tlb_remove_table(tlb, table);
		return;
	}
	bit = (__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t));
	spin_lock_bh(&mm->context.list_lock);
	mask = atomic_xor_bits(&page->_mapcount, 0x11U << bit);
	if (mask & 3)
		list_add_tail(&page->lru, &mm->context.pgtable_list);
	else
		list_del(&page->lru);
	spin_unlock_bh(&mm->context.list_lock);
	table = (unsigned long *) (__pa(table) | (1U << bit));
	tlb_remove_table(tlb, table);
}

static void __tlb_remove_table(void *_table)
{
	unsigned int mask = (unsigned long) _table & 3;
	void *table = (void *)((unsigned long) _table ^ mask);
	struct page *page = pfn_to_page(__pa(table) >> PAGE_SHIFT);

	switch (mask) {
	case 0:		/* pmd or pud */
		free_pages((unsigned long) table, 2);
		break;
	case 1:		/* lower 2K of a 4K page table */
	case 2:		/* higher 2K of a 4K page table */
		if (atomic_xor_bits(&page->_mapcount, mask << 4) != 0)
			break;
		/* fallthrough */
	case 3:		/* 4K page table with pgstes */
		pgtable_page_dtor(page);
		atomic_set(&page->_mapcount, -1);
		__free_page(page);
		break;
	}
}

static void tlb_remove_table_smp_sync(void *arg)
{
	/* Simply deliver the interrupt */
}

static void tlb_remove_table_one(void *table)
{
	/*
	 * This isn't an RCU grace period and hence the page-tables cannot be
	 * assumed to be actually RCU-freed.
	 *
	 * It is however sufficient for software page-table walkers that rely
	 * on IRQ disabling. See the comment near struct mmu_table_batch.
	 */
	smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
	__tlb_remove_table(table);
}

static void tlb_remove_table_rcu(struct rcu_head *head)
{
	struct mmu_table_batch *batch;
	int i;

	batch = container_of(head, struct mmu_table_batch, rcu);

	for (i = 0; i < batch->nr; i++)
		__tlb_remove_table(batch->tables[i]);

	free_page((unsigned long)batch);
}

void tlb_table_flush(struct mmu_gather *tlb)
{
	struct mmu_table_batch **batch = &tlb->batch;

	if (*batch) {
		call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
		*batch = NULL;
	}
}

void tlb_remove_table(struct mmu_gather *tlb, void *table)
{
	struct mmu_table_batch **batch = &tlb->batch;

	tlb->mm->context.flush_mm = 1;
	if (*batch == NULL) {
		*batch = (struct mmu_table_batch *)
			__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
		if (*batch == NULL) {
			__tlb_flush_mm_lazy(tlb->mm);
			tlb_remove_table_one(table);
			return;
		}
		(*batch)->nr = 0;
	}
	(*batch)->tables[(*batch)->nr++] = table;
	if ((*batch)->nr == MAX_TABLE_BATCH)
		tlb_flush_mmu(tlb);
}
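
/*
 * Note on the pointer tagging above: the two low bits of the table pointers
 * queued via tlb_remove_table() encode what __tlb_remove_table() has to do
 * once the grace period has passed:
 *
 *	0	4K-aligned crst table (pmd/pud/region), free the whole
 *		order-2 allocation
 *	1 / 2	lower / upper 2K page table fragment
 *	3	full 4K page table with PGSTEs
 *
 * The table addresses are at least 2K aligned, so the tag bits are always
 * zero in the untagged address.
 */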

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static inline void thp_split_vma(struct vm_area_struct *vma)
{
	unsigned long addr;

	for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE)
		follow_page(vma, addr, FOLL_SPLIT);
}

static inline void thp_split_mm(struct mm_struct *mm)
{
	struct vm_area_struct *vma;

	for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
		thp_split_vma(vma);
		vma->vm_flags &= ~VM_HUGEPAGE;
		vma->vm_flags |= VM_NOHUGEPAGE;
	}
	mm->def_flags |= VM_NOHUGEPAGE;
}
#else
static inline void thp_split_mm(struct mm_struct *mm)
{
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

/*
 * switch on pgstes for the current userspace process (used by kvm)
 */
int s390_enable_sie(void)
{
	struct mm_struct *mm = current->mm;

	/* Do we have pgstes? if yes, we are done */
	if (mm_has_pgste(mm))
		return 0;
	/* Fail if the page tables are 2K */
	if (!mm_alloc_pgste(mm))
		return -EINVAL;
	down_write(&mm->mmap_sem);
	mm->context.has_pgste = 1;
	/* split thp mappings and disable thp for future mappings */
	thp_split_mm(mm);
	up_write(&mm->mmap_sem);
	return 0;
}
EXPORT_SYMBOL_GPL(s390_enable_sie);

/*
 * Enable storage key handling from now on and initialize the storage
 * keys with the default key.
 */
static int __s390_enable_skey(pte_t *pte, unsigned long addr,
			      unsigned long next, struct mm_walk *walk)
{
	unsigned long ptev;
	pgste_t pgste;

	pgste = pgste_get_lock(pte);
	/*
	 * Remove all zero page mappings. After establishing a policy that
	 * forbids zero page mappings, subsequent faults on those pages will
	 * get fresh anonymous pages.
	 */
	if (is_zero_pfn(pte_pfn(*pte))) {
		ptep_flush_direct(walk->mm, addr, pte);
		pte_val(*pte) = _PAGE_INVALID;
	}
	/* Clear storage key */
	pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT |
			      PGSTE_GR_BIT | PGSTE_GC_BIT);
	ptev = pte_val(*pte);
	if (!(ptev & _PAGE_INVALID) && (ptev & _PAGE_WRITE))
		page_set_storage_key(ptev & PAGE_MASK, PAGE_DEFAULT_KEY, 1);
	pgste_set_unlock(pte, pgste);
	return 0;
}

int s390_enable_skey(void)
{
	struct mm_walk walk = { .pte_entry = __s390_enable_skey };
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	int rc = 0;

	down_write(&mm->mmap_sem);
	if (mm_use_skey(mm))
		goto out_up;

	mm->context.use_skey = 1;
	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		if (ksm_madvise(vma, vma->vm_start, vma->vm_end,
				MADV_UNMERGEABLE, &vma->vm_flags)) {
			mm->context.use_skey = 0;
			rc = -ENOMEM;
			goto out_up;
		}
	}
	mm->def_flags &= ~VM_MERGEABLE;

	walk.mm = mm;
	walk_page_range(0, TASK_SIZE, &walk);

out_up:
	up_write(&mm->mmap_sem);
	return rc;
}
EXPORT_SYMBOL_GPL(s390_enable_skey);
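
/*
 * Usage note (assumed callers, not defined in this file): s390_enable_sie()
 * and s390_enable_skey() operate on the current process and are typically
 * invoked by a hypervisor backend such as KVM, the former when a virtual
 * machine is created and the latter when the guest starts using storage
 * keys. Both are idempotent: they return 0 immediately if pgstes or storage
 * key handling are already enabled for the mm.
 */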

/*
 * Reset CMMA state, make all pages stable again.
 */
static int __s390_reset_cmma(pte_t *pte, unsigned long addr,
			     unsigned long next, struct mm_walk *walk)
{
	pgste_t pgste;

	pgste = pgste_get_lock(pte);
	pgste_val(pgste) &= ~_PGSTE_GPS_USAGE_MASK;
	pgste_set_unlock(pte, pgste);
	return 0;
}

void s390_reset_cmma(struct mm_struct *mm)
{
	struct mm_walk walk = { .pte_entry = __s390_reset_cmma };

	down_write(&mm->mmap_sem);
	walk.mm = mm;
	walk_page_range(0, TASK_SIZE, &walk);
	up_write(&mm->mmap_sem);
}
EXPORT_SYMBOL_GPL(s390_reset_cmma);

/*
 * Test and reset if a guest page is dirty
 */
bool gmap_test_and_clear_dirty(unsigned long address, struct gmap *gmap)
{
	pte_t *pte;
	spinlock_t *ptl;
	bool dirty = false;

	pte = get_locked_pte(gmap->mm, address, &ptl);
	if (unlikely(!pte))
		return false;

	if (ptep_test_and_clear_user_dirty(gmap->mm, address, pte))
		dirty = true;

	spin_unlock(ptl);
	return dirty;
}
EXPORT_SYMBOL_GPL(gmap_test_and_clear_dirty);

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address,
			   pmd_t *pmdp)
{
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	/* No need to flush TLB
	 * On s390 reference bits are in storage key and never in TLB */
	return pmdp_test_and_clear_young(vma, address, pmdp);
}

int pmdp_set_access_flags(struct vm_area_struct *vma,
			  unsigned long address, pmd_t *pmdp,
			  pmd_t entry, int dirty)
{
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);

	entry = pmd_mkyoung(entry);
	if (dirty)
		entry = pmd_mkdirty(entry);
	if (pmd_same(*pmdp, entry))
		return 0;
	pmdp_invalidate(vma, address, pmdp);
	set_pmd_at(vma->vm_mm, address, pmdp, entry);
	return 1;
}

void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
				pgtable_t pgtable)
{
	struct list_head *lh = (struct list_head *) pgtable;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	if (!pmd_huge_pte(mm, pmdp))
		INIT_LIST_HEAD(lh);
	else
		list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
	pmd_huge_pte(mm, pmdp) = pgtable;
}

pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
	struct list_head *lh;
	pgtable_t pgtable;
	pte_t *ptep;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	pgtable = pmd_huge_pte(mm, pmdp);
	lh = (struct list_head *) pgtable;
	if (list_empty(lh))
		pmd_huge_pte(mm, pmdp) = NULL;
	else {
		pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
		list_del(lh);
	}
	ptep = (pte_t *) pgtable;
	pte_val(*ptep) = _PAGE_INVALID;
	ptep++;
	pte_val(*ptep) = _PAGE_INVALID;
	return pgtable;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */