/*
 * Copyright IBM Corp. 2007, 2011
 * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
 */

#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/quicklist.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/swapops.h>
#include <linux/ksm.h>
#include <linux/mman.h>

#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>

#ifndef CONFIG_64BIT
#define ALLOC_ORDER	1
#define FRAG_MASK	0x0f
#else
#define ALLOC_ORDER	2
#define FRAG_MASK	0x03
#endif


unsigned long *crst_table_alloc(struct mm_struct *mm)
{
	struct page *page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);

	if (!page)
		return NULL;
	return (unsigned long *) page_to_phys(page);
}

void crst_table_free(struct mm_struct *mm, unsigned long *table)
{
	free_pages((unsigned long) table, ALLOC_ORDER);
}

#ifdef CONFIG_64BIT
static void __crst_table_upgrade(void *arg)
{
	struct mm_struct *mm = arg;

	if (current->active_mm == mm) {
		clear_user_asce();
		set_user_asce(mm);
	}
	__tlb_flush_local();
}

int crst_table_upgrade(struct mm_struct *mm, unsigned long limit)
{
	unsigned long *table, *pgd;
	unsigned long entry;
	int flush;

	BUG_ON(limit > (1UL << 53));
	flush = 0;
repeat:
	table = crst_table_alloc(mm);
	if (!table)
		return -ENOMEM;
	spin_lock_bh(&mm->page_table_lock);
	if (mm->context.asce_limit < limit) {
		pgd = (unsigned long *) mm->pgd;
		if (mm->context.asce_limit <= (1UL << 31)) {
			entry = _REGION3_ENTRY_EMPTY;
			mm->context.asce_limit = 1UL << 42;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION3;
		} else {
			entry = _REGION2_ENTRY_EMPTY;
			mm->context.asce_limit = 1UL << 53;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION2;
		}
		crst_table_init(table, entry);
		pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd);
		mm->pgd = (pgd_t *) table;
		mm->task_size = mm->context.asce_limit;
		table = NULL;
		flush = 1;
	}
	spin_unlock_bh(&mm->page_table_lock);
	if (table)
		crst_table_free(mm, table);
	if (mm->context.asce_limit < limit)
		goto repeat;
	if (flush)
		on_each_cpu(__crst_table_upgrade, mm, 0);
	return 0;
}

void crst_table_downgrade(struct mm_struct *mm, unsigned long limit)
{
	pgd_t *pgd;

	if (current->active_mm == mm) {
		clear_user_asce();
		__tlb_flush_mm(mm);
	}
	while (mm->context.asce_limit > limit) {
		pgd = mm->pgd;
		switch (pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) {
		case _REGION_ENTRY_TYPE_R2:
			mm->context.asce_limit = 1UL << 42;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION3;
			break;
		case _REGION_ENTRY_TYPE_R3:
			mm->context.asce_limit = 1UL << 31;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_SEGMENT;
			break;
		default:
			BUG();
		}
		mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
		mm->task_size = mm->context.asce_limit;
		crst_table_free(mm, (unsigned long *) pgd);
	}
	if (current->active_mm == mm)
		set_user_asce(mm);
}
#endif

#ifdef CONFIG_PGSTE

/**
 * gmap_alloc - allocate a guest address space
 * @mm: pointer to the parent mm_struct
 * @limit: maximum size of the gmap address space
 *
 * Returns a guest address space structure.
 */
struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit)
{
	struct gmap *gmap;
	struct page *page;
	unsigned long *table;
	unsigned long etype, atype;

	if (limit < (1UL << 31)) {
		limit = (1UL << 31) - 1;
		atype = _ASCE_TYPE_SEGMENT;
		etype = _SEGMENT_ENTRY_EMPTY;
	} else if (limit < (1UL << 42)) {
		limit = (1UL << 42) - 1;
		atype = _ASCE_TYPE_REGION3;
		etype = _REGION3_ENTRY_EMPTY;
	} else if (limit < (1UL << 53)) {
		limit = (1UL << 53) - 1;
		atype = _ASCE_TYPE_REGION2;
		etype = _REGION2_ENTRY_EMPTY;
	} else {
		limit = -1UL;
		atype = _ASCE_TYPE_REGION1;
		etype = _REGION1_ENTRY_EMPTY;
	}
	gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL);
	if (!gmap)
		goto out;
	INIT_LIST_HEAD(&gmap->crst_list);
	INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL);
	INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC);
	spin_lock_init(&gmap->guest_table_lock);
	gmap->mm = mm;
	page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
	if (!page)
		goto out_free;
	page->index = 0;
	list_add(&page->lru, &gmap->crst_list);
	table = (unsigned long *) page_to_phys(page);
	crst_table_init(table, etype);
	gmap->table = table;
	gmap->asce = atype | _ASCE_TABLE_LENGTH |
		_ASCE_USER_BITS | __pa(table);
	gmap->asce_end = limit;
	down_write(&mm->mmap_sem);
	list_add(&gmap->list, &mm->context.gmap_list);
	up_write(&mm->mmap_sem);
	return gmap;

out_free:
	kfree(gmap);
out:
	return NULL;
}
EXPORT_SYMBOL_GPL(gmap_alloc);

static void gmap_flush_tlb(struct gmap *gmap)
{
	if (MACHINE_HAS_IDTE)
		__tlb_flush_asce(gmap->mm, gmap->asce);
	else
		__tlb_flush_global();
}

static void gmap_radix_tree_free(struct radix_tree_root *root)
{
	struct radix_tree_iter iter;
	unsigned long indices[16];
	unsigned long index;
	void **slot;
	int i, nr;

	/* A radix tree is freed by deleting all of its entries */
	index = 0;
	do {
		nr = 0;
		radix_tree_for_each_slot(slot, root, &iter, index) {
			indices[nr] = iter.index;
			if (++nr == 16)
				break;
		}
		for (i = 0; i < nr; i++) {
			index = indices[i];
			radix_tree_delete(root, index);
		}
	} while (nr > 0);
}

/**
 * gmap_free - free a guest address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_free(struct gmap *gmap)
{
	struct page *page, *next;

	/* Flush tlb. */
	if (MACHINE_HAS_IDTE)
		__tlb_flush_asce(gmap->mm, gmap->asce);
	else
		__tlb_flush_global();

	/* Free all segment & region tables. */
	list_for_each_entry_safe(page, next, &gmap->crst_list, lru)
		__free_pages(page, ALLOC_ORDER);
	gmap_radix_tree_free(&gmap->guest_to_host);
	gmap_radix_tree_free(&gmap->host_to_guest);
	down_write(&gmap->mm->mmap_sem);
	list_del(&gmap->list);
	up_write(&gmap->mm->mmap_sem);
	kfree(gmap);
}
EXPORT_SYMBOL_GPL(gmap_free);

/**
 * gmap_enable - switch primary space to the guest address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_enable(struct gmap *gmap)
{
	S390_lowcore.gmap = (unsigned long) gmap;
}
EXPORT_SYMBOL_GPL(gmap_enable);

/**
 * gmap_disable - switch back to the standard primary address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_disable(struct gmap *gmap)
{
	S390_lowcore.gmap = 0UL;
}
EXPORT_SYMBOL_GPL(gmap_disable);

/*
 * gmap_alloc_table is assumed to be called with mmap_sem held
 */
static int gmap_alloc_table(struct gmap *gmap, unsigned long *table,
			    unsigned long init, unsigned long gaddr)
{
	struct page *page;
	unsigned long *new;

	/* since we don't free the gmap table until gmap_free we can unlock */
	page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
	if (!page)
		return -ENOMEM;
	new = (unsigned long *) page_to_phys(page);
	crst_table_init(new, init);
	spin_lock(&gmap->mm->page_table_lock);
	if (*table & _REGION_ENTRY_INVALID) {
		list_add(&page->lru, &gmap->crst_list);
		*table = (unsigned long) new | _REGION_ENTRY_LENGTH |
			(*table & _REGION_ENTRY_TYPE_MASK);
		page->index = gaddr;
		page = NULL;
	}
	spin_unlock(&gmap->mm->page_table_lock);
	if (page)
		__free_pages(page, ALLOC_ORDER);
	return 0;
}

/**
 * __gmap_segment_gaddr - find virtual address from segment pointer
 * @entry: pointer to a segment table entry in the guest address space
 *
 * Returns the virtual address in the guest address space for the segment
 */
static unsigned long __gmap_segment_gaddr(unsigned long *entry)
{
	struct page *page;
	unsigned long offset;

	offset = (unsigned long) entry / sizeof(unsigned long);
	offset = (offset & (PTRS_PER_PMD - 1)) * PMD_SIZE;
	page = pmd_to_page((pmd_t *) entry);
	return page->index + offset;
}

/**
 * __gmap_unlink_by_vmaddr - unlink a single segment via a host address
 * @gmap: pointer to the guest address space structure
 * @vmaddr: address in the host process address space
 *
 * Returns 1 if a TLB flush is required
 */
static int __gmap_unlink_by_vmaddr(struct gmap *gmap, unsigned long vmaddr)
{
	unsigned long *entry;
	int flush = 0;

	spin_lock(&gmap->guest_table_lock);
	entry = radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT);
	if (entry) {
		flush = (*entry != _SEGMENT_ENTRY_INVALID);
		*entry = _SEGMENT_ENTRY_INVALID;
	}
	spin_unlock(&gmap->guest_table_lock);
	return flush;
}

/**
 * __gmap_unmap_by_gaddr - unmap a single segment via a guest address
 * @gmap: pointer to the guest address space structure
 * @gaddr: address in the guest address space
 *
 * Returns 1 if a TLB flush is required
 */
static int __gmap_unmap_by_gaddr(struct gmap *gmap, unsigned long gaddr)
{
	unsigned long vmaddr;

	vmaddr = (unsigned long) radix_tree_delete(&gmap->guest_to_host,
						   gaddr >> PMD_SHIFT);
	return vmaddr ? __gmap_unlink_by_vmaddr(gmap, vmaddr) : 0;
}

/**
 * gmap_unmap_segment - unmap segment from the guest address space
 * @gmap: pointer to the guest address space structure
 * @to: address in the guest address space
 * @len: length of the memory area to unmap
 *
 * Returns 0 if the unmap succeeded, -EINVAL if not.
 */
int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len)
{
	unsigned long off;
	int flush;

	if ((to | len) & (PMD_SIZE - 1))
		return -EINVAL;
	if (len == 0 || to + len < to)
		return -EINVAL;

	flush = 0;
	down_write(&gmap->mm->mmap_sem);
	for (off = 0; off < len; off += PMD_SIZE)
		flush |= __gmap_unmap_by_gaddr(gmap, to + off);
	up_write(&gmap->mm->mmap_sem);
	if (flush)
		gmap_flush_tlb(gmap);
	return 0;
}
EXPORT_SYMBOL_GPL(gmap_unmap_segment);

/**
 * gmap_map_segment - map a segment to the guest address space
 * @gmap: pointer to the guest address space structure
 * @from: source address in the parent address space
 * @to: target address in the guest address space
 * @len: length of the memory area to map
 *
 * Returns 0 if the map succeeded, -EINVAL or -ENOMEM if not.
 */
int gmap_map_segment(struct gmap *gmap, unsigned long from,
		     unsigned long to, unsigned long len)
{
	unsigned long off;
	int flush;

	if ((from | to | len) & (PMD_SIZE - 1))
		return -EINVAL;
	if (len == 0 || from + len < from || to + len < to ||
	    from + len > TASK_MAX_SIZE || to + len > gmap->asce_end)
		return -EINVAL;

	flush = 0;
	down_write(&gmap->mm->mmap_sem);
	for (off = 0; off < len; off += PMD_SIZE) {
		/* Remove old translation */
		flush |= __gmap_unmap_by_gaddr(gmap, to + off);
		/* Store new translation */
		if (radix_tree_insert(&gmap->guest_to_host,
				      (to + off) >> PMD_SHIFT,
				      (void *) from + off))
			break;
	}
	up_write(&gmap->mm->mmap_sem);
	if (flush)
		gmap_flush_tlb(gmap);
	if (off >= len)
		return 0;
	gmap_unmap_segment(gmap, to, len);
	return -ENOMEM;
}
EXPORT_SYMBOL_GPL(gmap_map_segment);

/**
 * __gmap_translate - translate a guest address to a user space address
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: guest address
 *
 * Returns user space address which corresponds to the guest address or
 * -EFAULT if no such mapping exists.
 * This function does not establish potentially missing page table entries.
 * The mmap_sem of the mm that belongs to the address space must be held
 * when this function gets called.
 */
unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr)
{
	unsigned long vmaddr;

	vmaddr = (unsigned long)
		radix_tree_lookup(&gmap->guest_to_host, gaddr >> PMD_SHIFT);
	return vmaddr ? (vmaddr | (gaddr & ~PMD_MASK)) : -EFAULT;
}
EXPORT_SYMBOL_GPL(__gmap_translate);

/**
 * gmap_translate - translate a guest address to a user space address
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: guest address
 *
 * Returns user space address which corresponds to the guest address or
 * -EFAULT if no such mapping exists.
 * This function does not establish potentially missing page table entries.
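 * The mmap_sem of the parent mm is taken and released by this function
 * itself, in contrast to __gmap_translate().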
 */
unsigned long gmap_translate(struct gmap *gmap, unsigned long gaddr)
{
	unsigned long rc;

	down_read(&gmap->mm->mmap_sem);
	rc = __gmap_translate(gmap, gaddr);
	up_read(&gmap->mm->mmap_sem);
	return rc;
}
EXPORT_SYMBOL_GPL(gmap_translate);

/**
 * gmap_unlink - disconnect a page table from the gmap shadow tables
 * @mm: pointer to the parent mm_struct
 * @table: pointer to the host page table
 * @vmaddr: vm address associated with the host page table
 */
static void gmap_unlink(struct mm_struct *mm, unsigned long *table,
			unsigned long vmaddr)
{
	struct gmap *gmap;
	int flush;

	list_for_each_entry(gmap, &mm->context.gmap_list, list) {
		flush = __gmap_unlink_by_vmaddr(gmap, vmaddr);
		if (flush)
			gmap_flush_tlb(gmap);
	}
}

/**
 * __gmap_link - set up shadow page tables to connect a host to a guest address
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: guest address
 * @vmaddr: vm address
 *
 * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT
 * if the vm address is already mapped to a different guest segment.
 * The mmap_sem of the mm that belongs to the address space must be held
 * when this function gets called.
 */
int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
{
	struct mm_struct *mm;
	unsigned long *table;
	spinlock_t *ptl;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	int rc;

	/* Create higher level tables in the gmap page table */
	table = gmap->table;
	if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION1) {
		table += (gaddr >> 53) & 0x7ff;
		if ((*table & _REGION_ENTRY_INVALID) &&
		    gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY,
				     gaddr & 0xffe0000000000000))
			return -ENOMEM;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	}
	if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION2) {
		table += (gaddr >> 42) & 0x7ff;
		if ((*table & _REGION_ENTRY_INVALID) &&
		    gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY,
				     gaddr & 0xfffffc0000000000))
			return -ENOMEM;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	}
	if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION3) {
		table += (gaddr >> 31) & 0x7ff;
		if ((*table & _REGION_ENTRY_INVALID) &&
		    gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY,
				     gaddr & 0xffffffff80000000))
			return -ENOMEM;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	}
	table += (gaddr >> 20) & 0x7ff;
	/* Walk the parent mm page table */
	mm = gmap->mm;
	pgd = pgd_offset(mm, vmaddr);
	VM_BUG_ON(pgd_none(*pgd));
	pud = pud_offset(pgd, vmaddr);
	VM_BUG_ON(pud_none(*pud));
	pmd = pmd_offset(pud, vmaddr);
	VM_BUG_ON(pmd_none(*pmd));
	/* large pmds cannot yet be handled */
	if (pmd_large(*pmd))
		return -EFAULT;
	/* Link gmap segment table entry location to page table.
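	 * The host_to_guest radix tree records, per host pmd, which gmap
	 * segment table entry shadows it, so that gmap_unlink() and the
	 * ipte notifier can find the entry when the host pte is changed.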
	 */
	rc = radix_tree_preload(GFP_KERNEL);
	if (rc)
		return rc;
	ptl = pmd_lock(mm, pmd);
	spin_lock(&gmap->guest_table_lock);
	if (*table == _SEGMENT_ENTRY_INVALID) {
		rc = radix_tree_insert(&gmap->host_to_guest,
				       vmaddr >> PMD_SHIFT, table);
		if (!rc)
			*table = pmd_val(*pmd);
	} else
		rc = 0;
	spin_unlock(&gmap->guest_table_lock);
	spin_unlock(ptl);
	radix_tree_preload_end();
	return rc;
}

/**
 * gmap_fault - resolve a fault on a guest address
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: guest address
 * @fault_flags: flags to pass down to handle_mm_fault()
 *
 * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT
 * if the vm address is already mapped to a different guest segment.
 */
int gmap_fault(struct gmap *gmap, unsigned long gaddr,
	       unsigned int fault_flags)
{
	unsigned long vmaddr;
	int rc;

	down_read(&gmap->mm->mmap_sem);
	vmaddr = __gmap_translate(gmap, gaddr);
	if (IS_ERR_VALUE(vmaddr)) {
		rc = vmaddr;
		goto out_up;
	}
	if (fixup_user_fault(current, gmap->mm, vmaddr, fault_flags)) {
		rc = -EFAULT;
		goto out_up;
	}
	rc = __gmap_link(gmap, gaddr, vmaddr);
out_up:
	up_read(&gmap->mm->mmap_sem);
	return rc;
}
EXPORT_SYMBOL_GPL(gmap_fault);

static void gmap_zap_swap_entry(swp_entry_t entry, struct mm_struct *mm)
{
	if (!non_swap_entry(entry))
		dec_mm_counter(mm, MM_SWAPENTS);
	else if (is_migration_entry(entry)) {
		struct page *page = migration_entry_to_page(entry);

		if (PageAnon(page))
			dec_mm_counter(mm, MM_ANONPAGES);
		else
			dec_mm_counter(mm, MM_FILEPAGES);
	}
	free_swap_and_cache(entry);
}

/*
 * this function is assumed to be called with mmap_sem held
 */
void __gmap_zap(struct gmap *gmap, unsigned long gaddr)
{
	unsigned long vmaddr, ptev, pgstev;
	pte_t *ptep, pte;
	spinlock_t *ptl;
	pgste_t pgste;

	/* Find the vm address for the guest address */
	vmaddr = (unsigned long) radix_tree_lookup(&gmap->guest_to_host,
						   gaddr >> PMD_SHIFT);
	if (!vmaddr)
		return;
	vmaddr |= gaddr & ~PMD_MASK;
	/* Get pointer to the page table entry */
	ptep = get_locked_pte(gmap->mm, vmaddr, &ptl);
	if (unlikely(!ptep))
		return;
	pte = *ptep;
	if (!pte_swap(pte))
		goto out_pte;
	/* Zap unused and logically-zero pages */
	pgste = pgste_get_lock(ptep);
	pgstev = pgste_val(pgste);
	ptev = pte_val(pte);
	if (((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED) ||
	    ((pgstev & _PGSTE_GPS_ZERO) && (ptev & _PAGE_INVALID))) {
		gmap_zap_swap_entry(pte_to_swp_entry(pte), gmap->mm);
		pte_clear(gmap->mm, vmaddr, ptep);
	}
	pgste_set_unlock(ptep, pgste);
out_pte:
	pte_unmap_unlock(ptep, ptl);
}
EXPORT_SYMBOL_GPL(__gmap_zap);

void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to)
{
	unsigned long gaddr, vmaddr, size;
	struct vm_area_struct *vma;

	down_read(&gmap->mm->mmap_sem);
	for (gaddr = from; gaddr < to;
	     gaddr = (gaddr + PMD_SIZE) & PMD_MASK) {
		/* Find the vm address for the guest address */
		vmaddr = (unsigned long)
			radix_tree_lookup(&gmap->guest_to_host,
					  gaddr >> PMD_SHIFT);
		if (!vmaddr)
			continue;
		vmaddr |= gaddr & ~PMD_MASK;
		/* Find vma in the parent mm */
		vma = find_vma(gmap->mm, vmaddr);
		size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK));
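		/*
		 * Drop the host pages backing this chunk; the gmap radix
		 * trees keep the guest mapping, so a later gmap_fault()
		 * can fault the range back in on demand.
		 */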
		zap_page_range(vma, vmaddr, size, NULL);
	}
	up_read(&gmap->mm->mmap_sem);
}
EXPORT_SYMBOL_GPL(gmap_discard);

static LIST_HEAD(gmap_notifier_list);
static DEFINE_SPINLOCK(gmap_notifier_lock);

/**
 * gmap_register_ipte_notifier - register a pte invalidation callback
 * @nb: pointer to the gmap notifier block
 */
void gmap_register_ipte_notifier(struct gmap_notifier *nb)
{
	spin_lock(&gmap_notifier_lock);
	list_add(&nb->list, &gmap_notifier_list);
	spin_unlock(&gmap_notifier_lock);
}
EXPORT_SYMBOL_GPL(gmap_register_ipte_notifier);

/**
 * gmap_unregister_ipte_notifier - remove a pte invalidation callback
 * @nb: pointer to the gmap notifier block
 */
void gmap_unregister_ipte_notifier(struct gmap_notifier *nb)
{
	spin_lock(&gmap_notifier_lock);
	list_del_init(&nb->list);
	spin_unlock(&gmap_notifier_lock);
}
EXPORT_SYMBOL_GPL(gmap_unregister_ipte_notifier);

/**
 * gmap_ipte_notify - mark a range of ptes for invalidation notification
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: virtual address in the guest address space
 * @len: size of area
 *
 * Returns 0 if for each page in the given range a gmap mapping exists and
 * the invalidation notification could be set. If the gmap mapping is missing
 * for one or more pages -EFAULT is returned. If no memory could be allocated
 * -ENOMEM is returned. This function establishes missing page table entries.
 */
int gmap_ipte_notify(struct gmap *gmap, unsigned long gaddr, unsigned long len)
{
	unsigned long addr;
	spinlock_t *ptl;
	pte_t *ptep, entry;
	pgste_t pgste;
	int rc = 0;

	if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK))
		return -EINVAL;
	down_read(&gmap->mm->mmap_sem);
	while (len) {
		/* Convert gmap address and connect the page tables */
		addr = __gmap_translate(gmap, gaddr);
		if (IS_ERR_VALUE(addr)) {
			rc = addr;
			break;
		}
		/* Get the page mapped */
		if (fixup_user_fault(current, gmap->mm, addr, FAULT_FLAG_WRITE)) {
			rc = -EFAULT;
			break;
		}
		rc = __gmap_link(gmap, gaddr, addr);
		if (rc)
			break;
		/* Walk the process page table, lock and get pte pointer */
		ptep = get_locked_pte(gmap->mm, addr, &ptl);
		VM_BUG_ON(!ptep);
		/* Set notification bit in the pgste of the pte */
		entry = *ptep;
		if ((pte_val(entry) & (_PAGE_INVALID | _PAGE_PROTECT)) == 0) {
			pgste = pgste_get_lock(ptep);
			pgste_val(pgste) |= PGSTE_IN_BIT;
			pgste_set_unlock(ptep, pgste);
			gaddr += PAGE_SIZE;
			len -= PAGE_SIZE;
		}
		pte_unmap_unlock(ptep, ptl);
	}
	up_read(&gmap->mm->mmap_sem);
	return rc;
}
EXPORT_SYMBOL_GPL(gmap_ipte_notify);

/**
 * gmap_do_ipte_notify - call all invalidation callbacks for a specific pte.
 * @mm: pointer to the process mm_struct
 * @vmaddr: virtual address in the process address space
 * @pte: pointer to the page table entry
 *
 * This function is assumed to be called with the page table lock held
 * for the pte to notify.
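 * The registered notifier callbacks are invoked for every gmap of the mm
 * that has the affected host segment linked.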
 */
void gmap_do_ipte_notify(struct mm_struct *mm, unsigned long vmaddr, pte_t *pte)
{
	unsigned long offset, gaddr;
	unsigned long *table;
	struct gmap_notifier *nb;
	struct gmap *gmap;

	offset = ((unsigned long) pte) & (255 * sizeof(pte_t));
	offset = offset * (4096 / sizeof(pte_t));
	spin_lock(&gmap_notifier_lock);
	list_for_each_entry(gmap, &mm->context.gmap_list, list) {
		table = radix_tree_lookup(&gmap->host_to_guest,
					  vmaddr >> PMD_SHIFT);
		if (!table)
			continue;
		gaddr = __gmap_segment_gaddr(table) + offset;
		list_for_each_entry(nb, &gmap_notifier_list, list)
			nb->notifier_call(gmap, gaddr);
	}
	spin_unlock(&gmap_notifier_lock);
}
EXPORT_SYMBOL_GPL(gmap_do_ipte_notify);

static inline int page_table_with_pgste(struct page *page)
{
	return atomic_read(&page->_mapcount) == 0;
}

static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm)
{
	struct page *page;
	unsigned long *table;

	page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
	if (!page)
		return NULL;
	if (!pgtable_page_ctor(page)) {
		__free_page(page);
		return NULL;
	}
	atomic_set(&page->_mapcount, 0);
	table = (unsigned long *) page_to_phys(page);
	clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);
	clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2);
	return table;
}

static inline void page_table_free_pgste(unsigned long *table)
{
	struct page *page;

	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	pgtable_page_dtor(page);
	atomic_set(&page->_mapcount, -1);
	__free_page(page);
}

int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
			  unsigned long key, bool nq)
{
	spinlock_t *ptl;
	pgste_t old, new;
	pte_t *ptep;

	down_read(&mm->mmap_sem);
retry:
	ptep = get_locked_pte(mm, addr, &ptl);
	if (unlikely(!ptep)) {
		up_read(&mm->mmap_sem);
		return -EFAULT;
	}
	if (!(pte_val(*ptep) & _PAGE_INVALID) &&
	     (pte_val(*ptep) & _PAGE_PROTECT)) {
		pte_unmap_unlock(ptep, ptl);
		if (fixup_user_fault(current, mm, addr, FAULT_FLAG_WRITE)) {
			up_read(&mm->mmap_sem);
			return -EFAULT;
		}
		goto retry;
	}

	new = old = pgste_get_lock(ptep);
	pgste_val(new) &= ~(PGSTE_GR_BIT | PGSTE_GC_BIT |
			    PGSTE_ACC_BITS | PGSTE_FP_BIT);
	pgste_val(new) |= (key & (_PAGE_CHANGED | _PAGE_REFERENCED)) << 48;
	pgste_val(new) |= (key & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56;
	if (!(pte_val(*ptep) & _PAGE_INVALID)) {
		unsigned long address, bits, skey;

		address = pte_val(*ptep) & PAGE_MASK;
		skey = (unsigned long) page_get_storage_key(address);
		bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED);
		skey = key & (_PAGE_ACC_BITS | _PAGE_FP_BIT);
		/* Set storage key ACC and FP */
		page_set_storage_key(address, skey, !nq);
		/* Merge host changed & referenced into pgste */
		pgste_val(new) |= bits << 52;
	}
	/* changing the guest storage key is considered a change of the page */
	if ((pgste_val(new) ^ pgste_val(old)) &
	    (PGSTE_ACC_BITS | PGSTE_FP_BIT | PGSTE_GR_BIT | PGSTE_GC_BIT))
		pgste_val(new) |= PGSTE_UC_BIT;

	pgste_set_unlock(ptep, new);
	pte_unmap_unlock(ptep, ptl);
	up_read(&mm->mmap_sem);
	return 0;
}
EXPORT_SYMBOL(set_guest_storage_key);

unsigned long get_guest_storage_key(struct mm_struct *mm, unsigned long addr)
{
	spinlock_t *ptl;
	pgste_t pgste;
	pte_t *ptep;
	uint64_t physaddr;
	unsigned long key = 0;

	down_read(&mm->mmap_sem);
	ptep = get_locked_pte(mm, addr, &ptl);
	if (unlikely(!ptep)) {
		up_read(&mm->mmap_sem);
		return -EFAULT;
	}
	pgste = pgste_get_lock(ptep);

	if (pte_val(*ptep) & _PAGE_INVALID) {
		key |= (pgste_val(pgste) & PGSTE_ACC_BITS) >> 56;
		key |= (pgste_val(pgste) & PGSTE_FP_BIT) >> 56;
		key |= (pgste_val(pgste) & PGSTE_GR_BIT) >> 48;
		key |= (pgste_val(pgste) & PGSTE_GC_BIT) >> 48;
	} else {
		physaddr = pte_val(*ptep) & PAGE_MASK;
		key = page_get_storage_key(physaddr);

		/* Reflect guest's logical view, not physical */
		if (pgste_val(pgste) & PGSTE_GR_BIT)
			key |= _PAGE_REFERENCED;
		if (pgste_val(pgste) & PGSTE_GC_BIT)
			key |= _PAGE_CHANGED;
	}

	pgste_set_unlock(ptep, pgste);
	pte_unmap_unlock(ptep, ptl);
	up_read(&mm->mmap_sem);
	return key;
}
EXPORT_SYMBOL(get_guest_storage_key);

#else /* CONFIG_PGSTE */

static inline int page_table_with_pgste(struct page *page)
{
	return 0;
}

static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm)
{
	return NULL;
}

static inline void page_table_free_pgste(unsigned long *table)
{
}

static inline void gmap_unlink(struct mm_struct *mm, unsigned long *table,
			       unsigned long vmaddr)
{
}

#endif /* CONFIG_PGSTE */

static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
{
	unsigned int old, new;

	do {
		old = atomic_read(v);
		new = old ^ bits;
	} while (atomic_cmpxchg(v, old, new) != old);
	return new;
}

/*
 * page table entry allocation/free routines.
 */
unsigned long *page_table_alloc(struct mm_struct *mm)
{
	unsigned long *uninitialized_var(table);
	struct page *uninitialized_var(page);
	unsigned int mask, bit;

	if (mm_has_pgste(mm))
		return page_table_alloc_pgste(mm);
	/* Allocate fragments of a 4K page as 1K/2K page table */
	spin_lock_bh(&mm->context.list_lock);
	mask = FRAG_MASK;
	if (!list_empty(&mm->context.pgtable_list)) {
		page = list_first_entry(&mm->context.pgtable_list,
					struct page, lru);
		table = (unsigned long *) page_to_phys(page);
		mask = atomic_read(&page->_mapcount);
		mask = mask | (mask >> 4);
	}
	if ((mask & FRAG_MASK) == FRAG_MASK) {
		spin_unlock_bh(&mm->context.list_lock);
		page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
		if (!page)
			return NULL;
		if (!pgtable_page_ctor(page)) {
			__free_page(page);
			return NULL;
		}
		atomic_set(&page->_mapcount, 1);
		table = (unsigned long *) page_to_phys(page);
		clear_table(table, _PAGE_INVALID, PAGE_SIZE);
		spin_lock_bh(&mm->context.list_lock);
		list_add(&page->lru, &mm->context.pgtable_list);
	} else {
		for (bit = 1; mask & bit; bit <<= 1)
			table += PTRS_PER_PTE;
		mask = atomic_xor_bits(&page->_mapcount, bit);
		if ((mask & FRAG_MASK) == FRAG_MASK)
			list_del(&page->lru);
	}
	spin_unlock_bh(&mm->context.list_lock);
	return table;
}

void page_table_free(struct mm_struct *mm, unsigned long *table)
{
	struct page *page;
	unsigned int bit, mask;

	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	if (page_table_with_pgste(page))
		return page_table_free_pgste(table);
	/* Free 1K/2K page table fragment of a 4K page */
	bit = 1 << ((__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)));
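	/*
	 * The low FRAG_MASK bits of page->_mapcount track which fragments
	 * of the 4K page are in use; the same bits shifted left by four
	 * mark fragments that are queued for RCU freeing.
	 */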
	spin_lock_bh(&mm->context.list_lock);
	if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
		list_del(&page->lru);
	mask = atomic_xor_bits(&page->_mapcount, bit);
	if (mask & FRAG_MASK)
		list_add(&page->lru, &mm->context.pgtable_list);
	spin_unlock_bh(&mm->context.list_lock);
	if (mask == 0) {
		pgtable_page_dtor(page);
		atomic_set(&page->_mapcount, -1);
		__free_page(page);
	}
}

static void __page_table_free_rcu(void *table, unsigned bit)
{
	struct page *page;

	if (bit == FRAG_MASK)
		return page_table_free_pgste(table);
	/* Free 1K/2K page table fragment of a 4K page */
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	if (atomic_xor_bits(&page->_mapcount, bit) == 0) {
		pgtable_page_dtor(page);
		atomic_set(&page->_mapcount, -1);
		__free_page(page);
	}
}

void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
			 unsigned long vmaddr)
{
	struct mm_struct *mm;
	struct page *page;
	unsigned int bit, mask;

	mm = tlb->mm;
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	if (page_table_with_pgste(page)) {
		gmap_unlink(mm, table, vmaddr);
		table = (unsigned long *) (__pa(table) | FRAG_MASK);
		tlb_remove_table(tlb, table);
		return;
	}
	bit = 1 << ((__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)));
	spin_lock_bh(&mm->context.list_lock);
	if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
		list_del(&page->lru);
	mask = atomic_xor_bits(&page->_mapcount, bit | (bit << 4));
	if (mask & FRAG_MASK)
		list_add_tail(&page->lru, &mm->context.pgtable_list);
	spin_unlock_bh(&mm->context.list_lock);
	table = (unsigned long *) (__pa(table) | (bit << 4));
	tlb_remove_table(tlb, table);
}

static void __tlb_remove_table(void *_table)
{
	const unsigned long mask = (FRAG_MASK << 4) | FRAG_MASK;
	void *table = (void *)((unsigned long) _table & ~mask);
	unsigned type = (unsigned long) _table & mask;

	if (type)
		__page_table_free_rcu(table, type);
	else
		free_pages((unsigned long) table, ALLOC_ORDER);
}

static void tlb_remove_table_smp_sync(void *arg)
{
	/* Simply deliver the interrupt */
}

static void tlb_remove_table_one(void *table)
{
	/*
	 * This isn't an RCU grace period and hence the page-tables cannot be
	 * assumed to be actually RCU-freed.
	 *
	 * It is however sufficient for software page-table walkers that rely
	 * on IRQ disabling. See the comment near struct mmu_table_batch.
	 */
	smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
	__tlb_remove_table(table);
}

static void tlb_remove_table_rcu(struct rcu_head *head)
{
	struct mmu_table_batch *batch;
	int i;

	batch = container_of(head, struct mmu_table_batch, rcu);

	for (i = 0; i < batch->nr; i++)
		__tlb_remove_table(batch->tables[i]);

	free_page((unsigned long)batch);
}

void tlb_table_flush(struct mmu_gather *tlb)
{
	struct mmu_table_batch **batch = &tlb->batch;

	if (*batch) {
		call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
		*batch = NULL;
	}
}

void tlb_remove_table(struct mmu_gather *tlb, void *table)
{
	struct mmu_table_batch **batch = &tlb->batch;

	tlb->mm->context.flush_mm = 1;
	if (*batch == NULL) {
		*batch = (struct mmu_table_batch *)
			__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
		if (*batch == NULL) {
			__tlb_flush_mm_lazy(tlb->mm);
			tlb_remove_table_one(table);
			return;
		}
		(*batch)->nr = 0;
	}
	(*batch)->tables[(*batch)->nr++] = table;
	if ((*batch)->nr == MAX_TABLE_BATCH)
		tlb_flush_mmu(tlb);
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static inline void thp_split_vma(struct vm_area_struct *vma)
{
	unsigned long addr;

	for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE)
		follow_page(vma, addr, FOLL_SPLIT);
}

static inline void thp_split_mm(struct mm_struct *mm)
{
	struct vm_area_struct *vma;

	for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
		thp_split_vma(vma);
		vma->vm_flags &= ~VM_HUGEPAGE;
		vma->vm_flags |= VM_NOHUGEPAGE;
	}
	mm->def_flags |= VM_NOHUGEPAGE;
}
#else
static inline void thp_split_mm(struct mm_struct *mm)
{
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

static unsigned long page_table_realloc_pmd(struct mmu_gather *tlb,
				struct mm_struct *mm, pud_t *pud,
				unsigned long addr, unsigned long end)
{
	unsigned long next, *table, *new;
	struct page *page;
	spinlock_t *ptl;
	pmd_t *pmd;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
again:
		if (pmd_none_or_clear_bad(pmd))
			continue;
		table = (unsigned long *) pmd_deref(*pmd);
		page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
		if (page_table_with_pgste(page))
			continue;
		/* Allocate new page table with pgstes */
		new = page_table_alloc_pgste(mm);
		if (!new)
			return -ENOMEM;

		ptl = pmd_lock(mm, pmd);
		if (likely((unsigned long *) pmd_deref(*pmd) == table)) {
			/* Nuke pmd entry pointing to the "short" page table */
			pmdp_flush_lazy(mm, addr, pmd);
			pmd_clear(pmd);
			/* Copy ptes from old table to new table */
			memcpy(new, table, PAGE_SIZE/2);
			clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);
			/* Establish new table */
			pmd_populate(mm, pmd, (pte_t *) new);
			/* Free old table with rcu, there might be a walker!
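			 * page_table_free_rcu() defers the actual free until
			 * gup-fast, which walks page tables with IRQs
			 * disabled, can no longer see the old table.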
			 */
			page_table_free_rcu(tlb, table, addr);
			new = NULL;
		}
		spin_unlock(ptl);
		if (new) {
			page_table_free_pgste(new);
			goto again;
		}
	} while (pmd++, addr = next, addr != end);

	return addr;
}

static unsigned long page_table_realloc_pud(struct mmu_gather *tlb,
				struct mm_struct *mm, pgd_t *pgd,
				unsigned long addr, unsigned long end)
{
	unsigned long next;
	pud_t *pud;

	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		next = page_table_realloc_pmd(tlb, mm, pud, addr, next);
		if (unlikely(IS_ERR_VALUE(next)))
			return next;
	} while (pud++, addr = next, addr != end);

	return addr;
}

static unsigned long page_table_realloc(struct mmu_gather *tlb, struct mm_struct *mm,
					unsigned long addr, unsigned long end)
{
	unsigned long next;
	pgd_t *pgd;

	pgd = pgd_offset(mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		next = page_table_realloc_pud(tlb, mm, pgd, addr, next);
		if (unlikely(IS_ERR_VALUE(next)))
			return next;
	} while (pgd++, addr = next, addr != end);

	return 0;
}

/*
 * switch on pgstes for the current userspace process (for kvm)
 */
int s390_enable_sie(void)
{
	struct task_struct *tsk = current;
	struct mm_struct *mm = tsk->mm;
	struct mmu_gather tlb;

	/* Do we have pgstes? If yes, we are done */
	if (mm_has_pgste(tsk->mm))
		return 0;

	down_write(&mm->mmap_sem);
	/* split thp mappings and disable thp for future mappings */
	thp_split_mm(mm);
	/* Reallocate the page tables with pgstes */
	tlb_gather_mmu(&tlb, mm, 0, TASK_SIZE);
	if (!page_table_realloc(&tlb, mm, 0, TASK_SIZE))
		mm->context.has_pgste = 1;
	tlb_finish_mmu(&tlb, 0, TASK_SIZE);
	up_write(&mm->mmap_sem);
	return mm->context.has_pgste ? 0 : -ENOMEM;
}
EXPORT_SYMBOL_GPL(s390_enable_sie);

/*
 * Enable storage key handling from now on and initialize the storage
 * keys with the default key.
 */
static int __s390_enable_skey(pte_t *pte, unsigned long addr,
			      unsigned long next, struct mm_walk *walk)
{
	unsigned long ptev;
	pgste_t pgste;

	pgste = pgste_get_lock(pte);
	/*
	 * Remove all zero page mappings; once a policy forbidding zero page
	 * mappings is in place, subsequent faults for these pages will get
	 * fresh anonymous pages.
	 */
	if (is_zero_pfn(pte_pfn(*pte))) {
		ptep_flush_direct(walk->mm, addr, pte);
		pte_val(*pte) = _PAGE_INVALID;
	}
	/* Clear storage key */
	pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT |
			      PGSTE_GR_BIT | PGSTE_GC_BIT);
	ptev = pte_val(*pte);
	if (!(ptev & _PAGE_INVALID) && (ptev & _PAGE_WRITE))
		page_set_storage_key(ptev & PAGE_MASK, PAGE_DEFAULT_KEY, 1);
	pgste_set_unlock(pte, pgste);
	return 0;
}

int s390_enable_skey(void)
{
	struct mm_walk walk = { .pte_entry = __s390_enable_skey };
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	int rc = 0;

	down_write(&mm->mmap_sem);
	if (mm_use_skey(mm))
		goto out_up;

	mm->context.use_skey = 1;
	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		if (ksm_madvise(vma, vma->vm_start, vma->vm_end,
				MADV_UNMERGEABLE, &vma->vm_flags)) {
			mm->context.use_skey = 0;
			rc = -ENOMEM;
			goto out_up;
		}
	}
	mm->def_flags &= ~VM_MERGEABLE;

	walk.mm = mm;
	walk_page_range(0, TASK_SIZE, &walk);

out_up:
	up_write(&mm->mmap_sem);
	return rc;
}
EXPORT_SYMBOL_GPL(s390_enable_skey);

/*
 * Reset CMMA state, make all pages stable again.
 */
static int __s390_reset_cmma(pte_t *pte, unsigned long addr,
			     unsigned long next, struct mm_walk *walk)
{
	pgste_t pgste;

	pgste = pgste_get_lock(pte);
	pgste_val(pgste) &= ~_PGSTE_GPS_USAGE_MASK;
	pgste_set_unlock(pte, pgste);
	return 0;
}

void s390_reset_cmma(struct mm_struct *mm)
{
	struct mm_walk walk = { .pte_entry = __s390_reset_cmma };

	down_write(&mm->mmap_sem);
	walk.mm = mm;
	walk_page_range(0, TASK_SIZE, &walk);
	up_write(&mm->mmap_sem);
}
EXPORT_SYMBOL_GPL(s390_reset_cmma);

/*
 * Test and reset if a guest page is dirty
 */
bool gmap_test_and_clear_dirty(unsigned long address, struct gmap *gmap)
{
	pte_t *pte;
	spinlock_t *ptl;
	bool dirty = false;

	pte = get_locked_pte(gmap->mm, address, &ptl);
	if (unlikely(!pte))
		return false;

	if (ptep_test_and_clear_user_dirty(gmap->mm, address, pte))
		dirty = true;

	spin_unlock(ptl);
	return dirty;
}
EXPORT_SYMBOL_GPL(gmap_test_and_clear_dirty);

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address,
			   pmd_t *pmdp)
{
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	/*
	 * No need to flush the TLB; on s390 reference bits are kept in the
	 * storage key and never in the TLB.
	 */
	return pmdp_test_and_clear_young(vma, address, pmdp);
}

int pmdp_set_access_flags(struct vm_area_struct *vma,
			  unsigned long address, pmd_t *pmdp,
			  pmd_t entry, int dirty)
{
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);

	entry = pmd_mkyoung(entry);
	if (dirty)
		entry = pmd_mkdirty(entry);
	if (pmd_same(*pmdp, entry))
		return 0;
	pmdp_invalidate(vma, address, pmdp);
	set_pmd_at(vma->vm_mm, address, pmdp, entry);
	return 1;
}

static void pmdp_splitting_flush_sync(void *arg)
{
	/* Simply deliver the interrupt */
}

void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
			  pmd_t *pmdp)
{
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	if (!test_and_set_bit(_SEGMENT_ENTRY_SPLIT_BIT,
			      (unsigned long *) pmdp)) {
		/* need to serialize against gup-fast (IRQ disabled) */
		smp_call_function(pmdp_splitting_flush_sync, NULL, 1);
	}
}

void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
				pgtable_t pgtable)
{
	struct list_head *lh = (struct list_head *) pgtable;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	if (!pmd_huge_pte(mm, pmdp))
		INIT_LIST_HEAD(lh);
	else
		list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
	pmd_huge_pte(mm, pmdp) = pgtable;
}

pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
	struct list_head *lh;
	pgtable_t pgtable;
	pte_t *ptep;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	pgtable = pmd_huge_pte(mm, pmdp);
	lh = (struct list_head *) pgtable;
	if (list_empty(lh))
		pmd_huge_pte(mm, pmdp) = NULL;
	else {
		pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
		list_del(lh);
	}
	ptep = (pte_t *) pgtable;
	pte_val(*ptep) = _PAGE_INVALID;
	ptep++;
	pte_val(*ptep) = _PAGE_INVALID;
	return pgtable;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */