/*
 *	Copyright IBM Corp. 2007, 2011
 *	Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
 */

#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/quicklist.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/swapops.h>
#include <linux/ksm.h>
#include <linux/mman.h>

#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>

#define ALLOC_ORDER	2
#define FRAG_MASK	0x03

unsigned long *crst_table_alloc(struct mm_struct *mm)
{
	struct page *page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);

	if (!page)
		return NULL;
	return (unsigned long *) page_to_phys(page);
}

void crst_table_free(struct mm_struct *mm, unsigned long *table)
{
	free_pages((unsigned long) table, ALLOC_ORDER);
}

static void __crst_table_upgrade(void *arg)
{
	struct mm_struct *mm = arg;

	if (current->active_mm == mm) {
		clear_user_asce();
		set_user_asce(mm);
	}
	__tlb_flush_local();
}

int crst_table_upgrade(struct mm_struct *mm, unsigned long limit)
{
	unsigned long *table, *pgd;
	unsigned long entry;
	int flush;

	BUG_ON(limit > (1UL << 53));
	flush = 0;
repeat:
	table = crst_table_alloc(mm);
	if (!table)
		return -ENOMEM;
	spin_lock_bh(&mm->page_table_lock);
	if (mm->context.asce_limit < limit) {
		pgd = (unsigned long *) mm->pgd;
		if (mm->context.asce_limit <= (1UL << 31)) {
			entry = _REGION3_ENTRY_EMPTY;
			mm->context.asce_limit = 1UL << 42;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION3;
		} else {
			entry = _REGION2_ENTRY_EMPTY;
			mm->context.asce_limit = 1UL << 53;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION2;
		}
		crst_table_init(table, entry);
		pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd);
		mm->pgd = (pgd_t *) table;
		mm->task_size = mm->context.asce_limit;
		table = NULL;
		flush = 1;
	}
	spin_unlock_bh(&mm->page_table_lock);
	if (table)
		crst_table_free(mm, table);
	if (mm->context.asce_limit < limit)
		goto repeat;
	if (flush)
		on_each_cpu(__crst_table_upgrade, mm, 0);
	return 0;
}

void crst_table_downgrade(struct mm_struct *mm, unsigned long limit)
{
	pgd_t *pgd;

	if (current->active_mm == mm) {
		clear_user_asce();
		__tlb_flush_mm(mm);
	}
	while (mm->context.asce_limit > limit) {
		pgd = mm->pgd;
		switch (pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) {
		case _REGION_ENTRY_TYPE_R2:
			mm->context.asce_limit = 1UL << 42;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION3;
			break;
		case _REGION_ENTRY_TYPE_R3:
			mm->context.asce_limit = 1UL << 31;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_SEGMENT;
			break;
		default:
			BUG();
		}
		mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
		mm->task_size = mm->context.asce_limit;
		crst_table_free(mm, (unsigned long *) pgd);
	}
	if (current->active_mm == mm)
		set_user_asce(mm);
}

#ifdef CONFIG_PGSTE

/**
 * gmap_alloc - allocate a guest address space
 * @mm: pointer to the parent mm_struct
 * @limit: maximum size of the gmap address space
 *
 * Returns a guest address space structure.
 */
struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit)
{
	struct gmap *gmap;
	struct page *page;
	unsigned long *table;
	unsigned long etype, atype;

	if (limit < (1UL << 31)) {
		limit = (1UL << 31) - 1;
		atype = _ASCE_TYPE_SEGMENT;
		etype = _SEGMENT_ENTRY_EMPTY;
	} else if (limit < (1UL << 42)) {
		limit = (1UL << 42) - 1;
		atype = _ASCE_TYPE_REGION3;
		etype = _REGION3_ENTRY_EMPTY;
	} else if (limit < (1UL << 53)) {
		limit = (1UL << 53) - 1;
		atype = _ASCE_TYPE_REGION2;
		etype = _REGION2_ENTRY_EMPTY;
	} else {
		limit = -1UL;
		atype = _ASCE_TYPE_REGION1;
		etype = _REGION1_ENTRY_EMPTY;
	}
	gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL);
	if (!gmap)
		goto out;
	INIT_LIST_HEAD(&gmap->crst_list);
	INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL);
	INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC);
	spin_lock_init(&gmap->guest_table_lock);
	gmap->mm = mm;
	page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
	if (!page)
		goto out_free;
	page->index = 0;
	list_add(&page->lru, &gmap->crst_list);
	table = (unsigned long *) page_to_phys(page);
	crst_table_init(table, etype);
	gmap->table = table;
	gmap->asce = atype | _ASCE_TABLE_LENGTH |
		_ASCE_USER_BITS | __pa(table);
	gmap->asce_end = limit;
	down_write(&mm->mmap_sem);
	list_add(&gmap->list, &mm->context.gmap_list);
	up_write(&mm->mmap_sem);
	return gmap;

out_free:
	kfree(gmap);
out:
	return NULL;
}
EXPORT_SYMBOL_GPL(gmap_alloc);

static void gmap_flush_tlb(struct gmap *gmap)
{
	if (MACHINE_HAS_IDTE)
		__tlb_flush_asce(gmap->mm, gmap->asce);
	else
		__tlb_flush_global();
}

static void gmap_radix_tree_free(struct radix_tree_root *root)
{
	struct radix_tree_iter iter;
	unsigned long indices[16];
	unsigned long index;
	void **slot;
	int i, nr;

	/* A radix tree is freed by deleting all of its entries */
	index = 0;
	do {
		nr = 0;
		radix_tree_for_each_slot(slot, root, &iter, index) {
			indices[nr] = iter.index;
			if (++nr == 16)
				break;
		}
		for (i = 0; i < nr; i++) {
			index = indices[i];
			radix_tree_delete(root, index);
		}
	} while (nr > 0);
}

/**
 * gmap_free - free a guest address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_free(struct gmap *gmap)
{
	struct page *page, *next;

	/* Flush tlb. */
	if (MACHINE_HAS_IDTE)
		__tlb_flush_asce(gmap->mm, gmap->asce);
	else
		__tlb_flush_global();

	/* Free all segment & region tables. */
	list_for_each_entry_safe(page, next, &gmap->crst_list, lru)
		__free_pages(page, ALLOC_ORDER);
	gmap_radix_tree_free(&gmap->guest_to_host);
	gmap_radix_tree_free(&gmap->host_to_guest);
	down_write(&gmap->mm->mmap_sem);
	list_del(&gmap->list);
	up_write(&gmap->mm->mmap_sem);
	kfree(gmap);
}
EXPORT_SYMBOL_GPL(gmap_free);

/**
 * gmap_enable - switch primary space to the guest address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_enable(struct gmap *gmap)
{
	S390_lowcore.gmap = (unsigned long) gmap;
}
EXPORT_SYMBOL_GPL(gmap_enable);

/**
 * gmap_disable - switch back to the standard primary address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_disable(struct gmap *gmap)
{
	S390_lowcore.gmap = 0UL;
}
EXPORT_SYMBOL_GPL(gmap_disable);

/*
 * gmap_alloc_table is assumed to be called with mmap_sem held
 */
static int gmap_alloc_table(struct gmap *gmap, unsigned long *table,
			    unsigned long init, unsigned long gaddr)
{
	struct page *page;
	unsigned long *new;

	/* since we don't free the gmap table until gmap_free we can unlock */
	page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
	if (!page)
		return -ENOMEM;
	new = (unsigned long *) page_to_phys(page);
	crst_table_init(new, init);
	spin_lock(&gmap->mm->page_table_lock);
	if (*table & _REGION_ENTRY_INVALID) {
		list_add(&page->lru, &gmap->crst_list);
		*table = (unsigned long) new | _REGION_ENTRY_LENGTH |
			(*table & _REGION_ENTRY_TYPE_MASK);
		page->index = gaddr;
		page = NULL;
	}
	spin_unlock(&gmap->mm->page_table_lock);
	if (page)
		__free_pages(page, ALLOC_ORDER);
	return 0;
}

/**
 * __gmap_segment_gaddr - find virtual address from segment pointer
 * @entry: pointer to a segment table entry in the guest address space
 *
 * Returns the virtual address in the guest address space for the segment
 */
static unsigned long __gmap_segment_gaddr(unsigned long *entry)
{
	struct page *page;
	unsigned long offset, mask;

	offset = (unsigned long) entry / sizeof(unsigned long);
	offset = (offset & (PTRS_PER_PMD - 1)) * PMD_SIZE;
	mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1);
	page = virt_to_page((void *)((unsigned long) entry & mask));
	return page->index + offset;
}

/**
 * __gmap_unlink_by_vmaddr - unlink a single segment via a host address
 * @gmap: pointer to the guest address space structure
 * @vmaddr: address in the host process address space
 *
 * Returns 1 if a TLB flush is required
 */
static int __gmap_unlink_by_vmaddr(struct gmap *gmap, unsigned long vmaddr)
{
	unsigned long *entry;
	int flush = 0;

	spin_lock(&gmap->guest_table_lock);
	entry = radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT);
	if (entry) {
		flush = (*entry != _SEGMENT_ENTRY_INVALID);
		*entry = _SEGMENT_ENTRY_INVALID;
	}
	spin_unlock(&gmap->guest_table_lock);
	return flush;
}

/**
 * __gmap_unmap_by_gaddr - unmap a single segment via a guest address
 * @gmap: pointer to the guest address space structure
 * @gaddr: address in the guest address space
 *
 * Returns 1 if a TLB flush is required
 */
static int __gmap_unmap_by_gaddr(struct gmap *gmap, unsigned long gaddr)
{
	unsigned long vmaddr;

	vmaddr = (unsigned long) radix_tree_delete(&gmap->guest_to_host,
						   gaddr >> PMD_SHIFT);
	return vmaddr ? __gmap_unlink_by_vmaddr(gmap, vmaddr) : 0;
}

/**
 * gmap_unmap_segment - unmap segment from the guest address space
 * @gmap: pointer to the guest address space structure
 * @to: address in the guest address space
 * @len: length of the memory area to unmap
 *
 * Returns 0 if the unmap succeeded, -EINVAL if not.
 */
int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len)
{
	unsigned long off;
	int flush;

	if ((to | len) & (PMD_SIZE - 1))
		return -EINVAL;
	if (len == 0 || to + len < to)
		return -EINVAL;

	flush = 0;
	down_write(&gmap->mm->mmap_sem);
	for (off = 0; off < len; off += PMD_SIZE)
		flush |= __gmap_unmap_by_gaddr(gmap, to + off);
	up_write(&gmap->mm->mmap_sem);
	if (flush)
		gmap_flush_tlb(gmap);
	return 0;
}
EXPORT_SYMBOL_GPL(gmap_unmap_segment);

/**
 * gmap_map_segment - map a segment to the guest address space
 * @gmap: pointer to the guest address space structure
 * @from: source address in the parent address space
 * @to: target address in the guest address space
 * @len: length of the memory area to map
 *
 * Returns 0 if the mmap succeeded, -EINVAL or -ENOMEM if not.
 */
int gmap_map_segment(struct gmap *gmap, unsigned long from,
		     unsigned long to, unsigned long len)
{
	unsigned long off;
	int flush;

	if ((from | to | len) & (PMD_SIZE - 1))
		return -EINVAL;
	if (len == 0 || from + len < from || to + len < to ||
	    from + len > TASK_MAX_SIZE || to + len > gmap->asce_end)
		return -EINVAL;

	flush = 0;
	down_write(&gmap->mm->mmap_sem);
	for (off = 0; off < len; off += PMD_SIZE) {
		/* Remove old translation */
		flush |= __gmap_unmap_by_gaddr(gmap, to + off);
		/* Store new translation */
		if (radix_tree_insert(&gmap->guest_to_host,
				      (to + off) >> PMD_SHIFT,
				      (void *) from + off))
			break;
	}
	up_write(&gmap->mm->mmap_sem);
	if (flush)
		gmap_flush_tlb(gmap);
	if (off >= len)
		return 0;
	gmap_unmap_segment(gmap, to, len);
	return -ENOMEM;
}
EXPORT_SYMBOL_GPL(gmap_map_segment);

/**
 * __gmap_translate - translate a guest address to a user space address
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: guest address
 *
 * Returns user space address which corresponds to the guest address or
 * -EFAULT if no such mapping exists.
 * This function does not establish potentially missing page table entries.
 * The mmap_sem of the mm that belongs to the address space must be held
 * when this function gets called.
 */
unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr)
{
	unsigned long vmaddr;

	vmaddr = (unsigned long)
		radix_tree_lookup(&gmap->guest_to_host, gaddr >> PMD_SHIFT);
	return vmaddr ? (vmaddr | (gaddr & ~PMD_MASK)) : -EFAULT;
}
EXPORT_SYMBOL_GPL(__gmap_translate);

/**
 * gmap_translate - translate a guest address to a user space address
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: guest address
 *
 * Returns user space address which corresponds to the guest address or
 * -EFAULT if no such mapping exists.
 * This function does not establish potentially missing page table entries.
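 * In contrast to __gmap_translate() the mmap_sem of the parent mm is
 * acquired for reading and released again by this function.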
 */
unsigned long gmap_translate(struct gmap *gmap, unsigned long gaddr)
{
	unsigned long rc;

	down_read(&gmap->mm->mmap_sem);
	rc = __gmap_translate(gmap, gaddr);
	up_read(&gmap->mm->mmap_sem);
	return rc;
}
EXPORT_SYMBOL_GPL(gmap_translate);

/**
 * gmap_unlink - disconnect a page table from the gmap shadow tables
 * @mm: pointer to the parent mm_struct
 * @table: pointer to the host page table
 * @vmaddr: vm address associated with the host page table
 */
static void gmap_unlink(struct mm_struct *mm, unsigned long *table,
			unsigned long vmaddr)
{
	struct gmap *gmap;
	int flush;

	list_for_each_entry(gmap, &mm->context.gmap_list, list) {
		flush = __gmap_unlink_by_vmaddr(gmap, vmaddr);
		if (flush)
			gmap_flush_tlb(gmap);
	}
}

/**
 * __gmap_link - set up shadow page tables to connect a host to a guest address
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: guest address
 * @vmaddr: vm address
 *
 * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT
 * if the vm address is already mapped to a different guest segment.
 * The mmap_sem of the mm that belongs to the address space must be held
 * when this function gets called.
 */
int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
{
	struct mm_struct *mm;
	unsigned long *table;
	spinlock_t *ptl;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	int rc;

	/* Create higher level tables in the gmap page table */
	table = gmap->table;
	if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION1) {
		table += (gaddr >> 53) & 0x7ff;
		if ((*table & _REGION_ENTRY_INVALID) &&
		    gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY,
				     gaddr & 0xffe0000000000000UL))
			return -ENOMEM;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	}
	if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION2) {
		table += (gaddr >> 42) & 0x7ff;
		if ((*table & _REGION_ENTRY_INVALID) &&
		    gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY,
				     gaddr & 0xfffffc0000000000UL))
			return -ENOMEM;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	}
	if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION3) {
		table += (gaddr >> 31) & 0x7ff;
		if ((*table & _REGION_ENTRY_INVALID) &&
		    gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY,
				     gaddr & 0xffffffff80000000UL))
			return -ENOMEM;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	}
	table += (gaddr >> 20) & 0x7ff;
	/* Walk the parent mm page table */
	mm = gmap->mm;
	pgd = pgd_offset(mm, vmaddr);
	VM_BUG_ON(pgd_none(*pgd));
	pud = pud_offset(pgd, vmaddr);
	VM_BUG_ON(pud_none(*pud));
	pmd = pmd_offset(pud, vmaddr);
	VM_BUG_ON(pmd_none(*pmd));
	/* large pmds cannot yet be handled */
	if (pmd_large(*pmd))
		return -EFAULT;
	/* Link gmap segment table entry location to page table. */
	rc = radix_tree_preload(GFP_KERNEL);
	if (rc)
		return rc;
	ptl = pmd_lock(mm, pmd);
	spin_lock(&gmap->guest_table_lock);
	if (*table == _SEGMENT_ENTRY_INVALID) {
		rc = radix_tree_insert(&gmap->host_to_guest,
				       vmaddr >> PMD_SHIFT, table);
		if (!rc)
			*table = pmd_val(*pmd);
	} else
		rc = 0;
	spin_unlock(&gmap->guest_table_lock);
	spin_unlock(ptl);
	radix_tree_preload_end();
	return rc;
}

/**
 * gmap_fault - resolve a fault on a guest address
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: guest address
 * @fault_flags: flags to pass down to handle_mm_fault()
 *
 * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT
 * if the vm address is already mapped to a different guest segment.
 */
int gmap_fault(struct gmap *gmap, unsigned long gaddr,
	       unsigned int fault_flags)
{
	unsigned long vmaddr;
	int rc;

	down_read(&gmap->mm->mmap_sem);
	vmaddr = __gmap_translate(gmap, gaddr);
	if (IS_ERR_VALUE(vmaddr)) {
		rc = vmaddr;
		goto out_up;
	}
	if (fixup_user_fault(current, gmap->mm, vmaddr, fault_flags)) {
		rc = -EFAULT;
		goto out_up;
	}
	rc = __gmap_link(gmap, gaddr, vmaddr);
out_up:
	up_read(&gmap->mm->mmap_sem);
	return rc;
}
EXPORT_SYMBOL_GPL(gmap_fault);

static void gmap_zap_swap_entry(swp_entry_t entry, struct mm_struct *mm)
{
	if (!non_swap_entry(entry))
		dec_mm_counter(mm, MM_SWAPENTS);
	else if (is_migration_entry(entry)) {
		struct page *page = migration_entry_to_page(entry);

		if (PageAnon(page))
			dec_mm_counter(mm, MM_ANONPAGES);
		else
			dec_mm_counter(mm, MM_FILEPAGES);
	}
	free_swap_and_cache(entry);
}

/*
 * this function is assumed to be called with mmap_sem held
 */
void __gmap_zap(struct gmap *gmap, unsigned long gaddr)
{
	unsigned long vmaddr, ptev, pgstev;
	pte_t *ptep, pte;
	spinlock_t *ptl;
	pgste_t pgste;

	/* Find the vm address for the guest address */
	vmaddr = (unsigned long) radix_tree_lookup(&gmap->guest_to_host,
						   gaddr >> PMD_SHIFT);
	if (!vmaddr)
		return;
	vmaddr |= gaddr & ~PMD_MASK;
	/* Get pointer to the page table entry */
	ptep = get_locked_pte(gmap->mm, vmaddr, &ptl);
	if (unlikely(!ptep))
		return;
	pte = *ptep;
	if (!pte_swap(pte))
		goto out_pte;
	/* Zap unused and logically-zero pages */
	pgste = pgste_get_lock(ptep);
	pgstev = pgste_val(pgste);
	ptev = pte_val(pte);
	if (((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED) ||
	    ((pgstev & _PGSTE_GPS_ZERO) && (ptev & _PAGE_INVALID))) {
		gmap_zap_swap_entry(pte_to_swp_entry(pte), gmap->mm);
		pte_clear(gmap->mm, vmaddr, ptep);
	}
	pgste_set_unlock(ptep, pgste);
out_pte:
	pte_unmap_unlock(ptep, ptl);
}
EXPORT_SYMBOL_GPL(__gmap_zap);

void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to)
{
	unsigned long gaddr, vmaddr, size;
	struct vm_area_struct *vma;

	down_read(&gmap->mm->mmap_sem);
	for (gaddr = from; gaddr < to;
	     gaddr = (gaddr + PMD_SIZE) & PMD_MASK) {
		/* Find the vm address for the guest address */
		vmaddr = (unsigned long)
			radix_tree_lookup(&gmap->guest_to_host,
					  gaddr >> PMD_SHIFT);
		if (!vmaddr)
			continue;
		vmaddr |= gaddr & ~PMD_MASK;
		/* Find vma in the parent mm */
		vma = find_vma(gmap->mm, vmaddr);
		size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK));
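		/* Unmap the host ptes that back this piece of the guest mapping */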
		zap_page_range(vma, vmaddr, size, NULL);
	}
	up_read(&gmap->mm->mmap_sem);
}
EXPORT_SYMBOL_GPL(gmap_discard);

static LIST_HEAD(gmap_notifier_list);
static DEFINE_SPINLOCK(gmap_notifier_lock);

/**
 * gmap_register_ipte_notifier - register a pte invalidation callback
 * @nb: pointer to the gmap notifier block
 */
void gmap_register_ipte_notifier(struct gmap_notifier *nb)
{
	spin_lock(&gmap_notifier_lock);
	list_add(&nb->list, &gmap_notifier_list);
	spin_unlock(&gmap_notifier_lock);
}
EXPORT_SYMBOL_GPL(gmap_register_ipte_notifier);

/**
 * gmap_unregister_ipte_notifier - remove a pte invalidation callback
 * @nb: pointer to the gmap notifier block
 */
void gmap_unregister_ipte_notifier(struct gmap_notifier *nb)
{
	spin_lock(&gmap_notifier_lock);
	list_del_init(&nb->list);
	spin_unlock(&gmap_notifier_lock);
}
EXPORT_SYMBOL_GPL(gmap_unregister_ipte_notifier);

/**
 * gmap_ipte_notify - mark a range of ptes for invalidation notification
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: virtual address in the guest address space
 * @len: size of area
 *
 * Returns 0 if for each page in the given range a gmap mapping exists and
 * the invalidation notification could be set. If the gmap mapping is missing
 * for one or more pages -EFAULT is returned. If no memory could be allocated
 * -ENOMEM is returned. This function establishes missing page table entries.
 */
int gmap_ipte_notify(struct gmap *gmap, unsigned long gaddr, unsigned long len)
{
	unsigned long addr;
	spinlock_t *ptl;
	pte_t *ptep, entry;
	pgste_t pgste;
	int rc = 0;

	if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK))
		return -EINVAL;
	down_read(&gmap->mm->mmap_sem);
	while (len) {
		/* Convert gmap address and connect the page tables */
		addr = __gmap_translate(gmap, gaddr);
		if (IS_ERR_VALUE(addr)) {
			rc = addr;
			break;
		}
		/* Get the page mapped */
		if (fixup_user_fault(current, gmap->mm, addr, FAULT_FLAG_WRITE)) {
			rc = -EFAULT;
			break;
		}
		rc = __gmap_link(gmap, gaddr, addr);
		if (rc)
			break;
		/* Walk the process page table, lock and get pte pointer */
		ptep = get_locked_pte(gmap->mm, addr, &ptl);
		VM_BUG_ON(!ptep);
		/* Set notification bit in the pgste of the pte */
		entry = *ptep;
		if ((pte_val(entry) & (_PAGE_INVALID | _PAGE_PROTECT)) == 0) {
			pgste = pgste_get_lock(ptep);
			pgste_val(pgste) |= PGSTE_IN_BIT;
			pgste_set_unlock(ptep, pgste);
			gaddr += PAGE_SIZE;
			len -= PAGE_SIZE;
		}
		pte_unmap_unlock(ptep, ptl);
	}
	up_read(&gmap->mm->mmap_sem);
	return rc;
}
EXPORT_SYMBOL_GPL(gmap_ipte_notify);

/**
 * gmap_do_ipte_notify - call all invalidation callbacks for a specific pte.
 * @mm: pointer to the process mm_struct
 * @vmaddr: virtual address in the process address space
 * @pte: pointer to the page table entry
 *
 * This function is assumed to be called with the page table lock held
 * for the pte to notify.
 */
void gmap_do_ipte_notify(struct mm_struct *mm, unsigned long vmaddr, pte_t *pte)
{
	unsigned long offset, gaddr;
	unsigned long *table;
	struct gmap_notifier *nb;
	struct gmap *gmap;

	offset = ((unsigned long) pte) & (255 * sizeof(pte_t));
	offset = offset * (4096 / sizeof(pte_t));
	spin_lock(&gmap_notifier_lock);
	list_for_each_entry(gmap, &mm->context.gmap_list, list) {
		table = radix_tree_lookup(&gmap->host_to_guest,
					  vmaddr >> PMD_SHIFT);
		if (!table)
			continue;
		gaddr = __gmap_segment_gaddr(table) + offset;
		list_for_each_entry(nb, &gmap_notifier_list, list)
			nb->notifier_call(gmap, gaddr);
	}
	spin_unlock(&gmap_notifier_lock);
}
EXPORT_SYMBOL_GPL(gmap_do_ipte_notify);

static inline int page_table_with_pgste(struct page *page)
{
	return atomic_read(&page->_mapcount) == 0;
}

static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm)
{
	struct page *page;
	unsigned long *table;

	page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
	if (!page)
		return NULL;
	if (!pgtable_page_ctor(page)) {
		__free_page(page);
		return NULL;
	}
	atomic_set(&page->_mapcount, 0);
	table = (unsigned long *) page_to_phys(page);
	clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);
	clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2);
	return table;
}

static inline void page_table_free_pgste(unsigned long *table)
{
	struct page *page;

	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	pgtable_page_dtor(page);
	atomic_set(&page->_mapcount, -1);
	__free_page(page);
}

int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
			  unsigned long key, bool nq)
{
	spinlock_t *ptl;
	pgste_t old, new;
	pte_t *ptep;

	down_read(&mm->mmap_sem);
retry:
	ptep = get_locked_pte(mm, addr, &ptl);
	if (unlikely(!ptep)) {
		up_read(&mm->mmap_sem);
		return -EFAULT;
	}
	if (!(pte_val(*ptep) & _PAGE_INVALID) &&
	    (pte_val(*ptep) & _PAGE_PROTECT)) {
		pte_unmap_unlock(ptep, ptl);
		if (fixup_user_fault(current, mm, addr, FAULT_FLAG_WRITE)) {
			up_read(&mm->mmap_sem);
			return -EFAULT;
		}
		goto retry;
	}

	new = old = pgste_get_lock(ptep);
	pgste_val(new) &= ~(PGSTE_GR_BIT | PGSTE_GC_BIT |
			    PGSTE_ACC_BITS | PGSTE_FP_BIT);
	pgste_val(new) |= (key & (_PAGE_CHANGED | _PAGE_REFERENCED)) << 48;
	pgste_val(new) |= (key & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56;
	if (!(pte_val(*ptep) & _PAGE_INVALID)) {
		unsigned long address, bits, skey;

		address = pte_val(*ptep) & PAGE_MASK;
		skey = (unsigned long) page_get_storage_key(address);
		bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED);
		skey = key & (_PAGE_ACC_BITS | _PAGE_FP_BIT);
		/* Set storage key ACC and FP */
		page_set_storage_key(address, skey, !nq);
		/* Merge host changed & referenced into pgste */
		pgste_val(new) |= bits << 52;
	}
	/* changing the guest storage key is considered a change of the page */
	if ((pgste_val(new) ^ pgste_val(old)) &
	    (PGSTE_ACC_BITS | PGSTE_FP_BIT | PGSTE_GR_BIT | PGSTE_GC_BIT))
		pgste_val(new) |= PGSTE_UC_BIT;

	pgste_set_unlock(ptep, new);
	pte_unmap_unlock(ptep, ptl);
	up_read(&mm->mmap_sem);
	return 0;
}
EXPORT_SYMBOL(set_guest_storage_key);

unsigned long get_guest_storage_key(struct mm_struct *mm, unsigned long addr)
{
	spinlock_t *ptl;
	pgste_t pgste;
	pte_t *ptep;
	uint64_t physaddr;
	unsigned long key = 0;

	down_read(&mm->mmap_sem);
	ptep = get_locked_pte(mm, addr, &ptl);
	if (unlikely(!ptep)) {
		up_read(&mm->mmap_sem);
		return -EFAULT;
	}
	pgste = pgste_get_lock(ptep);

	if (pte_val(*ptep) & _PAGE_INVALID) {
		key |= (pgste_val(pgste) & PGSTE_ACC_BITS) >> 56;
		key |= (pgste_val(pgste) & PGSTE_FP_BIT) >> 56;
		key |= (pgste_val(pgste) & PGSTE_GR_BIT) >> 48;
		key |= (pgste_val(pgste) & PGSTE_GC_BIT) >> 48;
	} else {
		physaddr = pte_val(*ptep) & PAGE_MASK;
		key = page_get_storage_key(physaddr);

		/* Reflect guest's logical view, not physical */
		if (pgste_val(pgste) & PGSTE_GR_BIT)
			key |= _PAGE_REFERENCED;
		if (pgste_val(pgste) & PGSTE_GC_BIT)
			key |= _PAGE_CHANGED;
	}

	pgste_set_unlock(ptep, pgste);
	pte_unmap_unlock(ptep, ptl);
	up_read(&mm->mmap_sem);
	return key;
}
EXPORT_SYMBOL(get_guest_storage_key);

#else /* CONFIG_PGSTE */

static inline int page_table_with_pgste(struct page *page)
{
	return 0;
}

static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm)
{
	return NULL;
}

static inline void page_table_free_pgste(unsigned long *table)
{
}

static inline void gmap_unlink(struct mm_struct *mm, unsigned long *table,
			       unsigned long vmaddr)
{
}

#endif /* CONFIG_PGSTE */

static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
{
	unsigned int old, new;

	do {
		old = atomic_read(v);
		new = old ^ bits;
	} while (atomic_cmpxchg(v, old, new) != old);
	return new;
}

/*
 * page table entry allocation/free routines.
 */
unsigned long *page_table_alloc(struct mm_struct *mm)
{
	unsigned long *uninitialized_var(table);
	struct page *uninitialized_var(page);
	unsigned int mask, bit;

	if (mm_has_pgste(mm))
		return page_table_alloc_pgste(mm);
	/* Allocate fragments of a 4K page as 1K/2K page table */
	spin_lock_bh(&mm->context.list_lock);
	mask = FRAG_MASK;
	if (!list_empty(&mm->context.pgtable_list)) {
		page = list_first_entry(&mm->context.pgtable_list,
					struct page, lru);
		table = (unsigned long *) page_to_phys(page);
		mask = atomic_read(&page->_mapcount);
		mask = mask | (mask >> 4);
	}
	if ((mask & FRAG_MASK) == FRAG_MASK) {
		spin_unlock_bh(&mm->context.list_lock);
		page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
		if (!page)
			return NULL;
		if (!pgtable_page_ctor(page)) {
			__free_page(page);
			return NULL;
		}
		atomic_set(&page->_mapcount, 1);
		table = (unsigned long *) page_to_phys(page);
		clear_table(table, _PAGE_INVALID, PAGE_SIZE);
		spin_lock_bh(&mm->context.list_lock);
		list_add(&page->lru, &mm->context.pgtable_list);
	} else {
		for (bit = 1; mask & bit; bit <<= 1)
			table += PTRS_PER_PTE;
		mask = atomic_xor_bits(&page->_mapcount, bit);
		if ((mask & FRAG_MASK) == FRAG_MASK)
			list_del(&page->lru);
	}
	spin_unlock_bh(&mm->context.list_lock);
	return table;
}

void page_table_free(struct mm_struct *mm, unsigned long *table)
{
	struct page *page;
	unsigned int bit, mask;

	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	if (page_table_with_pgste(page))
		return page_table_free_pgste(table);
	/* Free 1K/2K page table fragment of a 4K page */
	bit = 1 << ((__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)));
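	/*
	 * page->_mapcount tracks which 1K/2K fragments of this page are
	 * still in use; the 4K page itself is freed only once the last
	 * fragment has been released (mask == 0 below).
	 */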
	spin_lock_bh(&mm->context.list_lock);
	if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
		list_del(&page->lru);
	mask = atomic_xor_bits(&page->_mapcount, bit);
	if (mask & FRAG_MASK)
		list_add(&page->lru, &mm->context.pgtable_list);
	spin_unlock_bh(&mm->context.list_lock);
	if (mask == 0) {
		pgtable_page_dtor(page);
		atomic_set(&page->_mapcount, -1);
		__free_page(page);
	}
}

static void __page_table_free_rcu(void *table, unsigned bit)
{
	struct page *page;

	if (bit == FRAG_MASK)
		return page_table_free_pgste(table);
	/* Free 1K/2K page table fragment of a 4K page */
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	if (atomic_xor_bits(&page->_mapcount, bit) == 0) {
		pgtable_page_dtor(page);
		atomic_set(&page->_mapcount, -1);
		__free_page(page);
	}
}

void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
			 unsigned long vmaddr)
{
	struct mm_struct *mm;
	struct page *page;
	unsigned int bit, mask;

	mm = tlb->mm;
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	if (page_table_with_pgste(page)) {
		gmap_unlink(mm, table, vmaddr);
		table = (unsigned long *) (__pa(table) | FRAG_MASK);
		tlb_remove_table(tlb, table);
		return;
	}
	bit = 1 << ((__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)));
	spin_lock_bh(&mm->context.list_lock);
	if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
		list_del(&page->lru);
	mask = atomic_xor_bits(&page->_mapcount, bit | (bit << 4));
	if (mask & FRAG_MASK)
		list_add_tail(&page->lru, &mm->context.pgtable_list);
	spin_unlock_bh(&mm->context.list_lock);
	table = (unsigned long *) (__pa(table) | (bit << 4));
	tlb_remove_table(tlb, table);
}

static void __tlb_remove_table(void *_table)
{
	const unsigned long mask = (FRAG_MASK << 4) | FRAG_MASK;
	void *table = (void *)((unsigned long) _table & ~mask);
	unsigned type = (unsigned long) _table & mask;

	if (type)
		__page_table_free_rcu(table, type);
	else
		free_pages((unsigned long) table, ALLOC_ORDER);
}

static void tlb_remove_table_smp_sync(void *arg)
{
	/* Simply deliver the interrupt */
}

static void tlb_remove_table_one(void *table)
{
	/*
	 * This isn't an RCU grace period and hence the page-tables cannot be
	 * assumed to be actually RCU-freed.
	 *
	 * It is however sufficient for software page-table walkers that rely
	 * on IRQ disabling. See the comment near struct mmu_table_batch.
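	 *
	 * The IPI below waits until every CPU has left such an IRQ-disabled
	 * section, so the table can no longer be reached by a lockless walker.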
	 */
	smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
	__tlb_remove_table(table);
}

static void tlb_remove_table_rcu(struct rcu_head *head)
{
	struct mmu_table_batch *batch;
	int i;

	batch = container_of(head, struct mmu_table_batch, rcu);

	for (i = 0; i < batch->nr; i++)
		__tlb_remove_table(batch->tables[i]);

	free_page((unsigned long)batch);
}

void tlb_table_flush(struct mmu_gather *tlb)
{
	struct mmu_table_batch **batch = &tlb->batch;

	if (*batch) {
		call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
		*batch = NULL;
	}
}

void tlb_remove_table(struct mmu_gather *tlb, void *table)
{
	struct mmu_table_batch **batch = &tlb->batch;

	tlb->mm->context.flush_mm = 1;
	if (*batch == NULL) {
		*batch = (struct mmu_table_batch *)
			__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
		if (*batch == NULL) {
			__tlb_flush_mm_lazy(tlb->mm);
			tlb_remove_table_one(table);
			return;
		}
		(*batch)->nr = 0;
	}
	(*batch)->tables[(*batch)->nr++] = table;
	if ((*batch)->nr == MAX_TABLE_BATCH)
		tlb_flush_mmu(tlb);
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static inline void thp_split_vma(struct vm_area_struct *vma)
{
	unsigned long addr;

	for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE)
		follow_page(vma, addr, FOLL_SPLIT);
}

static inline void thp_split_mm(struct mm_struct *mm)
{
	struct vm_area_struct *vma;

	for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
		thp_split_vma(vma);
		vma->vm_flags &= ~VM_HUGEPAGE;
		vma->vm_flags |= VM_NOHUGEPAGE;
	}
	mm->def_flags |= VM_NOHUGEPAGE;
}
#else
static inline void thp_split_mm(struct mm_struct *mm)
{
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

static unsigned long page_table_realloc_pmd(struct mmu_gather *tlb,
				struct mm_struct *mm, pud_t *pud,
				unsigned long addr, unsigned long end)
{
	unsigned long next, *table, *new;
	struct page *page;
	spinlock_t *ptl;
	pmd_t *pmd;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
again:
		if (pmd_none_or_clear_bad(pmd))
			continue;
		table = (unsigned long *) pmd_deref(*pmd);
		page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
		if (page_table_with_pgste(page))
			continue;
		/* Allocate new page table with pgstes */
		new = page_table_alloc_pgste(mm);
		if (!new)
			return -ENOMEM;

		ptl = pmd_lock(mm, pmd);
		if (likely((unsigned long *) pmd_deref(*pmd) == table)) {
			/* Nuke pmd entry pointing to the "short" page table */
			pmdp_flush_lazy(mm, addr, pmd);
			pmd_clear(pmd);
			/* Copy ptes from old table to new table */
			memcpy(new, table, PAGE_SIZE/2);
			clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);
			/* Establish new table */
			pmd_populate(mm, pmd, (pte_t *) new);
			/* Free old table with rcu, there might be a walker! */
			page_table_free_rcu(tlb, table, addr);
			new = NULL;
		}
		spin_unlock(ptl);
		if (new) {
			page_table_free_pgste(new);
			goto again;
		}
	} while (pmd++, addr = next, addr != end);

	return addr;
}

static unsigned long page_table_realloc_pud(struct mmu_gather *tlb,
				struct mm_struct *mm, pgd_t *pgd,
				unsigned long addr, unsigned long end)
{
	unsigned long next;
	pud_t *pud;

	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		next = page_table_realloc_pmd(tlb, mm, pud, addr, next);
		if (unlikely(IS_ERR_VALUE(next)))
			return next;
	} while (pud++, addr = next, addr != end);

	return addr;
}

static unsigned long page_table_realloc(struct mmu_gather *tlb, struct mm_struct *mm,
					unsigned long addr, unsigned long end)
{
	unsigned long next;
	pgd_t *pgd;

	pgd = pgd_offset(mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		next = page_table_realloc_pud(tlb, mm, pgd, addr, next);
		if (unlikely(IS_ERR_VALUE(next)))
			return next;
	} while (pgd++, addr = next, addr != end);

	return 0;
}

/*
 * Switch on pgstes for the current userspace process (for KVM).
 */
int s390_enable_sie(void)
{
	struct task_struct *tsk = current;
	struct mm_struct *mm = tsk->mm;
	struct mmu_gather tlb;

	/* Do we have pgstes? If yes, we are done */
	if (mm_has_pgste(tsk->mm))
		return 0;

	down_write(&mm->mmap_sem);
	/* Split thp mappings and disable thp for future mappings */
	thp_split_mm(mm);
	/* Reallocate the page tables with pgstes */
	tlb_gather_mmu(&tlb, mm, 0, TASK_SIZE);
	if (!page_table_realloc(&tlb, mm, 0, TASK_SIZE))
		mm->context.has_pgste = 1;
	tlb_finish_mmu(&tlb, 0, TASK_SIZE);
	up_write(&mm->mmap_sem);
	return mm->context.has_pgste ? 0 : -ENOMEM;
}
EXPORT_SYMBOL_GPL(s390_enable_sie);

/*
 * Enable storage key handling from now on and initialize the storage
 * keys with the default key.
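 * The keys are cleared by walking all page tables of the mm with the
 * pte callback below (see s390_enable_skey).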
 */
static int __s390_enable_skey(pte_t *pte, unsigned long addr,
			      unsigned long next, struct mm_walk *walk)
{
	unsigned long ptev;
	pgste_t pgste;

	pgste = pgste_get_lock(pte);
	/*
	 * Remove all zero page mappings; after establishing a policy to
	 * forbid zero page mappings, subsequent faults for such pages will
	 * get fresh anonymous pages.
	 */
	if (is_zero_pfn(pte_pfn(*pte))) {
		ptep_flush_direct(walk->mm, addr, pte);
		pte_val(*pte) = _PAGE_INVALID;
	}
	/* Clear storage key */
	pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT |
			      PGSTE_GR_BIT | PGSTE_GC_BIT);
	ptev = pte_val(*pte);
	if (!(ptev & _PAGE_INVALID) && (ptev & _PAGE_WRITE))
		page_set_storage_key(ptev & PAGE_MASK, PAGE_DEFAULT_KEY, 1);
	pgste_set_unlock(pte, pgste);
	return 0;
}

int s390_enable_skey(void)
{
	struct mm_walk walk = { .pte_entry = __s390_enable_skey };
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	int rc = 0;

	down_write(&mm->mmap_sem);
	if (mm_use_skey(mm))
		goto out_up;

	mm->context.use_skey = 1;
	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		if (ksm_madvise(vma, vma->vm_start, vma->vm_end,
				MADV_UNMERGEABLE, &vma->vm_flags)) {
			mm->context.use_skey = 0;
			rc = -ENOMEM;
			goto out_up;
		}
	}
	mm->def_flags &= ~VM_MERGEABLE;

	walk.mm = mm;
	walk_page_range(0, TASK_SIZE, &walk);

out_up:
	up_write(&mm->mmap_sem);
	return rc;
}
EXPORT_SYMBOL_GPL(s390_enable_skey);

/*
 * Reset CMMA state, make all pages stable again.
 */
static int __s390_reset_cmma(pte_t *pte, unsigned long addr,
			     unsigned long next, struct mm_walk *walk)
{
	pgste_t pgste;

	pgste = pgste_get_lock(pte);
	pgste_val(pgste) &= ~_PGSTE_GPS_USAGE_MASK;
	pgste_set_unlock(pte, pgste);
	return 0;
}

void s390_reset_cmma(struct mm_struct *mm)
{
	struct mm_walk walk = { .pte_entry = __s390_reset_cmma };

	down_write(&mm->mmap_sem);
	walk.mm = mm;
	walk_page_range(0, TASK_SIZE, &walk);
	up_write(&mm->mmap_sem);
}
EXPORT_SYMBOL_GPL(s390_reset_cmma);

/*
 * Test and reset if a guest page is dirty
 */
bool gmap_test_and_clear_dirty(unsigned long address, struct gmap *gmap)
{
	pte_t *pte;
	spinlock_t *ptl;
	bool dirty = false;

	pte = get_locked_pte(gmap->mm, address, &ptl);
	if (unlikely(!pte))
		return false;

	if (ptep_test_and_clear_user_dirty(gmap->mm, address, pte))
		dirty = true;

	spin_unlock(ptl);
	return dirty;
}
EXPORT_SYMBOL_GPL(gmap_test_and_clear_dirty);

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address,
			   pmd_t *pmdp)
{
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	/* No need to flush the TLB; on s390 the reference bits are kept in
	 * the storage key and never in the TLB. */
	return pmdp_test_and_clear_young(vma, address, pmdp);
}

int pmdp_set_access_flags(struct vm_area_struct *vma,
			  unsigned long address, pmd_t *pmdp,
			  pmd_t entry, int dirty)
{
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);

	entry = pmd_mkyoung(entry);
	if (dirty)
		entry = pmd_mkdirty(entry);
	if (pmd_same(*pmdp, entry))
		return 0;
	pmdp_invalidate(vma, address, pmdp);
	set_pmd_at(vma->vm_mm, address, pmdp, entry);
	return 1;
}

static void pmdp_splitting_flush_sync(void *arg)
{
	/* Simply deliver the interrupt */
}

void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
			  pmd_t *pmdp)
{
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	if (!test_and_set_bit(_SEGMENT_ENTRY_SPLIT_BIT,
			      (unsigned long *) pmdp)) {
		/* need to serialize against gup-fast (IRQ disabled) */
		smp_call_function(pmdp_splitting_flush_sync, NULL, 1);
	}
}

void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
				pgtable_t pgtable)
{
	struct list_head *lh = (struct list_head *) pgtable;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	if (!pmd_huge_pte(mm, pmdp))
		INIT_LIST_HEAD(lh);
	else
		list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
	pmd_huge_pte(mm, pmdp) = pgtable;
}

pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
	struct list_head *lh;
	pgtable_t pgtable;
	pte_t *ptep;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	pgtable = pmd_huge_pte(mm, pmdp);
	lh = (struct list_head *) pgtable;
	if (list_empty(lh))
		pmd_huge_pte(mm, pmdp) = NULL;
	else {
		pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
		list_del(lh);
	}
	ptep = (pte_t *) pgtable;
	pte_val(*ptep) = _PAGE_INVALID;
	ptep++;
	pte_val(*ptep) = _PAGE_INVALID;
	return pgtable;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */