/*
 * Copyright IBM Corp. 2007, 2011
 * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
 */

#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/quicklist.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/swapops.h>

#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>

#ifndef CONFIG_64BIT
#define ALLOC_ORDER	1
#define FRAG_MASK	0x0f
#else
#define ALLOC_ORDER	2
#define FRAG_MASK	0x03
#endif


unsigned long *crst_table_alloc(struct mm_struct *mm)
{
	struct page *page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);

	if (!page)
		return NULL;
	return (unsigned long *) page_to_phys(page);
}

void crst_table_free(struct mm_struct *mm, unsigned long *table)
{
	free_pages((unsigned long) table, ALLOC_ORDER);
}

#ifdef CONFIG_64BIT
static void __crst_table_upgrade(void *arg)
{
	struct mm_struct *mm = arg;

	if (current->active_mm == mm) {
		clear_user_asce();
		set_user_asce(mm);
	}
	__tlb_flush_local();
}

int crst_table_upgrade(struct mm_struct *mm, unsigned long limit)
{
	unsigned long *table, *pgd;
	unsigned long entry;
	int flush;

	BUG_ON(limit > (1UL << 53));
	flush = 0;
repeat:
	table = crst_table_alloc(mm);
	if (!table)
		return -ENOMEM;
	spin_lock_bh(&mm->page_table_lock);
	if (mm->context.asce_limit < limit) {
		pgd = (unsigned long *) mm->pgd;
		if (mm->context.asce_limit <= (1UL << 31)) {
			entry = _REGION3_ENTRY_EMPTY;
			mm->context.asce_limit = 1UL << 42;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION3;
		} else {
			entry = _REGION2_ENTRY_EMPTY;
			mm->context.asce_limit = 1UL << 53;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION2;
		}
		crst_table_init(table, entry);
		pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd);
		mm->pgd = (pgd_t *) table;
		mm->task_size = mm->context.asce_limit;
		table = NULL;
		flush = 1;
	}
	spin_unlock_bh(&mm->page_table_lock);
	if (table)
		crst_table_free(mm, table);
	if (mm->context.asce_limit < limit)
		goto repeat;
	if (flush)
		on_each_cpu(__crst_table_upgrade, mm, 0);
	return 0;
}

void crst_table_downgrade(struct mm_struct *mm, unsigned long limit)
{
	pgd_t *pgd;

	if (current->active_mm == mm) {
		clear_user_asce();
		__tlb_flush_mm(mm);
	}
	while (mm->context.asce_limit > limit) {
		pgd = mm->pgd;
		switch (pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) {
		case _REGION_ENTRY_TYPE_R2:
			mm->context.asce_limit = 1UL << 42;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION3;
			break;
		case _REGION_ENTRY_TYPE_R3:
			mm->context.asce_limit = 1UL << 31;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_SEGMENT;
			break;
		default:
			BUG();
		}
		mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
		mm->task_size = mm->context.asce_limit;
		crst_table_free(mm, (unsigned long *) pgd);
	}
	if (current->active_mm == mm)
		set_user_asce(mm);
}
#endif

#ifdef CONFIG_PGSTE
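
/*
 * Guest address space (gmap) support, used by KVM: the code below maintains
 * the gmap region/segment tables that translate guest addresses to host
 * virtual addresses, together with the page status table (PGSTE) handling.
 */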

/**
 * gmap_alloc - allocate a guest address space
 * @mm: pointer to the parent mm_struct
 * @limit: maximum size of the gmap address space
 *
 * Returns a guest address space structure.
 */
struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit)
{
	struct gmap *gmap;
	struct page *page;
	unsigned long *table;
	unsigned long etype, atype;

	if (limit < (1UL << 31)) {
		limit = (1UL << 31) - 1;
		atype = _ASCE_TYPE_SEGMENT;
		etype = _SEGMENT_ENTRY_EMPTY;
	} else if (limit < (1UL << 42)) {
		limit = (1UL << 42) - 1;
		atype = _ASCE_TYPE_REGION3;
		etype = _REGION3_ENTRY_EMPTY;
	} else if (limit < (1UL << 53)) {
		limit = (1UL << 53) - 1;
		atype = _ASCE_TYPE_REGION2;
		etype = _REGION2_ENTRY_EMPTY;
	} else {
		limit = -1UL;
		atype = _ASCE_TYPE_REGION1;
		etype = _REGION1_ENTRY_EMPTY;
	}
	gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL);
	if (!gmap)
		goto out;
	INIT_LIST_HEAD(&gmap->crst_list);
	INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL);
	INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC);
	spin_lock_init(&gmap->guest_table_lock);
	gmap->mm = mm;
	page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
	if (!page)
		goto out_free;
	page->index = 0;
	list_add(&page->lru, &gmap->crst_list);
	table = (unsigned long *) page_to_phys(page);
	crst_table_init(table, etype);
	gmap->table = table;
	gmap->asce = atype | _ASCE_TABLE_LENGTH |
		_ASCE_USER_BITS | __pa(table);
	gmap->asce_end = limit;
	down_write(&mm->mmap_sem);
	list_add(&gmap->list, &mm->context.gmap_list);
	up_write(&mm->mmap_sem);
	return gmap;

out_free:
	kfree(gmap);
out:
	return NULL;
}
EXPORT_SYMBOL_GPL(gmap_alloc);

static void gmap_flush_tlb(struct gmap *gmap)
{
	if (MACHINE_HAS_IDTE)
		__tlb_flush_asce(gmap->mm, gmap->asce);
	else
		__tlb_flush_global();
}

static void gmap_radix_tree_free(struct radix_tree_root *root)
{
	struct radix_tree_iter iter;
	unsigned long indices[16];
	unsigned long index;
	void **slot;
	int i, nr;

	/* A radix tree is freed by deleting all of its entries */
	index = 0;
	do {
		nr = 0;
		radix_tree_for_each_slot(slot, root, &iter, index) {
			indices[nr] = iter.index;
			if (++nr == 16)
				break;
		}
		for (i = 0; i < nr; i++) {
			index = indices[i];
			radix_tree_delete(root, index);
		}
	} while (nr > 0);
}

/**
 * gmap_free - free a guest address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_free(struct gmap *gmap)
{
	struct page *page, *next;

	/* Flush tlb. */
	if (MACHINE_HAS_IDTE)
		__tlb_flush_asce(gmap->mm, gmap->asce);
	else
		__tlb_flush_global();

	/* Free all segment & region tables. */
	list_for_each_entry_safe(page, next, &gmap->crst_list, lru)
		__free_pages(page, ALLOC_ORDER);
	gmap_radix_tree_free(&gmap->guest_to_host);
	gmap_radix_tree_free(&gmap->host_to_guest);
	down_write(&gmap->mm->mmap_sem);
	list_del(&gmap->list);
	up_write(&gmap->mm->mmap_sem);
	kfree(gmap);
}
EXPORT_SYMBOL_GPL(gmap_free);

/**
 * gmap_enable - switch primary space to the guest address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_enable(struct gmap *gmap)
{
	S390_lowcore.gmap = (unsigned long) gmap;
}
EXPORT_SYMBOL_GPL(gmap_enable);

/**
 * gmap_disable - switch back to the standard primary address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_disable(struct gmap *gmap)
{
	S390_lowcore.gmap = 0UL;
}
EXPORT_SYMBOL_GPL(gmap_disable);

/*
 * gmap_alloc_table is assumed to be called with mmap_sem held
 */
static int gmap_alloc_table(struct gmap *gmap, unsigned long *table,
			    unsigned long init, unsigned long gaddr)
{
	struct page *page;
	unsigned long *new;

	/* since we don't free the gmap table until gmap_free we can unlock */
	page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
	if (!page)
		return -ENOMEM;
	new = (unsigned long *) page_to_phys(page);
	crst_table_init(new, init);
	spin_lock(&gmap->mm->page_table_lock);
	if (*table & _REGION_ENTRY_INVALID) {
		list_add(&page->lru, &gmap->crst_list);
		*table = (unsigned long) new | _REGION_ENTRY_LENGTH |
			(*table & _REGION_ENTRY_TYPE_MASK);
		page->index = gaddr;
		page = NULL;
	}
	spin_unlock(&gmap->mm->page_table_lock);
	if (page)
		__free_pages(page, ALLOC_ORDER);
	return 0;
}

/**
 * __gmap_segment_gaddr - find virtual address from segment pointer
 * @entry: pointer to a segment table entry in the guest address space
 *
 * Returns the virtual address in the guest address space for the segment
 */
static unsigned long __gmap_segment_gaddr(unsigned long *entry)
{
	struct page *page;
	unsigned long offset;

	offset = (unsigned long) entry / sizeof(unsigned long);
	offset = (offset & (PTRS_PER_PMD - 1)) * PMD_SIZE;
	page = pmd_to_page((pmd_t *) entry);
	return page->index + offset;
}

/**
 * __gmap_unlink_by_vmaddr - unlink a single segment via a host address
 * @gmap: pointer to the guest address space structure
 * @vmaddr: address in the host process address space
 *
 * Returns 1 if a TLB flush is required
 */
static int __gmap_unlink_by_vmaddr(struct gmap *gmap, unsigned long vmaddr)
{
	unsigned long *entry;
	int flush = 0;

	spin_lock(&gmap->guest_table_lock);
	entry = radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT);
	if (entry) {
		flush = (*entry != _SEGMENT_ENTRY_INVALID);
		*entry = _SEGMENT_ENTRY_INVALID;
	}
	spin_unlock(&gmap->guest_table_lock);
	return flush;
}

/**
 * __gmap_unmap_by_gaddr - unmap a single segment via a guest address
 * @gmap: pointer to the guest address space structure
 * @gaddr: address in the guest address space
 *
 * Returns 1 if a TLB flush is required
 */
static int __gmap_unmap_by_gaddr(struct gmap *gmap, unsigned long gaddr)
{
	unsigned long vmaddr;

	vmaddr = (unsigned long) radix_tree_delete(&gmap->guest_to_host,
						   gaddr >> PMD_SHIFT);
	return vmaddr ? __gmap_unlink_by_vmaddr(gmap, vmaddr) : 0;
}
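
/*
 * The guest_to_host radix tree maps guest segment indices to host addresses;
 * host_to_guest maps host segment indices to the corresponding entries in the
 * gmap segment table, so that host side invalidations can find and clear the
 * guest mapping.
 */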

/**
 * gmap_unmap_segment - unmap segment from the guest address space
 * @gmap: pointer to the guest address space structure
 * @to: address in the guest address space
 * @len: length of the memory area to unmap
 *
 * Returns 0 if the unmap succeeded, -EINVAL if not.
 */
int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len)
{
	unsigned long off;
	int flush;

	if ((to | len) & (PMD_SIZE - 1))
		return -EINVAL;
	if (len == 0 || to + len < to)
		return -EINVAL;

	flush = 0;
	down_write(&gmap->mm->mmap_sem);
	for (off = 0; off < len; off += PMD_SIZE)
		flush |= __gmap_unmap_by_gaddr(gmap, to + off);
	up_write(&gmap->mm->mmap_sem);
	if (flush)
		gmap_flush_tlb(gmap);
	return 0;
}
EXPORT_SYMBOL_GPL(gmap_unmap_segment);

/**
 * gmap_map_segment - map a segment to the guest address space
 * @gmap: pointer to the guest address space structure
 * @from: source address in the parent address space
 * @to: target address in the guest address space
 * @len: length of the memory area to map
 *
 * Returns 0 if the mmap succeeded, -EINVAL or -ENOMEM if not.
 */
int gmap_map_segment(struct gmap *gmap, unsigned long from,
		     unsigned long to, unsigned long len)
{
	unsigned long off;
	int flush;

	if ((from | to | len) & (PMD_SIZE - 1))
		return -EINVAL;
	if (len == 0 || from + len < from || to + len < to ||
	    from + len > TASK_MAX_SIZE || to + len > gmap->asce_end)
		return -EINVAL;

	flush = 0;
	down_write(&gmap->mm->mmap_sem);
	for (off = 0; off < len; off += PMD_SIZE) {
		/* Remove old translation */
		flush |= __gmap_unmap_by_gaddr(gmap, to + off);
		/* Store new translation */
		if (radix_tree_insert(&gmap->guest_to_host,
				      (to + off) >> PMD_SHIFT,
				      (void *) from + off))
			break;
	}
	up_write(&gmap->mm->mmap_sem);
	if (flush)
		gmap_flush_tlb(gmap);
	if (off >= len)
		return 0;
	gmap_unmap_segment(gmap, to, len);
	return -ENOMEM;
}
EXPORT_SYMBOL_GPL(gmap_map_segment);

/**
 * __gmap_translate - translate a guest address to a user space address
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: guest address
 *
 * Returns user space address which corresponds to the guest address or
 * -EFAULT if no such mapping exists.
 * This function does not establish potentially missing page table entries.
 * The mmap_sem of the mm that belongs to the address space must be held
 * when this function gets called.
 */
unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr)
{
	unsigned long vmaddr;

	vmaddr = (unsigned long)
		radix_tree_lookup(&gmap->guest_to_host, gaddr >> PMD_SHIFT);
	return vmaddr ? (vmaddr | (gaddr & ~PMD_MASK)) : -EFAULT;
}
EXPORT_SYMBOL_GPL(__gmap_translate);

/**
 * gmap_translate - translate a guest address to a user space address
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: guest address
 *
 * Returns user space address which corresponds to the guest address or
 * -EFAULT if no such mapping exists.
 * This function does not establish potentially missing page table entries.
 */
unsigned long gmap_translate(struct gmap *gmap, unsigned long gaddr)
{
	unsigned long rc;

	down_read(&gmap->mm->mmap_sem);
	rc = __gmap_translate(gmap, gaddr);
	up_read(&gmap->mm->mmap_sem);
	return rc;
}
EXPORT_SYMBOL_GPL(gmap_translate);

/**
 * gmap_unlink - disconnect a page table from the gmap shadow tables
 * @mm: pointer to the parent mm_struct
 * @table: pointer to the host page table
 * @vmaddr: vm address associated with the host page table
 */
static void gmap_unlink(struct mm_struct *mm, unsigned long *table,
			unsigned long vmaddr)
{
	struct gmap *gmap;
	int flush;

	list_for_each_entry(gmap, &mm->context.gmap_list, list) {
		flush = __gmap_unlink_by_vmaddr(gmap, vmaddr);
		if (flush)
			gmap_flush_tlb(gmap);
	}
}

/**
 * __gmap_link - set up shadow page tables to connect a host to a guest address
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: guest address
 * @vmaddr: vm address
 *
 * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT
 * if the vm address is already mapped to a different guest segment.
 * The mmap_sem of the mm that belongs to the address space must be held
 * when this function gets called.
 */
int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
{
	struct mm_struct *mm;
	unsigned long *table;
	spinlock_t *ptl;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	int rc;

	/* Create higher level tables in the gmap page table */
	table = gmap->table;
	if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION1) {
		table += (gaddr >> 53) & 0x7ff;
		if ((*table & _REGION_ENTRY_INVALID) &&
		    gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY,
				     gaddr & 0xffe0000000000000))
			return -ENOMEM;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	}
	if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION2) {
		table += (gaddr >> 42) & 0x7ff;
		if ((*table & _REGION_ENTRY_INVALID) &&
		    gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY,
				     gaddr & 0xfffffc0000000000))
			return -ENOMEM;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	}
	if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION3) {
		table += (gaddr >> 31) & 0x7ff;
		if ((*table & _REGION_ENTRY_INVALID) &&
		    gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY,
				     gaddr & 0xffffffff80000000))
			return -ENOMEM;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	}
	table += (gaddr >> 20) & 0x7ff;
	/* Walk the parent mm page table */
	mm = gmap->mm;
	pgd = pgd_offset(mm, vmaddr);
	VM_BUG_ON(pgd_none(*pgd));
	pud = pud_offset(pgd, vmaddr);
	VM_BUG_ON(pud_none(*pud));
	pmd = pmd_offset(pud, vmaddr);
	VM_BUG_ON(pmd_none(*pmd));
	/* large pmds cannot yet be handled */
	if (pmd_large(*pmd))
		return -EFAULT;
	/* Link gmap segment table entry location to page table. */
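	/*
	 * Preallocate a radix tree node with GFP_KERNEL up front; the
	 * insertion below runs under the pmd lock and the guest_table_lock
	 * and therefore must not sleep.
	 */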
	rc = radix_tree_preload(GFP_KERNEL);
	if (rc)
		return rc;
	ptl = pmd_lock(mm, pmd);
	spin_lock(&gmap->guest_table_lock);
	if (*table == _SEGMENT_ENTRY_INVALID) {
		rc = radix_tree_insert(&gmap->host_to_guest,
				       vmaddr >> PMD_SHIFT, table);
		if (!rc)
			*table = pmd_val(*pmd);
	} else
		rc = 0;
	spin_unlock(&gmap->guest_table_lock);
	spin_unlock(ptl);
	radix_tree_preload_end();
	return rc;
}

/**
 * gmap_fault - resolve a fault on a guest address
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: guest address
 * @fault_flags: flags to pass down to handle_mm_fault()
 *
 * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT
 * if the vm address is already mapped to a different guest segment.
 */
int gmap_fault(struct gmap *gmap, unsigned long gaddr,
	       unsigned int fault_flags)
{
	unsigned long vmaddr;
	int rc;

	down_read(&gmap->mm->mmap_sem);
	vmaddr = __gmap_translate(gmap, gaddr);
	if (IS_ERR_VALUE(vmaddr)) {
		rc = vmaddr;
		goto out_up;
	}
	if (fixup_user_fault(current, gmap->mm, vmaddr, fault_flags)) {
		rc = -EFAULT;
		goto out_up;
	}
	rc = __gmap_link(gmap, gaddr, vmaddr);
out_up:
	up_read(&gmap->mm->mmap_sem);
	return rc;
}
EXPORT_SYMBOL_GPL(gmap_fault);

static void gmap_zap_swap_entry(swp_entry_t entry, struct mm_struct *mm)
{
	if (!non_swap_entry(entry))
		dec_mm_counter(mm, MM_SWAPENTS);
	else if (is_migration_entry(entry)) {
		struct page *page = migration_entry_to_page(entry);

		if (PageAnon(page))
			dec_mm_counter(mm, MM_ANONPAGES);
		else
			dec_mm_counter(mm, MM_FILEPAGES);
	}
	free_swap_and_cache(entry);
}

/*
 * this function is assumed to be called with mmap_sem held
 */
void __gmap_zap(struct gmap *gmap, unsigned long gaddr)
{
	unsigned long vmaddr, ptev, pgstev;
	pte_t *ptep, pte;
	spinlock_t *ptl;
	pgste_t pgste;

	/* Find the vm address for the guest address */
	vmaddr = (unsigned long) radix_tree_lookup(&gmap->guest_to_host,
						   gaddr >> PMD_SHIFT);
	if (!vmaddr)
		return;
	vmaddr |= gaddr & ~PMD_MASK;
	/* Get pointer to the page table entry */
	ptep = get_locked_pte(gmap->mm, vmaddr, &ptl);
	if (unlikely(!ptep))
		return;
	pte = *ptep;
	if (!pte_swap(pte))
		goto out_pte;
	/* Zap unused and logically-zero pages */
	pgste = pgste_get_lock(ptep);
	pgstev = pgste_val(pgste);
	ptev = pte_val(pte);
	if (((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED) ||
	    ((pgstev & _PGSTE_GPS_ZERO) && (ptev & _PAGE_INVALID))) {
		gmap_zap_swap_entry(pte_to_swp_entry(pte), gmap->mm);
		pte_clear(gmap->mm, vmaddr, ptep);
	}
	pgste_set_unlock(ptep, pgste);
out_pte:
	pte_unmap_unlock(ptep, ptl);
}
EXPORT_SYMBOL_GPL(__gmap_zap);

void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to)
{
	unsigned long gaddr, vmaddr, size;
	struct vm_area_struct *vma;

	down_read(&gmap->mm->mmap_sem);
	for (gaddr = from; gaddr < to;
	     gaddr = (gaddr + PMD_SIZE) & PMD_MASK) {
		/* Find the vm address for the guest address */
		vmaddr = (unsigned long)
			radix_tree_lookup(&gmap->guest_to_host,
					  gaddr >> PMD_SHIFT);
		if (!vmaddr)
			continue;
		vmaddr |= gaddr & ~PMD_MASK;
		/* Find vma in the parent mm */
		vma = find_vma(gmap->mm, vmaddr);
		size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK));
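		/* Unmap the host pages backing this part of the guest range */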
		zap_page_range(vma, vmaddr, size, NULL);
	}
	up_read(&gmap->mm->mmap_sem);
}
EXPORT_SYMBOL_GPL(gmap_discard);

static LIST_HEAD(gmap_notifier_list);
static DEFINE_SPINLOCK(gmap_notifier_lock);

/**
 * gmap_register_ipte_notifier - register a pte invalidation callback
 * @nb: pointer to the gmap notifier block
 */
void gmap_register_ipte_notifier(struct gmap_notifier *nb)
{
	spin_lock(&gmap_notifier_lock);
	list_add(&nb->list, &gmap_notifier_list);
	spin_unlock(&gmap_notifier_lock);
}
EXPORT_SYMBOL_GPL(gmap_register_ipte_notifier);

/**
 * gmap_unregister_ipte_notifier - remove a pte invalidation callback
 * @nb: pointer to the gmap notifier block
 */
void gmap_unregister_ipte_notifier(struct gmap_notifier *nb)
{
	spin_lock(&gmap_notifier_lock);
	list_del_init(&nb->list);
	spin_unlock(&gmap_notifier_lock);
}
EXPORT_SYMBOL_GPL(gmap_unregister_ipte_notifier);

/**
 * gmap_ipte_notify - mark a range of ptes for invalidation notification
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: virtual address in the guest address space
 * @len: size of area
 *
 * Returns 0 if for each page in the given range a gmap mapping exists and
 * the invalidation notification could be set. If the gmap mapping is missing
 * for one or more pages -EFAULT is returned. If no memory could be allocated
 * -ENOMEM is returned. This function establishes missing page table entries.
 */
int gmap_ipte_notify(struct gmap *gmap, unsigned long gaddr, unsigned long len)
{
	unsigned long addr;
	spinlock_t *ptl;
	pte_t *ptep, entry;
	pgste_t pgste;
	int rc = 0;

	if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK))
		return -EINVAL;
	down_read(&gmap->mm->mmap_sem);
	while (len) {
		/* Convert gmap address and connect the page tables */
		addr = __gmap_translate(gmap, gaddr);
		if (IS_ERR_VALUE(addr)) {
			rc = addr;
			break;
		}
		/* Get the page mapped */
		if (fixup_user_fault(current, gmap->mm, addr, FAULT_FLAG_WRITE)) {
			rc = -EFAULT;
			break;
		}
		rc = __gmap_link(gmap, gaddr, addr);
		if (rc)
			break;
		/* Walk the process page table, lock and get pte pointer */
		ptep = get_locked_pte(gmap->mm, addr, &ptl);
		if (unlikely(!ptep))
			continue;
		/* Set notification bit in the pgste of the pte */
		entry = *ptep;
		if ((pte_val(entry) & (_PAGE_INVALID | _PAGE_PROTECT)) == 0) {
			pgste = pgste_get_lock(ptep);
			pgste_val(pgste) |= PGSTE_IN_BIT;
			pgste_set_unlock(ptep, pgste);
			gaddr += PAGE_SIZE;
			len -= PAGE_SIZE;
		}
		spin_unlock(ptl);
	}
	up_read(&gmap->mm->mmap_sem);
	return rc;
}
EXPORT_SYMBOL_GPL(gmap_ipte_notify);

/**
 * gmap_do_ipte_notify - call all invalidation callbacks for a specific pte.
 * @mm: pointer to the process mm_struct
 * @vmaddr: virtual address in the process address space
 * @pte: pointer to the page table entry
 *
 * This function is assumed to be called with the page table lock held
 * for the pte to notify.
 */
void gmap_do_ipte_notify(struct mm_struct *mm, unsigned long vmaddr, pte_t *pte)
{
	unsigned long offset, gaddr;
	unsigned long *table;
	struct gmap_notifier *nb;
	struct gmap *gmap;

	/*
	 * Convert the byte offset of the pte within its 2K page table
	 * into the byte offset of the 4K page within the 1MB segment.
	 */
	offset = ((unsigned long) pte) & (255 * sizeof(pte_t));
	offset = offset * (4096 / sizeof(pte_t));
	spin_lock(&gmap_notifier_lock);
	list_for_each_entry(gmap, &mm->context.gmap_list, list) {
		table = radix_tree_lookup(&gmap->host_to_guest,
					  vmaddr >> PMD_SHIFT);
		if (!table)
			continue;
		gaddr = __gmap_segment_gaddr(table) + offset;
		list_for_each_entry(nb, &gmap_notifier_list, list)
			nb->notifier_call(gmap, gaddr);
	}
	spin_unlock(&gmap_notifier_lock);
}
EXPORT_SYMBOL_GPL(gmap_do_ipte_notify);

static inline int page_table_with_pgste(struct page *page)
{
	return atomic_read(&page->_mapcount) == 0;
}

static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm)
{
	struct page *page;
	unsigned long *table;

	page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
	if (!page)
		return NULL;
	if (!pgtable_page_ctor(page)) {
		__free_page(page);
		return NULL;
	}
	atomic_set(&page->_mapcount, 0);
	table = (unsigned long *) page_to_phys(page);
	clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);
	clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2);
	return table;
}

static inline void page_table_free_pgste(unsigned long *table)
{
	struct page *page;

	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	pgtable_page_dtor(page);
	atomic_set(&page->_mapcount, -1);
	__free_page(page);
}

static inline unsigned long page_table_reset_pte(struct mm_struct *mm, pmd_t *pmd,
			unsigned long addr, unsigned long end, bool init_skey)
{
	pte_t *start_pte, *pte;
	spinlock_t *ptl;
	pgste_t pgste;

	start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	pte = start_pte;
	do {
		pgste = pgste_get_lock(pte);
		pgste_val(pgste) &= ~_PGSTE_GPS_USAGE_MASK;
		if (init_skey) {
			unsigned long address;

			pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT |
					      PGSTE_GR_BIT | PGSTE_GC_BIT);

			/* skip invalid and not writable pages */
			if (pte_val(*pte) & _PAGE_INVALID ||
			    !(pte_val(*pte) & _PAGE_WRITE)) {
				pgste_set_unlock(pte, pgste);
				continue;
			}

			address = pte_val(*pte) & PAGE_MASK;
			page_set_storage_key(address, PAGE_DEFAULT_KEY, 1);
		}
		pgste_set_unlock(pte, pgste);
	} while (pte++, addr += PAGE_SIZE, addr != end);
	pte_unmap_unlock(start_pte, ptl);

	return addr;
}

static inline unsigned long page_table_reset_pmd(struct mm_struct *mm, pud_t *pud,
			unsigned long addr, unsigned long end, bool init_skey)
{
	unsigned long next;
	pmd_t *pmd;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(pmd))
			continue;
		next = page_table_reset_pte(mm, pmd, addr, next, init_skey);
	} while (pmd++, addr = next, addr != end);

	return addr;
}

static inline unsigned long page_table_reset_pud(struct mm_struct *mm, pgd_t *pgd,
			unsigned long addr, unsigned long end, bool init_skey)
{
	unsigned long next;
	pud_t *pud;

	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		next = page_table_reset_pmd(mm, pud, addr, next, init_skey);
	} while (pud++, addr = next, addr != end);

	return addr;
}
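
/*
 * Reset the guest page states in the pgstes for a range of the address
 * space; with init_skey the storage keys of all mapped, writable pages
 * are also set back to the default key and the mm is marked as using
 * storage keys.
 */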
void page_table_reset_pgste(struct mm_struct *mm, unsigned long start,
			    unsigned long end, bool init_skey)
{
	unsigned long addr, next;
	pgd_t *pgd;

	down_write(&mm->mmap_sem);
	if (init_skey && mm_use_skey(mm))
		goto out_up;
	addr = start;
	pgd = pgd_offset(mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		next = page_table_reset_pud(mm, pgd, addr, next, init_skey);
	} while (pgd++, addr = next, addr != end);
	if (init_skey)
		current->mm->context.use_skey = 1;
out_up:
	up_write(&mm->mmap_sem);
}
EXPORT_SYMBOL(page_table_reset_pgste);

int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
			  unsigned long key, bool nq)
{
	spinlock_t *ptl;
	pgste_t old, new;
	pte_t *ptep;

	down_read(&mm->mmap_sem);
retry:
	ptep = get_locked_pte(current->mm, addr, &ptl);
	if (unlikely(!ptep)) {
		up_read(&mm->mmap_sem);
		return -EFAULT;
	}
	if (!(pte_val(*ptep) & _PAGE_INVALID) &&
	     (pte_val(*ptep) & _PAGE_PROTECT)) {
		pte_unmap_unlock(ptep, ptl);
		if (fixup_user_fault(current, mm, addr, FAULT_FLAG_WRITE)) {
			up_read(&mm->mmap_sem);
			return -EFAULT;
		}
		goto retry;
	}

	new = old = pgste_get_lock(ptep);
	pgste_val(new) &= ~(PGSTE_GR_BIT | PGSTE_GC_BIT |
			    PGSTE_ACC_BITS | PGSTE_FP_BIT);
	pgste_val(new) |= (key & (_PAGE_CHANGED | _PAGE_REFERENCED)) << 48;
	pgste_val(new) |= (key & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56;
	if (!(pte_val(*ptep) & _PAGE_INVALID)) {
		unsigned long address, bits, skey;

		address = pte_val(*ptep) & PAGE_MASK;
		skey = (unsigned long) page_get_storage_key(address);
		bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED);
		skey = key & (_PAGE_ACC_BITS | _PAGE_FP_BIT);
		/* Set storage key ACC and FP */
		page_set_storage_key(address, skey, !nq);
		/* Merge host changed & referenced into pgste */
		pgste_val(new) |= bits << 52;
	}
	/* changing the guest storage key is considered a change of the page */
	if ((pgste_val(new) ^ pgste_val(old)) &
	    (PGSTE_ACC_BITS | PGSTE_FP_BIT | PGSTE_GR_BIT | PGSTE_GC_BIT))
		pgste_val(new) |= PGSTE_UC_BIT;

	pgste_set_unlock(ptep, new);
	pte_unmap_unlock(ptep, ptl);
	up_read(&mm->mmap_sem);
	return 0;
}
EXPORT_SYMBOL(set_guest_storage_key);

#else /* CONFIG_PGSTE */

static inline int page_table_with_pgste(struct page *page)
{
	return 0;
}

static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm)
{
	return NULL;
}

void page_table_reset_pgste(struct mm_struct *mm, unsigned long start,
			    unsigned long end, bool init_skey)
{
}

static inline void page_table_free_pgste(unsigned long *table)
{
}

static inline void gmap_unlink(struct mm_struct *mm, unsigned long *table,
			unsigned long vmaddr)
{
}

#endif /* CONFIG_PGSTE */

static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
{
	unsigned int old, new;

	do {
		old = atomic_read(v);
		new = old ^ bits;
	} while (atomic_cmpxchg(v, old, new) != old);
	return new;
}

/*
 * page table entry allocation/free routines.
 */
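/*
 * Page tables are only 2K (64 bit) or 1K (31 bit) in size, so they are
 * allocated as fragments of a full 4K page. The low bits of page->_mapcount
 * track which fragments are in use, and pages that still have a free
 * fragment are kept on mm->context.pgtable_list.
 */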
unsigned long *page_table_alloc(struct mm_struct *mm)
{
	unsigned long *uninitialized_var(table);
	struct page *uninitialized_var(page);
	unsigned int mask, bit;

	if (mm_has_pgste(mm))
		return page_table_alloc_pgste(mm);
	/* Allocate fragments of a 4K page as 1K/2K page table */
	spin_lock_bh(&mm->context.list_lock);
	mask = FRAG_MASK;
	if (!list_empty(&mm->context.pgtable_list)) {
		page = list_first_entry(&mm->context.pgtable_list,
					struct page, lru);
		table = (unsigned long *) page_to_phys(page);
		mask = atomic_read(&page->_mapcount);
		mask = mask | (mask >> 4);
	}
	if ((mask & FRAG_MASK) == FRAG_MASK) {
		spin_unlock_bh(&mm->context.list_lock);
		page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
		if (!page)
			return NULL;
		if (!pgtable_page_ctor(page)) {
			__free_page(page);
			return NULL;
		}
		atomic_set(&page->_mapcount, 1);
		table = (unsigned long *) page_to_phys(page);
		clear_table(table, _PAGE_INVALID, PAGE_SIZE);
		spin_lock_bh(&mm->context.list_lock);
		list_add(&page->lru, &mm->context.pgtable_list);
	} else {
		for (bit = 1; mask & bit; bit <<= 1)
			table += PTRS_PER_PTE;
		mask = atomic_xor_bits(&page->_mapcount, bit);
		if ((mask & FRAG_MASK) == FRAG_MASK)
			list_del(&page->lru);
	}
	spin_unlock_bh(&mm->context.list_lock);
	return table;
}

void page_table_free(struct mm_struct *mm, unsigned long *table)
{
	struct page *page;
	unsigned int bit, mask;

	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	if (page_table_with_pgste(page))
		return page_table_free_pgste(table);
	/* Free 1K/2K page table fragment of a 4K page */
	bit = 1 << ((__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)));
	spin_lock_bh(&mm->context.list_lock);
	if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
		list_del(&page->lru);
	mask = atomic_xor_bits(&page->_mapcount, bit);
	if (mask & FRAG_MASK)
		list_add(&page->lru, &mm->context.pgtable_list);
	spin_unlock_bh(&mm->context.list_lock);
	if (mask == 0) {
		pgtable_page_dtor(page);
		atomic_set(&page->_mapcount, -1);
		__free_page(page);
	}
}

static void __page_table_free_rcu(void *table, unsigned bit)
{
	struct page *page;

	if (bit == FRAG_MASK)
		return page_table_free_pgste(table);
	/* Free 1K/2K page table fragment of a 4K page */
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	if (atomic_xor_bits(&page->_mapcount, bit) == 0) {
		pgtable_page_dtor(page);
		atomic_set(&page->_mapcount, -1);
		__free_page(page);
	}
}
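
/*
 * Page tables that might still be visited by a concurrent, lockless walker
 * are freed via tlb_remove_table(). The fragment bits are encoded in the
 * low bits of the table pointer so that __tlb_remove_table() can tell
 * fragments, pgste page tables, and full CRST tables apart.
 */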
void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
			 unsigned long vmaddr)
{
	struct mm_struct *mm;
	struct page *page;
	unsigned int bit, mask;

	mm = tlb->mm;
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	if (page_table_with_pgste(page)) {
		gmap_unlink(mm, table, vmaddr);
		table = (unsigned long *) (__pa(table) | FRAG_MASK);
		tlb_remove_table(tlb, table);
		return;
	}
	bit = 1 << ((__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)));
	spin_lock_bh(&mm->context.list_lock);
	if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
		list_del(&page->lru);
	mask = atomic_xor_bits(&page->_mapcount, bit | (bit << 4));
	if (mask & FRAG_MASK)
		list_add_tail(&page->lru, &mm->context.pgtable_list);
	spin_unlock_bh(&mm->context.list_lock);
	table = (unsigned long *) (__pa(table) | (bit << 4));
	tlb_remove_table(tlb, table);
}

static void __tlb_remove_table(void *_table)
{
	const unsigned long mask = (FRAG_MASK << 4) | FRAG_MASK;
	void *table = (void *)((unsigned long) _table & ~mask);
	unsigned type = (unsigned long) _table & mask;

	if (type)
		__page_table_free_rcu(table, type);
	else
		free_pages((unsigned long) table, ALLOC_ORDER);
}

static void tlb_remove_table_smp_sync(void *arg)
{
	/* Simply deliver the interrupt */
}

static void tlb_remove_table_one(void *table)
{
	/*
	 * This isn't an RCU grace period and hence the page-tables cannot be
	 * assumed to be actually RCU-freed.
	 *
	 * It is however sufficient for software page-table walkers that rely
	 * on IRQ disabling. See the comment near struct mmu_table_batch.
	 */
	smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
	__tlb_remove_table(table);
}

static void tlb_remove_table_rcu(struct rcu_head *head)
{
	struct mmu_table_batch *batch;
	int i;

	batch = container_of(head, struct mmu_table_batch, rcu);

	for (i = 0; i < batch->nr; i++)
		__tlb_remove_table(batch->tables[i]);

	free_page((unsigned long)batch);
}

void tlb_table_flush(struct mmu_gather *tlb)
{
	struct mmu_table_batch **batch = &tlb->batch;

	if (*batch) {
		call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
		*batch = NULL;
	}
}

void tlb_remove_table(struct mmu_gather *tlb, void *table)
{
	struct mmu_table_batch **batch = &tlb->batch;

	tlb->mm->context.flush_mm = 1;
	if (*batch == NULL) {
		*batch = (struct mmu_table_batch *)
			__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
		if (*batch == NULL) {
			__tlb_flush_mm_lazy(tlb->mm);
			tlb_remove_table_one(table);
			return;
		}
		(*batch)->nr = 0;
	}
	(*batch)->tables[(*batch)->nr++] = table;
	if ((*batch)->nr == MAX_TABLE_BATCH)
		tlb_flush_mmu(tlb);
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static inline void thp_split_vma(struct vm_area_struct *vma)
{
	unsigned long addr;

	for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE)
		follow_page(vma, addr, FOLL_SPLIT);
}

static inline void thp_split_mm(struct mm_struct *mm)
{
	struct vm_area_struct *vma;

	for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
		thp_split_vma(vma);
		vma->vm_flags &= ~VM_HUGEPAGE;
		vma->vm_flags |= VM_NOHUGEPAGE;
	}
	mm->def_flags |= VM_NOHUGEPAGE;
}
#else
static inline void thp_split_mm(struct mm_struct *mm)
{
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
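
/*
 * Replace the existing 2K page tables of the process with full 4K tables
 * that have room for the pgstes, copying the pte contents and freeing the
 * old tables via RCU.
 */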
static unsigned long page_table_realloc_pmd(struct mmu_gather *tlb,
				struct mm_struct *mm, pud_t *pud,
				unsigned long addr, unsigned long end)
{
	unsigned long next, *table, *new;
	struct page *page;
	spinlock_t *ptl;
	pmd_t *pmd;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
again:
		if (pmd_none_or_clear_bad(pmd))
			continue;
		table = (unsigned long *) pmd_deref(*pmd);
		page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
		if (page_table_with_pgste(page))
			continue;
		/* Allocate new page table with pgstes */
		new = page_table_alloc_pgste(mm);
		if (!new)
			return -ENOMEM;

		ptl = pmd_lock(mm, pmd);
		if (likely((unsigned long *) pmd_deref(*pmd) == table)) {
			/* Nuke pmd entry pointing to the "short" page table */
			pmdp_flush_lazy(mm, addr, pmd);
			pmd_clear(pmd);
			/* Copy ptes from old table to new table */
			memcpy(new, table, PAGE_SIZE/2);
			clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);
			/* Establish new table */
			pmd_populate(mm, pmd, (pte_t *) new);
			/* Free old table with rcu, there might be a walker! */
			page_table_free_rcu(tlb, table, addr);
			new = NULL;
		}
		spin_unlock(ptl);
		if (new) {
			page_table_free_pgste(new);
			goto again;
		}
	} while (pmd++, addr = next, addr != end);

	return addr;
}

static unsigned long page_table_realloc_pud(struct mmu_gather *tlb,
				struct mm_struct *mm, pgd_t *pgd,
				unsigned long addr, unsigned long end)
{
	unsigned long next;
	pud_t *pud;

	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		next = page_table_realloc_pmd(tlb, mm, pud, addr, next);
		if (unlikely(IS_ERR_VALUE(next)))
			return next;
	} while (pud++, addr = next, addr != end);

	return addr;
}

static unsigned long page_table_realloc(struct mmu_gather *tlb, struct mm_struct *mm,
					unsigned long addr, unsigned long end)
{
	unsigned long next;
	pgd_t *pgd;

	pgd = pgd_offset(mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		next = page_table_realloc_pud(tlb, mm, pgd, addr, next);
		if (unlikely(IS_ERR_VALUE(next)))
			return next;
	} while (pgd++, addr = next, addr != end);

	return 0;
}

/*
 * Switch on pgstes for the current userspace process (needed for KVM).
 */
int s390_enable_sie(void)
{
	struct task_struct *tsk = current;
	struct mm_struct *mm = tsk->mm;
	struct mmu_gather tlb;

	/* Do we have pgstes? if yes, we are done */
	if (mm_has_pgste(tsk->mm))
		return 0;

	down_write(&mm->mmap_sem);
	/* split thp mappings and disable thp for future mappings */
	thp_split_mm(mm);
	/* Reallocate the page tables with pgstes */
	tlb_gather_mmu(&tlb, mm, 0, TASK_SIZE);
	if (!page_table_realloc(&tlb, mm, 0, TASK_SIZE))
		mm->context.has_pgste = 1;
	tlb_finish_mmu(&tlb, 0, TASK_SIZE);
	up_write(&mm->mmap_sem);
	return mm->context.has_pgste ? 0 : -ENOMEM;
}
EXPORT_SYMBOL_GPL(s390_enable_sie);

/*
 * Enable storage key handling from now on and initialize the storage
 * keys with the default key.
 */
void s390_enable_skey(void)
{
	page_table_reset_pgste(current->mm, 0, TASK_SIZE, true);
}
EXPORT_SYMBOL_GPL(s390_enable_skey);

/*
 * Test and reset if a guest page is dirty
 */
bool gmap_test_and_clear_dirty(unsigned long address, struct gmap *gmap)
{
	pte_t *pte;
	spinlock_t *ptl;
	bool dirty = false;

	pte = get_locked_pte(gmap->mm, address, &ptl);
	if (unlikely(!pte))
		return false;

	if (ptep_test_and_clear_user_dirty(gmap->mm, address, pte))
		dirty = true;

	spin_unlock(ptl);
	return dirty;
}
EXPORT_SYMBOL_GPL(gmap_test_and_clear_dirty);

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address,
			   pmd_t *pmdp)
{
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	/* No need to flush TLB
	 * On s390 reference bits are in storage key and never in TLB */
	return pmdp_test_and_clear_young(vma, address, pmdp);
}

int pmdp_set_access_flags(struct vm_area_struct *vma,
			  unsigned long address, pmd_t *pmdp,
			  pmd_t entry, int dirty)
{
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);

	entry = pmd_mkyoung(entry);
	if (dirty)
		entry = pmd_mkdirty(entry);
	if (pmd_same(*pmdp, entry))
		return 0;
	pmdp_invalidate(vma, address, pmdp);
	set_pmd_at(vma->vm_mm, address, pmdp, entry);
	return 1;
}

static void pmdp_splitting_flush_sync(void *arg)
{
	/* Simply deliver the interrupt */
}

void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
			  pmd_t *pmdp)
{
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	if (!test_and_set_bit(_SEGMENT_ENTRY_SPLIT_BIT,
			      (unsigned long *) pmdp)) {
		/* need to serialize against gup-fast (IRQ disabled) */
		smp_call_function(pmdp_splitting_flush_sync, NULL, 1);
	}
}

void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
				pgtable_t pgtable)
{
	struct list_head *lh = (struct list_head *) pgtable;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	if (!pmd_huge_pte(mm, pmdp))
		INIT_LIST_HEAD(lh);
	else
		list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
	pmd_huge_pte(mm, pmdp) = pgtable;
}

pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
	struct list_head *lh;
	pgtable_t pgtable;
	pte_t *ptep;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	pgtable = pmd_huge_pte(mm, pmdp);
	lh = (struct list_head *) pgtable;
	if (list_empty(lh))
		pmd_huge_pte(mm, pmdp) = NULL;
	else {
		pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
		list_del(lh);
	}
	ptep = (pte_t *) pgtable;
	pte_val(*ptep) = _PAGE_INVALID;
	ptep++;
	pte_val(*ptep) = _PAGE_INVALID;
	return pgtable;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */