1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * KVM guest address space mapping code 4 * 5 * Copyright IBM Corp. 2007, 2020 6 * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> 7 * David Hildenbrand <david@redhat.com> 8 * Janosch Frank <frankja@linux.vnet.ibm.com> 9 */ 10 11 #include <linux/kernel.h> 12 #include <linux/pagewalk.h> 13 #include <linux/swap.h> 14 #include <linux/smp.h> 15 #include <linux/spinlock.h> 16 #include <linux/slab.h> 17 #include <linux/swapops.h> 18 #include <linux/ksm.h> 19 #include <linux/mman.h> 20 #include <linux/pgtable.h> 21 22 #include <asm/pgalloc.h> 23 #include <asm/gmap.h> 24 #include <asm/tlb.h> 25 26 #define GMAP_SHADOW_FAKE_TABLE 1ULL 27 28 /** 29 * gmap_alloc - allocate and initialize a guest address space 30 * @limit: maximum address of the gmap address space 31 * 32 * Returns a guest address space structure. 33 */ 34 static struct gmap *gmap_alloc(unsigned long limit) 35 { 36 struct gmap *gmap; 37 struct page *page; 38 unsigned long *table; 39 unsigned long etype, atype; 40 41 if (limit < _REGION3_SIZE) { 42 limit = _REGION3_SIZE - 1; 43 atype = _ASCE_TYPE_SEGMENT; 44 etype = _SEGMENT_ENTRY_EMPTY; 45 } else if (limit < _REGION2_SIZE) { 46 limit = _REGION2_SIZE - 1; 47 atype = _ASCE_TYPE_REGION3; 48 etype = _REGION3_ENTRY_EMPTY; 49 } else if (limit < _REGION1_SIZE) { 50 limit = _REGION1_SIZE - 1; 51 atype = _ASCE_TYPE_REGION2; 52 etype = _REGION2_ENTRY_EMPTY; 53 } else { 54 limit = -1UL; 55 atype = _ASCE_TYPE_REGION1; 56 etype = _REGION1_ENTRY_EMPTY; 57 } 58 gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL_ACCOUNT); 59 if (!gmap) 60 goto out; 61 INIT_LIST_HEAD(&gmap->crst_list); 62 INIT_LIST_HEAD(&gmap->children); 63 INIT_LIST_HEAD(&gmap->pt_list); 64 INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL_ACCOUNT); 65 INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC | __GFP_ACCOUNT); 66 INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC | __GFP_ACCOUNT); 67 spin_lock_init(&gmap->guest_table_lock); 68 
spin_lock_init(&gmap->shadow_lock); 69 refcount_set(&gmap->ref_count, 1); 70 page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER); 71 if (!page) 72 goto out_free; 73 page->index = 0; 74 list_add(&page->lru, &gmap->crst_list); 75 table = page_to_virt(page); 76 crst_table_init(table, etype); 77 gmap->table = table; 78 gmap->asce = atype | _ASCE_TABLE_LENGTH | 79 _ASCE_USER_BITS | __pa(table); 80 gmap->asce_end = limit; 81 return gmap; 82 83 out_free: 84 kfree(gmap); 85 out: 86 return NULL; 87 } 88 89 /** 90 * gmap_create - create a guest address space 91 * @mm: pointer to the parent mm_struct 92 * @limit: maximum size of the gmap address space 93 * 94 * Returns a guest address space structure. 95 */ 96 struct gmap *gmap_create(struct mm_struct *mm, unsigned long limit) 97 { 98 struct gmap *gmap; 99 unsigned long gmap_asce; 100 101 gmap = gmap_alloc(limit); 102 if (!gmap) 103 return NULL; 104 gmap->mm = mm; 105 spin_lock(&mm->context.lock); 106 list_add_rcu(&gmap->list, &mm->context.gmap_list); 107 if (list_is_singular(&mm->context.gmap_list)) 108 gmap_asce = gmap->asce; 109 else 110 gmap_asce = -1UL; 111 WRITE_ONCE(mm->context.gmap_asce, gmap_asce); 112 spin_unlock(&mm->context.lock); 113 return gmap; 114 } 115 EXPORT_SYMBOL_GPL(gmap_create); 116 117 static void gmap_flush_tlb(struct gmap *gmap) 118 { 119 if (MACHINE_HAS_IDTE) 120 __tlb_flush_idte(gmap->asce); 121 else 122 __tlb_flush_global(); 123 } 124 125 static void gmap_radix_tree_free(struct radix_tree_root *root) 126 { 127 struct radix_tree_iter iter; 128 unsigned long indices[16]; 129 unsigned long index; 130 void __rcu **slot; 131 int i, nr; 132 133 /* A radix tree is freed by deleting all of its entries */ 134 index = 0; 135 do { 136 nr = 0; 137 radix_tree_for_each_slot(slot, root, &iter, index) { 138 indices[nr] = iter.index; 139 if (++nr == 16) 140 break; 141 } 142 for (i = 0; i < nr; i++) { 143 index = indices[i]; 144 radix_tree_delete(root, index); 145 } 146 } while (nr > 0); 147 } 148 149 static 
void gmap_rmap_radix_tree_free(struct radix_tree_root *root) 150 { 151 struct gmap_rmap *rmap, *rnext, *head; 152 struct radix_tree_iter iter; 153 unsigned long indices[16]; 154 unsigned long index; 155 void __rcu **slot; 156 int i, nr; 157 158 /* A radix tree is freed by deleting all of its entries */ 159 index = 0; 160 do { 161 nr = 0; 162 radix_tree_for_each_slot(slot, root, &iter, index) { 163 indices[nr] = iter.index; 164 if (++nr == 16) 165 break; 166 } 167 for (i = 0; i < nr; i++) { 168 index = indices[i]; 169 head = radix_tree_delete(root, index); 170 gmap_for_each_rmap_safe(rmap, rnext, head) 171 kfree(rmap); 172 } 173 } while (nr > 0); 174 } 175 176 /** 177 * gmap_free - free a guest address space 178 * @gmap: pointer to the guest address space structure 179 * 180 * No locks required. There are no references to this gmap anymore. 181 */ 182 static void gmap_free(struct gmap *gmap) 183 { 184 struct page *page, *next; 185 186 /* Flush tlb of all gmaps (if not already done for shadows) */ 187 if (!(gmap_is_shadow(gmap) && gmap->removed)) 188 gmap_flush_tlb(gmap); 189 /* Free all segment & region tables. */ 190 list_for_each_entry_safe(page, next, &gmap->crst_list, lru) 191 __free_pages(page, CRST_ALLOC_ORDER); 192 gmap_radix_tree_free(&gmap->guest_to_host); 193 gmap_radix_tree_free(&gmap->host_to_guest); 194 195 /* Free additional data for a shadow gmap */ 196 if (gmap_is_shadow(gmap)) { 197 /* Free all page tables. 
*/ 198 list_for_each_entry_safe(page, next, &gmap->pt_list, lru) 199 page_table_free_pgste(page); 200 gmap_rmap_radix_tree_free(&gmap->host_to_rmap); 201 /* Release reference to the parent */ 202 gmap_put(gmap->parent); 203 } 204 205 kfree(gmap); 206 } 207 208 /** 209 * gmap_get - increase reference counter for guest address space 210 * @gmap: pointer to the guest address space structure 211 * 212 * Returns the gmap pointer 213 */ 214 struct gmap *gmap_get(struct gmap *gmap) 215 { 216 refcount_inc(&gmap->ref_count); 217 return gmap; 218 } 219 EXPORT_SYMBOL_GPL(gmap_get); 220 221 /** 222 * gmap_put - decrease reference counter for guest address space 223 * @gmap: pointer to the guest address space structure 224 * 225 * If the reference counter reaches zero the guest address space is freed. 226 */ 227 void gmap_put(struct gmap *gmap) 228 { 229 if (refcount_dec_and_test(&gmap->ref_count)) 230 gmap_free(gmap); 231 } 232 EXPORT_SYMBOL_GPL(gmap_put); 233 234 /** 235 * gmap_remove - remove a guest address space but do not free it yet 236 * @gmap: pointer to the guest address space structure 237 */ 238 void gmap_remove(struct gmap *gmap) 239 { 240 struct gmap *sg, *next; 241 unsigned long gmap_asce; 242 243 /* Remove all shadow gmaps linked to this gmap */ 244 if (!list_empty(&gmap->children)) { 245 spin_lock(&gmap->shadow_lock); 246 list_for_each_entry_safe(sg, next, &gmap->children, list) { 247 list_del(&sg->list); 248 gmap_put(sg); 249 } 250 spin_unlock(&gmap->shadow_lock); 251 } 252 /* Remove gmap from the pre-mm list */ 253 spin_lock(&gmap->mm->context.lock); 254 list_del_rcu(&gmap->list); 255 if (list_empty(&gmap->mm->context.gmap_list)) 256 gmap_asce = 0; 257 else if (list_is_singular(&gmap->mm->context.gmap_list)) 258 gmap_asce = list_first_entry(&gmap->mm->context.gmap_list, 259 struct gmap, list)->asce; 260 else 261 gmap_asce = -1UL; 262 WRITE_ONCE(gmap->mm->context.gmap_asce, gmap_asce); 263 spin_unlock(&gmap->mm->context.lock); 264 synchronize_rcu(); 265 /* Put 
reference */ 266 gmap_put(gmap); 267 } 268 EXPORT_SYMBOL_GPL(gmap_remove); 269 270 /** 271 * gmap_enable - switch primary space to the guest address space 272 * @gmap: pointer to the guest address space structure 273 */ 274 void gmap_enable(struct gmap *gmap) 275 { 276 S390_lowcore.gmap = (unsigned long) gmap; 277 } 278 EXPORT_SYMBOL_GPL(gmap_enable); 279 280 /** 281 * gmap_disable - switch back to the standard primary address space 282 * @gmap: pointer to the guest address space structure 283 */ 284 void gmap_disable(struct gmap *gmap) 285 { 286 S390_lowcore.gmap = 0UL; 287 } 288 EXPORT_SYMBOL_GPL(gmap_disable); 289 290 /** 291 * gmap_get_enabled - get a pointer to the currently enabled gmap 292 * 293 * Returns a pointer to the currently enabled gmap. 0 if none is enabled. 294 */ 295 struct gmap *gmap_get_enabled(void) 296 { 297 return (struct gmap *) S390_lowcore.gmap; 298 } 299 EXPORT_SYMBOL_GPL(gmap_get_enabled); 300 301 /* 302 * gmap_alloc_table is assumed to be called with mmap_lock held 303 */ 304 static int gmap_alloc_table(struct gmap *gmap, unsigned long *table, 305 unsigned long init, unsigned long gaddr) 306 { 307 struct page *page; 308 unsigned long *new; 309 310 /* since we dont free the gmap table until gmap_free we can unlock */ 311 page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER); 312 if (!page) 313 return -ENOMEM; 314 new = page_to_virt(page); 315 crst_table_init(new, init); 316 spin_lock(&gmap->guest_table_lock); 317 if (*table & _REGION_ENTRY_INVALID) { 318 list_add(&page->lru, &gmap->crst_list); 319 *table = __pa(new) | _REGION_ENTRY_LENGTH | 320 (*table & _REGION_ENTRY_TYPE_MASK); 321 page->index = gaddr; 322 page = NULL; 323 } 324 spin_unlock(&gmap->guest_table_lock); 325 if (page) 326 __free_pages(page, CRST_ALLOC_ORDER); 327 return 0; 328 } 329 330 /** 331 * __gmap_segment_gaddr - find virtual address from segment pointer 332 * @entry: pointer to a segment table entry in the guest address space 333 * 334 * Returns the virtual 
address in the guest address space for the segment 335 */ 336 static unsigned long __gmap_segment_gaddr(unsigned long *entry) 337 { 338 struct page *page; 339 unsigned long offset; 340 341 offset = (unsigned long) entry / sizeof(unsigned long); 342 offset = (offset & (PTRS_PER_PMD - 1)) * PMD_SIZE; 343 page = pmd_pgtable_page((pmd_t *) entry); 344 return page->index + offset; 345 } 346 347 /** 348 * __gmap_unlink_by_vmaddr - unlink a single segment via a host address 349 * @gmap: pointer to the guest address space structure 350 * @vmaddr: address in the host process address space 351 * 352 * Returns 1 if a TLB flush is required 353 */ 354 static int __gmap_unlink_by_vmaddr(struct gmap *gmap, unsigned long vmaddr) 355 { 356 unsigned long *entry; 357 int flush = 0; 358 359 BUG_ON(gmap_is_shadow(gmap)); 360 spin_lock(&gmap->guest_table_lock); 361 entry = radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT); 362 if (entry) { 363 flush = (*entry != _SEGMENT_ENTRY_EMPTY); 364 *entry = _SEGMENT_ENTRY_EMPTY; 365 } 366 spin_unlock(&gmap->guest_table_lock); 367 return flush; 368 } 369 370 /** 371 * __gmap_unmap_by_gaddr - unmap a single segment via a guest address 372 * @gmap: pointer to the guest address space structure 373 * @gaddr: address in the guest address space 374 * 375 * Returns 1 if a TLB flush is required 376 */ 377 static int __gmap_unmap_by_gaddr(struct gmap *gmap, unsigned long gaddr) 378 { 379 unsigned long vmaddr; 380 381 vmaddr = (unsigned long) radix_tree_delete(&gmap->guest_to_host, 382 gaddr >> PMD_SHIFT); 383 return vmaddr ? __gmap_unlink_by_vmaddr(gmap, vmaddr) : 0; 384 } 385 386 /** 387 * gmap_unmap_segment - unmap segment from the guest address space 388 * @gmap: pointer to the guest address space structure 389 * @to: address in the guest address space 390 * @len: length of the memory area to unmap 391 * 392 * Returns 0 if the unmap succeeded, -EINVAL if not. 
393 */ 394 int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len) 395 { 396 unsigned long off; 397 int flush; 398 399 BUG_ON(gmap_is_shadow(gmap)); 400 if ((to | len) & (PMD_SIZE - 1)) 401 return -EINVAL; 402 if (len == 0 || to + len < to) 403 return -EINVAL; 404 405 flush = 0; 406 mmap_write_lock(gmap->mm); 407 for (off = 0; off < len; off += PMD_SIZE) 408 flush |= __gmap_unmap_by_gaddr(gmap, to + off); 409 mmap_write_unlock(gmap->mm); 410 if (flush) 411 gmap_flush_tlb(gmap); 412 return 0; 413 } 414 EXPORT_SYMBOL_GPL(gmap_unmap_segment); 415 416 /** 417 * gmap_map_segment - map a segment to the guest address space 418 * @gmap: pointer to the guest address space structure 419 * @from: source address in the parent address space 420 * @to: target address in the guest address space 421 * @len: length of the memory area to map 422 * 423 * Returns 0 if the mmap succeeded, -EINVAL or -ENOMEM if not. 424 */ 425 int gmap_map_segment(struct gmap *gmap, unsigned long from, 426 unsigned long to, unsigned long len) 427 { 428 unsigned long off; 429 int flush; 430 431 BUG_ON(gmap_is_shadow(gmap)); 432 if ((from | to | len) & (PMD_SIZE - 1)) 433 return -EINVAL; 434 if (len == 0 || from + len < from || to + len < to || 435 from + len - 1 > TASK_SIZE_MAX || to + len - 1 > gmap->asce_end) 436 return -EINVAL; 437 438 flush = 0; 439 mmap_write_lock(gmap->mm); 440 for (off = 0; off < len; off += PMD_SIZE) { 441 /* Remove old translation */ 442 flush |= __gmap_unmap_by_gaddr(gmap, to + off); 443 /* Store new translation */ 444 if (radix_tree_insert(&gmap->guest_to_host, 445 (to + off) >> PMD_SHIFT, 446 (void *) from + off)) 447 break; 448 } 449 mmap_write_unlock(gmap->mm); 450 if (flush) 451 gmap_flush_tlb(gmap); 452 if (off >= len) 453 return 0; 454 gmap_unmap_segment(gmap, to, len); 455 return -ENOMEM; 456 } 457 EXPORT_SYMBOL_GPL(gmap_map_segment); 458 459 /** 460 * __gmap_translate - translate a guest address to a user space address 461 * @gmap: pointer to 
guest mapping meta data structure 462 * @gaddr: guest address 463 * 464 * Returns user space address which corresponds to the guest address or 465 * -EFAULT if no such mapping exists. 466 * This function does not establish potentially missing page table entries. 467 * The mmap_lock of the mm that belongs to the address space must be held 468 * when this function gets called. 469 * 470 * Note: Can also be called for shadow gmaps. 471 */ 472 unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr) 473 { 474 unsigned long vmaddr; 475 476 vmaddr = (unsigned long) 477 radix_tree_lookup(&gmap->guest_to_host, gaddr >> PMD_SHIFT); 478 /* Note: guest_to_host is empty for a shadow gmap */ 479 return vmaddr ? (vmaddr | (gaddr & ~PMD_MASK)) : -EFAULT; 480 } 481 EXPORT_SYMBOL_GPL(__gmap_translate); 482 483 /** 484 * gmap_translate - translate a guest address to a user space address 485 * @gmap: pointer to guest mapping meta data structure 486 * @gaddr: guest address 487 * 488 * Returns user space address which corresponds to the guest address or 489 * -EFAULT if no such mapping exists. 490 * This function does not establish potentially missing page table entries. 
491 */ 492 unsigned long gmap_translate(struct gmap *gmap, unsigned long gaddr) 493 { 494 unsigned long rc; 495 496 mmap_read_lock(gmap->mm); 497 rc = __gmap_translate(gmap, gaddr); 498 mmap_read_unlock(gmap->mm); 499 return rc; 500 } 501 EXPORT_SYMBOL_GPL(gmap_translate); 502 503 /** 504 * gmap_unlink - disconnect a page table from the gmap shadow tables 505 * @mm: pointer to the parent mm_struct 506 * @table: pointer to the host page table 507 * @vmaddr: vm address associated with the host page table 508 */ 509 void gmap_unlink(struct mm_struct *mm, unsigned long *table, 510 unsigned long vmaddr) 511 { 512 struct gmap *gmap; 513 int flush; 514 515 rcu_read_lock(); 516 list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { 517 flush = __gmap_unlink_by_vmaddr(gmap, vmaddr); 518 if (flush) 519 gmap_flush_tlb(gmap); 520 } 521 rcu_read_unlock(); 522 } 523 524 static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *old, pmd_t new, 525 unsigned long gaddr); 526 527 /** 528 * __gmap_link - set up shadow page tables to connect a host to a guest address 529 * @gmap: pointer to guest mapping meta data structure 530 * @gaddr: guest address 531 * @vmaddr: vm address 532 * 533 * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT 534 * if the vm address is already mapped to a different guest segment. 535 * The mmap_lock of the mm that belongs to the address space must be held 536 * when this function gets called. 
537 */ 538 int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) 539 { 540 struct mm_struct *mm; 541 unsigned long *table; 542 spinlock_t *ptl; 543 pgd_t *pgd; 544 p4d_t *p4d; 545 pud_t *pud; 546 pmd_t *pmd; 547 u64 unprot; 548 int rc; 549 550 BUG_ON(gmap_is_shadow(gmap)); 551 /* Create higher level tables in the gmap page table */ 552 table = gmap->table; 553 if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION1) { 554 table += (gaddr & _REGION1_INDEX) >> _REGION1_SHIFT; 555 if ((*table & _REGION_ENTRY_INVALID) && 556 gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY, 557 gaddr & _REGION1_MASK)) 558 return -ENOMEM; 559 table = __va(*table & _REGION_ENTRY_ORIGIN); 560 } 561 if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION2) { 562 table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT; 563 if ((*table & _REGION_ENTRY_INVALID) && 564 gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY, 565 gaddr & _REGION2_MASK)) 566 return -ENOMEM; 567 table = __va(*table & _REGION_ENTRY_ORIGIN); 568 } 569 if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION3) { 570 table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT; 571 if ((*table & _REGION_ENTRY_INVALID) && 572 gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY, 573 gaddr & _REGION3_MASK)) 574 return -ENOMEM; 575 table = __va(*table & _REGION_ENTRY_ORIGIN); 576 } 577 table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; 578 /* Walk the parent mm page table */ 579 mm = gmap->mm; 580 pgd = pgd_offset(mm, vmaddr); 581 VM_BUG_ON(pgd_none(*pgd)); 582 p4d = p4d_offset(pgd, vmaddr); 583 VM_BUG_ON(p4d_none(*p4d)); 584 pud = pud_offset(p4d, vmaddr); 585 VM_BUG_ON(pud_none(*pud)); 586 /* large puds cannot yet be handled */ 587 if (pud_large(*pud)) 588 return -EFAULT; 589 pmd = pmd_offset(pud, vmaddr); 590 VM_BUG_ON(pmd_none(*pmd)); 591 /* Are we allowed to use huge pages? 
*/ 592 if (pmd_large(*pmd) && !gmap->mm->context.allow_gmap_hpage_1m) 593 return -EFAULT; 594 /* Link gmap segment table entry location to page table. */ 595 rc = radix_tree_preload(GFP_KERNEL_ACCOUNT); 596 if (rc) 597 return rc; 598 ptl = pmd_lock(mm, pmd); 599 spin_lock(&gmap->guest_table_lock); 600 if (*table == _SEGMENT_ENTRY_EMPTY) { 601 rc = radix_tree_insert(&gmap->host_to_guest, 602 vmaddr >> PMD_SHIFT, table); 603 if (!rc) { 604 if (pmd_large(*pmd)) { 605 *table = (pmd_val(*pmd) & 606 _SEGMENT_ENTRY_HARDWARE_BITS_LARGE) 607 | _SEGMENT_ENTRY_GMAP_UC; 608 } else 609 *table = pmd_val(*pmd) & 610 _SEGMENT_ENTRY_HARDWARE_BITS; 611 } 612 } else if (*table & _SEGMENT_ENTRY_PROTECT && 613 !(pmd_val(*pmd) & _SEGMENT_ENTRY_PROTECT)) { 614 unprot = (u64)*table; 615 unprot &= ~_SEGMENT_ENTRY_PROTECT; 616 unprot |= _SEGMENT_ENTRY_GMAP_UC; 617 gmap_pmdp_xchg(gmap, (pmd_t *)table, __pmd(unprot), gaddr); 618 } 619 spin_unlock(&gmap->guest_table_lock); 620 spin_unlock(ptl); 621 radix_tree_preload_end(); 622 return rc; 623 } 624 625 /** 626 * gmap_fault - resolve a fault on a guest address 627 * @gmap: pointer to guest mapping meta data structure 628 * @gaddr: guest address 629 * @fault_flags: flags to pass down to handle_mm_fault() 630 * 631 * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT 632 * if the vm address is already mapped to a different guest segment. 
633 */ 634 int gmap_fault(struct gmap *gmap, unsigned long gaddr, 635 unsigned int fault_flags) 636 { 637 unsigned long vmaddr; 638 int rc; 639 bool unlocked; 640 641 mmap_read_lock(gmap->mm); 642 643 retry: 644 unlocked = false; 645 vmaddr = __gmap_translate(gmap, gaddr); 646 if (IS_ERR_VALUE(vmaddr)) { 647 rc = vmaddr; 648 goto out_up; 649 } 650 if (fixup_user_fault(gmap->mm, vmaddr, fault_flags, 651 &unlocked)) { 652 rc = -EFAULT; 653 goto out_up; 654 } 655 /* 656 * In the case that fixup_user_fault unlocked the mmap_lock during 657 * faultin redo __gmap_translate to not race with a map/unmap_segment. 658 */ 659 if (unlocked) 660 goto retry; 661 662 rc = __gmap_link(gmap, gaddr, vmaddr); 663 out_up: 664 mmap_read_unlock(gmap->mm); 665 return rc; 666 } 667 EXPORT_SYMBOL_GPL(gmap_fault); 668 669 /* 670 * this function is assumed to be called with mmap_lock held 671 */ 672 void __gmap_zap(struct gmap *gmap, unsigned long gaddr) 673 { 674 struct vm_area_struct *vma; 675 unsigned long vmaddr; 676 spinlock_t *ptl; 677 pte_t *ptep; 678 679 /* Find the vm address for the guest address */ 680 vmaddr = (unsigned long) radix_tree_lookup(&gmap->guest_to_host, 681 gaddr >> PMD_SHIFT); 682 if (vmaddr) { 683 vmaddr |= gaddr & ~PMD_MASK; 684 685 vma = vma_lookup(gmap->mm, vmaddr); 686 if (!vma || is_vm_hugetlb_page(vma)) 687 return; 688 689 /* Get pointer to the page table entry */ 690 ptep = get_locked_pte(gmap->mm, vmaddr, &ptl); 691 if (likely(ptep)) { 692 ptep_zap_unused(gmap->mm, vmaddr, ptep, 0); 693 pte_unmap_unlock(ptep, ptl); 694 } 695 } 696 } 697 EXPORT_SYMBOL_GPL(__gmap_zap); 698 699 void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to) 700 { 701 unsigned long gaddr, vmaddr, size; 702 struct vm_area_struct *vma; 703 704 mmap_read_lock(gmap->mm); 705 for (gaddr = from; gaddr < to; 706 gaddr = (gaddr + PMD_SIZE) & PMD_MASK) { 707 /* Find the vm address for the guest address */ 708 vmaddr = (unsigned long) 709 radix_tree_lookup(&gmap->guest_to_host, 
710 gaddr >> PMD_SHIFT); 711 if (!vmaddr) 712 continue; 713 vmaddr |= gaddr & ~PMD_MASK; 714 /* Find vma in the parent mm */ 715 vma = find_vma(gmap->mm, vmaddr); 716 if (!vma) 717 continue; 718 /* 719 * We do not discard pages that are backed by 720 * hugetlbfs, so we don't have to refault them. 721 */ 722 if (is_vm_hugetlb_page(vma)) 723 continue; 724 size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK)); 725 zap_page_range_single(vma, vmaddr, size, NULL); 726 } 727 mmap_read_unlock(gmap->mm); 728 } 729 EXPORT_SYMBOL_GPL(gmap_discard); 730 731 static LIST_HEAD(gmap_notifier_list); 732 static DEFINE_SPINLOCK(gmap_notifier_lock); 733 734 /** 735 * gmap_register_pte_notifier - register a pte invalidation callback 736 * @nb: pointer to the gmap notifier block 737 */ 738 void gmap_register_pte_notifier(struct gmap_notifier *nb) 739 { 740 spin_lock(&gmap_notifier_lock); 741 list_add_rcu(&nb->list, &gmap_notifier_list); 742 spin_unlock(&gmap_notifier_lock); 743 } 744 EXPORT_SYMBOL_GPL(gmap_register_pte_notifier); 745 746 /** 747 * gmap_unregister_pte_notifier - remove a pte invalidation callback 748 * @nb: pointer to the gmap notifier block 749 */ 750 void gmap_unregister_pte_notifier(struct gmap_notifier *nb) 751 { 752 spin_lock(&gmap_notifier_lock); 753 list_del_rcu(&nb->list); 754 spin_unlock(&gmap_notifier_lock); 755 synchronize_rcu(); 756 } 757 EXPORT_SYMBOL_GPL(gmap_unregister_pte_notifier); 758 759 /** 760 * gmap_call_notifier - call all registered invalidation callbacks 761 * @gmap: pointer to guest mapping meta data structure 762 * @start: start virtual address in the guest address space 763 * @end: end virtual address in the guest address space 764 */ 765 static void gmap_call_notifier(struct gmap *gmap, unsigned long start, 766 unsigned long end) 767 { 768 struct gmap_notifier *nb; 769 770 list_for_each_entry(nb, &gmap_notifier_list, list) 771 nb->notifier_call(gmap, start, end); 772 } 773 774 /** 775 * gmap_table_walk - walk the gmap page tables 776 * 
@gmap: pointer to guest mapping meta data structure 777 * @gaddr: virtual address in the guest address space 778 * @level: page table level to stop at 779 * 780 * Returns a table entry pointer for the given guest address and @level 781 * @level=0 : returns a pointer to a page table table entry (or NULL) 782 * @level=1 : returns a pointer to a segment table entry (or NULL) 783 * @level=2 : returns a pointer to a region-3 table entry (or NULL) 784 * @level=3 : returns a pointer to a region-2 table entry (or NULL) 785 * @level=4 : returns a pointer to a region-1 table entry (or NULL) 786 * 787 * Returns NULL if the gmap page tables could not be walked to the 788 * requested level. 789 * 790 * Note: Can also be called for shadow gmaps. 791 */ 792 static inline unsigned long *gmap_table_walk(struct gmap *gmap, 793 unsigned long gaddr, int level) 794 { 795 const int asce_type = gmap->asce & _ASCE_TYPE_MASK; 796 unsigned long *table = gmap->table; 797 798 if (gmap_is_shadow(gmap) && gmap->removed) 799 return NULL; 800 801 if (WARN_ON_ONCE(level > (asce_type >> 2) + 1)) 802 return NULL; 803 804 if (asce_type != _ASCE_TYPE_REGION1 && 805 gaddr & (-1UL << (31 + (asce_type >> 2) * 11))) 806 return NULL; 807 808 switch (asce_type) { 809 case _ASCE_TYPE_REGION1: 810 table += (gaddr & _REGION1_INDEX) >> _REGION1_SHIFT; 811 if (level == 4) 812 break; 813 if (*table & _REGION_ENTRY_INVALID) 814 return NULL; 815 table = __va(*table & _REGION_ENTRY_ORIGIN); 816 fallthrough; 817 case _ASCE_TYPE_REGION2: 818 table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT; 819 if (level == 3) 820 break; 821 if (*table & _REGION_ENTRY_INVALID) 822 return NULL; 823 table = __va(*table & _REGION_ENTRY_ORIGIN); 824 fallthrough; 825 case _ASCE_TYPE_REGION3: 826 table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT; 827 if (level == 2) 828 break; 829 if (*table & _REGION_ENTRY_INVALID) 830 return NULL; 831 table = __va(*table & _REGION_ENTRY_ORIGIN); 832 fallthrough; 833 case _ASCE_TYPE_SEGMENT: 834 table 
+= (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; 835 if (level == 1) 836 break; 837 if (*table & _REGION_ENTRY_INVALID) 838 return NULL; 839 table = __va(*table & _SEGMENT_ENTRY_ORIGIN); 840 table += (gaddr & _PAGE_INDEX) >> _PAGE_SHIFT; 841 } 842 return table; 843 } 844 845 /** 846 * gmap_pte_op_walk - walk the gmap page table, get the page table lock 847 * and return the pte pointer 848 * @gmap: pointer to guest mapping meta data structure 849 * @gaddr: virtual address in the guest address space 850 * @ptl: pointer to the spinlock pointer 851 * 852 * Returns a pointer to the locked pte for a guest address, or NULL 853 */ 854 static pte_t *gmap_pte_op_walk(struct gmap *gmap, unsigned long gaddr, 855 spinlock_t **ptl) 856 { 857 unsigned long *table; 858 859 BUG_ON(gmap_is_shadow(gmap)); 860 /* Walk the gmap page table, lock and get pte pointer */ 861 table = gmap_table_walk(gmap, gaddr, 1); /* get segment pointer */ 862 if (!table || *table & _SEGMENT_ENTRY_INVALID) 863 return NULL; 864 return pte_alloc_map_lock(gmap->mm, (pmd_t *) table, gaddr, ptl); 865 } 866 867 /** 868 * gmap_pte_op_fixup - force a page in and connect the gmap page table 869 * @gmap: pointer to guest mapping meta data structure 870 * @gaddr: virtual address in the guest address space 871 * @vmaddr: address in the host process address space 872 * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE 873 * 874 * Returns 0 if the caller can retry __gmap_translate (might fail again), 875 * -ENOMEM if out of memory and -EFAULT if anything goes wrong while fixing 876 * up or connecting the gmap page table. 877 */ 878 static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr, 879 unsigned long vmaddr, int prot) 880 { 881 struct mm_struct *mm = gmap->mm; 882 unsigned int fault_flags; 883 bool unlocked = false; 884 885 BUG_ON(gmap_is_shadow(gmap)); 886 fault_flags = (prot == PROT_WRITE) ? 
FAULT_FLAG_WRITE : 0; 887 if (fixup_user_fault(mm, vmaddr, fault_flags, &unlocked)) 888 return -EFAULT; 889 if (unlocked) 890 /* lost mmap_lock, caller has to retry __gmap_translate */ 891 return 0; 892 /* Connect the page tables */ 893 return __gmap_link(gmap, gaddr, vmaddr); 894 } 895 896 /** 897 * gmap_pte_op_end - release the page table lock 898 * @ptep: pointer to the locked pte 899 * @ptl: pointer to the page table spinlock 900 */ 901 static void gmap_pte_op_end(pte_t *ptep, spinlock_t *ptl) 902 { 903 pte_unmap_unlock(ptep, ptl); 904 } 905 906 /** 907 * gmap_pmd_op_walk - walk the gmap tables, get the guest table lock 908 * and return the pmd pointer 909 * @gmap: pointer to guest mapping meta data structure 910 * @gaddr: virtual address in the guest address space 911 * 912 * Returns a pointer to the pmd for a guest address, or NULL 913 */ 914 static inline pmd_t *gmap_pmd_op_walk(struct gmap *gmap, unsigned long gaddr) 915 { 916 pmd_t *pmdp; 917 918 BUG_ON(gmap_is_shadow(gmap)); 919 pmdp = (pmd_t *) gmap_table_walk(gmap, gaddr, 1); 920 if (!pmdp) 921 return NULL; 922 923 /* without huge pages, there is no need to take the table lock */ 924 if (!gmap->mm->context.allow_gmap_hpage_1m) 925 return pmd_none(*pmdp) ? NULL : pmdp; 926 927 spin_lock(&gmap->guest_table_lock); 928 if (pmd_none(*pmdp)) { 929 spin_unlock(&gmap->guest_table_lock); 930 return NULL; 931 } 932 933 /* 4k page table entries are locked via the pte (pte_alloc_map_lock). 
*/ 934 if (!pmd_large(*pmdp)) 935 spin_unlock(&gmap->guest_table_lock); 936 return pmdp; 937 } 938 939 /** 940 * gmap_pmd_op_end - release the guest_table_lock if needed 941 * @gmap: pointer to the guest mapping meta data structure 942 * @pmdp: pointer to the pmd 943 */ 944 static inline void gmap_pmd_op_end(struct gmap *gmap, pmd_t *pmdp) 945 { 946 if (pmd_large(*pmdp)) 947 spin_unlock(&gmap->guest_table_lock); 948 } 949 950 /* 951 * gmap_protect_pmd - remove access rights to memory and set pmd notification bits 952 * @pmdp: pointer to the pmd to be protected 953 * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE 954 * @bits: notification bits to set 955 * 956 * Returns: 957 * 0 if successfully protected 958 * -EAGAIN if a fixup is needed 959 * -EINVAL if unsupported notifier bits have been specified 960 * 961 * Expected to be called with sg->mm->mmap_lock in read and 962 * guest_table_lock held. 963 */ 964 static int gmap_protect_pmd(struct gmap *gmap, unsigned long gaddr, 965 pmd_t *pmdp, int prot, unsigned long bits) 966 { 967 int pmd_i = pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID; 968 int pmd_p = pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT; 969 pmd_t new = *pmdp; 970 971 /* Fixup needed */ 972 if ((pmd_i && (prot != PROT_NONE)) || (pmd_p && (prot == PROT_WRITE))) 973 return -EAGAIN; 974 975 if (prot == PROT_NONE && !pmd_i) { 976 new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_INVALID)); 977 gmap_pmdp_xchg(gmap, pmdp, new, gaddr); 978 } 979 980 if (prot == PROT_READ && !pmd_p) { 981 new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_INVALID)); 982 new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_PROTECT)); 983 gmap_pmdp_xchg(gmap, pmdp, new, gaddr); 984 } 985 986 if (bits & GMAP_NOTIFY_MPROT) 987 set_pmd(pmdp, set_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_IN))); 988 989 /* Shadow GMAP protection needs split PMDs */ 990 if (bits & GMAP_NOTIFY_SHADOW) 991 return -EINVAL; 992 993 return 0; 994 } 995 996 /* 997 * gmap_protect_pte - remove access rights to 
memory and set pgste bits 998 * @gmap: pointer to guest mapping meta data structure 999 * @gaddr: virtual address in the guest address space 1000 * @pmdp: pointer to the pmd associated with the pte 1001 * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE 1002 * @bits: notification bits to set 1003 * 1004 * Returns 0 if successfully protected, -ENOMEM if out of memory and 1005 * -EAGAIN if a fixup is needed. 1006 * 1007 * Expected to be called with sg->mm->mmap_lock in read 1008 */ 1009 static int gmap_protect_pte(struct gmap *gmap, unsigned long gaddr, 1010 pmd_t *pmdp, int prot, unsigned long bits) 1011 { 1012 int rc; 1013 pte_t *ptep; 1014 spinlock_t *ptl; 1015 unsigned long pbits = 0; 1016 1017 if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID) 1018 return -EAGAIN; 1019 1020 ptep = pte_alloc_map_lock(gmap->mm, pmdp, gaddr, &ptl); 1021 if (!ptep) 1022 return -ENOMEM; 1023 1024 pbits |= (bits & GMAP_NOTIFY_MPROT) ? PGSTE_IN_BIT : 0; 1025 pbits |= (bits & GMAP_NOTIFY_SHADOW) ? PGSTE_VSIE_BIT : 0; 1026 /* Protect and unlock. */ 1027 rc = ptep_force_prot(gmap->mm, gaddr, ptep, prot, pbits); 1028 gmap_pte_op_end(ptep, ptl); 1029 return rc; 1030 } 1031 1032 /* 1033 * gmap_protect_range - remove access rights to memory and set pgste bits 1034 * @gmap: pointer to guest mapping meta data structure 1035 * @gaddr: virtual address in the guest address space 1036 * @len: size of area 1037 * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE 1038 * @bits: pgste notification bits to set 1039 * 1040 * Returns 0 if successfully protected, -ENOMEM if out of memory and 1041 * -EFAULT if gaddr is invalid (or mapping for shadows is missing). 1042 * 1043 * Called with sg->mm->mmap_lock in read. 
1044 */ 1045 static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr, 1046 unsigned long len, int prot, unsigned long bits) 1047 { 1048 unsigned long vmaddr, dist; 1049 pmd_t *pmdp; 1050 int rc; 1051 1052 BUG_ON(gmap_is_shadow(gmap)); 1053 while (len) { 1054 rc = -EAGAIN; 1055 pmdp = gmap_pmd_op_walk(gmap, gaddr); 1056 if (pmdp) { 1057 if (!pmd_large(*pmdp)) { 1058 rc = gmap_protect_pte(gmap, gaddr, pmdp, prot, 1059 bits); 1060 if (!rc) { 1061 len -= PAGE_SIZE; 1062 gaddr += PAGE_SIZE; 1063 } 1064 } else { 1065 rc = gmap_protect_pmd(gmap, gaddr, pmdp, prot, 1066 bits); 1067 if (!rc) { 1068 dist = HPAGE_SIZE - (gaddr & ~HPAGE_MASK); 1069 len = len < dist ? 0 : len - dist; 1070 gaddr = (gaddr & HPAGE_MASK) + HPAGE_SIZE; 1071 } 1072 } 1073 gmap_pmd_op_end(gmap, pmdp); 1074 } 1075 if (rc) { 1076 if (rc == -EINVAL) 1077 return rc; 1078 1079 /* -EAGAIN, fixup of userspace mm and gmap */ 1080 vmaddr = __gmap_translate(gmap, gaddr); 1081 if (IS_ERR_VALUE(vmaddr)) 1082 return vmaddr; 1083 rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, prot); 1084 if (rc) 1085 return rc; 1086 } 1087 } 1088 return 0; 1089 } 1090 1091 /** 1092 * gmap_mprotect_notify - change access rights for a range of ptes and 1093 * call the notifier if any pte changes again 1094 * @gmap: pointer to guest mapping meta data structure 1095 * @gaddr: virtual address in the guest address space 1096 * @len: size of area 1097 * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE 1098 * 1099 * Returns 0 if for each page in the given range a gmap mapping exists, 1100 * the new access rights could be set and the notifier could be armed. 1101 * If the gmap mapping is missing for one or more pages -EFAULT is 1102 * returned. If no memory could be allocated -ENOMEM is returned. 1103 * This function establishes missing page table entries. 
1104 */ 1105 int gmap_mprotect_notify(struct gmap *gmap, unsigned long gaddr, 1106 unsigned long len, int prot) 1107 { 1108 int rc; 1109 1110 if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK) || gmap_is_shadow(gmap)) 1111 return -EINVAL; 1112 if (!MACHINE_HAS_ESOP && prot == PROT_READ) 1113 return -EINVAL; 1114 mmap_read_lock(gmap->mm); 1115 rc = gmap_protect_range(gmap, gaddr, len, prot, GMAP_NOTIFY_MPROT); 1116 mmap_read_unlock(gmap->mm); 1117 return rc; 1118 } 1119 EXPORT_SYMBOL_GPL(gmap_mprotect_notify); 1120 1121 /** 1122 * gmap_read_table - get an unsigned long value from a guest page table using 1123 * absolute addressing, without marking the page referenced. 1124 * @gmap: pointer to guest mapping meta data structure 1125 * @gaddr: virtual address in the guest address space 1126 * @val: pointer to the unsigned long value to return 1127 * 1128 * Returns 0 if the value was read, -ENOMEM if out of memory and -EFAULT 1129 * if reading using the virtual address failed. -EINVAL if called on a gmap 1130 * shadow. 1131 * 1132 * Called with gmap->mm->mmap_lock in read. 1133 */ 1134 int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val) 1135 { 1136 unsigned long address, vmaddr; 1137 spinlock_t *ptl; 1138 pte_t *ptep, pte; 1139 int rc; 1140 1141 if (gmap_is_shadow(gmap)) 1142 return -EINVAL; 1143 1144 while (1) { 1145 rc = -EAGAIN; 1146 ptep = gmap_pte_op_walk(gmap, gaddr, &ptl); 1147 if (ptep) { 1148 pte = *ptep; 1149 if (pte_present(pte) && (pte_val(pte) & _PAGE_READ)) { 1150 address = pte_val(pte) & PAGE_MASK; 1151 address += gaddr & ~PAGE_MASK; 1152 *val = *(unsigned long *)__va(address); 1153 set_pte(ptep, set_pte_bit(*ptep, __pgprot(_PAGE_YOUNG))); 1154 /* Do *NOT* clear the _PAGE_INVALID bit! 
*/ 1155 rc = 0; 1156 } 1157 gmap_pte_op_end(ptep, ptl); 1158 } 1159 if (!rc) 1160 break; 1161 vmaddr = __gmap_translate(gmap, gaddr); 1162 if (IS_ERR_VALUE(vmaddr)) { 1163 rc = vmaddr; 1164 break; 1165 } 1166 rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, PROT_READ); 1167 if (rc) 1168 break; 1169 } 1170 return rc; 1171 } 1172 EXPORT_SYMBOL_GPL(gmap_read_table); 1173 1174 /** 1175 * gmap_insert_rmap - add a rmap to the host_to_rmap radix tree 1176 * @sg: pointer to the shadow guest address space structure 1177 * @vmaddr: vm address associated with the rmap 1178 * @rmap: pointer to the rmap structure 1179 * 1180 * Called with the sg->guest_table_lock 1181 */ 1182 static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr, 1183 struct gmap_rmap *rmap) 1184 { 1185 struct gmap_rmap *temp; 1186 void __rcu **slot; 1187 1188 BUG_ON(!gmap_is_shadow(sg)); 1189 slot = radix_tree_lookup_slot(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT); 1190 if (slot) { 1191 rmap->next = radix_tree_deref_slot_protected(slot, 1192 &sg->guest_table_lock); 1193 for (temp = rmap->next; temp; temp = temp->next) { 1194 if (temp->raddr == rmap->raddr) { 1195 kfree(rmap); 1196 return; 1197 } 1198 } 1199 radix_tree_replace_slot(&sg->host_to_rmap, slot, rmap); 1200 } else { 1201 rmap->next = NULL; 1202 radix_tree_insert(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT, 1203 rmap); 1204 } 1205 } 1206 1207 /** 1208 * gmap_protect_rmap - restrict access rights to memory (RO) and create an rmap 1209 * @sg: pointer to the shadow guest address space structure 1210 * @raddr: rmap address in the shadow gmap 1211 * @paddr: address in the parent guest address space 1212 * @len: length of the memory area to protect 1213 * 1214 * Returns 0 if successfully protected and the rmap was created, -ENOMEM 1215 * if out of memory and -EFAULT if paddr is invalid. 
 */
static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr,
			     unsigned long paddr, unsigned long len)
{
	struct gmap *parent;
	struct gmap_rmap *rmap;
	unsigned long vmaddr;
	spinlock_t *ptl;
	pte_t *ptep;
	int rc;

	BUG_ON(!gmap_is_shadow(sg));
	parent = sg->parent;
	/* One page per iteration; a page is retried after a successful fixup */
	while (len) {
		vmaddr = __gmap_translate(parent, paddr);
		if (IS_ERR_VALUE(vmaddr))
			return vmaddr;
		/*
		 * Allocate the rmap and preload the radix tree up front,
		 * before the pte lock and the guest_table_lock are taken.
		 */
		rmap = kzalloc(sizeof(*rmap), GFP_KERNEL_ACCOUNT);
		if (!rmap)
			return -ENOMEM;
		rmap->raddr = raddr;
		rc = radix_tree_preload(GFP_KERNEL_ACCOUNT);
		if (rc) {
			kfree(rmap);
			return rc;
		}
		/* Stays -EAGAIN when the pte walk finds nothing */
		rc = -EAGAIN;
		ptep = gmap_pte_op_walk(parent, paddr, &ptl);
		if (ptep) {
			spin_lock(&sg->guest_table_lock);
			rc = ptep_force_prot(parent->mm, paddr, ptep, PROT_READ,
					     PGSTE_VSIE_BIT);
			/* The rmap is only recorded once the pte is read-only */
			if (!rc)
				gmap_insert_rmap(sg, vmaddr, rmap);
			spin_unlock(&sg->guest_table_lock);
			gmap_pte_op_end(ptep, ptl);
		}
		radix_tree_preload_end();
		if (rc) {
			/* rmap was not inserted: free it, fix up, retry the page */
			kfree(rmap);
			rc = gmap_pte_op_fixup(parent, paddr, vmaddr, PROT_READ);
			if (rc)
				return rc;
			continue;
		}
		paddr += PAGE_SIZE;
		len -= PAGE_SIZE;
	}
	return 0;
}

/*
 * Table-level tags that are ORed into the low bits of an rmap address,
 * e.g. gmap_shadow_r2t() builds (saddr & _REGION1_MASK) | _SHADOW_RMAP_REGION1.
 */
#define _SHADOW_RMAP_MASK	0x7
#define _SHADOW_RMAP_REGION1	0x5
#define _SHADOW_RMAP_REGION2	0x4
#define _SHADOW_RMAP_REGION3	0x3
#define _SHADOW_RMAP_SEGMENT	0x2
#define _SHADOW_RMAP_PGTABLE	0x1

/**
 * gmap_idte_one - invalidate a single region or segment table entry
 * @asce: region or segment table *origin* + table-type bits
 * @vaddr: virtual address to identify the table entry to flush
 *
 * The invalid bit of a single region or segment table entry is set
 * and the associated TLB entries depending on the entry are flushed.
 * The table-type of the @asce identifies the portion of the @vaddr
 * that is used as the invalidation index.
 */
static inline void gmap_idte_one(unsigned long asce, unsigned long vaddr)
{
	/* "memory" clobber: the instruction modifies the table entry */
	asm volatile(
		" idte %0,0,%1"
		: : "a" (asce), "a" (vaddr) : "cc", "memory");
}

/**
 * gmap_unshadow_page - remove a page from a shadow page table
 * @sg: pointer to the shadow guest address space structure
 * @raddr: rmap address in the shadow guest address space
 *
 * Does nothing if no page table entry exists for @raddr or if the entry
 * is already invalid. Otherwise the notifier is called for the page
 * before the shadow pte is removed.
 *
 * Called with the sg->guest_table_lock
 */
static void gmap_unshadow_page(struct gmap *sg, unsigned long raddr)
{
	unsigned long *table;

	BUG_ON(!gmap_is_shadow(sg));
	table = gmap_table_walk(sg, raddr, 0); /* get page table pointer */
	if (!table || *table & _PAGE_INVALID)
		return;
	gmap_call_notifier(sg, raddr, raddr + _PAGE_SIZE - 1);
	ptep_unshadow_pte(sg->mm, raddr, (pte_t *) table);
}

/**
 * __gmap_unshadow_pgt - remove all entries from a shadow page table
 * @sg: pointer to the shadow guest address space structure
 * @raddr: rmap address in the shadow guest address space
 * @pgt: pointer to the start of a shadow page table
 *
 * Every one of the _PAGE_ENTRIES entries is overwritten with _PAGE_INVALID.
 *
 * Called with the sg->guest_table_lock
 */
static void __gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr,
				unsigned long *pgt)
{
	int i;

	BUG_ON(!gmap_is_shadow(sg));
	/* raddr is advanced per entry but not otherwise used in this loop */
	for (i = 0; i < _PAGE_ENTRIES; i++, raddr += _PAGE_SIZE)
		pgt[i] = _PAGE_INVALID;
}

/**
 * gmap_unshadow_pgt - remove a shadow page table from a segment entry
 * @sg: pointer to the shadow guest address space structure
 * @raddr: address in the shadow guest address space
 *
 * Does nothing if the segment entry for @raddr is missing or has no origin.
 * Otherwise the notifier is called for the segment, the entry is
 * invalidated with IDTE and reset to empty, and the page table is freed.
 *
 * Called with the sg->guest_table_lock
 */
static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr)
{
	unsigned long *ste;
	phys_addr_t sto, pgt;
	struct page *page;

	BUG_ON(!gmap_is_shadow(sg));
	ste = gmap_table_walk(sg, raddr, 1); /* get segment pointer */
	if (!ste || !(*ste & _SEGMENT_ENTRY_ORIGIN))
		return;
	gmap_call_notifier(sg, raddr, raddr + _SEGMENT_SIZE - 1);
	/* Step back by the entry's index to get the segment table origin */
	sto = __pa(ste - ((raddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT));
	gmap_idte_one(sto | _ASCE_TYPE_SEGMENT, raddr);
	/* Remember the page table before the entry is overwritten */
	pgt = *ste & _SEGMENT_ENTRY_ORIGIN;
	*ste = _SEGMENT_ENTRY_EMPTY;
	__gmap_unshadow_pgt(sg, raddr, __va(pgt));
	/* Free page table */
	page = phys_to_page(pgt);
	list_del(&page->lru);
	page_table_free_pgste(page);
}

/**
 * __gmap_unshadow_sgt - remove all entries from a shadow segment table
 * @sg: pointer to the shadow guest address space structure
 * @raddr: rmap address in the shadow guest address space
 * @sgt: pointer to the start of a shadow segment table
 *
 * Each entry that has an origin is reset to empty and its page table is
 * invalidated and freed. No notifier is called and no TLB flush is done
 * in this function.
 *
 * Called with the sg->guest_table_lock
 */
static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr,
				unsigned long *sgt)
{
	struct page *page;
	phys_addr_t pgt;
	int i;

	BUG_ON(!gmap_is_shadow(sg));
	for (i = 0; i < _CRST_ENTRIES; i++, raddr += _SEGMENT_SIZE) {
		if (!(sgt[i] & _SEGMENT_ENTRY_ORIGIN))
			continue;
		/*
		 * NOTE(review): the origin of this segment entry is extracted
		 * with _REGION_ENTRY_ORIGIN, while the check above and
		 * gmap_unshadow_pgt() use _SEGMENT_ENTRY_ORIGIN - confirm that
		 * both masks give the same result for shadow page tables.
		 */
		pgt = sgt[i] & _REGION_ENTRY_ORIGIN;
		sgt[i] = _SEGMENT_ENTRY_EMPTY;
		__gmap_unshadow_pgt(sg, raddr, __va(pgt));
		/* Free page table */
		page = phys_to_page(pgt);
		list_del(&page->lru);
		page_table_free_pgste(page);
	}
}

/**
 * gmap_unshadow_sgt - remove a shadow segment table from a region-3 entry
 * @sg: pointer to the shadow guest address space structure
 * @raddr: rmap address in the shadow guest address space
 *
 * Does nothing if the region-3 entry for @raddr is missing or has no origin.
 * Otherwise the notifier is called for the region, the entry is
 * invalidated with IDTE and reset to empty, and the segment table with
 * everything below it is freed.
 *
 * Called with the shadow->guest_table_lock
 */
static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr)
{
	unsigned long r3o, *r3e;
	phys_addr_t sgt;
	struct page *page;

	BUG_ON(!gmap_is_shadow(sg));
	r3e = gmap_table_walk(sg, raddr, 2); /* get region-3 pointer */
	if (!r3e || !(*r3e & _REGION_ENTRY_ORIGIN))
		return;
	gmap_call_notifier(sg, raddr, raddr + _REGION3_SIZE - 1);
	/* Step back by the entry's index to get the region-3 table origin */
	r3o = (unsigned long) (r3e - ((raddr & _REGION3_INDEX) >> _REGION3_SHIFT));
	gmap_idte_one(__pa(r3o) | _ASCE_TYPE_REGION3, raddr);
	sgt = *r3e & _REGION_ENTRY_ORIGIN;
	*r3e = _REGION3_ENTRY_EMPTY;
	__gmap_unshadow_sgt(sg, raddr, __va(sgt));
	/* Free segment table */
	page = phys_to_page(sgt);
	list_del(&page->lru);
	__free_pages(page, CRST_ALLOC_ORDER);
}

/**
 * __gmap_unshadow_r3t - remove all entries from a shadow region-3 table
 * @sg: pointer to the shadow guest address space structure
 * @raddr: address in the shadow guest address space
 * @r3t: pointer to the start of a shadow region-3 table
 *
 * Each entry that has an origin is reset to empty and its segment table
 * is emptied and freed. No notifier is called and no TLB flush is done
 * in this function.
 *
 * Called with the sg->guest_table_lock
 */
static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr,
				unsigned long *r3t)
{
	struct page *page;
	phys_addr_t sgt;
	int i;

	BUG_ON(!gmap_is_shadow(sg));
	for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION3_SIZE) {
		if (!(r3t[i] & _REGION_ENTRY_ORIGIN))
			continue;
		sgt = r3t[i] & _REGION_ENTRY_ORIGIN;
		r3t[i] = _REGION3_ENTRY_EMPTY;
		__gmap_unshadow_sgt(sg, raddr, __va(sgt));
		/* Free segment table */
		page = phys_to_page(sgt);
		list_del(&page->lru);
		__free_pages(page, CRST_ALLOC_ORDER);
	}
}

/**
 * gmap_unshadow_r3t - remove a shadow region-3 table from a region-2 entry
 * @sg: pointer to the shadow guest address space structure
 * @raddr: rmap address in the shadow guest address space
 *
 * Does nothing if the region-2 entry for @raddr is missing or has no origin.
 * Otherwise the notifier is called for the region, the entry is
 * invalidated with IDTE and reset to empty, and the region-3 table with
 * everything below it is freed.
 *
 * Called with the sg->guest_table_lock
 */
static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr)
{
	unsigned long r2o, *r2e;
	phys_addr_t r3t;
	struct page *page;

	BUG_ON(!gmap_is_shadow(sg));
	r2e = gmap_table_walk(sg, raddr, 3); /* get region-2 pointer */
	if (!r2e || !(*r2e & _REGION_ENTRY_ORIGIN))
		return;
	gmap_call_notifier(sg, raddr, raddr + _REGION2_SIZE - 1);
	/* Step back by the entry's index to get the region-2 table origin */
	r2o = (unsigned long) (r2e - ((raddr & _REGION2_INDEX) >> _REGION2_SHIFT));
	gmap_idte_one(__pa(r2o) | _ASCE_TYPE_REGION2, raddr);
	r3t = *r2e & _REGION_ENTRY_ORIGIN;
	*r2e = _REGION2_ENTRY_EMPTY;
	__gmap_unshadow_r3t(sg, raddr, __va(r3t));
	/* Free region 3 table */
	page = phys_to_page(r3t);
	list_del(&page->lru);
	__free_pages(page, CRST_ALLOC_ORDER);
}

/**
 * __gmap_unshadow_r2t - remove all entries from a shadow region-2 table
 * @sg: pointer to the shadow guest address space structure
 * @raddr: rmap address in the shadow guest address space
 * @r2t: pointer to the start of a shadow region-2 table
 *
 * Each entry that has an origin is reset to empty and its region-3 table
 * is emptied and freed. No notifier is called and no TLB flush is done
 * in this function.
 *
 * Called with the sg->guest_table_lock
 */
static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr,
				unsigned long *r2t)
{
	phys_addr_t r3t;
	struct page *page;
	int i;

	BUG_ON(!gmap_is_shadow(sg));
	for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION2_SIZE) {
		if (!(r2t[i] & _REGION_ENTRY_ORIGIN))
			continue;
		r3t = r2t[i] & _REGION_ENTRY_ORIGIN;
		r2t[i] = _REGION2_ENTRY_EMPTY;
		__gmap_unshadow_r3t(sg, raddr, __va(r3t));
		/* Free region 3 table */
		page = phys_to_page(r3t);
		list_del(&page->lru);
		__free_pages(page, CRST_ALLOC_ORDER);
	}
}

/**
 * gmap_unshadow_r2t - remove a shadow region-2 table from a region-1 entry
 * @sg: pointer to the shadow guest address space structure
 * @raddr: rmap address in the shadow guest address space
 *
 * Does nothing if the region-1 entry for @raddr is missing or has no origin.
 * Otherwise the notifier is called for the region, the entry is
 * invalidated with IDTE and reset to empty, and the region-2 table with
 * everything below it is freed.
 *
 * Called with the sg->guest_table_lock
 */
static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr)
{
	unsigned long r1o, *r1e;
	struct page *page;
	phys_addr_t r2t;

	BUG_ON(!gmap_is_shadow(sg));
	r1e = gmap_table_walk(sg, raddr, 4); /* get region-1 pointer */
	if (!r1e || !(*r1e & _REGION_ENTRY_ORIGIN))
		return;
	gmap_call_notifier(sg, raddr, raddr + _REGION1_SIZE - 1);
	/* Step back by the entry's index to get the region-1 table origin */
	r1o = (unsigned long) (r1e - ((raddr & _REGION1_INDEX) >> _REGION1_SHIFT));
	gmap_idte_one(__pa(r1o) | _ASCE_TYPE_REGION1, raddr);
	r2t = *r1e & _REGION_ENTRY_ORIGIN;
	*r1e = _REGION1_ENTRY_EMPTY;
	__gmap_unshadow_r2t(sg, raddr, __va(r2t));
	/* Free region 2 table */
	page = phys_to_page(r2t);
	list_del(&page->lru);
	__free_pages(page, CRST_ALLOC_ORDER);
}

/**
 * __gmap_unshadow_r1t - remove all entries from a shadow region-1 table
 * @sg: pointer to the shadow guest address space structure
 * @raddr: rmap address in the shadow guest address space
 * @r1t: pointer to the start of a shadow region-1 table
 *
 * Each entry that has an origin has its region-2 table emptied and freed.
 * Unlike the lower-level __gmap_unshadow_* helpers, this one also issues
 * an IDTE for every entry it removes.
 *
 * Called with the shadow->guest_table_lock
 */
static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr,
				unsigned long *r1t)
{
	unsigned long asce;
	struct page *page;
	phys_addr_t r2t;
	int i;

	BUG_ON(!gmap_is_shadow(sg));
	/* Table designation handed to IDTE for every entry below */
	asce = __pa(r1t) | _ASCE_TYPE_REGION1;
	for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION1_SIZE) {
		if (!(r1t[i] & _REGION_ENTRY_ORIGIN))
			continue;
		r2t = r1t[i] & _REGION_ENTRY_ORIGIN;
		__gmap_unshadow_r2t(sg, raddr, __va(r2t));
		/* Clear entry and flush translation r1t -> r2t */
		gmap_idte_one(asce, raddr);
		r1t[i] = _REGION1_ENTRY_EMPTY;
		/* Free region 2 table */
		page = phys_to_page(r2t);
		list_del(&page->lru);
		__free_pages(page, CRST_ALLOC_ORDER);
	}
}

/**
 * gmap_unshadow - remove a shadow page table completely
 * @sg: pointer to the shadow guest address space structure
 *
 * A shadow that is already marked as removed is left alone. Otherwise it
 * is marked removed, the notifier is called for the whole address range,
 * the TLB is flushed and all tables below the top-level table are torn
 * down. The top-level table itself is not freed here.
 *
 * Called with sg->guest_table_lock
 */
static void gmap_unshadow(struct gmap *sg)
{
	unsigned long *table;

	BUG_ON(!gmap_is_shadow(sg));
	if (sg->removed)
		return;
	sg->removed = 1;
	gmap_call_notifier(sg, 0, -1UL);
	gmap_flush_tlb(sg);
	table = __va(sg->asce & _ASCE_ORIGIN);
	/* Pick the teardown helper that matches the top-level table type */
	switch (sg->asce & _ASCE_TYPE_MASK) {
	case _ASCE_TYPE_REGION1:
		__gmap_unshadow_r1t(sg, 0, table);
		break;
	case _ASCE_TYPE_REGION2:
		__gmap_unshadow_r2t(sg, 0, table);
		break;
	case _ASCE_TYPE_REGION3:
		__gmap_unshadow_r3t(sg, 0, table);
		break;
	case _ASCE_TYPE_SEGMENT:
		__gmap_unshadow_sgt(sg, 0, table);
		break;
	}
}

/**
 * gmap_find_shadow - find a specific asce in the list of shadow tables
 * @parent: pointer to the parent gmap
 * @asce: ASCE for which the shadow table is created
 * @edat_level: edat level to be used for the shadow translation
 *
 * Returns the pointer to a gmap if a shadow table with the given asce is
 * already available, ERR_PTR(-EAGAIN) if another one is just being created,
 * otherwise NULL
 *
 * A reference is taken on the returned gmap. gmap_shadow() calls this
 * function with parent->shadow_lock held.
 */
static struct gmap *gmap_find_shadow(struct gmap *parent, unsigned long asce,
				     int edat_level)
{
	struct gmap *sg;

	list_for_each_entry(sg, &parent->children, list) {
		/* Skip shadows for other parameters and removed shadows */
		if (sg->orig_asce != asce || sg->edat_level != edat_level ||
		    sg->removed)
			continue;
		if (!sg->initialized)
			return ERR_PTR(-EAGAIN);
		refcount_inc(&sg->ref_count);
		return sg;
	}
	return NULL;
}

/**
 * gmap_shadow_valid - check if a shadow guest address space matches the
 *                     given properties and is still valid
 * @sg: pointer to the shadow guest address space structure
 * @asce: ASCE for which the shadow table is requested
 * @edat_level: edat level to be used for the shadow translation
 *
 * Returns 1 if the gmap shadow is still valid and matches the given
 * properties, the caller can continue using it. Returns 0 otherwise, the
 * caller has to request a new shadow gmap in this case.
 *
 */
int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level)
{
	/* A removed shadow never matches, whatever its parameters are */
	if (sg->removed)
		return 0;
	return sg->orig_asce == asce && sg->edat_level == edat_level;
}
EXPORT_SYMBOL_GPL(gmap_shadow_valid);

/**
 * gmap_shadow - create/find a shadow guest address space
 * @parent: pointer to the parent gmap
 * @asce: ASCE for which the shadow table is created
 * @edat_level: edat level to be used for the shadow translation
 *
 * The pages of the top level page table referred by the asce parameter
 * will be set to read-only and marked in the PGSTEs of the kvm process.
 * The shadow table will be removed automatically on any change to the
 * PTE mapping for the source table.
 *
 * Returns a guest address space structure, ERR_PTR(-ENOMEM) if out of memory,
 * ERR_PTR(-EAGAIN) if the caller has to retry and ERR_PTR(-EFAULT) if the
 * parent gmap table could not be protected.
 */
struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce,
			 int edat_level)
{
	struct gmap *sg, *new;
	unsigned long limit;
	int rc;

	BUG_ON(parent->mm->context.allow_gmap_hpage_1m);
	BUG_ON(gmap_is_shadow(parent));
	spin_lock(&parent->shadow_lock);
	sg = gmap_find_shadow(parent, asce, edat_level);
	spin_unlock(&parent->shadow_lock);
	/* Either an existing shadow or ERR_PTR(-EAGAIN) */
	if (sg)
		return sg;
	/* Create a new shadow gmap */
	/*
	 * The limit depends on the table type of the asce - presumably
	 * ((asce & _ASCE_TYPE_MASK) >> 2) is the table level 0..3, so that
	 * the shift is reduced by 11 bits per level; TODO confirm.
	 */
	limit = -1UL >> (33 - (((asce & _ASCE_TYPE_MASK) >> 2) * 11));
	if (asce & _ASCE_REAL_SPACE)
		limit = -1UL;
	/* Allocated without holding shadow_lock */
	new = gmap_alloc(limit);
	if (!new)
		return ERR_PTR(-ENOMEM);
	new->mm = parent->mm;
	new->parent = gmap_get(parent);
	new->orig_asce = asce;
	new->edat_level = edat_level;
	new->initialized = false;
	spin_lock(&parent->shadow_lock);
	/* Recheck if another CPU created the same shadow */
	sg = gmap_find_shadow(parent, asce, edat_level);
	if (sg) {
		spin_unlock(&parent->shadow_lock);
		gmap_free(new);
		return sg;
	}
	if (asce & _ASCE_REAL_SPACE) {
		/* only allow one real-space gmap shadow */
		list_for_each_entry(sg, &parent->children, list) {
			if (sg->orig_asce & _ASCE_REAL_SPACE) {
				spin_lock(&sg->guest_table_lock);
				gmap_unshadow(sg);
				spin_unlock(&sg->guest_table_lock);
				list_del(&sg->list);
				gmap_put(sg);
				/* list was modified: stop iterating */
				break;
			}
		}
	}
	/*
	 * Presumably one reference for the parent->children list and one
	 * for the pointer handed back to the caller - TODO confirm.
	 */
	refcount_set(&new->ref_count, 2);
	list_add(&new->list, &parent->children);
	if (asce & _ASCE_REAL_SPACE) {
		/* nothing to protect, return right away */
		new->initialized = true;
		spin_unlock(&parent->shadow_lock);
		return new;
	}
	spin_unlock(&parent->shadow_lock);
	/* protect after insertion, so it will get properly invalidated */
	mmap_read_lock(parent->mm);
	rc = gmap_protect_range(parent, asce & _ASCE_ORIGIN,
				((asce & _ASCE_TABLE_LENGTH) + 1) * PAGE_SIZE,
				PROT_READ, GMAP_NOTIFY_SHADOW);
	mmap_read_unlock(parent->mm);
	spin_lock(&parent->shadow_lock);
	/* From here on gmap_find_shadow() no longer reports -EAGAIN for it */
	new->initialized = true;
	if (rc) {
		/* Protection failed: unlink and free the new shadow again */
		list_del(&new->list);
		gmap_free(new);
		new = ERR_PTR(rc);
	}
	spin_unlock(&parent->shadow_lock);
	return new;
}
EXPORT_SYMBOL_GPL(gmap_shadow);

/**
 * gmap_shadow_r2t - create an empty shadow region 2 table
 * @sg: pointer to the shadow guest address space structure
 * @saddr: faulting address in the shadow gmap
 * @r2t: parent gmap address of the region 2 table to get shadowed
 * @fake: r2t references contiguous guest memory block, not a r2t
 *
 * The r2t parameter specifies the address of the source table. The
 * four pages of the source table are made read-only in the parent gmap
 * address space. A write to the source table area @r2t will automatically
 * remove the shadow r2 table and all of its descendants.
 *
 * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
 * shadow table structure is incomplete, -ENOMEM if out of memory and
 * -EFAULT if an address in the parent gmap could not be resolved.
 *
 * Called with sg->mm->mmap_lock in read.
 */
int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t,
		    int fake)
{
	unsigned long raddr, origin, offset, len;
	unsigned long *table;
	phys_addr_t s_r2t;
	struct page *page;
	int rc;

	BUG_ON(!gmap_is_shadow(sg));
	/* Allocate a shadow region second table */
	page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
	if (!page)
		return -ENOMEM;
	/* page->index keeps the parent table origin, bit 0 is the fake flag */
	page->index = r2t & _REGION_ENTRY_ORIGIN;
	if (fake)
		page->index |= GMAP_SHADOW_FAKE_TABLE;
	s_r2t = page_to_phys(page);
	/* Install shadow region second table */
	spin_lock(&sg->guest_table_lock);
	table = gmap_table_walk(sg, saddr, 4); /* get region-1 pointer */
	if (!table) {
		rc = -EAGAIN;		/* Race with unshadow */
		goto out_free;
	}
	if (!(*table & _REGION_ENTRY_INVALID)) {
		rc = 0;			/* Already established */
		goto out_free;
	} else if (*table & _REGION_ENTRY_ORIGIN) {
		rc = -EAGAIN;		/* Race with shadow */
		goto out_free;
	}
	crst_table_init(__va(s_r2t), _REGION2_ENTRY_EMPTY);
	/* mark as invalid as long as the parent table is not protected */
	*table = s_r2t | _REGION_ENTRY_LENGTH |
		 _REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_INVALID;
	/* The protection bit of @r2t is only taken over with edat_level >= 1 */
	if (sg->edat_level >= 1)
		*table |= (r2t & _REGION_ENTRY_PROTECT);
	list_add(&page->lru, &sg->crst_list);
	if (fake) {
		/* nothing to protect for fake tables */
		*table &= ~_REGION_ENTRY_INVALID;
		spin_unlock(&sg->guest_table_lock);
		return 0;
	}
	spin_unlock(&sg->guest_table_lock);
	/* Make r2t read-only in parent gmap page table */
	/* The rmap address carries the table level tag in its low bits */
	raddr = (saddr & _REGION1_MASK) | _SHADOW_RMAP_REGION1;
	origin = r2t & _REGION_ENTRY_ORIGIN;
	/* Only the part of the table between offset and length is protected */
	offset = ((r2t & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE;
	len = ((r2t & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset;
	rc = gmap_protect_rmap(sg, raddr, origin + offset, len);
	spin_lock(&sg->guest_table_lock);
	if (!rc) {
		/* The lock was dropped: check that our table is still installed */
		table = gmap_table_walk(sg, saddr, 4);
		if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_r2t)
			rc = -EAGAIN;		/* Race with unshadow */
		else
			*table &= ~_REGION_ENTRY_INVALID;
	} else {
		/* Protection failed: remove the table installed above */
		gmap_unshadow_r2t(sg, raddr);
	}
	spin_unlock(&sg->guest_table_lock);
	return rc;
out_free:
	/* The new table was never linked into the shadow: just free it */
	spin_unlock(&sg->guest_table_lock);
	__free_pages(page, CRST_ALLOC_ORDER);
	return rc;
}
EXPORT_SYMBOL_GPL(gmap_shadow_r2t);

/**
 * gmap_shadow_r3t - create a shadow region 3 table
 * @sg: pointer to the shadow guest address space structure
 * @saddr: faulting address in the shadow gmap
 * @r3t: parent gmap address of the region 3 table to get shadowed
 * @fake: r3t references contiguous guest memory block, not a r3t
 *
 * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
 * shadow table structure is incomplete, -ENOMEM if out of memory and
 * -EFAULT if an address in the parent gmap could not be resolved.
 *
 * Called with sg->mm->mmap_lock in read.
 */
int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t,
		    int fake)
{
	unsigned long raddr, origin, offset, len;
	unsigned long *table;
	phys_addr_t s_r3t;
	struct page *page;
	int rc;

	BUG_ON(!gmap_is_shadow(sg));
	/* Allocate a shadow region third table */
	page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
	if (!page)
		return -ENOMEM;
	/* page->index keeps the parent table origin, bit 0 is the fake flag */
	page->index = r3t & _REGION_ENTRY_ORIGIN;
	if (fake)
		page->index |= GMAP_SHADOW_FAKE_TABLE;
	s_r3t = page_to_phys(page);
	/* Install shadow region third table */
	spin_lock(&sg->guest_table_lock);
	table = gmap_table_walk(sg, saddr, 3); /* get region-2 pointer */
	if (!table) {
		rc = -EAGAIN;		/* Race with unshadow */
		goto out_free;
	}
	if (!(*table & _REGION_ENTRY_INVALID)) {
		rc = 0;			/* Already established */
		goto out_free;
	} else if (*table & _REGION_ENTRY_ORIGIN) {
		rc = -EAGAIN;		/* Race with shadow */
		goto out_free;
	}
	crst_table_init(__va(s_r3t), _REGION3_ENTRY_EMPTY);
	/* mark as invalid as long as the parent table is not protected */
	*table = s_r3t | _REGION_ENTRY_LENGTH |
		 _REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_INVALID;
	/* The protection bit of @r3t is only taken over with edat_level >= 1 */
	if (sg->edat_level >= 1)
		*table |= (r3t & _REGION_ENTRY_PROTECT);
	list_add(&page->lru, &sg->crst_list);
	if (fake) {
		/* nothing to protect for fake tables */
		*table &= ~_REGION_ENTRY_INVALID;
		spin_unlock(&sg->guest_table_lock);
		return 0;
	}
	spin_unlock(&sg->guest_table_lock);
	/* Make r3t read-only in parent gmap page table */
	/* The rmap address carries the table level tag in its low bits */
	raddr = (saddr & _REGION2_MASK) | _SHADOW_RMAP_REGION2;
	origin = r3t & _REGION_ENTRY_ORIGIN;
	/* Only the part of the table between offset and length is protected */
	offset = ((r3t & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE;
	len = ((r3t & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset;
	rc = gmap_protect_rmap(sg, raddr, origin + offset, len);
	spin_lock(&sg->guest_table_lock);
	if (!rc) {
		/* The lock was dropped: check that our table is still installed */
		table = gmap_table_walk(sg, saddr, 3);
		if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_r3t)
			rc = -EAGAIN;		/* Race with unshadow */
		else
			*table &= ~_REGION_ENTRY_INVALID;
	} else {
		/* Protection failed: remove the table installed above */
		gmap_unshadow_r3t(sg, raddr);
	}
	spin_unlock(&sg->guest_table_lock);
	return rc;
out_free:
	/* The new table was never linked into the shadow: just free it */
	spin_unlock(&sg->guest_table_lock);
	__free_pages(page, CRST_ALLOC_ORDER);
	return rc;
}
EXPORT_SYMBOL_GPL(gmap_shadow_r3t);

/**
 * gmap_shadow_sgt - create a shadow segment table
 * @sg: pointer to the shadow guest address space structure
 * @saddr: faulting address in the shadow gmap
 * @sgt: parent gmap address of the segment table to get shadowed
 * @fake: sgt references contiguous guest memory block, not a sgt
 *
 * Returns: 0 if successfully shadowed or already shadowed, -EAGAIN if the
 * shadow table structure is incomplete, -ENOMEM if out of memory and
 * -EFAULT if an address in the parent gmap could not be resolved.
 *
 * Called with sg->mm->mmap_lock in read.
 */
int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt,
		    int fake)
{
	unsigned long raddr, origin, offset, len;
	unsigned long *table;
	phys_addr_t s_sgt;
	struct page *page;
	int rc;

	BUG_ON(!gmap_is_shadow(sg) || (sgt & _REGION3_ENTRY_LARGE));
	/* Allocate a shadow segment table */
	page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
	if (!page)
		return -ENOMEM;
	/* page->index keeps the parent table origin, bit 0 is the fake flag */
	page->index = sgt & _REGION_ENTRY_ORIGIN;
	if (fake)
		page->index |= GMAP_SHADOW_FAKE_TABLE;
	s_sgt = page_to_phys(page);
	/* Install shadow segment table */
	spin_lock(&sg->guest_table_lock);
	table = gmap_table_walk(sg, saddr, 2); /* get region-3 pointer */
	if (!table) {
		rc = -EAGAIN;		/* Race with unshadow */
		goto out_free;
	}
	if (!(*table & _REGION_ENTRY_INVALID)) {
		rc = 0;			/* Already established */
		goto out_free;
	} else if (*table & _REGION_ENTRY_ORIGIN) {
		rc = -EAGAIN;		/* Race with shadow */
		goto out_free;
	}
	crst_table_init(__va(s_sgt), _SEGMENT_ENTRY_EMPTY);
	/* mark as invalid as long as the parent table is not protected */
	*table = s_sgt | _REGION_ENTRY_LENGTH |
		 _REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID;
	/* The protection bit of @sgt is only taken over with edat_level >= 1 */
	if (sg->edat_level >= 1)
		*table |= sgt & _REGION_ENTRY_PROTECT;
	list_add(&page->lru, &sg->crst_list);
	if (fake) {
		/* nothing to protect for fake tables */
		*table &= ~_REGION_ENTRY_INVALID;
		spin_unlock(&sg->guest_table_lock);
		return 0;
	}
	spin_unlock(&sg->guest_table_lock);
	/* Make sgt read-only in parent gmap page table */
	/* The rmap address carries the table level tag in its low bits */
	raddr = (saddr & _REGION3_MASK) | _SHADOW_RMAP_REGION3;
	origin = sgt & _REGION_ENTRY_ORIGIN;
	/* Only the part of the table between offset and length is protected */
	offset = ((sgt & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE;
	len = ((sgt & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset;
	rc = gmap_protect_rmap(sg, raddr, origin + offset, len);
	spin_lock(&sg->guest_table_lock);
	if (!rc) {
		/* The lock was dropped: check that our table is still installed */
		table = gmap_table_walk(sg, saddr, 2);
		if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_sgt)
			rc = -EAGAIN;		/* Race with unshadow */
		else
			*table &= ~_REGION_ENTRY_INVALID;
	} else {
		/* Protection failed: remove the table installed above */
		gmap_unshadow_sgt(sg, raddr);
	}
	spin_unlock(&sg->guest_table_lock);
	return rc;
out_free:
	/* The new table was never linked into the shadow: just free it */
	spin_unlock(&sg->guest_table_lock);
	__free_pages(page, CRST_ALLOC_ORDER);
	return rc;
}
EXPORT_SYMBOL_GPL(gmap_shadow_sgt);

/**
 * gmap_shadow_pgt_lookup - find a shadow page table
 * @sg: pointer to the shadow guest address space structure
 * @saddr: the address in the shadow guest address space
 * @pgt: parent gmap address of the page table to get shadowed
 * @dat_protection: if the pgtable is marked as protected by dat
 * @fake: pgt references contiguous guest memory block, not a pgtable
 *
 * Returns 0 if the shadow page table was found and -EAGAIN if the page
 * table was not found.
 *
 * Called with sg->mm->mmap_lock in read.
*/
int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr,
			   unsigned long *pgt, int *dat_protection,
			   int *fake)
{
	unsigned long *table;
	struct page *page;
	int rc;

	BUG_ON(!gmap_is_shadow(sg));
	spin_lock(&sg->guest_table_lock);
	table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */
	if (table && !(*table & _SEGMENT_ENTRY_INVALID)) {
		/* Shadow page tables are full pages (pte+pgste) */
		page = pfn_to_page(*table >> PAGE_SHIFT);
		/* page->index holds the parent origin plus the fake flag */
		*pgt = page->index & ~GMAP_SHADOW_FAKE_TABLE;
		*dat_protection = !!(*table & _SEGMENT_ENTRY_PROTECT);
		*fake = !!(page->index & GMAP_SHADOW_FAKE_TABLE);
		rc = 0;
	} else  {
		rc = -EAGAIN;
	}
	spin_unlock(&sg->guest_table_lock);
	return rc;

}
EXPORT_SYMBOL_GPL(gmap_shadow_pgt_lookup);

/**
 * gmap_shadow_pgt - instantiate a shadow page table
 * @sg: pointer to the shadow guest address space structure
 * @saddr: faulting address in the shadow gmap
 * @pgt: parent gmap address of the page table to get shadowed
 * @fake: pgt references contiguous guest memory block, not a pgtable
 *
 * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
 * shadow table structure is incomplete, -ENOMEM if out of memory and
 * -EFAULT if an address in the parent gmap could not be resolved.
 *
 * Called with sg->mm->mmap_lock in read.
 */
int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt,
		    int fake)
{
	unsigned long raddr, origin;
	unsigned long *table;
	struct page *page;
	phys_addr_t s_pgt;
	int rc;

	BUG_ON(!gmap_is_shadow(sg) || (pgt & _SEGMENT_ENTRY_LARGE));
	/* Allocate a shadow page table */
	page = page_table_alloc_pgste(sg->mm);
	if (!page)
		return -ENOMEM;
	/* Remember the parent table origin (and the fake flag) in page->index */
	page->index = pgt & _SEGMENT_ENTRY_ORIGIN;
	if (fake)
		page->index |= GMAP_SHADOW_FAKE_TABLE;
	s_pgt = page_to_phys(page);
	/* Install shadow page table */
	spin_lock(&sg->guest_table_lock);
	table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */
	if (!table) {
		rc = -EAGAIN;		/* Race with unshadow */
		goto out_free;
	}
	if (!(*table & _SEGMENT_ENTRY_INVALID)) {
		rc = 0;			/* Already established */
		goto out_free;
	} else if (*table & _SEGMENT_ENTRY_ORIGIN) {
		rc = -EAGAIN;		/* Race with shadow */
		goto out_free;
	}
	/* mark as invalid as long as the parent table is not protected */
	*table = (unsigned long) s_pgt | _SEGMENT_ENTRY |
		 (pgt & _SEGMENT_ENTRY_PROTECT) | _SEGMENT_ENTRY_INVALID;
	/*
	 * The page is linked into sg->pt_list from here on; the paths below
	 * no longer free it directly.
	 */
	list_add(&page->lru, &sg->pt_list);
	if (fake) {
		/* nothing to protect for fake tables */
		*table &= ~_SEGMENT_ENTRY_INVALID;
		spin_unlock(&sg->guest_table_lock);
		return 0;
	}
	spin_unlock(&sg->guest_table_lock);
	/* Make pgt read-only in parent gmap page table (not the pgste) */
	raddr = (saddr & _SEGMENT_MASK) | _SHADOW_RMAP_SEGMENT;
	origin = pgt & _SEGMENT_ENTRY_ORIGIN & PAGE_MASK;
	rc = gmap_protect_rmap(sg, raddr, origin, PAGE_SIZE);
	/* The lock was dropped above, so the entry has to be looked up again */
	spin_lock(&sg->guest_table_lock);
	if (!rc) {
		table = gmap_table_walk(sg, saddr, 1);
		if (!table || (*table & _SEGMENT_ENTRY_ORIGIN) != s_pgt)
			rc = -EAGAIN;		/* Race with unshadow */
		else
			*table &= ~_SEGMENT_ENTRY_INVALID;
	} else {
		/* Protecting the parent failed: take the new shadow table down */
		gmap_unshadow_pgt(sg, raddr);
	}
	spin_unlock(&sg->guest_table_lock);
	return rc;
out_free:
	/* Only reached before the page was added to sg->pt_list */
	spin_unlock(&sg->guest_table_lock);
	page_table_free_pgste(page);
	return rc;

}
EXPORT_SYMBOL_GPL(gmap_shadow_pgt);

/**
 * gmap_shadow_page - create a shadow page mapping
 * @sg: pointer to the shadow guest address space structure
 * @saddr: faulting address in the shadow gmap
 * @pte: pte in parent gmap address space to get shadowed
 *
 * Returns 0 if successfully shadowed
* or already shadowed, -EAGAIN if the
 * shadow table structure is incomplete, -ENOMEM if out of memory and
 * -EFAULT if an address in the parent gmap could not be resolved.
 *
 * Called with sg->mm->mmap_lock in read.
 */
int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte)
{
	struct gmap *parent;
	struct gmap_rmap *rmap;
	unsigned long vmaddr, paddr;
	spinlock_t *ptl;
	pte_t *sptep, *tptep;
	int prot;
	int rc;

	BUG_ON(!gmap_is_shadow(sg));
	parent = sg->parent;
	/* Access type handed to gmap_pte_op_fixup() when a retry is needed */
	prot = (pte_val(pte) & _PAGE_PROTECT) ? PROT_READ : PROT_WRITE;

	rmap = kzalloc(sizeof(*rmap), GFP_KERNEL_ACCOUNT);
	if (!rmap)
		return -ENOMEM;
	rmap->raddr = (saddr & PAGE_MASK) | _SHADOW_RMAP_PGTABLE;

	while (1) {
		paddr = pte_val(pte) & PAGE_MASK;
		vmaddr = __gmap_translate(parent, paddr);
		if (IS_ERR_VALUE(vmaddr)) {
			rc = vmaddr;
			break;
		}
		rc = radix_tree_preload(GFP_KERNEL_ACCOUNT);
		if (rc)
			break;
		/* Stays -EAGAIN unless the pte is shadowed below */
		rc = -EAGAIN;
		sptep = gmap_pte_op_walk(parent, paddr, &ptl);
		if (sptep) {
			spin_lock(&sg->guest_table_lock);
			/* Get page table pointer */
			tptep = (pte_t *) gmap_table_walk(sg, saddr, 0);
			if (!tptep) {
				/* No shadow page table: give up with -EAGAIN */
				spin_unlock(&sg->guest_table_lock);
				gmap_pte_op_end(sptep, ptl);
				radix_tree_preload_end();
				break;
			}
			rc = ptep_shadow_pte(sg->mm, saddr, sptep, tptep, pte);
			if (rc > 0) {
				/* Success and a new mapping */
				gmap_insert_rmap(sg, vmaddr, rmap);
				/* Handed over; keep the kfree() below from freeing it */
				rmap = NULL;
				rc = 0;
			}
			gmap_pte_op_end(sptep, ptl);
			spin_unlock(&sg->guest_table_lock);
		}
		radix_tree_preload_end();
		if (!rc)
			break;
		/* Fix up the parent mapping and try again */
		rc = gmap_pte_op_fixup(parent, paddr, vmaddr, prot);
		if (rc)
			break;
	}
	kfree(rmap);
	return rc;
}
EXPORT_SYMBOL_GPL(gmap_shadow_page);

/*
 * gmap_shadow_notify - handle notifications for shadow gmap
 *
* Called with sg->parent->shadow_lock.
 */
static void gmap_shadow_notify(struct gmap *sg, unsigned long vmaddr,
			       unsigned long gaddr)
{
	struct gmap_rmap *rmap, *rnext, *head;
	unsigned long start, end, bits, raddr;

	BUG_ON(!gmap_is_shadow(sg));

	spin_lock(&sg->guest_table_lock);
	if (sg->removed) {
		spin_unlock(&sg->guest_table_lock);
		return;
	}
	/* Check for top level table */
	start = sg->orig_asce & _ASCE_ORIGIN;
	end = start + ((sg->orig_asce & _ASCE_TABLE_LENGTH) + 1) * PAGE_SIZE;
	if (!(sg->orig_asce & _ASCE_REAL_SPACE) && gaddr >= start &&
	    gaddr < end) {
		/* The complete shadow table has to go */
		gmap_unshadow(sg);
		spin_unlock(&sg->guest_table_lock);
		list_del(&sg->list);
		gmap_put(sg);
		return;
	}
	/* Remove the page table tree from one specific entry */
	head = radix_tree_delete(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT);
	gmap_for_each_rmap_safe(rmap, rnext, head) {
		/* raddr carries the table level in its _SHADOW_RMAP_MASK bits */
		bits = rmap->raddr & _SHADOW_RMAP_MASK;
		raddr = rmap->raddr ^ bits;
		switch (bits) {
		case _SHADOW_RMAP_REGION1:
			gmap_unshadow_r2t(sg, raddr);
			break;
		case _SHADOW_RMAP_REGION2:
			gmap_unshadow_r3t(sg, raddr);
			break;
		case _SHADOW_RMAP_REGION3:
			gmap_unshadow_sgt(sg, raddr);
			break;
		case _SHADOW_RMAP_SEGMENT:
			gmap_unshadow_pgt(sg, raddr);
			break;
		case _SHADOW_RMAP_PGTABLE:
			gmap_unshadow_page(sg, raddr);
			break;
		}
		kfree(rmap);
	}
	spin_unlock(&sg->guest_table_lock);
}

/**
 * ptep_notify - call all invalidation callbacks for a specific pte.
 * @mm: pointer to the process mm_struct
 * @vmaddr: virtual address in the process address space
 * @pte: pointer to the page table entry
 * @bits: bits from the pgste that caused the notify call
 *
 * This function is assumed to be called with the page table lock held
 * for the pte to notify.
*/
void ptep_notify(struct mm_struct *mm, unsigned long vmaddr,
		 pte_t *pte, unsigned long bits)
{
	unsigned long offset, gaddr = 0;
	unsigned long *table;
	struct gmap *gmap, *sg, *next;

	/*
	 * Turn the position of the pte within its page table into the byte
	 * offset of the corresponding page within the segment.
	 */
	offset = ((unsigned long) pte) & (255 * sizeof(pte_t));
	offset = offset * (PAGE_SIZE / sizeof(pte_t));
	rcu_read_lock();
	list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
		spin_lock(&gmap->guest_table_lock);
		table = radix_tree_lookup(&gmap->host_to_guest,
					  vmaddr >> PMD_SHIFT);
		if (table)
			gaddr = __gmap_segment_gaddr(table) + offset;
		spin_unlock(&gmap->guest_table_lock);
		/* This gmap has no mapping for the host segment */
		if (!table)
			continue;

		if (!list_empty(&gmap->children) && (bits & PGSTE_VSIE_BIT)) {
			spin_lock(&gmap->shadow_lock);
			/* _safe: gmap_shadow_notify() may unlink sg from the list */
			list_for_each_entry_safe(sg, next,
						 &gmap->children, list)
				gmap_shadow_notify(sg, vmaddr, gaddr);
			spin_unlock(&gmap->shadow_lock);
		}
		if (bits & PGSTE_IN_BIT)
			gmap_call_notifier(gmap, gaddr, gaddr + PAGE_SIZE - 1);
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(ptep_notify);

/*
 * Clear the _SEGMENT_ENTRY_GMAP_IN bit of the pmd and call the gmap
 * notifiers for the huge page sized range starting at @gaddr.
 */
static void pmdp_notify_gmap(struct gmap *gmap, pmd_t *pmdp,
			     unsigned long gaddr)
{
	set_pmd(pmdp, clear_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_IN)));
	gmap_call_notifier(gmap, gaddr, gaddr + HPAGE_SIZE - 1);
}

/**
 * gmap_pmdp_xchg - exchange a gmap pmd with another
 * @gmap: pointer to the guest address space structure
 * @pmdp: pointer to the pmd entry
 * @new: replacement entry
 * @gaddr: the affected guest address
 *
 * This function is assumed to be called with the guest_table_lock
 * held.
*/
static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *pmdp, pmd_t new,
			   unsigned long gaddr)
{
	gaddr &= HPAGE_MASK;
	pmdp_notify_gmap(gmap, pmdp, gaddr);
	/* The replacement entry must not carry the notification bit either */
	new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_GMAP_IN));
	/* Invalidate the old entry with the best facility available */
	if (MACHINE_HAS_TLB_GUEST)
		__pmdp_idte(gaddr, (pmd_t *)pmdp, IDTE_GUEST_ASCE, gmap->asce,
			    IDTE_GLOBAL);
	else if (MACHINE_HAS_IDTE)
		__pmdp_idte(gaddr, (pmd_t *)pmdp, 0, 0, IDTE_GLOBAL);
	else
		__pmdp_csp(pmdp);
	set_pmd(pmdp, new);
}

/*
 * gmap_pmdp_clear - drop the guest pmd entries backed by a host address
 * @mm: pointer to the process mm_struct
 * @vmaddr: virtual address in the process address space
 * @purge: if non-zero, __pmdp_csp() is called on each entry before it is
 *	   replaced with an empty segment entry
 *
 * For every gmap of @mm the host_to_guest entry for @vmaddr is removed,
 * the notifiers are called and the pmd is set to _SEGMENT_ENTRY_EMPTY.
 */
static void gmap_pmdp_clear(struct mm_struct *mm, unsigned long vmaddr,
			    int purge)
{
	pmd_t *pmdp;
	struct gmap *gmap;
	unsigned long gaddr;

	rcu_read_lock();
	list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
		spin_lock(&gmap->guest_table_lock);
		pmdp = (pmd_t *)radix_tree_delete(&gmap->host_to_guest,
						  vmaddr >> PMD_SHIFT);
		if (pmdp) {
			gaddr = __gmap_segment_gaddr((unsigned long *)pmdp);
			pmdp_notify_gmap(gmap, pmdp, gaddr);
			/* Only hardware bits and the UC bit may be left over */
			WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE |
						   _SEGMENT_ENTRY_GMAP_UC));
			if (purge)
				__pmdp_csp(pmdp);
			set_pmd(pmdp, __pmd(_SEGMENT_ENTRY_EMPTY));
		}
		spin_unlock(&gmap->guest_table_lock);
	}
	rcu_read_unlock();
}

/**
 * gmap_pmdp_invalidate - invalidate all affected guest pmd entries without
 *                        flushing
 * @mm: pointer to the process mm_struct
 * @vmaddr: virtual address in the process address space
 */
void gmap_pmdp_invalidate(struct mm_struct *mm, unsigned long vmaddr)
{
	gmap_pmdp_clear(mm, vmaddr, 0);
}
EXPORT_SYMBOL_GPL(gmap_pmdp_invalidate);

/**
 * gmap_pmdp_csp - csp all affected guest pmd entries
 * @mm: pointer to the process mm_struct
 * @vmaddr: virtual address in the process address space
 */
void gmap_pmdp_csp(struct mm_struct *mm, unsigned long vmaddr)
{
	gmap_pmdp_clear(mm, vmaddr, 1);
}
EXPORT_SYMBOL_GPL(gmap_pmdp_csp);

/**
 * gmap_pmdp_idte_local - invalidate and clear a guest pmd entry
 * @mm: pointer to the process mm_struct
 * @vmaddr: virtual address in the process address space
 */
void gmap_pmdp_idte_local(struct mm_struct *mm, unsigned long vmaddr)
{
	unsigned long *entry, gaddr;
	struct gmap *gmap;
	pmd_t *pmdp;

	rcu_read_lock();
	list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
		spin_lock(&gmap->guest_table_lock);
		entry = radix_tree_delete(&gmap->host_to_guest,
					  vmaddr >> PMD_SHIFT);
		if (entry) {
			pmdp = (pmd_t *)entry;
			gaddr = __gmap_segment_gaddr(entry);
			pmdp_notify_gmap(gmap, pmdp, gaddr);
			/* Only hardware bits and the UC bit may be left over */
			WARN_ON(*entry & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE |
					   _SEGMENT_ENTRY_GMAP_UC));
			/* Unlike the global variant there is no csp fallback */
			if (MACHINE_HAS_TLB_GUEST)
				__pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE,
					    gmap->asce, IDTE_LOCAL);
			else if (MACHINE_HAS_IDTE)
				__pmdp_idte(gaddr, pmdp, 0, 0, IDTE_LOCAL);
			*entry = _SEGMENT_ENTRY_EMPTY;
		}
		spin_unlock(&gmap->guest_table_lock);
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(gmap_pmdp_idte_local);

/**
 * gmap_pmdp_idte_global - invalidate and clear a guest pmd entry
 * @mm: pointer to the process mm_struct
 * @vmaddr: virtual address in the process address space
 */
void gmap_pmdp_idte_global(struct mm_struct *mm, unsigned long vmaddr)
{
	unsigned long *entry, gaddr;
	struct gmap *gmap;
	pmd_t *pmdp;

	rcu_read_lock();
	list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
		spin_lock(&gmap->guest_table_lock);
		entry = radix_tree_delete(&gmap->host_to_guest,
					  vmaddr >> PMD_SHIFT);
		if (entry) {
			pmdp = (pmd_t *)entry;
			gaddr = __gmap_segment_gaddr(entry);
			pmdp_notify_gmap(gmap, pmdp, gaddr);
			/* Only hardware bits and the UC bit may be left over */
			WARN_ON(*entry & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE |
					   _SEGMENT_ENTRY_GMAP_UC));
			if (MACHINE_HAS_TLB_GUEST)
				__pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE,
					    gmap->asce, IDTE_GLOBAL);
			else if (MACHINE_HAS_IDTE)
				__pmdp_idte(gaddr, pmdp, 0, 0, IDTE_GLOBAL);
			else
				__pmdp_csp(pmdp);
			*entry = _SEGMENT_ENTRY_EMPTY;
		}
		spin_unlock(&gmap->guest_table_lock);
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(gmap_pmdp_idte_global);

/**
 * gmap_test_and_clear_dirty_pmd - test and reset segment dirty status
 * @gmap: pointer to guest address space
 * @pmdp: pointer to the pmd to be tested
 * @gaddr: virtual address in the guest address space
 *
 * Returns true if the segment is reported as dirty, false otherwise.
 *
 * This function is assumed to be called with the guest_table_lock
 * held.
 */
static bool gmap_test_and_clear_dirty_pmd(struct gmap *gmap, pmd_t *pmdp,
					  unsigned long gaddr)
{
	if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID)
		return false;

	/* Already protected memory, which did not change is clean */
	if (pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT &&
	    !(pmd_val(*pmdp) & _SEGMENT_ENTRY_GMAP_UC))
		return false;

	/* Clear UC indication and reset protection */
	set_pmd(pmdp, clear_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_UC)));
	gmap_protect_pmd(gmap, gaddr, pmdp, PROT_READ, 0);
	return true;
}

/**
 * gmap_sync_dirty_log_pmd - set bitmap based on dirty status of segment
 * @gmap: pointer to guest address space
 * @bitmap: dirty bitmap for this pmd
 * @gaddr: virtual address in the guest address space
 * @vmaddr: virtual address in the host address space
 *
 * This function is assumed to be called with the guest_table_lock
 * held.
*/
void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long bitmap[4],
			     unsigned long gaddr, unsigned long vmaddr)
{
	int i;
	pmd_t *pmdp;
	pte_t *ptep;
	spinlock_t *ptl;

	pmdp = gmap_pmd_op_walk(gmap, gaddr);
	if (!pmdp)
		return;

	if (pmd_large(*pmdp)) {
		/* A dirty large segment marks every page of the segment */
		if (gmap_test_and_clear_dirty_pmd(gmap, pmdp, gaddr))
			bitmap_fill(bitmap, _PAGE_ENTRIES);
	} else {
		/* Check each pte of the segment, one bitmap bit per page */
		for (i = 0; i < _PAGE_ENTRIES; i++, vmaddr += PAGE_SIZE) {
			ptep = pte_alloc_map_lock(gmap->mm, pmdp, vmaddr, &ptl);
			if (!ptep)
				continue;
			if (ptep_test_and_clear_uc(gmap->mm, vmaddr, ptep))
				set_bit(i, bitmap);
			pte_unmap_unlock(ptep, ptl);
		}
	}
	gmap_pmd_op_end(gmap, pmdp);
}
EXPORT_SYMBOL_GPL(gmap_sync_dirty_log_pmd);

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* Page walk callback: split the huge pmd at @addr, never aborts the walk */
static int thp_split_walk_pmd_entry(pmd_t *pmd, unsigned long addr,
				    unsigned long end, struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;

	split_huge_pmd(vma, pmd, addr);
	return 0;
}

static const struct mm_walk_ops thp_split_walk_ops = {
	.pmd_entry	= thp_split_walk_pmd_entry,
};

/*
 * Set VM_NOHUGEPAGE (and clear VM_HUGEPAGE) on every vma of @mm, split the
 * huge pmds found there and make VM_NOHUGEPAGE part of mm->def_flags.
 */
static inline void thp_split_mm(struct mm_struct *mm)
{
	struct vm_area_struct *vma;
	VMA_ITERATOR(vmi, mm, 0);

	for_each_vma(vmi, vma) {
		vm_flags_mod(vma, VM_NOHUGEPAGE, VM_HUGEPAGE);
		walk_page_vma(vma, &thp_split_walk_ops, NULL);
	}
	mm->def_flags |= VM_NOHUGEPAGE;
}
#else
/* Nothing to split without transparent huge page support */
static inline void thp_split_mm(struct mm_struct *mm)
{
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

/*
 * Remove all empty zero pages from the mapping for lazy refaulting
 * - This must be called after mm->context.has_pgste is set, to avoid
 *   future creation of zero pages
 * - This must be called after THP was disabled.
2541 * 2542 * mm contracts with s390, that even if mm were to remove a page table, 2543 * racing with the loop below and so causing pte_offset_map_lock() to fail, 2544 * it will never insert a page table containing empty zero pages once 2545 * mm_forbids_zeropage(mm) i.e. mm->context.has_pgste is set. 2546 */ 2547 static int __zap_zero_pages(pmd_t *pmd, unsigned long start, 2548 unsigned long end, struct mm_walk *walk) 2549 { 2550 unsigned long addr; 2551 2552 for (addr = start; addr != end; addr += PAGE_SIZE) { 2553 pte_t *ptep; 2554 spinlock_t *ptl; 2555 2556 ptep = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); 2557 if (!ptep) 2558 break; 2559 if (is_zero_pfn(pte_pfn(*ptep))) 2560 ptep_xchg_direct(walk->mm, addr, ptep, __pte(_PAGE_INVALID)); 2561 pte_unmap_unlock(ptep, ptl); 2562 } 2563 return 0; 2564 } 2565 2566 static const struct mm_walk_ops zap_zero_walk_ops = { 2567 .pmd_entry = __zap_zero_pages, 2568 }; 2569 2570 /* 2571 * switch on pgstes for its userspace process (for kvm) 2572 */ 2573 int s390_enable_sie(void) 2574 { 2575 struct mm_struct *mm = current->mm; 2576 2577 /* Do we have pgstes? if yes, we are done */ 2578 if (mm_has_pgste(mm)) 2579 return 0; 2580 /* Fail if the page tables are 2K */ 2581 if (!mm_alloc_pgste(mm)) 2582 return -EINVAL; 2583 mmap_write_lock(mm); 2584 mm->context.has_pgste = 1; 2585 /* split thp mappings and disable thp for future mappings */ 2586 thp_split_mm(mm); 2587 walk_page_range(mm, 0, TASK_SIZE, &zap_zero_walk_ops, NULL); 2588 mmap_write_unlock(mm); 2589 return 0; 2590 } 2591 EXPORT_SYMBOL_GPL(s390_enable_sie); 2592 2593 int gmap_mark_unmergeable(void) 2594 { 2595 /* 2596 * Make sure to disable KSM (if enabled for the whole process or 2597 * individual VMAs). Note that nothing currently hinders user space 2598 * from re-enabling it. 
2599 */ 2600 return ksm_disable(current->mm); 2601 } 2602 EXPORT_SYMBOL_GPL(gmap_mark_unmergeable); 2603 2604 /* 2605 * Enable storage key handling from now on and initialize the storage 2606 * keys with the default key. 2607 */ 2608 static int __s390_enable_skey_pte(pte_t *pte, unsigned long addr, 2609 unsigned long next, struct mm_walk *walk) 2610 { 2611 /* Clear storage key */ 2612 ptep_zap_key(walk->mm, addr, pte); 2613 return 0; 2614 } 2615 2616 /* 2617 * Give a chance to schedule after setting a key to 256 pages. 2618 * We only hold the mm lock, which is a rwsem and the kvm srcu. 2619 * Both can sleep. 2620 */ 2621 static int __s390_enable_skey_pmd(pmd_t *pmd, unsigned long addr, 2622 unsigned long next, struct mm_walk *walk) 2623 { 2624 cond_resched(); 2625 return 0; 2626 } 2627 2628 static int __s390_enable_skey_hugetlb(pte_t *pte, unsigned long addr, 2629 unsigned long hmask, unsigned long next, 2630 struct mm_walk *walk) 2631 { 2632 pmd_t *pmd = (pmd_t *)pte; 2633 unsigned long start, end; 2634 struct page *page = pmd_page(*pmd); 2635 2636 /* 2637 * The write check makes sure we do not set a key on shared 2638 * memory. This is needed as the walker does not differentiate 2639 * between actual guest memory and the process executable or 2640 * shared libraries. 
2641 */ 2642 if (pmd_val(*pmd) & _SEGMENT_ENTRY_INVALID || 2643 !(pmd_val(*pmd) & _SEGMENT_ENTRY_WRITE)) 2644 return 0; 2645 2646 start = pmd_val(*pmd) & HPAGE_MASK; 2647 end = start + HPAGE_SIZE - 1; 2648 __storage_key_init_range(start, end); 2649 set_bit(PG_arch_1, &page->flags); 2650 cond_resched(); 2651 return 0; 2652 } 2653 2654 static const struct mm_walk_ops enable_skey_walk_ops = { 2655 .hugetlb_entry = __s390_enable_skey_hugetlb, 2656 .pte_entry = __s390_enable_skey_pte, 2657 .pmd_entry = __s390_enable_skey_pmd, 2658 }; 2659 2660 int s390_enable_skey(void) 2661 { 2662 struct mm_struct *mm = current->mm; 2663 int rc = 0; 2664 2665 mmap_write_lock(mm); 2666 if (mm_uses_skeys(mm)) 2667 goto out_up; 2668 2669 mm->context.uses_skeys = 1; 2670 rc = gmap_mark_unmergeable(); 2671 if (rc) { 2672 mm->context.uses_skeys = 0; 2673 goto out_up; 2674 } 2675 walk_page_range(mm, 0, TASK_SIZE, &enable_skey_walk_ops, NULL); 2676 2677 out_up: 2678 mmap_write_unlock(mm); 2679 return rc; 2680 } 2681 EXPORT_SYMBOL_GPL(s390_enable_skey); 2682 2683 /* 2684 * Reset CMMA state, make all pages stable again. 
2685 */ 2686 static int __s390_reset_cmma(pte_t *pte, unsigned long addr, 2687 unsigned long next, struct mm_walk *walk) 2688 { 2689 ptep_zap_unused(walk->mm, addr, pte, 1); 2690 return 0; 2691 } 2692 2693 static const struct mm_walk_ops reset_cmma_walk_ops = { 2694 .pte_entry = __s390_reset_cmma, 2695 }; 2696 2697 void s390_reset_cmma(struct mm_struct *mm) 2698 { 2699 mmap_write_lock(mm); 2700 walk_page_range(mm, 0, TASK_SIZE, &reset_cmma_walk_ops, NULL); 2701 mmap_write_unlock(mm); 2702 } 2703 EXPORT_SYMBOL_GPL(s390_reset_cmma); 2704 2705 #define GATHER_GET_PAGES 32 2706 2707 struct reset_walk_state { 2708 unsigned long next; 2709 unsigned long count; 2710 unsigned long pfns[GATHER_GET_PAGES]; 2711 }; 2712 2713 static int s390_gather_pages(pte_t *ptep, unsigned long addr, 2714 unsigned long next, struct mm_walk *walk) 2715 { 2716 struct reset_walk_state *p = walk->private; 2717 pte_t pte = READ_ONCE(*ptep); 2718 2719 if (pte_present(pte)) { 2720 /* we have a reference from the mapping, take an extra one */ 2721 get_page(phys_to_page(pte_val(pte))); 2722 p->pfns[p->count] = phys_to_pfn(pte_val(pte)); 2723 p->next = next; 2724 p->count++; 2725 } 2726 return p->count >= GATHER_GET_PAGES; 2727 } 2728 2729 static const struct mm_walk_ops gather_pages_ops = { 2730 .pte_entry = s390_gather_pages, 2731 }; 2732 2733 /* 2734 * Call the Destroy secure page UVC on each page in the given array of PFNs. 2735 * Each page needs to have an extra reference, which will be released here. 
2736 */ 2737 void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns) 2738 { 2739 unsigned long i; 2740 2741 for (i = 0; i < count; i++) { 2742 /* we always have an extra reference */ 2743 uv_destroy_owned_page(pfn_to_phys(pfns[i])); 2744 /* get rid of the extra reference */ 2745 put_page(pfn_to_page(pfns[i])); 2746 cond_resched(); 2747 } 2748 } 2749 EXPORT_SYMBOL_GPL(s390_uv_destroy_pfns); 2750 2751 /** 2752 * __s390_uv_destroy_range - Call the destroy secure page UVC on each page 2753 * in the given range of the given address space. 2754 * @mm: the mm to operate on 2755 * @start: the start of the range 2756 * @end: the end of the range 2757 * @interruptible: if not 0, stop when a fatal signal is received 2758 * 2759 * Walk the given range of the given address space and call the destroy 2760 * secure page UVC on each page. Optionally exit early if a fatal signal is 2761 * pending. 2762 * 2763 * Return: 0 on success, -EINTR if the function stopped before completing 2764 */ 2765 int __s390_uv_destroy_range(struct mm_struct *mm, unsigned long start, 2766 unsigned long end, bool interruptible) 2767 { 2768 struct reset_walk_state state = { .next = start }; 2769 int r = 1; 2770 2771 while (r > 0) { 2772 state.count = 0; 2773 mmap_read_lock(mm); 2774 r = walk_page_range(mm, state.next, end, &gather_pages_ops, &state); 2775 mmap_read_unlock(mm); 2776 cond_resched(); 2777 s390_uv_destroy_pfns(state.count, state.pfns); 2778 if (interruptible && fatal_signal_pending(current)) 2779 return -EINTR; 2780 } 2781 return 0; 2782 } 2783 EXPORT_SYMBOL_GPL(__s390_uv_destroy_range); 2784 2785 /** 2786 * s390_unlist_old_asce - Remove the topmost level of page tables from the 2787 * list of page tables of the gmap. 2788 * @gmap: the gmap whose table is to be removed 2789 * 2790 * On s390x, KVM keeps a list of all pages containing the page tables of the 2791 * gmap (the CRST list). This list is used at tear down time to free all 2792 * pages that are now not needed anymore. 
2793 * 2794 * This function removes the topmost page of the tree (the one pointed to by 2795 * the ASCE) from the CRST list. 2796 * 2797 * This means that it will not be freed when the VM is torn down, and needs 2798 * to be handled separately by the caller, unless a leak is actually 2799 * intended. Notice that this function will only remove the page from the 2800 * list, the page will still be used as a top level page table (and ASCE). 2801 */ 2802 void s390_unlist_old_asce(struct gmap *gmap) 2803 { 2804 struct page *old; 2805 2806 old = virt_to_page(gmap->table); 2807 spin_lock(&gmap->guest_table_lock); 2808 list_del(&old->lru); 2809 /* 2810 * Sometimes the topmost page might need to be "removed" multiple 2811 * times, for example if the VM is rebooted into secure mode several 2812 * times concurrently, or if s390_replace_asce fails after calling 2813 * s390_remove_old_asce and is attempted again later. In that case 2814 * the old asce has been removed from the list, and therefore it 2815 * will not be freed when the VM terminates, but the ASCE is still 2816 * in use and still pointed to. 2817 * A subsequent call to replace_asce will follow the pointer and try 2818 * to remove the same page from the list again. 2819 * Therefore it's necessary that the page of the ASCE has valid 2820 * pointers, so list_del can work (and do nothing) without 2821 * dereferencing stale or invalid pointers. 2822 */ 2823 INIT_LIST_HEAD(&old->lru); 2824 spin_unlock(&gmap->guest_table_lock); 2825 } 2826 EXPORT_SYMBOL_GPL(s390_unlist_old_asce); 2827 2828 /** 2829 * s390_replace_asce - Try to replace the current ASCE of a gmap with a copy 2830 * @gmap: the gmap whose ASCE needs to be replaced 2831 * 2832 * If the ASCE is a SEGMENT type then this function will return -EINVAL, 2833 * otherwise the pointers in the host_to_guest radix tree will keep pointing 2834 * to the wrong pages, causing use-after-free and memory corruption. 
2835 * If the allocation of the new top level page table fails, the ASCE is not 2836 * replaced. 2837 * In any case, the old ASCE is always removed from the gmap CRST list. 2838 * Therefore the caller has to make sure to save a pointer to it 2839 * beforehand, unless a leak is actually intended. 2840 */ 2841 int s390_replace_asce(struct gmap *gmap) 2842 { 2843 unsigned long asce; 2844 struct page *page; 2845 void *table; 2846 2847 s390_unlist_old_asce(gmap); 2848 2849 /* Replacing segment type ASCEs would cause serious issues */ 2850 if ((gmap->asce & _ASCE_TYPE_MASK) == _ASCE_TYPE_SEGMENT) 2851 return -EINVAL; 2852 2853 page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER); 2854 if (!page) 2855 return -ENOMEM; 2856 table = page_to_virt(page); 2857 memcpy(table, gmap->table, 1UL << (CRST_ALLOC_ORDER + PAGE_SHIFT)); 2858 2859 /* 2860 * The caller has to deal with the old ASCE, but here we make sure 2861 * the new one is properly added to the CRST list, so that 2862 * it will be freed when the VM is torn down. 2863 */ 2864 spin_lock(&gmap->guest_table_lock); 2865 list_add(&page->lru, &gmap->crst_list); 2866 spin_unlock(&gmap->guest_table_lock); 2867 2868 /* Set new table origin while preserving existing ASCE control bits */ 2869 asce = (gmap->asce & ~_ASCE_ORIGIN) | __pa(table); 2870 WRITE_ONCE(gmap->asce, asce); 2871 WRITE_ONCE(gmap->mm->context.gmap_asce, asce); 2872 WRITE_ONCE(gmap->table, table); 2873 2874 return 0; 2875 } 2876 EXPORT_SYMBOL_GPL(s390_replace_asce); 2877