1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * KVM guest address space mapping code 4 * 5 * Copyright IBM Corp. 2007, 2020 6 * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> 7 * David Hildenbrand <david@redhat.com> 8 * Janosch Frank <frankja@linux.vnet.ibm.com> 9 */ 10 11 #include <linux/kernel.h> 12 #include <linux/pagewalk.h> 13 #include <linux/swap.h> 14 #include <linux/smp.h> 15 #include <linux/spinlock.h> 16 #include <linux/slab.h> 17 #include <linux/swapops.h> 18 #include <linux/ksm.h> 19 #include <linux/mman.h> 20 #include <linux/pgtable.h> 21 22 #include <asm/pgalloc.h> 23 #include <asm/gmap.h> 24 #include <asm/tlb.h> 25 26 #define GMAP_SHADOW_FAKE_TABLE 1ULL 27 28 /** 29 * gmap_alloc - allocate and initialize a guest address space 30 * @limit: maximum address of the gmap address space 31 * 32 * Returns a guest address space structure. 33 */ 34 static struct gmap *gmap_alloc(unsigned long limit) 35 { 36 struct gmap *gmap; 37 struct page *page; 38 unsigned long *table; 39 unsigned long etype, atype; 40 41 if (limit < _REGION3_SIZE) { 42 limit = _REGION3_SIZE - 1; 43 atype = _ASCE_TYPE_SEGMENT; 44 etype = _SEGMENT_ENTRY_EMPTY; 45 } else if (limit < _REGION2_SIZE) { 46 limit = _REGION2_SIZE - 1; 47 atype = _ASCE_TYPE_REGION3; 48 etype = _REGION3_ENTRY_EMPTY; 49 } else if (limit < _REGION1_SIZE) { 50 limit = _REGION1_SIZE - 1; 51 atype = _ASCE_TYPE_REGION2; 52 etype = _REGION2_ENTRY_EMPTY; 53 } else { 54 limit = -1UL; 55 atype = _ASCE_TYPE_REGION1; 56 etype = _REGION1_ENTRY_EMPTY; 57 } 58 gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL_ACCOUNT); 59 if (!gmap) 60 goto out; 61 INIT_LIST_HEAD(&gmap->crst_list); 62 INIT_LIST_HEAD(&gmap->children); 63 INIT_LIST_HEAD(&gmap->pt_list); 64 INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL_ACCOUNT); 65 INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC | __GFP_ACCOUNT); 66 INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC | __GFP_ACCOUNT); 67 spin_lock_init(&gmap->guest_table_lock); 68 
spin_lock_init(&gmap->shadow_lock); 69 refcount_set(&gmap->ref_count, 1); 70 page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER); 71 if (!page) 72 goto out_free; 73 page->index = 0; 74 list_add(&page->lru, &gmap->crst_list); 75 table = (unsigned long *) page_to_phys(page); 76 crst_table_init(table, etype); 77 gmap->table = table; 78 gmap->asce = atype | _ASCE_TABLE_LENGTH | 79 _ASCE_USER_BITS | __pa(table); 80 gmap->asce_end = limit; 81 return gmap; 82 83 out_free: 84 kfree(gmap); 85 out: 86 return NULL; 87 } 88 89 /** 90 * gmap_create - create a guest address space 91 * @mm: pointer to the parent mm_struct 92 * @limit: maximum size of the gmap address space 93 * 94 * Returns a guest address space structure. 95 */ 96 struct gmap *gmap_create(struct mm_struct *mm, unsigned long limit) 97 { 98 struct gmap *gmap; 99 unsigned long gmap_asce; 100 101 gmap = gmap_alloc(limit); 102 if (!gmap) 103 return NULL; 104 gmap->mm = mm; 105 spin_lock(&mm->context.lock); 106 list_add_rcu(&gmap->list, &mm->context.gmap_list); 107 if (list_is_singular(&mm->context.gmap_list)) 108 gmap_asce = gmap->asce; 109 else 110 gmap_asce = -1UL; 111 WRITE_ONCE(mm->context.gmap_asce, gmap_asce); 112 spin_unlock(&mm->context.lock); 113 return gmap; 114 } 115 EXPORT_SYMBOL_GPL(gmap_create); 116 117 static void gmap_flush_tlb(struct gmap *gmap) 118 { 119 if (MACHINE_HAS_IDTE) 120 __tlb_flush_idte(gmap->asce); 121 else 122 __tlb_flush_global(); 123 } 124 125 static void gmap_radix_tree_free(struct radix_tree_root *root) 126 { 127 struct radix_tree_iter iter; 128 unsigned long indices[16]; 129 unsigned long index; 130 void __rcu **slot; 131 int i, nr; 132 133 /* A radix tree is freed by deleting all of its entries */ 134 index = 0; 135 do { 136 nr = 0; 137 radix_tree_for_each_slot(slot, root, &iter, index) { 138 indices[nr] = iter.index; 139 if (++nr == 16) 140 break; 141 } 142 for (i = 0; i < nr; i++) { 143 index = indices[i]; 144 radix_tree_delete(root, index); 145 } 146 } while (nr > 0); 147 } 
148 149 static void gmap_rmap_radix_tree_free(struct radix_tree_root *root) 150 { 151 struct gmap_rmap *rmap, *rnext, *head; 152 struct radix_tree_iter iter; 153 unsigned long indices[16]; 154 unsigned long index; 155 void __rcu **slot; 156 int i, nr; 157 158 /* A radix tree is freed by deleting all of its entries */ 159 index = 0; 160 do { 161 nr = 0; 162 radix_tree_for_each_slot(slot, root, &iter, index) { 163 indices[nr] = iter.index; 164 if (++nr == 16) 165 break; 166 } 167 for (i = 0; i < nr; i++) { 168 index = indices[i]; 169 head = radix_tree_delete(root, index); 170 gmap_for_each_rmap_safe(rmap, rnext, head) 171 kfree(rmap); 172 } 173 } while (nr > 0); 174 } 175 176 /** 177 * gmap_free - free a guest address space 178 * @gmap: pointer to the guest address space structure 179 * 180 * No locks required. There are no references to this gmap anymore. 181 */ 182 static void gmap_free(struct gmap *gmap) 183 { 184 struct page *page, *next; 185 186 /* Flush tlb of all gmaps (if not already done for shadows) */ 187 if (!(gmap_is_shadow(gmap) && gmap->removed)) 188 gmap_flush_tlb(gmap); 189 /* Free all segment & region tables. */ 190 list_for_each_entry_safe(page, next, &gmap->crst_list, lru) 191 __free_pages(page, CRST_ALLOC_ORDER); 192 gmap_radix_tree_free(&gmap->guest_to_host); 193 gmap_radix_tree_free(&gmap->host_to_guest); 194 195 /* Free additional data for a shadow gmap */ 196 if (gmap_is_shadow(gmap)) { 197 /* Free all page tables. 
*/ 198 list_for_each_entry_safe(page, next, &gmap->pt_list, lru) 199 page_table_free_pgste(page); 200 gmap_rmap_radix_tree_free(&gmap->host_to_rmap); 201 /* Release reference to the parent */ 202 gmap_put(gmap->parent); 203 } 204 205 kfree(gmap); 206 } 207 208 /** 209 * gmap_get - increase reference counter for guest address space 210 * @gmap: pointer to the guest address space structure 211 * 212 * Returns the gmap pointer 213 */ 214 struct gmap *gmap_get(struct gmap *gmap) 215 { 216 refcount_inc(&gmap->ref_count); 217 return gmap; 218 } 219 EXPORT_SYMBOL_GPL(gmap_get); 220 221 /** 222 * gmap_put - decrease reference counter for guest address space 223 * @gmap: pointer to the guest address space structure 224 * 225 * If the reference counter reaches zero the guest address space is freed. 226 */ 227 void gmap_put(struct gmap *gmap) 228 { 229 if (refcount_dec_and_test(&gmap->ref_count)) 230 gmap_free(gmap); 231 } 232 EXPORT_SYMBOL_GPL(gmap_put); 233 234 /** 235 * gmap_remove - remove a guest address space but do not free it yet 236 * @gmap: pointer to the guest address space structure 237 */ 238 void gmap_remove(struct gmap *gmap) 239 { 240 struct gmap *sg, *next; 241 unsigned long gmap_asce; 242 243 /* Remove all shadow gmaps linked to this gmap */ 244 if (!list_empty(&gmap->children)) { 245 spin_lock(&gmap->shadow_lock); 246 list_for_each_entry_safe(sg, next, &gmap->children, list) { 247 list_del(&sg->list); 248 gmap_put(sg); 249 } 250 spin_unlock(&gmap->shadow_lock); 251 } 252 /* Remove gmap from the pre-mm list */ 253 spin_lock(&gmap->mm->context.lock); 254 list_del_rcu(&gmap->list); 255 if (list_empty(&gmap->mm->context.gmap_list)) 256 gmap_asce = 0; 257 else if (list_is_singular(&gmap->mm->context.gmap_list)) 258 gmap_asce = list_first_entry(&gmap->mm->context.gmap_list, 259 struct gmap, list)->asce; 260 else 261 gmap_asce = -1UL; 262 WRITE_ONCE(gmap->mm->context.gmap_asce, gmap_asce); 263 spin_unlock(&gmap->mm->context.lock); 264 synchronize_rcu(); 265 /* Put 
reference */ 266 gmap_put(gmap); 267 } 268 EXPORT_SYMBOL_GPL(gmap_remove); 269 270 /** 271 * gmap_enable - switch primary space to the guest address space 272 * @gmap: pointer to the guest address space structure 273 */ 274 void gmap_enable(struct gmap *gmap) 275 { 276 S390_lowcore.gmap = (unsigned long) gmap; 277 } 278 EXPORT_SYMBOL_GPL(gmap_enable); 279 280 /** 281 * gmap_disable - switch back to the standard primary address space 282 * @gmap: pointer to the guest address space structure 283 */ 284 void gmap_disable(struct gmap *gmap) 285 { 286 S390_lowcore.gmap = 0UL; 287 } 288 EXPORT_SYMBOL_GPL(gmap_disable); 289 290 /** 291 * gmap_get_enabled - get a pointer to the currently enabled gmap 292 * 293 * Returns a pointer to the currently enabled gmap. 0 if none is enabled. 294 */ 295 struct gmap *gmap_get_enabled(void) 296 { 297 return (struct gmap *) S390_lowcore.gmap; 298 } 299 EXPORT_SYMBOL_GPL(gmap_get_enabled); 300 301 /* 302 * gmap_alloc_table is assumed to be called with mmap_lock held 303 */ 304 static int gmap_alloc_table(struct gmap *gmap, unsigned long *table, 305 unsigned long init, unsigned long gaddr) 306 { 307 struct page *page; 308 unsigned long *new; 309 310 /* since we dont free the gmap table until gmap_free we can unlock */ 311 page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER); 312 if (!page) 313 return -ENOMEM; 314 new = (unsigned long *) page_to_phys(page); 315 crst_table_init(new, init); 316 spin_lock(&gmap->guest_table_lock); 317 if (*table & _REGION_ENTRY_INVALID) { 318 list_add(&page->lru, &gmap->crst_list); 319 *table = (unsigned long) new | _REGION_ENTRY_LENGTH | 320 (*table & _REGION_ENTRY_TYPE_MASK); 321 page->index = gaddr; 322 page = NULL; 323 } 324 spin_unlock(&gmap->guest_table_lock); 325 if (page) 326 __free_pages(page, CRST_ALLOC_ORDER); 327 return 0; 328 } 329 330 /** 331 * __gmap_segment_gaddr - find virtual address from segment pointer 332 * @entry: pointer to a segment table entry in the guest address space 333 * 334 
* Returns the virtual address in the guest address space for the segment 335 */ 336 static unsigned long __gmap_segment_gaddr(unsigned long *entry) 337 { 338 struct page *page; 339 unsigned long offset; 340 341 offset = (unsigned long) entry / sizeof(unsigned long); 342 offset = (offset & (PTRS_PER_PMD - 1)) * PMD_SIZE; 343 page = pmd_pgtable_page((pmd_t *) entry); 344 return page->index + offset; 345 } 346 347 /** 348 * __gmap_unlink_by_vmaddr - unlink a single segment via a host address 349 * @gmap: pointer to the guest address space structure 350 * @vmaddr: address in the host process address space 351 * 352 * Returns 1 if a TLB flush is required 353 */ 354 static int __gmap_unlink_by_vmaddr(struct gmap *gmap, unsigned long vmaddr) 355 { 356 unsigned long *entry; 357 int flush = 0; 358 359 BUG_ON(gmap_is_shadow(gmap)); 360 spin_lock(&gmap->guest_table_lock); 361 entry = radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT); 362 if (entry) { 363 flush = (*entry != _SEGMENT_ENTRY_EMPTY); 364 *entry = _SEGMENT_ENTRY_EMPTY; 365 } 366 spin_unlock(&gmap->guest_table_lock); 367 return flush; 368 } 369 370 /** 371 * __gmap_unmap_by_gaddr - unmap a single segment via a guest address 372 * @gmap: pointer to the guest address space structure 373 * @gaddr: address in the guest address space 374 * 375 * Returns 1 if a TLB flush is required 376 */ 377 static int __gmap_unmap_by_gaddr(struct gmap *gmap, unsigned long gaddr) 378 { 379 unsigned long vmaddr; 380 381 vmaddr = (unsigned long) radix_tree_delete(&gmap->guest_to_host, 382 gaddr >> PMD_SHIFT); 383 return vmaddr ? __gmap_unlink_by_vmaddr(gmap, vmaddr) : 0; 384 } 385 386 /** 387 * gmap_unmap_segment - unmap segment from the guest address space 388 * @gmap: pointer to the guest address space structure 389 * @to: address in the guest address space 390 * @len: length of the memory area to unmap 391 * 392 * Returns 0 if the unmap succeeded, -EINVAL if not. 
393 */ 394 int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len) 395 { 396 unsigned long off; 397 int flush; 398 399 BUG_ON(gmap_is_shadow(gmap)); 400 if ((to | len) & (PMD_SIZE - 1)) 401 return -EINVAL; 402 if (len == 0 || to + len < to) 403 return -EINVAL; 404 405 flush = 0; 406 mmap_write_lock(gmap->mm); 407 for (off = 0; off < len; off += PMD_SIZE) 408 flush |= __gmap_unmap_by_gaddr(gmap, to + off); 409 mmap_write_unlock(gmap->mm); 410 if (flush) 411 gmap_flush_tlb(gmap); 412 return 0; 413 } 414 EXPORT_SYMBOL_GPL(gmap_unmap_segment); 415 416 /** 417 * gmap_map_segment - map a segment to the guest address space 418 * @gmap: pointer to the guest address space structure 419 * @from: source address in the parent address space 420 * @to: target address in the guest address space 421 * @len: length of the memory area to map 422 * 423 * Returns 0 if the mmap succeeded, -EINVAL or -ENOMEM if not. 424 */ 425 int gmap_map_segment(struct gmap *gmap, unsigned long from, 426 unsigned long to, unsigned long len) 427 { 428 unsigned long off; 429 int flush; 430 431 BUG_ON(gmap_is_shadow(gmap)); 432 if ((from | to | len) & (PMD_SIZE - 1)) 433 return -EINVAL; 434 if (len == 0 || from + len < from || to + len < to || 435 from + len - 1 > TASK_SIZE_MAX || to + len - 1 > gmap->asce_end) 436 return -EINVAL; 437 438 flush = 0; 439 mmap_write_lock(gmap->mm); 440 for (off = 0; off < len; off += PMD_SIZE) { 441 /* Remove old translation */ 442 flush |= __gmap_unmap_by_gaddr(gmap, to + off); 443 /* Store new translation */ 444 if (radix_tree_insert(&gmap->guest_to_host, 445 (to + off) >> PMD_SHIFT, 446 (void *) from + off)) 447 break; 448 } 449 mmap_write_unlock(gmap->mm); 450 if (flush) 451 gmap_flush_tlb(gmap); 452 if (off >= len) 453 return 0; 454 gmap_unmap_segment(gmap, to, len); 455 return -ENOMEM; 456 } 457 EXPORT_SYMBOL_GPL(gmap_map_segment); 458 459 /** 460 * __gmap_translate - translate a guest address to a user space address 461 * @gmap: pointer to 
guest mapping meta data structure 462 * @gaddr: guest address 463 * 464 * Returns user space address which corresponds to the guest address or 465 * -EFAULT if no such mapping exists. 466 * This function does not establish potentially missing page table entries. 467 * The mmap_lock of the mm that belongs to the address space must be held 468 * when this function gets called. 469 * 470 * Note: Can also be called for shadow gmaps. 471 */ 472 unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr) 473 { 474 unsigned long vmaddr; 475 476 vmaddr = (unsigned long) 477 radix_tree_lookup(&gmap->guest_to_host, gaddr >> PMD_SHIFT); 478 /* Note: guest_to_host is empty for a shadow gmap */ 479 return vmaddr ? (vmaddr | (gaddr & ~PMD_MASK)) : -EFAULT; 480 } 481 EXPORT_SYMBOL_GPL(__gmap_translate); 482 483 /** 484 * gmap_translate - translate a guest address to a user space address 485 * @gmap: pointer to guest mapping meta data structure 486 * @gaddr: guest address 487 * 488 * Returns user space address which corresponds to the guest address or 489 * -EFAULT if no such mapping exists. 490 * This function does not establish potentially missing page table entries. 
491 */ 492 unsigned long gmap_translate(struct gmap *gmap, unsigned long gaddr) 493 { 494 unsigned long rc; 495 496 mmap_read_lock(gmap->mm); 497 rc = __gmap_translate(gmap, gaddr); 498 mmap_read_unlock(gmap->mm); 499 return rc; 500 } 501 EXPORT_SYMBOL_GPL(gmap_translate); 502 503 /** 504 * gmap_unlink - disconnect a page table from the gmap shadow tables 505 * @mm: pointer to the parent mm_struct 506 * @table: pointer to the host page table 507 * @vmaddr: vm address associated with the host page table 508 */ 509 void gmap_unlink(struct mm_struct *mm, unsigned long *table, 510 unsigned long vmaddr) 511 { 512 struct gmap *gmap; 513 int flush; 514 515 rcu_read_lock(); 516 list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { 517 flush = __gmap_unlink_by_vmaddr(gmap, vmaddr); 518 if (flush) 519 gmap_flush_tlb(gmap); 520 } 521 rcu_read_unlock(); 522 } 523 524 static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *old, pmd_t new, 525 unsigned long gaddr); 526 527 /** 528 * __gmap_link - set up shadow page tables to connect a host to a guest address 529 * @gmap: pointer to guest mapping meta data structure 530 * @gaddr: guest address 531 * @vmaddr: vm address 532 * 533 * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT 534 * if the vm address is already mapped to a different guest segment. 535 * The mmap_lock of the mm that belongs to the address space must be held 536 * when this function gets called. 
537 */ 538 int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) 539 { 540 struct mm_struct *mm; 541 unsigned long *table; 542 spinlock_t *ptl; 543 pgd_t *pgd; 544 p4d_t *p4d; 545 pud_t *pud; 546 pmd_t *pmd; 547 u64 unprot; 548 int rc; 549 550 BUG_ON(gmap_is_shadow(gmap)); 551 /* Create higher level tables in the gmap page table */ 552 table = gmap->table; 553 if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION1) { 554 table += (gaddr & _REGION1_INDEX) >> _REGION1_SHIFT; 555 if ((*table & _REGION_ENTRY_INVALID) && 556 gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY, 557 gaddr & _REGION1_MASK)) 558 return -ENOMEM; 559 table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); 560 } 561 if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION2) { 562 table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT; 563 if ((*table & _REGION_ENTRY_INVALID) && 564 gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY, 565 gaddr & _REGION2_MASK)) 566 return -ENOMEM; 567 table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); 568 } 569 if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION3) { 570 table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT; 571 if ((*table & _REGION_ENTRY_INVALID) && 572 gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY, 573 gaddr & _REGION3_MASK)) 574 return -ENOMEM; 575 table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); 576 } 577 table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; 578 /* Walk the parent mm page table */ 579 mm = gmap->mm; 580 pgd = pgd_offset(mm, vmaddr); 581 VM_BUG_ON(pgd_none(*pgd)); 582 p4d = p4d_offset(pgd, vmaddr); 583 VM_BUG_ON(p4d_none(*p4d)); 584 pud = pud_offset(p4d, vmaddr); 585 VM_BUG_ON(pud_none(*pud)); 586 /* large puds cannot yet be handled */ 587 if (pud_large(*pud)) 588 return -EFAULT; 589 pmd = pmd_offset(pud, vmaddr); 590 VM_BUG_ON(pmd_none(*pmd)); 591 /* Are we allowed to use huge pages? 
*/ 592 if (pmd_large(*pmd) && !gmap->mm->context.allow_gmap_hpage_1m) 593 return -EFAULT; 594 /* Link gmap segment table entry location to page table. */ 595 rc = radix_tree_preload(GFP_KERNEL_ACCOUNT); 596 if (rc) 597 return rc; 598 ptl = pmd_lock(mm, pmd); 599 spin_lock(&gmap->guest_table_lock); 600 if (*table == _SEGMENT_ENTRY_EMPTY) { 601 rc = radix_tree_insert(&gmap->host_to_guest, 602 vmaddr >> PMD_SHIFT, table); 603 if (!rc) { 604 if (pmd_large(*pmd)) { 605 *table = (pmd_val(*pmd) & 606 _SEGMENT_ENTRY_HARDWARE_BITS_LARGE) 607 | _SEGMENT_ENTRY_GMAP_UC; 608 } else 609 *table = pmd_val(*pmd) & 610 _SEGMENT_ENTRY_HARDWARE_BITS; 611 } 612 } else if (*table & _SEGMENT_ENTRY_PROTECT && 613 !(pmd_val(*pmd) & _SEGMENT_ENTRY_PROTECT)) { 614 unprot = (u64)*table; 615 unprot &= ~_SEGMENT_ENTRY_PROTECT; 616 unprot |= _SEGMENT_ENTRY_GMAP_UC; 617 gmap_pmdp_xchg(gmap, (pmd_t *)table, __pmd(unprot), gaddr); 618 } 619 spin_unlock(&gmap->guest_table_lock); 620 spin_unlock(ptl); 621 radix_tree_preload_end(); 622 return rc; 623 } 624 625 /** 626 * gmap_fault - resolve a fault on a guest address 627 * @gmap: pointer to guest mapping meta data structure 628 * @gaddr: guest address 629 * @fault_flags: flags to pass down to handle_mm_fault() 630 * 631 * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT 632 * if the vm address is already mapped to a different guest segment. 
633 */ 634 int gmap_fault(struct gmap *gmap, unsigned long gaddr, 635 unsigned int fault_flags) 636 { 637 unsigned long vmaddr; 638 int rc; 639 bool unlocked; 640 641 mmap_read_lock(gmap->mm); 642 643 retry: 644 unlocked = false; 645 vmaddr = __gmap_translate(gmap, gaddr); 646 if (IS_ERR_VALUE(vmaddr)) { 647 rc = vmaddr; 648 goto out_up; 649 } 650 if (fixup_user_fault(gmap->mm, vmaddr, fault_flags, 651 &unlocked)) { 652 rc = -EFAULT; 653 goto out_up; 654 } 655 /* 656 * In the case that fixup_user_fault unlocked the mmap_lock during 657 * faultin redo __gmap_translate to not race with a map/unmap_segment. 658 */ 659 if (unlocked) 660 goto retry; 661 662 rc = __gmap_link(gmap, gaddr, vmaddr); 663 out_up: 664 mmap_read_unlock(gmap->mm); 665 return rc; 666 } 667 EXPORT_SYMBOL_GPL(gmap_fault); 668 669 /* 670 * this function is assumed to be called with mmap_lock held 671 */ 672 void __gmap_zap(struct gmap *gmap, unsigned long gaddr) 673 { 674 struct vm_area_struct *vma; 675 unsigned long vmaddr; 676 spinlock_t *ptl; 677 pte_t *ptep; 678 679 /* Find the vm address for the guest address */ 680 vmaddr = (unsigned long) radix_tree_lookup(&gmap->guest_to_host, 681 gaddr >> PMD_SHIFT); 682 if (vmaddr) { 683 vmaddr |= gaddr & ~PMD_MASK; 684 685 vma = vma_lookup(gmap->mm, vmaddr); 686 if (!vma || is_vm_hugetlb_page(vma)) 687 return; 688 689 /* Get pointer to the page table entry */ 690 ptep = get_locked_pte(gmap->mm, vmaddr, &ptl); 691 if (likely(ptep)) { 692 ptep_zap_unused(gmap->mm, vmaddr, ptep, 0); 693 pte_unmap_unlock(ptep, ptl); 694 } 695 } 696 } 697 EXPORT_SYMBOL_GPL(__gmap_zap); 698 699 void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to) 700 { 701 unsigned long gaddr, vmaddr, size; 702 struct vm_area_struct *vma; 703 704 mmap_read_lock(gmap->mm); 705 for (gaddr = from; gaddr < to; 706 gaddr = (gaddr + PMD_SIZE) & PMD_MASK) { 707 /* Find the vm address for the guest address */ 708 vmaddr = (unsigned long) 709 radix_tree_lookup(&gmap->guest_to_host, 
710 gaddr >> PMD_SHIFT); 711 if (!vmaddr) 712 continue; 713 vmaddr |= gaddr & ~PMD_MASK; 714 /* Find vma in the parent mm */ 715 vma = find_vma(gmap->mm, vmaddr); 716 if (!vma) 717 continue; 718 /* 719 * We do not discard pages that are backed by 720 * hugetlbfs, so we don't have to refault them. 721 */ 722 if (is_vm_hugetlb_page(vma)) 723 continue; 724 size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK)); 725 zap_page_range(vma, vmaddr, size); 726 } 727 mmap_read_unlock(gmap->mm); 728 } 729 EXPORT_SYMBOL_GPL(gmap_discard); 730 731 static LIST_HEAD(gmap_notifier_list); 732 static DEFINE_SPINLOCK(gmap_notifier_lock); 733 734 /** 735 * gmap_register_pte_notifier - register a pte invalidation callback 736 * @nb: pointer to the gmap notifier block 737 */ 738 void gmap_register_pte_notifier(struct gmap_notifier *nb) 739 { 740 spin_lock(&gmap_notifier_lock); 741 list_add_rcu(&nb->list, &gmap_notifier_list); 742 spin_unlock(&gmap_notifier_lock); 743 } 744 EXPORT_SYMBOL_GPL(gmap_register_pte_notifier); 745 746 /** 747 * gmap_unregister_pte_notifier - remove a pte invalidation callback 748 * @nb: pointer to the gmap notifier block 749 */ 750 void gmap_unregister_pte_notifier(struct gmap_notifier *nb) 751 { 752 spin_lock(&gmap_notifier_lock); 753 list_del_rcu(&nb->list); 754 spin_unlock(&gmap_notifier_lock); 755 synchronize_rcu(); 756 } 757 EXPORT_SYMBOL_GPL(gmap_unregister_pte_notifier); 758 759 /** 760 * gmap_call_notifier - call all registered invalidation callbacks 761 * @gmap: pointer to guest mapping meta data structure 762 * @start: start virtual address in the guest address space 763 * @end: end virtual address in the guest address space 764 */ 765 static void gmap_call_notifier(struct gmap *gmap, unsigned long start, 766 unsigned long end) 767 { 768 struct gmap_notifier *nb; 769 770 list_for_each_entry(nb, &gmap_notifier_list, list) 771 nb->notifier_call(gmap, start, end); 772 } 773 774 /** 775 * gmap_table_walk - walk the gmap page tables 776 * @gmap: pointer to 
guest mapping meta data structure 777 * @gaddr: virtual address in the guest address space 778 * @level: page table level to stop at 779 * 780 * Returns a table entry pointer for the given guest address and @level 781 * @level=0 : returns a pointer to a page table table entry (or NULL) 782 * @level=1 : returns a pointer to a segment table entry (or NULL) 783 * @level=2 : returns a pointer to a region-3 table entry (or NULL) 784 * @level=3 : returns a pointer to a region-2 table entry (or NULL) 785 * @level=4 : returns a pointer to a region-1 table entry (or NULL) 786 * 787 * Returns NULL if the gmap page tables could not be walked to the 788 * requested level. 789 * 790 * Note: Can also be called for shadow gmaps. 791 */ 792 static inline unsigned long *gmap_table_walk(struct gmap *gmap, 793 unsigned long gaddr, int level) 794 { 795 const int asce_type = gmap->asce & _ASCE_TYPE_MASK; 796 unsigned long *table = gmap->table; 797 798 if (gmap_is_shadow(gmap) && gmap->removed) 799 return NULL; 800 801 if (WARN_ON_ONCE(level > (asce_type >> 2) + 1)) 802 return NULL; 803 804 if (asce_type != _ASCE_TYPE_REGION1 && 805 gaddr & (-1UL << (31 + (asce_type >> 2) * 11))) 806 return NULL; 807 808 switch (asce_type) { 809 case _ASCE_TYPE_REGION1: 810 table += (gaddr & _REGION1_INDEX) >> _REGION1_SHIFT; 811 if (level == 4) 812 break; 813 if (*table & _REGION_ENTRY_INVALID) 814 return NULL; 815 table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); 816 fallthrough; 817 case _ASCE_TYPE_REGION2: 818 table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT; 819 if (level == 3) 820 break; 821 if (*table & _REGION_ENTRY_INVALID) 822 return NULL; 823 table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); 824 fallthrough; 825 case _ASCE_TYPE_REGION3: 826 table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT; 827 if (level == 2) 828 break; 829 if (*table & _REGION_ENTRY_INVALID) 830 return NULL; 831 table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); 832 fallthrough; 833 case 
_ASCE_TYPE_SEGMENT: 834 table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; 835 if (level == 1) 836 break; 837 if (*table & _REGION_ENTRY_INVALID) 838 return NULL; 839 table = (unsigned long *)(*table & _SEGMENT_ENTRY_ORIGIN); 840 table += (gaddr & _PAGE_INDEX) >> _PAGE_SHIFT; 841 } 842 return table; 843 } 844 845 /** 846 * gmap_pte_op_walk - walk the gmap page table, get the page table lock 847 * and return the pte pointer 848 * @gmap: pointer to guest mapping meta data structure 849 * @gaddr: virtual address in the guest address space 850 * @ptl: pointer to the spinlock pointer 851 * 852 * Returns a pointer to the locked pte for a guest address, or NULL 853 */ 854 static pte_t *gmap_pte_op_walk(struct gmap *gmap, unsigned long gaddr, 855 spinlock_t **ptl) 856 { 857 unsigned long *table; 858 859 BUG_ON(gmap_is_shadow(gmap)); 860 /* Walk the gmap page table, lock and get pte pointer */ 861 table = gmap_table_walk(gmap, gaddr, 1); /* get segment pointer */ 862 if (!table || *table & _SEGMENT_ENTRY_INVALID) 863 return NULL; 864 return pte_alloc_map_lock(gmap->mm, (pmd_t *) table, gaddr, ptl); 865 } 866 867 /** 868 * gmap_pte_op_fixup - force a page in and connect the gmap page table 869 * @gmap: pointer to guest mapping meta data structure 870 * @gaddr: virtual address in the guest address space 871 * @vmaddr: address in the host process address space 872 * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE 873 * 874 * Returns 0 if the caller can retry __gmap_translate (might fail again), 875 * -ENOMEM if out of memory and -EFAULT if anything goes wrong while fixing 876 * up or connecting the gmap page table. 877 */ 878 static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr, 879 unsigned long vmaddr, int prot) 880 { 881 struct mm_struct *mm = gmap->mm; 882 unsigned int fault_flags; 883 bool unlocked = false; 884 885 BUG_ON(gmap_is_shadow(gmap)); 886 fault_flags = (prot == PROT_WRITE) ? 
FAULT_FLAG_WRITE : 0; 887 if (fixup_user_fault(mm, vmaddr, fault_flags, &unlocked)) 888 return -EFAULT; 889 if (unlocked) 890 /* lost mmap_lock, caller has to retry __gmap_translate */ 891 return 0; 892 /* Connect the page tables */ 893 return __gmap_link(gmap, gaddr, vmaddr); 894 } 895 896 /** 897 * gmap_pte_op_end - release the page table lock 898 * @ptl: pointer to the spinlock pointer 899 */ 900 static void gmap_pte_op_end(spinlock_t *ptl) 901 { 902 if (ptl) 903 spin_unlock(ptl); 904 } 905 906 /** 907 * gmap_pmd_op_walk - walk the gmap tables, get the guest table lock 908 * and return the pmd pointer 909 * @gmap: pointer to guest mapping meta data structure 910 * @gaddr: virtual address in the guest address space 911 * 912 * Returns a pointer to the pmd for a guest address, or NULL 913 */ 914 static inline pmd_t *gmap_pmd_op_walk(struct gmap *gmap, unsigned long gaddr) 915 { 916 pmd_t *pmdp; 917 918 BUG_ON(gmap_is_shadow(gmap)); 919 pmdp = (pmd_t *) gmap_table_walk(gmap, gaddr, 1); 920 if (!pmdp) 921 return NULL; 922 923 /* without huge pages, there is no need to take the table lock */ 924 if (!gmap->mm->context.allow_gmap_hpage_1m) 925 return pmd_none(*pmdp) ? NULL : pmdp; 926 927 spin_lock(&gmap->guest_table_lock); 928 if (pmd_none(*pmdp)) { 929 spin_unlock(&gmap->guest_table_lock); 930 return NULL; 931 } 932 933 /* 4k page table entries are locked via the pte (pte_alloc_map_lock). 
*/ 934 if (!pmd_large(*pmdp)) 935 spin_unlock(&gmap->guest_table_lock); 936 return pmdp; 937 } 938 939 /** 940 * gmap_pmd_op_end - release the guest_table_lock if needed 941 * @gmap: pointer to the guest mapping meta data structure 942 * @pmdp: pointer to the pmd 943 */ 944 static inline void gmap_pmd_op_end(struct gmap *gmap, pmd_t *pmdp) 945 { 946 if (pmd_large(*pmdp)) 947 spin_unlock(&gmap->guest_table_lock); 948 } 949 950 /* 951 * gmap_protect_pmd - remove access rights to memory and set pmd notification bits 952 * @pmdp: pointer to the pmd to be protected 953 * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE 954 * @bits: notification bits to set 955 * 956 * Returns: 957 * 0 if successfully protected 958 * -EAGAIN if a fixup is needed 959 * -EINVAL if unsupported notifier bits have been specified 960 * 961 * Expected to be called with sg->mm->mmap_lock in read and 962 * guest_table_lock held. 963 */ 964 static int gmap_protect_pmd(struct gmap *gmap, unsigned long gaddr, 965 pmd_t *pmdp, int prot, unsigned long bits) 966 { 967 int pmd_i = pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID; 968 int pmd_p = pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT; 969 pmd_t new = *pmdp; 970 971 /* Fixup needed */ 972 if ((pmd_i && (prot != PROT_NONE)) || (pmd_p && (prot == PROT_WRITE))) 973 return -EAGAIN; 974 975 if (prot == PROT_NONE && !pmd_i) { 976 new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_INVALID)); 977 gmap_pmdp_xchg(gmap, pmdp, new, gaddr); 978 } 979 980 if (prot == PROT_READ && !pmd_p) { 981 new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_INVALID)); 982 new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_PROTECT)); 983 gmap_pmdp_xchg(gmap, pmdp, new, gaddr); 984 } 985 986 if (bits & GMAP_NOTIFY_MPROT) 987 set_pmd(pmdp, set_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_IN))); 988 989 /* Shadow GMAP protection needs split PMDs */ 990 if (bits & GMAP_NOTIFY_SHADOW) 991 return -EINVAL; 992 993 return 0; 994 } 995 996 /* 997 * gmap_protect_pte - remove access rights to 
memory and set pgste bits 998 * @gmap: pointer to guest mapping meta data structure 999 * @gaddr: virtual address in the guest address space 1000 * @pmdp: pointer to the pmd associated with the pte 1001 * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE 1002 * @bits: notification bits to set 1003 * 1004 * Returns 0 if successfully protected, -ENOMEM if out of memory and 1005 * -EAGAIN if a fixup is needed. 1006 * 1007 * Expected to be called with sg->mm->mmap_lock in read 1008 */ 1009 static int gmap_protect_pte(struct gmap *gmap, unsigned long gaddr, 1010 pmd_t *pmdp, int prot, unsigned long bits) 1011 { 1012 int rc; 1013 pte_t *ptep; 1014 spinlock_t *ptl = NULL; 1015 unsigned long pbits = 0; 1016 1017 if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID) 1018 return -EAGAIN; 1019 1020 ptep = pte_alloc_map_lock(gmap->mm, pmdp, gaddr, &ptl); 1021 if (!ptep) 1022 return -ENOMEM; 1023 1024 pbits |= (bits & GMAP_NOTIFY_MPROT) ? PGSTE_IN_BIT : 0; 1025 pbits |= (bits & GMAP_NOTIFY_SHADOW) ? PGSTE_VSIE_BIT : 0; 1026 /* Protect and unlock. */ 1027 rc = ptep_force_prot(gmap->mm, gaddr, ptep, prot, pbits); 1028 gmap_pte_op_end(ptl); 1029 return rc; 1030 } 1031 1032 /* 1033 * gmap_protect_range - remove access rights to memory and set pgste bits 1034 * @gmap: pointer to guest mapping meta data structure 1035 * @gaddr: virtual address in the guest address space 1036 * @len: size of area 1037 * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE 1038 * @bits: pgste notification bits to set 1039 * 1040 * Returns 0 if successfully protected, -ENOMEM if out of memory and 1041 * -EFAULT if gaddr is invalid (or mapping for shadows is missing). 1042 * 1043 * Called with sg->mm->mmap_lock in read. 
 */
static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr,
			      unsigned long len, int prot, unsigned long bits)
{
	unsigned long vmaddr, dist;
	pmd_t *pmdp;
	int rc;

	BUG_ON(gmap_is_shadow(gmap));
	while (len) {
		/* Stays -EAGAIN if no pmd is found, which triggers the fixup */
		rc = -EAGAIN;
		pmdp = gmap_pmd_op_walk(gmap, gaddr);
		if (pmdp) {
			if (!pmd_large(*pmdp)) {
				/* Regular page: protect one pte, advance one page */
				rc = gmap_protect_pte(gmap, gaddr, pmdp, prot,
						      bits);
				if (!rc) {
					len -= PAGE_SIZE;
					gaddr += PAGE_SIZE;
				}
			} else {
				rc = gmap_protect_pmd(gmap, gaddr, pmdp, prot,
						      bits);
				if (!rc) {
					/* Advance to the next huge page boundary */
					dist = HPAGE_SIZE - (gaddr & ~HPAGE_MASK);
					len = len < dist ? 0 : len - dist;
					gaddr = (gaddr & HPAGE_MASK) + HPAGE_SIZE;
				}
			}
			gmap_pmd_op_end(gmap, pmdp);
		}
		if (rc) {
			if (rc == -EINVAL)
				return rc;

			/*
			 * -EAGAIN, fixup of userspace mm and gmap
			 * NOTE(review): any other non-zero rc (e.g. -ENOMEM from
			 * gmap_protect_pte) also takes this path and is retried
			 * after a successful fixup - confirm this is intended.
			 */
			vmaddr = __gmap_translate(gmap, gaddr);
			if (IS_ERR_VALUE(vmaddr))
				return vmaddr;
			rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, prot);
			if (rc)
				return rc;
		}
	}
	return 0;
}

/**
 * gmap_mprotect_notify - change access rights for a range of ptes and
 *                        call the notifier if any pte changes again
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: virtual address in the guest address space
 * @len: size of area
 * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
 *
 * Returns 0 if for each page in the given range a gmap mapping exists,
 * the new access rights could be set and the notifier could be armed.
 * If the gmap mapping is missing for one or more pages -EFAULT is
 * returned. If no memory could be allocated -ENOMEM is returned.
 * -EINVAL is returned if @gaddr or @len is not page aligned, if @gmap is
 * a shadow gmap, or if PROT_READ is requested on a machine without ESOP.
 * This function establishes missing page table entries.
 */
int gmap_mprotect_notify(struct gmap *gmap, unsigned long gaddr,
			 unsigned long len, int prot)
{
	int rc;

	/* Only page aligned ranges on non-shadow gmaps are supported */
	if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK) || gmap_is_shadow(gmap))
		return -EINVAL;
	if (!MACHINE_HAS_ESOP && prot == PROT_READ)
		return -EINVAL;
	mmap_read_lock(gmap->mm);
	rc = gmap_protect_range(gmap, gaddr, len, prot, GMAP_NOTIFY_MPROT);
	mmap_read_unlock(gmap->mm);
	return rc;
}
EXPORT_SYMBOL_GPL(gmap_mprotect_notify);

/**
 * gmap_read_table - get an unsigned long value from a guest page table using
 *                   absolute addressing, without marking the page referenced.
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: virtual address in the guest address space
 * @val: pointer to the unsigned long value to return
 *
 * Returns 0 if the value was read, -ENOMEM if out of memory and -EFAULT
 * if reading using the virtual address failed. -EINVAL if called on a gmap
 * shadow.
 *
 * Called with gmap->mm->mmap_lock in read.
 */
int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val)
{
	unsigned long address, vmaddr;
	spinlock_t *ptl;
	pte_t *ptep, pte;
	int rc;

	if (gmap_is_shadow(gmap))
		return -EINVAL;

	/* Retry until the pte is present and readable, or a fixup fails */
	while (1) {
		rc = -EAGAIN;
		ptep = gmap_pte_op_walk(gmap, gaddr, &ptl);
		if (ptep) {
			pte = *ptep;
			if (pte_present(pte) && (pte_val(pte) & _PAGE_READ)) {
				/* Page frame address from the pte plus the in-page offset */
				address = pte_val(pte) & PAGE_MASK;
				address += gaddr & ~PAGE_MASK;
				*val = *(unsigned long *) address;
				/*
				 * NOTE(review): the function header says the page
				 * is not marked referenced, yet _PAGE_YOUNG is set
				 * on the pte here - confirm which notion of
				 * "referenced" the header refers to.
				 */
				set_pte(ptep, set_pte_bit(*ptep, __pgprot(_PAGE_YOUNG)));
				/* Do *NOT* clear the _PAGE_INVALID bit! */
				rc = 0;
			}
			gmap_pte_op_end(ptl);
		}
		if (!rc)
			break;
		vmaddr = __gmap_translate(gmap, gaddr);
		if (IS_ERR_VALUE(vmaddr)) {
			rc = vmaddr;
			break;
		}
		rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, PROT_READ);
		if (rc)
			break;
	}
	return rc;
}
EXPORT_SYMBOL_GPL(gmap_read_table);

/**
 * gmap_insert_rmap - add a rmap to the host_to_rmap radix tree
 * @sg: pointer to the shadow guest address space structure
 * @vmaddr: vm address associated with the rmap
 * @rmap: pointer to the rmap structure
 *
 * @rmap is either linked in as the new head of the chain stored for
 * @vmaddr, or freed with kfree() if the chain already contains an entry
 * with the same raddr. In both cases the caller must not free it.
 *
 * Called with the sg->guest_table_lock
 */
static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr,
				    struct gmap_rmap *rmap)
{
	struct gmap_rmap *temp;
	void __rcu **slot;

	BUG_ON(!gmap_is_shadow(sg));
	slot = radix_tree_lookup_slot(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT);
	if (slot) {
		/* Chain the existing entries behind the new rmap */
		rmap->next = radix_tree_deref_slot_protected(slot,
							&sg->guest_table_lock);
		/* Drop the new rmap if the same raddr is already recorded */
		for (temp = rmap->next; temp; temp = temp->next) {
			if (temp->raddr == rmap->raddr) {
				kfree(rmap);
				return;
			}
		}
		radix_tree_replace_slot(&sg->host_to_rmap, slot, rmap);
	} else {
		/* First rmap for this vmaddr */
		rmap->next = NULL;
		radix_tree_insert(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT,
				  rmap);
	}
}

/**
 * gmap_protect_rmap - restrict access rights to memory (RO) and create an rmap
 * @sg: pointer to the shadow guest address space structure
 * @raddr: rmap address in the shadow gmap
 * @paddr: address in the parent guest address space
 * @len: length of the memory area to protect
 *
 * Returns 0 if successfully protected and the rmap was created, -ENOMEM
 * if out of memory and -EFAULT if paddr is invalid.
 */
static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr,
			     unsigned long paddr, unsigned long len)
{
	struct gmap *parent;
	struct gmap_rmap *rmap;
	unsigned long vmaddr;
	spinlock_t *ptl;
	pte_t *ptep;
	int rc;

	BUG_ON(!gmap_is_shadow(sg));
	parent = sg->parent;
	/* One iteration per page; a page is retried after a successful fixup */
	while (len) {
		vmaddr = __gmap_translate(parent, paddr);
		if (IS_ERR_VALUE(vmaddr))
			return vmaddr;
		/* Allocate the rmap and preload the tree before taking any lock */
		rmap = kzalloc(sizeof(*rmap), GFP_KERNEL_ACCOUNT);
		if (!rmap)
			return -ENOMEM;
		rmap->raddr = raddr;
		rc = radix_tree_preload(GFP_KERNEL_ACCOUNT);
		if (rc) {
			kfree(rmap);
			return rc;
		}
		rc = -EAGAIN;
		ptep = gmap_pte_op_walk(parent, paddr, &ptl);
		if (ptep) {
			spin_lock(&sg->guest_table_lock);
			rc = ptep_force_prot(parent->mm, paddr, ptep, PROT_READ,
					     PGSTE_VSIE_BIT);
			/* On success gmap_insert_rmap() takes over the rmap */
			if (!rc)
				gmap_insert_rmap(sg, vmaddr, rmap);
			spin_unlock(&sg->guest_table_lock);
			gmap_pte_op_end(ptl);
		}
		radix_tree_preload_end();
		if (rc) {
			/* The rmap was not inserted, so it is still ours to free */
			kfree(rmap);
			rc = gmap_pte_op_fixup(parent, paddr, vmaddr, PROT_READ);
			if (rc)
				return rc;
			continue;
		}
		paddr += PAGE_SIZE;
		len -= PAGE_SIZE;
	}
	return 0;
}

/*
 * Type tags that the gmap_shadow_*() functions in this file or into the
 * low bits of an rmap address (raddr) next to the masked shadow address.
 */
#define _SHADOW_RMAP_MASK	0x7
#define _SHADOW_RMAP_REGION1	0x5
#define _SHADOW_RMAP_REGION2	0x4
#define _SHADOW_RMAP_REGION3	0x3
#define _SHADOW_RMAP_SEGMENT	0x2
#define _SHADOW_RMAP_PGTABLE	0x1

/**
 * gmap_idte_one - invalidate a single region or segment table entry
 * @asce: region or segment table *origin* + table-type bits
 * @vaddr: virtual address to identify the table entry to flush
 *
 * The invalid bit of a single region or segment table entry is set
 * and the associated TLB entries depending on the entry are flushed.
 * The table-type of the @asce identifies the portion of the @vaddr
 * that is used as the invalidation index.
 */
static inline void gmap_idte_one(unsigned long asce, unsigned long vaddr)
{
	asm volatile(
		" idte %0,0,%1"
		: : "a" (asce), "a" (vaddr) : "cc", "memory");
}

/**
 * gmap_unshadow_page - remove a page from a shadow page table
 * @sg: pointer to the shadow guest address space structure
 * @raddr: rmap address in the shadow guest address space
 *
 * Does nothing if the table walk finds no pte or the pte is already
 * invalid.
 *
 * Called with the sg->guest_table_lock
 */
static void gmap_unshadow_page(struct gmap *sg, unsigned long raddr)
{
	unsigned long *table;

	BUG_ON(!gmap_is_shadow(sg));
	table = gmap_table_walk(sg, raddr, 0); /* get page table pointer */
	if (!table || *table & _PAGE_INVALID)
		return;
	gmap_call_notifier(sg, raddr, raddr + _PAGE_SIZE - 1);
	ptep_unshadow_pte(sg->mm, raddr, (pte_t *) table);
}

/**
 * __gmap_unshadow_pgt - remove all entries from a shadow page table
 * @sg: pointer to the shadow guest address space structure
 * @raddr: rmap address in the shadow guest address space
 * @pgt: pointer to the start of a shadow page table
 *
 * Every entry of @pgt is overwritten with _PAGE_INVALID. @raddr is advanced
 * per entry but is not otherwise used by the loop.
 *
 * Called with the sg->guest_table_lock
 */
static void __gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr,
				unsigned long *pgt)
{
	int i;

	BUG_ON(!gmap_is_shadow(sg));
	for (i = 0; i < _PAGE_ENTRIES; i++, raddr += _PAGE_SIZE)
		pgt[i] = _PAGE_INVALID;
}

/**
 * gmap_unshadow_pgt - remove a shadow page table from a segment entry
 * @sg: pointer to the shadow guest address space structure
 * @raddr: address in the shadow guest address space
 *
 * Does nothing if no segment entry is found or it has no origin set.
 *
 * Called with the sg->guest_table_lock
 */
static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr)
{
	unsigned long sto, *ste, *pgt;
	struct page *page;

	BUG_ON(!gmap_is_shadow(sg));
	ste = gmap_table_walk(sg, raddr, 1); /* get segment pointer */
	if (!ste || !(*ste & _SEGMENT_ENTRY_ORIGIN))
		return;
	gmap_call_notifier(sg, raddr, raddr + _SEGMENT_SIZE - 1);
	/* Step back from the entry to the start of its segment table */
	sto = (unsigned long) (ste - ((raddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT));
	gmap_idte_one(sto | _ASCE_TYPE_SEGMENT, raddr);
	pgt = (unsigned long *)(*ste & _SEGMENT_ENTRY_ORIGIN);
	*ste = _SEGMENT_ENTRY_EMPTY;
	__gmap_unshadow_pgt(sg, raddr, pgt);
	/* Free page table */
	page = pfn_to_page(__pa(pgt) >> PAGE_SHIFT);
	list_del(&page->lru);
	page_table_free_pgste(page);
}

/**
 * __gmap_unshadow_sgt - remove all entries from a shadow segment table
 * @sg: pointer to the shadow guest address space structure
 * @raddr: rmap address in the shadow guest address space
 * @sgt: pointer to the start of a shadow segment table
 *
 * Entries without an origin are skipped. For every other entry the
 * referenced page table is invalidated and freed.
 *
 * Called with the sg->guest_table_lock
 */
static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr,
				unsigned long *sgt)
{
	unsigned long *pgt;
	struct page *page;
	int i;

	BUG_ON(!gmap_is_shadow(sg));
	for (i = 0; i < _CRST_ENTRIES; i++, raddr += _SEGMENT_SIZE) {
		if (!(sgt[i] & _SEGMENT_ENTRY_ORIGIN))
			continue;
		/*
		 * NOTE(review): the origin is extracted with
		 * _REGION_ENTRY_ORIGIN although the check above and
		 * gmap_unshadow_pgt() use _SEGMENT_ENTRY_ORIGIN - confirm the
		 * two masks are interchangeable for shadow page tables.
		 */
		pgt = (unsigned long *)(sgt[i] & _REGION_ENTRY_ORIGIN);
		sgt[i] = _SEGMENT_ENTRY_EMPTY;
		__gmap_unshadow_pgt(sg, raddr, pgt);
		/* Free page table */
		page = pfn_to_page(__pa(pgt) >> PAGE_SHIFT);
		list_del(&page->lru);
		page_table_free_pgste(page);
	}
}

/**
 * gmap_unshadow_sgt - remove a shadow segment table from a region-3 entry
 * @sg: pointer to the shadow guest address space structure
 * @raddr: rmap address in the shadow guest address space
 *
 * Does nothing if no region-3 entry is found or it has no origin set.
 *
 * Called with the sg->guest_table_lock
 */
static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr)
{
	unsigned long r3o, *r3e, *sgt;
	struct page *page;

	BUG_ON(!gmap_is_shadow(sg));
	r3e = gmap_table_walk(sg, raddr, 2); /* get region-3 pointer */
	if (!r3e || !(*r3e & _REGION_ENTRY_ORIGIN))
		return;
	gmap_call_notifier(sg, raddr, raddr + _REGION3_SIZE - 1);
	/* Step back from the entry to the start of its region-3 table */
	r3o = (unsigned long) (r3e - ((raddr & _REGION3_INDEX) >> _REGION3_SHIFT));
	gmap_idte_one(r3o | _ASCE_TYPE_REGION3, raddr);
	sgt = (unsigned long *)(*r3e & _REGION_ENTRY_ORIGIN);
	*r3e = _REGION3_ENTRY_EMPTY;
	__gmap_unshadow_sgt(sg, raddr, sgt);
	/* Free segment table */
	page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT);
	list_del(&page->lru);
	__free_pages(page, CRST_ALLOC_ORDER);
}

/**
 * __gmap_unshadow_r3t - remove all entries from a shadow region-3 table
 * @sg: pointer to the shadow guest address space structure
 * @raddr: address in the shadow guest address space
 * @r3t: pointer to the start of a shadow region-3 table
 *
 * Entries without an origin are skipped. For every other entry the
 * referenced segment table is emptied and freed.
 *
 * Called with the sg->guest_table_lock
 */
static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr,
				unsigned long *r3t)
{
	unsigned long *sgt;
	struct page *page;
	int i;

	BUG_ON(!gmap_is_shadow(sg));
	for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION3_SIZE) {
		if (!(r3t[i] & _REGION_ENTRY_ORIGIN))
			continue;
		sgt = (unsigned long *)(r3t[i] & _REGION_ENTRY_ORIGIN);
		r3t[i] = _REGION3_ENTRY_EMPTY;
		__gmap_unshadow_sgt(sg, raddr, sgt);
		/* Free segment table */
		page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT);
		list_del(&page->lru);
		__free_pages(page, CRST_ALLOC_ORDER);
	}
}

/**
 * gmap_unshadow_r3t - remove a shadow region-3 table from a region-2 entry
 * @sg: pointer to the shadow guest address space structure
 * @raddr: rmap address in the shadow guest address space
 *
 * Does nothing if no region-2 entry is found or it has no origin set.
 *
 * Called with the sg->guest_table_lock
 */
static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr)
{
	unsigned long r2o, *r2e, *r3t;
	struct page *page;

	BUG_ON(!gmap_is_shadow(sg));
	r2e = gmap_table_walk(sg, raddr, 3); /* get region-2 pointer */
	if (!r2e || !(*r2e & _REGION_ENTRY_ORIGIN))
		return;
	gmap_call_notifier(sg, raddr, raddr + _REGION2_SIZE - 1);
	/* Step back from the entry to the start of its region-2 table */
	r2o = (unsigned long) (r2e - ((raddr & _REGION2_INDEX) >> _REGION2_SHIFT));
	gmap_idte_one(r2o | _ASCE_TYPE_REGION2, raddr);
	r3t = (unsigned long *)(*r2e & _REGION_ENTRY_ORIGIN);
	*r2e = _REGION2_ENTRY_EMPTY;
	__gmap_unshadow_r3t(sg, raddr, r3t);
	/* Free region 3 table */
	page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT);
	list_del(&page->lru);
	__free_pages(page, CRST_ALLOC_ORDER);
}

/**
 * __gmap_unshadow_r2t - remove all entries from a shadow region-2 table
 * @sg: pointer to the shadow guest address space structure
 * @raddr: rmap address in the shadow guest address space
 * @r2t: pointer to the start of a shadow region-2 table
 *
 * Entries without an origin are skipped. For every other entry the
 * referenced region-3 table is emptied and freed.
 *
 * Called with the sg->guest_table_lock
 */
static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr,
				unsigned long *r2t)
{
	unsigned long *r3t;
	struct page *page;
	int i;

	BUG_ON(!gmap_is_shadow(sg));
	for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION2_SIZE) {
		if (!(r2t[i] & _REGION_ENTRY_ORIGIN))
			continue;
		r3t = (unsigned long *)(r2t[i] & _REGION_ENTRY_ORIGIN);
		r2t[i] = _REGION2_ENTRY_EMPTY;
		__gmap_unshadow_r3t(sg, raddr, r3t);
		/* Free region 3 table */
		page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT);
		list_del(&page->lru);
		__free_pages(page, CRST_ALLOC_ORDER);
	}
}

/**
 * gmap_unshadow_r2t - remove a shadow region-2 table from a region-1 entry
 * @sg: pointer to the shadow guest address space structure
 * @raddr: rmap address in the shadow guest address space
 *
 * Does nothing if no region-1 entry is found or it has no origin set.
 *
 * Called with the sg->guest_table_lock
 */
static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr)
{
	unsigned long r1o, *r1e, *r2t;
	struct page *page;

	BUG_ON(!gmap_is_shadow(sg));
	r1e = gmap_table_walk(sg, raddr, 4); /* get region-1 pointer */
	if (!r1e || !(*r1e & _REGION_ENTRY_ORIGIN))
		return;
	gmap_call_notifier(sg, raddr, raddr + _REGION1_SIZE - 1);
	/* Step back from the entry to the start of its region-1 table */
	r1o = (unsigned long) (r1e - ((raddr & _REGION1_INDEX) >> _REGION1_SHIFT));
	gmap_idte_one(r1o | _ASCE_TYPE_REGION1, raddr);
	r2t = (unsigned long *)(*r1e & _REGION_ENTRY_ORIGIN);
	*r1e = _REGION1_ENTRY_EMPTY;
	__gmap_unshadow_r2t(sg, raddr, r2t);
	/* Free region 2 table */
	page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT);
	list_del(&page->lru);
	__free_pages(page, CRST_ALLOC_ORDER);
}

/**
 * __gmap_unshadow_r1t - remove all entries from a shadow region-1 table
 * @sg: pointer to the shadow guest address space structure
 * @raddr: rmap address in the shadow guest address space
 * @r1t: pointer to the start of a shadow region-1 table
 *
 * Unlike the lower-level __gmap_unshadow_*() helpers, each used entry is
 * flushed with gmap_idte_one() here, after the tables below it have been
 * emptied and before the entry itself is reset.
 *
 * Called with the sg->guest_table_lock
 */
static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr,
				unsigned long *r1t)
{
	unsigned long asce, *r2t;
	struct page *page;
	int i;

	BUG_ON(!gmap_is_shadow(sg));
	asce = (unsigned long) r1t | _ASCE_TYPE_REGION1;
	for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION1_SIZE) {
		if (!(r1t[i] & _REGION_ENTRY_ORIGIN))
			continue;
		r2t = (unsigned long *)(r1t[i] & _REGION_ENTRY_ORIGIN);
		__gmap_unshadow_r2t(sg, raddr, r2t);
		/* Clear entry and flush translation r1t -> r2t */
		gmap_idte_one(asce, raddr);
		r1t[i] = _REGION1_ENTRY_EMPTY;
		/* Free region 2 table */
		page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT);
		list_del(&page->lru);
		__free_pages(page, CRST_ALLOC_ORDER);
	}
}

/**
 * gmap_unshadow - remove a shadow page table completely
 * @sg: pointer to the shadow guest address space structure
 *
 * Marks @sg as removed, calls the notifier for the whole address range,
 * flushes the TLB and then tears down all tables below the top-level table
 * selected by the type bits of sg->asce. A second call on the same @sg
 * returns immediately.
 *
 * Called with sg->guest_table_lock
 */
static void gmap_unshadow(struct gmap *sg)
{
	unsigned long *table;

	BUG_ON(!gmap_is_shadow(sg));
	if (sg->removed)
		return;
	sg->removed = 1;
	gmap_call_notifier(sg, 0, -1UL);
	gmap_flush_tlb(sg);
	table = (unsigned long *)(sg->asce & _ASCE_ORIGIN);
	/* Start the teardown at the level of the top-level table */
	switch (sg->asce & _ASCE_TYPE_MASK) {
	case _ASCE_TYPE_REGION1:
		__gmap_unshadow_r1t(sg, 0, table);
		break;
	case _ASCE_TYPE_REGION2:
		__gmap_unshadow_r2t(sg, 0, table);
		break;
	case _ASCE_TYPE_REGION3:
		__gmap_unshadow_r3t(sg, 0, table);
		break;
	case _ASCE_TYPE_SEGMENT:
		__gmap_unshadow_sgt(sg, 0, table);
		break;
	}
}

/**
 * gmap_find_shadow - find a specific asce in the list of shadow tables
 * @parent: pointer to the parent gmap
 * @asce: ASCE for which the shadow table is created
 * @edat_level: edat level to be used for the shadow translation
 *
 * Returns the pointer to a gmap if a shadow table with the given asce is
 * already available, ERR_PTR(-EAGAIN) if another one is just being created,
 * otherwise NULL
 *
 * A reference is taken on the returned gmap. Shadows that are marked as
 * removed are ignored.
 */
static struct gmap *gmap_find_shadow(struct gmap *parent, unsigned long asce,
				     int edat_level)
{
	struct gmap *sg;

	list_for_each_entry(sg, &parent->children, list) {
		if (sg->orig_asce != asce || sg->edat_level != edat_level ||
		    sg->removed)
			continue;
		/* A matching shadow exists but is not yet fully set up */
		if (!sg->initialized)
			return ERR_PTR(-EAGAIN);
		refcount_inc(&sg->ref_count);
		return sg;
	}
	return NULL;
}

/**
 * gmap_shadow_valid - check if a shadow guest address space matches the
 *                     given properties and is still valid
 * @sg: pointer to the shadow guest address space structure
 * @asce: ASCE for which the shadow table is requested
 * @edat_level: edat level to be used for the shadow translation
 *
 * Returns 1 if the gmap shadow is still valid and matches the given
 * properties, the caller can
continue using it. Returns 0 otherwise, the 1628 * caller has to request a new shadow gmap in this case. 1629 * 1630 */ 1631 int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level) 1632 { 1633 if (sg->removed) 1634 return 0; 1635 return sg->orig_asce == asce && sg->edat_level == edat_level; 1636 } 1637 EXPORT_SYMBOL_GPL(gmap_shadow_valid); 1638 1639 /** 1640 * gmap_shadow - create/find a shadow guest address space 1641 * @parent: pointer to the parent gmap 1642 * @asce: ASCE for which the shadow table is created 1643 * @edat_level: edat level to be used for the shadow translation 1644 * 1645 * The pages of the top level page table referred by the asce parameter 1646 * will be set to read-only and marked in the PGSTEs of the kvm process. 1647 * The shadow table will be removed automatically on any change to the 1648 * PTE mapping for the source table. 1649 * 1650 * Returns a guest address space structure, ERR_PTR(-ENOMEM) if out of memory, 1651 * ERR_PTR(-EAGAIN) if the caller has to retry and ERR_PTR(-EFAULT) if the 1652 * parent gmap table could not be protected. 
 */
struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce,
			 int edat_level)
{
	struct gmap *sg, *new;
	unsigned long limit;
	int rc;

	BUG_ON(parent->mm->context.allow_gmap_hpage_1m);
	BUG_ON(gmap_is_shadow(parent));
	/* Fast path: an existing shadow (or -EAGAIN) is returned as is */
	spin_lock(&parent->shadow_lock);
	sg = gmap_find_shadow(parent, asce, edat_level);
	spin_unlock(&parent->shadow_lock);
	if (sg)
		return sg;
	/* Create a new shadow gmap */
	/* Address space limit derived from the table type bits of the asce */
	limit = -1UL >> (33 - (((asce & _ASCE_TYPE_MASK) >> 2) * 11));
	if (asce & _ASCE_REAL_SPACE)
		limit = -1UL;
	new = gmap_alloc(limit);
	if (!new)
		return ERR_PTR(-ENOMEM);
	new->mm = parent->mm;
	new->parent = gmap_get(parent);
	new->orig_asce = asce;
	new->edat_level = edat_level;
	new->initialized = false;
	spin_lock(&parent->shadow_lock);
	/* Recheck if another CPU created the same shadow */
	sg = gmap_find_shadow(parent, asce, edat_level);
	if (sg) {
		spin_unlock(&parent->shadow_lock);
		gmap_free(new);
		return sg;
	}
	if (asce & _ASCE_REAL_SPACE) {
		/* only allow one real-space gmap shadow */
		list_for_each_entry(sg, &parent->children, list) {
			if (sg->orig_asce & _ASCE_REAL_SPACE) {
				spin_lock(&sg->guest_table_lock);
				gmap_unshadow(sg);
				spin_unlock(&sg->guest_table_lock);
				list_del(&sg->list);
				gmap_put(sg);
				break;
			}
		}
	}
	/*
	 * presumably one reference for the children list and one for the
	 * caller - TODO confirm
	 */
	refcount_set(&new->ref_count, 2);
	list_add(&new->list, &parent->children);
	if (asce & _ASCE_REAL_SPACE) {
		/* nothing to protect, return right away */
		new->initialized = true;
		spin_unlock(&parent->shadow_lock);
		return new;
	}
	spin_unlock(&parent->shadow_lock);
	/* protect after insertion, so it will get properly invalidated */
	mmap_read_lock(parent->mm);
	rc = gmap_protect_range(parent, asce & _ASCE_ORIGIN,
				((asce & _ASCE_TABLE_LENGTH) + 1) * PAGE_SIZE,
				PROT_READ, GMAP_NOTIFY_SHADOW);
	mmap_read_unlock(parent->mm);
	spin_lock(&parent->shadow_lock);
	new->initialized = true;
	/* Protection failed: unlink and free the new shadow again */
	if (rc) {
		list_del(&new->list);
		gmap_free(new);
		new = ERR_PTR(rc);
	}
	spin_unlock(&parent->shadow_lock);
	return new;
}
EXPORT_SYMBOL_GPL(gmap_shadow);

/**
 * gmap_shadow_r2t - create an empty shadow region 2 table
 * @sg: pointer to the shadow guest address space structure
 * @saddr: faulting address in the shadow gmap
 * @r2t: parent gmap address of the region 2 table to get shadowed
 * @fake: r2t references contiguous guest memory block, not a r2t
 *
 * The r2t parameter specifies the address of the source table. The
 * four pages of the source table are made read-only in the parent gmap
 * address space. A write to the source table area @r2t will automatically
 * remove the shadow r2 table and all of its descendants.
 *
 * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
 * shadow table structure is incomplete, -ENOMEM if out of memory and
 * -EFAULT if an address in the parent gmap could not be resolved.
 *
 * Called with sg->mm->mmap_lock in read.
 */
int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t,
		    int fake)
{
	unsigned long raddr, origin, offset, len;
	unsigned long *s_r2t, *table;
	struct page *page;
	int rc;

	BUG_ON(!gmap_is_shadow(sg));
	/* Allocate a shadow region second table */
	page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
	if (!page)
		return -ENOMEM;
	/* Remember the parent table address (and the fake flag) in the page */
	page->index = r2t & _REGION_ENTRY_ORIGIN;
	if (fake)
		page->index |= GMAP_SHADOW_FAKE_TABLE;
	s_r2t = (unsigned long *) page_to_phys(page);
	/* Install shadow region second table */
	spin_lock(&sg->guest_table_lock);
	table = gmap_table_walk(sg, saddr, 4); /* get region-1 pointer */
	if (!table) {
		rc = -EAGAIN;		/* Race with unshadow */
		goto out_free;
	}
	if (!(*table & _REGION_ENTRY_INVALID)) {
		rc = 0;			/* Already established */
		goto out_free;
	} else if (*table & _REGION_ENTRY_ORIGIN) {
		rc = -EAGAIN;		/* Race with shadow */
		goto out_free;
	}
	crst_table_init(s_r2t, _REGION2_ENTRY_EMPTY);
	/* mark as invalid as long as the parent table is not protected */
	*table = (unsigned long) s_r2t | _REGION_ENTRY_LENGTH |
		 _REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_INVALID;
	if (sg->edat_level >= 1)
		*table |= (r2t & _REGION_ENTRY_PROTECT);
	list_add(&page->lru, &sg->crst_list);
	if (fake) {
		/* nothing to protect for fake tables */
		*table &= ~_REGION_ENTRY_INVALID;
		spin_unlock(&sg->guest_table_lock);
		return 0;
	}
	spin_unlock(&sg->guest_table_lock);
	/* Make r2t read-only in parent gmap page table */
	raddr = (saddr & _REGION1_MASK) | _SHADOW_RMAP_REGION1;
	origin = r2t & _REGION_ENTRY_ORIGIN;
	offset = ((r2t & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE;
	len = ((r2t & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset;
	rc = gmap_protect_rmap(sg, raddr, origin + offset, len);
	spin_lock(&sg->guest_table_lock);
	if (!rc) {
		/* The lock was dropped: check that our table is still installed */
		table = gmap_table_walk(sg, saddr, 4);
		if (!table || (*table & _REGION_ENTRY_ORIGIN) !=
			      (unsigned long) s_r2t)
			rc = -EAGAIN;		/* Race with unshadow */
		else
			*table &= ~_REGION_ENTRY_INVALID;
	} else {
		/* Protecting the parent failed: remove the table again */
		gmap_unshadow_r2t(sg, raddr);
	}
	spin_unlock(&sg->guest_table_lock);
	return rc;
out_free:
	/* The new table was never installed, so just free its pages */
	spin_unlock(&sg->guest_table_lock);
	__free_pages(page, CRST_ALLOC_ORDER);
	return rc;
}
EXPORT_SYMBOL_GPL(gmap_shadow_r2t);

/**
 * gmap_shadow_r3t - create a shadow region 3 table
 * @sg: pointer to the shadow guest address space structure
 * @saddr: faulting address in the shadow gmap
 * @r3t: parent gmap address of the region 3 table to get shadowed
 * @fake: r3t references contiguous guest memory block, not a r3t
 *
 * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
 * shadow table structure is incomplete, -ENOMEM if out of memory and
 * -EFAULT if an address in the parent gmap could not be resolved.
 *
 * Called with sg->mm->mmap_lock in read.
 */
int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t,
		    int fake)
{
	unsigned long raddr, origin, offset, len;
	unsigned long *s_r3t, *table;
	struct page *page;
	int rc;

	BUG_ON(!gmap_is_shadow(sg));
	/* Allocate a shadow region third table */
	page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
	if (!page)
		return -ENOMEM;
	/* Remember the parent table address (and the fake flag) in the page */
	page->index = r3t & _REGION_ENTRY_ORIGIN;
	if (fake)
		page->index |= GMAP_SHADOW_FAKE_TABLE;
	s_r3t = (unsigned long *) page_to_phys(page);
	/* Install shadow region third table */
	spin_lock(&sg->guest_table_lock);
	table = gmap_table_walk(sg, saddr, 3); /* get region-2 pointer */
	if (!table) {
		rc = -EAGAIN;		/* Race with unshadow */
		goto out_free;
	}
	if (!(*table & _REGION_ENTRY_INVALID)) {
		rc = 0;			/* Already established */
		goto out_free;
	} else if (*table & _REGION_ENTRY_ORIGIN) {
		rc = -EAGAIN;		/* Race with shadow */
		goto out_free;
	}
	crst_table_init(s_r3t, _REGION3_ENTRY_EMPTY);
	/* mark as invalid as long as the parent table is not protected */
	*table = (unsigned long) s_r3t | _REGION_ENTRY_LENGTH |
		 _REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_INVALID;
	if (sg->edat_level >= 1)
		*table |= (r3t & _REGION_ENTRY_PROTECT);
	list_add(&page->lru, &sg->crst_list);
	if (fake) {
		/* nothing to protect for fake tables */
		*table &= ~_REGION_ENTRY_INVALID;
		spin_unlock(&sg->guest_table_lock);
		return 0;
	}
	spin_unlock(&sg->guest_table_lock);
	/* Make r3t read-only in parent gmap page table */
	raddr = (saddr & _REGION2_MASK) | _SHADOW_RMAP_REGION2;
	origin = r3t & _REGION_ENTRY_ORIGIN;
	offset = ((r3t & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE;
	len = ((r3t & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset;
	rc = gmap_protect_rmap(sg, raddr, origin + offset, len);
	spin_lock(&sg->guest_table_lock);
	if (!rc) {
		/* The lock was dropped: check that our table is still installed */
		table = gmap_table_walk(sg, saddr, 3);
		if (!table || (*table & _REGION_ENTRY_ORIGIN) !=
			      (unsigned long) s_r3t)
			rc = -EAGAIN;		/* Race with unshadow */
		else
			*table &= ~_REGION_ENTRY_INVALID;
	} else {
		/* Protecting the parent failed: remove the table again */
		gmap_unshadow_r3t(sg, raddr);
	}
	spin_unlock(&sg->guest_table_lock);
	return rc;
out_free:
	/* The new table was never installed, so just free its pages */
	spin_unlock(&sg->guest_table_lock);
	__free_pages(page, CRST_ALLOC_ORDER);
	return rc;
}
EXPORT_SYMBOL_GPL(gmap_shadow_r3t);

/**
 * gmap_shadow_sgt - create a shadow segment table
 * @sg: pointer to the shadow guest address space structure
 * @saddr: faulting address in the shadow gmap
 * @sgt: parent gmap address of the segment table to get shadowed
 * @fake: sgt references contiguous guest memory block, not a sgt
 *
 * Returns: 0 if successfully shadowed or already shadowed, -EAGAIN if the
 * shadow table structure is incomplete, -ENOMEM if out of memory and
 * -EFAULT if an address in the parent gmap could not be resolved.
 *
 * Called with sg->mm->mmap_lock in read.
 */
int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt,
		    int fake)
{
	unsigned long raddr, origin, offset, len;
	unsigned long *s_sgt, *table;
	struct page *page;
	int rc;

	BUG_ON(!gmap_is_shadow(sg) || (sgt & _REGION3_ENTRY_LARGE));
	/* Allocate a shadow segment table */
	page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
	if (!page)
		return -ENOMEM;
	/* Remember the parent table address (and the fake flag) in the page */
	page->index = sgt & _REGION_ENTRY_ORIGIN;
	if (fake)
		page->index |= GMAP_SHADOW_FAKE_TABLE;
	s_sgt = (unsigned long *) page_to_phys(page);
	/* Install shadow segment table */
	spin_lock(&sg->guest_table_lock);
	table = gmap_table_walk(sg, saddr, 2); /* get region-3 pointer */
	if (!table) {
		rc = -EAGAIN;		/* Race with unshadow */
		goto out_free;
	}
	if (!(*table & _REGION_ENTRY_INVALID)) {
		rc = 0;			/* Already established */
		goto out_free;
	} else if (*table & _REGION_ENTRY_ORIGIN) {
		rc = -EAGAIN;		/* Race with shadow */
		goto out_free;
	}
	crst_table_init(s_sgt, _SEGMENT_ENTRY_EMPTY);
	/* mark as invalid as long as the parent table is not protected */
	*table = (unsigned long) s_sgt | _REGION_ENTRY_LENGTH |
		 _REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID;
	if (sg->edat_level >= 1)
		*table |= sgt & _REGION_ENTRY_PROTECT;
	list_add(&page->lru, &sg->crst_list);
	if (fake) {
		/* nothing to protect for fake tables */
		*table &= ~_REGION_ENTRY_INVALID;
		spin_unlock(&sg->guest_table_lock);
		return 0;
	}
	spin_unlock(&sg->guest_table_lock);
	/* Make sgt read-only in parent gmap page table */
	raddr = (saddr & _REGION3_MASK) | _SHADOW_RMAP_REGION3;
	origin = sgt & _REGION_ENTRY_ORIGIN;
	offset = ((sgt & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE;
	len = ((sgt & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset;
	rc = gmap_protect_rmap(sg, raddr, origin + offset, len);
	spin_lock(&sg->guest_table_lock);
	if (!rc) {
		/* The lock was dropped: check that our table is still installed */
		table = gmap_table_walk(sg, saddr, 2);
		if (!table || (*table & _REGION_ENTRY_ORIGIN) !=
			      (unsigned long) s_sgt)
			rc = -EAGAIN;		/* Race with unshadow */
		else
			*table &= ~_REGION_ENTRY_INVALID;
	} else {
		/* Protecting the parent failed: remove the table again */
		gmap_unshadow_sgt(sg, raddr);
	}
	spin_unlock(&sg->guest_table_lock);
	return rc;
out_free:
	/* The new table was never installed, so just free its pages */
	spin_unlock(&sg->guest_table_lock);
	__free_pages(page, CRST_ALLOC_ORDER);
	return rc;
}
EXPORT_SYMBOL_GPL(gmap_shadow_sgt);

/**
 * gmap_shadow_pgt_lookup - find a shadow page table
 * @sg: pointer to the shadow guest address space structure
 * @saddr: the address in the shadow guest address space
 * @pgt: on success, set to the parent gmap address of the shadowed page table
 * @dat_protection: on success, set to 1 if the pgtable is marked as
 *                  protected by dat, else 0
 * @fake: on success, set to 1 if pgt references a contiguous guest memory
 *        block, not a pgtable, else 0
 *
 * Returns 0 if the shadow page table was found and -EAGAIN if the page
 * table was not found.
 *
 * Called with sg->mm->mmap_lock in read.
1997 */ 1998 int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr, 1999 unsigned long *pgt, int *dat_protection, 2000 int *fake) 2001 { 2002 unsigned long *table; 2003 struct page *page; 2004 int rc; 2005 2006 BUG_ON(!gmap_is_shadow(sg)); 2007 spin_lock(&sg->guest_table_lock); 2008 table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */ 2009 if (table && !(*table & _SEGMENT_ENTRY_INVALID)) { 2010 /* Shadow page tables are full pages (pte+pgste) */ 2011 page = pfn_to_page(*table >> PAGE_SHIFT); 2012 *pgt = page->index & ~GMAP_SHADOW_FAKE_TABLE; 2013 *dat_protection = !!(*table & _SEGMENT_ENTRY_PROTECT); 2014 *fake = !!(page->index & GMAP_SHADOW_FAKE_TABLE); 2015 rc = 0; 2016 } else { 2017 rc = -EAGAIN; 2018 } 2019 spin_unlock(&sg->guest_table_lock); 2020 return rc; 2021 2022 } 2023 EXPORT_SYMBOL_GPL(gmap_shadow_pgt_lookup); 2024 2025 /** 2026 * gmap_shadow_pgt - instantiate a shadow page table 2027 * @sg: pointer to the shadow guest address space structure 2028 * @saddr: faulting address in the shadow gmap 2029 * @pgt: parent gmap address of the page table to get shadowed 2030 * @fake: pgt references contiguous guest memory block, not a pgtable 2031 * 2032 * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the 2033 * shadow table structure is incomplete, -ENOMEM if out of memory, 2034 * -EFAULT if an address in the parent gmap could not be resolved and 2035 * 2036 * Called with gmap->mm->mmap_lock in read 2037 */ 2038 int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, 2039 int fake) 2040 { 2041 unsigned long raddr, origin; 2042 unsigned long *s_pgt, *table; 2043 struct page *page; 2044 int rc; 2045 2046 BUG_ON(!gmap_is_shadow(sg) || (pgt & _SEGMENT_ENTRY_LARGE)); 2047 /* Allocate a shadow page table */ 2048 page = page_table_alloc_pgste(sg->mm); 2049 if (!page) 2050 return -ENOMEM; 2051 page->index = pgt & _SEGMENT_ENTRY_ORIGIN; 2052 if (fake) 2053 page->index |= GMAP_SHADOW_FAKE_TABLE; 2054 s_pgt = 
(unsigned long *) page_to_phys(page); 2055 /* Install shadow page table */ 2056 spin_lock(&sg->guest_table_lock); 2057 table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */ 2058 if (!table) { 2059 rc = -EAGAIN; /* Race with unshadow */ 2060 goto out_free; 2061 } 2062 if (!(*table & _SEGMENT_ENTRY_INVALID)) { 2063 rc = 0; /* Already established */ 2064 goto out_free; 2065 } else if (*table & _SEGMENT_ENTRY_ORIGIN) { 2066 rc = -EAGAIN; /* Race with shadow */ 2067 goto out_free; 2068 } 2069 /* mark as invalid as long as the parent table is not protected */ 2070 *table = (unsigned long) s_pgt | _SEGMENT_ENTRY | 2071 (pgt & _SEGMENT_ENTRY_PROTECT) | _SEGMENT_ENTRY_INVALID; 2072 list_add(&page->lru, &sg->pt_list); 2073 if (fake) { 2074 /* nothing to protect for fake tables */ 2075 *table &= ~_SEGMENT_ENTRY_INVALID; 2076 spin_unlock(&sg->guest_table_lock); 2077 return 0; 2078 } 2079 spin_unlock(&sg->guest_table_lock); 2080 /* Make pgt read-only in parent gmap page table (not the pgste) */ 2081 raddr = (saddr & _SEGMENT_MASK) | _SHADOW_RMAP_SEGMENT; 2082 origin = pgt & _SEGMENT_ENTRY_ORIGIN & PAGE_MASK; 2083 rc = gmap_protect_rmap(sg, raddr, origin, PAGE_SIZE); 2084 spin_lock(&sg->guest_table_lock); 2085 if (!rc) { 2086 table = gmap_table_walk(sg, saddr, 1); 2087 if (!table || (*table & _SEGMENT_ENTRY_ORIGIN) != 2088 (unsigned long) s_pgt) 2089 rc = -EAGAIN; /* Race with unshadow */ 2090 else 2091 *table &= ~_SEGMENT_ENTRY_INVALID; 2092 } else { 2093 gmap_unshadow_pgt(sg, raddr); 2094 } 2095 spin_unlock(&sg->guest_table_lock); 2096 return rc; 2097 out_free: 2098 spin_unlock(&sg->guest_table_lock); 2099 page_table_free_pgste(page); 2100 return rc; 2101 2102 } 2103 EXPORT_SYMBOL_GPL(gmap_shadow_pgt); 2104 2105 /** 2106 * gmap_shadow_page - create a shadow page mapping 2107 * @sg: pointer to the shadow guest address space structure 2108 * @saddr: faulting address in the shadow gmap 2109 * @pte: pte in parent gmap address space to get shadowed 2110 * 2111 * Returns 
0 if successfully shadowed or already shadowed, -EAGAIN if the 2112 * shadow table structure is incomplete, -ENOMEM if out of memory and 2113 * -EFAULT if an address in the parent gmap could not be resolved. 2114 * 2115 * Called with sg->mm->mmap_lock in read. 2116 */ 2117 int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte) 2118 { 2119 struct gmap *parent; 2120 struct gmap_rmap *rmap; 2121 unsigned long vmaddr, paddr; 2122 spinlock_t *ptl; 2123 pte_t *sptep, *tptep; 2124 int prot; 2125 int rc; 2126 2127 BUG_ON(!gmap_is_shadow(sg)); 2128 parent = sg->parent; 2129 prot = (pte_val(pte) & _PAGE_PROTECT) ? PROT_READ : PROT_WRITE; 2130 2131 rmap = kzalloc(sizeof(*rmap), GFP_KERNEL_ACCOUNT); 2132 if (!rmap) 2133 return -ENOMEM; 2134 rmap->raddr = (saddr & PAGE_MASK) | _SHADOW_RMAP_PGTABLE; 2135 2136 while (1) { 2137 paddr = pte_val(pte) & PAGE_MASK; 2138 vmaddr = __gmap_translate(parent, paddr); 2139 if (IS_ERR_VALUE(vmaddr)) { 2140 rc = vmaddr; 2141 break; 2142 } 2143 rc = radix_tree_preload(GFP_KERNEL_ACCOUNT); 2144 if (rc) 2145 break; 2146 rc = -EAGAIN; 2147 sptep = gmap_pte_op_walk(parent, paddr, &ptl); 2148 if (sptep) { 2149 spin_lock(&sg->guest_table_lock); 2150 /* Get page table pointer */ 2151 tptep = (pte_t *) gmap_table_walk(sg, saddr, 0); 2152 if (!tptep) { 2153 spin_unlock(&sg->guest_table_lock); 2154 gmap_pte_op_end(ptl); 2155 radix_tree_preload_end(); 2156 break; 2157 } 2158 rc = ptep_shadow_pte(sg->mm, saddr, sptep, tptep, pte); 2159 if (rc > 0) { 2160 /* Success and a new mapping */ 2161 gmap_insert_rmap(sg, vmaddr, rmap); 2162 rmap = NULL; 2163 rc = 0; 2164 } 2165 gmap_pte_op_end(ptl); 2166 spin_unlock(&sg->guest_table_lock); 2167 } 2168 radix_tree_preload_end(); 2169 if (!rc) 2170 break; 2171 rc = gmap_pte_op_fixup(parent, paddr, vmaddr, prot); 2172 if (rc) 2173 break; 2174 } 2175 kfree(rmap); 2176 return rc; 2177 } 2178 EXPORT_SYMBOL_GPL(gmap_shadow_page); 2179 2180 /* 2181 * gmap_shadow_notify - handle notifications for shadow gmap 
2182 * 2183 * Called with sg->parent->shadow_lock. 2184 */ 2185 static void gmap_shadow_notify(struct gmap *sg, unsigned long vmaddr, 2186 unsigned long gaddr) 2187 { 2188 struct gmap_rmap *rmap, *rnext, *head; 2189 unsigned long start, end, bits, raddr; 2190 2191 BUG_ON(!gmap_is_shadow(sg)); 2192 2193 spin_lock(&sg->guest_table_lock); 2194 if (sg->removed) { 2195 spin_unlock(&sg->guest_table_lock); 2196 return; 2197 } 2198 /* Check for top level table */ 2199 start = sg->orig_asce & _ASCE_ORIGIN; 2200 end = start + ((sg->orig_asce & _ASCE_TABLE_LENGTH) + 1) * PAGE_SIZE; 2201 if (!(sg->orig_asce & _ASCE_REAL_SPACE) && gaddr >= start && 2202 gaddr < end) { 2203 /* The complete shadow table has to go */ 2204 gmap_unshadow(sg); 2205 spin_unlock(&sg->guest_table_lock); 2206 list_del(&sg->list); 2207 gmap_put(sg); 2208 return; 2209 } 2210 /* Remove the page table tree from on specific entry */ 2211 head = radix_tree_delete(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT); 2212 gmap_for_each_rmap_safe(rmap, rnext, head) { 2213 bits = rmap->raddr & _SHADOW_RMAP_MASK; 2214 raddr = rmap->raddr ^ bits; 2215 switch (bits) { 2216 case _SHADOW_RMAP_REGION1: 2217 gmap_unshadow_r2t(sg, raddr); 2218 break; 2219 case _SHADOW_RMAP_REGION2: 2220 gmap_unshadow_r3t(sg, raddr); 2221 break; 2222 case _SHADOW_RMAP_REGION3: 2223 gmap_unshadow_sgt(sg, raddr); 2224 break; 2225 case _SHADOW_RMAP_SEGMENT: 2226 gmap_unshadow_pgt(sg, raddr); 2227 break; 2228 case _SHADOW_RMAP_PGTABLE: 2229 gmap_unshadow_page(sg, raddr); 2230 break; 2231 } 2232 kfree(rmap); 2233 } 2234 spin_unlock(&sg->guest_table_lock); 2235 } 2236 2237 /** 2238 * ptep_notify - call all invalidation callbacks for a specific pte. 
2239 * @mm: pointer to the process mm_struct 2240 * @vmaddr: virtual address in the process address space 2241 * @pte: pointer to the page table entry 2242 * @bits: bits from the pgste that caused the notify call 2243 * 2244 * This function is assumed to be called with the page table lock held 2245 * for the pte to notify. 2246 */ 2247 void ptep_notify(struct mm_struct *mm, unsigned long vmaddr, 2248 pte_t *pte, unsigned long bits) 2249 { 2250 unsigned long offset, gaddr = 0; 2251 unsigned long *table; 2252 struct gmap *gmap, *sg, *next; 2253 2254 offset = ((unsigned long) pte) & (255 * sizeof(pte_t)); 2255 offset = offset * (PAGE_SIZE / sizeof(pte_t)); 2256 rcu_read_lock(); 2257 list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { 2258 spin_lock(&gmap->guest_table_lock); 2259 table = radix_tree_lookup(&gmap->host_to_guest, 2260 vmaddr >> PMD_SHIFT); 2261 if (table) 2262 gaddr = __gmap_segment_gaddr(table) + offset; 2263 spin_unlock(&gmap->guest_table_lock); 2264 if (!table) 2265 continue; 2266 2267 if (!list_empty(&gmap->children) && (bits & PGSTE_VSIE_BIT)) { 2268 spin_lock(&gmap->shadow_lock); 2269 list_for_each_entry_safe(sg, next, 2270 &gmap->children, list) 2271 gmap_shadow_notify(sg, vmaddr, gaddr); 2272 spin_unlock(&gmap->shadow_lock); 2273 } 2274 if (bits & PGSTE_IN_BIT) 2275 gmap_call_notifier(gmap, gaddr, gaddr + PAGE_SIZE - 1); 2276 } 2277 rcu_read_unlock(); 2278 } 2279 EXPORT_SYMBOL_GPL(ptep_notify); 2280 2281 static void pmdp_notify_gmap(struct gmap *gmap, pmd_t *pmdp, 2282 unsigned long gaddr) 2283 { 2284 set_pmd(pmdp, clear_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_IN))); 2285 gmap_call_notifier(gmap, gaddr, gaddr + HPAGE_SIZE - 1); 2286 } 2287 2288 /** 2289 * gmap_pmdp_xchg - exchange a gmap pmd with another 2290 * @gmap: pointer to the guest address space structure 2291 * @pmdp: pointer to the pmd entry 2292 * @new: replacement entry 2293 * @gaddr: the affected guest address 2294 * 2295 * This function is assumed to be called with the 
guest_table_lock 2296 * held. 2297 */ 2298 static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *pmdp, pmd_t new, 2299 unsigned long gaddr) 2300 { 2301 gaddr &= HPAGE_MASK; 2302 pmdp_notify_gmap(gmap, pmdp, gaddr); 2303 new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_GMAP_IN)); 2304 if (MACHINE_HAS_TLB_GUEST) 2305 __pmdp_idte(gaddr, (pmd_t *)pmdp, IDTE_GUEST_ASCE, gmap->asce, 2306 IDTE_GLOBAL); 2307 else if (MACHINE_HAS_IDTE) 2308 __pmdp_idte(gaddr, (pmd_t *)pmdp, 0, 0, IDTE_GLOBAL); 2309 else 2310 __pmdp_csp(pmdp); 2311 set_pmd(pmdp, new); 2312 } 2313 2314 static void gmap_pmdp_clear(struct mm_struct *mm, unsigned long vmaddr, 2315 int purge) 2316 { 2317 pmd_t *pmdp; 2318 struct gmap *gmap; 2319 unsigned long gaddr; 2320 2321 rcu_read_lock(); 2322 list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { 2323 spin_lock(&gmap->guest_table_lock); 2324 pmdp = (pmd_t *)radix_tree_delete(&gmap->host_to_guest, 2325 vmaddr >> PMD_SHIFT); 2326 if (pmdp) { 2327 gaddr = __gmap_segment_gaddr((unsigned long *)pmdp); 2328 pmdp_notify_gmap(gmap, pmdp, gaddr); 2329 WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | 2330 _SEGMENT_ENTRY_GMAP_UC)); 2331 if (purge) 2332 __pmdp_csp(pmdp); 2333 set_pmd(pmdp, __pmd(_SEGMENT_ENTRY_EMPTY)); 2334 } 2335 spin_unlock(&gmap->guest_table_lock); 2336 } 2337 rcu_read_unlock(); 2338 } 2339 2340 /** 2341 * gmap_pmdp_invalidate - invalidate all affected guest pmd entries without 2342 * flushing 2343 * @mm: pointer to the process mm_struct 2344 * @vmaddr: virtual address in the process address space 2345 */ 2346 void gmap_pmdp_invalidate(struct mm_struct *mm, unsigned long vmaddr) 2347 { 2348 gmap_pmdp_clear(mm, vmaddr, 0); 2349 } 2350 EXPORT_SYMBOL_GPL(gmap_pmdp_invalidate); 2351 2352 /** 2353 * gmap_pmdp_csp - csp all affected guest pmd entries 2354 * @mm: pointer to the process mm_struct 2355 * @vmaddr: virtual address in the process address space 2356 */ 2357 void gmap_pmdp_csp(struct mm_struct *mm, unsigned long vmaddr) 2358 { 
2359 gmap_pmdp_clear(mm, vmaddr, 1); 2360 } 2361 EXPORT_SYMBOL_GPL(gmap_pmdp_csp); 2362 2363 /** 2364 * gmap_pmdp_idte_local - invalidate and clear a guest pmd entry 2365 * @mm: pointer to the process mm_struct 2366 * @vmaddr: virtual address in the process address space 2367 */ 2368 void gmap_pmdp_idte_local(struct mm_struct *mm, unsigned long vmaddr) 2369 { 2370 unsigned long *entry, gaddr; 2371 struct gmap *gmap; 2372 pmd_t *pmdp; 2373 2374 rcu_read_lock(); 2375 list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { 2376 spin_lock(&gmap->guest_table_lock); 2377 entry = radix_tree_delete(&gmap->host_to_guest, 2378 vmaddr >> PMD_SHIFT); 2379 if (entry) { 2380 pmdp = (pmd_t *)entry; 2381 gaddr = __gmap_segment_gaddr(entry); 2382 pmdp_notify_gmap(gmap, pmdp, gaddr); 2383 WARN_ON(*entry & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | 2384 _SEGMENT_ENTRY_GMAP_UC)); 2385 if (MACHINE_HAS_TLB_GUEST) 2386 __pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE, 2387 gmap->asce, IDTE_LOCAL); 2388 else if (MACHINE_HAS_IDTE) 2389 __pmdp_idte(gaddr, pmdp, 0, 0, IDTE_LOCAL); 2390 *entry = _SEGMENT_ENTRY_EMPTY; 2391 } 2392 spin_unlock(&gmap->guest_table_lock); 2393 } 2394 rcu_read_unlock(); 2395 } 2396 EXPORT_SYMBOL_GPL(gmap_pmdp_idte_local); 2397 2398 /** 2399 * gmap_pmdp_idte_global - invalidate and clear a guest pmd entry 2400 * @mm: pointer to the process mm_struct 2401 * @vmaddr: virtual address in the process address space 2402 */ 2403 void gmap_pmdp_idte_global(struct mm_struct *mm, unsigned long vmaddr) 2404 { 2405 unsigned long *entry, gaddr; 2406 struct gmap *gmap; 2407 pmd_t *pmdp; 2408 2409 rcu_read_lock(); 2410 list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { 2411 spin_lock(&gmap->guest_table_lock); 2412 entry = radix_tree_delete(&gmap->host_to_guest, 2413 vmaddr >> PMD_SHIFT); 2414 if (entry) { 2415 pmdp = (pmd_t *)entry; 2416 gaddr = __gmap_segment_gaddr(entry); 2417 pmdp_notify_gmap(gmap, pmdp, gaddr); 2418 WARN_ON(*entry & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | 
2419 _SEGMENT_ENTRY_GMAP_UC)); 2420 if (MACHINE_HAS_TLB_GUEST) 2421 __pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE, 2422 gmap->asce, IDTE_GLOBAL); 2423 else if (MACHINE_HAS_IDTE) 2424 __pmdp_idte(gaddr, pmdp, 0, 0, IDTE_GLOBAL); 2425 else 2426 __pmdp_csp(pmdp); 2427 *entry = _SEGMENT_ENTRY_EMPTY; 2428 } 2429 spin_unlock(&gmap->guest_table_lock); 2430 } 2431 rcu_read_unlock(); 2432 } 2433 EXPORT_SYMBOL_GPL(gmap_pmdp_idte_global); 2434 2435 /** 2436 * gmap_test_and_clear_dirty_pmd - test and reset segment dirty status 2437 * @gmap: pointer to guest address space 2438 * @pmdp: pointer to the pmd to be tested 2439 * @gaddr: virtual address in the guest address space 2440 * 2441 * This function is assumed to be called with the guest_table_lock 2442 * held. 2443 */ 2444 static bool gmap_test_and_clear_dirty_pmd(struct gmap *gmap, pmd_t *pmdp, 2445 unsigned long gaddr) 2446 { 2447 if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID) 2448 return false; 2449 2450 /* Already protected memory, which did not change is clean */ 2451 if (pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT && 2452 !(pmd_val(*pmdp) & _SEGMENT_ENTRY_GMAP_UC)) 2453 return false; 2454 2455 /* Clear UC indication and reset protection */ 2456 set_pmd(pmdp, clear_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_UC))); 2457 gmap_protect_pmd(gmap, gaddr, pmdp, PROT_READ, 0); 2458 return true; 2459 } 2460 2461 /** 2462 * gmap_sync_dirty_log_pmd - set bitmap based on dirty status of segment 2463 * @gmap: pointer to guest address space 2464 * @bitmap: dirty bitmap for this pmd 2465 * @gaddr: virtual address in the guest address space 2466 * @vmaddr: virtual address in the host address space 2467 * 2468 * This function is assumed to be called with the guest_table_lock 2469 * held. 
2470 */ 2471 void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long bitmap[4], 2472 unsigned long gaddr, unsigned long vmaddr) 2473 { 2474 int i; 2475 pmd_t *pmdp; 2476 pte_t *ptep; 2477 spinlock_t *ptl; 2478 2479 pmdp = gmap_pmd_op_walk(gmap, gaddr); 2480 if (!pmdp) 2481 return; 2482 2483 if (pmd_large(*pmdp)) { 2484 if (gmap_test_and_clear_dirty_pmd(gmap, pmdp, gaddr)) 2485 bitmap_fill(bitmap, _PAGE_ENTRIES); 2486 } else { 2487 for (i = 0; i < _PAGE_ENTRIES; i++, vmaddr += PAGE_SIZE) { 2488 ptep = pte_alloc_map_lock(gmap->mm, pmdp, vmaddr, &ptl); 2489 if (!ptep) 2490 continue; 2491 if (ptep_test_and_clear_uc(gmap->mm, vmaddr, ptep)) 2492 set_bit(i, bitmap); 2493 spin_unlock(ptl); 2494 } 2495 } 2496 gmap_pmd_op_end(gmap, pmdp); 2497 } 2498 EXPORT_SYMBOL_GPL(gmap_sync_dirty_log_pmd); 2499 2500 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 2501 static int thp_split_walk_pmd_entry(pmd_t *pmd, unsigned long addr, 2502 unsigned long end, struct mm_walk *walk) 2503 { 2504 struct vm_area_struct *vma = walk->vma; 2505 2506 split_huge_pmd(vma, pmd, addr); 2507 return 0; 2508 } 2509 2510 static const struct mm_walk_ops thp_split_walk_ops = { 2511 .pmd_entry = thp_split_walk_pmd_entry, 2512 }; 2513 2514 static inline void thp_split_mm(struct mm_struct *mm) 2515 { 2516 struct vm_area_struct *vma; 2517 VMA_ITERATOR(vmi, mm, 0); 2518 2519 for_each_vma(vmi, vma) { 2520 vma->vm_flags &= ~VM_HUGEPAGE; 2521 vma->vm_flags |= VM_NOHUGEPAGE; 2522 walk_page_vma(vma, &thp_split_walk_ops, NULL); 2523 } 2524 mm->def_flags |= VM_NOHUGEPAGE; 2525 } 2526 #else 2527 static inline void thp_split_mm(struct mm_struct *mm) 2528 { 2529 } 2530 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 2531 2532 /* 2533 * Remove all empty zero pages from the mapping for lazy refaulting 2534 * - This must be called after mm->context.has_pgste is set, to avoid 2535 * future creation of zero pages 2536 * - This must be called after THP was enabled 2537 */ 2538 static int __zap_zero_pages(pmd_t *pmd, unsigned long start, 
2539 unsigned long end, struct mm_walk *walk) 2540 { 2541 unsigned long addr; 2542 2543 for (addr = start; addr != end; addr += PAGE_SIZE) { 2544 pte_t *ptep; 2545 spinlock_t *ptl; 2546 2547 ptep = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); 2548 if (is_zero_pfn(pte_pfn(*ptep))) 2549 ptep_xchg_direct(walk->mm, addr, ptep, __pte(_PAGE_INVALID)); 2550 pte_unmap_unlock(ptep, ptl); 2551 } 2552 return 0; 2553 } 2554 2555 static const struct mm_walk_ops zap_zero_walk_ops = { 2556 .pmd_entry = __zap_zero_pages, 2557 }; 2558 2559 /* 2560 * switch on pgstes for its userspace process (for kvm) 2561 */ 2562 int s390_enable_sie(void) 2563 { 2564 struct mm_struct *mm = current->mm; 2565 2566 /* Do we have pgstes? if yes, we are done */ 2567 if (mm_has_pgste(mm)) 2568 return 0; 2569 /* Fail if the page tables are 2K */ 2570 if (!mm_alloc_pgste(mm)) 2571 return -EINVAL; 2572 mmap_write_lock(mm); 2573 mm->context.has_pgste = 1; 2574 /* split thp mappings and disable thp for future mappings */ 2575 thp_split_mm(mm); 2576 walk_page_range(mm, 0, TASK_SIZE, &zap_zero_walk_ops, NULL); 2577 mmap_write_unlock(mm); 2578 return 0; 2579 } 2580 EXPORT_SYMBOL_GPL(s390_enable_sie); 2581 2582 int gmap_mark_unmergeable(void) 2583 { 2584 struct mm_struct *mm = current->mm; 2585 struct vm_area_struct *vma; 2586 int ret; 2587 VMA_ITERATOR(vmi, mm, 0); 2588 2589 for_each_vma(vmi, vma) { 2590 ret = ksm_madvise(vma, vma->vm_start, vma->vm_end, 2591 MADV_UNMERGEABLE, &vma->vm_flags); 2592 if (ret) 2593 return ret; 2594 } 2595 mm->def_flags &= ~VM_MERGEABLE; 2596 return 0; 2597 } 2598 EXPORT_SYMBOL_GPL(gmap_mark_unmergeable); 2599 2600 /* 2601 * Enable storage key handling from now on and initialize the storage 2602 * keys with the default key. 
2603 */ 2604 static int __s390_enable_skey_pte(pte_t *pte, unsigned long addr, 2605 unsigned long next, struct mm_walk *walk) 2606 { 2607 /* Clear storage key */ 2608 ptep_zap_key(walk->mm, addr, pte); 2609 return 0; 2610 } 2611 2612 /* 2613 * Give a chance to schedule after setting a key to 256 pages. 2614 * We only hold the mm lock, which is a rwsem and the kvm srcu. 2615 * Both can sleep. 2616 */ 2617 static int __s390_enable_skey_pmd(pmd_t *pmd, unsigned long addr, 2618 unsigned long next, struct mm_walk *walk) 2619 { 2620 cond_resched(); 2621 return 0; 2622 } 2623 2624 static int __s390_enable_skey_hugetlb(pte_t *pte, unsigned long addr, 2625 unsigned long hmask, unsigned long next, 2626 struct mm_walk *walk) 2627 { 2628 pmd_t *pmd = (pmd_t *)pte; 2629 unsigned long start, end; 2630 struct page *page = pmd_page(*pmd); 2631 2632 /* 2633 * The write check makes sure we do not set a key on shared 2634 * memory. This is needed as the walker does not differentiate 2635 * between actual guest memory and the process executable or 2636 * shared libraries. 
2637 */ 2638 if (pmd_val(*pmd) & _SEGMENT_ENTRY_INVALID || 2639 !(pmd_val(*pmd) & _SEGMENT_ENTRY_WRITE)) 2640 return 0; 2641 2642 start = pmd_val(*pmd) & HPAGE_MASK; 2643 end = start + HPAGE_SIZE - 1; 2644 __storage_key_init_range(start, end); 2645 set_bit(PG_arch_1, &page->flags); 2646 cond_resched(); 2647 return 0; 2648 } 2649 2650 static const struct mm_walk_ops enable_skey_walk_ops = { 2651 .hugetlb_entry = __s390_enable_skey_hugetlb, 2652 .pte_entry = __s390_enable_skey_pte, 2653 .pmd_entry = __s390_enable_skey_pmd, 2654 }; 2655 2656 int s390_enable_skey(void) 2657 { 2658 struct mm_struct *mm = current->mm; 2659 int rc = 0; 2660 2661 mmap_write_lock(mm); 2662 if (mm_uses_skeys(mm)) 2663 goto out_up; 2664 2665 mm->context.uses_skeys = 1; 2666 rc = gmap_mark_unmergeable(); 2667 if (rc) { 2668 mm->context.uses_skeys = 0; 2669 goto out_up; 2670 } 2671 walk_page_range(mm, 0, TASK_SIZE, &enable_skey_walk_ops, NULL); 2672 2673 out_up: 2674 mmap_write_unlock(mm); 2675 return rc; 2676 } 2677 EXPORT_SYMBOL_GPL(s390_enable_skey); 2678 2679 /* 2680 * Reset CMMA state, make all pages stable again. 
2681 */ 2682 static int __s390_reset_cmma(pte_t *pte, unsigned long addr, 2683 unsigned long next, struct mm_walk *walk) 2684 { 2685 ptep_zap_unused(walk->mm, addr, pte, 1); 2686 return 0; 2687 } 2688 2689 static const struct mm_walk_ops reset_cmma_walk_ops = { 2690 .pte_entry = __s390_reset_cmma, 2691 }; 2692 2693 void s390_reset_cmma(struct mm_struct *mm) 2694 { 2695 mmap_write_lock(mm); 2696 walk_page_range(mm, 0, TASK_SIZE, &reset_cmma_walk_ops, NULL); 2697 mmap_write_unlock(mm); 2698 } 2699 EXPORT_SYMBOL_GPL(s390_reset_cmma); 2700 2701 #define GATHER_GET_PAGES 32 2702 2703 struct reset_walk_state { 2704 unsigned long next; 2705 unsigned long count; 2706 unsigned long pfns[GATHER_GET_PAGES]; 2707 }; 2708 2709 static int s390_gather_pages(pte_t *ptep, unsigned long addr, 2710 unsigned long next, struct mm_walk *walk) 2711 { 2712 struct reset_walk_state *p = walk->private; 2713 pte_t pte = READ_ONCE(*ptep); 2714 2715 if (pte_present(pte)) { 2716 /* we have a reference from the mapping, take an extra one */ 2717 get_page(phys_to_page(pte_val(pte))); 2718 p->pfns[p->count] = phys_to_pfn(pte_val(pte)); 2719 p->next = next; 2720 p->count++; 2721 } 2722 return p->count >= GATHER_GET_PAGES; 2723 } 2724 2725 static const struct mm_walk_ops gather_pages_ops = { 2726 .pte_entry = s390_gather_pages, 2727 }; 2728 2729 /* 2730 * Call the Destroy secure page UVC on each page in the given array of PFNs. 2731 * Each page needs to have an extra reference, which will be released here. 
2732 */ 2733 void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns) 2734 { 2735 unsigned long i; 2736 2737 for (i = 0; i < count; i++) { 2738 /* we always have an extra reference */ 2739 uv_destroy_owned_page(pfn_to_phys(pfns[i])); 2740 /* get rid of the extra reference */ 2741 put_page(pfn_to_page(pfns[i])); 2742 cond_resched(); 2743 } 2744 } 2745 EXPORT_SYMBOL_GPL(s390_uv_destroy_pfns); 2746 2747 /** 2748 * __s390_uv_destroy_range - Call the destroy secure page UVC on each page 2749 * in the given range of the given address space. 2750 * @mm: the mm to operate on 2751 * @start: the start of the range 2752 * @end: the end of the range 2753 * @interruptible: if not 0, stop when a fatal signal is received 2754 * 2755 * Walk the given range of the given address space and call the destroy 2756 * secure page UVC on each page. Optionally exit early if a fatal signal is 2757 * pending. 2758 * 2759 * Return: 0 on success, -EINTR if the function stopped before completing 2760 */ 2761 int __s390_uv_destroy_range(struct mm_struct *mm, unsigned long start, 2762 unsigned long end, bool interruptible) 2763 { 2764 struct reset_walk_state state = { .next = start }; 2765 int r = 1; 2766 2767 while (r > 0) { 2768 state.count = 0; 2769 mmap_read_lock(mm); 2770 r = walk_page_range(mm, state.next, end, &gather_pages_ops, &state); 2771 mmap_read_unlock(mm); 2772 cond_resched(); 2773 s390_uv_destroy_pfns(state.count, state.pfns); 2774 if (interruptible && fatal_signal_pending(current)) 2775 return -EINTR; 2776 } 2777 return 0; 2778 } 2779 EXPORT_SYMBOL_GPL(__s390_uv_destroy_range); 2780 2781 /** 2782 * s390_unlist_old_asce - Remove the topmost level of page tables from the 2783 * list of page tables of the gmap. 2784 * @gmap: the gmap whose table is to be removed 2785 * 2786 * On s390x, KVM keeps a list of all pages containing the page tables of the 2787 * gmap (the CRST list). This list is used at tear down time to free all 2788 * pages that are now not needed anymore. 
2789 * 2790 * This function removes the topmost page of the tree (the one pointed to by 2791 * the ASCE) from the CRST list. 2792 * 2793 * This means that it will not be freed when the VM is torn down, and needs 2794 * to be handled separately by the caller, unless a leak is actually 2795 * intended. Notice that this function will only remove the page from the 2796 * list, the page will still be used as a top level page table (and ASCE). 2797 */ 2798 void s390_unlist_old_asce(struct gmap *gmap) 2799 { 2800 struct page *old; 2801 2802 old = virt_to_page(gmap->table); 2803 spin_lock(&gmap->guest_table_lock); 2804 list_del(&old->lru); 2805 /* 2806 * Sometimes the topmost page might need to be "removed" multiple 2807 * times, for example if the VM is rebooted into secure mode several 2808 * times concurrently, or if s390_replace_asce fails after calling 2809 * s390_remove_old_asce and is attempted again later. In that case 2810 * the old asce has been removed from the list, and therefore it 2811 * will not be freed when the VM terminates, but the ASCE is still 2812 * in use and still pointed to. 2813 * A subsequent call to replace_asce will follow the pointer and try 2814 * to remove the same page from the list again. 2815 * Therefore it's necessary that the page of the ASCE has valid 2816 * pointers, so list_del can work (and do nothing) without 2817 * dereferencing stale or invalid pointers. 2818 */ 2819 INIT_LIST_HEAD(&old->lru); 2820 spin_unlock(&gmap->guest_table_lock); 2821 } 2822 EXPORT_SYMBOL_GPL(s390_unlist_old_asce); 2823 2824 /** 2825 * s390_replace_asce - Try to replace the current ASCE of a gmap with a copy 2826 * @gmap: the gmap whose ASCE needs to be replaced 2827 * 2828 * If the allocation of the new top level page table fails, the ASCE is not 2829 * replaced. 2830 * In any case, the old ASCE is always removed from the gmap CRST list. 
2831 * Therefore the caller has to make sure to save a pointer to it 2832 * beforehand, unless a leak is actually intended. 2833 */ 2834 int s390_replace_asce(struct gmap *gmap) 2835 { 2836 unsigned long asce; 2837 struct page *page; 2838 void *table; 2839 2840 s390_unlist_old_asce(gmap); 2841 2842 page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER); 2843 if (!page) 2844 return -ENOMEM; 2845 table = page_to_virt(page); 2846 memcpy(table, gmap->table, 1UL << (CRST_ALLOC_ORDER + PAGE_SHIFT)); 2847 2848 /* 2849 * The caller has to deal with the old ASCE, but here we make sure 2850 * the new one is properly added to the CRST list, so that 2851 * it will be freed when the VM is torn down. 2852 */ 2853 spin_lock(&gmap->guest_table_lock); 2854 list_add(&page->lru, &gmap->crst_list); 2855 spin_unlock(&gmap->guest_table_lock); 2856 2857 /* Set new table origin while preserving existing ASCE control bits */ 2858 asce = (gmap->asce & ~_ASCE_ORIGIN) | __pa(table); 2859 WRITE_ONCE(gmap->asce, asce); 2860 WRITE_ONCE(gmap->mm->context.gmap_asce, asce); 2861 WRITE_ONCE(gmap->table, table); 2862 2863 return 0; 2864 } 2865 EXPORT_SYMBOL_GPL(s390_replace_asce); 2866