/*
 * Copyright IBM Corp. 2007, 2011
 * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
 */

#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/quicklist.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/swapops.h>

#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>

#ifndef CONFIG_64BIT
#define ALLOC_ORDER	1
#define FRAG_MASK	0x0f
#else
#define ALLOC_ORDER	2
#define FRAG_MASK	0x03
#endif

unsigned long *crst_table_alloc(struct mm_struct *mm)
{
	struct page *page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);

	if (!page)
		return NULL;
	return (unsigned long *) page_to_phys(page);
}

void crst_table_free(struct mm_struct *mm, unsigned long *table)
{
	free_pages((unsigned long) table, ALLOC_ORDER);
}

#ifdef CONFIG_64BIT
static void __crst_table_upgrade(void *arg)
{
	struct mm_struct *mm = arg;

	if (current->active_mm == mm) {
		clear_user_asce();
		set_user_asce(mm);
	}
	__tlb_flush_local();
}

int crst_table_upgrade(struct mm_struct *mm, unsigned long limit)
{
	unsigned long *table, *pgd;
	unsigned long entry;
	int flush;

	BUG_ON(limit > (1UL << 53));
	flush = 0;
repeat:
	table = crst_table_alloc(mm);
	if (!table)
		return -ENOMEM;
	spin_lock_bh(&mm->page_table_lock);
	if (mm->context.asce_limit < limit) {
		pgd = (unsigned long *) mm->pgd;
		if (mm->context.asce_limit <= (1UL << 31)) {
			entry = _REGION3_ENTRY_EMPTY;
			mm->context.asce_limit = 1UL << 42;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION3;
		} else {
			entry = _REGION2_ENTRY_EMPTY;
			mm->context.asce_limit = 1UL << 53;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION2;
		}
		crst_table_init(table, entry);
		pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd);
		mm->pgd = (pgd_t *) table;
		mm->task_size = mm->context.asce_limit;
		table = NULL;
		flush = 1;
	}
	spin_unlock_bh(&mm->page_table_lock);
	if (table)
		crst_table_free(mm, table);
	if (mm->context.asce_limit < limit)
		goto repeat;
	if (flush)
		on_each_cpu(__crst_table_upgrade, mm, 0);
	return 0;
}

void crst_table_downgrade(struct mm_struct *mm, unsigned long limit)
{
	pgd_t *pgd;

	if (current->active_mm == mm) {
		clear_user_asce();
		__tlb_flush_mm(mm);
	}
	while (mm->context.asce_limit > limit) {
		pgd = mm->pgd;
		switch (pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) {
		case _REGION_ENTRY_TYPE_R2:
			mm->context.asce_limit = 1UL << 42;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION3;
			break;
		case _REGION_ENTRY_TYPE_R3:
			mm->context.asce_limit = 1UL << 31;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_SEGMENT;
			break;
		default:
			BUG();
		}
		mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
		mm->task_size = mm->context.asce_limit;
		crst_table_free(mm, (unsigned long *) pgd);
	}
	if (current->active_mm == mm)
		set_user_asce(mm);
}
#endif
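
/*
 * Illustrative note (not part of the original source): the address space
 * "rungs" used above are 1UL << 31 (2 GB, segment table only),
 * 1UL << 42 (4 TB, region-third table) and 1UL << 53 (8 PB, region-second
 * table).  A hypothetical caller that needs to map an address beyond the
 * current limit might do something like
 *
 *	if (addr + len > mm->context.asce_limit &&
 *	    addr + len <= (1UL << 53))
 *		rc = crst_table_upgrade(mm, addr + len);
 *
 * crst_table_upgrade() retries until asce_limit covers the requested limit,
 * while crst_table_downgrade() walks the same ladder in reverse.
 */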

#ifdef CONFIG_PGSTE

/**
 * gmap_alloc - allocate a guest address space
 * @mm: pointer to the parent mm_struct
 *
 * Returns a guest address space structure.
 */
struct gmap *gmap_alloc(struct mm_struct *mm)
{
	struct gmap *gmap;
	struct page *page;
	unsigned long *table;

	gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL);
	if (!gmap)
		goto out;
	INIT_LIST_HEAD(&gmap->crst_list);
	gmap->mm = mm;
	page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
	if (!page)
		goto out_free;
	list_add(&page->lru, &gmap->crst_list);
	table = (unsigned long *) page_to_phys(page);
	crst_table_init(table, _REGION1_ENTRY_EMPTY);
	gmap->table = table;
	gmap->asce = _ASCE_TYPE_REGION1 | _ASCE_TABLE_LENGTH |
		     _ASCE_USER_BITS | __pa(table);
	list_add(&gmap->list, &mm->context.gmap_list);
	return gmap;

out_free:
	kfree(gmap);
out:
	return NULL;
}
EXPORT_SYMBOL_GPL(gmap_alloc);

static int gmap_unlink_segment(struct gmap *gmap, unsigned long *table)
{
	struct gmap_pgtable *mp;
	struct gmap_rmap *rmap;
	struct page *page;

	if (*table & _SEGMENT_ENTRY_INVALID)
		return 0;
	page = pfn_to_page(*table >> PAGE_SHIFT);
	mp = (struct gmap_pgtable *) page->index;
	list_for_each_entry(rmap, &mp->mapper, list) {
		if (rmap->entry != table)
			continue;
		list_del(&rmap->list);
		kfree(rmap);
		break;
	}
	*table = mp->vmaddr | _SEGMENT_ENTRY_INVALID | _SEGMENT_ENTRY_PROTECT;
	return 1;
}

static void gmap_flush_tlb(struct gmap *gmap)
{
	if (MACHINE_HAS_IDTE)
		__tlb_flush_asce(gmap->mm, (unsigned long) gmap->table |
				 _ASCE_TYPE_REGION1);
	else
		__tlb_flush_global();
}

/**
 * gmap_free - free a guest address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_free(struct gmap *gmap)
{
	struct page *page, *next;
	unsigned long *table;
	int i;

	/* Flush tlb. */
	if (MACHINE_HAS_IDTE)
		__tlb_flush_asce(gmap->mm, (unsigned long) gmap->table |
				 _ASCE_TYPE_REGION1);
	else
		__tlb_flush_global();

	/* Free all segment & region tables. */
	down_read(&gmap->mm->mmap_sem);
	spin_lock(&gmap->mm->page_table_lock);
	list_for_each_entry_safe(page, next, &gmap->crst_list, lru) {
		table = (unsigned long *) page_to_phys(page);
		if ((*table & _REGION_ENTRY_TYPE_MASK) == 0)
			/* Remove gmap rmap structures for segment table. */
			for (i = 0; i < PTRS_PER_PMD; i++, table++)
				gmap_unlink_segment(gmap, table);
		__free_pages(page, ALLOC_ORDER);
	}
	spin_unlock(&gmap->mm->page_table_lock);
	up_read(&gmap->mm->mmap_sem);
	list_del(&gmap->list);
	kfree(gmap);
}
EXPORT_SYMBOL_GPL(gmap_free);

/**
 * gmap_enable - switch primary space to the guest address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_enable(struct gmap *gmap)
{
	S390_lowcore.gmap = (unsigned long) gmap;
}
EXPORT_SYMBOL_GPL(gmap_enable);

/**
 * gmap_disable - switch back to the standard primary address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_disable(struct gmap *gmap)
{
	S390_lowcore.gmap = 0UL;
}
EXPORT_SYMBOL_GPL(gmap_disable);

/*
 * gmap_alloc_table is assumed to be called with mmap_sem held
 */
static int gmap_alloc_table(struct gmap *gmap,
			    unsigned long *table, unsigned long init)
	__releases(&gmap->mm->page_table_lock)
	__acquires(&gmap->mm->page_table_lock)
{
	struct page *page;
	unsigned long *new;

	/* since we don't free the gmap table until gmap_free we can unlock */
	spin_unlock(&gmap->mm->page_table_lock);
	page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
	spin_lock(&gmap->mm->page_table_lock);
	if (!page)
		return -ENOMEM;
	new = (unsigned long *) page_to_phys(page);
	crst_table_init(new, init);
	if (*table & _REGION_ENTRY_INVALID) {
		list_add(&page->lru, &gmap->crst_list);
		*table = (unsigned long) new | _REGION_ENTRY_LENGTH |
			(*table & _REGION_ENTRY_TYPE_MASK);
	} else
		__free_pages(page, ALLOC_ORDER);
	return 0;
}

/**
 * gmap_unmap_segment - unmap segment from the guest address space
 * @gmap: pointer to the guest address space structure
 * @to: address in the guest address space
 * @len: length of the memory area to unmap
 *
 * Returns 0 if the unmap succeeded, -EINVAL if not.
 */
int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len)
{
	unsigned long *table;
	unsigned long off;
	int flush;

	if ((to | len) & (PMD_SIZE - 1))
		return -EINVAL;
	if (len == 0 || to + len < to)
		return -EINVAL;

	flush = 0;
	down_read(&gmap->mm->mmap_sem);
	spin_lock(&gmap->mm->page_table_lock);
	for (off = 0; off < len; off += PMD_SIZE) {
		/* Walk the guest addr space page table */
		table = gmap->table + (((to + off) >> 53) & 0x7ff);
		if (*table & _REGION_ENTRY_INVALID)
			goto out;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 42) & 0x7ff);
		if (*table & _REGION_ENTRY_INVALID)
			goto out;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 31) & 0x7ff);
		if (*table & _REGION_ENTRY_INVALID)
			goto out;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 20) & 0x7ff);

		/* Clear segment table entry in guest address space. */
		flush |= gmap_unlink_segment(gmap, table);
		*table = _SEGMENT_ENTRY_INVALID;
	}
out:
	spin_unlock(&gmap->mm->page_table_lock);
	up_read(&gmap->mm->mmap_sem);
	if (flush)
		gmap_flush_tlb(gmap);
	return 0;
}
EXPORT_SYMBOL_GPL(gmap_unmap_segment);

/**
 * gmap_map_segment - map a segment to the guest address space
 * @gmap: pointer to the guest address space structure
 * @from: source address in the parent address space
 * @to: target address in the guest address space
 * @len: length of the memory area to map
 *
 * Returns 0 if the mmap succeeded, -EINVAL or -ENOMEM if not.
 */
int gmap_map_segment(struct gmap *gmap, unsigned long from,
		     unsigned long to, unsigned long len)
{
	unsigned long *table;
	unsigned long off;
	int flush;

	if ((from | to | len) & (PMD_SIZE - 1))
		return -EINVAL;
	if (len == 0 || from + len > TASK_MAX_SIZE ||
	    from + len < from || to + len < to)
		return -EINVAL;

	flush = 0;
	down_read(&gmap->mm->mmap_sem);
	spin_lock(&gmap->mm->page_table_lock);
	for (off = 0; off < len; off += PMD_SIZE) {
		/* Walk the gmap address space page table */
		table = gmap->table + (((to + off) >> 53) & 0x7ff);
		if ((*table & _REGION_ENTRY_INVALID) &&
		    gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY))
			goto out_unmap;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 42) & 0x7ff);
		if ((*table & _REGION_ENTRY_INVALID) &&
		    gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY))
			goto out_unmap;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 31) & 0x7ff);
		if ((*table & _REGION_ENTRY_INVALID) &&
		    gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY))
			goto out_unmap;
		table = (unsigned long *) (*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 20) & 0x7ff);

		/* Store 'from' address in an invalid segment table entry. */
		flush |= gmap_unlink_segment(gmap, table);
		*table = (from + off) | (_SEGMENT_ENTRY_INVALID |
					 _SEGMENT_ENTRY_PROTECT);
	}
	spin_unlock(&gmap->mm->page_table_lock);
	up_read(&gmap->mm->mmap_sem);
	if (flush)
		gmap_flush_tlb(gmap);
	return 0;

out_unmap:
	spin_unlock(&gmap->mm->page_table_lock);
	up_read(&gmap->mm->mmap_sem);
	gmap_unmap_segment(gmap, to, len);
	return -ENOMEM;
}
EXPORT_SYMBOL_GPL(gmap_map_segment);

static unsigned long *gmap_table_walk(unsigned long address, struct gmap *gmap)
{
	unsigned long *table;

	table = gmap->table + ((address >> 53) & 0x7ff);
	if (unlikely(*table & _REGION_ENTRY_INVALID))
		return ERR_PTR(-EFAULT);
	table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	table = table + ((address >> 42) & 0x7ff);
	if (unlikely(*table & _REGION_ENTRY_INVALID))
		return ERR_PTR(-EFAULT);
	table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	table = table + ((address >> 31) & 0x7ff);
	if (unlikely(*table & _REGION_ENTRY_INVALID))
		return ERR_PTR(-EFAULT);
	table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	table = table + ((address >> 20) & 0x7ff);
	return table;
}
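
/*
 * Illustrative note (not part of the original source): the gmap always uses
 * a full four level table, so every walk above slices the guest address the
 * same way: bits 63..53 index the region-first table, bits 52..42 the
 * region-second table, bits 41..31 the region-third table and bits 30..20
 * the segment table; each index is masked with 0x7ff (2048 entries per
 * table) and each segment entry covers one PMD_SIZE (1 MB) chunk.  For
 * example, guest address 0x80100000 yields the indices 0, 0, 1 and 1.
 */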

/**
 * __gmap_translate - translate a guest address to a user space address
 * @address: guest address
 * @gmap: pointer to guest mapping meta data structure
 *
 * Returns user space address which corresponds to the guest address or
 * -EFAULT if no such mapping exists.
 * This function does not establish potentially missing page table entries.
 * The mmap_sem of the mm that belongs to the address space must be held
 * when this function gets called.
 */
unsigned long __gmap_translate(unsigned long address, struct gmap *gmap)
{
	unsigned long *segment_ptr, vmaddr, segment;
	struct gmap_pgtable *mp;
	struct page *page;

	current->thread.gmap_addr = address;
	segment_ptr = gmap_table_walk(address, gmap);
	if (IS_ERR(segment_ptr))
		return PTR_ERR(segment_ptr);
	/* Convert the gmap address to an mm address. */
	segment = *segment_ptr;
	if (!(segment & _SEGMENT_ENTRY_INVALID)) {
		page = pfn_to_page(segment >> PAGE_SHIFT);
		mp = (struct gmap_pgtable *) page->index;
		return mp->vmaddr | (address & ~PMD_MASK);
	} else if (segment & _SEGMENT_ENTRY_PROTECT) {
		vmaddr = segment & _SEGMENT_ENTRY_ORIGIN;
		return vmaddr | (address & ~PMD_MASK);
	}
	return -EFAULT;
}
EXPORT_SYMBOL_GPL(__gmap_translate);

/**
 * gmap_translate - translate a guest address to a user space address
 * @address: guest address
 * @gmap: pointer to guest mapping meta data structure
 *
 * Returns user space address which corresponds to the guest address or
 * -EFAULT if no such mapping exists.
 * This function does not establish potentially missing page table entries.
 */
unsigned long gmap_translate(unsigned long address, struct gmap *gmap)
{
	unsigned long rc;

	down_read(&gmap->mm->mmap_sem);
	rc = __gmap_translate(address, gmap);
	up_read(&gmap->mm->mmap_sem);
	return rc;
}
EXPORT_SYMBOL_GPL(gmap_translate);

static int gmap_connect_pgtable(unsigned long address, unsigned long segment,
				unsigned long *segment_ptr, struct gmap *gmap)
{
	unsigned long vmaddr;
	struct vm_area_struct *vma;
	struct gmap_pgtable *mp;
	struct gmap_rmap *rmap;
	struct mm_struct *mm;
	struct page *page;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	mm = gmap->mm;
	vmaddr = segment & _SEGMENT_ENTRY_ORIGIN;
	vma = find_vma(mm, vmaddr);
	if (!vma || vma->vm_start > vmaddr)
		return -EFAULT;
	/* Walk the parent mm page table */
	pgd = pgd_offset(mm, vmaddr);
	pud = pud_alloc(mm, pgd, vmaddr);
	if (!pud)
		return -ENOMEM;
	pmd = pmd_alloc(mm, pud, vmaddr);
	if (!pmd)
		return -ENOMEM;
	if (!pmd_present(*pmd) &&
	    __pte_alloc(mm, vma, pmd, vmaddr))
		return -ENOMEM;
	/* large pmds cannot yet be handled */
	if (pmd_large(*pmd))
		return -EFAULT;
	/* pmd now points to a valid segment table entry. */
	rmap = kmalloc(sizeof(*rmap), GFP_KERNEL|__GFP_REPEAT);
	if (!rmap)
		return -ENOMEM;
	/* Link gmap segment table entry location to page table. */
	page = pmd_page(*pmd);
	mp = (struct gmap_pgtable *) page->index;
	rmap->gmap = gmap;
	rmap->entry = segment_ptr;
	rmap->vmaddr = address & PMD_MASK;
	spin_lock(&mm->page_table_lock);
	if (*segment_ptr == segment) {
		list_add(&rmap->list, &mp->mapper);
		/* Set gmap segment table entry to page table. */
		*segment_ptr = pmd_val(*pmd) & PAGE_MASK;
		rmap = NULL;
	}
	spin_unlock(&mm->page_table_lock);
	kfree(rmap);
	return 0;
}

static void gmap_disconnect_pgtable(struct mm_struct *mm, unsigned long *table)
{
	struct gmap_rmap *rmap, *next;
	struct gmap_pgtable *mp;
	struct page *page;
	int flush;

	flush = 0;
	spin_lock(&mm->page_table_lock);
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	mp = (struct gmap_pgtable *) page->index;
	list_for_each_entry_safe(rmap, next, &mp->mapper, list) {
		*rmap->entry = mp->vmaddr | (_SEGMENT_ENTRY_INVALID |
					     _SEGMENT_ENTRY_PROTECT);
		list_del(&rmap->list);
		kfree(rmap);
		flush = 1;
	}
	spin_unlock(&mm->page_table_lock);
	if (flush)
		__tlb_flush_global();
}

/*
 * this function is assumed to be called with mmap_sem held
 */
unsigned long __gmap_fault(unsigned long address, struct gmap *gmap)
{
	unsigned long *segment_ptr, segment;
	struct gmap_pgtable *mp;
	struct page *page;
	int rc;

	current->thread.gmap_addr = address;
	segment_ptr = gmap_table_walk(address, gmap);
	if (IS_ERR(segment_ptr))
		return -EFAULT;
	/* Convert the gmap address to an mm address. */
	while (1) {
		segment = *segment_ptr;
		if (!(segment & _SEGMENT_ENTRY_INVALID)) {
			/* Page table is present */
			page = pfn_to_page(segment >> PAGE_SHIFT);
			mp = (struct gmap_pgtable *) page->index;
			return mp->vmaddr | (address & ~PMD_MASK);
		}
		if (!(segment & _SEGMENT_ENTRY_PROTECT))
			/* Nothing mapped in the gmap address space. */
			break;
		rc = gmap_connect_pgtable(address, segment, segment_ptr, gmap);
		if (rc)
			return rc;
	}
	return -EFAULT;
}

unsigned long gmap_fault(unsigned long address, struct gmap *gmap)
{
	unsigned long rc;

	down_read(&gmap->mm->mmap_sem);
	rc = __gmap_fault(address, gmap);
	up_read(&gmap->mm->mmap_sem);

	return rc;
}
EXPORT_SYMBOL_GPL(gmap_fault);

static void gmap_zap_swap_entry(swp_entry_t entry, struct mm_struct *mm)
{
	if (!non_swap_entry(entry))
		dec_mm_counter(mm, MM_SWAPENTS);
	else if (is_migration_entry(entry)) {
		struct page *page = migration_entry_to_page(entry);

		if (PageAnon(page))
			dec_mm_counter(mm, MM_ANONPAGES);
		else
			dec_mm_counter(mm, MM_FILEPAGES);
	}
	free_swap_and_cache(entry);
}

/**
 * The mm->mmap_sem lock must be held
 */
static void gmap_zap_unused(struct mm_struct *mm, unsigned long address)
{
	unsigned long ptev, pgstev;
	spinlock_t *ptl;
	pgste_t pgste;
	pte_t *ptep, pte;

	ptep = get_locked_pte(mm, address, &ptl);
	if (unlikely(!ptep))
		return;
	pte = *ptep;
	if (!pte_swap(pte))
		goto out_pte;
	/* Zap unused and logically-zero pages */
	pgste = pgste_get_lock(ptep);
	pgstev = pgste_val(pgste);
	ptev = pte_val(pte);
	if (((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED) ||
	    ((pgstev & _PGSTE_GPS_ZERO) && (ptev & _PAGE_INVALID))) {
		gmap_zap_swap_entry(pte_to_swp_entry(pte), mm);
		pte_clear(mm, address, ptep);
	}
	pgste_set_unlock(ptep, pgste);
out_pte:
	pte_unmap_unlock(*ptep, ptl);
}

/*
 * this function is assumed to be called with mmap_sem held
 */
void __gmap_zap(unsigned long address, struct gmap *gmap)
{
	unsigned long *table, *segment_ptr;
	unsigned long segment, pgstev, ptev;
	struct gmap_pgtable *mp;
	struct page *page;

	segment_ptr = gmap_table_walk(address, gmap);
	if (IS_ERR(segment_ptr))
		return;
	segment = *segment_ptr;
	if (segment & _SEGMENT_ENTRY_INVALID)
		return;
	page = pfn_to_page(segment >> PAGE_SHIFT);
	mp = (struct gmap_pgtable *) page->index;
	address = mp->vmaddr | (address & ~PMD_MASK);
	/* Page table is present */
	table = (unsigned long *)(segment & _SEGMENT_ENTRY_ORIGIN);
	table = table + ((address >> 12) & 0xff);
	pgstev = table[PTRS_PER_PTE];
	ptev = table[0];
	/* quick check, checked again with locks held */
	if (((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED) ||
	    ((pgstev & _PGSTE_GPS_ZERO) && (ptev & _PAGE_INVALID)))
		gmap_zap_unused(gmap->mm, address);
}
EXPORT_SYMBOL_GPL(__gmap_zap);

void gmap_discard(unsigned long from, unsigned long to, struct gmap *gmap)
{
	unsigned long *table, address, size;
	struct vm_area_struct *vma;
	struct gmap_pgtable *mp;
	struct page *page;

	down_read(&gmap->mm->mmap_sem);
	address = from;
	while (address < to) {
		/* Walk the gmap address space page table */
		table = gmap->table + ((address >> 53) & 0x7ff);
		if (unlikely(*table & _REGION_ENTRY_INVALID)) {
			address = (address + PMD_SIZE) & PMD_MASK;
			continue;
		}
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + ((address >> 42) & 0x7ff);
		if (unlikely(*table & _REGION_ENTRY_INVALID)) {
			address = (address + PMD_SIZE) & PMD_MASK;
			continue;
		}
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + ((address >> 31) & 0x7ff);
		if (unlikely(*table & _REGION_ENTRY_INVALID)) {
			address = (address + PMD_SIZE) & PMD_MASK;
			continue;
		}
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + ((address >> 20) & 0x7ff);
		if (unlikely(*table & _SEGMENT_ENTRY_INVALID)) {
			address = (address + PMD_SIZE) & PMD_MASK;
			continue;
		}
		page = pfn_to_page(*table >> PAGE_SHIFT);
		mp = (struct gmap_pgtable *) page->index;
		vma = find_vma(gmap->mm, mp->vmaddr);
		size = min(to - address, PMD_SIZE - (address & ~PMD_MASK));
		zap_page_range(vma, mp->vmaddr | (address & ~PMD_MASK),
			       size, NULL);
		address = (address + PMD_SIZE) & PMD_MASK;
	}
	up_read(&gmap->mm->mmap_sem);
}
EXPORT_SYMBOL_GPL(gmap_discard);

static LIST_HEAD(gmap_notifier_list);
static DEFINE_SPINLOCK(gmap_notifier_lock);

/**
 * gmap_register_ipte_notifier - register a pte invalidation callback
 * @nb: pointer to the gmap notifier block
 */
void gmap_register_ipte_notifier(struct gmap_notifier *nb)
{
	spin_lock(&gmap_notifier_lock);
	list_add(&nb->list, &gmap_notifier_list);
	spin_unlock(&gmap_notifier_lock);
}
EXPORT_SYMBOL_GPL(gmap_register_ipte_notifier);

/**
 * gmap_unregister_ipte_notifier - remove a pte invalidation callback
 * @nb: pointer to the gmap notifier block
 */
void gmap_unregister_ipte_notifier(struct gmap_notifier *nb)
{
	spin_lock(&gmap_notifier_lock);
	list_del_init(&nb->list);
	spin_unlock(&gmap_notifier_lock);
}
EXPORT_SYMBOL_GPL(gmap_unregister_ipte_notifier);
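
/*
 * Illustrative sketch (not part of the original source): a consumer such as
 * a KVM-style backend would register a callback roughly like this, where
 * my_handler and my_notifier are hypothetical names:
 *
 *	static void my_handler(struct gmap *gmap, unsigned long gaddr)
 *	{
 *		... react to the invalidation of the guest page at gaddr ...
 *	}
 *	static struct gmap_notifier my_notifier = {
 *		.notifier_call	= my_handler,
 *	};
 *
 *	gmap_register_ipte_notifier(&my_notifier);
 *	rc = gmap_ipte_notify(gmap, gaddr, PAGE_SIZE);
 *
 * gmap_do_ipte_notify() below then invokes every registered callback for
 * each marked pte that gets invalidated.
 */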

/**
 * gmap_ipte_notify - mark a range of ptes for invalidation notification
 * @gmap: pointer to guest mapping meta data structure
 * @start: virtual address in the guest address space
 * @len: size of area
 *
 * Returns 0 if for each page in the given range a gmap mapping exists and
 * the invalidation notification could be set. If the gmap mapping is missing
 * for one or more pages -EFAULT is returned. If no memory could be allocated
 * -ENOMEM is returned. This function establishes missing page table entries.
 */
int gmap_ipte_notify(struct gmap *gmap, unsigned long start, unsigned long len)
{
	unsigned long addr;
	spinlock_t *ptl;
	pte_t *ptep, entry;
	pgste_t pgste;
	int rc = 0;

	if ((start & ~PAGE_MASK) || (len & ~PAGE_MASK))
		return -EINVAL;
	down_read(&gmap->mm->mmap_sem);
	while (len) {
		/* Convert gmap address and connect the page tables */
		addr = __gmap_fault(start, gmap);
		if (IS_ERR_VALUE(addr)) {
			rc = addr;
			break;
		}
		/* Get the page mapped */
		if (fixup_user_fault(current, gmap->mm, addr, FAULT_FLAG_WRITE)) {
			rc = -EFAULT;
			break;
		}
		/* Walk the process page table, lock and get pte pointer */
		ptep = get_locked_pte(gmap->mm, addr, &ptl);
		if (unlikely(!ptep))
			continue;
		/* Set notification bit in the pgste of the pte */
		entry = *ptep;
		if ((pte_val(entry) & (_PAGE_INVALID | _PAGE_PROTECT)) == 0) {
			pgste = pgste_get_lock(ptep);
			pgste_val(pgste) |= PGSTE_IN_BIT;
			pgste_set_unlock(ptep, pgste);
			start += PAGE_SIZE;
			len -= PAGE_SIZE;
		}
		spin_unlock(ptl);
	}
	up_read(&gmap->mm->mmap_sem);
	return rc;
}
EXPORT_SYMBOL_GPL(gmap_ipte_notify);

/**
 * gmap_do_ipte_notify - call all invalidation callbacks for a specific pte.
 * @mm: pointer to the process mm_struct
 * @pte: pointer to the page table entry
 *
 * This function is assumed to be called with the page table lock held
 * for the pte to notify.
 */
void gmap_do_ipte_notify(struct mm_struct *mm, pte_t *pte)
{
	unsigned long segment_offset;
	struct gmap_notifier *nb;
	struct gmap_pgtable *mp;
	struct gmap_rmap *rmap;
	struct page *page;

	segment_offset = ((unsigned long) pte) & (255 * sizeof(pte_t));
	segment_offset = segment_offset * (4096 / sizeof(pte_t));
	page = pfn_to_page(__pa(pte) >> PAGE_SHIFT);
	mp = (struct gmap_pgtable *) page->index;
	spin_lock(&gmap_notifier_lock);
	list_for_each_entry(rmap, &mp->mapper, list) {
		list_for_each_entry(nb, &gmap_notifier_list, list)
			nb->notifier_call(rmap->gmap,
					  rmap->vmaddr + segment_offset);
	}
	spin_unlock(&gmap_notifier_lock);
}
EXPORT_SYMBOL_GPL(gmap_do_ipte_notify);

static inline int page_table_with_pgste(struct page *page)
{
	return atomic_read(&page->_mapcount) == 0;
}

static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm,
						    unsigned long vmaddr)
{
	struct page *page;
	unsigned long *table;
	struct gmap_pgtable *mp;

	page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
	if (!page)
		return NULL;
	mp = kmalloc(sizeof(*mp), GFP_KERNEL|__GFP_REPEAT);
	if (!mp) {
		__free_page(page);
		return NULL;
	}
	if (!pgtable_page_ctor(page)) {
		kfree(mp);
		__free_page(page);
		return NULL;
	}
	mp->vmaddr = vmaddr & PMD_MASK;
	INIT_LIST_HEAD(&mp->mapper);
	page->index = (unsigned long) mp;
	atomic_set(&page->_mapcount, 0);
	table = (unsigned long *) page_to_phys(page);
	clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);
	clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2);
	return table;
}

static inline void page_table_free_pgste(unsigned long *table)
{
	struct page *page;
	struct gmap_pgtable *mp;

	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	mp = (struct gmap_pgtable *) page->index;
	BUG_ON(!list_empty(&mp->mapper));
	pgtable_page_dtor(page);
	atomic_set(&page->_mapcount, -1);
	kfree(mp);
	__free_page(page);
}

static inline unsigned long page_table_reset_pte(struct mm_struct *mm, pmd_t *pmd,
			unsigned long addr, unsigned long end, bool init_skey)
{
	pte_t *start_pte, *pte;
	spinlock_t *ptl;
	pgste_t pgste;

	start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	pte = start_pte;
	do {
		pgste = pgste_get_lock(pte);
		pgste_val(pgste) &= ~_PGSTE_GPS_USAGE_MASK;
		if (init_skey) {
			unsigned long address;

			pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT |
					      PGSTE_GR_BIT | PGSTE_GC_BIT);

			/* skip invalid and not writable pages */
			if (pte_val(*pte) & _PAGE_INVALID ||
			    !(pte_val(*pte) & _PAGE_WRITE)) {
				pgste_set_unlock(pte, pgste);
				continue;
			}

			address = pte_val(*pte) & PAGE_MASK;
			page_set_storage_key(address, PAGE_DEFAULT_KEY, 1);
		}
		pgste_set_unlock(pte, pgste);
	} while (pte++, addr += PAGE_SIZE, addr != end);
	pte_unmap_unlock(start_pte, ptl);

	return addr;
}

static inline unsigned long page_table_reset_pmd(struct mm_struct *mm, pud_t *pud,
			unsigned long addr, unsigned long end, bool init_skey)
{
	unsigned long next;
	pmd_t *pmd;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(pmd))
			continue;
		next = page_table_reset_pte(mm, pmd, addr, next, init_skey);
	} while (pmd++, addr = next, addr != end);

	return addr;
}

static inline unsigned long page_table_reset_pud(struct mm_struct *mm, pgd_t *pgd,
			unsigned long addr, unsigned long end, bool init_skey)
{
	unsigned long next;
	pud_t *pud;

	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		next = page_table_reset_pmd(mm, pud, addr, next, init_skey);
	} while (pud++, addr = next, addr != end);

	return addr;
}

void page_table_reset_pgste(struct mm_struct *mm, unsigned long start,
			    unsigned long end, bool init_skey)
{
	unsigned long addr, next;
	pgd_t *pgd;

	down_write(&mm->mmap_sem);
	if (init_skey && mm_use_skey(mm))
		goto out_up;
	addr = start;
	pgd = pgd_offset(mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		next = page_table_reset_pud(mm, pgd, addr, next, init_skey);
	} while (pgd++, addr = next, addr != end);
	if (init_skey)
		current->mm->context.use_skey = 1;
out_up:
	up_write(&mm->mmap_sem);
}
EXPORT_SYMBOL(page_table_reset_pgste);

int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
			  unsigned long key, bool nq)
{
	spinlock_t *ptl;
	pgste_t old, new;
	pte_t *ptep;

	down_read(&mm->mmap_sem);
	ptep = get_locked_pte(current->mm, addr, &ptl);
	if (unlikely(!ptep)) {
		up_read(&mm->mmap_sem);
		return -EFAULT;
	}

	new = old = pgste_get_lock(ptep);
	pgste_val(new) &= ~(PGSTE_GR_BIT | PGSTE_GC_BIT |
			    PGSTE_ACC_BITS | PGSTE_FP_BIT);
	pgste_val(new) |= (key & (_PAGE_CHANGED | _PAGE_REFERENCED)) << 48;
	pgste_val(new) |= (key & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56;
	if (!(pte_val(*ptep) & _PAGE_INVALID)) {
		unsigned long address, bits, skey;

		address = pte_val(*ptep) & PAGE_MASK;
		skey = (unsigned long) page_get_storage_key(address);
		bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED);
		skey = key & (_PAGE_ACC_BITS | _PAGE_FP_BIT);
		/* Set storage key ACC and FP */
		page_set_storage_key(address, skey, !nq);
		/* Merge host changed & referenced into pgste */
		pgste_val(new) |= bits << 52;
	}
	/* changing the guest storage key is considered a change of the page */
	if ((pgste_val(new) ^ pgste_val(old)) &
	    (PGSTE_ACC_BITS | PGSTE_FP_BIT | PGSTE_GR_BIT | PGSTE_GC_BIT))
		pgste_val(new) |= PGSTE_UC_BIT;

	pgste_set_unlock(ptep, new);
	pte_unmap_unlock(*ptep, ptl);
	up_read(&mm->mmap_sem);
	return 0;
}
EXPORT_SYMBOL(set_guest_storage_key);

#else /* CONFIG_PGSTE */

static inline int page_table_with_pgste(struct page *page)
{
	return 0;
}

static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm,
						    unsigned long vmaddr)
{
	return NULL;
}

void page_table_reset_pgste(struct mm_struct *mm, unsigned long start,
			    unsigned long end, bool init_skey)
{
}

static inline void page_table_free_pgste(unsigned long *table)
{
}

static inline void gmap_disconnect_pgtable(struct mm_struct *mm,
					   unsigned long *table)
{
}

#endif /* CONFIG_PGSTE */

static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
{
	unsigned int old, new;

	do {
		old = atomic_read(v);
		new = old ^ bits;
	} while (atomic_cmpxchg(v, old, new) != old);
	return new;
}

/*
 * page table entry allocation/free routines.
 */
unsigned long *page_table_alloc(struct mm_struct *mm, unsigned long vmaddr)
{
	unsigned long *uninitialized_var(table);
	struct page *uninitialized_var(page);
	unsigned int mask, bit;

	if (mm_has_pgste(mm))
		return page_table_alloc_pgste(mm, vmaddr);
	/* Allocate fragments of a 4K page as 1K/2K page table */
	spin_lock_bh(&mm->context.list_lock);
	mask = FRAG_MASK;
	if (!list_empty(&mm->context.pgtable_list)) {
		page = list_first_entry(&mm->context.pgtable_list,
					struct page, lru);
		table = (unsigned long *) page_to_phys(page);
		mask = atomic_read(&page->_mapcount);
		mask = mask | (mask >> 4);
	}
	if ((mask & FRAG_MASK) == FRAG_MASK) {
		spin_unlock_bh(&mm->context.list_lock);
		page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
		if (!page)
			return NULL;
		if (!pgtable_page_ctor(page)) {
			__free_page(page);
			return NULL;
		}
		atomic_set(&page->_mapcount, 1);
		table = (unsigned long *) page_to_phys(page);
		clear_table(table, _PAGE_INVALID, PAGE_SIZE);
		spin_lock_bh(&mm->context.list_lock);
		list_add(&page->lru, &mm->context.pgtable_list);
	} else {
		for (bit = 1; mask & bit; bit <<= 1)
			table += PTRS_PER_PTE;
		mask = atomic_xor_bits(&page->_mapcount, bit);
		if ((mask & FRAG_MASK) == FRAG_MASK)
			list_del(&page->lru);
	}
	spin_unlock_bh(&mm->context.list_lock);
	return table;
}

void page_table_free(struct mm_struct *mm, unsigned long *table)
{
	struct page *page;
	unsigned int bit, mask;

	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	if (page_table_with_pgste(page)) {
		gmap_disconnect_pgtable(mm, table);
		return page_table_free_pgste(table);
	}
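	/*
	 * Illustrative note (not part of the original source): without pgstes
	 * a 4K page is carved into 2K page table fragments (1K fragments on
	 * 31 bit).  The low nibble of page->_mapcount tracks which fragments
	 * are allocated and the high nibble marks fragments with a pending
	 * RCU free, hence FRAG_MASK is 0x03 (64 bit) or 0x0f (31 bit).  For
	 * example, a 64-bit table at offset 0x800 inside its page yields
	 * bit = 1 << (0x800 / 0x800) = 2 in the computation below.
	 */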
	/* Free 1K/2K page table fragment of a 4K page */
	bit = 1 << ((__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)));
	spin_lock_bh(&mm->context.list_lock);
	if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
		list_del(&page->lru);
	mask = atomic_xor_bits(&page->_mapcount, bit);
	if (mask & FRAG_MASK)
		list_add(&page->lru, &mm->context.pgtable_list);
	spin_unlock_bh(&mm->context.list_lock);
	if (mask == 0) {
		pgtable_page_dtor(page);
		atomic_set(&page->_mapcount, -1);
		__free_page(page);
	}
}

static void __page_table_free_rcu(void *table, unsigned bit)
{
	struct page *page;

	if (bit == FRAG_MASK)
		return page_table_free_pgste(table);
	/* Free 1K/2K page table fragment of a 4K page */
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	if (atomic_xor_bits(&page->_mapcount, bit) == 0) {
		pgtable_page_dtor(page);
		atomic_set(&page->_mapcount, -1);
		__free_page(page);
	}
}

void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table)
{
	struct mm_struct *mm;
	struct page *page;
	unsigned int bit, mask;

	mm = tlb->mm;
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	if (page_table_with_pgste(page)) {
		gmap_disconnect_pgtable(mm, table);
		table = (unsigned long *) (__pa(table) | FRAG_MASK);
		tlb_remove_table(tlb, table);
		return;
	}
	bit = 1 << ((__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)));
	spin_lock_bh(&mm->context.list_lock);
	if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
		list_del(&page->lru);
	mask = atomic_xor_bits(&page->_mapcount, bit | (bit << 4));
	if (mask & FRAG_MASK)
		list_add_tail(&page->lru, &mm->context.pgtable_list);
	spin_unlock_bh(&mm->context.list_lock);
	table = (unsigned long *) (__pa(table) | (bit << 4));
	tlb_remove_table(tlb, table);
}

static void __tlb_remove_table(void *_table)
{
	const unsigned long mask = (FRAG_MASK << 4) | FRAG_MASK;
	void *table = (void *)((unsigned long) _table & ~mask);
	unsigned type = (unsigned long) _table & mask;

	if (type)
		__page_table_free_rcu(table, type);
	else
		free_pages((unsigned long) table, ALLOC_ORDER);
}

static void tlb_remove_table_smp_sync(void *arg)
{
	/* Simply deliver the interrupt */
}

static void tlb_remove_table_one(void *table)
{
	/*
	 * This isn't an RCU grace period and hence the page-tables cannot be
	 * assumed to be actually RCU-freed.
	 *
	 * It is however sufficient for software page-table walkers that rely
	 * on IRQ disabling. See the comment near struct mmu_table_batch.
	 */
	smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
	__tlb_remove_table(table);
}

static void tlb_remove_table_rcu(struct rcu_head *head)
{
	struct mmu_table_batch *batch;
	int i;

	batch = container_of(head, struct mmu_table_batch, rcu);

	for (i = 0; i < batch->nr; i++)
		__tlb_remove_table(batch->tables[i]);

	free_page((unsigned long)batch);
}

void tlb_table_flush(struct mmu_gather *tlb)
{
	struct mmu_table_batch **batch = &tlb->batch;

	if (*batch) {
		call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
		*batch = NULL;
	}
}

void tlb_remove_table(struct mmu_gather *tlb, void *table)
{
	struct mmu_table_batch **batch = &tlb->batch;

	tlb->mm->context.flush_mm = 1;
	if (*batch == NULL) {
		*batch = (struct mmu_table_batch *)
			__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
		if (*batch == NULL) {
			__tlb_flush_mm_lazy(tlb->mm);
			tlb_remove_table_one(table);
			return;
		}
		(*batch)->nr = 0;
	}
	(*batch)->tables[(*batch)->nr++] = table;
	if ((*batch)->nr == MAX_TABLE_BATCH)
		tlb_flush_mmu(tlb);
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static inline void thp_split_vma(struct vm_area_struct *vma)
{
	unsigned long addr;

	for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE)
		follow_page(vma, addr, FOLL_SPLIT);
}

static inline void thp_split_mm(struct mm_struct *mm)
{
	struct vm_area_struct *vma;

	for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
		thp_split_vma(vma);
		vma->vm_flags &= ~VM_HUGEPAGE;
		vma->vm_flags |= VM_NOHUGEPAGE;
	}
	mm->def_flags |= VM_NOHUGEPAGE;
}
#else
static inline void thp_split_mm(struct mm_struct *mm)
{
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

static unsigned long page_table_realloc_pmd(struct mmu_gather *tlb,
				struct mm_struct *mm, pud_t *pud,
				unsigned long addr, unsigned long end)
{
	unsigned long next, *table, *new;
	struct page *page;
	pmd_t *pmd;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
again:
		if (pmd_none_or_clear_bad(pmd))
			continue;
		table = (unsigned long *) pmd_deref(*pmd);
		page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
		if (page_table_with_pgste(page))
			continue;
		/* Allocate new page table with pgstes */
		new = page_table_alloc_pgste(mm, addr);
		if (!new)
			return -ENOMEM;

		spin_lock(&mm->page_table_lock);
		if (likely((unsigned long *) pmd_deref(*pmd) == table)) {
			/* Nuke pmd entry pointing to the "short" page table */
			pmdp_flush_lazy(mm, addr, pmd);
			pmd_clear(pmd);
			/* Copy ptes from old table to new table */
			memcpy(new, table, PAGE_SIZE/2);
			clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);
			/* Establish new table */
			pmd_populate(mm, pmd, (pte_t *) new);
			/* Free old table with rcu, there might be a walker! */
			page_table_free_rcu(tlb, table);
			new = NULL;
		}
		spin_unlock(&mm->page_table_lock);
		if (new) {
			page_table_free_pgste(new);
			goto again;
		}
	} while (pmd++, addr = next, addr != end);

	return addr;
}

static unsigned long page_table_realloc_pud(struct mmu_gather *tlb,
				struct mm_struct *mm, pgd_t *pgd,
				unsigned long addr, unsigned long end)
{
	unsigned long next;
	pud_t *pud;

	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		next = page_table_realloc_pmd(tlb, mm, pud, addr, next);
		if (unlikely(IS_ERR_VALUE(next)))
			return next;
	} while (pud++, addr = next, addr != end);

	return addr;
}

static unsigned long page_table_realloc(struct mmu_gather *tlb, struct mm_struct *mm,
					unsigned long addr, unsigned long end)
{
	unsigned long next;
	pgd_t *pgd;

	pgd = pgd_offset(mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		next = page_table_realloc_pud(tlb, mm, pgd, addr, next);
		if (unlikely(IS_ERR_VALUE(next)))
			return next;
	} while (pgd++, addr = next, addr != end);

	return 0;
}

/*
 * switch on pgstes for the current userspace process (for kvm)
 */
int s390_enable_sie(void)
{
	struct task_struct *tsk = current;
	struct mm_struct *mm = tsk->mm;
	struct mmu_gather tlb;

	/* Do we have pgstes? if yes, we are done */
	if (mm_has_pgste(tsk->mm))
		return 0;

	down_write(&mm->mmap_sem);
	/* split thp mappings and disable thp for future mappings */
	thp_split_mm(mm);
	/* Reallocate the page tables with pgstes */
	tlb_gather_mmu(&tlb, mm, 0, TASK_SIZE);
	if (!page_table_realloc(&tlb, mm, 0, TASK_SIZE))
		mm->context.has_pgste = 1;
	tlb_finish_mmu(&tlb, 0, TASK_SIZE);
	up_write(&mm->mmap_sem);
	return mm->context.has_pgste ? 0 : -ENOMEM;
}
EXPORT_SYMBOL_GPL(s390_enable_sie);
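
/*
 * Illustrative note (not part of the original source): a hypothetical
 * hypervisor backend such as KVM would call s390_enable_sie() once, early
 * during VM creation and before any vcpu runs:
 *
 *	if (s390_enable_sie())
 *		return -ENOMEM;
 *
 * After it returns 0 every page table of the process carries pgstes, which
 * the gmap and guest storage key code above depends on.
 */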

/*
 * Enable storage key handling from now on and initialize the storage
 * keys with the default key.
 */
void s390_enable_skey(void)
{
	page_table_reset_pgste(current->mm, 0, TASK_SIZE, true);
}
EXPORT_SYMBOL_GPL(s390_enable_skey);

/*
 * Test and reset if a guest page is dirty
 */
bool gmap_test_and_clear_dirty(unsigned long address, struct gmap *gmap)
{
	pte_t *pte;
	spinlock_t *ptl;
	bool dirty = false;

	pte = get_locked_pte(gmap->mm, address, &ptl);
	if (unlikely(!pte))
		return false;

	if (ptep_test_and_clear_user_dirty(gmap->mm, address, pte))
		dirty = true;

	spin_unlock(ptl);
	return dirty;
}
EXPORT_SYMBOL_GPL(gmap_test_and_clear_dirty);

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address,
			   pmd_t *pmdp)
{
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	/*
	 * No need to flush the TLB: on s390 the reference bits are kept in
	 * the storage key, never in the TLB.
	 */
	return pmdp_test_and_clear_young(vma, address, pmdp);
}

int pmdp_set_access_flags(struct vm_area_struct *vma,
			  unsigned long address, pmd_t *pmdp,
			  pmd_t entry, int dirty)
{
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);

	if (pmd_same(*pmdp, entry))
		return 0;
	pmdp_invalidate(vma, address, pmdp);
	set_pmd_at(vma->vm_mm, address, pmdp, entry);
	return 1;
}

static void pmdp_splitting_flush_sync(void *arg)
{
	/* Simply deliver the interrupt */
}

void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
			  pmd_t *pmdp)
{
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	if (!test_and_set_bit(_SEGMENT_ENTRY_SPLIT_BIT,
			      (unsigned long *) pmdp)) {
		/* need to serialize against gup-fast (IRQ disabled) */
		smp_call_function(pmdp_splitting_flush_sync, NULL, 1);
	}
}

void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
				pgtable_t pgtable)
{
	struct list_head *lh = (struct list_head *) pgtable;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	if (!pmd_huge_pte(mm, pmdp))
		INIT_LIST_HEAD(lh);
	else
		list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
	pmd_huge_pte(mm, pmdp) = pgtable;
}

pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
	struct list_head *lh;
	pgtable_t pgtable;
	pte_t *ptep;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	pgtable = pmd_huge_pte(mm, pmdp);
	lh = (struct list_head *) pgtable;
	if (list_empty(lh))
		pmd_huge_pte(mm, pmdp) = NULL;
	else {
		pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
		list_del(lh);
	}
	ptep = (pte_t *) pgtable;
	pte_val(*ptep) = _PAGE_INVALID;
	ptep++;
	pte_val(*ptep) = _PAGE_INVALID;
	return pgtable;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */