/*
 *    Copyright IBM Corp. 2007, 2011
 *    Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
 */

#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/quicklist.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>

#ifndef CONFIG_64BIT
#define ALLOC_ORDER	1
#define FRAG_MASK	0x0f
#else
#define ALLOC_ORDER	2
#define FRAG_MASK	0x03
#endif


unsigned long *crst_table_alloc(struct mm_struct *mm)
{
	struct page *page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);

	if (!page)
		return NULL;
	return (unsigned long *) page_to_phys(page);
}

void crst_table_free(struct mm_struct *mm, unsigned long *table)
{
	free_pages((unsigned long) table, ALLOC_ORDER);
}

#ifdef CONFIG_64BIT
int crst_table_upgrade(struct mm_struct *mm, unsigned long limit)
{
	unsigned long *table, *pgd;
	unsigned long entry;

	BUG_ON(limit > (1UL << 53));
repeat:
	table = crst_table_alloc(mm);
	if (!table)
		return -ENOMEM;
	spin_lock_bh(&mm->page_table_lock);
	if (mm->context.asce_limit < limit) {
		pgd = (unsigned long *) mm->pgd;
		if (mm->context.asce_limit <= (1UL << 31)) {
			entry = _REGION3_ENTRY_EMPTY;
			mm->context.asce_limit = 1UL << 42;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION3;
		} else {
			entry = _REGION2_ENTRY_EMPTY;
			mm->context.asce_limit = 1UL << 53;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION2;
		}
		crst_table_init(table, entry);
		pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd);
		mm->pgd = (pgd_t *) table;
		mm->task_size = mm->context.asce_limit;
		table = NULL;
	}
	spin_unlock_bh(&mm->page_table_lock);
	if (table)
		crst_table_free(mm, table);
	if (mm->context.asce_limit < limit)
		goto repeat;
	return 0;
}

void crst_table_downgrade(struct mm_struct *mm, unsigned long limit)
{
	pgd_t *pgd;

	while (mm->context.asce_limit > limit) {
		pgd = mm->pgd;
		switch (pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) {
		case _REGION_ENTRY_TYPE_R2:
			mm->context.asce_limit = 1UL << 42;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION3;
			break;
		case _REGION_ENTRY_TYPE_R3:
			mm->context.asce_limit = 1UL << 31;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_SEGMENT;
			break;
		default:
			BUG();
		}
		mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
		mm->task_size = mm->context.asce_limit;
		crst_table_free(mm, (unsigned long *) pgd);
	}
}
#endif

#ifdef CONFIG_PGSTE

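/*
 * A gmap ("guest mapping") shadows parts of a host process address space
 * into a separate guest address space with its own region/segment tables.
 * Rough usage sketch of the exported interface (KVM is the intended user;
 * the names and the error handling below are illustrative only):
 *
 *	gmap = gmap_alloc(current->mm);
 *	gmap_map_segment(gmap, userspace_addr, guest_addr, size);
 *	gmap_enable(gmap);		(before entering SIE)
 *	...
 *	gmap_disable(gmap);
 *	gmap_free(gmap);
 */
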
/**
 * gmap_alloc - allocate a guest address space
 * @mm: pointer to the parent mm_struct
 *
 * Returns a guest address space structure.
 */
struct gmap *gmap_alloc(struct mm_struct *mm)
{
	struct gmap *gmap;
	struct page *page;
	unsigned long *table;

	gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL);
	if (!gmap)
		goto out;
	INIT_LIST_HEAD(&gmap->crst_list);
	gmap->mm = mm;
	page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
	if (!page)
		goto out_free;
	list_add(&page->lru, &gmap->crst_list);
	table = (unsigned long *) page_to_phys(page);
	crst_table_init(table, _REGION1_ENTRY_EMPTY);
	gmap->table = table;
	gmap->asce = _ASCE_TYPE_REGION1 | _ASCE_TABLE_LENGTH |
		     _ASCE_USER_BITS | __pa(table);
	list_add(&gmap->list, &mm->context.gmap_list);
	return gmap;

out_free:
	kfree(gmap);
out:
	return NULL;
}
EXPORT_SYMBOL_GPL(gmap_alloc);

static int gmap_unlink_segment(struct gmap *gmap, unsigned long *table)
{
	struct gmap_pgtable *mp;
	struct gmap_rmap *rmap;
	struct page *page;

	if (*table & _SEGMENT_ENTRY_INV)
		return 0;
	page = pfn_to_page(*table >> PAGE_SHIFT);
	mp = (struct gmap_pgtable *) page->index;
	list_for_each_entry(rmap, &mp->mapper, list) {
		if (rmap->entry != table)
			continue;
		list_del(&rmap->list);
		kfree(rmap);
		break;
	}
	*table = _SEGMENT_ENTRY_INV | _SEGMENT_ENTRY_RO | mp->vmaddr;
	return 1;
}

static void gmap_flush_tlb(struct gmap *gmap)
{
	if (MACHINE_HAS_IDTE)
		__tlb_flush_idte((unsigned long) gmap->table |
				 _ASCE_TYPE_REGION1);
	else
		__tlb_flush_global();
}

/**
 * gmap_free - free a guest address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_free(struct gmap *gmap)
{
	struct page *page, *next;
	unsigned long *table;
	int i;

	/* Flush tlb. */
	if (MACHINE_HAS_IDTE)
		__tlb_flush_idte((unsigned long) gmap->table |
				 _ASCE_TYPE_REGION1);
	else
		__tlb_flush_global();

	/* Free all segment & region tables. */
	down_read(&gmap->mm->mmap_sem);
	spin_lock(&gmap->mm->page_table_lock);
	list_for_each_entry_safe(page, next, &gmap->crst_list, lru) {
		table = (unsigned long *) page_to_phys(page);
		if ((*table & _REGION_ENTRY_TYPE_MASK) == 0)
			/* Remove gmap rmap structures for segment table. */
			for (i = 0; i < PTRS_PER_PMD; i++, table++)
				gmap_unlink_segment(gmap, table);
		__free_pages(page, ALLOC_ORDER);
	}
	spin_unlock(&gmap->mm->page_table_lock);
	up_read(&gmap->mm->mmap_sem);
	list_del(&gmap->list);
	kfree(gmap);
}
EXPORT_SYMBOL_GPL(gmap_free);

/**
 * gmap_enable - switch primary space to the guest address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_enable(struct gmap *gmap)
{
	S390_lowcore.gmap = (unsigned long) gmap;
}
EXPORT_SYMBOL_GPL(gmap_enable);

/**
 * gmap_disable - switch back to the standard primary address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_disable(struct gmap *gmap)
{
	S390_lowcore.gmap = 0UL;
}
EXPORT_SYMBOL_GPL(gmap_disable);

/*
 * gmap_alloc_table is assumed to be called with mmap_sem held
 */
static int gmap_alloc_table(struct gmap *gmap,
			    unsigned long *table, unsigned long init)
{
	struct page *page;
	unsigned long *new;

	/* since we don't free the gmap table until gmap_free we can unlock */
	spin_unlock(&gmap->mm->page_table_lock);
	page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
	spin_lock(&gmap->mm->page_table_lock);
	if (!page)
		return -ENOMEM;
	new = (unsigned long *) page_to_phys(page);
	crst_table_init(new, init);
	if (*table & _REGION_ENTRY_INV) {
		list_add(&page->lru, &gmap->crst_list);
		*table = (unsigned long) new | _REGION_ENTRY_LENGTH |
			(*table & _REGION_ENTRY_TYPE_MASK);
	} else
		__free_pages(page, ALLOC_ORDER);
	return 0;
}

/**
 * gmap_unmap_segment - unmap segment from the guest address space
 * @gmap: pointer to the guest address space structure
 * @to: address in the guest address space
 * @len: length of the memory area to unmap
 *
 * Returns 0 if the unmap succeeded, -EINVAL if not.
 */
int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len)
{
	unsigned long *table;
	unsigned long off;
	int flush;

	if ((to | len) & (PMD_SIZE - 1))
		return -EINVAL;
	if (len == 0 || to + len < to)
		return -EINVAL;

	flush = 0;
	down_read(&gmap->mm->mmap_sem);
	spin_lock(&gmap->mm->page_table_lock);
	for (off = 0; off < len; off += PMD_SIZE) {
		/* Walk the guest addr space page table */
		table = gmap->table + (((to + off) >> 53) & 0x7ff);
		if (*table & _REGION_ENTRY_INV)
			goto out;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 42) & 0x7ff);
		if (*table & _REGION_ENTRY_INV)
			goto out;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 31) & 0x7ff);
		if (*table & _REGION_ENTRY_INV)
			goto out;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 20) & 0x7ff);

		/* Clear segment table entry in guest address space. */
		flush |= gmap_unlink_segment(gmap, table);
		*table = _SEGMENT_ENTRY_INV;
	}
out:
	spin_unlock(&gmap->mm->page_table_lock);
	up_read(&gmap->mm->mmap_sem);
	if (flush)
		gmap_flush_tlb(gmap);
	return 0;
}
EXPORT_SYMBOL_GPL(gmap_unmap_segment);

/**
 * gmap_map_segment - map a segment to the guest address space
 * @gmap: pointer to the guest address space structure
 * @from: source address in the parent address space
 * @to: target address in the guest address space
 * @len: length of the memory area to map
 *
 * Returns 0 if the mmap succeeded, -EINVAL or -ENOMEM if not.
 */
int gmap_map_segment(struct gmap *gmap, unsigned long from,
		     unsigned long to, unsigned long len)
{
	unsigned long *table;
	unsigned long off;
	int flush;

	if ((from | to | len) & (PMD_SIZE - 1))
		return -EINVAL;
	if (len == 0 || from + len > PGDIR_SIZE ||
	    from + len < from || to + len < to)
		return -EINVAL;

	flush = 0;
	down_read(&gmap->mm->mmap_sem);
	spin_lock(&gmap->mm->page_table_lock);
	for (off = 0; off < len; off += PMD_SIZE) {
		/* Walk the gmap address space page table */
		table = gmap->table + (((to + off) >> 53) & 0x7ff);
		if ((*table & _REGION_ENTRY_INV) &&
		    gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY))
			goto out_unmap;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 42) & 0x7ff);
		if ((*table & _REGION_ENTRY_INV) &&
		    gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY))
			goto out_unmap;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 31) & 0x7ff);
		if ((*table & _REGION_ENTRY_INV) &&
		    gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY))
			goto out_unmap;
		table = (unsigned long *) (*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 20) & 0x7ff);

		/* Store 'from' address in an invalid segment table entry. */
		flush |= gmap_unlink_segment(gmap, table);
		*table = _SEGMENT_ENTRY_INV | _SEGMENT_ENTRY_RO | (from + off);
	}
	spin_unlock(&gmap->mm->page_table_lock);
	up_read(&gmap->mm->mmap_sem);
	if (flush)
		gmap_flush_tlb(gmap);
	return 0;

out_unmap:
	spin_unlock(&gmap->mm->page_table_lock);
	up_read(&gmap->mm->mmap_sem);
	gmap_unmap_segment(gmap, to, len);
	return -ENOMEM;
}
EXPORT_SYMBOL_GPL(gmap_map_segment);
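
/*
 * Guest addresses are resolved through a four level tree rooted at the
 * region-first table allocated in gmap_alloc().  Each table has 2048
 * entries, so the walks in this file index with 11-bit slices of the
 * address (mask 0x7ff): bits starting at 53, 42, 31 and 20 select the
 * region-first, region-second, region-third and segment table entry.
 */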
static unsigned long *gmap_table_walk(unsigned long address, struct gmap *gmap)
{
	unsigned long *table;

	table = gmap->table + ((address >> 53) & 0x7ff);
	if (unlikely(*table & _REGION_ENTRY_INV))
		return ERR_PTR(-EFAULT);
	table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	table = table + ((address >> 42) & 0x7ff);
	if (unlikely(*table & _REGION_ENTRY_INV))
		return ERR_PTR(-EFAULT);
	table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	table = table + ((address >> 31) & 0x7ff);
	if (unlikely(*table & _REGION_ENTRY_INV))
		return ERR_PTR(-EFAULT);
	table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	table = table + ((address >> 20) & 0x7ff);
	return table;
}

/**
 * __gmap_translate - translate a guest address to a user space address
 * @address: guest address
 * @gmap: pointer to guest mapping meta data structure
 *
 * Returns user space address which corresponds to the guest address or
 * -EFAULT if no such mapping exists.
 * This function does not establish potentially missing page table entries.
 * The mmap_sem of the mm that belongs to the address space must be held
 * when this function gets called.
 */
unsigned long __gmap_translate(unsigned long address, struct gmap *gmap)
{
	unsigned long *segment_ptr, vmaddr, segment;
	struct gmap_pgtable *mp;
	struct page *page;

	current->thread.gmap_addr = address;
	segment_ptr = gmap_table_walk(address, gmap);
	if (IS_ERR(segment_ptr))
		return PTR_ERR(segment_ptr);
	/* Convert the gmap address to an mm address. */
	segment = *segment_ptr;
	if (!(segment & _SEGMENT_ENTRY_INV)) {
		page = pfn_to_page(segment >> PAGE_SHIFT);
		mp = (struct gmap_pgtable *) page->index;
		return mp->vmaddr | (address & ~PMD_MASK);
	} else if (segment & _SEGMENT_ENTRY_RO) {
		vmaddr = segment & _SEGMENT_ENTRY_ORIGIN;
		return vmaddr | (address & ~PMD_MASK);
	}
	return -EFAULT;
}
EXPORT_SYMBOL_GPL(__gmap_translate);

/**
 * gmap_translate - translate a guest address to a user space address
 * @address: guest address
 * @gmap: pointer to guest mapping meta data structure
 *
 * Returns user space address which corresponds to the guest address or
 * -EFAULT if no such mapping exists.
 * This function does not establish potentially missing page table entries.
 */
unsigned long gmap_translate(unsigned long address, struct gmap *gmap)
{
	unsigned long rc;

	down_read(&gmap->mm->mmap_sem);
	rc = __gmap_translate(address, gmap);
	up_read(&gmap->mm->mmap_sem);
	return rc;
}
EXPORT_SYMBOL_GPL(gmap_translate);

static int gmap_connect_pgtable(unsigned long address, unsigned long segment,
				unsigned long *segment_ptr, struct gmap *gmap)
{
	unsigned long vmaddr;
	struct vm_area_struct *vma;
	struct gmap_pgtable *mp;
	struct gmap_rmap *rmap;
	struct mm_struct *mm;
	struct page *page;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	mm = gmap->mm;
	vmaddr = segment & _SEGMENT_ENTRY_ORIGIN;
	vma = find_vma(mm, vmaddr);
	if (!vma || vma->vm_start > vmaddr)
		return -EFAULT;
	/* Walk the parent mm page table */
	pgd = pgd_offset(mm, vmaddr);
	pud = pud_alloc(mm, pgd, vmaddr);
	if (!pud)
		return -ENOMEM;
	pmd = pmd_alloc(mm, pud, vmaddr);
	if (!pmd)
		return -ENOMEM;
	if (!pmd_present(*pmd) &&
	    __pte_alloc(mm, vma, pmd, vmaddr))
		return -ENOMEM;
	/* pmd now points to a valid segment table entry. */
	rmap = kmalloc(sizeof(*rmap), GFP_KERNEL|__GFP_REPEAT);
	if (!rmap)
		return -ENOMEM;
	/* Link gmap segment table entry location to page table. */
	page = pmd_page(*pmd);
	mp = (struct gmap_pgtable *) page->index;
	rmap->gmap = gmap;
	rmap->entry = segment_ptr;
	rmap->vmaddr = address & PMD_MASK;
	spin_lock(&mm->page_table_lock);
	if (*segment_ptr == segment) {
		list_add(&rmap->list, &mp->mapper);
		/* Set gmap segment table entry to page table. */
		*segment_ptr = pmd_val(*pmd) & PAGE_MASK;
		rmap = NULL;
	}
	spin_unlock(&mm->page_table_lock);
	kfree(rmap);
	return 0;
}

static void gmap_disconnect_pgtable(struct mm_struct *mm, unsigned long *table)
{
	struct gmap_rmap *rmap, *next;
	struct gmap_pgtable *mp;
	struct page *page;
	int flush;

	flush = 0;
	spin_lock(&mm->page_table_lock);
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	mp = (struct gmap_pgtable *) page->index;
	list_for_each_entry_safe(rmap, next, &mp->mapper, list) {
		*rmap->entry =
			_SEGMENT_ENTRY_INV | _SEGMENT_ENTRY_RO | mp->vmaddr;
		list_del(&rmap->list);
		kfree(rmap);
		flush = 1;
	}
	spin_unlock(&mm->page_table_lock);
	if (flush)
		__tlb_flush_global();
}

/*
 * this function is assumed to be called with mmap_sem held
 */
unsigned long __gmap_fault(unsigned long address, struct gmap *gmap)
{
	unsigned long *segment_ptr, segment;
	struct gmap_pgtable *mp;
	struct page *page;
	int rc;

	current->thread.gmap_addr = address;
	segment_ptr = gmap_table_walk(address, gmap);
	if (IS_ERR(segment_ptr))
		return -EFAULT;
	/* Convert the gmap address to an mm address. */
	while (1) {
		segment = *segment_ptr;
		if (!(segment & _SEGMENT_ENTRY_INV)) {
			/* Page table is present */
			page = pfn_to_page(segment >> PAGE_SHIFT);
			mp = (struct gmap_pgtable *) page->index;
			return mp->vmaddr | (address & ~PMD_MASK);
		}
		if (!(segment & _SEGMENT_ENTRY_RO))
			/* Nothing mapped in the gmap address space. */
			break;
		rc = gmap_connect_pgtable(address, segment, segment_ptr, gmap);
		if (rc)
			return rc;
	}
	return -EFAULT;
}

unsigned long gmap_fault(unsigned long address, struct gmap *gmap)
{
	unsigned long rc;

	down_read(&gmap->mm->mmap_sem);
	rc = __gmap_fault(address, gmap);
	up_read(&gmap->mm->mmap_sem);

	return rc;
}
EXPORT_SYMBOL_GPL(gmap_fault);

void gmap_discard(unsigned long from, unsigned long to, struct gmap *gmap)
{
	unsigned long *table, address, size;
	struct vm_area_struct *vma;
	struct gmap_pgtable *mp;
	struct page *page;

	down_read(&gmap->mm->mmap_sem);
	address = from;
	while (address < to) {
		/* Walk the gmap address space page table */
		table = gmap->table + ((address >> 53) & 0x7ff);
		if (unlikely(*table & _REGION_ENTRY_INV)) {
			address = (address + PMD_SIZE) & PMD_MASK;
			continue;
		}
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + ((address >> 42) & 0x7ff);
		if (unlikely(*table & _REGION_ENTRY_INV)) {
			address = (address + PMD_SIZE) & PMD_MASK;
			continue;
		}
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + ((address >> 31) & 0x7ff);
		if (unlikely(*table & _REGION_ENTRY_INV)) {
			address = (address + PMD_SIZE) & PMD_MASK;
			continue;
		}
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + ((address >> 20) & 0x7ff);
		if (unlikely(*table & _SEGMENT_ENTRY_INV)) {
			address = (address + PMD_SIZE) & PMD_MASK;
			continue;
		}
		page = pfn_to_page(*table >> PAGE_SHIFT);
		mp = (struct gmap_pgtable *) page->index;
		vma = find_vma(gmap->mm, mp->vmaddr);
		size = min(to - address, PMD_SIZE - (address & ~PMD_MASK));
		zap_page_range(vma, mp->vmaddr | (address & ~PMD_MASK),
			       size, NULL);
		address = (address + PMD_SIZE) & PMD_MASK;
	}
	up_read(&gmap->mm->mmap_sem);
}
EXPORT_SYMBOL_GPL(gmap_discard);

static LIST_HEAD(gmap_notifier_list);
static DEFINE_SPINLOCK(gmap_notifier_lock);

/**
 * gmap_register_ipte_notifier - register a pte invalidation callback
 * @nb: pointer to the gmap notifier block
 */
void gmap_register_ipte_notifier(struct gmap_notifier *nb)
{
	spin_lock(&gmap_notifier_lock);
	list_add(&nb->list, &gmap_notifier_list);
	spin_unlock(&gmap_notifier_lock);
}
EXPORT_SYMBOL_GPL(gmap_register_ipte_notifier);

/**
 * gmap_unregister_ipte_notifier - remove a pte invalidation callback
 * @nb: pointer to the gmap notifier block
 */
void gmap_unregister_ipte_notifier(struct gmap_notifier *nb)
{
	spin_lock(&gmap_notifier_lock);
	list_del_init(&nb->list);
	spin_unlock(&gmap_notifier_lock);
}
EXPORT_SYMBOL_GPL(gmap_unregister_ipte_notifier);

/**
 * gmap_ipte_notify - mark a range of ptes for invalidation notification
 * @gmap: pointer to guest mapping meta data structure
 * @start: virtual address in the guest address space
 * @len: size of area
 *
 * Returns 0 if for each page in the given range a gmap mapping exists and
 * the invalidation notification could be set. If the gmap mapping is missing
 * for one or more pages -EFAULT is returned. If no memory could be allocated
 * -ENOMEM is returned. This function establishes missing page table entries.
 */
int gmap_ipte_notify(struct gmap *gmap, unsigned long start, unsigned long len)
{
	unsigned long addr;
	spinlock_t *ptl;
	pte_t *ptep, entry;
	pgste_t pgste;
	int rc = 0;

	if ((start & ~PAGE_MASK) || (len & ~PAGE_MASK))
		return -EINVAL;
	down_read(&gmap->mm->mmap_sem);
	while (len) {
		/* Convert gmap address and connect the page tables */
		addr = __gmap_fault(start, gmap);
		if (IS_ERR_VALUE(addr)) {
			rc = addr;
			break;
		}
		/* Get the page mapped */
		if (fixup_user_fault(current, gmap->mm, addr, FAULT_FLAG_WRITE)) {
			rc = -EFAULT;
			break;
		}
		/* Walk the process page table, lock and get pte pointer */
		ptep = get_locked_pte(gmap->mm, addr, &ptl);
		if (unlikely(!ptep))
			continue;
		/* Set notification bit in the pgste of the pte */
		entry = *ptep;
		if ((pte_val(entry) & (_PAGE_INVALID | _PAGE_RO)) == 0) {
			pgste = pgste_get_lock(ptep);
			pgste_val(pgste) |= PGSTE_IN_BIT;
			pgste_set_unlock(ptep, pgste);
			start += PAGE_SIZE;
			len -= PAGE_SIZE;
		}
		spin_unlock(ptl);
	}
	up_read(&gmap->mm->mmap_sem);
	return rc;
}
EXPORT_SYMBOL_GPL(gmap_ipte_notify);
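
/*
 * The PGSTE_IN_BIT set by gmap_ipte_notify() is what arms the notification:
 * when such a pte is invalidated, the invalidation path is expected to call
 * gmap_do_ipte_notify() below, which fans the event out to every notifier
 * registered with gmap_register_ipte_notifier().
 */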
/**
 * gmap_do_ipte_notify - call all invalidation callbacks for a specific pte.
 * @mm: pointer to the process mm_struct
 * @addr: virtual address in the process address space
 * @pte: pointer to the page table entry
 *
 * This function is assumed to be called with the page table lock held
 * for the pte to notify.
 */
void gmap_do_ipte_notify(struct mm_struct *mm, unsigned long addr, pte_t *pte)
{
	unsigned long segment_offset;
	struct gmap_notifier *nb;
	struct gmap_pgtable *mp;
	struct gmap_rmap *rmap;
	struct page *page;

	segment_offset = ((unsigned long) pte) & (255 * sizeof(pte_t));
	segment_offset = segment_offset * (4096 / sizeof(pte_t));
	page = pfn_to_page(__pa(pte) >> PAGE_SHIFT);
	mp = (struct gmap_pgtable *) page->index;
	spin_lock(&gmap_notifier_lock);
	list_for_each_entry(rmap, &mp->mapper, list) {
		list_for_each_entry(nb, &gmap_notifier_list, list)
			nb->notifier_call(rmap->gmap,
					  rmap->vmaddr + segment_offset);
	}
	spin_unlock(&gmap_notifier_lock);
}

static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm,
						    unsigned long vmaddr)
{
	struct page *page;
	unsigned long *table;
	struct gmap_pgtable *mp;

	page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
	if (!page)
		return NULL;
	mp = kmalloc(sizeof(*mp), GFP_KERNEL|__GFP_REPEAT);
	if (!mp) {
		__free_page(page);
		return NULL;
	}
	pgtable_page_ctor(page);
	mp->vmaddr = vmaddr & PMD_MASK;
	INIT_LIST_HEAD(&mp->mapper);
	page->index = (unsigned long) mp;
	atomic_set(&page->_mapcount, 3);
	table = (unsigned long *) page_to_phys(page);
	clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE/2);
	clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2);
	return table;
}

static inline void page_table_free_pgste(unsigned long *table)
{
	struct page *page;
	struct gmap_pgtable *mp;

	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	mp = (struct gmap_pgtable *) page->index;
	BUG_ON(!list_empty(&mp->mapper));
	pgtable_page_dtor(page);
	atomic_set(&page->_mapcount, -1);
	kfree(mp);
	__free_page(page);
}

int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
			  unsigned long key, bool nq)
{
	spinlock_t *ptl;
	pgste_t old, new;
	pte_t *ptep;

	down_read(&mm->mmap_sem);
	ptep = get_locked_pte(mm, addr, &ptl);
	if (unlikely(!ptep)) {
		up_read(&mm->mmap_sem);
		return -EFAULT;
	}

	new = old = pgste_get_lock(ptep);
	pgste_val(new) &= ~(PGSTE_GR_BIT | PGSTE_GC_BIT |
			    PGSTE_ACC_BITS | PGSTE_FP_BIT);
	pgste_val(new) |= (key & (_PAGE_CHANGED | _PAGE_REFERENCED)) << 48;
	pgste_val(new) |= (key & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56;
	if (!(pte_val(*ptep) & _PAGE_INVALID)) {
		unsigned long address, bits;
		unsigned char skey;

		address = pte_val(*ptep) & PAGE_MASK;
		skey = page_get_storage_key(address);
		bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED);
		/* Set storage key ACC and FP */
		page_set_storage_key(address,
				     (key & (_PAGE_ACC_BITS | _PAGE_FP_BIT)),
				     !nq);

		/* Merge host changed & referenced into pgste */
		pgste_val(new) |= bits << 52;
		/* Transfer skey changed & referenced bit to kvm user bits */
		pgste_val(new) |= bits << 45; /* PGSTE_UR_BIT & PGSTE_UC_BIT */
	}
	/* changing the guest storage key is considered a change of the page */
	if ((pgste_val(new) ^ pgste_val(old)) &
	    (PGSTE_ACC_BITS | PGSTE_FP_BIT | PGSTE_GR_BIT | PGSTE_GC_BIT))
		pgste_val(new) |= PGSTE_UC_BIT;

	pgste_set_unlock(ptep, new);
	pte_unmap_unlock(ptep, ptl);
	up_read(&mm->mmap_sem);
	return 0;
}
EXPORT_SYMBOL(set_guest_storage_key);

#else /* CONFIG_PGSTE */

static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm,
						    unsigned long vmaddr)
{
	return NULL;
}

static inline void page_table_free_pgste(unsigned long *table)
{
}

static inline void gmap_disconnect_pgtable(struct mm_struct *mm,
					   unsigned long *table)
{
}

#endif /* CONFIG_PGSTE */

static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
{
	unsigned int old, new;

	do {
		old = atomic_read(v);
		new = old ^ bits;
	} while (atomic_cmpxchg(v, old, new) != old);
	return new;
}

/*
 * page table entry allocation/free routines.
 */
unsigned long *page_table_alloc(struct mm_struct *mm, unsigned long vmaddr)
{
	unsigned long *uninitialized_var(table);
	struct page *uninitialized_var(page);
	unsigned int mask, bit;

	if (mm_has_pgste(mm))
		return page_table_alloc_pgste(mm, vmaddr);
	/* Allocate fragments of a 4K page as 1K/2K page table */
	spin_lock_bh(&mm->context.list_lock);
	mask = FRAG_MASK;
	if (!list_empty(&mm->context.pgtable_list)) {
		page = list_first_entry(&mm->context.pgtable_list,
					struct page, lru);
		table = (unsigned long *) page_to_phys(page);
		mask = atomic_read(&page->_mapcount);
		mask = mask | (mask >> 4);
	}
	if ((mask & FRAG_MASK) == FRAG_MASK) {
		spin_unlock_bh(&mm->context.list_lock);
		page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
		if (!page)
			return NULL;
		pgtable_page_ctor(page);
		atomic_set(&page->_mapcount, 1);
		table = (unsigned long *) page_to_phys(page);
		clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE);
		spin_lock_bh(&mm->context.list_lock);
		list_add(&page->lru, &mm->context.pgtable_list);
	} else {
		for (bit = 1; mask & bit; bit <<= 1)
			table += PTRS_PER_PTE;
		mask = atomic_xor_bits(&page->_mapcount, bit);
		if ((mask & FRAG_MASK) == FRAG_MASK)
			list_del(&page->lru);
	}
	spin_unlock_bh(&mm->context.list_lock);
	return table;
}

void page_table_free(struct mm_struct *mm, unsigned long *table)
{
	struct page *page;
	unsigned int bit, mask;

	if (mm_has_pgste(mm)) {
		gmap_disconnect_pgtable(mm, table);
		return page_table_free_pgste(table);
	}
	/* Free 1K/2K page table fragment of a 4K page */
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	bit = 1 << ((__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)));
	spin_lock_bh(&mm->context.list_lock);
	if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
		list_del(&page->lru);
	mask = atomic_xor_bits(&page->_mapcount, bit);
	if (mask & FRAG_MASK)
		list_add(&page->lru, &mm->context.pgtable_list);
	spin_unlock_bh(&mm->context.list_lock);
	if (mask == 0) {
		pgtable_page_dtor(page);
		atomic_set(&page->_mapcount, -1);
		__free_page(page);
	}
}

static void __page_table_free_rcu(void *table, unsigned bit)
{
	struct page *page;

	if (bit == FRAG_MASK)
		return page_table_free_pgste(table);
	/* Free 1K/2K page table fragment of a 4K page */
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	if (atomic_xor_bits(&page->_mapcount, bit) == 0) {
		pgtable_page_dtor(page);
		atomic_set(&page->_mapcount, -1);
		__free_page(page);
	}
}

void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table)
{
	struct mm_struct *mm;
	struct page *page;
	unsigned int bit, mask;

	mm = tlb->mm;
	if (mm_has_pgste(mm)) {
		gmap_disconnect_pgtable(mm, table);
		table = (unsigned long *) (__pa(table) | FRAG_MASK);
		tlb_remove_table(tlb, table);
		return;
	}
	bit = 1 << ((__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)));
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	spin_lock_bh(&mm->context.list_lock);
	if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
		list_del(&page->lru);
	mask = atomic_xor_bits(&page->_mapcount, bit | (bit << 4));
	if (mask & FRAG_MASK)
		list_add_tail(&page->lru, &mm->context.pgtable_list);
	spin_unlock_bh(&mm->context.list_lock);
	table = (unsigned long *) (__pa(table) | (bit << 4));
	tlb_remove_table(tlb, table);
}

void __tlb_remove_table(void *_table)
{
	const unsigned long mask = (FRAG_MASK << 4) | FRAG_MASK;
	void *table = (void *)((unsigned long) _table & ~mask);
	unsigned type = (unsigned long) _table & mask;

	if (type)
		__page_table_free_rcu(table, type);
	else
		free_pages((unsigned long) table, ALLOC_ORDER);
}

static void tlb_remove_table_smp_sync(void *arg)
{
	/* Simply deliver the interrupt */
}

static void tlb_remove_table_one(void *table)
{
	/*
	 * This isn't an RCU grace period and hence the page-tables cannot be
	 * assumed to be actually RCU-freed.
	 *
	 * It is however sufficient for software page-table walkers that rely
	 * on IRQ disabling. See the comment near struct mmu_table_batch.
	 */
	smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
	__tlb_remove_table(table);
}

static void tlb_remove_table_rcu(struct rcu_head *head)
{
	struct mmu_table_batch *batch;
	int i;

	batch = container_of(head, struct mmu_table_batch, rcu);

	for (i = 0; i < batch->nr; i++)
		__tlb_remove_table(batch->tables[i]);

	free_page((unsigned long)batch);
}

void tlb_table_flush(struct mmu_gather *tlb)
{
	struct mmu_table_batch **batch = &tlb->batch;

	if (*batch) {
		__tlb_flush_mm(tlb->mm);
		call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
		*batch = NULL;
	}
}

void tlb_remove_table(struct mmu_gather *tlb, void *table)
{
	struct mmu_table_batch **batch = &tlb->batch;

	if (*batch == NULL) {
		*batch = (struct mmu_table_batch *)
			__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
		if (*batch == NULL) {
			__tlb_flush_mm(tlb->mm);
			tlb_remove_table_one(table);
			return;
		}
		(*batch)->nr = 0;
	}
	(*batch)->tables[(*batch)->nr++] = table;
	if ((*batch)->nr == MAX_TABLE_BATCH)
		tlb_table_flush(tlb);
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
void thp_split_vma(struct vm_area_struct *vma)
{
	unsigned long addr;
	struct page *page;

	for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) {
		page = follow_page(vma, addr, FOLL_SPLIT);
	}
}

void thp_split_mm(struct mm_struct *mm)
{
	struct vm_area_struct *vma = mm->mmap;

	while (vma != NULL) {
		thp_split_vma(vma);
		vma->vm_flags &= ~VM_HUGEPAGE;
		vma->vm_flags |= VM_NOHUGEPAGE;
		vma = vma->vm_next;
	}
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
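
/*
 * Note: s390_enable_sie() below splits any existing THP mappings via
 * thp_split_mm() and disables THP for future mappings of the new mm,
 * since the pgste-based guest handling works on 4K page table entries
 * which huge mappings do not provide.
 */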
/*
 * switch on pgstes for the current userspace process (required for kvm)
 */
int s390_enable_sie(void)
{
	struct task_struct *tsk = current;
	struct mm_struct *mm, *old_mm;

	/* Do we have switched amode? If not, we cannot do SIE. */
	if (s390_user_mode == HOME_SPACE_MODE)
		return -EINVAL;

	/* Do we have pgstes? if yes, we are done */
	if (mm_has_pgste(tsk->mm))
		return 0;

	/* let's check if we are allowed to replace the mm */
	task_lock(tsk);
	if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
#ifdef CONFIG_AIO
	    !hlist_empty(&tsk->mm->ioctx_list) ||
#endif
	    tsk->mm != tsk->active_mm) {
		task_unlock(tsk);
		return -EINVAL;
	}
	task_unlock(tsk);

	/* we copy the mm and let dup_mm create the page tables with pgstes */
	tsk->mm->context.alloc_pgste = 1;
	/* make sure that both mms have a correct rss state */
	sync_mm_rss(tsk->mm);
	mm = dup_mm(tsk);
	tsk->mm->context.alloc_pgste = 0;
	if (!mm)
		return -ENOMEM;

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	/* split thp mappings and disable thp for future mappings */
	thp_split_mm(mm);
	mm->def_flags |= VM_NOHUGEPAGE;
#endif

	/* Now let's check again if something happened */
	task_lock(tsk);
	if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
#ifdef CONFIG_AIO
	    !hlist_empty(&tsk->mm->ioctx_list) ||
#endif
	    tsk->mm != tsk->active_mm) {
		mmput(mm);
		task_unlock(tsk);
		return -EINVAL;
	}

	/* ok, we are alone. No ptrace, no threads, etc. */
	old_mm = tsk->mm;
	tsk->mm = tsk->active_mm = mm;
	preempt_disable();
	update_mm(mm, tsk);
	atomic_inc(&mm->context.attach_count);
	atomic_dec(&old_mm->context.attach_count);
	cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm));
	preempt_enable();
	task_unlock(tsk);
	mmput(old_mm);
	return 0;
}
EXPORT_SYMBOL_GPL(s390_enable_sie);

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address,
			   pmd_t *pmdp)
{
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	/*
	 * No need to flush the TLB; on s390 the reference bits live in the
	 * storage key and never in the TLB.
	 */
	return pmdp_test_and_clear_young(vma, address, pmdp);
}

int pmdp_set_access_flags(struct vm_area_struct *vma,
			  unsigned long address, pmd_t *pmdp,
			  pmd_t entry, int dirty)
{
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);

	if (pmd_same(*pmdp, entry))
		return 0;
	pmdp_invalidate(vma, address, pmdp);
	set_pmd_at(vma->vm_mm, address, pmdp, entry);
	return 1;
}

static void pmdp_splitting_flush_sync(void *arg)
{
	/* Simply deliver the interrupt */
}

void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
			  pmd_t *pmdp)
{
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	if (!test_and_set_bit(_SEGMENT_ENTRY_SPLIT_BIT,
			      (unsigned long *) pmdp)) {
		/* need to serialize against gup-fast (IRQ disabled) */
		smp_call_function(pmdp_splitting_flush_sync, NULL, 1);
	}
}

void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
				pgtable_t pgtable)
{
	struct list_head *lh = (struct list_head *) pgtable;

	assert_spin_locked(&mm->page_table_lock);

	/* FIFO */
	if (!mm->pmd_huge_pte)
		INIT_LIST_HEAD(lh);
	else
		list_add(lh, (struct list_head *) mm->pmd_huge_pte);
	mm->pmd_huge_pte = pgtable;
}

pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
	struct list_head *lh;
	pgtable_t pgtable;
	pte_t *ptep;

	assert_spin_locked(&mm->page_table_lock);

	/* FIFO */
	pgtable = mm->pmd_huge_pte;
	lh = (struct list_head *) pgtable;
	if (list_empty(lh))
		mm->pmd_huge_pte = NULL;
	else {
		mm->pmd_huge_pte = (pgtable_t) lh->next;
		list_del(lh);
	}
	ptep = (pte_t *) pgtable;
	pte_val(*ptep) = _PAGE_TYPE_EMPTY;
	ptep++;
	pte_val(*ptep) = _PAGE_TYPE_EMPTY;
	return pgtable;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */