/*
 * Copyright IBM Corp. 2007, 2011
 * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
 */

#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/quicklist.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>

#ifndef CONFIG_64BIT
#define ALLOC_ORDER	1
#define FRAG_MASK	0x0f
#else
#define ALLOC_ORDER	2
#define FRAG_MASK	0x03
#endif


unsigned long *crst_table_alloc(struct mm_struct *mm)
{
	struct page *page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);

	if (!page)
		return NULL;
	return (unsigned long *) page_to_phys(page);
}

void crst_table_free(struct mm_struct *mm, unsigned long *table)
{
	free_pages((unsigned long) table, ALLOC_ORDER);
}

#ifdef CONFIG_64BIT
int crst_table_upgrade(struct mm_struct *mm, unsigned long limit)
{
	unsigned long *table, *pgd;
	unsigned long entry;

	BUG_ON(limit > (1UL << 53));
repeat:
	table = crst_table_alloc(mm);
	if (!table)
		return -ENOMEM;
	spin_lock_bh(&mm->page_table_lock);
	if (mm->context.asce_limit < limit) {
		pgd = (unsigned long *) mm->pgd;
		if (mm->context.asce_limit <= (1UL << 31)) {
			entry = _REGION3_ENTRY_EMPTY;
			mm->context.asce_limit = 1UL << 42;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION3;
		} else {
			entry = _REGION2_ENTRY_EMPTY;
			mm->context.asce_limit = 1UL << 53;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION2;
		}
		crst_table_init(table, entry);
		pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd);
		mm->pgd = (pgd_t *) table;
		mm->task_size = mm->context.asce_limit;
		table = NULL;
	}
	spin_unlock_bh(&mm->page_table_lock);
	if (table)
		crst_table_free(mm, table);
	if (mm->context.asce_limit < limit)
		goto repeat;
	return 0;
}

void crst_table_downgrade(struct mm_struct *mm, unsigned long limit)
{
	pgd_t *pgd;

	while (mm->context.asce_limit > limit) {
		pgd = mm->pgd;
		switch (pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) {
		case _REGION_ENTRY_TYPE_R2:
			mm->context.asce_limit = 1UL << 42;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION3;
			break;
		case _REGION_ENTRY_TYPE_R3:
			mm->context.asce_limit = 1UL << 31;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_SEGMENT;
			break;
		default:
			BUG();
		}
		mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
		mm->task_size = mm->context.asce_limit;
		crst_table_free(mm, (unsigned long *) pgd);
	}
}
#endif
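
/*
 * Illustrative sketch, assuming a caller in the s390 mmap code: the
 * region table is expected to be upgraded with crst_table_upgrade()
 * before a mapping above the current ASCE limit is created, and shrunk
 * again with crst_table_downgrade() when a smaller address space
 * suffices (e.g. for a 31-bit compat task). Only the crst_table_*()
 * calls themselves are defined above, the surrounding checks are
 * hypothetical:
 *
 *	if (addr + len > mm->context.asce_limit &&
 *	    crst_table_upgrade(mm, addr + len))
 *		return -ENOMEM;
 */
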
#ifdef CONFIG_PGSTE

/**
 * gmap_alloc - allocate a guest address space
 * @mm: pointer to the parent mm_struct
 *
 * Returns a guest address space structure.
 */
struct gmap *gmap_alloc(struct mm_struct *mm)
{
	struct gmap *gmap;
	struct page *page;
	unsigned long *table;

	gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL);
	if (!gmap)
		goto out;
	INIT_LIST_HEAD(&gmap->crst_list);
	gmap->mm = mm;
	page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
	if (!page)
		goto out_free;
	list_add(&page->lru, &gmap->crst_list);
	table = (unsigned long *) page_to_phys(page);
	crst_table_init(table, _REGION1_ENTRY_EMPTY);
	gmap->table = table;
	gmap->asce = _ASCE_TYPE_REGION1 | _ASCE_TABLE_LENGTH |
		     _ASCE_USER_BITS | __pa(table);
	list_add(&gmap->list, &mm->context.gmap_list);
	return gmap;

out_free:
	kfree(gmap);
out:
	return NULL;
}
EXPORT_SYMBOL_GPL(gmap_alloc);
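
/*
 * Usage sketch, assuming a hypervisor caller such as KVM (hypothetical;
 * only gmap_alloc() and gmap_free() themselves are defined in this file):
 * a guest address space is typically created once per virtual machine on
 * top of the creating process' mm and released when the VM is destroyed:
 *
 *	struct gmap *gmap;
 *
 *	gmap = gmap_alloc(current->mm);
 *	if (!gmap)
 *		return -ENOMEM;
 *	...
 *	gmap_free(gmap);
 */
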
static int gmap_unlink_segment(struct gmap *gmap, unsigned long *table)
{
	struct gmap_pgtable *mp;
	struct gmap_rmap *rmap;
	struct page *page;

	if (*table & _SEGMENT_ENTRY_INV)
		return 0;
	page = pfn_to_page(*table >> PAGE_SHIFT);
	mp = (struct gmap_pgtable *) page->index;
	list_for_each_entry(rmap, &mp->mapper, list) {
		if (rmap->entry != table)
			continue;
		list_del(&rmap->list);
		kfree(rmap);
		break;
	}
	*table = _SEGMENT_ENTRY_INV | _SEGMENT_ENTRY_RO | mp->vmaddr;
	return 1;
}

static void gmap_flush_tlb(struct gmap *gmap)
{
	if (MACHINE_HAS_IDTE)
		__tlb_flush_idte((unsigned long) gmap->table |
				 _ASCE_TYPE_REGION1);
	else
		__tlb_flush_global();
}

/**
 * gmap_free - free a guest address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_free(struct gmap *gmap)
{
	struct page *page, *next;
	unsigned long *table;
	int i;

	/* Flush tlb. */
	if (MACHINE_HAS_IDTE)
		__tlb_flush_idte((unsigned long) gmap->table |
				 _ASCE_TYPE_REGION1);
	else
		__tlb_flush_global();

	/* Free all segment & region tables. */
	down_read(&gmap->mm->mmap_sem);
	spin_lock(&gmap->mm->page_table_lock);
	list_for_each_entry_safe(page, next, &gmap->crst_list, lru) {
		table = (unsigned long *) page_to_phys(page);
		if ((*table & _REGION_ENTRY_TYPE_MASK) == 0)
			/* Remove gmap rmap structures for segment table. */
			for (i = 0; i < PTRS_PER_PMD; i++, table++)
				gmap_unlink_segment(gmap, table);
		__free_pages(page, ALLOC_ORDER);
	}
	spin_unlock(&gmap->mm->page_table_lock);
	up_read(&gmap->mm->mmap_sem);
	list_del(&gmap->list);
	kfree(gmap);
}
EXPORT_SYMBOL_GPL(gmap_free);

/**
 * gmap_enable - switch primary space to the guest address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_enable(struct gmap *gmap)
{
	S390_lowcore.gmap = (unsigned long) gmap;
}
EXPORT_SYMBOL_GPL(gmap_enable);

/**
 * gmap_disable - switch back to the standard primary address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_disable(struct gmap *gmap)
{
	S390_lowcore.gmap = 0UL;
}
EXPORT_SYMBOL_GPL(gmap_disable);
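
/*
 * Usage sketch (assumption about the caller, e.g. a hypervisor's vcpu
 * load/put path): gmap_enable() publishes the guest address space in the
 * lowcore before guest execution and gmap_disable() clears it afterwards:
 *
 *	gmap_enable(vcpu_gmap);
 *	... run the guest ...
 *	gmap_disable(vcpu_gmap);
 *
 * vcpu_gmap is a made-up name for illustration.
 */
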
/*
 * gmap_alloc_table is assumed to be called with mmap_sem held
 */
static int gmap_alloc_table(struct gmap *gmap,
			    unsigned long *table, unsigned long init)
{
	struct page *page;
	unsigned long *new;

	/* since we don't free the gmap table until gmap_free we can unlock */
	spin_unlock(&gmap->mm->page_table_lock);
	page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
	spin_lock(&gmap->mm->page_table_lock);
	if (!page)
		return -ENOMEM;
	new = (unsigned long *) page_to_phys(page);
	crst_table_init(new, init);
	if (*table & _REGION_ENTRY_INV) {
		list_add(&page->lru, &gmap->crst_list);
		*table = (unsigned long) new | _REGION_ENTRY_LENGTH |
			(*table & _REGION_ENTRY_TYPE_MASK);
	} else
		__free_pages(page, ALLOC_ORDER);
	return 0;
}

/**
 * gmap_unmap_segment - unmap segment from the guest address space
 * @gmap: pointer to the guest address space structure
 * @to: address in the guest address space
 * @len: length of the memory area to unmap
 *
 * Returns 0 if the unmap succeeded, -EINVAL if not.
 */
int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len)
{
	unsigned long *table;
	unsigned long off;
	int flush;

	if ((to | len) & (PMD_SIZE - 1))
		return -EINVAL;
	if (len == 0 || to + len < to)
		return -EINVAL;

	flush = 0;
	down_read(&gmap->mm->mmap_sem);
	spin_lock(&gmap->mm->page_table_lock);
	for (off = 0; off < len; off += PMD_SIZE) {
		/* Walk the guest addr space page table */
		table = gmap->table + (((to + off) >> 53) & 0x7ff);
		if (*table & _REGION_ENTRY_INV)
			goto out;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 42) & 0x7ff);
		if (*table & _REGION_ENTRY_INV)
			goto out;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 31) & 0x7ff);
		if (*table & _REGION_ENTRY_INV)
			goto out;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 20) & 0x7ff);

		/* Clear segment table entry in guest address space. */
		flush |= gmap_unlink_segment(gmap, table);
		*table = _SEGMENT_ENTRY_INV;
	}
out:
	spin_unlock(&gmap->mm->page_table_lock);
	up_read(&gmap->mm->mmap_sem);
	if (flush)
		gmap_flush_tlb(gmap);
	return 0;
}
EXPORT_SYMBOL_GPL(gmap_unmap_segment);

/**
 * gmap_map_segment - map a segment to the guest address space
 * @gmap: pointer to the guest address space structure
 * @from: source address in the parent address space
 * @to: target address in the guest address space
 * @len: length of the memory area to map
 *
 * Returns 0 if the mmap succeeded, -EINVAL or -ENOMEM if not.
 */
int gmap_map_segment(struct gmap *gmap, unsigned long from,
		     unsigned long to, unsigned long len)
{
	unsigned long *table;
	unsigned long off;
	int flush;

	if ((from | to | len) & (PMD_SIZE - 1))
		return -EINVAL;
	if (len == 0 || from + len > PGDIR_SIZE ||
	    from + len < from || to + len < to)
		return -EINVAL;

	flush = 0;
	down_read(&gmap->mm->mmap_sem);
	spin_lock(&gmap->mm->page_table_lock);
	for (off = 0; off < len; off += PMD_SIZE) {
		/* Walk the gmap address space page table */
		table = gmap->table + (((to + off) >> 53) & 0x7ff);
		if ((*table & _REGION_ENTRY_INV) &&
		    gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY))
			goto out_unmap;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 42) & 0x7ff);
		if ((*table & _REGION_ENTRY_INV) &&
		    gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY))
			goto out_unmap;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 31) & 0x7ff);
		if ((*table & _REGION_ENTRY_INV) &&
		    gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY))
			goto out_unmap;
		table = (unsigned long *) (*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 20) & 0x7ff);

		/* Store 'from' address in an invalid segment table entry. */
		flush |= gmap_unlink_segment(gmap, table);
		*table = _SEGMENT_ENTRY_INV | _SEGMENT_ENTRY_RO | (from + off);
	}
	spin_unlock(&gmap->mm->page_table_lock);
	up_read(&gmap->mm->mmap_sem);
	if (flush)
		gmap_flush_tlb(gmap);
	return 0;

out_unmap:
	spin_unlock(&gmap->mm->page_table_lock);
	up_read(&gmap->mm->mmap_sem);
	gmap_unmap_segment(gmap, to, len);
	return -ENOMEM;
}
EXPORT_SYMBOL_GPL(gmap_map_segment);
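
/*
 * Usage sketch (hypothetical caller; the names userspace_addr, guest_addr
 * and size are assumed, PMD-aligned values): a memory slot of the parent
 * process is made visible to the guest with gmap_map_segment() and taken
 * away again with gmap_unmap_segment():
 *
 *	if (gmap_map_segment(gmap, userspace_addr, guest_addr, size))
 *		goto err;
 *	...
 *	gmap_unmap_segment(gmap, guest_addr, size);
 *
 * All three values must be aligned to PMD_SIZE (1MB segments), as the
 * -EINVAL checks above enforce.
 */
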
static unsigned long *gmap_table_walk(unsigned long address, struct gmap *gmap)
{
	unsigned long *table;

	table = gmap->table + ((address >> 53) & 0x7ff);
	if (unlikely(*table & _REGION_ENTRY_INV))
		return ERR_PTR(-EFAULT);
	table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	table = table + ((address >> 42) & 0x7ff);
	if (unlikely(*table & _REGION_ENTRY_INV))
		return ERR_PTR(-EFAULT);
	table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	table = table + ((address >> 31) & 0x7ff);
	if (unlikely(*table & _REGION_ENTRY_INV))
		return ERR_PTR(-EFAULT);
	table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	table = table + ((address >> 20) & 0x7ff);
	return table;
}

/**
 * __gmap_translate - translate a guest address to a user space address
 * @address: guest address
 * @gmap: pointer to guest mapping meta data structure
 *
 * Returns the user space address that corresponds to the guest address,
 * or -EFAULT if no such mapping exists.
 * This function does not establish potentially missing page table entries.
 * The mmap_sem of the mm that belongs to the address space must be held
 * when this function gets called.
 */
unsigned long __gmap_translate(unsigned long address, struct gmap *gmap)
{
	unsigned long *segment_ptr, vmaddr, segment;
	struct gmap_pgtable *mp;
	struct page *page;

	current->thread.gmap_addr = address;
	segment_ptr = gmap_table_walk(address, gmap);
	if (IS_ERR(segment_ptr))
		return PTR_ERR(segment_ptr);
	/* Convert the gmap address to an mm address. */
	segment = *segment_ptr;
	if (!(segment & _SEGMENT_ENTRY_INV)) {
		page = pfn_to_page(segment >> PAGE_SHIFT);
		mp = (struct gmap_pgtable *) page->index;
		return mp->vmaddr | (address & ~PMD_MASK);
	} else if (segment & _SEGMENT_ENTRY_RO) {
		vmaddr = segment & _SEGMENT_ENTRY_ORIGIN;
		return vmaddr | (address & ~PMD_MASK);
	}
	return -EFAULT;
}
EXPORT_SYMBOL_GPL(__gmap_translate);

/**
 * gmap_translate - translate a guest address to a user space address
 * @address: guest address
 * @gmap: pointer to guest mapping meta data structure
 *
 * Returns the user space address that corresponds to the guest address,
 * or -EFAULT if no such mapping exists.
 * This function does not establish potentially missing page table entries.
 */
unsigned long gmap_translate(unsigned long address, struct gmap *gmap)
{
	unsigned long rc;

	down_read(&gmap->mm->mmap_sem);
	rc = __gmap_translate(address, gmap);
	up_read(&gmap->mm->mmap_sem);
	return rc;
}
EXPORT_SYMBOL_GPL(gmap_translate);
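
/*
 * Usage sketch (hypothetical caller, gaddr is an assumed guest address):
 * given a guest address, gmap_translate() yields the corresponding
 * address in the parent process without creating missing table entries:
 *
 *	unsigned long uaddr;
 *
 *	uaddr = gmap_translate(gaddr, gmap);
 *	if (IS_ERR_VALUE(uaddr))
 *		return -EFAULT;
 *
 * __gmap_translate() is the variant for callers that already hold the
 * mmap_sem of the parent mm.
 */
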
static int gmap_connect_pgtable(unsigned long address, unsigned long segment,
				unsigned long *segment_ptr, struct gmap *gmap)
{
	unsigned long vmaddr;
	struct vm_area_struct *vma;
	struct gmap_pgtable *mp;
	struct gmap_rmap *rmap;
	struct mm_struct *mm;
	struct page *page;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	mm = gmap->mm;
	vmaddr = segment & _SEGMENT_ENTRY_ORIGIN;
	vma = find_vma(mm, vmaddr);
	if (!vma || vma->vm_start > vmaddr)
		return -EFAULT;
	/* Walk the parent mm page table */
	pgd = pgd_offset(mm, vmaddr);
	pud = pud_alloc(mm, pgd, vmaddr);
	if (!pud)
		return -ENOMEM;
	pmd = pmd_alloc(mm, pud, vmaddr);
	if (!pmd)
		return -ENOMEM;
	if (!pmd_present(*pmd) &&
	    __pte_alloc(mm, vma, pmd, vmaddr))
		return -ENOMEM;
	/* pmd now points to a valid segment table entry. */
	rmap = kmalloc(sizeof(*rmap), GFP_KERNEL|__GFP_REPEAT);
	if (!rmap)
		return -ENOMEM;
	/* Link gmap segment table entry location to page table. */
	page = pmd_page(*pmd);
	mp = (struct gmap_pgtable *) page->index;
	rmap->gmap = gmap;
	rmap->entry = segment_ptr;
	rmap->vmaddr = address & PMD_MASK;
	spin_lock(&mm->page_table_lock);
	if (*segment_ptr == segment) {
		list_add(&rmap->list, &mp->mapper);
		/* Set gmap segment table entry to page table. */
		*segment_ptr = pmd_val(*pmd) & PAGE_MASK;
		rmap = NULL;
	}
	spin_unlock(&mm->page_table_lock);
	kfree(rmap);
	return 0;
}

static void gmap_disconnect_pgtable(struct mm_struct *mm, unsigned long *table)
{
	struct gmap_rmap *rmap, *next;
	struct gmap_pgtable *mp;
	struct page *page;
	int flush;

	flush = 0;
	spin_lock(&mm->page_table_lock);
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	mp = (struct gmap_pgtable *) page->index;
	list_for_each_entry_safe(rmap, next, &mp->mapper, list) {
		*rmap->entry =
			_SEGMENT_ENTRY_INV | _SEGMENT_ENTRY_RO | mp->vmaddr;
		list_del(&rmap->list);
		kfree(rmap);
		flush = 1;
	}
	spin_unlock(&mm->page_table_lock);
	if (flush)
		__tlb_flush_global();
}

/*
 * this function is assumed to be called with mmap_sem held
 */
unsigned long __gmap_fault(unsigned long address, struct gmap *gmap)
{
	unsigned long *segment_ptr, segment;
	struct gmap_pgtable *mp;
	struct page *page;
	int rc;

	current->thread.gmap_addr = address;
	segment_ptr = gmap_table_walk(address, gmap);
	if (IS_ERR(segment_ptr))
		return -EFAULT;
	/* Convert the gmap address to an mm address. */
	while (1) {
		segment = *segment_ptr;
		if (!(segment & _SEGMENT_ENTRY_INV)) {
			/* Page table is present */
			page = pfn_to_page(segment >> PAGE_SHIFT);
			mp = (struct gmap_pgtable *) page->index;
			return mp->vmaddr | (address & ~PMD_MASK);
		}
		if (!(segment & _SEGMENT_ENTRY_RO))
			/* Nothing mapped in the gmap address space. */
			break;
		rc = gmap_connect_pgtable(address, segment, segment_ptr, gmap);
		if (rc)
			return rc;
	}
	return -EFAULT;
}

unsigned long gmap_fault(unsigned long address, struct gmap *gmap)
{
	unsigned long rc;

	down_read(&gmap->mm->mmap_sem);
	rc = __gmap_fault(address, gmap);
	up_read(&gmap->mm->mmap_sem);

	return rc;
}
EXPORT_SYMBOL_GPL(gmap_fault);

void gmap_discard(unsigned long from, unsigned long to, struct gmap *gmap)
{
	unsigned long *table, address, size;
	struct vm_area_struct *vma;
	struct gmap_pgtable *mp;
	struct page *page;

	down_read(&gmap->mm->mmap_sem);
	address = from;
	while (address < to) {
		/* Walk the gmap address space page table */
		table = gmap->table + ((address >> 53) & 0x7ff);
		if (unlikely(*table & _REGION_ENTRY_INV)) {
			address = (address + PMD_SIZE) & PMD_MASK;
			continue;
		}
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + ((address >> 42) & 0x7ff);
		if (unlikely(*table & _REGION_ENTRY_INV)) {
			address = (address + PMD_SIZE) & PMD_MASK;
			continue;
		}
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + ((address >> 31) & 0x7ff);
		if (unlikely(*table & _REGION_ENTRY_INV)) {
			address = (address + PMD_SIZE) & PMD_MASK;
			continue;
		}
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + ((address >> 20) & 0x7ff);
		if (unlikely(*table & _SEGMENT_ENTRY_INV)) {
			address = (address + PMD_SIZE) & PMD_MASK;
			continue;
		}
		page = pfn_to_page(*table >> PAGE_SHIFT);
		mp = (struct gmap_pgtable *) page->index;
		vma = find_vma(gmap->mm, mp->vmaddr);
		size = min(to - address, PMD_SIZE - (address & ~PMD_MASK));
		zap_page_range(vma, mp->vmaddr | (address & ~PMD_MASK),
			       size, NULL);
		address = (address + PMD_SIZE) & PMD_MASK;
	}
	up_read(&gmap->mm->mmap_sem);
}
EXPORT_SYMBOL_GPL(gmap_discard);
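
/*
 * Usage sketch (hypothetical fault path): when the guest touches an
 * address that has no page table connected yet, the host fault handler
 * is expected to resolve it via gmap_fault() and then fault in the
 * returned parent address as usual:
 *
 *	address = gmap_fault(address, gmap);
 *	if (IS_ERR_VALUE(address))
 *		... report an addressing exception to the guest ...
 *
 * gmap_discard() in turn simply zaps the parent pages backing a guest
 * range, e.g. when the guest asks for storage to be released.
 */
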
static LIST_HEAD(gmap_notifier_list);
static DEFINE_SPINLOCK(gmap_notifier_lock);

/**
 * gmap_register_ipte_notifier - register a pte invalidation callback
 * @nb: pointer to the gmap notifier block
 */
void gmap_register_ipte_notifier(struct gmap_notifier *nb)
{
	spin_lock(&gmap_notifier_lock);
	list_add(&nb->list, &gmap_notifier_list);
	spin_unlock(&gmap_notifier_lock);
}
EXPORT_SYMBOL_GPL(gmap_register_ipte_notifier);

/**
 * gmap_unregister_ipte_notifier - remove a pte invalidation callback
 * @nb: pointer to the gmap notifier block
 */
void gmap_unregister_ipte_notifier(struct gmap_notifier *nb)
{
	spin_lock(&gmap_notifier_lock);
	list_del_init(&nb->list);
	spin_unlock(&gmap_notifier_lock);
}
EXPORT_SYMBOL_GPL(gmap_unregister_ipte_notifier);

/**
 * gmap_ipte_notify - mark a range of ptes for invalidation notification
 * @gmap: pointer to guest mapping meta data structure
 * @start: virtual address in the guest address space
 * @len: size of area
 *
 * Returns 0 if for each page in the given range a gmap mapping exists and
 * the invalidation notification could be set. If the gmap mapping is missing
 * for one or more pages -EFAULT is returned. If no memory could be allocated
 * -ENOMEM is returned. This function establishes missing page table entries.
 */
int gmap_ipte_notify(struct gmap *gmap, unsigned long start, unsigned long len)
{
	unsigned long addr;
	spinlock_t *ptl;
	pte_t *ptep, entry;
	pgste_t pgste;
	int rc = 0;

	if ((start & ~PAGE_MASK) || (len & ~PAGE_MASK))
		return -EINVAL;
	down_read(&gmap->mm->mmap_sem);
	while (len) {
		/* Convert gmap address and connect the page tables */
		addr = __gmap_fault(start, gmap);
		if (IS_ERR_VALUE(addr)) {
			rc = addr;
			break;
		}
		/* Get the page mapped */
		if (fixup_user_fault(current, gmap->mm, addr, FAULT_FLAG_WRITE)) {
			rc = -EFAULT;
			break;
		}
		/* Walk the process page table, lock and get pte pointer */
		ptep = get_locked_pte(gmap->mm, addr, &ptl);
		if (unlikely(!ptep))
			continue;
		/* Set notification bit in the pgste of the pte */
		entry = *ptep;
		if ((pte_val(entry) & (_PAGE_INVALID | _PAGE_RO)) == 0) {
			pgste = pgste_get_lock(ptep);
			pgste_val(pgste) |= RCP_IN_BIT;
			pgste_set_unlock(ptep, pgste);
			start += PAGE_SIZE;
			len -= PAGE_SIZE;
		}
		spin_unlock(ptl);
	}
	up_read(&gmap->mm->mmap_sem);
	return rc;
}
EXPORT_SYMBOL_GPL(gmap_ipte_notify);
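
/*
 * Usage sketch (hypothetical consumer; my_notifier_call and my_notifier
 * are made-up names): a user of the invalidation notification registers
 * a gmap_notifier whose notifier_call() is invoked with the guest address
 * whenever a marked pte is invalidated, and marks the interesting range
 * with gmap_ipte_notify():
 *
 *	static void my_notifier_call(struct gmap *gmap, unsigned long gaddr)
 *	{
 *		... react to the invalidation ...
 *	}
 *
 *	static struct gmap_notifier my_notifier = {
 *		.notifier_call = my_notifier_call,
 *	};
 *
 *	gmap_register_ipte_notifier(&my_notifier);
 *	rc = gmap_ipte_notify(gmap, gaddr, PAGE_SIZE);
 */
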
/**
 * gmap_do_ipte_notify - call all invalidation callbacks for a specific pte.
 * @mm: pointer to the process mm_struct
 * @addr: virtual address in the process address space
 * @pte: pointer to the page table entry
 *
 * This function is assumed to be called with the page table lock held
 * for the pte to notify.
 */
void gmap_do_ipte_notify(struct mm_struct *mm, unsigned long addr, pte_t *pte)
{
	unsigned long segment_offset;
	struct gmap_notifier *nb;
	struct gmap_pgtable *mp;
	struct gmap_rmap *rmap;
	struct page *page;

	segment_offset = ((unsigned long) pte) & (255 * sizeof(pte_t));
	segment_offset = segment_offset * (4096 / sizeof(pte_t));
	page = pfn_to_page(__pa(pte) >> PAGE_SHIFT);
	mp = (struct gmap_pgtable *) page->index;
	spin_lock(&gmap_notifier_lock);
	list_for_each_entry(rmap, &mp->mapper, list) {
		list_for_each_entry(nb, &gmap_notifier_list, list)
			nb->notifier_call(rmap->gmap,
					  rmap->vmaddr + segment_offset);
	}
	spin_unlock(&gmap_notifier_lock);
}

static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm,
						    unsigned long vmaddr)
{
	struct page *page;
	unsigned long *table;
	struct gmap_pgtable *mp;

	page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
	if (!page)
		return NULL;
	mp = kmalloc(sizeof(*mp), GFP_KERNEL|__GFP_REPEAT);
	if (!mp) {
		__free_page(page);
		return NULL;
	}
	pgtable_page_ctor(page);
	mp->vmaddr = vmaddr & PMD_MASK;
	INIT_LIST_HEAD(&mp->mapper);
	page->index = (unsigned long) mp;
	atomic_set(&page->_mapcount, 3);
	table = (unsigned long *) page_to_phys(page);
	clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE/2);
	clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2);
	return table;
}

static inline void page_table_free_pgste(unsigned long *table)
{
	struct page *page;
	struct gmap_pgtable *mp;

	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	mp = (struct gmap_pgtable *) page->index;
	BUG_ON(!list_empty(&mp->mapper));
	pgtable_page_dtor(page);
	atomic_set(&page->_mapcount, -1);
	kfree(mp);
	__free_page(page);
}

#else /* CONFIG_PGSTE */

static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm,
						    unsigned long vmaddr)
{
	return NULL;
}

static inline void page_table_free_pgste(unsigned long *table)
{
}

static inline void gmap_disconnect_pgtable(struct mm_struct *mm,
					   unsigned long *table)
{
}

#endif /* CONFIG_PGSTE */

static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
{
	unsigned int old, new;

	do {
		old = atomic_read(v);
		new = old ^ bits;
	} while (atomic_cmpxchg(v, old, new) != old);
	return new;
}

/*
 * page table entry allocation/free routines.
 */
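
/*
 * Note on the fragment scheme implemented below (derived from the code,
 * wording added for clarity): a page table needs only part of a 4K page
 * (two 2K fragments on 64 bit, four 1K fragments on 31 bit, matching
 * FRAG_MASK), so partially used pages are kept on mm->context.pgtable_list.
 * page->_mapcount is used as a bit mask: the low nibble tracks which
 * fragments are allocated, the high nibble marks fragments that sit in a
 * pending RCU free (see page_table_free_rcu()). page_table_alloc() ors the
 * two nibbles together, so a fragment that is still waiting for its grace
 * period is not handed out again.
 */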
unsigned long *page_table_alloc(struct mm_struct *mm, unsigned long vmaddr)
{
	unsigned long *uninitialized_var(table);
	struct page *uninitialized_var(page);
	unsigned int mask, bit;

	if (mm_has_pgste(mm))
		return page_table_alloc_pgste(mm, vmaddr);
	/* Allocate fragments of a 4K page as 1K/2K page table */
	spin_lock_bh(&mm->context.list_lock);
	mask = FRAG_MASK;
	if (!list_empty(&mm->context.pgtable_list)) {
		page = list_first_entry(&mm->context.pgtable_list,
					struct page, lru);
		table = (unsigned long *) page_to_phys(page);
		mask = atomic_read(&page->_mapcount);
		mask = mask | (mask >> 4);
	}
	if ((mask & FRAG_MASK) == FRAG_MASK) {
		spin_unlock_bh(&mm->context.list_lock);
		page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
		if (!page)
			return NULL;
		pgtable_page_ctor(page);
		atomic_set(&page->_mapcount, 1);
		table = (unsigned long *) page_to_phys(page);
		clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE);
		spin_lock_bh(&mm->context.list_lock);
		list_add(&page->lru, &mm->context.pgtable_list);
	} else {
		for (bit = 1; mask & bit; bit <<= 1)
			table += PTRS_PER_PTE;
		mask = atomic_xor_bits(&page->_mapcount, bit);
		if ((mask & FRAG_MASK) == FRAG_MASK)
			list_del(&page->lru);
	}
	spin_unlock_bh(&mm->context.list_lock);
	return table;
}

void page_table_free(struct mm_struct *mm, unsigned long *table)
{
	struct page *page;
	unsigned int bit, mask;

	if (mm_has_pgste(mm)) {
		gmap_disconnect_pgtable(mm, table);
		return page_table_free_pgste(table);
	}
	/* Free 1K/2K page table fragment of a 4K page */
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	bit = 1 << ((__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)));
	spin_lock_bh(&mm->context.list_lock);
	if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
		list_del(&page->lru);
	mask = atomic_xor_bits(&page->_mapcount, bit);
	if (mask & FRAG_MASK)
		list_add(&page->lru, &mm->context.pgtable_list);
	spin_unlock_bh(&mm->context.list_lock);
	if (mask == 0) {
		pgtable_page_dtor(page);
		atomic_set(&page->_mapcount, -1);
		__free_page(page);
	}
}

static void __page_table_free_rcu(void *table, unsigned bit)
{
	struct page *page;

	if (bit == FRAG_MASK)
		return page_table_free_pgste(table);
	/* Free 1K/2K page table fragment of a 4K page */
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	if (atomic_xor_bits(&page->_mapcount, bit) == 0) {
		pgtable_page_dtor(page);
		atomic_set(&page->_mapcount, -1);
		__free_page(page);
	}
}

void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table)
{
	struct mm_struct *mm;
	struct page *page;
	unsigned int bit, mask;

	mm = tlb->mm;
	if (mm_has_pgste(mm)) {
		gmap_disconnect_pgtable(mm, table);
		table = (unsigned long *) (__pa(table) | FRAG_MASK);
		tlb_remove_table(tlb, table);
		return;
	}
	bit = 1 << ((__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)));
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	spin_lock_bh(&mm->context.list_lock);
	if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
		list_del(&page->lru);
	mask = atomic_xor_bits(&page->_mapcount, bit | (bit << 4));
	if (mask & FRAG_MASK)
		list_add_tail(&page->lru, &mm->context.pgtable_list);
	spin_unlock_bh(&mm->context.list_lock);
	table = (unsigned long *) (__pa(table) | (bit << 4));
	tlb_remove_table(tlb, table);
}
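
/*
 * Note on the tagging used above (derived from the code, wording added
 * for clarity): page table fragments are at least 1K aligned, so
 * page_table_free_rcu() can smuggle a type tag into the low bits of the
 * table address before handing it to tlb_remove_table(): (bit << 4) for
 * an ordinary fragment, FRAG_MASK for a pgste page table, and 0 (no tag)
 * for full crst tables, which reach tlb_remove_table() 4K aligned
 * (presumably via the pgd/pud/pmd free paths in asm/tlb.h).
 * __tlb_remove_table() below strips the tag again and picks the matching
 * free routine.
 */
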
void __tlb_remove_table(void *_table)
{
	const unsigned long mask = (FRAG_MASK << 4) | FRAG_MASK;
	void *table = (void *)((unsigned long) _table & ~mask);
	unsigned type = (unsigned long) _table & mask;

	if (type)
		__page_table_free_rcu(table, type);
	else
		free_pages((unsigned long) table, ALLOC_ORDER);
}

static void tlb_remove_table_smp_sync(void *arg)
{
	/* Simply deliver the interrupt */
}

static void tlb_remove_table_one(void *table)
{
	/*
	 * This isn't an RCU grace period and hence the page-tables cannot be
	 * assumed to be actually RCU-freed.
	 *
	 * It is however sufficient for software page-table walkers that rely
	 * on IRQ disabling. See the comment near struct mmu_table_batch.
	 */
	smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
	__tlb_remove_table(table);
}

static void tlb_remove_table_rcu(struct rcu_head *head)
{
	struct mmu_table_batch *batch;
	int i;

	batch = container_of(head, struct mmu_table_batch, rcu);

	for (i = 0; i < batch->nr; i++)
		__tlb_remove_table(batch->tables[i]);

	free_page((unsigned long)batch);
}

void tlb_table_flush(struct mmu_gather *tlb)
{
	struct mmu_table_batch **batch = &tlb->batch;

	if (*batch) {
		__tlb_flush_mm(tlb->mm);
		call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
		*batch = NULL;
	}
}

void tlb_remove_table(struct mmu_gather *tlb, void *table)
{
	struct mmu_table_batch **batch = &tlb->batch;

	if (*batch == NULL) {
		*batch = (struct mmu_table_batch *)
			__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
		if (*batch == NULL) {
			__tlb_flush_mm(tlb->mm);
			tlb_remove_table_one(table);
			return;
		}
		(*batch)->nr = 0;
	}
	(*batch)->tables[(*batch)->nr++] = table;
	if ((*batch)->nr == MAX_TABLE_BATCH)
		tlb_table_flush(tlb);
}
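
/*
 * Sketch of how the batching above is expected to be driven (assumption
 * about the generic and arch tlb code, not something defined in this
 * file): page table pages are queued via tlb_remove_table() while a
 * range is being torn down, and tlb_table_flush() runs when the
 * mmu_gather is finished, roughly:
 *
 *	tlb_gather_mmu(&tlb, mm, ...);
 *	... unmap and free_pgtables(), ending up in pte_free_tlb() ...
 *	tlb_finish_mmu(&tlb, ...);	- flushes the pending table batch
 *
 * If no batch page can be allocated, tlb_remove_table_one() falls back to
 * an IPI broadcast so that gup-fast style walkers running with IRQs
 * disabled cannot still see the table, then frees it immediately.
 */
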
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
void thp_split_vma(struct vm_area_struct *vma)
{
	unsigned long addr;
	struct page *page;

	for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) {
		page = follow_page(vma, addr, FOLL_SPLIT);
	}
}

void thp_split_mm(struct mm_struct *mm)
{
	struct vm_area_struct *vma = mm->mmap;

	while (vma != NULL) {
		thp_split_vma(vma);
		vma->vm_flags &= ~VM_HUGEPAGE;
		vma->vm_flags |= VM_NOHUGEPAGE;
		vma = vma->vm_next;
	}
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

/*
 * switch on pgstes for the current userspace process (for kvm)
 */
int s390_enable_sie(void)
{
	struct task_struct *tsk = current;
	struct mm_struct *mm, *old_mm;

	/* Do we have switched amode? If not, we cannot do sie */
	if (s390_user_mode == HOME_SPACE_MODE)
		return -EINVAL;

	/* Do we have pgstes? if yes, we are done */
	if (mm_has_pgste(tsk->mm))
		return 0;

	/* let's check if we are allowed to replace the mm */
	task_lock(tsk);
	if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
#ifdef CONFIG_AIO
	    !hlist_empty(&tsk->mm->ioctx_list) ||
#endif
	    tsk->mm != tsk->active_mm) {
		task_unlock(tsk);
		return -EINVAL;
	}
	task_unlock(tsk);

	/* we copy the mm and let dup_mm create the page tables with pgstes */
	tsk->mm->context.alloc_pgste = 1;
	/* make sure that both mms have a correct rss state */
	sync_mm_rss(tsk->mm);
	mm = dup_mm(tsk);
	tsk->mm->context.alloc_pgste = 0;
	if (!mm)
		return -ENOMEM;

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	/* split thp mappings and disable thp for future mappings */
	thp_split_mm(mm);
	mm->def_flags |= VM_NOHUGEPAGE;
#endif

	/* Now let's check again if something happened */
	task_lock(tsk);
	if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
#ifdef CONFIG_AIO
	    !hlist_empty(&tsk->mm->ioctx_list) ||
#endif
	    tsk->mm != tsk->active_mm) {
		mmput(mm);
		task_unlock(tsk);
		return -EINVAL;
	}

	/* ok, we are alone. No ptrace, no threads, etc. */
	old_mm = tsk->mm;
	tsk->mm = tsk->active_mm = mm;
	preempt_disable();
	update_mm(mm, tsk);
	atomic_inc(&mm->context.attach_count);
	atomic_dec(&old_mm->context.attach_count);
	cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm));
	preempt_enable();
	task_unlock(tsk);
	mmput(old_mm);
	return 0;
}
EXPORT_SYMBOL_GPL(s390_enable_sie);
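
/*
 * Usage sketch (assumption about the caller): a hypervisor module is
 * expected to call s390_enable_sie() once, before it starts building SIE
 * control blocks for the current process, and to bail out if the mm
 * cannot be replaced:
 *
 *	rc = s390_enable_sie();
 *	if (rc)
 *		return rc;
 */
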
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address,
			   pmd_t *pmdp)
{
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	/* No need to flush TLB
	 * On s390 reference bits are in storage key and never in TLB */
	return pmdp_test_and_clear_young(vma, address, pmdp);
}

int pmdp_set_access_flags(struct vm_area_struct *vma,
			  unsigned long address, pmd_t *pmdp,
			  pmd_t entry, int dirty)
{
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);

	if (pmd_same(*pmdp, entry))
		return 0;
	pmdp_invalidate(vma, address, pmdp);
	set_pmd_at(vma->vm_mm, address, pmdp, entry);
	return 1;
}

static void pmdp_splitting_flush_sync(void *arg)
{
	/* Simply deliver the interrupt */
}

void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
			  pmd_t *pmdp)
{
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	if (!test_and_set_bit(_SEGMENT_ENTRY_SPLIT_BIT,
			      (unsigned long *) pmdp)) {
		/* need to serialize against gup-fast (IRQ disabled) */
		smp_call_function(pmdp_splitting_flush_sync, NULL, 1);
	}
}

void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable)
{
	struct list_head *lh = (struct list_head *) pgtable;

	assert_spin_locked(&mm->page_table_lock);

	/* FIFO */
	if (!mm->pmd_huge_pte)
		INIT_LIST_HEAD(lh);
	else
		list_add(lh, (struct list_head *) mm->pmd_huge_pte);
	mm->pmd_huge_pte = pgtable;
}

pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm)
{
	struct list_head *lh;
	pgtable_t pgtable;
	pte_t *ptep;

	assert_spin_locked(&mm->page_table_lock);

	/* FIFO */
	pgtable = mm->pmd_huge_pte;
	lh = (struct list_head *) pgtable;
	if (list_empty(lh))
		mm->pmd_huge_pte = NULL;
	else {
		mm->pmd_huge_pte = (pgtable_t) lh->next;
		list_del(lh);
	}
	ptep = (pte_t *) pgtable;
	pte_val(*ptep) = _PAGE_TYPE_EMPTY;
	ptep++;
	pte_val(*ptep) = _PAGE_TYPE_EMPTY;
	return pgtable;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */