/*
 * Copyright IBM Corp. 2007, 2011
 * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
 */

#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/quicklist.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>

#ifndef CONFIG_64BIT
#define ALLOC_ORDER	1
#define FRAG_MASK	0x0f
#else
#define ALLOC_ORDER	2
#define FRAG_MASK	0x03
#endif


unsigned long *crst_table_alloc(struct mm_struct *mm)
{
	struct page *page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);

	if (!page)
		return NULL;
	return (unsigned long *) page_to_phys(page);
}

void crst_table_free(struct mm_struct *mm, unsigned long *table)
{
	free_pages((unsigned long) table, ALLOC_ORDER);
}

#ifdef CONFIG_64BIT
int crst_table_upgrade(struct mm_struct *mm, unsigned long limit)
{
	unsigned long *table, *pgd;
	unsigned long entry;

	BUG_ON(limit > (1UL << 53));
repeat:
	table = crst_table_alloc(mm);
	if (!table)
		return -ENOMEM;
	spin_lock_bh(&mm->page_table_lock);
	if (mm->context.asce_limit < limit) {
		pgd = (unsigned long *) mm->pgd;
		if (mm->context.asce_limit <= (1UL << 31)) {
			entry = _REGION3_ENTRY_EMPTY;
			mm->context.asce_limit = 1UL << 42;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION3;
		} else {
			entry = _REGION2_ENTRY_EMPTY;
			mm->context.asce_limit = 1UL << 53;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION2;
		}
		crst_table_init(table, entry);
		pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd);
		mm->pgd = (pgd_t *) table;
		mm->task_size = mm->context.asce_limit;
		table = NULL;
	}
	spin_unlock_bh(&mm->page_table_lock);
	if (table)
		crst_table_free(mm, table);
	if (mm->context.asce_limit < limit)
		goto repeat;
	return 0;
}

void crst_table_downgrade(struct mm_struct *mm, unsigned long limit)
{
	pgd_t *pgd;

	while (mm->context.asce_limit > limit) {
		pgd = mm->pgd;
		switch (pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) {
		case _REGION_ENTRY_TYPE_R2:
			mm->context.asce_limit = 1UL << 42;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION3;
			break;
		case _REGION_ENTRY_TYPE_R3:
			mm->context.asce_limit = 1UL << 31;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_SEGMENT;
			break;
		default:
			BUG();
		}
		mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
		mm->task_size = mm->context.asce_limit;
		crst_table_free(mm, (unsigned long *) pgd);
	}
}
#endif

#ifdef CONFIG_PGSTE

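/*
 * Guest address space (gmap) support. A rough usage sketch, not taken
 * from this file and only meant as an illustration: a KVM-like caller
 * would typically do
 *
 *	gmap = gmap_alloc(current->mm);
 *	gmap_map_segment(gmap, from, to, len);
 *	gmap_enable(gmap);
 *	... run the guest, resolving guest faults with gmap_fault() ...
 *	gmap_disable(gmap);
 *	gmap_free(gmap);
 *
 * All addresses passed to gmap_map_segment()/gmap_unmap_segment() must
 * be PMD_SIZE aligned.
 */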
/**
 * gmap_alloc - allocate a guest address space
 * @mm: pointer to the parent mm_struct
 *
 * Returns a guest address space structure.
 */
struct gmap *gmap_alloc(struct mm_struct *mm)
{
	struct gmap *gmap;
	struct page *page;
	unsigned long *table;

	gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL);
	if (!gmap)
		goto out;
	INIT_LIST_HEAD(&gmap->crst_list);
	gmap->mm = mm;
	page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
	if (!page)
		goto out_free;
	list_add(&page->lru, &gmap->crst_list);
	table = (unsigned long *) page_to_phys(page);
	crst_table_init(table, _REGION1_ENTRY_EMPTY);
	gmap->table = table;
	gmap->asce = _ASCE_TYPE_REGION1 | _ASCE_TABLE_LENGTH |
		     _ASCE_USER_BITS | __pa(table);
	list_add(&gmap->list, &mm->context.gmap_list);
	return gmap;

out_free:
	kfree(gmap);
out:
	return NULL;
}
EXPORT_SYMBOL_GPL(gmap_alloc);

static int gmap_unlink_segment(struct gmap *gmap, unsigned long *table)
{
	struct gmap_pgtable *mp;
	struct gmap_rmap *rmap;
	struct page *page;

	if (*table & _SEGMENT_ENTRY_INV)
		return 0;
	page = pfn_to_page(*table >> PAGE_SHIFT);
	mp = (struct gmap_pgtable *) page->index;
	list_for_each_entry(rmap, &mp->mapper, list) {
		if (rmap->entry != table)
			continue;
		list_del(&rmap->list);
		kfree(rmap);
		break;
	}
	*table = _SEGMENT_ENTRY_INV | _SEGMENT_ENTRY_RO | mp->vmaddr;
	return 1;
}

static void gmap_flush_tlb(struct gmap *gmap)
{
	if (MACHINE_HAS_IDTE)
		__tlb_flush_idte((unsigned long) gmap->table |
				 _ASCE_TYPE_REGION1);
	else
		__tlb_flush_global();
}

/**
 * gmap_free - free a guest address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_free(struct gmap *gmap)
{
	struct page *page, *next;
	unsigned long *table;
	int i;

	/* Flush tlb. */
	if (MACHINE_HAS_IDTE)
		__tlb_flush_idte((unsigned long) gmap->table |
				 _ASCE_TYPE_REGION1);
	else
		__tlb_flush_global();

	/* Free all segment & region tables. */
	down_read(&gmap->mm->mmap_sem);
	spin_lock(&gmap->mm->page_table_lock);
	list_for_each_entry_safe(page, next, &gmap->crst_list, lru) {
		table = (unsigned long *) page_to_phys(page);
		if ((*table & _REGION_ENTRY_TYPE_MASK) == 0)
			/* Remove gmap rmap structures for segment table. */
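			/*
			 * A zero region/segment type in the first entry
			 * identifies a segment table; only segment table
			 * entries can carry rmaps to parent page tables,
			 * so only those need to be unlinked here.
			 */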
			for (i = 0; i < PTRS_PER_PMD; i++, table++)
				gmap_unlink_segment(gmap, table);
		__free_pages(page, ALLOC_ORDER);
	}
	spin_unlock(&gmap->mm->page_table_lock);
	up_read(&gmap->mm->mmap_sem);
	list_del(&gmap->list);
	kfree(gmap);
}
EXPORT_SYMBOL_GPL(gmap_free);

/**
 * gmap_enable - switch primary space to the guest address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_enable(struct gmap *gmap)
{
	S390_lowcore.gmap = (unsigned long) gmap;
}
EXPORT_SYMBOL_GPL(gmap_enable);

/**
 * gmap_disable - switch back to the standard primary address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_disable(struct gmap *gmap)
{
	S390_lowcore.gmap = 0UL;
}
EXPORT_SYMBOL_GPL(gmap_disable);

/*
 * gmap_alloc_table is assumed to be called with mmap_sem held
 */
static int gmap_alloc_table(struct gmap *gmap,
			    unsigned long *table, unsigned long init)
{
	struct page *page;
	unsigned long *new;

	/* since we don't free the gmap table until gmap_free we can unlock */
	spin_unlock(&gmap->mm->page_table_lock);
	page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
	spin_lock(&gmap->mm->page_table_lock);
	if (!page)
		return -ENOMEM;
	new = (unsigned long *) page_to_phys(page);
	crst_table_init(new, init);
	if (*table & _REGION_ENTRY_INV) {
		list_add(&page->lru, &gmap->crst_list);
		*table = (unsigned long) new | _REGION_ENTRY_LENGTH |
			(*table & _REGION_ENTRY_TYPE_MASK);
	} else
		__free_pages(page, ALLOC_ORDER);
	return 0;
}

/**
 * gmap_unmap_segment - unmap segment from the guest address space
 * @gmap: pointer to the guest address space structure
 * @to: address in the guest address space
 * @len: length of the memory area to unmap
 *
 * Returns 0 if the unmap succeeded, -EINVAL if not.
 */
int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len)
{
	unsigned long *table;
	unsigned long off;
	int flush;

	if ((to | len) & (PMD_SIZE - 1))
		return -EINVAL;
	if (len == 0 || to + len < to)
		return -EINVAL;

	flush = 0;
	down_read(&gmap->mm->mmap_sem);
	spin_lock(&gmap->mm->page_table_lock);
	for (off = 0; off < len; off += PMD_SIZE) {
		/* Walk the guest addr space page table */
		table = gmap->table + (((to + off) >> 53) & 0x7ff);
		if (*table & _REGION_ENTRY_INV)
			goto out;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 42) & 0x7ff);
		if (*table & _REGION_ENTRY_INV)
			goto out;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 31) & 0x7ff);
		if (*table & _REGION_ENTRY_INV)
			goto out;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 20) & 0x7ff);

		/* Clear segment table entry in guest address space. */
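		/*
		 * gmap_unlink_segment() drops the rmap of a previously
		 * linked entry and returns 1, so that the TLB is flushed
		 * once after the loop instead of once per entry.
		 */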
		flush |= gmap_unlink_segment(gmap, table);
		*table = _SEGMENT_ENTRY_INV;
	}
out:
	spin_unlock(&gmap->mm->page_table_lock);
	up_read(&gmap->mm->mmap_sem);
	if (flush)
		gmap_flush_tlb(gmap);
	return 0;
}
EXPORT_SYMBOL_GPL(gmap_unmap_segment);

/**
 * gmap_map_segment - map a segment to the guest address space
 * @gmap: pointer to the guest address space structure
 * @from: source address in the parent address space
 * @to: target address in the guest address space
 * @len: length of the memory area to map
 *
 * Returns 0 if the map succeeded, -EINVAL or -ENOMEM if not.
 */
int gmap_map_segment(struct gmap *gmap, unsigned long from,
		     unsigned long to, unsigned long len)
{
	unsigned long *table;
	unsigned long off;
	int flush;

	if ((from | to | len) & (PMD_SIZE - 1))
		return -EINVAL;
	if (len == 0 || from + len > PGDIR_SIZE ||
	    from + len < from || to + len < to)
		return -EINVAL;

	flush = 0;
	down_read(&gmap->mm->mmap_sem);
	spin_lock(&gmap->mm->page_table_lock);
	for (off = 0; off < len; off += PMD_SIZE) {
		/* Walk the gmap address space page table */
		table = gmap->table + (((to + off) >> 53) & 0x7ff);
		if ((*table & _REGION_ENTRY_INV) &&
		    gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY))
			goto out_unmap;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 42) & 0x7ff);
		if ((*table & _REGION_ENTRY_INV) &&
		    gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY))
			goto out_unmap;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 31) & 0x7ff);
		if ((*table & _REGION_ENTRY_INV) &&
		    gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY))
			goto out_unmap;
		table = (unsigned long *) (*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 20) & 0x7ff);

		/* Store 'from' address in an invalid segment table entry. */
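		/*
		 * The mapping is established lazily: the parent 'from'
		 * address is parked in an invalid entry marked with the
		 * RO bit, and the real page table is only hooked in by
		 * __gmap_fault() on the first access.
		 */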
		flush |= gmap_unlink_segment(gmap, table);
		*table = _SEGMENT_ENTRY_INV | _SEGMENT_ENTRY_RO | (from + off);
	}
	spin_unlock(&gmap->mm->page_table_lock);
	up_read(&gmap->mm->mmap_sem);
	if (flush)
		gmap_flush_tlb(gmap);
	return 0;

out_unmap:
	spin_unlock(&gmap->mm->page_table_lock);
	up_read(&gmap->mm->mmap_sem);
	gmap_unmap_segment(gmap, to, len);
	return -ENOMEM;
}
EXPORT_SYMBOL_GPL(gmap_map_segment);

/*
 * this function is assumed to be called with mmap_sem held
 */
unsigned long __gmap_fault(unsigned long address, struct gmap *gmap)
{
	unsigned long *table, vmaddr, segment;
	struct mm_struct *mm;
	struct gmap_pgtable *mp;
	struct gmap_rmap *rmap;
	struct vm_area_struct *vma;
	struct page *page;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	current->thread.gmap_addr = address;
	mm = gmap->mm;
	/* Walk the gmap address space page table */
	table = gmap->table + ((address >> 53) & 0x7ff);
	if (unlikely(*table & _REGION_ENTRY_INV))
		return -EFAULT;
	table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	table = table + ((address >> 42) & 0x7ff);
	if (unlikely(*table & _REGION_ENTRY_INV))
		return -EFAULT;
	table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	table = table + ((address >> 31) & 0x7ff);
	if (unlikely(*table & _REGION_ENTRY_INV))
		return -EFAULT;
	table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	table = table + ((address >> 20) & 0x7ff);

	/* Convert the gmap address to an mm address. */
	segment = *table;
	if (likely(!(segment & _SEGMENT_ENTRY_INV))) {
		page = pfn_to_page(segment >> PAGE_SHIFT);
		mp = (struct gmap_pgtable *) page->index;
		return mp->vmaddr | (address & ~PMD_MASK);
	} else if (segment & _SEGMENT_ENTRY_RO) {
		vmaddr = segment & _SEGMENT_ENTRY_ORIGIN;
		vma = find_vma(mm, vmaddr);
		if (!vma || vma->vm_start > vmaddr)
			return -EFAULT;

		/* Walk the parent mm page table */
		pgd = pgd_offset(mm, vmaddr);
		pud = pud_alloc(mm, pgd, vmaddr);
		if (!pud)
			return -ENOMEM;
		pmd = pmd_alloc(mm, pud, vmaddr);
		if (!pmd)
			return -ENOMEM;
		if (!pmd_present(*pmd) &&
		    __pte_alloc(mm, vma, pmd, vmaddr))
			return -ENOMEM;
		/* pmd now points to a valid segment table entry. */
		rmap = kmalloc(sizeof(*rmap), GFP_KERNEL|__GFP_REPEAT);
		if (!rmap)
			return -ENOMEM;
		/* Link gmap segment table entry location to page table. */
		page = pmd_page(*pmd);
		mp = (struct gmap_pgtable *) page->index;
		rmap->entry = table;
		spin_lock(&mm->page_table_lock);
		list_add(&rmap->list, &mp->mapper);
		spin_unlock(&mm->page_table_lock);
		/* Set gmap segment table entry to page table. */
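		/*
		 * The gmap segment entry now points at the same page
		 * table as the parent pmd, so guest and parent share the
		 * ptes; the rmap added above lets gmap_unmap_notifier()
		 * detach the entry again when the parent table goes away.
		 */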
		*table = pmd_val(*pmd) & PAGE_MASK;
		return vmaddr | (address & ~PMD_MASK);
	}
	return -EFAULT;
}

unsigned long gmap_fault(unsigned long address, struct gmap *gmap)
{
	unsigned long rc;

	down_read(&gmap->mm->mmap_sem);
	rc = __gmap_fault(address, gmap);
	up_read(&gmap->mm->mmap_sem);

	return rc;
}
EXPORT_SYMBOL_GPL(gmap_fault);

void gmap_discard(unsigned long from, unsigned long to, struct gmap *gmap)
{
	unsigned long *table, address, size;
	struct vm_area_struct *vma;
	struct gmap_pgtable *mp;
	struct page *page;

	down_read(&gmap->mm->mmap_sem);
	address = from;
	while (address < to) {
		/* Walk the gmap address space page table */
		table = gmap->table + ((address >> 53) & 0x7ff);
		if (unlikely(*table & _REGION_ENTRY_INV)) {
			address = (address + PMD_SIZE) & PMD_MASK;
			continue;
		}
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + ((address >> 42) & 0x7ff);
		if (unlikely(*table & _REGION_ENTRY_INV)) {
			address = (address + PMD_SIZE) & PMD_MASK;
			continue;
		}
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + ((address >> 31) & 0x7ff);
		if (unlikely(*table & _REGION_ENTRY_INV)) {
			address = (address + PMD_SIZE) & PMD_MASK;
			continue;
		}
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + ((address >> 20) & 0x7ff);
		if (unlikely(*table & _SEGMENT_ENTRY_INV)) {
			address = (address + PMD_SIZE) & PMD_MASK;
			continue;
		}
		page = pfn_to_page(*table >> PAGE_SHIFT);
		mp = (struct gmap_pgtable *) page->index;
		vma = find_vma(gmap->mm, mp->vmaddr);
		size = min(to - address, PMD_SIZE - (address & ~PMD_MASK));
		zap_page_range(vma, mp->vmaddr | (address & ~PMD_MASK),
			       size, NULL);
		address = (address + PMD_SIZE) & PMD_MASK;
	}
	up_read(&gmap->mm->mmap_sem);
}
EXPORT_SYMBOL_GPL(gmap_discard);

void gmap_unmap_notifier(struct mm_struct *mm, unsigned long *table)
{
	struct gmap_rmap *rmap, *next;
	struct gmap_pgtable *mp;
	struct page *page;
	int flush;

	flush = 0;
	spin_lock(&mm->page_table_lock);
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	mp = (struct gmap_pgtable *) page->index;
	list_for_each_entry_safe(rmap, next, &mp->mapper, list) {
		*rmap->entry =
			_SEGMENT_ENTRY_INV | _SEGMENT_ENTRY_RO | mp->vmaddr;
		list_del(&rmap->list);
		kfree(rmap);
		flush = 1;
	}
	spin_unlock(&mm->page_table_lock);
	if (flush)
		__tlb_flush_global();
}

static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm,
						    unsigned long vmaddr)
{
	struct page *page;
	unsigned long *table;
	struct gmap_pgtable *mp;

	page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
	if (!page)
		return NULL;
	mp = kmalloc(sizeof(*mp), GFP_KERNEL|__GFP_REPEAT);
	if (!mp) {
		__free_page(page);
		return NULL;
	}
	pgtable_page_ctor(page);
	mp->vmaddr = vmaddr & PMD_MASK;
	INIT_LIST_HEAD(&mp->mapper);
	page->index = (unsigned long) mp;
	atomic_set(&page->_mapcount, 3);
	table = (unsigned long *) page_to_phys(page);
	clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE/2);
	clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2);
	return table;
}

static inline void page_table_free_pgste(unsigned long *table)
{
	struct page *page;
	struct gmap_pgtable *mp;

	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
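	/*
	 * page->index was set up by page_table_alloc_pgste() and points
	 * to the gmap_pgtable descriptor; its mapper list must be empty
	 * here, i.e. all gmap references were removed via
	 * gmap_unmap_notifier() before the page table is freed.
	 */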
	mp = (struct gmap_pgtable *) page->index;
	BUG_ON(!list_empty(&mp->mapper));
	pgtable_page_dtor(page);
	atomic_set(&page->_mapcount, -1);
	kfree(mp);
	__free_page(page);
}

#else /* CONFIG_PGSTE */

static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm,
						    unsigned long vmaddr)
{
	return NULL;
}

static inline void page_table_free_pgste(unsigned long *table)
{
}

static inline void gmap_unmap_notifier(struct mm_struct *mm,
				       unsigned long *table)
{
}

#endif /* CONFIG_PGSTE */

static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
{
	unsigned int old, new;

	do {
		old = atomic_read(v);
		new = old ^ bits;
	} while (atomic_cmpxchg(v, old, new) != old);
	return new;
}

/*
 * page table entry allocation/free routines.
 */
unsigned long *page_table_alloc(struct mm_struct *mm, unsigned long vmaddr)
{
	unsigned long *uninitialized_var(table);
	struct page *uninitialized_var(page);
	unsigned int mask, bit;

	if (mm_has_pgste(mm))
		return page_table_alloc_pgste(mm, vmaddr);
	/* Allocate fragments of a 4K page as 1K/2K page table */
	spin_lock_bh(&mm->context.list_lock);
	mask = FRAG_MASK;
	if (!list_empty(&mm->context.pgtable_list)) {
		page = list_first_entry(&mm->context.pgtable_list,
					struct page, lru);
		table = (unsigned long *) page_to_phys(page);
		mask = atomic_read(&page->_mapcount);
		mask = mask | (mask >> 4);
	}
	if ((mask & FRAG_MASK) == FRAG_MASK) {
		spin_unlock_bh(&mm->context.list_lock);
		page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
		if (!page)
			return NULL;
		pgtable_page_ctor(page);
		atomic_set(&page->_mapcount, 1);
		table = (unsigned long *) page_to_phys(page);
		clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE);
		spin_lock_bh(&mm->context.list_lock);
		list_add(&page->lru, &mm->context.pgtable_list);
	} else {
		for (bit = 1; mask & bit; bit <<= 1)
			table += PTRS_PER_PTE;
		mask = atomic_xor_bits(&page->_mapcount, bit);
		if ((mask & FRAG_MASK) == FRAG_MASK)
			list_del(&page->lru);
	}
	spin_unlock_bh(&mm->context.list_lock);
	return table;
}

void page_table_free(struct mm_struct *mm, unsigned long *table)
{
	struct page *page;
	unsigned int bit, mask;

	if (mm_has_pgste(mm)) {
		gmap_unmap_notifier(mm, table);
		return page_table_free_pgste(table);
	}
	/* Free 1K/2K page table fragment of a 4K page */
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	bit = 1 << ((__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)));
	spin_lock_bh(&mm->context.list_lock);
	if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
		list_del(&page->lru);
	mask = atomic_xor_bits(&page->_mapcount, bit);
	if (mask & FRAG_MASK)
		list_add(&page->lru, &mm->context.pgtable_list);
	spin_unlock_bh(&mm->context.list_lock);
	if (mask == 0) {
		pgtable_page_dtor(page);
		atomic_set(&page->_mapcount, -1);
		__free_page(page);
	}
}

static void __page_table_free_rcu(void *table, unsigned bit)
{
	struct page *page;

	if (bit == FRAG_MASK)
		return page_table_free_pgste(table);
	/* Free 1K/2K page table fragment of a 4K page */
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	if (atomic_xor_bits(&page->_mapcount, bit) == 0) {
		pgtable_page_dtor(page);
		atomic_set(&page->_mapcount, -1);
		__free_page(page);
	}
}
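
/*
 * Tables queued for RCU freeing encode their type in the low bits of
 * the page aligned table address: 0 for a full crst table freed with
 * free_pages(), FRAG_MASK for a pgste page table page, and the
 * fragment bit shifted left by 4 for a 1K/2K page table fragment.
 */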

void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table)
{
	struct mm_struct *mm;
	struct page *page;
	unsigned int bit, mask;

	mm = tlb->mm;
	if (mm_has_pgste(mm)) {
		gmap_unmap_notifier(mm, table);
		table = (unsigned long *) (__pa(table) | FRAG_MASK);
		tlb_remove_table(tlb, table);
		return;
	}
	bit = 1 << ((__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)));
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	spin_lock_bh(&mm->context.list_lock);
	if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
		list_del(&page->lru);
	mask = atomic_xor_bits(&page->_mapcount, bit | (bit << 4));
	if (mask & FRAG_MASK)
		list_add_tail(&page->lru, &mm->context.pgtable_list);
	spin_unlock_bh(&mm->context.list_lock);
	table = (unsigned long *) (__pa(table) | (bit << 4));
	tlb_remove_table(tlb, table);
}

void __tlb_remove_table(void *_table)
{
	const unsigned long mask = (FRAG_MASK << 4) | FRAG_MASK;
	void *table = (void *)((unsigned long) _table & ~mask);
	unsigned type = (unsigned long) _table & mask;

	if (type)
		__page_table_free_rcu(table, type);
	else
		free_pages((unsigned long) table, ALLOC_ORDER);
}

static void tlb_remove_table_smp_sync(void *arg)
{
	/* Simply deliver the interrupt */
}

static void tlb_remove_table_one(void *table)
{
	/*
	 * This isn't an RCU grace period and hence the page-tables cannot be
	 * assumed to be actually RCU-freed.
	 *
	 * It is however sufficient for software page-table walkers that rely
	 * on IRQ disabling. See the comment near struct mmu_table_batch.
	 */
	smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
	__tlb_remove_table(table);
}

static void tlb_remove_table_rcu(struct rcu_head *head)
{
	struct mmu_table_batch *batch;
	int i;

	batch = container_of(head, struct mmu_table_batch, rcu);

	for (i = 0; i < batch->nr; i++)
		__tlb_remove_table(batch->tables[i]);

	free_page((unsigned long)batch);
}

void tlb_table_flush(struct mmu_gather *tlb)
{
	struct mmu_table_batch **batch = &tlb->batch;

	if (*batch) {
		__tlb_flush_mm(tlb->mm);
		call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
		*batch = NULL;
	}
}

void tlb_remove_table(struct mmu_gather *tlb, void *table)
{
	struct mmu_table_batch **batch = &tlb->batch;

	if (*batch == NULL) {
		*batch = (struct mmu_table_batch *)
			__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
		if (*batch == NULL) {
			__tlb_flush_mm(tlb->mm);
			tlb_remove_table_one(table);
			return;
		}
		(*batch)->nr = 0;
	}
	(*batch)->tables[(*batch)->nr++] = table;
	if ((*batch)->nr == MAX_TABLE_BATCH)
		tlb_table_flush(tlb);
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
void thp_split_vma(struct vm_area_struct *vma)
{
	unsigned long addr;
	struct page *page;

	for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) {
		page = follow_page(vma, addr, FOLL_SPLIT);
	}
}

void thp_split_mm(struct mm_struct *mm)
{
	struct vm_area_struct *vma = mm->mmap;

	while (vma != NULL) {
		thp_split_vma(vma);
		vma->vm_flags &= ~VM_HUGEPAGE;
		vma->vm_flags |= VM_NOHUGEPAGE;
		vma = vma->vm_next;
	}
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

/*
 * switch on pgstes for the userspace process of the current task (for kvm)
 */
int s390_enable_sie(void)
{
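	/*
	 * The mm is duplicated with context.alloc_pgste set, so that
	 * dup_mm() builds all page tables with pgstes; if the task is
	 * still single threaded afterwards the new mm replaces the old
	 * one.
	 */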
	struct task_struct *tsk = current;
	struct mm_struct *mm, *old_mm;

	/* Is the switched amode active? If not, we cannot do sie */
	if (s390_user_mode == HOME_SPACE_MODE)
		return -EINVAL;

	/* Do we have pgstes? If yes, we are done */
	if (mm_has_pgste(tsk->mm))
		return 0;

	/* let's check if we are allowed to replace the mm */
	task_lock(tsk);
	if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
#ifdef CONFIG_AIO
	    !hlist_empty(&tsk->mm->ioctx_list) ||
#endif
	    tsk->mm != tsk->active_mm) {
		task_unlock(tsk);
		return -EINVAL;
	}
	task_unlock(tsk);

	/* we copy the mm and let dup_mm create the page tables with pgstes */
	tsk->mm->context.alloc_pgste = 1;
	/* make sure that both mms have a correct rss state */
	sync_mm_rss(tsk->mm);
	mm = dup_mm(tsk);
	tsk->mm->context.alloc_pgste = 0;
	if (!mm)
		return -ENOMEM;

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	/* split thp mappings and disable thp for future mappings */
	thp_split_mm(mm);
	mm->def_flags |= VM_NOHUGEPAGE;
#endif

	/* Now let's check again if something happened */
	task_lock(tsk);
	if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
#ifdef CONFIG_AIO
	    !hlist_empty(&tsk->mm->ioctx_list) ||
#endif
	    tsk->mm != tsk->active_mm) {
		mmput(mm);
		task_unlock(tsk);
		return -EINVAL;
	}

	/* ok, we are alone. No ptrace, no threads, etc. */
	old_mm = tsk->mm;
	tsk->mm = tsk->active_mm = mm;
	preempt_disable();
	update_mm(mm, tsk);
	atomic_inc(&mm->context.attach_count);
	atomic_dec(&old_mm->context.attach_count);
	cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm));
	preempt_enable();
	task_unlock(tsk);
	mmput(old_mm);
	return 0;
}
EXPORT_SYMBOL_GPL(s390_enable_sie);

#if defined(CONFIG_DEBUG_PAGEALLOC) && defined(CONFIG_HIBERNATION)
bool kernel_page_present(struct page *page)
{
	unsigned long addr;
	int cc;

	addr = page_to_phys(page);
	asm volatile(
		"	lra	%1,0(%1)\n"
		"	ipm	%0\n"
		"	srl	%0,28"
		: "=d" (cc), "+a" (addr) : : "cc");
	return cc == 0;
}
#endif /* CONFIG_HIBERNATION && CONFIG_DEBUG_PAGEALLOC */

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address,
			   pmd_t *pmdp)
{
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	/*
	 * No need to flush the TLB: on s390 the reference bits are kept
	 * in the storage key and never in the TLB.
	 */
	return pmdp_test_and_clear_young(vma, address, pmdp);
}

int pmdp_set_access_flags(struct vm_area_struct *vma,
			  unsigned long address, pmd_t *pmdp,
			  pmd_t entry, int dirty)
{
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);

	if (pmd_same(*pmdp, entry))
		return 0;
	pmdp_invalidate(vma, address, pmdp);
	set_pmd_at(vma->vm_mm, address, pmdp, entry);
	return 1;
}

static void pmdp_splitting_flush_sync(void *arg)
{
	/* Simply deliver the interrupt */
}

void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
			  pmd_t *pmdp)
{
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	if (!test_and_set_bit(_SEGMENT_ENTRY_SPLIT_BIT,
			      (unsigned long *) pmdp)) {
		/* need to serialize against gup-fast (IRQ disabled) */
		smp_call_function(pmdp_splitting_flush_sync, NULL, 1);
	}
}

void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable)
{
	struct list_head *lh = (struct list_head *) pgtable;

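	/*
	 * The first two words of the deposited page table are reused as
	 * a list_head to chain the deposits per mm (mm->pmd_huge_pte);
	 * pgtable_trans_huge_withdraw() below turns them back into empty
	 * pte entries before handing the table out again.
	 */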
	assert_spin_locked(&mm->page_table_lock);

	/* FIFO */
	if (!mm->pmd_huge_pte)
		INIT_LIST_HEAD(lh);
	else
		list_add(lh, (struct list_head *) mm->pmd_huge_pte);
	mm->pmd_huge_pte = pgtable;
}

pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm)
{
	struct list_head *lh;
	pgtable_t pgtable;
	pte_t *ptep;

	assert_spin_locked(&mm->page_table_lock);

	/* FIFO */
	pgtable = mm->pmd_huge_pte;
	lh = (struct list_head *) pgtable;
	if (list_empty(lh))
		mm->pmd_huge_pte = NULL;
	else {
		mm->pmd_huge_pte = (pgtable_t) lh->next;
		list_del(lh);
	}
	ptep = (pte_t *) pgtable;
	pte_val(*ptep) = _PAGE_TYPE_EMPTY;
	ptep++;
	pte_val(*ptep) = _PAGE_TYPE_EMPTY;
	return pgtable;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */