/*
 * Copyright IBM Corp. 2007,2011
 * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
 */

#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/quicklist.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>

#ifndef CONFIG_64BIT
#define ALLOC_ORDER	1
#define FRAG_MASK	0x0f
#else
#define ALLOC_ORDER	2
#define FRAG_MASK	0x03
#endif


unsigned long *crst_table_alloc(struct mm_struct *mm)
{
	struct page *page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);

	if (!page)
		return NULL;
	return (unsigned long *) page_to_phys(page);
}

void crst_table_free(struct mm_struct *mm, unsigned long *table)
{
	free_pages((unsigned long) table, ALLOC_ORDER);
}

#ifdef CONFIG_64BIT
int crst_table_upgrade(struct mm_struct *mm, unsigned long limit)
{
	unsigned long *table, *pgd;
	unsigned long entry;

	BUG_ON(limit > (1UL << 53));
repeat:
	table = crst_table_alloc(mm);
	if (!table)
		return -ENOMEM;
	spin_lock_bh(&mm->page_table_lock);
	if (mm->context.asce_limit < limit) {
		pgd = (unsigned long *) mm->pgd;
		if (mm->context.asce_limit <= (1UL << 31)) {
			entry = _REGION3_ENTRY_EMPTY;
			mm->context.asce_limit = 1UL << 42;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION3;
		} else {
			entry = _REGION2_ENTRY_EMPTY;
			mm->context.asce_limit = 1UL << 53;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION2;
		}
		crst_table_init(table, entry);
		pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd);
		mm->pgd = (pgd_t *) table;
		mm->task_size = mm->context.asce_limit;
		table = NULL;
	}
	spin_unlock_bh(&mm->page_table_lock);
	if (table)
		crst_table_free(mm, table);
	if (mm->context.asce_limit < limit)
		goto repeat;
	update_mm(mm, current);
	return 0;
}

void crst_table_downgrade(struct mm_struct *mm, unsigned long limit)
{
	pgd_t *pgd;

	if (mm->context.asce_limit <= limit)
		return;
	__tlb_flush_mm(mm);
	while (mm->context.asce_limit > limit) {
		pgd = mm->pgd;
		switch (pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) {
		case _REGION_ENTRY_TYPE_R2:
			mm->context.asce_limit = 1UL << 42;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION3;
			break;
		case _REGION_ENTRY_TYPE_R3:
			mm->context.asce_limit = 1UL << 31;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_SEGMENT;
			break;
		default:
			BUG();
		}
		mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
		mm->task_size = mm->context.asce_limit;
		crst_table_free(mm, (unsigned long *) pgd);
	}
	update_mm(mm, current);
}
#endif

#ifdef CONFIG_PGSTE

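/*
 * Guest mapping (gmap) support.
 *
 * A gmap is a guest address space layered on top of a parent mm_struct.
 * Segments of the parent address space are mapped into the guest address
 * space with gmap_map_segment(), guest addresses are translated back to
 * parent addresses with gmap_fault(), and the whole guest address space
 * is released again with gmap_free().  This requires page tables with
 * page table extensions (pgstes), see s390_enable_sie() below.
 */
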
/**
 * gmap_alloc - allocate a guest address space
 * @mm: pointer to the parent mm_struct
 *
 * Returns a guest address space structure.
 */
struct gmap *gmap_alloc(struct mm_struct *mm)
{
	struct gmap *gmap;
	struct page *page;
	unsigned long *table;

	gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL);
	if (!gmap)
		goto out;
	INIT_LIST_HEAD(&gmap->crst_list);
	gmap->mm = mm;
	page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
	if (!page)
		goto out_free;
	list_add(&page->lru, &gmap->crst_list);
	table = (unsigned long *) page_to_phys(page);
	crst_table_init(table, _REGION1_ENTRY_EMPTY);
	gmap->table = table;
	gmap->asce = _ASCE_TYPE_REGION1 | _ASCE_TABLE_LENGTH |
		     _ASCE_USER_BITS | __pa(table);
	list_add(&gmap->list, &mm->context.gmap_list);
	return gmap;

out_free:
	kfree(gmap);
out:
	return NULL;
}
EXPORT_SYMBOL_GPL(gmap_alloc);

static int gmap_unlink_segment(struct gmap *gmap, unsigned long *table)
{
	struct gmap_pgtable *mp;
	struct gmap_rmap *rmap;
	struct page *page;

	if (*table & _SEGMENT_ENTRY_INV)
		return 0;
	page = pfn_to_page(*table >> PAGE_SHIFT);
	mp = (struct gmap_pgtable *) page->index;
	list_for_each_entry(rmap, &mp->mapper, list) {
		if (rmap->entry != table)
			continue;
		list_del(&rmap->list);
		kfree(rmap);
		break;
	}
	*table = _SEGMENT_ENTRY_INV | _SEGMENT_ENTRY_RO | mp->vmaddr;
	return 1;
}

static void gmap_flush_tlb(struct gmap *gmap)
{
	if (MACHINE_HAS_IDTE)
		__tlb_flush_idte((unsigned long) gmap->table |
				 _ASCE_TYPE_REGION1);
	else
		__tlb_flush_global();
}

/**
 * gmap_free - free a guest address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_free(struct gmap *gmap)
{
	struct page *page, *next;
	unsigned long *table;
	int i;

	/* Flush tlb. */
	if (MACHINE_HAS_IDTE)
		__tlb_flush_idte((unsigned long) gmap->table |
				 _ASCE_TYPE_REGION1);
	else
		__tlb_flush_global();

	/* Free all segment & region tables. */
	down_read(&gmap->mm->mmap_sem);
	spin_lock(&gmap->mm->page_table_lock);
	list_for_each_entry_safe(page, next, &gmap->crst_list, lru) {
		table = (unsigned long *) page_to_phys(page);
		if ((*table & _REGION_ENTRY_TYPE_MASK) == 0)
			/* Remove gmap rmap structures for segment table. */
			for (i = 0; i < PTRS_PER_PMD; i++, table++)
				gmap_unlink_segment(gmap, table);
		__free_pages(page, ALLOC_ORDER);
	}
	spin_unlock(&gmap->mm->page_table_lock);
	up_read(&gmap->mm->mmap_sem);
	list_del(&gmap->list);
	kfree(gmap);
}
EXPORT_SYMBOL_GPL(gmap_free);

/**
 * gmap_enable - switch primary space to the guest address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_enable(struct gmap *gmap)
{
	S390_lowcore.gmap = (unsigned long) gmap;
}
EXPORT_SYMBOL_GPL(gmap_enable);

/**
 * gmap_disable - switch back to the standard primary address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_disable(struct gmap *gmap)
{
	S390_lowcore.gmap = 0UL;
}
EXPORT_SYMBOL_GPL(gmap_disable);

/*
 * gmap_alloc_table is assumed to be called with mmap_sem held
 */
static int gmap_alloc_table(struct gmap *gmap,
			    unsigned long *table, unsigned long init)
{
	struct page *page;
	unsigned long *new;

	/* since we don't free the gmap table until gmap_free we can unlock */
	spin_unlock(&gmap->mm->page_table_lock);
	page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
	spin_lock(&gmap->mm->page_table_lock);
	if (!page)
		return -ENOMEM;
	new = (unsigned long *) page_to_phys(page);
	crst_table_init(new, init);
	if (*table & _REGION_ENTRY_INV) {
		list_add(&page->lru, &gmap->crst_list);
		*table = (unsigned long) new | _REGION_ENTRY_LENGTH |
			(*table & _REGION_ENTRY_TYPE_MASK);
	} else
		__free_pages(page, ALLOC_ORDER);
	return 0;
}

/**
 * gmap_unmap_segment - unmap segment from the guest address space
 * @gmap: pointer to the guest address space structure
 * @to: address in the guest address space
 * @len: length of the memory area to unmap
 *
 * Returns 0 if the unmap succeeded, -EINVAL if not.
 */
int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len)
{
	unsigned long *table;
	unsigned long off;
	int flush;

	if ((to | len) & (PMD_SIZE - 1))
		return -EINVAL;
	if (len == 0 || to + len < to)
		return -EINVAL;

	flush = 0;
	down_read(&gmap->mm->mmap_sem);
	spin_lock(&gmap->mm->page_table_lock);
	for (off = 0; off < len; off += PMD_SIZE) {
		/* Walk the guest addr space page table */
		table = gmap->table + (((to + off) >> 53) & 0x7ff);
		if (*table & _REGION_ENTRY_INV)
			goto out;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 42) & 0x7ff);
		if (*table & _REGION_ENTRY_INV)
			goto out;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 31) & 0x7ff);
		if (*table & _REGION_ENTRY_INV)
			goto out;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 20) & 0x7ff);

		/* Clear segment table entry in guest address space. */
		flush |= gmap_unlink_segment(gmap, table);
		*table = _SEGMENT_ENTRY_INV;
	}
out:
	spin_unlock(&gmap->mm->page_table_lock);
	up_read(&gmap->mm->mmap_sem);
	if (flush)
		gmap_flush_tlb(gmap);
	return 0;
}
EXPORT_SYMBOL_GPL(gmap_unmap_segment);

/**
 * gmap_map_segment - map a segment to the guest address space
 * @gmap: pointer to the guest address space structure
 * @from: source address in the parent address space
 * @to: target address in the guest address space
 * @len: length of the memory area to map
 *
 * Returns 0 if the mmap succeeded, -EINVAL or -ENOMEM if not.
 */
int gmap_map_segment(struct gmap *gmap, unsigned long from,
		     unsigned long to, unsigned long len)
{
	unsigned long *table;
	unsigned long off;
	int flush;

	if ((from | to | len) & (PMD_SIZE - 1))
		return -EINVAL;
	if (len == 0 || from + len > PGDIR_SIZE ||
	    from + len < from || to + len < to)
		return -EINVAL;

	flush = 0;
	down_read(&gmap->mm->mmap_sem);
	spin_lock(&gmap->mm->page_table_lock);
	for (off = 0; off < len; off += PMD_SIZE) {
		/* Walk the gmap address space page table */
		table = gmap->table + (((to + off) >> 53) & 0x7ff);
		if ((*table & _REGION_ENTRY_INV) &&
		    gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY))
			goto out_unmap;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 42) & 0x7ff);
		if ((*table & _REGION_ENTRY_INV) &&
		    gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY))
			goto out_unmap;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 31) & 0x7ff);
		if ((*table & _REGION_ENTRY_INV) &&
		    gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY))
			goto out_unmap;
		table = (unsigned long *) (*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 20) & 0x7ff);

		/* Store 'from' address in an invalid segment table entry. */
		flush |= gmap_unlink_segment(gmap, table);
		*table = _SEGMENT_ENTRY_INV | _SEGMENT_ENTRY_RO | (from + off);
	}
	spin_unlock(&gmap->mm->page_table_lock);
	up_read(&gmap->mm->mmap_sem);
	if (flush)
		gmap_flush_tlb(gmap);
	return 0;

out_unmap:
	spin_unlock(&gmap->mm->page_table_lock);
	up_read(&gmap->mm->mmap_sem);
	gmap_unmap_segment(gmap, to, len);
	return -ENOMEM;
}
EXPORT_SYMBOL_GPL(gmap_map_segment);

/*
 * this function is assumed to be called with mmap_sem held
 */
unsigned long __gmap_fault(unsigned long address, struct gmap *gmap)
{
	unsigned long *table, vmaddr, segment;
	struct mm_struct *mm;
	struct gmap_pgtable *mp;
	struct gmap_rmap *rmap;
	struct vm_area_struct *vma;
	struct page *page;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	current->thread.gmap_addr = address;
	mm = gmap->mm;
	/* Walk the gmap address space page table */
	table = gmap->table + ((address >> 53) & 0x7ff);
	if (unlikely(*table & _REGION_ENTRY_INV))
		return -EFAULT;
	table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	table = table + ((address >> 42) & 0x7ff);
	if (unlikely(*table & _REGION_ENTRY_INV))
		return -EFAULT;
	table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	table = table + ((address >> 31) & 0x7ff);
	if (unlikely(*table & _REGION_ENTRY_INV))
		return -EFAULT;
	table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	table = table + ((address >> 20) & 0x7ff);

	/* Convert the gmap address to an mm address. */
	segment = *table;
	if (likely(!(segment & _SEGMENT_ENTRY_INV))) {
		page = pfn_to_page(segment >> PAGE_SHIFT);
		mp = (struct gmap_pgtable *) page->index;
		return mp->vmaddr | (address & ~PMD_MASK);
	} else if (segment & _SEGMENT_ENTRY_RO) {
		vmaddr = segment & _SEGMENT_ENTRY_ORIGIN;
		vma = find_vma(mm, vmaddr);
		if (!vma || vma->vm_start > vmaddr)
			return -EFAULT;

		/* Walk the parent mm page table */
		pgd = pgd_offset(mm, vmaddr);
		pud = pud_alloc(mm, pgd, vmaddr);
		if (!pud)
			return -ENOMEM;
		pmd = pmd_alloc(mm, pud, vmaddr);
		if (!pmd)
			return -ENOMEM;
		if (!pmd_present(*pmd) &&
		    __pte_alloc(mm, vma, pmd, vmaddr))
			return -ENOMEM;
		/* pmd now points to a valid segment table entry. */
		rmap = kmalloc(sizeof(*rmap), GFP_KERNEL|__GFP_REPEAT);
		if (!rmap)
			return -ENOMEM;
		/* Link gmap segment table entry location to page table. */
		page = pmd_page(*pmd);
		mp = (struct gmap_pgtable *) page->index;
		rmap->entry = table;
		spin_lock(&mm->page_table_lock);
		list_add(&rmap->list, &mp->mapper);
		spin_unlock(&mm->page_table_lock);
		/* Set gmap segment table entry to page table. */
		*table = pmd_val(*pmd) & PAGE_MASK;
		return vmaddr | (address & ~PMD_MASK);
	}
	return -EFAULT;
}

unsigned long gmap_fault(unsigned long address, struct gmap *gmap)
{
	unsigned long rc;

	down_read(&gmap->mm->mmap_sem);
	rc = __gmap_fault(address, gmap);
	up_read(&gmap->mm->mmap_sem);

	return rc;
}
EXPORT_SYMBOL_GPL(gmap_fault);
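/*
 * Illustrative sketch: a minimal, hypothetical caller of the gmap
 * interface above.  It wires one 1 MB segment of the parent address
 * space into a guest address space, resolves a guest address back to a
 * parent address and tears everything down again.  The function name
 * and its two address parameters are made-up examples; a real user such
 * as KVM derives them from the guest memory layout.  Both addresses
 * must be segment (1 MB) aligned.
 */
static inline int gmap_usage_sketch(struct mm_struct *mm,
				    unsigned long parent_addr,
				    unsigned long guest_addr)
{
	struct gmap *gmap;
	unsigned long vmaddr;
	int rc;

	gmap = gmap_alloc(mm);			/* new guest address space */
	if (!gmap)
		return -ENOMEM;
	rc = gmap_map_segment(gmap, parent_addr, guest_addr, PMD_SIZE);
	if (rc)
		goto out_free;
	gmap_enable(gmap);			/* make it the current guest space */
	vmaddr = gmap_fault(guest_addr, gmap);	/* guest -> parent address */
	if ((long) vmaddr < 0)			/* -EFAULT/-ENOMEM encoded in result */
		rc = (long) vmaddr;
	gmap_disable(gmap);
out_free:
	gmap_free(gmap);
	return rc;
}
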
void gmap_discard(unsigned long from, unsigned long to, struct gmap *gmap)
{
	unsigned long *table, address, size;
	struct vm_area_struct *vma;
	struct gmap_pgtable *mp;
	struct page *page;

	down_read(&gmap->mm->mmap_sem);
	address = from;
	while (address < to) {
		/* Walk the gmap address space page table */
		table = gmap->table + ((address >> 53) & 0x7ff);
		if (unlikely(*table & _REGION_ENTRY_INV)) {
			address = (address + PMD_SIZE) & PMD_MASK;
			continue;
		}
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + ((address >> 42) & 0x7ff);
		if (unlikely(*table & _REGION_ENTRY_INV)) {
			address = (address + PMD_SIZE) & PMD_MASK;
			continue;
		}
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + ((address >> 31) & 0x7ff);
		if (unlikely(*table & _REGION_ENTRY_INV)) {
			address = (address + PMD_SIZE) & PMD_MASK;
			continue;
		}
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + ((address >> 20) & 0x7ff);
		if (unlikely(*table & _SEGMENT_ENTRY_INV)) {
			address = (address + PMD_SIZE) & PMD_MASK;
			continue;
		}
		page = pfn_to_page(*table >> PAGE_SHIFT);
		mp = (struct gmap_pgtable *) page->index;
		vma = find_vma(gmap->mm, mp->vmaddr);
		size = min(to - address, PMD_SIZE - (address & ~PMD_MASK));
		zap_page_range(vma, mp->vmaddr | (address & ~PMD_MASK),
			       size, NULL);
		address = (address + PMD_SIZE) & PMD_MASK;
	}
	up_read(&gmap->mm->mmap_sem);
}
EXPORT_SYMBOL_GPL(gmap_discard);

void gmap_unmap_notifier(struct mm_struct *mm, unsigned long *table)
{
	struct gmap_rmap *rmap, *next;
	struct gmap_pgtable *mp;
	struct page *page;
	int flush;

	flush = 0;
	spin_lock(&mm->page_table_lock);
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	mp = (struct gmap_pgtable *) page->index;
	list_for_each_entry_safe(rmap, next, &mp->mapper, list) {
		*rmap->entry =
			_SEGMENT_ENTRY_INV | _SEGMENT_ENTRY_RO | mp->vmaddr;
		list_del(&rmap->list);
		kfree(rmap);
		flush = 1;
	}
	spin_unlock(&mm->page_table_lock);
	if (flush)
		__tlb_flush_global();
}

static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm,
						    unsigned long vmaddr)
{
	struct page *page;
	unsigned long *table;
	struct gmap_pgtable *mp;

	page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
	if (!page)
		return NULL;
	mp = kmalloc(sizeof(*mp), GFP_KERNEL|__GFP_REPEAT);
	if (!mp) {
		__free_page(page);
		return NULL;
	}
	pgtable_page_ctor(page);
	mp->vmaddr = vmaddr & PMD_MASK;
	INIT_LIST_HEAD(&mp->mapper);
	page->index = (unsigned long) mp;
	atomic_set(&page->_mapcount, 3);
	table = (unsigned long *) page_to_phys(page);
	clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE/2);
	clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2);
	return table;
}

static inline void page_table_free_pgste(unsigned long *table)
{
	struct page *page;
	struct gmap_pgtable *mp;

	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	mp = (struct gmap_pgtable *) page->index;
	BUG_ON(!list_empty(&mp->mapper));
	pgtable_page_dtor(page);
	atomic_set(&page->_mapcount, -1);
	kfree(mp);
	__free_page(page);
}

#else /* CONFIG_PGSTE */

static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm,
						    unsigned long vmaddr)
{
	return NULL;
}

static inline void page_table_free_pgste(unsigned long *table)
{
}

static inline void gmap_unmap_notifier(struct mm_struct *mm,
					unsigned long *table)
{
}

#endif /* CONFIG_PGSTE */

static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
{
	unsigned int old, new;

	do {
		old = atomic_read(v);
		new = old ^ bits;
	} while (atomic_cmpxchg(v, old, new) != old);
	return new;
}

/*
 * page table entry allocation/free routines.
 */
unsigned long *page_table_alloc(struct mm_struct *mm, unsigned long vmaddr)
{
	struct page *page;
	unsigned long *table;
	unsigned int mask, bit;

	if (mm_has_pgste(mm))
		return page_table_alloc_pgste(mm, vmaddr);
	/* Allocate fragments of a 4K page as 1K/2K page table */
	spin_lock_bh(&mm->context.list_lock);
	mask = FRAG_MASK;
	if (!list_empty(&mm->context.pgtable_list)) {
		page = list_first_entry(&mm->context.pgtable_list,
					struct page, lru);
		table = (unsigned long *) page_to_phys(page);
		mask = atomic_read(&page->_mapcount);
		mask = mask | (mask >> 4);
	}
	if ((mask & FRAG_MASK) == FRAG_MASK) {
		spin_unlock_bh(&mm->context.list_lock);
		page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
		if (!page)
			return NULL;
		pgtable_page_ctor(page);
		atomic_set(&page->_mapcount, 1);
		table = (unsigned long *) page_to_phys(page);
		clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE);
		spin_lock_bh(&mm->context.list_lock);
		list_add(&page->lru, &mm->context.pgtable_list);
	} else {
		for (bit = 1; mask & bit; bit <<= 1)
			table += PTRS_PER_PTE;
		mask = atomic_xor_bits(&page->_mapcount, bit);
		if ((mask & FRAG_MASK) == FRAG_MASK)
			list_del(&page->lru);
	}
	spin_unlock_bh(&mm->context.list_lock);
	return table;
}

void page_table_free(struct mm_struct *mm, unsigned long *table)
{
	struct page *page;
	unsigned int bit, mask;

	if (mm_has_pgste(mm)) {
		gmap_unmap_notifier(mm, table);
		return page_table_free_pgste(table);
	}
	/* Free 1K/2K page table fragment of a 4K page */
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	bit = 1 << ((__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)));
	spin_lock_bh(&mm->context.list_lock);
	if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
		list_del(&page->lru);
	mask = atomic_xor_bits(&page->_mapcount, bit);
	if (mask & FRAG_MASK)
		list_add(&page->lru, &mm->context.pgtable_list);
	spin_unlock_bh(&mm->context.list_lock);
	if (mask == 0) {
		pgtable_page_dtor(page);
		atomic_set(&page->_mapcount, -1);
		__free_page(page);
	}
}

static void __page_table_free_rcu(void *table, unsigned bit)
{
	struct page *page;

	if (bit == FRAG_MASK)
		return page_table_free_pgste(table);
	/* Free 1K/2K page table fragment of a 4K page */
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	if (atomic_xor_bits(&page->_mapcount, bit) == 0) {
		pgtable_page_dtor(page);
		atomic_set(&page->_mapcount, -1);
		__free_page(page);
	}
}

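/*
 * page_table_free_rcu() and __tlb_remove_table() communicate through the
 * low bits of the table pointer handed to tlb_remove_table(): 0 marks a
 * full crst table page, FRAG_MASK marks a pgste page table, and any other
 * value is the shifted fragment bit of a 1K/2K page table fragment that
 * still has to be released after the RCU grace period.
 */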
void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table)
{
	struct mm_struct *mm;
	struct page *page;
	unsigned int bit, mask;

	mm = tlb->mm;
	if (mm_has_pgste(mm)) {
		gmap_unmap_notifier(mm, table);
		table = (unsigned long *) (__pa(table) | FRAG_MASK);
		tlb_remove_table(tlb, table);
		return;
	}
	bit = 1 << ((__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)));
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	spin_lock_bh(&mm->context.list_lock);
	if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
		list_del(&page->lru);
	mask = atomic_xor_bits(&page->_mapcount, bit | (bit << 4));
	if (mask & FRAG_MASK)
		list_add_tail(&page->lru, &mm->context.pgtable_list);
	spin_unlock_bh(&mm->context.list_lock);
	table = (unsigned long *) (__pa(table) | (bit << 4));
	tlb_remove_table(tlb, table);
}

void __tlb_remove_table(void *_table)
{
	const unsigned long mask = (FRAG_MASK << 4) | FRAG_MASK;
	void *table = (void *)((unsigned long) _table & ~mask);
	unsigned type = (unsigned long) _table & mask;

	if (type)
		__page_table_free_rcu(table, type);
	else
		free_pages((unsigned long) table, ALLOC_ORDER);
}

static void tlb_remove_table_smp_sync(void *arg)
{
	/* Simply deliver the interrupt */
}

static void tlb_remove_table_one(void *table)
{
	/*
	 * This isn't an RCU grace period and hence the page-tables cannot be
	 * assumed to be actually RCU-freed.
	 *
	 * It is however sufficient for software page-table walkers that rely
	 * on IRQ disabling. See the comment near struct mmu_table_batch.
	 */
	smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
	__tlb_remove_table(table);
}

static void tlb_remove_table_rcu(struct rcu_head *head)
{
	struct mmu_table_batch *batch;
	int i;

	batch = container_of(head, struct mmu_table_batch, rcu);

	for (i = 0; i < batch->nr; i++)
		__tlb_remove_table(batch->tables[i]);

	free_page((unsigned long)batch);
}

void tlb_table_flush(struct mmu_gather *tlb)
{
	struct mmu_table_batch **batch = &tlb->batch;

	if (*batch) {
		__tlb_flush_mm(tlb->mm);
		call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
		*batch = NULL;
	}
}

void tlb_remove_table(struct mmu_gather *tlb, void *table)
{
	struct mmu_table_batch **batch = &tlb->batch;

	if (*batch == NULL) {
		*batch = (struct mmu_table_batch *)
			__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
		if (*batch == NULL) {
			__tlb_flush_mm(tlb->mm);
			tlb_remove_table_one(table);
			return;
		}
		(*batch)->nr = 0;
	}
	(*batch)->tables[(*batch)->nr++] = table;
	if ((*batch)->nr == MAX_TABLE_BATCH)
		tlb_table_flush(tlb);
}

/*
 * switch on pgstes for the current userspace process (needed for kvm)
 */
int s390_enable_sie(void)
{
	struct task_struct *tsk = current;
	struct mm_struct *mm, *old_mm;

	/* Do we have a switched amode? If not, we cannot do sie */
	if (user_mode == HOME_SPACE_MODE)
		return -EINVAL;

	/* Do we have pgstes? If yes, we are done */
	if (mm_has_pgste(tsk->mm))
		return 0;

	/* let's check if we are allowed to replace the mm */
	task_lock(tsk);
	if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
#ifdef CONFIG_AIO
	    !hlist_empty(&tsk->mm->ioctx_list) ||
#endif
	    tsk->mm != tsk->active_mm) {
		task_unlock(tsk);
		return -EINVAL;
	}
	task_unlock(tsk);

	/* we copy the mm and let dup_mm create the page tables with pgstes */
	tsk->mm->context.alloc_pgste = 1;
	/* make sure that both mms have a correct rss state */
	sync_mm_rss(tsk->mm);
	mm = dup_mm(tsk);
	tsk->mm->context.alloc_pgste = 0;
	if (!mm)
		return -ENOMEM;

	/* Now let's check again if something happened */
	task_lock(tsk);
	if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
#ifdef CONFIG_AIO
	    !hlist_empty(&tsk->mm->ioctx_list) ||
#endif
	    tsk->mm != tsk->active_mm) {
		mmput(mm);
		task_unlock(tsk);
		return -EINVAL;
	}

	/* ok, we are alone. No ptrace, no threads, etc. */
	old_mm = tsk->mm;
	tsk->mm = tsk->active_mm = mm;
	preempt_disable();
	update_mm(mm, tsk);
	atomic_inc(&mm->context.attach_count);
	atomic_dec(&old_mm->context.attach_count);
	cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm));
	preempt_enable();
	task_unlock(tsk);
	mmput(old_mm);
	return 0;
}
EXPORT_SYMBOL_GPL(s390_enable_sie);

#if defined(CONFIG_DEBUG_PAGEALLOC) && defined(CONFIG_HIBERNATION)
bool kernel_page_present(struct page *page)
{
	unsigned long addr;
	int cc;

	addr = page_to_phys(page);
	asm volatile(
		"	lra	%1,0(%1)\n"
		"	ipm	%0\n"
		"	srl	%0,28"
		: "=d" (cc), "+a" (addr) : : "cc");
	return cc == 0;
}
#endif /* CONFIG_HIBERNATION && CONFIG_DEBUG_PAGEALLOC */