/*
 *  Copyright IBM Corp. 2007,2009
 *  Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
 */

#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/quicklist.h>
#include <linux/rcupdate.h>

#include <asm/system.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>

struct rcu_table_freelist {
	struct rcu_head rcu;
	struct mm_struct *mm;
	unsigned int pgt_index;
	unsigned int crst_index;
	unsigned long *table[0];
};

#define RCU_FREELIST_SIZE \
	((PAGE_SIZE - sizeof(struct rcu_table_freelist)) \
	  / sizeof(unsigned long))

DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
static DEFINE_PER_CPU(struct rcu_table_freelist *, rcu_table_freelist);

static void __page_table_free(struct mm_struct *mm, unsigned long *table);
static void __crst_table_free(struct mm_struct *mm, unsigned long *table);

static struct rcu_table_freelist *rcu_table_freelist_get(struct mm_struct *mm)
{
	struct rcu_table_freelist **batchp = &__get_cpu_var(rcu_table_freelist);
	struct rcu_table_freelist *batch = *batchp;

	if (batch)
		return batch;
	batch = (struct rcu_table_freelist *) __get_free_page(GFP_ATOMIC);
	if (batch) {
		batch->mm = mm;
		batch->pgt_index = 0;
		batch->crst_index = RCU_FREELIST_SIZE;
		*batchp = batch;
	}
	return batch;
}

static void rcu_table_freelist_callback(struct rcu_head *head)
{
	struct rcu_table_freelist *batch =
		container_of(head, struct rcu_table_freelist, rcu);

	while (batch->pgt_index > 0)
		__page_table_free(batch->mm, batch->table[--batch->pgt_index]);
	while (batch->crst_index < RCU_FREELIST_SIZE)
		__crst_table_free(batch->mm, batch->table[batch->crst_index++]);
	free_page((unsigned long) batch);
}

void rcu_table_freelist_finish(void)
{
	struct rcu_table_freelist *batch = __get_cpu_var(rcu_table_freelist);

	if (!batch)
		return;
	call_rcu(&batch->rcu, rcu_table_freelist_callback);
	__get_cpu_var(rcu_table_freelist) = NULL;
}

static void smp_sync(void *arg)
{
}

#ifndef CONFIG_64BIT
#define ALLOC_ORDER	1
#define TABLES_PER_PAGE	4
#define FRAG_MASK	15UL
#define SECOND_HALVES	10UL

void clear_table_pgstes(unsigned long *table)
{
	clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE/4);
	memset(table + 256, 0, PAGE_SIZE/4);
	clear_table(table + 512, _PAGE_TYPE_EMPTY, PAGE_SIZE/4);
	memset(table + 768, 0, PAGE_SIZE/4);
}

#else
#define ALLOC_ORDER	2
#define TABLES_PER_PAGE	2
#define FRAG_MASK	3UL
#define SECOND_HALVES	2UL

void clear_table_pgstes(unsigned long *table)
{
	clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE/2);
	memset(table + 256, 0, PAGE_SIZE/2);
}

#endif

unsigned long VMALLOC_START = VMALLOC_END - VMALLOC_SIZE;
EXPORT_SYMBOL(VMALLOC_START);

static int __init parse_vmalloc(char *arg)
{
	if (!arg)
		return -EINVAL;
	VMALLOC_START = (VMALLOC_END - memparse(arg, &arg)) & PAGE_MASK;
	return 0;
}
early_param("vmalloc", parse_vmalloc);

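/*
 * Allocate a region/segment (crst) table. A crst table spans
 * 1 << ALLOC_ORDER pages; with noexec enabled a shadow table is
 * allocated as well and its physical address is kept in page->index.
 * All crst pages of an mm are linked on mm->context.crst_list so that
 * disable_noexec() can find and release the shadow tables later.
 */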
unsigned long *crst_table_alloc(struct mm_struct *mm, int noexec)
{
	struct page *page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);

	if (!page)
		return NULL;
	page->index = 0;
	if (noexec) {
		struct page *shadow = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
		if (!shadow) {
			__free_pages(page, ALLOC_ORDER);
			return NULL;
		}
		page->index = page_to_phys(shadow);
	}
	spin_lock_bh(&mm->context.list_lock);
	list_add(&page->lru, &mm->context.crst_list);
	spin_unlock_bh(&mm->context.list_lock);
	return (unsigned long *) page_to_phys(page);
}

static void __crst_table_free(struct mm_struct *mm, unsigned long *table)
{
	unsigned long *shadow = get_shadow_table(table);

	if (shadow)
		free_pages((unsigned long) shadow, ALLOC_ORDER);
	free_pages((unsigned long) table, ALLOC_ORDER);
}

void crst_table_free(struct mm_struct *mm, unsigned long *table)
{
	struct page *page = virt_to_page(table);

	spin_lock_bh(&mm->context.list_lock);
	list_del(&page->lru);
	spin_unlock_bh(&mm->context.list_lock);
	__crst_table_free(mm, table);
}

void crst_table_free_rcu(struct mm_struct *mm, unsigned long *table)
{
	struct rcu_table_freelist *batch;
	struct page *page = virt_to_page(table);

	spin_lock_bh(&mm->context.list_lock);
	list_del(&page->lru);
	spin_unlock_bh(&mm->context.list_lock);
	if (atomic_read(&mm->mm_users) < 2 &&
	    cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) {
		__crst_table_free(mm, table);
		return;
	}
	batch = rcu_table_freelist_get(mm);
	if (!batch) {
		smp_call_function(smp_sync, NULL, 1);
		__crst_table_free(mm, table);
		return;
	}
	batch->table[--batch->crst_index] = table;
	if (batch->pgt_index >= batch->crst_index)
		rcu_table_freelist_finish();
}

#ifdef CONFIG_64BIT
int crst_table_upgrade(struct mm_struct *mm, unsigned long limit)
{
	unsigned long *table, *pgd;
	unsigned long entry;

	BUG_ON(limit > (1UL << 53));
repeat:
	table = crst_table_alloc(mm, mm->context.noexec);
	if (!table)
		return -ENOMEM;
	spin_lock_bh(&mm->page_table_lock);
	if (mm->context.asce_limit < limit) {
		pgd = (unsigned long *) mm->pgd;
		if (mm->context.asce_limit <= (1UL << 31)) {
			entry = _REGION3_ENTRY_EMPTY;
			mm->context.asce_limit = 1UL << 42;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION3;
		} else {
			entry = _REGION2_ENTRY_EMPTY;
			mm->context.asce_limit = 1UL << 53;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION2;
		}
		crst_table_init(table, entry);
		pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd);
		mm->pgd = (pgd_t *) table;
		mm->task_size = mm->context.asce_limit;
		table = NULL;
	}
	spin_unlock_bh(&mm->page_table_lock);
	if (table)
		crst_table_free(mm, table);
	if (mm->context.asce_limit < limit)
		goto repeat;
	update_mm(mm, current);
	return 0;
}

void crst_table_downgrade(struct mm_struct *mm, unsigned long limit)
{
	pgd_t *pgd;

	if (mm->context.asce_limit <= limit)
		return;
	__tlb_flush_mm(mm);
	while (mm->context.asce_limit > limit) {
		pgd = mm->pgd;
		switch (pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) {
		case _REGION_ENTRY_TYPE_R2:
			mm->context.asce_limit = 1UL << 42;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION3;
			break;
		case _REGION_ENTRY_TYPE_R3:
			mm->context.asce_limit = 1UL << 31;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_SEGMENT;
			break;
		default:
			BUG();
		}
		mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
		mm->task_size = mm->context.asce_limit;
		crst_table_free(mm, (unsigned long *) pgd);
	}
	update_mm(mm, current);
}
#endif

/*
 * page table entry allocation/free routines.
 */
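/*
 * A 4K page holds TABLES_PER_PAGE page tables (four 1K tables on
 * 31 bit, two 2K tables on 64 bit). The low page->flags bits form a
 * bitmap (FRAG_MASK) of the fragments currently in use. If the mm
 * uses pgstes or shadow tables (noexec), each page table is followed
 * by a second half in the next fragment, so an allocation claims two
 * adjacent bits. Pages with free fragments stay at the front of
 * mm->context.pgtable_list; completely used pages are moved to the
 * tail.
 */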
unsigned long *page_table_alloc(struct mm_struct *mm)
{
	struct page *page;
	unsigned long *table;
	unsigned long bits;

	bits = (mm->context.noexec || mm->context.has_pgste) ? 3UL : 1UL;
	spin_lock_bh(&mm->context.list_lock);
	page = NULL;
	if (!list_empty(&mm->context.pgtable_list)) {
		page = list_first_entry(&mm->context.pgtable_list,
					struct page, lru);
		if ((page->flags & FRAG_MASK) == ((1UL << TABLES_PER_PAGE) - 1))
			page = NULL;
	}
	if (!page) {
		spin_unlock_bh(&mm->context.list_lock);
		page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
		if (!page)
			return NULL;
		pgtable_page_ctor(page);
		page->flags &= ~FRAG_MASK;
		table = (unsigned long *) page_to_phys(page);
		if (mm->context.has_pgste)
			clear_table_pgstes(table);
		else
			clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE);
		spin_lock_bh(&mm->context.list_lock);
		list_add(&page->lru, &mm->context.pgtable_list);
	}
	table = (unsigned long *) page_to_phys(page);
	while (page->flags & bits) {
		table += 256;
		bits <<= 1;
	}
	page->flags |= bits;
	if ((page->flags & FRAG_MASK) == ((1UL << TABLES_PER_PAGE) - 1))
		list_move_tail(&page->lru, &mm->context.pgtable_list);
	spin_unlock_bh(&mm->context.list_lock);
	return table;
}

static void __page_table_free(struct mm_struct *mm, unsigned long *table)
{
	struct page *page;
	unsigned long bits;

	bits = ((unsigned long) table) & 15;
	table = (unsigned long *)(((unsigned long) table) ^ bits);
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	page->flags ^= bits;
	if (!(page->flags & FRAG_MASK)) {
		pgtable_page_dtor(page);
		__free_page(page);
	}
}

void page_table_free(struct mm_struct *mm, unsigned long *table)
{
	struct page *page;
	unsigned long bits;

	bits = (mm->context.noexec || mm->context.has_pgste) ? 3UL : 1UL;
	bits <<= (__pa(table) & (PAGE_SIZE - 1)) / 256 / sizeof(unsigned long);
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	spin_lock_bh(&mm->context.list_lock);
	page->flags ^= bits;
	if (page->flags & FRAG_MASK) {
		/* Page now has some free pgtable fragments. */
		list_move(&page->lru, &mm->context.pgtable_list);
		page = NULL;
	} else
		/* All fragments of the 4K page have been freed. */
		list_del(&page->lru);
	spin_unlock_bh(&mm->context.list_lock);
	if (page) {
		pgtable_page_dtor(page);
		__free_page(page);
	}
}

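/*
 * Free a page table that may still be seen by a concurrent, lockless
 * page table walk on another CPU. The table is queued on the per-cpu
 * rcu_table_freelist batch, which is handed to call_rcu() once it
 * fills up or rcu_table_freelist_finish() is called. If the mm is
 * single threaded and attached only to this CPU, or if no batch page
 * can be allocated, the table is freed directly (in the latter case
 * after an smp_call_function() round trip that synchronizes with
 * walkers on other CPUs).
 */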
void page_table_free_rcu(struct mm_struct *mm, unsigned long *table)
{
	struct rcu_table_freelist *batch;
	struct page *page;
	unsigned long bits;

	if (atomic_read(&mm->mm_users) < 2 &&
	    cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) {
		page_table_free(mm, table);
		return;
	}
	batch = rcu_table_freelist_get(mm);
	if (!batch) {
		smp_call_function(smp_sync, NULL, 1);
		page_table_free(mm, table);
		return;
	}
	bits = (mm->context.noexec || mm->context.has_pgste) ? 3UL : 1UL;
	bits <<= (__pa(table) & (PAGE_SIZE - 1)) / 256 / sizeof(unsigned long);
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	spin_lock_bh(&mm->context.list_lock);
	/* Delayed freeing with rcu prevents reuse of pgtable fragments */
	list_del_init(&page->lru);
	spin_unlock_bh(&mm->context.list_lock);
	table = (unsigned long *)(((unsigned long) table) | bits);
	batch->table[batch->pgt_index++] = table;
	if (batch->pgt_index >= batch->crst_index)
		rcu_table_freelist_finish();
}

void disable_noexec(struct mm_struct *mm, struct task_struct *tsk)
{
	struct page *page;

	spin_lock_bh(&mm->context.list_lock);
	/* Free shadow region and segment tables. */
	list_for_each_entry(page, &mm->context.crst_list, lru)
		if (page->index) {
			free_pages((unsigned long) page->index, ALLOC_ORDER);
			page->index = 0;
		}
	/* "Free" second halves of page tables. */
	list_for_each_entry(page, &mm->context.pgtable_list, lru)
		page->flags &= ~SECOND_HALVES;
	spin_unlock_bh(&mm->context.list_lock);
	mm->context.noexec = 0;
	update_mm(mm, tsk);
}

/*
 * switch on pgstes for its userspace process (for kvm)
 */
int s390_enable_sie(void)
{
	struct task_struct *tsk = current;
	struct mm_struct *mm, *old_mm;

	/* Do we have switched amode? If no, we cannot do sie */
	if (user_mode == HOME_SPACE_MODE)
		return -EINVAL;

	/* Do we have pgstes? if yes, we are done */
	if (tsk->mm->context.has_pgste)
		return 0;

	/* lets check if we are allowed to replace the mm */
	task_lock(tsk);
	if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
#ifdef CONFIG_AIO
	    !hlist_empty(&tsk->mm->ioctx_list) ||
#endif
	    tsk->mm != tsk->active_mm) {
		task_unlock(tsk);
		return -EINVAL;
	}
	task_unlock(tsk);

	/* we copy the mm and let dup_mm create the page tables with_pgstes */
	tsk->mm->context.alloc_pgste = 1;
	mm = dup_mm(tsk);
	tsk->mm->context.alloc_pgste = 0;
	if (!mm)
		return -ENOMEM;

	/* Now lets check again if something happened */
	task_lock(tsk);
	if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
#ifdef CONFIG_AIO
	    !hlist_empty(&tsk->mm->ioctx_list) ||
#endif
	    tsk->mm != tsk->active_mm) {
		mmput(mm);
		task_unlock(tsk);
		return -EINVAL;
	}

	/* ok, we are alone. No ptrace, no threads, etc. */
	old_mm = tsk->mm;
	tsk->mm = tsk->active_mm = mm;
	preempt_disable();
	update_mm(mm, tsk);
	atomic_inc(&mm->context.attach_count);
	atomic_dec(&old_mm->context.attach_count);
	cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm));
	preempt_enable();
	task_unlock(tsk);
	mmput(old_mm);
	return 0;
}
EXPORT_SYMBOL_GPL(s390_enable_sie);

#if defined(CONFIG_DEBUG_PAGEALLOC) && defined(CONFIG_HIBERNATION)
bool kernel_page_present(struct page *page)
{
	unsigned long addr;
	int cc;

	addr = page_to_phys(page);
	asm volatile(
		"	lra	%1,0(%1)\n"
		"	ipm	%0\n"
		"	srl	%0,28"
		: "=d" (cc), "+a" (addr) : : "cc");
	return cc == 0;
}
#endif /* CONFIG_HIBERNATION && CONFIG_DEBUG_PAGEALLOC */