/*
 *  arch/s390/mm/fault.c
 *
 *  S390 version
 *    Copyright (C) 1999 IBM Deutschland Entwicklung GmbH, IBM Corporation
 *    Author(s): Hartmut Penner (hp@de.ibm.com)
 *               Ulrich Weigand (uweigand@de.ibm.com)
 *
 *  Derived from "arch/i386/mm/fault.c"
 *    Copyright (C) 1995  Linus Torvalds
 */

#include <linux/kernel_stat.h>
#include <linux/perf_event.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/compat.h>
#include <linux/smp.h>
#include <linux/kdebug.h>
#include <linux/init.h>
#include <linux/console.h>
#include <linux/module.h>
#include <linux/hardirq.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/hugetlb.h>
#include <asm/asm-offsets.h>
#include <asm/pgtable.h>
#include <asm/irq.h>
#include <asm/mmu_context.h>
#include <asm/facility.h>
#include "../kernel/entry.h"

#ifndef CONFIG_64BIT
#define __FAIL_ADDR_MASK	0x7ffff000
#define __SUBCODE_MASK		0x0200
#define __PF_RES_FIELD		0ULL
#else /* CONFIG_64BIT */
#define __FAIL_ADDR_MASK	-4096L
#define __SUBCODE_MASK		0x0600
#define __PF_RES_FIELD		0x8000000000000000ULL
#endif /* CONFIG_64BIT */

#define VM_FAULT_BADCONTEXT	0x010000
#define VM_FAULT_BADMAP		0x020000
#define VM_FAULT_BADACCESS	0x040000

static unsigned long store_indication;

void fault_init(void)
{
	if (test_facility(2) && test_facility(75))
		store_indication = 0xc00;
}

static inline int notify_page_fault(struct pt_regs *regs)
{
	int ret = 0;

	/* kprobe_running() needs smp_processor_id() */
	if (kprobes_built_in() && !user_mode(regs)) {
		preempt_disable();
		if (kprobe_running() && kprobe_fault_handler(regs, 14))
			ret = 1;
		preempt_enable();
	}
	return ret;
}


/*
 * Unlock any spinlocks which will prevent us from getting the
 * message out.
 */
void bust_spinlocks(int yes)
{
	if (yes) {
		oops_in_progress = 1;
	} else {
		int loglevel_save = console_loglevel;
		console_unblank();
		oops_in_progress = 0;
		/*
		 * OK, the message is on the console.  Now we call printk()
		 * without oops_in_progress set so that printk will give klogd
		 * a poke.  Hold onto your hats...
		 */
		console_loglevel = 15;
		printk(" ");
		console_loglevel = loglevel_save;
	}
}

/*
 * Returns the address space associated with the fault.
 * Returns 0 for kernel space and 1 for user space.
 */
static inline int user_space_fault(unsigned long trans_exc_code)
{
	/*
	 * The lowest two bits of the translation exception
	 * identification indicate which paging table was used.
	 */
	trans_exc_code &= 3;
	if (trans_exc_code == 2)
		/* Access via secondary space, set_fs setting decides */
		return current->thread.mm_segment.ar4;
	if (user_mode == HOME_SPACE_MODE)
		/* User space if the access has been done via home space. */
		return trans_exc_code == 3;
	/*
	 * If the user space is not the home space the kernel runs in home
	 * space. Access via secondary space has already been covered,
	 * access via primary space or access register is from user space
	 * and access via home space is from the kernel.
	 */
	return trans_exc_code != 3;
}
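
/*
 * For reference: the two low-order bits of the translation exception
 * identification tested above encode the address space of the faulting
 * access - 0 primary space, 1 access-register specified, 2 secondary
 * space, 3 home space.  user_space_fault() maps this onto "kernel" vs.
 * "user" depending on which address space mode user processes run in.
 */
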
static inline void report_user_fault(struct pt_regs *regs, long signr)
{
	if ((task_pid_nr(current) > 1) && !show_unhandled_signals)
		return;
	if (!unhandled_signal(current, signr))
		return;
	if (!printk_ratelimit())
		return;
	printk(KERN_ALERT "User process fault: interruption code 0x%X ",
	       regs->int_code);
	print_vma_addr(KERN_CONT "in ", regs->psw.addr & PSW_ADDR_INSN);
	printk(KERN_CONT "\n");
	printk(KERN_ALERT "failing address: %lX\n",
	       regs->int_parm_long & __FAIL_ADDR_MASK);
	show_regs(regs);
}

/*
 * Send SIGSEGV to task.  This is an external routine
 * to keep the stack usage of do_page_fault small.
 */
static noinline void do_sigsegv(struct pt_regs *regs, int si_code)
{
	struct siginfo si;

	report_user_fault(regs, SIGSEGV);
	si.si_signo = SIGSEGV;
	si.si_code = si_code;
	si.si_addr = (void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK);
	force_sig_info(SIGSEGV, &si, current);
}

static noinline void do_no_context(struct pt_regs *regs)
{
	const struct exception_table_entry *fixup;
	unsigned long address;

	/* Are we prepared to handle this kernel fault? */
	fixup = search_exception_tables(regs->psw.addr & PSW_ADDR_INSN);
	if (fixup) {
		regs->psw.addr = fixup->fixup | PSW_ADDR_AMODE;
		return;
	}

	/*
	 * Oops. The kernel tried to access some bad page. We'll have to
	 * terminate things with extreme prejudice.
	 */
	address = regs->int_parm_long & __FAIL_ADDR_MASK;
	if (!user_space_fault(regs->int_parm_long))
		printk(KERN_ALERT "Unable to handle kernel pointer dereference"
		       " at virtual kernel address %p\n", (void *)address);
	else
		printk(KERN_ALERT "Unable to handle kernel paging request"
		       " at virtual user address %p\n", (void *)address);

	die(regs, "Oops");
	do_exit(SIGKILL);
}

static noinline void do_low_address(struct pt_regs *regs)
{
	/* Low-address protection hit in kernel mode means
	   NULL pointer write access in kernel mode.  */
	if (regs->psw.mask & PSW_MASK_PSTATE) {
		/* Low-address protection hit in user mode 'cannot happen'. */
		die(regs, "Low-address protection");
		do_exit(SIGKILL);
	}

	do_no_context(regs);
}

static noinline void do_sigbus(struct pt_regs *regs)
{
	struct task_struct *tsk = current;
	struct siginfo si;

	/*
	 * Send a sigbus, regardless of whether we were in kernel
	 * or user mode.
	 */
	si.si_signo = SIGBUS;
	si.si_errno = 0;
	si.si_code = BUS_ADRERR;
	si.si_addr = (void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK);
	force_sig_info(SIGBUS, &si, tsk);
}

static noinline void do_fault_error(struct pt_regs *regs, int fault)
{
	int si_code;

	switch (fault) {
	case VM_FAULT_BADACCESS:
	case VM_FAULT_BADMAP:
		/* Bad memory access. Check if it is kernel or user space. */
		if (regs->psw.mask & PSW_MASK_PSTATE) {
			/* User mode accesses just cause a SIGSEGV */
			si_code = (fault == VM_FAULT_BADMAP) ?
				SEGV_MAPERR : SEGV_ACCERR;
			do_sigsegv(regs, si_code);
			return;
		}
		/* fallthrough: kernel mode is handled like a bad context */
	case VM_FAULT_BADCONTEXT:
		do_no_context(regs);
		break;
	default: /* fault & VM_FAULT_ERROR */
		if (fault & VM_FAULT_OOM) {
			if (!(regs->psw.mask & PSW_MASK_PSTATE))
				do_no_context(regs);
			else
				pagefault_out_of_memory();
		} else if (fault & VM_FAULT_SIGBUS) {
			/* Kernel mode? Handle exceptions or die */
			if (!(regs->psw.mask & PSW_MASK_PSTATE))
				do_no_context(regs);
			else
				do_sigbus(regs);
		} else
			BUG();
		break;
	}
}
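
/*
 * Summary of how do_fault_error() disposes of the fault codes returned
 * by do_exception():
 *   VM_FAULT_BADMAP/BADACCESS - SIGSEGV (MAPERR/ACCERR) in user mode,
 *                               otherwise treated like a bad context
 *   VM_FAULT_BADCONTEXT       - exception table fixup or kernel oops
 *   VM_FAULT_OOM              - fixup/oops in kernel mode, otherwise the
 *                               generic out-of-memory path
 *   VM_FAULT_SIGBUS           - fixup/oops in kernel mode, otherwise SIGBUS
 */
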
/*
 * This routine handles page faults.  It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 *
 * interruption code (int_code):
 *   04       Protection           ->  Write-Protection  (suppression)
 *   10       Segment translation  ->  Not present       (nullification)
 *   11       Page translation     ->  Not present       (nullification)
 *   3b       Region third trans.  ->  Not present       (nullification)
 */
static inline int do_exception(struct pt_regs *regs, int access)
{
	struct task_struct *tsk;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	unsigned long trans_exc_code;
	unsigned long address;
	unsigned int flags;
	int fault;

	if (notify_page_fault(regs))
		return 0;

	tsk = current;
	mm = tsk->mm;
	trans_exc_code = regs->int_parm_long;

	/*
	 * Verify that the fault happened in user space, that
	 * we are not in an interrupt and that there is a
	 * user context.
	 */
	fault = VM_FAULT_BADCONTEXT;
	if (unlikely(!user_space_fault(trans_exc_code) || in_atomic() || !mm))
		goto out;

	address = trans_exc_code & __FAIL_ADDR_MASK;
	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
	flags = FAULT_FLAG_ALLOW_RETRY;
	if (access == VM_WRITE || (trans_exc_code & store_indication) == 0x400)
		flags |= FAULT_FLAG_WRITE;
	down_read(&mm->mmap_sem);

#ifdef CONFIG_PGSTE
	if (test_tsk_thread_flag(current, TIF_SIE) && S390_lowcore.gmap) {
		address = __gmap_fault(address,
				       (struct gmap *) S390_lowcore.gmap);
		if (address == -EFAULT) {
			fault = VM_FAULT_BADMAP;
			goto out_up;
		}
		if (address == -ENOMEM) {
			fault = VM_FAULT_OOM;
			goto out_up;
		}
	}
#endif

retry:
	fault = VM_FAULT_BADMAP;
	vma = find_vma(mm, address);
	if (!vma)
		goto out_up;

	if (unlikely(vma->vm_start > address)) {
		if (!(vma->vm_flags & VM_GROWSDOWN))
			goto out_up;
		if (expand_stack(vma, address))
			goto out_up;
	}

	/*
	 * Ok, we have a good vm_area for this memory access, so
	 * we can handle it..
	 */
	fault = VM_FAULT_BADACCESS;
	if (unlikely(!(vma->vm_flags & access)))
		goto out_up;

	if (is_vm_hugetlb_page(vma))
		address &= HPAGE_MASK;
	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.
	 */
	fault = handle_mm_fault(mm, vma, address, flags);
	if (unlikely(fault & VM_FAULT_ERROR))
		goto out_up;

	/*
	 * Major/minor page fault accounting is only done on the
	 * initial attempt. If we go through a retry, it is extremely
	 * likely that the page will be found in page cache at that point.
	 */
	if (flags & FAULT_FLAG_ALLOW_RETRY) {
		if (fault & VM_FAULT_MAJOR) {
			tsk->maj_flt++;
			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1,
				      regs, address);
		} else {
			tsk->min_flt++;
			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1,
				      regs, address);
		}
		if (fault & VM_FAULT_RETRY) {
			/* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
			 * of starvation. */
			flags &= ~FAULT_FLAG_ALLOW_RETRY;
			down_read(&mm->mmap_sem);
			goto retry;
		}
	}
	/*
	 * The instruction that caused the program check will
	 * be repeated. Don't signal single step via SIGTRAP.
	 */
	clear_tsk_thread_flag(tsk, TIF_PER_TRAP);
	fault = 0;
out_up:
	up_read(&mm->mmap_sem);
out:
	return fault;
}
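
/*
 * Note on the retry logic in do_exception(): the first attempt runs with
 * FAULT_FLAG_ALLOW_RETRY.  If handle_mm_fault() returns VM_FAULT_RETRY it
 * has already dropped mmap_sem, so the semaphore is re-taken and the vma
 * lookup is redone, this time without the retry flag, which bounds the
 * number of retries and avoids starving the faulting task.
 */
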
void __kprobes do_protection_exception(struct pt_regs *regs)
{
	unsigned long trans_exc_code;
	int fault;

	trans_exc_code = regs->int_parm_long;
	/* Protection exception is suppressing, decrement psw address. */
	regs->psw.addr = __rewind_psw(regs->psw, regs->int_code >> 16);
	/*
	 * Check for low-address protection.  This needs to be treated
	 * as a special case because the translation exception code
	 * field is not guaranteed to contain valid data in this case.
	 */
	if (unlikely(!(trans_exc_code & 4))) {
		do_low_address(regs);
		return;
	}
	fault = do_exception(regs, VM_WRITE);
	if (unlikely(fault))
		do_fault_error(regs, fault);
}

void __kprobes do_dat_exception(struct pt_regs *regs)
{
	int access, fault;

	access = VM_READ | VM_EXEC | VM_WRITE;
	fault = do_exception(regs, access);
	if (unlikely(fault))
		do_fault_error(regs, fault);
}

#ifdef CONFIG_64BIT
void __kprobes do_asce_exception(struct pt_regs *regs)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	unsigned long trans_exc_code;

	trans_exc_code = regs->int_parm_long;
	if (unlikely(!user_space_fault(trans_exc_code) || in_atomic() || !mm))
		goto no_context;

	down_read(&mm->mmap_sem);
	vma = find_vma(mm, trans_exc_code & __FAIL_ADDR_MASK);
	up_read(&mm->mmap_sem);

	if (vma) {
		update_mm(mm, current);
		return;
	}

	/* User mode accesses just cause a SIGSEGV */
	if (regs->psw.mask & PSW_MASK_PSTATE) {
		do_sigsegv(regs, SEGV_MAPERR);
		return;
	}

no_context:
	do_no_context(regs);
}
#endif

int __handle_fault(unsigned long uaddr, unsigned long pgm_int_code, int write)
{
	struct pt_regs regs;
	int access, fault;

	regs.psw.mask = psw_kernel_bits | PSW_MASK_DAT | PSW_MASK_MCHECK;
	if (!irqs_disabled())
		regs.psw.mask |= PSW_MASK_IO | PSW_MASK_EXT;
	regs.psw.addr = (unsigned long) __builtin_return_address(0);
	regs.psw.addr |= PSW_ADDR_AMODE;
	regs.int_code = pgm_int_code;
	regs.int_parm_long = (uaddr & PAGE_MASK) | 2;
	access = write ? VM_WRITE : VM_READ;
	fault = do_exception(&regs, access);
	if (unlikely(fault)) {
		if (fault & VM_FAULT_OOM)
			return -EFAULT;
		else if (fault & VM_FAULT_SIGBUS)
			do_sigbus(&regs);
	}
	return fault ? -EFAULT : 0;
}

#ifdef CONFIG_PFAULT
/*
 * 'pfault' pseudo page faults routines.
 */
static int pfault_disable;

static int __init nopfault(char *str)
{
	pfault_disable = 1;
	return 1;
}

__setup("nopfault", nopfault);

struct pfault_refbk {
	u16 refdiagc;
	u16 reffcode;
	u16 refdwlen;
	u16 refversn;
	u64 refgaddr;
	u64 refselmk;
	u64 refcmpmk;
	u64 reserved;
} __attribute__ ((packed, aligned(8)));

int pfault_init(void)
{
	struct pfault_refbk refbk = {
		.refdiagc = 0x258,
		.reffcode = 0,
		.refdwlen = 5,
		.refversn = 2,
		.refgaddr = __LC_CURRENT_PID,
		.refselmk = 1ULL << 48,
		.refcmpmk = 1ULL << 48,
		.reserved = __PF_RES_FIELD };
	int rc;

	if (pfault_disable)
		return -1;
	asm volatile(
		"	diag	%1,%0,0x258\n"
		"0:	j	2f\n"
		"1:	la	%0,8\n"
		"2:\n"
		EX_TABLE(0b,1b)
		: "=d" (rc) : "a" (&refbk), "m" (refbk) : "cc");
	return rc;
}

void pfault_fini(void)
{
	struct pfault_refbk refbk = {
		.refdiagc = 0x258,
		.reffcode = 1,
		.refdwlen = 5,
		.refversn = 2,
	};

	if (pfault_disable)
		return;
	asm volatile(
		"	diag	%0,0,0x258\n"
		"0:\n"
		EX_TABLE(0b,0b)
		: : "a" (&refbk), "m" (refbk) : "cc");
}

static DEFINE_SPINLOCK(pfault_lock);
static LIST_HEAD(pfault_list);

/*
 * State of tsk->thread.pfault_wait as managed by pfault_interrupt():
 *    0  no pseudo page fault pending
 *    1  initial interrupt seen, task queued on pfault_list and put to
 *       sleep until the completion interrupt wakes it up
 *   -1  completion interrupt arrived before the initial interrupt; the
 *       late initial interrupt only resets the field to 0
 */
static void pfault_interrupt(struct ext_code ext_code,
			     unsigned int param32, unsigned long param64)
{
	struct task_struct *tsk;
	__u16 subcode;
	pid_t pid;

	/*
	 * Get the external interruption subcode & pfault
	 * initial/completion signal bit. VM stores this
	 * in the 'cpu address' field associated with the
	 * external interrupt.
	 */
	subcode = ext_code.subcode;
	if ((subcode & 0xff00) != __SUBCODE_MASK)
		return;
	kstat_cpu(smp_processor_id()).irqs[EXTINT_PFL]++;
	if (subcode & 0x0080) {
		/* Get the token (= pid of the affected task). */
		pid = sizeof(void *) == 4 ? param32 : param64;
		rcu_read_lock();
		tsk = find_task_by_pid_ns(pid, &init_pid_ns);
		if (tsk)
			get_task_struct(tsk);
		rcu_read_unlock();
		if (!tsk)
			return;
	} else {
		tsk = current;
	}
	spin_lock(&pfault_lock);
	if (subcode & 0x0080) {
		/* signal bit is set -> a page has been swapped in by VM */
		if (tsk->thread.pfault_wait == 1) {
			/* Initial interrupt was faster than the completion
			 * interrupt. pfault_wait is valid. Set pfault_wait
			 * back to zero and wake up the process. This can
			 * safely be done because the task is still sleeping
			 * and can't produce new pfaults. */
			tsk->thread.pfault_wait = 0;
			list_del(&tsk->thread.list);
			wake_up_process(tsk);
		} else {
			/* Completion interrupt was faster than initial
			 * interrupt. Set pfault_wait to -1 so the initial
			 * interrupt doesn't put the task to sleep.
			 * If the task is not running, ignore the completion
			 * interrupt since it must be a leftover of a PFAULT
			 * CANCEL operation which didn't remove all pending
			 * completion interrupts. */
			if (tsk->state == TASK_RUNNING)
				tsk->thread.pfault_wait = -1;
		}
		put_task_struct(tsk);
	} else {
		/* signal bit not set -> a real page is missing. */
		if (tsk->thread.pfault_wait == -1) {
			/* Completion interrupt was faster than the initial
			 * interrupt (pfault_wait == -1). Set pfault_wait
			 * back to zero and exit. */
			tsk->thread.pfault_wait = 0;
		} else {
			/* Initial interrupt arrived before completion
			 * interrupt. Let the task sleep. */
			tsk->thread.pfault_wait = 1;
			list_add(&tsk->thread.list, &pfault_list);
			set_task_state(tsk, TASK_UNINTERRUPTIBLE);
			set_tsk_need_resched(tsk);
		}
	}
	spin_unlock(&pfault_lock);
}

static int __cpuinit pfault_cpu_notify(struct notifier_block *self,
				       unsigned long action, void *hcpu)
{
	struct thread_struct *thread, *next;
	struct task_struct *tsk;

	switch (action) {
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		spin_lock_irq(&pfault_lock);
		list_for_each_entry_safe(thread, next, &pfault_list, list) {
			thread->pfault_wait = 0;
			list_del(&thread->list);
			tsk = container_of(thread, struct task_struct, thread);
			wake_up_process(tsk);
		}
		spin_unlock_irq(&pfault_lock);
		break;
	default:
		break;
	}
	return NOTIFY_OK;
}

static int __init pfault_irq_init(void)
{
	int rc;

	rc = register_external_interrupt(0x2603, pfault_interrupt);
	if (rc)
		goto out_extint;
	rc = pfault_init() == 0 ? 0 : -EOPNOTSUPP;
	if (rc)
		goto out_pfault;
	service_subclass_irq_register();
	hotcpu_notifier(pfault_cpu_notify, 0);
	return 0;

out_pfault:
	unregister_external_interrupt(0x2603, pfault_interrupt);
out_extint:
	pfault_disable = 1;
	return rc;
}
early_initcall(pfault_irq_init);

#endif /* CONFIG_PFAULT */