1 /* 2 * arch/s390/mm/fault.c 3 * 4 * S390 version 5 * Copyright (C) 1999 IBM Deutschland Entwicklung GmbH, IBM Corporation 6 * Author(s): Hartmut Penner (hp@de.ibm.com) 7 * Ulrich Weigand (uweigand@de.ibm.com) 8 * 9 * Derived from "arch/i386/mm/fault.c" 10 * Copyright (C) 1995 Linus Torvalds 11 */ 12 13 #include <linux/perf_event.h> 14 #include <linux/signal.h> 15 #include <linux/sched.h> 16 #include <linux/kernel.h> 17 #include <linux/errno.h> 18 #include <linux/string.h> 19 #include <linux/types.h> 20 #include <linux/ptrace.h> 21 #include <linux/mman.h> 22 #include <linux/mm.h> 23 #include <linux/compat.h> 24 #include <linux/smp.h> 25 #include <linux/kdebug.h> 26 #include <linux/init.h> 27 #include <linux/console.h> 28 #include <linux/module.h> 29 #include <linux/hardirq.h> 30 #include <linux/kprobes.h> 31 #include <linux/uaccess.h> 32 #include <linux/hugetlb.h> 33 #include <asm/asm-offsets.h> 34 #include <asm/system.h> 35 #include <asm/pgtable.h> 36 #include <asm/s390_ext.h> 37 #include <asm/mmu_context.h> 38 #include <asm/compat.h> 39 #include "../kernel/entry.h" 40 41 #ifndef CONFIG_64BIT 42 #define __FAIL_ADDR_MASK 0x7ffff000 43 #define __SUBCODE_MASK 0x0200 44 #define __PF_RES_FIELD 0ULL 45 #else /* CONFIG_64BIT */ 46 #define __FAIL_ADDR_MASK -4096L 47 #define __SUBCODE_MASK 0x0600 48 #define __PF_RES_FIELD 0x8000000000000000ULL 49 #endif /* CONFIG_64BIT */ 50 51 #define VM_FAULT_BADCONTEXT 0x010000 52 #define VM_FAULT_BADMAP 0x020000 53 #define VM_FAULT_BADACCESS 0x040000 54 55 static inline int notify_page_fault(struct pt_regs *regs) 56 { 57 int ret = 0; 58 59 /* kprobe_running() needs smp_processor_id() */ 60 if (kprobes_built_in() && !user_mode(regs)) { 61 preempt_disable(); 62 if (kprobe_running() && kprobe_fault_handler(regs, 14)) 63 ret = 1; 64 preempt_enable(); 65 } 66 return ret; 67 } 68 69 70 /* 71 * Unlock any spinlocks which will prevent us from getting the 72 * message out. 73 */ 74 void bust_spinlocks(int yes) 75 { 76 if (yes) { 77 oops_in_progress = 1; 78 } else { 79 int loglevel_save = console_loglevel; 80 console_unblank(); 81 oops_in_progress = 0; 82 /* 83 * OK, the message is on the console. Now we call printk() 84 * without oops_in_progress set so that printk will give klogd 85 * a poke. Hold onto your hats... 86 */ 87 console_loglevel = 15; 88 printk(" "); 89 console_loglevel = loglevel_save; 90 } 91 } 92 93 /* 94 * Returns the address space associated with the fault. 95 * Returns 0 for kernel space and 1 for user space. 96 */ 97 static inline int user_space_fault(unsigned long trans_exc_code) 98 { 99 /* 100 * The lowest two bits of the translation exception 101 * identification indicate which paging table was used. 102 */ 103 trans_exc_code &= 3; 104 if (trans_exc_code == 2) 105 /* Access via secondary space, set_fs setting decides */ 106 return current->thread.mm_segment.ar4; 107 if (user_mode == HOME_SPACE_MODE) 108 /* User space if the access has been done via home space. */ 109 return trans_exc_code == 3; 110 /* 111 * If the user space is not the home space the kernel runs in home 112 * space. Access via secondary space has already been covered, 113 * access via primary space or access register is from user space 114 * and access via home space is from the kernel. 115 */ 116 return trans_exc_code != 3; 117 } 118 119 static inline void report_user_fault(struct pt_regs *regs, long int_code, 120 int signr, unsigned long address) 121 { 122 if ((task_pid_nr(current) > 1) && !show_unhandled_signals) 123 return; 124 if (!unhandled_signal(current, signr)) 125 return; 126 if (!printk_ratelimit()) 127 return; 128 printk("User process fault: interruption code 0x%lX ", int_code); 129 print_vma_addr(KERN_CONT "in ", regs->psw.addr & PSW_ADDR_INSN); 130 printk("\n"); 131 printk("failing address: %lX\n", address); 132 show_regs(regs); 133 } 134 135 /* 136 * Send SIGSEGV to task. This is an external routine 137 * to keep the stack usage of do_page_fault small. 138 */ 139 static noinline void do_sigsegv(struct pt_regs *regs, long int_code, 140 int si_code, unsigned long trans_exc_code) 141 { 142 struct siginfo si; 143 unsigned long address; 144 145 address = trans_exc_code & __FAIL_ADDR_MASK; 146 current->thread.prot_addr = address; 147 current->thread.trap_no = int_code; 148 report_user_fault(regs, int_code, SIGSEGV, address); 149 si.si_signo = SIGSEGV; 150 si.si_code = si_code; 151 si.si_addr = (void __user *) address; 152 force_sig_info(SIGSEGV, &si, current); 153 } 154 155 static noinline void do_no_context(struct pt_regs *regs, long int_code, 156 unsigned long trans_exc_code) 157 { 158 const struct exception_table_entry *fixup; 159 unsigned long address; 160 161 /* Are we prepared to handle this kernel fault? */ 162 fixup = search_exception_tables(regs->psw.addr & PSW_ADDR_INSN); 163 if (fixup) { 164 regs->psw.addr = fixup->fixup | PSW_ADDR_AMODE; 165 return; 166 } 167 168 /* 169 * Oops. The kernel tried to access some bad page. We'll have to 170 * terminate things with extreme prejudice. 171 */ 172 address = trans_exc_code & __FAIL_ADDR_MASK; 173 if (!user_space_fault(trans_exc_code)) 174 printk(KERN_ALERT "Unable to handle kernel pointer dereference" 175 " at virtual kernel address %p\n", (void *)address); 176 else 177 printk(KERN_ALERT "Unable to handle kernel paging request" 178 " at virtual user address %p\n", (void *)address); 179 180 die("Oops", regs, int_code); 181 do_exit(SIGKILL); 182 } 183 184 static noinline void do_low_address(struct pt_regs *regs, long int_code, 185 unsigned long trans_exc_code) 186 { 187 /* Low-address protection hit in kernel mode means 188 NULL pointer write access in kernel mode. */ 189 if (regs->psw.mask & PSW_MASK_PSTATE) { 190 /* Low-address protection hit in user mode 'cannot happen'. */ 191 die ("Low-address protection", regs, int_code); 192 do_exit(SIGKILL); 193 } 194 195 do_no_context(regs, int_code, trans_exc_code); 196 } 197 198 static noinline void do_sigbus(struct pt_regs *regs, long int_code, 199 unsigned long trans_exc_code) 200 { 201 struct task_struct *tsk = current; 202 203 /* 204 * Send a sigbus, regardless of whether we were in kernel 205 * or user mode. 206 */ 207 tsk->thread.prot_addr = trans_exc_code & __FAIL_ADDR_MASK; 208 tsk->thread.trap_no = int_code; 209 force_sig(SIGBUS, tsk); 210 } 211 212 #ifdef CONFIG_S390_EXEC_PROTECT 213 static noinline int signal_return(struct pt_regs *regs, long int_code, 214 unsigned long trans_exc_code) 215 { 216 u16 instruction; 217 int rc; 218 219 rc = __get_user(instruction, (u16 __user *) regs->psw.addr); 220 221 if (!rc && instruction == 0x0a77) { 222 clear_tsk_thread_flag(current, TIF_SINGLE_STEP); 223 if (is_compat_task()) 224 sys32_sigreturn(); 225 else 226 sys_sigreturn(); 227 } else if (!rc && instruction == 0x0aad) { 228 clear_tsk_thread_flag(current, TIF_SINGLE_STEP); 229 if (is_compat_task()) 230 sys32_rt_sigreturn(); 231 else 232 sys_rt_sigreturn(); 233 } else 234 do_sigsegv(regs, int_code, SEGV_MAPERR, trans_exc_code); 235 return 0; 236 } 237 #endif /* CONFIG_S390_EXEC_PROTECT */ 238 239 static noinline void do_fault_error(struct pt_regs *regs, long int_code, 240 unsigned long trans_exc_code, int fault) 241 { 242 int si_code; 243 244 switch (fault) { 245 case VM_FAULT_BADACCESS: 246 #ifdef CONFIG_S390_EXEC_PROTECT 247 if ((regs->psw.mask & PSW_MASK_ASC) == PSW_ASC_SECONDARY && 248 (trans_exc_code & 3) == 0) { 249 signal_return(regs, int_code, trans_exc_code); 250 break; 251 } 252 #endif /* CONFIG_S390_EXEC_PROTECT */ 253 case VM_FAULT_BADMAP: 254 /* Bad memory access. Check if it is kernel or user space. */ 255 if (regs->psw.mask & PSW_MASK_PSTATE) { 256 /* User mode accesses just cause a SIGSEGV */ 257 si_code = (fault == VM_FAULT_BADMAP) ? 258 SEGV_MAPERR : SEGV_ACCERR; 259 do_sigsegv(regs, int_code, si_code, trans_exc_code); 260 return; 261 } 262 case VM_FAULT_BADCONTEXT: 263 do_no_context(regs, int_code, trans_exc_code); 264 break; 265 default: /* fault & VM_FAULT_ERROR */ 266 if (fault & VM_FAULT_OOM) 267 pagefault_out_of_memory(); 268 else if (fault & VM_FAULT_SIGBUS) { 269 do_sigbus(regs, int_code, trans_exc_code); 270 /* Kernel mode? Handle exceptions or die */ 271 if (!(regs->psw.mask & PSW_MASK_PSTATE)) 272 do_no_context(regs, int_code, trans_exc_code); 273 } else 274 BUG(); 275 break; 276 } 277 } 278 279 /* 280 * This routine handles page faults. It determines the address, 281 * and the problem, and then passes it off to one of the appropriate 282 * routines. 283 * 284 * interruption code (int_code): 285 * 04 Protection -> Write-Protection (suprression) 286 * 10 Segment translation -> Not present (nullification) 287 * 11 Page translation -> Not present (nullification) 288 * 3b Region third trans. -> Not present (nullification) 289 */ 290 static inline int do_exception(struct pt_regs *regs, int access, 291 unsigned long trans_exc_code) 292 { 293 struct task_struct *tsk; 294 struct mm_struct *mm; 295 struct vm_area_struct *vma; 296 unsigned long address; 297 int fault; 298 299 if (notify_page_fault(regs)) 300 return 0; 301 302 tsk = current; 303 mm = tsk->mm; 304 305 /* 306 * Verify that the fault happened in user space, that 307 * we are not in an interrupt and that there is a 308 * user context. 309 */ 310 fault = VM_FAULT_BADCONTEXT; 311 if (unlikely(!user_space_fault(trans_exc_code) || in_atomic() || !mm)) 312 goto out; 313 314 address = trans_exc_code & __FAIL_ADDR_MASK; 315 /* 316 * When we get here, the fault happened in the current 317 * task's user address space, so we can switch on the 318 * interrupts again and then search the VMAs 319 */ 320 local_irq_enable(); 321 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); 322 down_read(&mm->mmap_sem); 323 324 fault = VM_FAULT_BADMAP; 325 vma = find_vma(mm, address); 326 if (!vma) 327 goto out_up; 328 329 if (unlikely(vma->vm_start > address)) { 330 if (!(vma->vm_flags & VM_GROWSDOWN)) 331 goto out_up; 332 if (expand_stack(vma, address)) 333 goto out_up; 334 } 335 336 /* 337 * Ok, we have a good vm_area for this memory access, so 338 * we can handle it.. 339 */ 340 fault = VM_FAULT_BADACCESS; 341 if (unlikely(!(vma->vm_flags & access))) 342 goto out_up; 343 344 if (is_vm_hugetlb_page(vma)) 345 address &= HPAGE_MASK; 346 /* 347 * If for any reason at all we couldn't handle the fault, 348 * make sure we exit gracefully rather than endlessly redo 349 * the fault. 350 */ 351 fault = handle_mm_fault(mm, vma, address, 352 (access == VM_WRITE) ? FAULT_FLAG_WRITE : 0); 353 if (unlikely(fault & VM_FAULT_ERROR)) 354 goto out_up; 355 356 if (fault & VM_FAULT_MAJOR) { 357 tsk->maj_flt++; 358 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, 359 regs, address); 360 } else { 361 tsk->min_flt++; 362 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, 363 regs, address); 364 } 365 /* 366 * The instruction that caused the program check will 367 * be repeated. Don't signal single step via SIGTRAP. 368 */ 369 clear_tsk_thread_flag(tsk, TIF_SINGLE_STEP); 370 fault = 0; 371 out_up: 372 up_read(&mm->mmap_sem); 373 out: 374 return fault; 375 } 376 377 void __kprobes do_protection_exception(struct pt_regs *regs, long int_code) 378 { 379 unsigned long trans_exc_code = S390_lowcore.trans_exc_code; 380 int fault; 381 382 /* Protection exception is supressing, decrement psw address. */ 383 regs->psw.addr -= (int_code >> 16); 384 /* 385 * Check for low-address protection. This needs to be treated 386 * as a special case because the translation exception code 387 * field is not guaranteed to contain valid data in this case. 388 */ 389 if (unlikely(!(trans_exc_code & 4))) { 390 do_low_address(regs, int_code, trans_exc_code); 391 return; 392 } 393 fault = do_exception(regs, VM_WRITE, trans_exc_code); 394 if (unlikely(fault)) 395 do_fault_error(regs, 4, trans_exc_code, fault); 396 } 397 398 void __kprobes do_dat_exception(struct pt_regs *regs, long int_code) 399 { 400 unsigned long trans_exc_code = S390_lowcore.trans_exc_code; 401 int access, fault; 402 403 access = VM_READ | VM_EXEC | VM_WRITE; 404 #ifdef CONFIG_S390_EXEC_PROTECT 405 if ((regs->psw.mask & PSW_MASK_ASC) == PSW_ASC_SECONDARY && 406 (trans_exc_code & 3) == 0) 407 access = VM_EXEC; 408 #endif 409 fault = do_exception(regs, access, trans_exc_code); 410 if (unlikely(fault)) 411 do_fault_error(regs, int_code & 255, trans_exc_code, fault); 412 } 413 414 #ifdef CONFIG_64BIT 415 void __kprobes do_asce_exception(struct pt_regs *regs, long int_code) 416 { 417 unsigned long trans_exc_code = S390_lowcore.trans_exc_code; 418 struct mm_struct *mm = current->mm; 419 struct vm_area_struct *vma; 420 421 if (unlikely(!user_space_fault(trans_exc_code) || in_atomic() || !mm)) 422 goto no_context; 423 424 local_irq_enable(); 425 426 down_read(&mm->mmap_sem); 427 vma = find_vma(mm, trans_exc_code & __FAIL_ADDR_MASK); 428 up_read(&mm->mmap_sem); 429 430 if (vma) { 431 update_mm(mm, current); 432 return; 433 } 434 435 /* User mode accesses just cause a SIGSEGV */ 436 if (regs->psw.mask & PSW_MASK_PSTATE) { 437 do_sigsegv(regs, int_code, SEGV_MAPERR, trans_exc_code); 438 return; 439 } 440 441 no_context: 442 do_no_context(regs, int_code, trans_exc_code); 443 } 444 #endif 445 446 int __handle_fault(unsigned long uaddr, unsigned long int_code, int write_user) 447 { 448 struct pt_regs regs; 449 int access, fault; 450 451 regs.psw.mask = psw_kernel_bits; 452 if (!irqs_disabled()) 453 regs.psw.mask |= PSW_MASK_IO | PSW_MASK_EXT; 454 regs.psw.addr = (unsigned long) __builtin_return_address(0); 455 regs.psw.addr |= PSW_ADDR_AMODE; 456 uaddr &= PAGE_MASK; 457 access = write_user ? VM_WRITE : VM_READ; 458 fault = do_exception(®s, access, uaddr | 2); 459 if (unlikely(fault)) { 460 if (fault & VM_FAULT_OOM) { 461 pagefault_out_of_memory(); 462 fault = 0; 463 } else if (fault & VM_FAULT_SIGBUS) 464 do_sigbus(®s, int_code, uaddr); 465 } 466 return fault ? -EFAULT : 0; 467 } 468 469 #ifdef CONFIG_PFAULT 470 /* 471 * 'pfault' pseudo page faults routines. 472 */ 473 static ext_int_info_t ext_int_pfault; 474 static int pfault_disable = 0; 475 476 static int __init nopfault(char *str) 477 { 478 pfault_disable = 1; 479 return 1; 480 } 481 482 __setup("nopfault", nopfault); 483 484 typedef struct { 485 __u16 refdiagc; 486 __u16 reffcode; 487 __u16 refdwlen; 488 __u16 refversn; 489 __u64 refgaddr; 490 __u64 refselmk; 491 __u64 refcmpmk; 492 __u64 reserved; 493 } __attribute__ ((packed, aligned(8))) pfault_refbk_t; 494 495 int pfault_init(void) 496 { 497 pfault_refbk_t refbk = 498 { 0x258, 0, 5, 2, __LC_CURRENT, 1ULL << 48, 1ULL << 48, 499 __PF_RES_FIELD }; 500 int rc; 501 502 if (!MACHINE_IS_VM || pfault_disable) 503 return -1; 504 asm volatile( 505 " diag %1,%0,0x258\n" 506 "0: j 2f\n" 507 "1: la %0,8\n" 508 "2:\n" 509 EX_TABLE(0b,1b) 510 : "=d" (rc) : "a" (&refbk), "m" (refbk) : "cc"); 511 __ctl_set_bit(0, 9); 512 return rc; 513 } 514 515 void pfault_fini(void) 516 { 517 pfault_refbk_t refbk = 518 { 0x258, 1, 5, 2, 0ULL, 0ULL, 0ULL, 0ULL }; 519 520 if (!MACHINE_IS_VM || pfault_disable) 521 return; 522 __ctl_clear_bit(0,9); 523 asm volatile( 524 " diag %0,0,0x258\n" 525 "0:\n" 526 EX_TABLE(0b,0b) 527 : : "a" (&refbk), "m" (refbk) : "cc"); 528 } 529 530 static void pfault_interrupt(__u16 int_code) 531 { 532 struct task_struct *tsk; 533 __u16 subcode; 534 535 /* 536 * Get the external interruption subcode & pfault 537 * initial/completion signal bit. VM stores this 538 * in the 'cpu address' field associated with the 539 * external interrupt. 540 */ 541 subcode = S390_lowcore.cpu_addr; 542 if ((subcode & 0xff00) != __SUBCODE_MASK) 543 return; 544 545 /* 546 * Get the token (= address of the task structure of the affected task). 547 */ 548 tsk = *(struct task_struct **) __LC_PFAULT_INTPARM; 549 550 if (subcode & 0x0080) { 551 /* signal bit is set -> a page has been swapped in by VM */ 552 if (xchg(&tsk->thread.pfault_wait, -1) != 0) { 553 /* Initial interrupt was faster than the completion 554 * interrupt. pfault_wait is valid. Set pfault_wait 555 * back to zero and wake up the process. This can 556 * safely be done because the task is still sleeping 557 * and can't produce new pfaults. */ 558 tsk->thread.pfault_wait = 0; 559 wake_up_process(tsk); 560 put_task_struct(tsk); 561 } 562 } else { 563 /* signal bit not set -> a real page is missing. */ 564 get_task_struct(tsk); 565 set_task_state(tsk, TASK_UNINTERRUPTIBLE); 566 if (xchg(&tsk->thread.pfault_wait, 1) != 0) { 567 /* Completion interrupt was faster than the initial 568 * interrupt (swapped in a -1 for pfault_wait). Set 569 * pfault_wait back to zero and exit. This can be 570 * done safely because tsk is running in kernel 571 * mode and can't produce new pfaults. */ 572 tsk->thread.pfault_wait = 0; 573 set_task_state(tsk, TASK_RUNNING); 574 put_task_struct(tsk); 575 } else 576 set_tsk_need_resched(tsk); 577 } 578 } 579 580 void __init pfault_irq_init(void) 581 { 582 if (!MACHINE_IS_VM) 583 return; 584 585 /* 586 * Try to get pfault pseudo page faults going. 587 */ 588 if (register_early_external_interrupt(0x2603, pfault_interrupt, 589 &ext_int_pfault) != 0) 590 panic("Couldn't request external interrupt 0x2603"); 591 592 if (pfault_init() == 0) 593 return; 594 595 /* Tough luck, no pfault. */ 596 pfault_disable = 1; 597 unregister_early_external_interrupt(0x2603, pfault_interrupt, 598 &ext_int_pfault); 599 } 600 #endif 601