xref: /openbmc/linux/arch/s390/mm/fault.c (revision 4ed91d48259d9ddd378424d008f2e6559f7e78f8)
1 /*
2  *  S390 version
3  *    Copyright IBM Corp. 1999
4  *    Author(s): Hartmut Penner (hp@de.ibm.com)
5  *               Ulrich Weigand (uweigand@de.ibm.com)
6  *
7  *  Derived from "arch/i386/mm/fault.c"
8  *    Copyright (C) 1995  Linus Torvalds
9  */
10 
11 #include <linux/kernel_stat.h>
12 #include <linux/perf_event.h>
13 #include <linux/signal.h>
14 #include <linux/sched.h>
15 #include <linux/sched/debug.h>
16 #include <linux/kernel.h>
17 #include <linux/errno.h>
18 #include <linux/string.h>
19 #include <linux/types.h>
20 #include <linux/ptrace.h>
21 #include <linux/mman.h>
22 #include <linux/mm.h>
23 #include <linux/compat.h>
24 #include <linux/smp.h>
25 #include <linux/kdebug.h>
26 #include <linux/init.h>
27 #include <linux/console.h>
28 #include <linux/extable.h>
29 #include <linux/hardirq.h>
30 #include <linux/kprobes.h>
31 #include <linux/uaccess.h>
32 #include <linux/hugetlb.h>
33 #include <asm/asm-offsets.h>
34 #include <asm/diag.h>
35 #include <asm/pgtable.h>
36 #include <asm/gmap.h>
37 #include <asm/irq.h>
38 #include <asm/mmu_context.h>
39 #include <asm/facility.h>
40 #include "../kernel/entry.h"
41 
/*
 * ANDed with regs->int_parm_long throughout this file to obtain the
 * failing address; -4096L clears the low 12 bits.
 */
42 #define __FAIL_ADDR_MASK -4096L
/* pfault_interrupt() only acts when (subcode & 0xff00) equals this value. */
43 #define __SUBCODE_MASK 0x0600
/* Value pfault_init() stores in the "reserved" field of struct pfault_refbk. */
44 #define __PF_RES_FIELD 0x8000000000000000ULL
45 
/*
 * Fault codes private to this file.  do_exception() returns them in
 * addition to whatever handle_mm_fault() reports, and do_fault_error()
 * switches on them.
 */
/* Not a user space fault, fault handler disabled, or no mm. */
46 #define VM_FAULT_BADCONTEXT	0x010000
/* No usable vma for the address (or the gmap lookup returned -EFAULT). */
47 #define VM_FAULT_BADMAP		0x020000
/* The vma does not grant any of the requested access bits. */
48 #define VM_FAULT_BADACCESS	0x040000
/* handle_mm_fault() asked for a retry while a fatal signal is pending. */
49 #define VM_FAULT_SIGNAL		0x080000
/* gmap fault with FAULT_FLAG_RETRY_NOWAIT that would have had to wait. */
50 #define VM_FAULT_PFAULT		0x100000
51 
52 static unsigned long store_indication __read_mostly;
53 
54 static int __init fault_init(void)
55 {
56 	if (test_facility(75))
57 		store_indication = 0xc00;
58 	return 0;
59 }
60 early_initcall(fault_init);
61 
/*
 * Give a running kprobe the chance to handle a kernel-mode fault.
 *
 * Returns 1 when kprobe_fault_handler() handled the fault, 0 otherwise
 * (including for user-mode faults and kernels built without kprobes).
 */
static inline int notify_page_fault(struct pt_regs *regs)
{
	int handled;

	if (!kprobes_built_in() || user_mode(regs))
		return 0;

	/* kprobe_running() needs smp_processor_id(), so disable preemption */
	preempt_disable();
	handled = kprobe_running() && kprobe_fault_handler(regs, 14);
	preempt_enable();

	return handled;
}
75 
76 
77 /*
78  * Unlock any spinlocks which will prevent us from getting the
79  * message out.
80  */
81 void bust_spinlocks(int yes)
82 {
83 	if (yes) {
84 		oops_in_progress = 1;
85 	} else {
86 		int loglevel_save = console_loglevel;
87 		console_unblank();
88 		oops_in_progress = 0;
89 		/*
90 		 * OK, the message is on the console.  Now we call printk()
91 		 * without oops_in_progress set so that printk will give klogd
92 		 * a poke.  Hold onto your hats...
93 		 */
94 		console_loglevel = 15;
95 		printk(" ");
96 		console_loglevel = loglevel_save;
97 	}
98 }
99 
100 /*
101  * Returns the address space associated with the fault.
102  * Returns 0 for kernel space and 1 for user space.
103  */
104 static inline int user_space_fault(struct pt_regs *regs)
105 {
106 	unsigned long trans_exc_code;
107 
108 	/*
109 	 * The lowest two bits of the translation exception
110 	 * identification indicate which paging table was used.
111 	 */
112 	trans_exc_code = regs->int_parm_long & 3;
113 	if (trans_exc_code == 3) /* home space -> kernel */
114 		return 0;
115 	if (user_mode(regs))
116 		return 1;
117 	if (trans_exc_code == 2) /* secondary space -> set_fs */
118 		return current->thread.mm_segment.ar4;
119 	if (current->flags & PF_VCPU)
120 		return 1;
121 	return 0;
122 }
123 
/*
 * Probe whether one unsigned long can be read from @p.
 * Returns the result of probe_kernel_address(): zero when readable.
 */
static int bad_address(void *p)
{
	unsigned long *word = (unsigned long *)p;
	unsigned long scratch;

	return probe_kernel_address(word, scratch);
}
130 
131 static void dump_pagetable(unsigned long asce, unsigned long address)
132 {
133 	unsigned long *table = __va(asce & PAGE_MASK);
134 
135 	pr_alert("AS:%016lx ", asce);
136 	switch (asce & _ASCE_TYPE_MASK) {
137 	case _ASCE_TYPE_REGION1:
138 		table = table + ((address >> 53) & 0x7ff);
139 		if (bad_address(table))
140 			goto bad;
141 		pr_cont("R1:%016lx ", *table);
142 		if (*table & _REGION_ENTRY_INVALID)
143 			goto out;
144 		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
145 		/* fallthrough */
146 	case _ASCE_TYPE_REGION2:
147 		table = table + ((address >> 42) & 0x7ff);
148 		if (bad_address(table))
149 			goto bad;
150 		pr_cont("R2:%016lx ", *table);
151 		if (*table & _REGION_ENTRY_INVALID)
152 			goto out;
153 		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
154 		/* fallthrough */
155 	case _ASCE_TYPE_REGION3:
156 		table = table + ((address >> 31) & 0x7ff);
157 		if (bad_address(table))
158 			goto bad;
159 		pr_cont("R3:%016lx ", *table);
160 		if (*table & (_REGION_ENTRY_INVALID | _REGION3_ENTRY_LARGE))
161 			goto out;
162 		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
163 		/* fallthrough */
164 	case _ASCE_TYPE_SEGMENT:
165 		table = table + ((address >> 20) & 0x7ff);
166 		if (bad_address(table))
167 			goto bad;
168 		pr_cont("S:%016lx ", *table);
169 		if (*table & (_SEGMENT_ENTRY_INVALID | _SEGMENT_ENTRY_LARGE))
170 			goto out;
171 		table = (unsigned long *)(*table & _SEGMENT_ENTRY_ORIGIN);
172 	}
173 	table = table + ((address >> 12) & 0xff);
174 	if (bad_address(table))
175 		goto bad;
176 	pr_cont("P:%016lx ", *table);
177 out:
178 	pr_cont("\n");
179 	return;
180 bad:
181 	pr_cont("BAD\n");
182 }
183 
184 static void dump_fault_info(struct pt_regs *regs)
185 {
186 	unsigned long asce;
187 
188 	pr_alert("Failing address: %016lx TEID: %016lx\n",
189 		 regs->int_parm_long & __FAIL_ADDR_MASK, regs->int_parm_long);
190 	pr_alert("Fault in ");
191 	switch (regs->int_parm_long & 3) {
192 	case 3:
193 		pr_cont("home space ");
194 		break;
195 	case 2:
196 		pr_cont("secondary space ");
197 		break;
198 	case 1:
199 		pr_cont("access register ");
200 		break;
201 	case 0:
202 		pr_cont("primary space ");
203 		break;
204 	}
205 	pr_cont("mode while using ");
206 	if (!user_space_fault(regs)) {
207 		asce = S390_lowcore.kernel_asce;
208 		pr_cont("kernel ");
209 	}
210 #ifdef CONFIG_PGSTE
211 	else if ((current->flags & PF_VCPU) && S390_lowcore.gmap) {
212 		struct gmap *gmap = (struct gmap *)S390_lowcore.gmap;
213 		asce = gmap->asce;
214 		pr_cont("gmap ");
215 	}
216 #endif
217 	else {
218 		asce = S390_lowcore.user_asce;
219 		pr_cont("user ");
220 	}
221 	pr_cont("ASCE.\n");
222 	dump_pagetable(asce, regs->int_parm_long & __FAIL_ADDR_MASK);
223 }
224 
225 int show_unhandled_signals = 1;
226 
227 void report_user_fault(struct pt_regs *regs, long signr, int is_mm_fault)
228 {
229 	if ((task_pid_nr(current) > 1) && !show_unhandled_signals)
230 		return;
231 	if (!unhandled_signal(current, signr))
232 		return;
233 	if (!printk_ratelimit())
234 		return;
235 	printk(KERN_ALERT "User process fault: interruption code %04x ilc:%d ",
236 	       regs->int_code & 0xffff, regs->int_code >> 17);
237 	print_vma_addr(KERN_CONT "in ", regs->psw.addr);
238 	printk(KERN_CONT "\n");
239 	if (is_mm_fault)
240 		dump_fault_info(regs);
241 	show_regs(regs);
242 }
243 
244 /*
245  * Send SIGSEGV to task.  This is an external routine
246  * to keep the stack usage of do_page_fault small.
247  */
248 static noinline void do_sigsegv(struct pt_regs *regs, int si_code)
249 {
250 	struct siginfo si;
251 
252 	report_user_fault(regs, SIGSEGV, 1);
253 	si.si_signo = SIGSEGV;
254 	si.si_errno = 0;
255 	si.si_code = si_code;
256 	si.si_addr = (void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK);
257 	force_sig_info(SIGSEGV, &si, current);
258 }
259 
260 static noinline void do_no_context(struct pt_regs *regs)
261 {
262 	const struct exception_table_entry *fixup;
263 
264 	/* Are we prepared to handle this kernel fault?  */
265 	fixup = search_exception_tables(regs->psw.addr);
266 	if (fixup) {
267 		regs->psw.addr = extable_fixup(fixup);
268 		return;
269 	}
270 
271 	/*
272 	 * Oops. The kernel tried to access some bad page. We'll have to
273 	 * terminate things with extreme prejudice.
274 	 */
275 	if (!user_space_fault(regs))
276 		printk(KERN_ALERT "Unable to handle kernel pointer dereference"
277 		       " in virtual kernel address space\n");
278 	else
279 		printk(KERN_ALERT "Unable to handle kernel paging request"
280 		       " in virtual user address space\n");
281 	dump_fault_info(regs);
282 	die(regs, "Oops");
283 	do_exit(SIGKILL);
284 }
285 
286 static noinline void do_low_address(struct pt_regs *regs)
287 {
288 	/* Low-address protection hit in kernel mode means
289 	   NULL pointer write access in kernel mode.  */
290 	if (regs->psw.mask & PSW_MASK_PSTATE) {
291 		/* Low-address protection hit in user mode 'cannot happen'. */
292 		die (regs, "Low-address protection");
293 		do_exit(SIGKILL);
294 	}
295 
296 	do_no_context(regs);
297 }
298 
299 static noinline void do_sigbus(struct pt_regs *regs)
300 {
301 	struct task_struct *tsk = current;
302 	struct siginfo si;
303 
304 	/*
305 	 * Send a sigbus, regardless of whether we were in kernel
306 	 * or user mode.
307 	 */
308 	si.si_signo = SIGBUS;
309 	si.si_errno = 0;
310 	si.si_code = BUS_ADRERR;
311 	si.si_addr = (void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK);
312 	force_sig_info(SIGBUS, &si, tsk);
313 }
314 
315 static noinline int signal_return(struct pt_regs *regs)
316 {
317 	u16 instruction;
318 	int rc;
319 
320 	rc = __get_user(instruction, (u16 __user *) regs->psw.addr);
321 	if (rc)
322 		return rc;
323 	if (instruction == 0x0a77) {
324 		set_pt_regs_flag(regs, PIF_SYSCALL);
325 		regs->int_code = 0x00040077;
326 		return 0;
327 	} else if (instruction == 0x0aad) {
328 		set_pt_regs_flag(regs, PIF_SYSCALL);
329 		regs->int_code = 0x000400ad;
330 		return 0;
331 	}
332 	return -EACCES;
333 }
334 
335 static noinline void do_fault_error(struct pt_regs *regs, int access, int fault)
336 {
337 	int si_code;
338 
339 	switch (fault) {
340 	case VM_FAULT_BADACCESS:
341 		if (access == VM_EXEC && signal_return(regs) == 0)
342 			break;
343 	case VM_FAULT_BADMAP:
344 		/* Bad memory access. Check if it is kernel or user space. */
345 		if (user_mode(regs)) {
346 			/* User mode accesses just cause a SIGSEGV */
347 			si_code = (fault == VM_FAULT_BADMAP) ?
348 				SEGV_MAPERR : SEGV_ACCERR;
349 			do_sigsegv(regs, si_code);
350 			break;
351 		}
352 	case VM_FAULT_BADCONTEXT:
353 	case VM_FAULT_PFAULT:
354 		do_no_context(regs);
355 		break;
356 	case VM_FAULT_SIGNAL:
357 		if (!user_mode(regs))
358 			do_no_context(regs);
359 		break;
360 	default: /* fault & VM_FAULT_ERROR */
361 		if (fault & VM_FAULT_OOM) {
362 			if (!user_mode(regs))
363 				do_no_context(regs);
364 			else
365 				pagefault_out_of_memory();
366 		} else if (fault & VM_FAULT_SIGSEGV) {
367 			/* Kernel mode? Handle exceptions or die */
368 			if (!user_mode(regs))
369 				do_no_context(regs);
370 			else
371 				do_sigsegv(regs, SEGV_MAPERR);
372 		} else if (fault & VM_FAULT_SIGBUS) {
373 			/* Kernel mode? Handle exceptions or die */
374 			if (!user_mode(regs))
375 				do_no_context(regs);
376 			else
377 				do_sigbus(regs);
378 		} else
379 			BUG();
380 		break;
381 	}
382 }
383 
384 /*
385  * This routine handles page faults.  It determines the address,
386  * and the problem, and then passes it off to one of the appropriate
387  * routines.
388  *
389  * interruption code (int_code):
390  *   04       Protection           ->  Write-Protection  (suprression)
391  *   10       Segment translation  ->  Not present       (nullification)
392  *   11       Page translation     ->  Not present       (nullification)
393  *   3b       Region third trans.  ->  Not present       (nullification)
394  */
395 static inline int do_exception(struct pt_regs *regs, int access)
396 {
397 #ifdef CONFIG_PGSTE
398 	struct gmap *gmap;
399 #endif
400 	struct task_struct *tsk;
401 	struct mm_struct *mm;
402 	struct vm_area_struct *vma;
403 	unsigned long trans_exc_code;
404 	unsigned long address;
405 	unsigned int flags;
406 	int fault;
407 
408 	tsk = current;
409 	/*
410 	 * The instruction that caused the program check has
411 	 * been nullified. Don't signal single step via SIGTRAP.
412 	 */
413 	clear_pt_regs_flag(regs, PIF_PER_TRAP);
414 
415 	if (notify_page_fault(regs))
416 		return 0;
417 
418 	mm = tsk->mm;
419 	trans_exc_code = regs->int_parm_long;
420 
421 	/*
422 	 * Verify that the fault happened in user space, that
423 	 * we are not in an interrupt and that there is a
424 	 * user context.
425 	 */
426 	fault = VM_FAULT_BADCONTEXT;
427 	if (unlikely(!user_space_fault(regs) || faulthandler_disabled() || !mm))
428 		goto out;
429 
430 	address = trans_exc_code & __FAIL_ADDR_MASK;
431 	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
432 	flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
433 	if (user_mode(regs))
434 		flags |= FAULT_FLAG_USER;
435 	if (access == VM_WRITE || (trans_exc_code & store_indication) == 0x400)
436 		flags |= FAULT_FLAG_WRITE;
437 	down_read(&mm->mmap_sem);
438 
439 #ifdef CONFIG_PGSTE
440 	gmap = (current->flags & PF_VCPU) ?
441 		(struct gmap *) S390_lowcore.gmap : NULL;
442 	if (gmap) {
443 		current->thread.gmap_addr = address;
444 		current->thread.gmap_write_flag = !!(flags & FAULT_FLAG_WRITE);
445 		current->thread.gmap_int_code = regs->int_code & 0xffff;
446 		address = __gmap_translate(gmap, address);
447 		if (address == -EFAULT) {
448 			fault = VM_FAULT_BADMAP;
449 			goto out_up;
450 		}
451 		if (gmap->pfault_enabled)
452 			flags |= FAULT_FLAG_RETRY_NOWAIT;
453 	}
454 #endif
455 
456 retry:
457 	fault = VM_FAULT_BADMAP;
458 	vma = find_vma(mm, address);
459 	if (!vma)
460 		goto out_up;
461 
462 	if (unlikely(vma->vm_start > address)) {
463 		if (!(vma->vm_flags & VM_GROWSDOWN))
464 			goto out_up;
465 		if (expand_stack(vma, address))
466 			goto out_up;
467 	}
468 
469 	/*
470 	 * Ok, we have a good vm_area for this memory access, so
471 	 * we can handle it..
472 	 */
473 	fault = VM_FAULT_BADACCESS;
474 	if (unlikely(!(vma->vm_flags & access)))
475 		goto out_up;
476 
477 	if (is_vm_hugetlb_page(vma))
478 		address &= HPAGE_MASK;
479 	/*
480 	 * If for any reason at all we couldn't handle the fault,
481 	 * make sure we exit gracefully rather than endlessly redo
482 	 * the fault.
483 	 */
484 	fault = handle_mm_fault(vma, address, flags);
485 	/* No reason to continue if interrupted by SIGKILL. */
486 	if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) {
487 		fault = VM_FAULT_SIGNAL;
488 		goto out;
489 	}
490 	if (unlikely(fault & VM_FAULT_ERROR))
491 		goto out_up;
492 
493 	/*
494 	 * Major/minor page fault accounting is only done on the
495 	 * initial attempt. If we go through a retry, it is extremely
496 	 * likely that the page will be found in page cache at that point.
497 	 */
498 	if (flags & FAULT_FLAG_ALLOW_RETRY) {
499 		if (fault & VM_FAULT_MAJOR) {
500 			tsk->maj_flt++;
501 			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1,
502 				      regs, address);
503 		} else {
504 			tsk->min_flt++;
505 			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1,
506 				      regs, address);
507 		}
508 		if (fault & VM_FAULT_RETRY) {
509 #ifdef CONFIG_PGSTE
510 			if (gmap && (flags & FAULT_FLAG_RETRY_NOWAIT)) {
511 				/* FAULT_FLAG_RETRY_NOWAIT has been set,
512 				 * mmap_sem has not been released */
513 				current->thread.gmap_pfault = 1;
514 				fault = VM_FAULT_PFAULT;
515 				goto out_up;
516 			}
517 #endif
518 			/* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
519 			 * of starvation. */
520 			flags &= ~(FAULT_FLAG_ALLOW_RETRY |
521 				   FAULT_FLAG_RETRY_NOWAIT);
522 			flags |= FAULT_FLAG_TRIED;
523 			down_read(&mm->mmap_sem);
524 			goto retry;
525 		}
526 	}
527 #ifdef CONFIG_PGSTE
528 	if (gmap) {
529 		address =  __gmap_link(gmap, current->thread.gmap_addr,
530 				       address);
531 		if (address == -EFAULT) {
532 			fault = VM_FAULT_BADMAP;
533 			goto out_up;
534 		}
535 		if (address == -ENOMEM) {
536 			fault = VM_FAULT_OOM;
537 			goto out_up;
538 		}
539 	}
540 #endif
541 	fault = 0;
542 out_up:
543 	up_read(&mm->mmap_sem);
544 out:
545 	return fault;
546 }
547 
548 void do_protection_exception(struct pt_regs *regs)
549 {
550 	unsigned long trans_exc_code;
551 	int access, fault;
552 
553 	trans_exc_code = regs->int_parm_long;
554 	/*
555 	 * Protection exceptions are suppressing, decrement psw address.
556 	 * The exception to this rule are aborted transactions, for these
557 	 * the PSW already points to the correct location.
558 	 */
559 	if (!(regs->int_code & 0x200))
560 		regs->psw.addr = __rewind_psw(regs->psw, regs->int_code >> 16);
561 	/*
562 	 * Check for low-address protection.  This needs to be treated
563 	 * as a special case because the translation exception code
564 	 * field is not guaranteed to contain valid data in this case.
565 	 */
566 	if (unlikely(!(trans_exc_code & 4))) {
567 		do_low_address(regs);
568 		return;
569 	}
570 	if (unlikely(MACHINE_HAS_NX && (trans_exc_code & 0x80))) {
571 		regs->int_parm_long = (trans_exc_code & ~PAGE_MASK) |
572 					(regs->psw.addr & PAGE_MASK);
573 		access = VM_EXEC;
574 		fault = VM_FAULT_BADACCESS;
575 	} else {
576 		access = VM_WRITE;
577 		fault = do_exception(regs, access);
578 	}
579 	if (unlikely(fault))
580 		do_fault_error(regs, access, fault);
581 }
582 NOKPROBE_SYMBOL(do_protection_exception);
583 
584 void do_dat_exception(struct pt_regs *regs)
585 {
586 	int access, fault;
587 
588 	access = VM_READ | VM_EXEC | VM_WRITE;
589 	fault = do_exception(regs, access);
590 	if (unlikely(fault))
591 		do_fault_error(regs, access, fault);
592 }
593 NOKPROBE_SYMBOL(do_dat_exception);
594 
595 #ifdef CONFIG_PFAULT
596 /*
597  * 'pfault' pseudo page faults routines.
598  */
599 static int pfault_disable;
600 
601 static int __init nopfault(char *str)
602 {
603 	pfault_disable = 1;
604 	return 1;
605 }
606 
607 __setup("nopfault", nopfault);
608 
/*
 * Parameter block handed by address to the "diag ...,0x258" instruction
 * in pfault_init() and pfault_fini().  Packed and 8-byte aligned.
 * NOTE(review): the field descriptions below only record the values this
 * file stores; the architectural meaning should be confirmed against the
 * DIAGNOSE X'258' documentation.
 */
609 struct pfault_refbk {
610 	u16 refdiagc;	/* 0x258 in both callers */
611 	u16 reffcode;	/* 0 in pfault_init(), 1 in pfault_fini() */
612 	u16 refdwlen;	/* 5 in both callers */
613 	u16 refversn;	/* 2 in both callers */
614 	u64 refgaddr;	/* __LC_LPP in pfault_init() */
615 	u64 refselmk;	/* 1ULL << 48 in pfault_init() */
616 	u64 refcmpmk;	/* 1ULL << 48 in pfault_init() */
617 	u64 reserved;	/* __PF_RES_FIELD in pfault_init() */
618 } __attribute__ ((packed, aligned(8)));
619 
620 int pfault_init(void)
621 {
622 	struct pfault_refbk refbk = {
623 		.refdiagc = 0x258,
624 		.reffcode = 0,
625 		.refdwlen = 5,
626 		.refversn = 2,
627 		.refgaddr = __LC_LPP,
628 		.refselmk = 1ULL << 48,
629 		.refcmpmk = 1ULL << 48,
630 		.reserved = __PF_RES_FIELD };
631         int rc;
632 
633 	if (pfault_disable)
634 		return -1;
635 	diag_stat_inc(DIAG_STAT_X258);
636 	asm volatile(
637 		"	diag	%1,%0,0x258\n"
638 		"0:	j	2f\n"
639 		"1:	la	%0,8\n"
640 		"2:\n"
641 		EX_TABLE(0b,1b)
642 		: "=d" (rc) : "a" (&refbk), "m" (refbk) : "cc");
643         return rc;
644 }
645 
646 void pfault_fini(void)
647 {
648 	struct pfault_refbk refbk = {
649 		.refdiagc = 0x258,
650 		.reffcode = 1,
651 		.refdwlen = 5,
652 		.refversn = 2,
653 	};
654 
655 	if (pfault_disable)
656 		return;
657 	diag_stat_inc(DIAG_STAT_X258);
658 	asm volatile(
659 		"	diag	%0,0,0x258\n"
660 		"0:	nopr	%%r7\n"
661 		EX_TABLE(0b,0b)
662 		: : "a" (&refbk), "m" (refbk) : "cc");
663 }
664 
665 static DEFINE_SPINLOCK(pfault_lock);
666 static LIST_HEAD(pfault_list);
667 
668 #define PF_COMPLETE	0x0080
669 
670 /*
671  * The mechanism of our pfault code: if Linux is running as guest, runs a user
672  * space process and the user space process accesses a page that the host has
673  * paged out we get a pfault interrupt.
674  *
675  * This allows us, within the guest, to schedule a different process. Without
676  * this mechanism the host would have to suspend the whole virtual cpu until
677  * the page has been paged in.
678  *
679  * So when we get such an interrupt then we set the state of the current task
680  * to uninterruptible and also set the need_resched flag. Both happens within
681  * interrupt context(!). If we later on want to return to user space we
682  * recognize the need_resched flag and then call schedule().  It's not very
683  * obvious how this works...
684  *
685  * Of course we have a lot of additional fun with the completion interrupt (->
686  * host signals that a page of a process has been paged in and the process can
687  * continue to run). This interrupt can arrive on any cpu and, since we have
688  * virtual cpus, actually appear before the interrupt that signals that a page
689  * is missing.
690  */
691 static void pfault_interrupt(struct ext_code ext_code,
692 			     unsigned int param32, unsigned long param64)
693 {
694 	struct task_struct *tsk;
695 	__u16 subcode;
696 	pid_t pid;
697 
698 	/*
699 	 * Get the external interruption subcode & pfault initial/completion
700 	 * signal bit. VM stores this in the 'cpu address' field associated
701 	 * with the external interrupt.
702 	 */
703 	subcode = ext_code.subcode;
704 	if ((subcode & 0xff00) != __SUBCODE_MASK)
705 		return;
706 	inc_irq_stat(IRQEXT_PFL);
707 	/* Get the token (= pid of the affected task). */
708 	pid = param64 & LPP_PFAULT_PID_MASK;
709 	rcu_read_lock();
710 	tsk = find_task_by_pid_ns(pid, &init_pid_ns);
711 	if (tsk)
712 		get_task_struct(tsk);
713 	rcu_read_unlock();
714 	if (!tsk)
715 		return;
716 	spin_lock(&pfault_lock);
717 	if (subcode & PF_COMPLETE) {
718 		/* signal bit is set -> a page has been swapped in by VM */
719 		if (tsk->thread.pfault_wait == 1) {
720 			/* Initial interrupt was faster than the completion
721 			 * interrupt. pfault_wait is valid. Set pfault_wait
722 			 * back to zero and wake up the process. This can
723 			 * safely be done because the task is still sleeping
724 			 * and can't produce new pfaults. */
725 			tsk->thread.pfault_wait = 0;
726 			list_del(&tsk->thread.list);
727 			wake_up_process(tsk);
728 			put_task_struct(tsk);
729 		} else {
730 			/* Completion interrupt was faster than initial
731 			 * interrupt. Set pfault_wait to -1 so the initial
732 			 * interrupt doesn't put the task to sleep.
733 			 * If the task is not running, ignore the completion
734 			 * interrupt since it must be a leftover of a PFAULT
735 			 * CANCEL operation which didn't remove all pending
736 			 * completion interrupts. */
737 			if (tsk->state == TASK_RUNNING)
738 				tsk->thread.pfault_wait = -1;
739 		}
740 	} else {
741 		/* signal bit not set -> a real page is missing. */
742 		if (WARN_ON_ONCE(tsk != current))
743 			goto out;
744 		if (tsk->thread.pfault_wait == 1) {
745 			/* Already on the list with a reference: put to sleep */
746 			goto block;
747 		} else if (tsk->thread.pfault_wait == -1) {
748 			/* Completion interrupt was faster than the initial
749 			 * interrupt (pfault_wait == -1). Set pfault_wait
750 			 * back to zero and exit. */
751 			tsk->thread.pfault_wait = 0;
752 		} else {
753 			/* Initial interrupt arrived before completion
754 			 * interrupt. Let the task sleep.
755 			 * An extra task reference is needed since a different
756 			 * cpu may set the task state to TASK_RUNNING again
757 			 * before the scheduler is reached. */
758 			get_task_struct(tsk);
759 			tsk->thread.pfault_wait = 1;
760 			list_add(&tsk->thread.list, &pfault_list);
761 block:
762 			/* Since this must be a userspace fault, there
763 			 * is no kernel task state to trample. Rely on the
764 			 * return to userspace schedule() to block. */
765 			__set_current_state(TASK_UNINTERRUPTIBLE);
766 			set_tsk_need_resched(tsk);
767 			set_preempt_need_resched();
768 		}
769 	}
770 out:
771 	spin_unlock(&pfault_lock);
772 	put_task_struct(tsk);
773 }
774 
775 static int pfault_cpu_dead(unsigned int cpu)
776 {
777 	struct thread_struct *thread, *next;
778 	struct task_struct *tsk;
779 
780 	spin_lock_irq(&pfault_lock);
781 	list_for_each_entry_safe(thread, next, &pfault_list, list) {
782 		thread->pfault_wait = 0;
783 		list_del(&thread->list);
784 		tsk = container_of(thread, struct task_struct, thread);
785 		wake_up_process(tsk);
786 		put_task_struct(tsk);
787 	}
788 	spin_unlock_irq(&pfault_lock);
789 	return 0;
790 }
791 
792 static int __init pfault_irq_init(void)
793 {
794 	int rc;
795 
796 	rc = register_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt);
797 	if (rc)
798 		goto out_extint;
799 	rc = pfault_init() == 0 ? 0 : -EOPNOTSUPP;
800 	if (rc)
801 		goto out_pfault;
802 	irq_subclass_register(IRQ_SUBCLASS_SERVICE_SIGNAL);
803 	cpuhp_setup_state_nocalls(CPUHP_S390_PFAULT_DEAD, "s390/pfault:dead",
804 				  NULL, pfault_cpu_dead);
805 	return 0;
806 
807 out_pfault:
808 	unregister_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt);
809 out_extint:
810 	pfault_disable = 1;
811 	return rc;
812 }
813 early_initcall(pfault_irq_init);
814 
815 #endif /* CONFIG_PFAULT */
816