// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
 */
51d3468a6SJeff Dike
64c9e1385SJeff Dike #include <linux/mm.h>
73f07c014SIngo Molnar #include <linux/sched/signal.h>
84c9e1385SJeff Dike #include <linux/hardirq.h>
973395a00SAl Viro #include <linux/module.h>
10fbc9f16aSPeter Zijlstra #include <linux/uaccess.h>
11b17b0153SIngo Molnar #include <linux/sched/debug.h>
124c9e1385SJeff Dike #include <asm/current.h>
134c9e1385SJeff Dike #include <asm/tlbflush.h>
1437185b33SAl Viro #include <arch.h>
1537185b33SAl Viro #include <as-layout.h>
1637185b33SAl Viro #include <kern_util.h>
1737185b33SAl Viro #include <os.h>
1837185b33SAl Viro #include <skas.h>
191d3468a6SJeff Dike
204c9e1385SJeff Dike /*
2159fdf91dSColin Ian King * Note this is constrained to return 0, -EFAULT, -EACCES, -ENOMEM by
224c9e1385SJeff Dike * segv().
234c9e1385SJeff Dike */
handle_page_fault(unsigned long address,unsigned long ip,int is_write,int is_user,int * code_out)241d3468a6SJeff Dike int handle_page_fault(unsigned long address, unsigned long ip,
251d3468a6SJeff Dike int is_write, int is_user, int *code_out)
261d3468a6SJeff Dike {
271d3468a6SJeff Dike struct mm_struct *mm = current->mm;
281d3468a6SJeff Dike struct vm_area_struct *vma;
291d3468a6SJeff Dike pmd_t *pmd;
301d3468a6SJeff Dike pte_t *pte;
311d3468a6SJeff Dike int err = -EFAULT;
32dde16072SPeter Xu unsigned int flags = FAULT_FLAG_DEFAULT;
331d3468a6SJeff Dike
341d3468a6SJeff Dike *code_out = SEGV_MAPERR;
351d3468a6SJeff Dike
364c9e1385SJeff Dike /*
3770ffdb93SDavid Hildenbrand * If the fault was with pagefaults disabled, don't take the fault, just
384c9e1385SJeff Dike * fail.
394c9e1385SJeff Dike */
4070ffdb93SDavid Hildenbrand if (faulthandler_disabled())
411d3468a6SJeff Dike goto out_nosemaphore;
421d3468a6SJeff Dike
43759496baSJohannes Weiner if (is_user)
44759496baSJohannes Weiner flags |= FAULT_FLAG_USER;
451cefe28fSKautuk Consul retry:
46d8ed45c5SMichel Lespinasse mmap_read_lock(mm);
471d3468a6SJeff Dike vma = find_vma(mm, address);
481d3468a6SJeff Dike if (!vma)
491d3468a6SJeff Dike goto out;
50*8d7071afSLinus Torvalds if (vma->vm_start <= address)
511d3468a6SJeff Dike goto good_area;
52*8d7071afSLinus Torvalds if (!(vma->vm_flags & VM_GROWSDOWN))
531d3468a6SJeff Dike goto out;
54*8d7071afSLinus Torvalds if (is_user && !ARCH_IS_STACKGROW(address))
551d3468a6SJeff Dike goto out;
56*8d7071afSLinus Torvalds vma = expand_stack(mm, address);
57*8d7071afSLinus Torvalds if (!vma)
58*8d7071afSLinus Torvalds goto out_nosemaphore;
591d3468a6SJeff Dike
601d3468a6SJeff Dike good_area:
611d3468a6SJeff Dike *code_out = SEGV_ACCERR;
62759496baSJohannes Weiner if (is_write) {
63759496baSJohannes Weiner if (!(vma->vm_flags & VM_WRITE))
641d3468a6SJeff Dike goto out;
65759496baSJohannes Weiner flags |= FAULT_FLAG_WRITE;
66759496baSJohannes Weiner } else {
671d3468a6SJeff Dike /* Don't require VM_READ|VM_EXEC for write faults! */
68759496baSJohannes Weiner if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
691d3468a6SJeff Dike goto out;
70759496baSJohannes Weiner }
711d3468a6SJeff Dike
721d3468a6SJeff Dike do {
7350a7ca3cSSouptick Joarder vm_fault_t fault;
741c0fe6e3SNick Piggin
75bce617edSPeter Xu fault = handle_mm_fault(vma, address, flags, NULL);
761cefe28fSKautuk Consul
771cefe28fSKautuk Consul if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))
781cefe28fSKautuk Consul goto out_nosemaphore;
791cefe28fSKautuk Consul
80d9272525SPeter Xu /* The fault is fully completed (including releasing mmap lock) */
81d9272525SPeter Xu if (fault & VM_FAULT_COMPLETED)
82d9272525SPeter Xu return 0;
83d9272525SPeter Xu
8483c54070SNick Piggin if (unlikely(fault & VM_FAULT_ERROR)) {
8583c54070SNick Piggin if (fault & VM_FAULT_OOM) {
861d3468a6SJeff Dike goto out_of_memory;
8733692f27SLinus Torvalds } else if (fault & VM_FAULT_SIGSEGV) {
8833692f27SLinus Torvalds goto out;
8983c54070SNick Piggin } else if (fault & VM_FAULT_SIGBUS) {
9083c54070SNick Piggin err = -EACCES;
9183c54070SNick Piggin goto out;
9283c54070SNick Piggin }
931d3468a6SJeff Dike BUG();
941d3468a6SJeff Dike }
951cefe28fSKautuk Consul if (fault & VM_FAULT_RETRY) {
9645cac65bSShaohua Li flags |= FAULT_FLAG_TRIED;
971cefe28fSKautuk Consul
981cefe28fSKautuk Consul goto retry;
991cefe28fSKautuk Consul }
10083c54070SNick Piggin
101e05c7b1fSMike Rapoport pmd = pmd_off(mm, address);
1021d3468a6SJeff Dike pte = pte_offset_kernel(pmd, address);
1031d3468a6SJeff Dike } while (!pte_present(*pte));
1041d3468a6SJeff Dike err = 0;
1054c9e1385SJeff Dike /*
1064c9e1385SJeff Dike * The below warning was added in place of
1071d3468a6SJeff Dike * pte_mkyoung(); if (is_write) pte_mkdirty();
1081d3468a6SJeff Dike * If it's triggered, we'd see normally a hang here (a clean pte is
1091d3468a6SJeff Dike * marked read-only to emulate the dirty bit).
1101d3468a6SJeff Dike * However, the generic code can mark a PTE writable but clean on a
1111d3468a6SJeff Dike * concurrent read fault, triggering this harmlessly. So comment it out.
1121d3468a6SJeff Dike */
1131d3468a6SJeff Dike #if 0
1141d3468a6SJeff Dike WARN_ON(!pte_young(*pte) || (is_write && !pte_dirty(*pte)));
1151d3468a6SJeff Dike #endif
1161d3468a6SJeff Dike flush_tlb_page(vma, address);
1171d3468a6SJeff Dike out:
118d8ed45c5SMichel Lespinasse mmap_read_unlock(mm);
1191d3468a6SJeff Dike out_nosemaphore:
1204c9e1385SJeff Dike return err;
1211d3468a6SJeff Dike
1221d3468a6SJeff Dike out_of_memory:
1231c0fe6e3SNick Piggin /*
1241c0fe6e3SNick Piggin * We ran out of memory, call the OOM killer, and return the userspace
1251c0fe6e3SNick Piggin * (which will retry the fault, or kill us if we got oom-killed).
1261c0fe6e3SNick Piggin */
127d8ed45c5SMichel Lespinasse mmap_read_unlock(mm);
12887134102SJohannes Weiner if (!is_user)
12987134102SJohannes Weiner goto out_nosemaphore;
1301c0fe6e3SNick Piggin pagefault_out_of_memory();
1311c0fe6e3SNick Piggin return 0;
1321d3468a6SJeff Dike }
1331d3468a6SJeff Dike
show_segv_info(struct uml_pt_regs * regs)1343ef6130aSRichard Weinberger static void show_segv_info(struct uml_pt_regs *regs)
1353ef6130aSRichard Weinberger {
1363ef6130aSRichard Weinberger struct task_struct *tsk = current;
1373ef6130aSRichard Weinberger struct faultinfo *fi = UPT_FAULTINFO(regs);
1383ef6130aSRichard Weinberger
1393ef6130aSRichard Weinberger if (!unhandled_signal(tsk, SIGSEGV))
1403ef6130aSRichard Weinberger return;
1413ef6130aSRichard Weinberger
1423ef6130aSRichard Weinberger if (!printk_ratelimit())
1433ef6130aSRichard Weinberger return;
1443ef6130aSRichard Weinberger
14510a7e9d8SKees Cook printk("%s%s[%d]: segfault at %lx ip %px sp %px error %x",
1463ef6130aSRichard Weinberger task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
1473ef6130aSRichard Weinberger tsk->comm, task_pid_nr(tsk), FAULT_ADDRESS(*fi),
1483ef6130aSRichard Weinberger (void *)UPT_IP(regs), (void *)UPT_SP(regs),
1493ef6130aSRichard Weinberger fi->error_code);
1503ef6130aSRichard Weinberger
1513ef6130aSRichard Weinberger print_vma_addr(KERN_CONT " in ", UPT_IP(regs));
1523ef6130aSRichard Weinberger printk(KERN_CONT "\n");
1533ef6130aSRichard Weinberger }
1543ef6130aSRichard Weinberger
bad_segv(struct faultinfo fi,unsigned long ip)15527aa6ef3SJeff Dike static void bad_segv(struct faultinfo fi, unsigned long ip)
15627aa6ef3SJeff Dike {
15727aa6ef3SJeff Dike current->thread.arch.faultinfo = fi;
1582e1661d2SEric W. Biederman force_sig_fault(SIGSEGV, SEGV_ACCERR, (void __user *) FAULT_ADDRESS(fi));
15927aa6ef3SJeff Dike }
16027aa6ef3SJeff Dike
fatal_sigsegv(void)1613e6f2ac4SJeff Dike void fatal_sigsegv(void)
1623e6f2ac4SJeff Dike {
163e21294a7SEric W. Biederman force_fatal_sig(SIGSEGV);
164ccaee5f8SIngo Molnar do_signal(¤t->thread.regs);
1653e6f2ac4SJeff Dike /*
1663e6f2ac4SJeff Dike * This is to tell gcc that we're not returning - do_signal
1673e6f2ac4SJeff Dike * can, in general, return, but in this case, it's not, since
1683e6f2ac4SJeff Dike * we just got a fatal SIGSEGV queued.
1693e6f2ac4SJeff Dike */
1703e6f2ac4SJeff Dike os_dump_core();
1713e6f2ac4SJeff Dike }
1723e6f2ac4SJeff Dike
17388af2338SThomas Meyer /**
17488af2338SThomas Meyer * segv_handler() - the SIGSEGV handler
17588af2338SThomas Meyer * @sig: the signal number
17688af2338SThomas Meyer * @unused_si: the signal info struct; unused in this handler
17788af2338SThomas Meyer * @regs: the ptrace register information
17888af2338SThomas Meyer *
17988af2338SThomas Meyer * The handler first extracts the faultinfo from the UML ptrace regs struct.
18088af2338SThomas Meyer * If the userfault did not happen in an UML userspace process, bad_segv is called.
18188af2338SThomas Meyer * Otherwise the signal did happen in a cloned userspace process, handle it.
18288af2338SThomas Meyer */
segv_handler(int sig,struct siginfo * unused_si,struct uml_pt_regs * regs)183d3c1cfcdSMartin Pärtel void segv_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs)
1841d3468a6SJeff Dike {
1851d3468a6SJeff Dike struct faultinfo * fi = UPT_FAULTINFO(regs);
1861d3468a6SJeff Dike
1871d3468a6SJeff Dike if (UPT_IS_USER(regs) && !SEGV_IS_FIXABLE(fi)) {
1883ef6130aSRichard Weinberger show_segv_info(regs);
1891d3468a6SJeff Dike bad_segv(*fi, UPT_IP(regs));
1901d3468a6SJeff Dike return;
1911d3468a6SJeff Dike }
1921d3468a6SJeff Dike segv(*fi, UPT_IP(regs), UPT_IS_USER(regs), regs);
1931d3468a6SJeff Dike }
1941d3468a6SJeff Dike
1951d3468a6SJeff Dike /*
1961d3468a6SJeff Dike * We give a *copy* of the faultinfo in the regs to segv.
1971d3468a6SJeff Dike * This must be done, since nesting SEGVs could overwrite
1981d3468a6SJeff Dike * the info in the regs. A pointer to the info then would
1991d3468a6SJeff Dike * give us bad data!
2001d3468a6SJeff Dike */
segv(struct faultinfo fi,unsigned long ip,int is_user,struct uml_pt_regs * regs)2015d86456dSJeff Dike unsigned long segv(struct faultinfo fi, unsigned long ip, int is_user,
20277bf4400SJeff Dike struct uml_pt_regs *regs)
2031d3468a6SJeff Dike {
204fab95c55SJeff Dike jmp_buf *catcher;
205bc08c078SEric W. Biederman int si_code;
2061d3468a6SJeff Dike int err;
2071d3468a6SJeff Dike int is_write = FAULT_WRITE(fi);
2081d3468a6SJeff Dike unsigned long address = FAULT_ADDRESS(fi);
2091d3468a6SJeff Dike
210bb6a1b2eSRichard Weinberger if (!is_user && regs)
211f72c22e4SRichard Weinberger current->thread.segv_regs = container_of(regs, struct pt_regs, regs);
212f72c22e4SRichard Weinberger
2131d3468a6SJeff Dike if (!is_user && (address >= start_vm) && (address < end_vm)) {
2141d3468a6SJeff Dike flush_tlb_kernel_vm();
215f72c22e4SRichard Weinberger goto out;
2161d3468a6SJeff Dike }
217377fad3aSJeff Dike else if (current->mm == NULL) {
218377fad3aSJeff Dike show_regs(container_of(regs, struct pt_regs, regs));
2191d3468a6SJeff Dike panic("Segfault with no mm");
220377fad3aSJeff Dike }
22156b88a3bSRichard Weinberger else if (!is_user && address > PAGE_SIZE && address < TASK_SIZE) {
222d2313084SRichard Weinberger show_regs(container_of(regs, struct pt_regs, regs));
223d2313084SRichard Weinberger panic("Kernel tried to access user memory at addr 0x%lx, ip 0x%lx",
224d2313084SRichard Weinberger address, ip);
225d2313084SRichard Weinberger }
2261d3468a6SJeff Dike
227d0b5e15fSRichard Weinberger if (SEGV_IS_FIXABLE(&fi))
2284c9e1385SJeff Dike err = handle_page_fault(address, ip, is_write, is_user,
229bc08c078SEric W. Biederman &si_code);
2301d3468a6SJeff Dike else {
2311d3468a6SJeff Dike err = -EFAULT;
2324c9e1385SJeff Dike /*
2334c9e1385SJeff Dike * A thread accessed NULL, we get a fault, but CR2 is invalid.
2344c9e1385SJeff Dike * This code is used in __do_copy_from_user() of TT mode.
2354c9e1385SJeff Dike * XXX tt mode is gone, so maybe this isn't needed any more
2364c9e1385SJeff Dike */
2371d3468a6SJeff Dike address = 0;
2381d3468a6SJeff Dike }
2391d3468a6SJeff Dike
2401d3468a6SJeff Dike catcher = current->thread.fault_catcher;
2411d3468a6SJeff Dike if (!err)
242f72c22e4SRichard Weinberger goto out;
2431d3468a6SJeff Dike else if (catcher != NULL) {
2441d3468a6SJeff Dike current->thread.fault_addr = (void *) address;
245fab95c55SJeff Dike UML_LONGJMP(catcher, 1);
2461d3468a6SJeff Dike }
2471d3468a6SJeff Dike else if (current->thread.fault_addr != NULL)
2481d3468a6SJeff Dike panic("fault_addr set but no fault catcher");
2495d86456dSJeff Dike else if (!is_user && arch_fixup(ip, regs))
250f72c22e4SRichard Weinberger goto out;
2511d3468a6SJeff Dike
252377fad3aSJeff Dike if (!is_user) {
253377fad3aSJeff Dike show_regs(container_of(regs, struct pt_regs, regs));
2541d3468a6SJeff Dike panic("Kernel mode fault at addr 0x%lx, ip 0x%lx",
2551d3468a6SJeff Dike address, ip);
256377fad3aSJeff Dike }
2571d3468a6SJeff Dike
2583ef6130aSRichard Weinberger show_segv_info(regs);
2593ef6130aSRichard Weinberger
2601d3468a6SJeff Dike if (err == -EACCES) {
2611d3468a6SJeff Dike current->thread.arch.faultinfo = fi;
2622e1661d2SEric W. Biederman force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address);
2631d3468a6SJeff Dike } else {
2641d3468a6SJeff Dike BUG_ON(err != -EFAULT);
2651d3468a6SJeff Dike current->thread.arch.faultinfo = fi;
2662e1661d2SEric W. Biederman force_sig_fault(SIGSEGV, si_code, (void __user *) address);
2671d3468a6SJeff Dike }
268f72c22e4SRichard Weinberger
269f72c22e4SRichard Weinberger out:
270f72c22e4SRichard Weinberger if (regs)
271f72c22e4SRichard Weinberger current->thread.segv_regs = NULL;
272f72c22e4SRichard Weinberger
2735d86456dSJeff Dike return 0;
2741d3468a6SJeff Dike }
2751d3468a6SJeff Dike
relay_signal(int sig,struct siginfo * si,struct uml_pt_regs * regs)276d3c1cfcdSMartin Pärtel void relay_signal(int sig, struct siginfo *si, struct uml_pt_regs *regs)
2771d3468a6SJeff Dike {
278530621b7SEric W. Biederman int code, err;
2796edf428eSJeff Dike if (!UPT_IS_USER(regs)) {
2806edf428eSJeff Dike if (sig == SIGBUS)
2814c9e1385SJeff Dike printk(KERN_ERR "Bus error - the host /dev/shm or /tmp "
2824c9e1385SJeff Dike "mount likely just ran out of space\n");
2831d3468a6SJeff Dike panic("Kernel mode signal %d", sig);
2846edf428eSJeff Dike }
2856edf428eSJeff Dike
2869226b838SJeff Dike arch_examine_signal(sig, regs);
2879226b838SJeff Dike
288530621b7SEric W. Biederman /* Is the signal layout for the signal known?
289530621b7SEric W. Biederman * Signal data must be scrubbed to prevent information leaks.
290530621b7SEric W. Biederman */
291530621b7SEric W. Biederman code = si->si_code;
292530621b7SEric W. Biederman err = si->si_errno;
293530621b7SEric W. Biederman if ((err == 0) && (siginfo_layout(sig, code) == SIL_FAULT)) {
294530621b7SEric W. Biederman struct faultinfo *fi = UPT_FAULTINFO(regs);
295d3c1cfcdSMartin Pärtel current->thread.arch.faultinfo = *fi;
2962e1661d2SEric W. Biederman force_sig_fault(sig, code, (void __user *)FAULT_ADDRESS(*fi));
297530621b7SEric W. Biederman } else {
298530621b7SEric W. Biederman printk(KERN_ERR "Attempted to relay unknown signal %d (si_code = %d) with errno %d\n",
299530621b7SEric W. Biederman sig, code, err);
3003cf5d076SEric W. Biederman force_sig(sig);
3011d3468a6SJeff Dike }
302d3c1cfcdSMartin Pärtel }
303d3c1cfcdSMartin Pärtel
bus_handler(int sig,struct siginfo * si,struct uml_pt_regs * regs)304d3c1cfcdSMartin Pärtel void bus_handler(int sig, struct siginfo *si, struct uml_pt_regs *regs)
3051d3468a6SJeff Dike {
3061d3468a6SJeff Dike if (current->thread.fault_catcher != NULL)
307fab95c55SJeff Dike UML_LONGJMP(current->thread.fault_catcher, 1);
308d3c1cfcdSMartin Pärtel else
309d3c1cfcdSMartin Pärtel relay_signal(sig, si, regs);
3101d3468a6SJeff Dike }
3111d3468a6SJeff Dike
winch(int sig,struct siginfo * unused_si,struct uml_pt_regs * regs)312d3c1cfcdSMartin Pärtel void winch(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs)
3131d3468a6SJeff Dike {
3141d3468a6SJeff Dike do_IRQ(WINCH_IRQ, regs);
3151d3468a6SJeff Dike }
316