140b0b3f8SThomas Gleixner // SPDX-License-Identifier: GPL-2.0-only
2dc009d92SEric W. Biederman /*
32965faa5SDave Young * kexec.c - kexec_load system call
4dc009d92SEric W. Biederman * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
5dc009d92SEric W. Biederman */
6dc009d92SEric W. Biederman
7de90a6bcSMinfei Huang #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
8de90a6bcSMinfei Huang
9c59ede7bSRandy.Dunlap #include <linux/capability.h>
10dc009d92SEric W. Biederman #include <linux/mm.h>
11dc009d92SEric W. Biederman #include <linux/file.h>
12a210fd32SMimi Zohar #include <linux/security.h>
13dc009d92SEric W. Biederman #include <linux/kexec.h>
148c5a1cf0SAndrew Morton #include <linux/mutex.h>
15dc009d92SEric W. Biederman #include <linux/list.h>
16dc009d92SEric W. Biederman #include <linux/syscalls.h>
17a43cac0dSDave Young #include <linux/vmalloc.h>
182965faa5SDave Young #include <linux/slab.h>
196e274d14SAlexander Nyberg
20a43cac0dSDave Young #include "kexec_internal.h"
21a43cac0dSDave Young
kimage_alloc_init(struct kimage ** rimage,unsigned long entry,unsigned long nr_segments,struct kexec_segment * segments,unsigned long flags)22255aedd9SVivek Goyal static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
2372414d3fSManeesh Soni unsigned long nr_segments,
245d700a0fSArnd Bergmann struct kexec_segment *segments,
25255aedd9SVivek Goyal unsigned long flags)
26dc009d92SEric W. Biederman {
27255aedd9SVivek Goyal int ret;
28dc009d92SEric W. Biederman struct kimage *image;
29255aedd9SVivek Goyal bool kexec_on_panic = flags & KEXEC_ON_CRASH;
30255aedd9SVivek Goyal
31255aedd9SVivek Goyal if (kexec_on_panic) {
32255aedd9SVivek Goyal /* Verify we have a valid entry point */
3343546d86SRussell King if ((entry < phys_to_boot_phys(crashk_res.start)) ||
3443546d86SRussell King (entry > phys_to_boot_phys(crashk_res.end)))
35255aedd9SVivek Goyal return -EADDRNOTAVAIL;
36255aedd9SVivek Goyal }
37dc009d92SEric W. Biederman
38dc009d92SEric W. Biederman /* Allocate and initialize a controlling structure */
39dabe7862SVivek Goyal image = do_kimage_alloc_init();
40dabe7862SVivek Goyal if (!image)
41dabe7862SVivek Goyal return -ENOMEM;
42dabe7862SVivek Goyal
43dabe7862SVivek Goyal image->start = entry;
445d700a0fSArnd Bergmann image->nr_segments = nr_segments;
455d700a0fSArnd Bergmann memcpy(image->segment, segments, nr_segments * sizeof(*segments));
46dabe7862SVivek Goyal
47255aedd9SVivek Goyal if (kexec_on_panic) {
48cdf4b3faSXunlei Pang /* Enable special crash kernel control page alloc policy. */
49255aedd9SVivek Goyal image->control_page = crashk_res.start;
50255aedd9SVivek Goyal image->type = KEXEC_TYPE_CRASH;
51255aedd9SVivek Goyal }
52255aedd9SVivek Goyal
53cdf4b3faSXunlei Pang ret = sanity_check_segment_list(image);
54cdf4b3faSXunlei Pang if (ret)
55cdf4b3faSXunlei Pang goto out_free_image;
56cdf4b3faSXunlei Pang
57dc009d92SEric W. Biederman /*
58dc009d92SEric W. Biederman * Find a location for the control code buffer, and add it
59dc009d92SEric W. Biederman * the vector of segments so that it's pages will also be
60dc009d92SEric W. Biederman * counted as destination pages.
61dc009d92SEric W. Biederman */
62255aedd9SVivek Goyal ret = -ENOMEM;
63dc009d92SEric W. Biederman image->control_code_page = kimage_alloc_control_pages(image,
64163f6876SHuang Ying get_order(KEXEC_CONTROL_PAGE_SIZE));
65dc009d92SEric W. Biederman if (!image->control_code_page) {
66e1bebcf4SFabian Frederick pr_err("Could not allocate control_code_buffer\n");
67dabe7862SVivek Goyal goto out_free_image;
68dc009d92SEric W. Biederman }
69dc009d92SEric W. Biederman
70255aedd9SVivek Goyal if (!kexec_on_panic) {
713ab83521SHuang Ying image->swap_page = kimage_alloc_control_pages(image, 0);
723ab83521SHuang Ying if (!image->swap_page) {
73e1bebcf4SFabian Frederick pr_err("Could not allocate swap buffer\n");
74dabe7862SVivek Goyal goto out_free_control_pages;
753ab83521SHuang Ying }
76255aedd9SVivek Goyal }
773ab83521SHuang Ying
78dc009d92SEric W. Biederman *rimage = image;
79b92e7e0dSZhang Yanfei return 0;
80dabe7862SVivek Goyal out_free_control_pages:
81b92e7e0dSZhang Yanfei kimage_free_page_list(&image->control_pages);
82dabe7862SVivek Goyal out_free_image:
83b92e7e0dSZhang Yanfei kfree(image);
84255aedd9SVivek Goyal return ret;
85dc009d92SEric W. Biederman }
86dc009d92SEric W. Biederman
do_kexec_load(unsigned long entry,unsigned long nr_segments,struct kexec_segment * segments,unsigned long flags)870eea0867SMinfei Huang static int do_kexec_load(unsigned long entry, unsigned long nr_segments,
885d700a0fSArnd Bergmann struct kexec_segment *segments, unsigned long flags)
890eea0867SMinfei Huang {
900eea0867SMinfei Huang struct kimage **dest_image, *image;
910eea0867SMinfei Huang unsigned long i;
920eea0867SMinfei Huang int ret;
930eea0867SMinfei Huang
944b692e86SArnd Bergmann /*
954b692e86SArnd Bergmann * Because we write directly to the reserved memory region when loading
9605c62574SValentin Schneider * crash kernels we need a serialization here to prevent multiple crash
9705c62574SValentin Schneider * kernels from attempting to load simultaneously.
984b692e86SArnd Bergmann */
9905c62574SValentin Schneider if (!kexec_trylock())
1004b692e86SArnd Bergmann return -EBUSY;
1014b692e86SArnd Bergmann
1020eea0867SMinfei Huang if (flags & KEXEC_ON_CRASH) {
1030eea0867SMinfei Huang dest_image = &kexec_crash_image;
1040eea0867SMinfei Huang if (kexec_crash_image)
1050eea0867SMinfei Huang arch_kexec_unprotect_crashkres();
1060eea0867SMinfei Huang } else {
1070eea0867SMinfei Huang dest_image = &kexec_image;
1080eea0867SMinfei Huang }
1090eea0867SMinfei Huang
1100eea0867SMinfei Huang if (nr_segments == 0) {
1110eea0867SMinfei Huang /* Uninstall image */
1120eea0867SMinfei Huang kimage_free(xchg(dest_image, NULL));
1134b692e86SArnd Bergmann ret = 0;
1144b692e86SArnd Bergmann goto out_unlock;
1150eea0867SMinfei Huang }
1160eea0867SMinfei Huang if (flags & KEXEC_ON_CRASH) {
1170eea0867SMinfei Huang /*
1180eea0867SMinfei Huang * Loading another kernel to switch to if this one
1190eea0867SMinfei Huang * crashes. Free any current crash dump kernel before
1200eea0867SMinfei Huang * we corrupt it.
1210eea0867SMinfei Huang */
1220eea0867SMinfei Huang kimage_free(xchg(&kexec_crash_image, NULL));
1230eea0867SMinfei Huang }
1240eea0867SMinfei Huang
1250eea0867SMinfei Huang ret = kimage_alloc_init(&image, entry, nr_segments, segments, flags);
1260eea0867SMinfei Huang if (ret)
1274b692e86SArnd Bergmann goto out_unlock;
1280eea0867SMinfei Huang
1290eea0867SMinfei Huang if (flags & KEXEC_PRESERVE_CONTEXT)
1300eea0867SMinfei Huang image->preserve_context = 1;
1310eea0867SMinfei Huang
132a72bbec7SEric DeVolder #ifdef CONFIG_CRASH_HOTPLUG
133a72bbec7SEric DeVolder if (flags & KEXEC_UPDATE_ELFCOREHDR)
134a72bbec7SEric DeVolder image->update_elfcorehdr = 1;
135a72bbec7SEric DeVolder #endif
136a72bbec7SEric DeVolder
1370eea0867SMinfei Huang ret = machine_kexec_prepare(image);
1380eea0867SMinfei Huang if (ret)
1390eea0867SMinfei Huang goto out;
1400eea0867SMinfei Huang
1411229384fSXunlei Pang /*
1421229384fSXunlei Pang * Some architecture(like S390) may touch the crash memory before
1431229384fSXunlei Pang * machine_kexec_prepare(), we must copy vmcoreinfo data after it.
1441229384fSXunlei Pang */
1451229384fSXunlei Pang ret = kimage_crash_copy_vmcoreinfo(image);
1461229384fSXunlei Pang if (ret)
1471229384fSXunlei Pang goto out;
1481229384fSXunlei Pang
1490eea0867SMinfei Huang for (i = 0; i < nr_segments; i++) {
1500eea0867SMinfei Huang ret = kimage_load_segment(image, &image->segment[i]);
1510eea0867SMinfei Huang if (ret)
1520eea0867SMinfei Huang goto out;
1530eea0867SMinfei Huang }
1540eea0867SMinfei Huang
1550eea0867SMinfei Huang kimage_terminate(image);
1560eea0867SMinfei Huang
157de68e4daSPavel Tatashin ret = machine_kexec_post_load(image);
158de68e4daSPavel Tatashin if (ret)
159de68e4daSPavel Tatashin goto out;
160de68e4daSPavel Tatashin
1610eea0867SMinfei Huang /* Install the new kernel and uninstall the old */
1620eea0867SMinfei Huang image = xchg(dest_image, image);
1630eea0867SMinfei Huang
1640eea0867SMinfei Huang out:
1650eea0867SMinfei Huang if ((flags & KEXEC_ON_CRASH) && kexec_crash_image)
1660eea0867SMinfei Huang arch_kexec_protect_crashkres();
1670eea0867SMinfei Huang
1680eea0867SMinfei Huang kimage_free(image);
1694b692e86SArnd Bergmann out_unlock:
17005c62574SValentin Schneider kexec_unlock();
1710eea0867SMinfei Huang return ret;
1720eea0867SMinfei Huang }
1730eea0867SMinfei Huang
174dc009d92SEric W. Biederman /*
175dc009d92SEric W. Biederman * Exec Kernel system call: for obvious reasons only root may call it.
176dc009d92SEric W. Biederman *
177dc009d92SEric W. Biederman * This call breaks up into three pieces.
178dc009d92SEric W. Biederman * - A generic part which loads the new kernel from the current
179dc009d92SEric W. Biederman * address space, and very carefully places the data in the
180dc009d92SEric W. Biederman * allocated pages.
181dc009d92SEric W. Biederman *
182dc009d92SEric W. Biederman * - A generic part that interacts with the kernel and tells all of
183dc009d92SEric W. Biederman * the devices to shut down. Preventing on-going dmas, and placing
184dc009d92SEric W. Biederman * the devices in a consistent state so a later kernel can
185dc009d92SEric W. Biederman * reinitialize them.
186dc009d92SEric W. Biederman *
187dc009d92SEric W. Biederman * - A machine specific part that includes the syscall number
188002ace78SGeert Uytterhoeven * and then copies the image to it's final destination. And
189dc009d92SEric W. Biederman * jumps into the image at entry.
190dc009d92SEric W. Biederman *
191dc009d92SEric W. Biederman * kexec does not sync, or unmount filesystems so if you need
192dc009d92SEric W. Biederman * that to happen you need to do that yourself.
193dc009d92SEric W. Biederman */
1948c5a1cf0SAndrew Morton
kexec_load_check(unsigned long nr_segments,unsigned long flags)1956b27aef0SDominik Brodowski static inline int kexec_load_check(unsigned long nr_segments,
1966b27aef0SDominik Brodowski unsigned long flags)
197dc009d92SEric W. Biederman {
198a42aaad2SRicardo Ribalda int image_type = (flags & KEXEC_ON_CRASH) ?
199a42aaad2SRicardo Ribalda KEXEC_TYPE_CRASH : KEXEC_TYPE_DEFAULT;
200a210fd32SMimi Zohar int result;
201a210fd32SMimi Zohar
202dc009d92SEric W. Biederman /* We only trust the superuser with rebooting the system. */
203a42aaad2SRicardo Ribalda if (!kexec_load_permitted(image_type))
204dc009d92SEric W. Biederman return -EPERM;
205dc009d92SEric W. Biederman
206a210fd32SMimi Zohar /* Permit LSMs and IMA to fail the kexec */
207b64fcae7SKees Cook result = security_kernel_load_data(LOADING_KEXEC_IMAGE, false);
208a210fd32SMimi Zohar if (result < 0)
209a210fd32SMimi Zohar return result;
210a210fd32SMimi Zohar
211dc009d92SEric W. Biederman /*
2127d31f460SMatthew Garrett * kexec can be used to circumvent module loading restrictions, so
2137d31f460SMatthew Garrett * prevent loading in that case
2147d31f460SMatthew Garrett */
2157d31f460SMatthew Garrett result = security_locked_down(LOCKDOWN_KEXEC);
2167d31f460SMatthew Garrett if (result)
2177d31f460SMatthew Garrett return result;
2187d31f460SMatthew Garrett
2197d31f460SMatthew Garrett /*
220dc009d92SEric W. Biederman * Verify we have a legal set of flags
221dc009d92SEric W. Biederman * This leaves us room for future extensions.
222dc009d92SEric W. Biederman */
223dc009d92SEric W. Biederman if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK))
224dc009d92SEric W. Biederman return -EINVAL;
225dc009d92SEric W. Biederman
226dc009d92SEric W. Biederman /* Put an artificial cap on the number
227dc009d92SEric W. Biederman * of segments passed to kexec_load.
228dc009d92SEric W. Biederman */
229dc009d92SEric W. Biederman if (nr_segments > KEXEC_SEGMENT_MAX)
230dc009d92SEric W. Biederman return -EINVAL;
231dc009d92SEric W. Biederman
2326b27aef0SDominik Brodowski return 0;
2336b27aef0SDominik Brodowski }
2346b27aef0SDominik Brodowski
SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
		struct kexec_segment __user *, segments, unsigned long, flags)
{
	struct kexec_segment *ksegs;
	unsigned long rc;

	rc = kexec_load_check(nr_segments, flags);
	if (rc)
		return rc;

	/* Verify we are on the appropriate architecture. */
	if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) &&
	    ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT))
		return -EINVAL;

	/* Copy the userspace segment descriptors into kernel memory. */
	ksegs = memdup_array_user(segments, nr_segments, sizeof(ksegs[0]));
	if (IS_ERR(ksegs))
		return PTR_ERR(ksegs);

	rc = do_kexec_load(entry, nr_segments, ksegs, flags);
	kfree(ksegs);

	return rc;
}
259dc009d92SEric W. Biederman
260dc009d92SEric W. Biederman #ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry,
		       compat_ulong_t, nr_segments,
		       struct compat_kexec_segment __user *, segments,
		       compat_ulong_t, flags)
{
	struct compat_kexec_segment cseg;
	struct kexec_segment *ksegs;
	unsigned long idx, rc;

	rc = kexec_load_check(nr_segments, flags);
	if (rc)
		return rc;

	/*
	 * Don't allow clients that don't understand the native
	 * architecture to do anything.
	 */
	if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT)
		return -EINVAL;

	ksegs = kmalloc_array(nr_segments, sizeof(ksegs[0]), GFP_KERNEL);
	if (!ksegs)
		return -ENOMEM;

	/* Widen each 32-bit segment descriptor to the native layout. */
	for (idx = 0; idx < nr_segments; idx++) {
		/*
		 * NOTE(review): copy_from_user() returns the number of
		 * bytes NOT copied, so a fault here propagates a positive
		 * count rather than -EFAULT — long-standing behavior,
		 * preserved as-is.
		 */
		rc = copy_from_user(&cseg, &segments[idx], sizeof(cseg));
		if (rc)
			goto fail;

		ksegs[idx].buf = compat_ptr(cseg.buf);
		ksegs[idx].bufsz = cseg.bufsz;
		ksegs[idx].mem = cseg.mem;
		ksegs[idx].memsz = cseg.memsz;
	}

	rc = do_kexec_load(entry, nr_segments, ksegs, flags);

fail:
	kfree(ksegs);
	return rc;
}
303