xref: /openbmc/linux/kernel/kexec.c (revision b97d6790d03b763eca08847a9a5869a4291b9f9a)
// SPDX-License-Identifier: GPL-2.0-only
/*
 * kexec.c - kexec_load system call
 * Copyright (C) 2002-2004 Eric Biederman  <ebiederm@xmission.com>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/mm.h>
#include <linux/file.h>
#include <linux/security.h>
#include <linux/kexec.h>
#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/syscalls.h>
#include <linux/vmalloc.h>
#include <linux/slab.h>

#include "kexec_internal.h"

static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
			     unsigned long nr_segments,
			     struct kexec_segment *segments,
			     unsigned long flags)
{
	int ret;
	struct kimage *image;
	bool kexec_on_panic = flags & KEXEC_ON_CRASH;

	if (kexec_on_panic) {
		/* Verify the entry point lies inside the reserved crashkernel region */
		if ((entry < phys_to_boot_phys(crashk_res.start)) ||
		    (entry > phys_to_boot_phys(crashk_res.end)))
			return -EADDRNOTAVAIL;
	}

	/* Allocate and initialize a controlling structure */
	image = do_kimage_alloc_init();
	if (!image)
		return -ENOMEM;

	image->start = entry;
	image->nr_segments = nr_segments;
	memcpy(image->segment, segments, nr_segments * sizeof(*segments));

	if (kexec_on_panic) {
		/* Enable special crash kernel control page alloc policy. */
		image->control_page = crashk_res.start;
		image->type = KEXEC_TYPE_CRASH;
	}

	ret = sanity_check_segment_list(image);
	if (ret)
		goto out_free_image;

	/*
	 * Find a location for the control code buffer, and add it to
	 * the vector of segments so that its pages will also be
	 * counted as destination pages.
	 */
	ret = -ENOMEM;
	image->control_code_page = kimage_alloc_control_pages(image,
					   get_order(KEXEC_CONTROL_PAGE_SIZE));
	if (!image->control_code_page) {
		pr_err("Could not allocate control_code_buffer\n");
		goto out_free_image;
	}

	if (!kexec_on_panic) {
		image->swap_page = kimage_alloc_control_pages(image, 0);
		if (!image->swap_page) {
			pr_err("Could not allocate swap buffer\n");
			goto out_free_control_pages;
		}
	}

	*rimage = image;
	return 0;
out_free_control_pages:
	kimage_free_page_list(&image->control_pages);
out_free_image:
	kfree(image);
	return ret;
}

static int do_kexec_load(unsigned long entry, unsigned long nr_segments,
		struct kexec_segment *segments, unsigned long flags)
{
	struct kimage **dest_image, *image;
	unsigned long i;
	int ret;

	/*
	 * Because we write directly to the reserved memory region when
	 * loading crash kernels, we need serialization here to prevent
	 * multiple crash kernels from attempting to load simultaneously.
	 */
	if (!kexec_trylock())
		return -EBUSY;

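	/*
	 * The architecture may have mapped the loaded crash image
	 * read-only to guard it against stray writes (see
	 * arch_kexec_protect_crashkres()); lift that protection
	 * before we replace the image.
	 */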
	if (flags & KEXEC_ON_CRASH) {
		dest_image = &kexec_crash_image;
		if (kexec_crash_image)
			arch_kexec_unprotect_crashkres();
	} else {
		dest_image = &kexec_image;
	}

	if (nr_segments == 0) {
		/* Uninstall image */
		kimage_free(xchg(dest_image, NULL));
		ret = 0;
		goto out_unlock;
	}
	if (flags & KEXEC_ON_CRASH) {
		/*
		 * Loading another kernel to switch to if this one
		 * crashes.  Free any current crash dump kernel before
		 * we corrupt it.
		 */
		kimage_free(xchg(&kexec_crash_image, NULL));
	}

	ret = kimage_alloc_init(&image, entry, nr_segments, segments, flags);
	if (ret)
		goto out_unlock;

	if (flags & KEXEC_PRESERVE_CONTEXT)
		image->preserve_context = 1;

#ifdef CONFIG_CRASH_HOTPLUG
	if (flags & KEXEC_UPDATE_ELFCOREHDR)
		image->update_elfcorehdr = 1;
#endif

	ret = machine_kexec_prepare(image);
	if (ret)
		goto out;

	/*
	 * Some architectures (like s390) may touch the crash memory
	 * before machine_kexec_prepare(), so we must copy the
	 * vmcoreinfo data after it.
	 */
	ret = kimage_crash_copy_vmcoreinfo(image);
	if (ret)
		goto out;

	for (i = 0; i < nr_segments; i++) {
		ret = kimage_load_segment(image, &image->segment[i]);
		if (ret)
			goto out;
	}

	kimage_terminate(image);

	ret = machine_kexec_post_load(image);
	if (ret)
		goto out;

	/* Install the new kernel and uninstall the old */
	image = xchg(dest_image, image);

out:
	if ((flags & KEXEC_ON_CRASH) && kexec_crash_image)
		arch_kexec_protect_crashkres();

	kimage_free(image);
out_unlock:
	kexec_unlock();
	return ret;
}

/*
 * Exec Kernel system call: for obvious reasons only root may call it.
 *
 * This call breaks up into three pieces.
 * - A generic part which loads the new kernel from the current
 *   address space, and very carefully places the data in the
 *   allocated pages.
 *
 * - A generic part that interacts with the kernel and tells all of
 *   the devices to shut down.  Preventing ongoing DMAs, and placing
 *   the devices in a consistent state so a later kernel can
 *   reinitialize them.
 *
 * - A machine specific part that includes the syscall number
 *   and then copies the image to its final destination.  And
 *   jumps into the image at entry.
 *
 * kexec does not sync or unmount filesystems, so if you need
 * that to happen you need to do that yourself.
 */
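/*
 * For illustration, a minimal userspace sketch of driving this syscall
 * (assumes the UAPI definitions from <linux/kexec.h>; the destination
 * address 0x100000, the buffer names, and glibc providing no wrapper
 * are assumptions for the sketch, not part of this file):
 *
 *	#include <linux/kexec.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	struct kexec_segment seg = {
 *		.buf   = image_buf,		// new kernel, in our memory
 *		.bufsz = image_len,
 *		.mem   = (void *)0x100000,	// destination physical address
 *		.memsz = (image_len + 4095) & ~4095UL,	// page-aligned size
 *	};
 *	// entry must point inside a loaded segment
 *	if (syscall(SYS_kexec_load, entry, 1UL, &seg, KEXEC_ARCH_DEFAULT))
 *		perror("kexec_load");
 */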

static inline int kexec_load_check(unsigned long nr_segments,
				   unsigned long flags)
{
	int image_type = (flags & KEXEC_ON_CRASH) ?
			 KEXEC_TYPE_CRASH : KEXEC_TYPE_DEFAULT;
	int result;

	/* We only trust the superuser with rebooting the system. */
	if (!kexec_load_permitted(image_type))
		return -EPERM;

	/* Permit LSMs and IMA to fail the kexec */
	result = security_kernel_load_data(LOADING_KEXEC_IMAGE, false);
	if (result < 0)
		return result;

	/*
	 * kexec can be used to circumvent module loading restrictions, so
	 * prevent loading in that case
	 */
	result = security_locked_down(LOCKDOWN_KEXEC);
	if (result)
		return result;

	/*
	 * Verify we have a legal set of flags: with the architecture
	 * bits masked off, only bits within KEXEC_FLAGS may be set.
	 * This leaves us room for future extensions.
	 */
	if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK))
		return -EINVAL;

	/*
	 * Put an artificial cap on the number
	 * of segments passed to kexec_load.
	 */
	if (nr_segments > KEXEC_SEGMENT_MAX)
		return -EINVAL;

	return 0;
}

SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
		struct kexec_segment __user *, segments, unsigned long, flags)
{
	struct kexec_segment *ksegments;
	unsigned long result;

	result = kexec_load_check(nr_segments, flags);
	if (result)
		return result;

	/* Verify we are on the appropriate architecture */
	if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) &&
		((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT))
		return -EINVAL;

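	/*
	 * memdup_array_user() guards the nr_segments * sizeof() multiply
	 * against overflow before allocating and copying from userspace.
	 */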
	ksegments = memdup_array_user(segments, nr_segments, sizeof(ksegments[0]));
	if (IS_ERR(ksegments))
		return PTR_ERR(ksegments);

	result = do_kexec_load(entry, nr_segments, ksegments, flags);
	kfree(ksegments);

	return result;
}

#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry,
		       compat_ulong_t, nr_segments,
		       struct compat_kexec_segment __user *, segments,
		       compat_ulong_t, flags)
{
	struct compat_kexec_segment in;
	struct kexec_segment *ksegments;
	unsigned long i, result;

	result = kexec_load_check(nr_segments, flags);
	if (result)
		return result;

	/*
	 * Don't allow clients that don't understand the native
	 * architecture to do anything.
	 */
	if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT)
		return -EINVAL;

	ksegments = kmalloc_array(nr_segments, sizeof(ksegments[0]),
			GFP_KERNEL);
	if (!ksegments)
		return -ENOMEM;

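	/*
	 * Widen each 32-bit compat segment into the native layout;
	 * compat_ptr() converts the 32-bit user pointers.
	 */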
	for (i = 0; i < nr_segments; i++) {
		result = copy_from_user(&in, &segments[i], sizeof(in));
		if (result) {
			/* copy_from_user() returns bytes left uncopied */
			result = -EFAULT;
			goto fail;
		}

		ksegments[i].buf   = compat_ptr(in.buf);
		ksegments[i].bufsz = in.bufsz;
		ksegments[i].mem   = in.mem;
		ksegments[i].memsz = in.memsz;
	}

	result = do_kexec_load(entry, nr_segments, ksegments, flags);

fail:
	kfree(ksegments);
	return result;
}
#endif