xref: /openbmc/linux/kernel/kexec_core.c (revision 4f205687)
1 /*
2  * kexec.c - kexec system call core code.
3  * Copyright (C) 2002-2004 Eric Biederman  <ebiederm@xmission.com>
4  *
5  * This source code is licensed under the GNU General Public License,
6  * Version 2.  See the file COPYING for more details.
7  */
8 
9 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
10 
11 #include <linux/capability.h>
12 #include <linux/mm.h>
13 #include <linux/file.h>
14 #include <linux/slab.h>
15 #include <linux/fs.h>
16 #include <linux/kexec.h>
17 #include <linux/mutex.h>
18 #include <linux/list.h>
19 #include <linux/highmem.h>
20 #include <linux/syscalls.h>
21 #include <linux/reboot.h>
22 #include <linux/ioport.h>
23 #include <linux/hardirq.h>
24 #include <linux/elf.h>
25 #include <linux/elfcore.h>
26 #include <linux/utsname.h>
27 #include <linux/numa.h>
28 #include <linux/suspend.h>
29 #include <linux/device.h>
30 #include <linux/freezer.h>
31 #include <linux/pm.h>
32 #include <linux/cpu.h>
33 #include <linux/uaccess.h>
34 #include <linux/io.h>
35 #include <linux/console.h>
36 #include <linux/vmalloc.h>
37 #include <linux/swap.h>
38 #include <linux/syscore_ops.h>
39 #include <linux/compiler.h>
40 #include <linux/hugetlb.h>
41 
42 #include <asm/page.h>
43 #include <asm/sections.h>
44 
45 #include <crypto/hash.h>
46 #include <crypto/sha.h>
47 #include "kexec_internal.h"
48 
49 DEFINE_MUTEX(kexec_mutex);
50 
51 /* Per cpu memory for storing cpu states in case of system crash. */
52 note_buf_t __percpu *crash_notes;
53 
54 /* vmcoreinfo stuff */
55 static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
56 u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
57 size_t vmcoreinfo_size;
58 size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
59 
60 /* Flag to indicate we are going to kexec a new kernel */
61 bool kexec_in_progress = false;
62 
63 
64 /* Location of the reserved area for the crash kernel */
65 struct resource crashk_res = {
66 	.name  = "Crash kernel",
67 	.start = 0,
68 	.end   = 0,
69 	.flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
70 	.desc  = IORES_DESC_CRASH_KERNEL
71 };
72 struct resource crashk_low_res = {
73 	.name  = "Crash kernel",
74 	.start = 0,
75 	.end   = 0,
76 	.flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
77 	.desc  = IORES_DESC_CRASH_KERNEL
78 };
79 
80 int kexec_should_crash(struct task_struct *p)
81 {
82 	/*
83 	 * If crash_kexec_post_notifiers is enabled, don't run
84 	 * crash_kexec() here yet, which must be run after panic
85 	 * notifiers in panic().
86 	 */
87 	if (crash_kexec_post_notifiers)
88 		return 0;
89 	/*
90 	 * There are 4 panic() calls in do_exit() path, each of which
91 	 * corresponds to each of these 4 conditions.
92 	 */
93 	if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops)
94 		return 1;
95 	return 0;
96 }
97 
98 /*
99  * When kexec transitions to the new kernel there is a one-to-one
100  * mapping between physical and virtual addresses.  On processors
101  * where you can disable the MMU this is trivial, and easy.  For
102  * others it is still a simple predictable page table to setup.
103  *
104  * In that environment kexec copies the new kernel to its final
105  * resting place.  This means I can only support memory whose
106  * physical address can fit in an unsigned long.  In particular
107  * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
108  * If the assembly stub has more restrictive requirements
109  * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
110  * defined more restrictively in <asm/kexec.h>.
111  *
112  * The code for the transition from the current kernel to the
113  * the new kernel is placed in the control_code_buffer, whose size
114  * is given by KEXEC_CONTROL_PAGE_SIZE.  In the best case only a single
115  * page of memory is necessary, but some architectures require more.
116  * Because this memory must be identity mapped in the transition from
117  * virtual to physical addresses it must live in the range
118  * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
119  * modifiable.
120  *
121  * The assembly stub in the control code buffer is passed a linked list
122  * of descriptor pages detailing the source pages of the new kernel,
123  * and the destination addresses of those source pages.  As this data
124  * structure is not used in the context of the current OS, it must
125  * be self-contained.
126  *
127  * The code has been made to work with highmem pages and will use a
128  * destination page in its final resting place (if it happens
129  * to allocate it).  The end product of this is that most of the
130  * physical address space, and most of RAM can be used.
131  *
132  * Future directions include:
133  *  - allocating a page table with the control code buffer identity
134  *    mapped, to simplify machine_kexec and make kexec_on_panic more
135  *    reliable.
136  */
137 
138 /*
139  * KIMAGE_NO_DEST is an impossible destination address..., for
140  * allocating pages whose destination address we do not care about.
141  */
142 #define KIMAGE_NO_DEST (-1UL)
143 
144 static struct page *kimage_alloc_page(struct kimage *image,
145 				       gfp_t gfp_mask,
146 				       unsigned long dest);
147 
148 int sanity_check_segment_list(struct kimage *image)
149 {
150 	int result, i;
151 	unsigned long nr_segments = image->nr_segments;
152 
153 	/*
154 	 * Verify we have good destination addresses.  The caller is
155 	 * responsible for making certain we don't attempt to load
156 	 * the new image into invalid or reserved areas of RAM.  This
157 	 * just verifies it is an address we can use.
158 	 *
159 	 * Since the kernel does everything in page size chunks ensure
160 	 * the destination addresses are page aligned.  Too many
161 	 * special cases crop of when we don't do this.  The most
162 	 * insidious is getting overlapping destination addresses
163 	 * simply because addresses are changed to page size
164 	 * granularity.
165 	 */
166 	result = -EADDRNOTAVAIL;
167 	for (i = 0; i < nr_segments; i++) {
168 		unsigned long mstart, mend;
169 
170 		mstart = image->segment[i].mem;
171 		mend   = mstart + image->segment[i].memsz;
172 		if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
173 			return result;
174 		if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
175 			return result;
176 	}
177 
178 	/* Verify our destination addresses do not overlap.
179 	 * If we alloed overlapping destination addresses
180 	 * through very weird things can happen with no
181 	 * easy explanation as one segment stops on another.
182 	 */
183 	result = -EINVAL;
184 	for (i = 0; i < nr_segments; i++) {
185 		unsigned long mstart, mend;
186 		unsigned long j;
187 
188 		mstart = image->segment[i].mem;
189 		mend   = mstart + image->segment[i].memsz;
190 		for (j = 0; j < i; j++) {
191 			unsigned long pstart, pend;
192 
193 			pstart = image->segment[j].mem;
194 			pend   = pstart + image->segment[j].memsz;
195 			/* Do the segments overlap ? */
196 			if ((mend > pstart) && (mstart < pend))
197 				return result;
198 		}
199 	}
200 
201 	/* Ensure our buffer sizes are strictly less than
202 	 * our memory sizes.  This should always be the case,
203 	 * and it is easier to check up front than to be surprised
204 	 * later on.
205 	 */
206 	result = -EINVAL;
207 	for (i = 0; i < nr_segments; i++) {
208 		if (image->segment[i].bufsz > image->segment[i].memsz)
209 			return result;
210 	}
211 
212 	/*
213 	 * Verify we have good destination addresses.  Normally
214 	 * the caller is responsible for making certain we don't
215 	 * attempt to load the new image into invalid or reserved
216 	 * areas of RAM.  But crash kernels are preloaded into a
217 	 * reserved area of ram.  We must ensure the addresses
218 	 * are in the reserved area otherwise preloading the
219 	 * kernel could corrupt things.
220 	 */
221 
222 	if (image->type == KEXEC_TYPE_CRASH) {
223 		result = -EADDRNOTAVAIL;
224 		for (i = 0; i < nr_segments; i++) {
225 			unsigned long mstart, mend;
226 
227 			mstart = image->segment[i].mem;
228 			mend = mstart + image->segment[i].memsz - 1;
229 			/* Ensure we are within the crash kernel limits */
230 			if ((mstart < crashk_res.start) ||
231 			    (mend > crashk_res.end))
232 				return result;
233 		}
234 	}
235 
236 	return 0;
237 }
238 
239 struct kimage *do_kimage_alloc_init(void)
240 {
241 	struct kimage *image;
242 
243 	/* Allocate a controlling structure */
244 	image = kzalloc(sizeof(*image), GFP_KERNEL);
245 	if (!image)
246 		return NULL;
247 
248 	image->head = 0;
249 	image->entry = &image->head;
250 	image->last_entry = &image->head;
251 	image->control_page = ~0; /* By default this does not apply */
252 	image->type = KEXEC_TYPE_DEFAULT;
253 
254 	/* Initialize the list of control pages */
255 	INIT_LIST_HEAD(&image->control_pages);
256 
257 	/* Initialize the list of destination pages */
258 	INIT_LIST_HEAD(&image->dest_pages);
259 
260 	/* Initialize the list of unusable pages */
261 	INIT_LIST_HEAD(&image->unusable_pages);
262 
263 	return image;
264 }
265 
266 int kimage_is_destination_range(struct kimage *image,
267 					unsigned long start,
268 					unsigned long end)
269 {
270 	unsigned long i;
271 
272 	for (i = 0; i < image->nr_segments; i++) {
273 		unsigned long mstart, mend;
274 
275 		mstart = image->segment[i].mem;
276 		mend = mstart + image->segment[i].memsz;
277 		if ((end > mstart) && (start < mend))
278 			return 1;
279 	}
280 
281 	return 0;
282 }
283 
284 static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
285 {
286 	struct page *pages;
287 
288 	pages = alloc_pages(gfp_mask, order);
289 	if (pages) {
290 		unsigned int count, i;
291 
292 		pages->mapping = NULL;
293 		set_page_private(pages, order);
294 		count = 1 << order;
295 		for (i = 0; i < count; i++)
296 			SetPageReserved(pages + i);
297 	}
298 
299 	return pages;
300 }
301 
302 static void kimage_free_pages(struct page *page)
303 {
304 	unsigned int order, count, i;
305 
306 	order = page_private(page);
307 	count = 1 << order;
308 	for (i = 0; i < count; i++)
309 		ClearPageReserved(page + i);
310 	__free_pages(page, order);
311 }
312 
313 void kimage_free_page_list(struct list_head *list)
314 {
315 	struct page *page, *next;
316 
317 	list_for_each_entry_safe(page, next, list, lru) {
318 		list_del(&page->lru);
319 		kimage_free_pages(page);
320 	}
321 }
322 
323 static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
324 							unsigned int order)
325 {
326 	/* Control pages are special, they are the intermediaries
327 	 * that are needed while we copy the rest of the pages
328 	 * to their final resting place.  As such they must
329 	 * not conflict with either the destination addresses
330 	 * or memory the kernel is already using.
331 	 *
332 	 * The only case where we really need more than one of
333 	 * these are for architectures where we cannot disable
334 	 * the MMU and must instead generate an identity mapped
335 	 * page table for all of the memory.
336 	 *
337 	 * At worst this runs in O(N) of the image size.
338 	 */
339 	struct list_head extra_pages;
340 	struct page *pages;
341 	unsigned int count;
342 
343 	count = 1 << order;
344 	INIT_LIST_HEAD(&extra_pages);
345 
346 	/* Loop while I can allocate a page and the page allocated
347 	 * is a destination page.
348 	 */
349 	do {
350 		unsigned long pfn, epfn, addr, eaddr;
351 
352 		pages = kimage_alloc_pages(KEXEC_CONTROL_MEMORY_GFP, order);
353 		if (!pages)
354 			break;
355 		pfn   = page_to_pfn(pages);
356 		epfn  = pfn + count;
357 		addr  = pfn << PAGE_SHIFT;
358 		eaddr = epfn << PAGE_SHIFT;
359 		if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
360 			      kimage_is_destination_range(image, addr, eaddr)) {
361 			list_add(&pages->lru, &extra_pages);
362 			pages = NULL;
363 		}
364 	} while (!pages);
365 
366 	if (pages) {
367 		/* Remember the allocated page... */
368 		list_add(&pages->lru, &image->control_pages);
369 
370 		/* Because the page is already in it's destination
371 		 * location we will never allocate another page at
372 		 * that address.  Therefore kimage_alloc_pages
373 		 * will not return it (again) and we don't need
374 		 * to give it an entry in image->segment[].
375 		 */
376 	}
377 	/* Deal with the destination pages I have inadvertently allocated.
378 	 *
379 	 * Ideally I would convert multi-page allocations into single
380 	 * page allocations, and add everything to image->dest_pages.
381 	 *
382 	 * For now it is simpler to just free the pages.
383 	 */
384 	kimage_free_page_list(&extra_pages);
385 
386 	return pages;
387 }
388 
389 static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
390 						      unsigned int order)
391 {
392 	/* Control pages are special, they are the intermediaries
393 	 * that are needed while we copy the rest of the pages
394 	 * to their final resting place.  As such they must
395 	 * not conflict with either the destination addresses
396 	 * or memory the kernel is already using.
397 	 *
398 	 * Control pages are also the only pags we must allocate
399 	 * when loading a crash kernel.  All of the other pages
400 	 * are specified by the segments and we just memcpy
401 	 * into them directly.
402 	 *
403 	 * The only case where we really need more than one of
404 	 * these are for architectures where we cannot disable
405 	 * the MMU and must instead generate an identity mapped
406 	 * page table for all of the memory.
407 	 *
408 	 * Given the low demand this implements a very simple
409 	 * allocator that finds the first hole of the appropriate
410 	 * size in the reserved memory region, and allocates all
411 	 * of the memory up to and including the hole.
412 	 */
413 	unsigned long hole_start, hole_end, size;
414 	struct page *pages;
415 
416 	pages = NULL;
417 	size = (1 << order) << PAGE_SHIFT;
418 	hole_start = (image->control_page + (size - 1)) & ~(size - 1);
419 	hole_end   = hole_start + size - 1;
420 	while (hole_end <= crashk_res.end) {
421 		unsigned long i;
422 
423 		if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT)
424 			break;
425 		/* See if I overlap any of the segments */
426 		for (i = 0; i < image->nr_segments; i++) {
427 			unsigned long mstart, mend;
428 
429 			mstart = image->segment[i].mem;
430 			mend   = mstart + image->segment[i].memsz - 1;
431 			if ((hole_end >= mstart) && (hole_start <= mend)) {
432 				/* Advance the hole to the end of the segment */
433 				hole_start = (mend + (size - 1)) & ~(size - 1);
434 				hole_end   = hole_start + size - 1;
435 				break;
436 			}
437 		}
438 		/* If I don't overlap any segments I have found my hole! */
439 		if (i == image->nr_segments) {
440 			pages = pfn_to_page(hole_start >> PAGE_SHIFT);
441 			image->control_page = hole_end;
442 			break;
443 		}
444 	}
445 
446 	return pages;
447 }
448 
449 
450 struct page *kimage_alloc_control_pages(struct kimage *image,
451 					 unsigned int order)
452 {
453 	struct page *pages = NULL;
454 
455 	switch (image->type) {
456 	case KEXEC_TYPE_DEFAULT:
457 		pages = kimage_alloc_normal_control_pages(image, order);
458 		break;
459 	case KEXEC_TYPE_CRASH:
460 		pages = kimage_alloc_crash_control_pages(image, order);
461 		break;
462 	}
463 
464 	return pages;
465 }
466 
467 static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
468 {
469 	if (*image->entry != 0)
470 		image->entry++;
471 
472 	if (image->entry == image->last_entry) {
473 		kimage_entry_t *ind_page;
474 		struct page *page;
475 
476 		page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
477 		if (!page)
478 			return -ENOMEM;
479 
480 		ind_page = page_address(page);
481 		*image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
482 		image->entry = ind_page;
483 		image->last_entry = ind_page +
484 				      ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
485 	}
486 	*image->entry = entry;
487 	image->entry++;
488 	*image->entry = 0;
489 
490 	return 0;
491 }
492 
493 static int kimage_set_destination(struct kimage *image,
494 				   unsigned long destination)
495 {
496 	int result;
497 
498 	destination &= PAGE_MASK;
499 	result = kimage_add_entry(image, destination | IND_DESTINATION);
500 
501 	return result;
502 }
503 
504 
505 static int kimage_add_page(struct kimage *image, unsigned long page)
506 {
507 	int result;
508 
509 	page &= PAGE_MASK;
510 	result = kimage_add_entry(image, page | IND_SOURCE);
511 
512 	return result;
513 }
514 
515 
516 static void kimage_free_extra_pages(struct kimage *image)
517 {
518 	/* Walk through and free any extra destination pages I may have */
519 	kimage_free_page_list(&image->dest_pages);
520 
521 	/* Walk through and free any unusable pages I have cached */
522 	kimage_free_page_list(&image->unusable_pages);
523 
524 }
525 void kimage_terminate(struct kimage *image)
526 {
527 	if (*image->entry != 0)
528 		image->entry++;
529 
530 	*image->entry = IND_DONE;
531 }
532 
533 #define for_each_kimage_entry(image, ptr, entry) \
534 	for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
535 		ptr = (entry & IND_INDIRECTION) ? \
536 			phys_to_virt((entry & PAGE_MASK)) : ptr + 1)
537 
538 static void kimage_free_entry(kimage_entry_t entry)
539 {
540 	struct page *page;
541 
542 	page = pfn_to_page(entry >> PAGE_SHIFT);
543 	kimage_free_pages(page);
544 }
545 
546 void kimage_free(struct kimage *image)
547 {
548 	kimage_entry_t *ptr, entry;
549 	kimage_entry_t ind = 0;
550 
551 	if (!image)
552 		return;
553 
554 	kimage_free_extra_pages(image);
555 	for_each_kimage_entry(image, ptr, entry) {
556 		if (entry & IND_INDIRECTION) {
557 			/* Free the previous indirection page */
558 			if (ind & IND_INDIRECTION)
559 				kimage_free_entry(ind);
560 			/* Save this indirection page until we are
561 			 * done with it.
562 			 */
563 			ind = entry;
564 		} else if (entry & IND_SOURCE)
565 			kimage_free_entry(entry);
566 	}
567 	/* Free the final indirection page */
568 	if (ind & IND_INDIRECTION)
569 		kimage_free_entry(ind);
570 
571 	/* Handle any machine specific cleanup */
572 	machine_kexec_cleanup(image);
573 
574 	/* Free the kexec control pages... */
575 	kimage_free_page_list(&image->control_pages);
576 
577 	/*
578 	 * Free up any temporary buffers allocated. This might hit if
579 	 * error occurred much later after buffer allocation.
580 	 */
581 	if (image->file_mode)
582 		kimage_file_post_load_cleanup(image);
583 
584 	kfree(image);
585 }
586 
587 static kimage_entry_t *kimage_dst_used(struct kimage *image,
588 					unsigned long page)
589 {
590 	kimage_entry_t *ptr, entry;
591 	unsigned long destination = 0;
592 
593 	for_each_kimage_entry(image, ptr, entry) {
594 		if (entry & IND_DESTINATION)
595 			destination = entry & PAGE_MASK;
596 		else if (entry & IND_SOURCE) {
597 			if (page == destination)
598 				return ptr;
599 			destination += PAGE_SIZE;
600 		}
601 	}
602 
603 	return NULL;
604 }
605 
606 static struct page *kimage_alloc_page(struct kimage *image,
607 					gfp_t gfp_mask,
608 					unsigned long destination)
609 {
610 	/*
611 	 * Here we implement safeguards to ensure that a source page
612 	 * is not copied to its destination page before the data on
613 	 * the destination page is no longer useful.
614 	 *
615 	 * To do this we maintain the invariant that a source page is
616 	 * either its own destination page, or it is not a
617 	 * destination page at all.
618 	 *
619 	 * That is slightly stronger than required, but the proof
620 	 * that no problems will not occur is trivial, and the
621 	 * implementation is simply to verify.
622 	 *
623 	 * When allocating all pages normally this algorithm will run
624 	 * in O(N) time, but in the worst case it will run in O(N^2)
625 	 * time.   If the runtime is a problem the data structures can
626 	 * be fixed.
627 	 */
628 	struct page *page;
629 	unsigned long addr;
630 
631 	/*
632 	 * Walk through the list of destination pages, and see if I
633 	 * have a match.
634 	 */
635 	list_for_each_entry(page, &image->dest_pages, lru) {
636 		addr = page_to_pfn(page) << PAGE_SHIFT;
637 		if (addr == destination) {
638 			list_del(&page->lru);
639 			return page;
640 		}
641 	}
642 	page = NULL;
643 	while (1) {
644 		kimage_entry_t *old;
645 
646 		/* Allocate a page, if we run out of memory give up */
647 		page = kimage_alloc_pages(gfp_mask, 0);
648 		if (!page)
649 			return NULL;
650 		/* If the page cannot be used file it away */
651 		if (page_to_pfn(page) >
652 				(KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
653 			list_add(&page->lru, &image->unusable_pages);
654 			continue;
655 		}
656 		addr = page_to_pfn(page) << PAGE_SHIFT;
657 
658 		/* If it is the destination page we want use it */
659 		if (addr == destination)
660 			break;
661 
662 		/* If the page is not a destination page use it */
663 		if (!kimage_is_destination_range(image, addr,
664 						  addr + PAGE_SIZE))
665 			break;
666 
667 		/*
668 		 * I know that the page is someones destination page.
669 		 * See if there is already a source page for this
670 		 * destination page.  And if so swap the source pages.
671 		 */
672 		old = kimage_dst_used(image, addr);
673 		if (old) {
674 			/* If so move it */
675 			unsigned long old_addr;
676 			struct page *old_page;
677 
678 			old_addr = *old & PAGE_MASK;
679 			old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
680 			copy_highpage(page, old_page);
681 			*old = addr | (*old & ~PAGE_MASK);
682 
683 			/* The old page I have found cannot be a
684 			 * destination page, so return it if it's
685 			 * gfp_flags honor the ones passed in.
686 			 */
687 			if (!(gfp_mask & __GFP_HIGHMEM) &&
688 			    PageHighMem(old_page)) {
689 				kimage_free_pages(old_page);
690 				continue;
691 			}
692 			addr = old_addr;
693 			page = old_page;
694 			break;
695 		}
696 		/* Place the page on the destination list, to be used later */
697 		list_add(&page->lru, &image->dest_pages);
698 	}
699 
700 	return page;
701 }
702 
703 static int kimage_load_normal_segment(struct kimage *image,
704 					 struct kexec_segment *segment)
705 {
706 	unsigned long maddr;
707 	size_t ubytes, mbytes;
708 	int result;
709 	unsigned char __user *buf = NULL;
710 	unsigned char *kbuf = NULL;
711 
712 	result = 0;
713 	if (image->file_mode)
714 		kbuf = segment->kbuf;
715 	else
716 		buf = segment->buf;
717 	ubytes = segment->bufsz;
718 	mbytes = segment->memsz;
719 	maddr = segment->mem;
720 
721 	result = kimage_set_destination(image, maddr);
722 	if (result < 0)
723 		goto out;
724 
725 	while (mbytes) {
726 		struct page *page;
727 		char *ptr;
728 		size_t uchunk, mchunk;
729 
730 		page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
731 		if (!page) {
732 			result  = -ENOMEM;
733 			goto out;
734 		}
735 		result = kimage_add_page(image, page_to_pfn(page)
736 								<< PAGE_SHIFT);
737 		if (result < 0)
738 			goto out;
739 
740 		ptr = kmap(page);
741 		/* Start with a clear page */
742 		clear_page(ptr);
743 		ptr += maddr & ~PAGE_MASK;
744 		mchunk = min_t(size_t, mbytes,
745 				PAGE_SIZE - (maddr & ~PAGE_MASK));
746 		uchunk = min(ubytes, mchunk);
747 
748 		/* For file based kexec, source pages are in kernel memory */
749 		if (image->file_mode)
750 			memcpy(ptr, kbuf, uchunk);
751 		else
752 			result = copy_from_user(ptr, buf, uchunk);
753 		kunmap(page);
754 		if (result) {
755 			result = -EFAULT;
756 			goto out;
757 		}
758 		ubytes -= uchunk;
759 		maddr  += mchunk;
760 		if (image->file_mode)
761 			kbuf += mchunk;
762 		else
763 			buf += mchunk;
764 		mbytes -= mchunk;
765 	}
766 out:
767 	return result;
768 }
769 
770 static int kimage_load_crash_segment(struct kimage *image,
771 					struct kexec_segment *segment)
772 {
773 	/* For crash dumps kernels we simply copy the data from
774 	 * user space to it's destination.
775 	 * We do things a page at a time for the sake of kmap.
776 	 */
777 	unsigned long maddr;
778 	size_t ubytes, mbytes;
779 	int result;
780 	unsigned char __user *buf = NULL;
781 	unsigned char *kbuf = NULL;
782 
783 	result = 0;
784 	if (image->file_mode)
785 		kbuf = segment->kbuf;
786 	else
787 		buf = segment->buf;
788 	ubytes = segment->bufsz;
789 	mbytes = segment->memsz;
790 	maddr = segment->mem;
791 	while (mbytes) {
792 		struct page *page;
793 		char *ptr;
794 		size_t uchunk, mchunk;
795 
796 		page = pfn_to_page(maddr >> PAGE_SHIFT);
797 		if (!page) {
798 			result  = -ENOMEM;
799 			goto out;
800 		}
801 		ptr = kmap(page);
802 		ptr += maddr & ~PAGE_MASK;
803 		mchunk = min_t(size_t, mbytes,
804 				PAGE_SIZE - (maddr & ~PAGE_MASK));
805 		uchunk = min(ubytes, mchunk);
806 		if (mchunk > uchunk) {
807 			/* Zero the trailing part of the page */
808 			memset(ptr + uchunk, 0, mchunk - uchunk);
809 		}
810 
811 		/* For file based kexec, source pages are in kernel memory */
812 		if (image->file_mode)
813 			memcpy(ptr, kbuf, uchunk);
814 		else
815 			result = copy_from_user(ptr, buf, uchunk);
816 		kexec_flush_icache_page(page);
817 		kunmap(page);
818 		if (result) {
819 			result = -EFAULT;
820 			goto out;
821 		}
822 		ubytes -= uchunk;
823 		maddr  += mchunk;
824 		if (image->file_mode)
825 			kbuf += mchunk;
826 		else
827 			buf += mchunk;
828 		mbytes -= mchunk;
829 	}
830 out:
831 	return result;
832 }
833 
834 int kimage_load_segment(struct kimage *image,
835 				struct kexec_segment *segment)
836 {
837 	int result = -ENOMEM;
838 
839 	switch (image->type) {
840 	case KEXEC_TYPE_DEFAULT:
841 		result = kimage_load_normal_segment(image, segment);
842 		break;
843 	case KEXEC_TYPE_CRASH:
844 		result = kimage_load_crash_segment(image, segment);
845 		break;
846 	}
847 
848 	return result;
849 }
850 
851 struct kimage *kexec_image;
852 struct kimage *kexec_crash_image;
853 int kexec_load_disabled;
854 
855 /*
856  * No panic_cpu check version of crash_kexec().  This function is called
857  * only when panic_cpu holds the current CPU number; this is the only CPU
858  * which processes crash_kexec routines.
859  */
860 void __crash_kexec(struct pt_regs *regs)
861 {
862 	/* Take the kexec_mutex here to prevent sys_kexec_load
863 	 * running on one cpu from replacing the crash kernel
864 	 * we are using after a panic on a different cpu.
865 	 *
866 	 * If the crash kernel was not located in a fixed area
867 	 * of memory the xchg(&kexec_crash_image) would be
868 	 * sufficient.  But since I reuse the memory...
869 	 */
870 	if (mutex_trylock(&kexec_mutex)) {
871 		if (kexec_crash_image) {
872 			struct pt_regs fixed_regs;
873 
874 			crash_setup_regs(&fixed_regs, regs);
875 			crash_save_vmcoreinfo();
876 			machine_crash_shutdown(&fixed_regs);
877 			machine_kexec(kexec_crash_image);
878 		}
879 		mutex_unlock(&kexec_mutex);
880 	}
881 }
882 
883 void crash_kexec(struct pt_regs *regs)
884 {
885 	int old_cpu, this_cpu;
886 
887 	/*
888 	 * Only one CPU is allowed to execute the crash_kexec() code as with
889 	 * panic().  Otherwise parallel calls of panic() and crash_kexec()
890 	 * may stop each other.  To exclude them, we use panic_cpu here too.
891 	 */
892 	this_cpu = raw_smp_processor_id();
893 	old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu);
894 	if (old_cpu == PANIC_CPU_INVALID) {
895 		/* This is the 1st CPU which comes here, so go ahead. */
896 		printk_nmi_flush_on_panic();
897 		__crash_kexec(regs);
898 
899 		/*
900 		 * Reset panic_cpu to allow another panic()/crash_kexec()
901 		 * call.
902 		 */
903 		atomic_set(&panic_cpu, PANIC_CPU_INVALID);
904 	}
905 }
906 
907 size_t crash_get_memory_size(void)
908 {
909 	size_t size = 0;
910 
911 	mutex_lock(&kexec_mutex);
912 	if (crashk_res.end != crashk_res.start)
913 		size = resource_size(&crashk_res);
914 	mutex_unlock(&kexec_mutex);
915 	return size;
916 }
917 
918 void __weak crash_free_reserved_phys_range(unsigned long begin,
919 					   unsigned long end)
920 {
921 	unsigned long addr;
922 
923 	for (addr = begin; addr < end; addr += PAGE_SIZE)
924 		free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT));
925 }
926 
927 int crash_shrink_memory(unsigned long new_size)
928 {
929 	int ret = 0;
930 	unsigned long start, end;
931 	unsigned long old_size;
932 	struct resource *ram_res;
933 
934 	mutex_lock(&kexec_mutex);
935 
936 	if (kexec_crash_image) {
937 		ret = -ENOENT;
938 		goto unlock;
939 	}
940 	start = crashk_res.start;
941 	end = crashk_res.end;
942 	old_size = (end == 0) ? 0 : end - start + 1;
943 	if (new_size >= old_size) {
944 		ret = (new_size == old_size) ? 0 : -EINVAL;
945 		goto unlock;
946 	}
947 
948 	ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL);
949 	if (!ram_res) {
950 		ret = -ENOMEM;
951 		goto unlock;
952 	}
953 
954 	start = roundup(start, KEXEC_CRASH_MEM_ALIGN);
955 	end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN);
956 
957 	crash_free_reserved_phys_range(end, crashk_res.end);
958 
959 	if ((start == end) && (crashk_res.parent != NULL))
960 		release_resource(&crashk_res);
961 
962 	ram_res->start = end;
963 	ram_res->end = crashk_res.end;
964 	ram_res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM;
965 	ram_res->name = "System RAM";
966 
967 	crashk_res.end = end - 1;
968 
969 	insert_resource(&iomem_resource, ram_res);
970 
971 unlock:
972 	mutex_unlock(&kexec_mutex);
973 	return ret;
974 }
975 
976 static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
977 			    size_t data_len)
978 {
979 	struct elf_note note;
980 
981 	note.n_namesz = strlen(name) + 1;
982 	note.n_descsz = data_len;
983 	note.n_type   = type;
984 	memcpy(buf, &note, sizeof(note));
985 	buf += (sizeof(note) + 3)/4;
986 	memcpy(buf, name, note.n_namesz);
987 	buf += (note.n_namesz + 3)/4;
988 	memcpy(buf, data, note.n_descsz);
989 	buf += (note.n_descsz + 3)/4;
990 
991 	return buf;
992 }
993 
994 static void final_note(u32 *buf)
995 {
996 	struct elf_note note;
997 
998 	note.n_namesz = 0;
999 	note.n_descsz = 0;
1000 	note.n_type   = 0;
1001 	memcpy(buf, &note, sizeof(note));
1002 }
1003 
1004 void crash_save_cpu(struct pt_regs *regs, int cpu)
1005 {
1006 	struct elf_prstatus prstatus;
1007 	u32 *buf;
1008 
1009 	if ((cpu < 0) || (cpu >= nr_cpu_ids))
1010 		return;
1011 
1012 	/* Using ELF notes here is opportunistic.
1013 	 * I need a well defined structure format
1014 	 * for the data I pass, and I need tags
1015 	 * on the data to indicate what information I have
1016 	 * squirrelled away.  ELF notes happen to provide
1017 	 * all of that, so there is no need to invent something new.
1018 	 */
1019 	buf = (u32 *)per_cpu_ptr(crash_notes, cpu);
1020 	if (!buf)
1021 		return;
1022 	memset(&prstatus, 0, sizeof(prstatus));
1023 	prstatus.pr_pid = current->pid;
1024 	elf_core_copy_kernel_regs(&prstatus.pr_reg, regs);
1025 	buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
1026 			      &prstatus, sizeof(prstatus));
1027 	final_note(buf);
1028 }
1029 
1030 static int __init crash_notes_memory_init(void)
1031 {
1032 	/* Allocate memory for saving cpu registers. */
1033 	size_t size, align;
1034 
1035 	/*
1036 	 * crash_notes could be allocated across 2 vmalloc pages when percpu
1037 	 * is vmalloc based . vmalloc doesn't guarantee 2 continuous vmalloc
1038 	 * pages are also on 2 continuous physical pages. In this case the
1039 	 * 2nd part of crash_notes in 2nd page could be lost since only the
1040 	 * starting address and size of crash_notes are exported through sysfs.
1041 	 * Here round up the size of crash_notes to the nearest power of two
1042 	 * and pass it to __alloc_percpu as align value. This can make sure
1043 	 * crash_notes is allocated inside one physical page.
1044 	 */
1045 	size = sizeof(note_buf_t);
1046 	align = min(roundup_pow_of_two(sizeof(note_buf_t)), PAGE_SIZE);
1047 
1048 	/*
1049 	 * Break compile if size is bigger than PAGE_SIZE since crash_notes
1050 	 * definitely will be in 2 pages with that.
1051 	 */
1052 	BUILD_BUG_ON(size > PAGE_SIZE);
1053 
1054 	crash_notes = __alloc_percpu(size, align);
1055 	if (!crash_notes) {
1056 		pr_warn("Memory allocation for saving cpu register states failed\n");
1057 		return -ENOMEM;
1058 	}
1059 	return 0;
1060 }
1061 subsys_initcall(crash_notes_memory_init);
1062 
1063 
1064 /*
1065  * parsing the "crashkernel" commandline
1066  *
1067  * this code is intended to be called from architecture specific code
1068  */
1069 
1070 
1071 /*
1072  * This function parses command lines in the format
1073  *
1074  *   crashkernel=ramsize-range:size[,...][@offset]
1075  *
1076  * The function returns 0 on success and -EINVAL on failure.
1077  */
1078 static int __init parse_crashkernel_mem(char *cmdline,
1079 					unsigned long long system_ram,
1080 					unsigned long long *crash_size,
1081 					unsigned long long *crash_base)
1082 {
1083 	char *cur = cmdline, *tmp;
1084 
1085 	/* for each entry of the comma-separated list */
1086 	do {
1087 		unsigned long long start, end = ULLONG_MAX, size;
1088 
1089 		/* get the start of the range */
1090 		start = memparse(cur, &tmp);
1091 		if (cur == tmp) {
1092 			pr_warn("crashkernel: Memory value expected\n");
1093 			return -EINVAL;
1094 		}
1095 		cur = tmp;
1096 		if (*cur != '-') {
1097 			pr_warn("crashkernel: '-' expected\n");
1098 			return -EINVAL;
1099 		}
1100 		cur++;
1101 
1102 		/* if no ':' is here, than we read the end */
1103 		if (*cur != ':') {
1104 			end = memparse(cur, &tmp);
1105 			if (cur == tmp) {
1106 				pr_warn("crashkernel: Memory value expected\n");
1107 				return -EINVAL;
1108 			}
1109 			cur = tmp;
1110 			if (end <= start) {
1111 				pr_warn("crashkernel: end <= start\n");
1112 				return -EINVAL;
1113 			}
1114 		}
1115 
1116 		if (*cur != ':') {
1117 			pr_warn("crashkernel: ':' expected\n");
1118 			return -EINVAL;
1119 		}
1120 		cur++;
1121 
1122 		size = memparse(cur, &tmp);
1123 		if (cur == tmp) {
1124 			pr_warn("Memory value expected\n");
1125 			return -EINVAL;
1126 		}
1127 		cur = tmp;
1128 		if (size >= system_ram) {
1129 			pr_warn("crashkernel: invalid size\n");
1130 			return -EINVAL;
1131 		}
1132 
1133 		/* match ? */
1134 		if (system_ram >= start && system_ram < end) {
1135 			*crash_size = size;
1136 			break;
1137 		}
1138 	} while (*cur++ == ',');
1139 
1140 	if (*crash_size > 0) {
1141 		while (*cur && *cur != ' ' && *cur != '@')
1142 			cur++;
1143 		if (*cur == '@') {
1144 			cur++;
1145 			*crash_base = memparse(cur, &tmp);
1146 			if (cur == tmp) {
1147 				pr_warn("Memory value expected after '@'\n");
1148 				return -EINVAL;
1149 			}
1150 		}
1151 	}
1152 
1153 	return 0;
1154 }
1155 
1156 /*
1157  * That function parses "simple" (old) crashkernel command lines like
1158  *
1159  *	crashkernel=size[@offset]
1160  *
1161  * It returns 0 on success and -EINVAL on failure.
1162  */
1163 static int __init parse_crashkernel_simple(char *cmdline,
1164 					   unsigned long long *crash_size,
1165 					   unsigned long long *crash_base)
1166 {
1167 	char *cur = cmdline;
1168 
1169 	*crash_size = memparse(cmdline, &cur);
1170 	if (cmdline == cur) {
1171 		pr_warn("crashkernel: memory value expected\n");
1172 		return -EINVAL;
1173 	}
1174 
1175 	if (*cur == '@')
1176 		*crash_base = memparse(cur+1, &cur);
1177 	else if (*cur != ' ' && *cur != '\0') {
1178 		pr_warn("crashkernel: unrecognized char: %c\n", *cur);
1179 		return -EINVAL;
1180 	}
1181 
1182 	return 0;
1183 }
1184 
1185 #define SUFFIX_HIGH 0
1186 #define SUFFIX_LOW  1
1187 #define SUFFIX_NULL 2
1188 static __initdata char *suffix_tbl[] = {
1189 	[SUFFIX_HIGH] = ",high",
1190 	[SUFFIX_LOW]  = ",low",
1191 	[SUFFIX_NULL] = NULL,
1192 };
1193 
1194 /*
1195  * That function parses "suffix"  crashkernel command lines like
1196  *
1197  *	crashkernel=size,[high|low]
1198  *
1199  * It returns 0 on success and -EINVAL on failure.
1200  */
1201 static int __init parse_crashkernel_suffix(char *cmdline,
1202 					   unsigned long long	*crash_size,
1203 					   const char *suffix)
1204 {
1205 	char *cur = cmdline;
1206 
1207 	*crash_size = memparse(cmdline, &cur);
1208 	if (cmdline == cur) {
1209 		pr_warn("crashkernel: memory value expected\n");
1210 		return -EINVAL;
1211 	}
1212 
1213 	/* check with suffix */
1214 	if (strncmp(cur, suffix, strlen(suffix))) {
1215 		pr_warn("crashkernel: unrecognized char: %c\n", *cur);
1216 		return -EINVAL;
1217 	}
1218 	cur += strlen(suffix);
1219 	if (*cur != ' ' && *cur != '\0') {
1220 		pr_warn("crashkernel: unrecognized char: %c\n", *cur);
1221 		return -EINVAL;
1222 	}
1223 
1224 	return 0;
1225 }
1226 
1227 static __init char *get_last_crashkernel(char *cmdline,
1228 			     const char *name,
1229 			     const char *suffix)
1230 {
1231 	char *p = cmdline, *ck_cmdline = NULL;
1232 
1233 	/* find crashkernel and use the last one if there are more */
1234 	p = strstr(p, name);
1235 	while (p) {
1236 		char *end_p = strchr(p, ' ');
1237 		char *q;
1238 
1239 		if (!end_p)
1240 			end_p = p + strlen(p);
1241 
1242 		if (!suffix) {
1243 			int i;
1244 
1245 			/* skip the one with any known suffix */
1246 			for (i = 0; suffix_tbl[i]; i++) {
1247 				q = end_p - strlen(suffix_tbl[i]);
1248 				if (!strncmp(q, suffix_tbl[i],
1249 					     strlen(suffix_tbl[i])))
1250 					goto next;
1251 			}
1252 			ck_cmdline = p;
1253 		} else {
1254 			q = end_p - strlen(suffix);
1255 			if (!strncmp(q, suffix, strlen(suffix)))
1256 				ck_cmdline = p;
1257 		}
1258 next:
1259 		p = strstr(p+1, name);
1260 	}
1261 
1262 	if (!ck_cmdline)
1263 		return NULL;
1264 
1265 	return ck_cmdline;
1266 }
1267 
1268 static int __init __parse_crashkernel(char *cmdline,
1269 			     unsigned long long system_ram,
1270 			     unsigned long long *crash_size,
1271 			     unsigned long long *crash_base,
1272 			     const char *name,
1273 			     const char *suffix)
1274 {
1275 	char	*first_colon, *first_space;
1276 	char	*ck_cmdline;
1277 
1278 	BUG_ON(!crash_size || !crash_base);
1279 	*crash_size = 0;
1280 	*crash_base = 0;
1281 
1282 	ck_cmdline = get_last_crashkernel(cmdline, name, suffix);
1283 
1284 	if (!ck_cmdline)
1285 		return -EINVAL;
1286 
1287 	ck_cmdline += strlen(name);
1288 
1289 	if (suffix)
1290 		return parse_crashkernel_suffix(ck_cmdline, crash_size,
1291 				suffix);
1292 	/*
1293 	 * if the commandline contains a ':', then that's the extended
1294 	 * syntax -- if not, it must be the classic syntax
1295 	 */
1296 	first_colon = strchr(ck_cmdline, ':');
1297 	first_space = strchr(ck_cmdline, ' ');
1298 	if (first_colon && (!first_space || first_colon < first_space))
1299 		return parse_crashkernel_mem(ck_cmdline, system_ram,
1300 				crash_size, crash_base);
1301 
1302 	return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base);
1303 }
1304 
1305 /*
1306  * That function is the entry point for command line parsing and should be
1307  * called from the arch-specific code.
1308  */
1309 int __init parse_crashkernel(char *cmdline,
1310 			     unsigned long long system_ram,
1311 			     unsigned long long *crash_size,
1312 			     unsigned long long *crash_base)
1313 {
1314 	return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
1315 					"crashkernel=", NULL);
1316 }
1317 
1318 int __init parse_crashkernel_high(char *cmdline,
1319 			     unsigned long long system_ram,
1320 			     unsigned long long *crash_size,
1321 			     unsigned long long *crash_base)
1322 {
1323 	return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
1324 				"crashkernel=", suffix_tbl[SUFFIX_HIGH]);
1325 }
1326 
1327 int __init parse_crashkernel_low(char *cmdline,
1328 			     unsigned long long system_ram,
1329 			     unsigned long long *crash_size,
1330 			     unsigned long long *crash_base)
1331 {
1332 	return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
1333 				"crashkernel=", suffix_tbl[SUFFIX_LOW]);
1334 }
1335 
1336 static void update_vmcoreinfo_note(void)
1337 {
1338 	u32 *buf = vmcoreinfo_note;
1339 
1340 	if (!vmcoreinfo_size)
1341 		return;
1342 	buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data,
1343 			      vmcoreinfo_size);
1344 	final_note(buf);
1345 }
1346 
1347 void crash_save_vmcoreinfo(void)
1348 {
1349 	vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds());
1350 	update_vmcoreinfo_note();
1351 }
1352 
1353 void vmcoreinfo_append_str(const char *fmt, ...)
1354 {
1355 	va_list args;
1356 	char buf[0x50];
1357 	size_t r;
1358 
1359 	va_start(args, fmt);
1360 	r = vscnprintf(buf, sizeof(buf), fmt, args);
1361 	va_end(args);
1362 
1363 	r = min(r, vmcoreinfo_max_size - vmcoreinfo_size);
1364 
1365 	memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r);
1366 
1367 	vmcoreinfo_size += r;
1368 }
1369 
1370 /*
1371  * provide an empty default implementation here -- architecture
1372  * code may override this
1373  */
1374 void __weak arch_crash_save_vmcoreinfo(void)
1375 {}
1376 
1377 unsigned long __weak paddr_vmcoreinfo_note(void)
1378 {
1379 	return __pa((unsigned long)(char *)&vmcoreinfo_note);
1380 }
1381 
1382 static int __init crash_save_vmcoreinfo_init(void)
1383 {
1384 	VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
1385 	VMCOREINFO_PAGESIZE(PAGE_SIZE);
1386 
1387 	VMCOREINFO_SYMBOL(init_uts_ns);
1388 	VMCOREINFO_SYMBOL(node_online_map);
1389 #ifdef CONFIG_MMU
1390 	VMCOREINFO_SYMBOL(swapper_pg_dir);
1391 #endif
1392 	VMCOREINFO_SYMBOL(_stext);
1393 	VMCOREINFO_SYMBOL(vmap_area_list);
1394 
1395 #ifndef CONFIG_NEED_MULTIPLE_NODES
1396 	VMCOREINFO_SYMBOL(mem_map);
1397 	VMCOREINFO_SYMBOL(contig_page_data);
1398 #endif
1399 #ifdef CONFIG_SPARSEMEM
1400 	VMCOREINFO_SYMBOL(mem_section);
1401 	VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
1402 	VMCOREINFO_STRUCT_SIZE(mem_section);
1403 	VMCOREINFO_OFFSET(mem_section, section_mem_map);
1404 #endif
1405 	VMCOREINFO_STRUCT_SIZE(page);
1406 	VMCOREINFO_STRUCT_SIZE(pglist_data);
1407 	VMCOREINFO_STRUCT_SIZE(zone);
1408 	VMCOREINFO_STRUCT_SIZE(free_area);
1409 	VMCOREINFO_STRUCT_SIZE(list_head);
1410 	VMCOREINFO_SIZE(nodemask_t);
1411 	VMCOREINFO_OFFSET(page, flags);
1412 	VMCOREINFO_OFFSET(page, _refcount);
1413 	VMCOREINFO_OFFSET(page, mapping);
1414 	VMCOREINFO_OFFSET(page, lru);
1415 	VMCOREINFO_OFFSET(page, _mapcount);
1416 	VMCOREINFO_OFFSET(page, private);
1417 	VMCOREINFO_OFFSET(page, compound_dtor);
1418 	VMCOREINFO_OFFSET(page, compound_order);
1419 	VMCOREINFO_OFFSET(page, compound_head);
1420 	VMCOREINFO_OFFSET(pglist_data, node_zones);
1421 	VMCOREINFO_OFFSET(pglist_data, nr_zones);
1422 #ifdef CONFIG_FLAT_NODE_MEM_MAP
1423 	VMCOREINFO_OFFSET(pglist_data, node_mem_map);
1424 #endif
1425 	VMCOREINFO_OFFSET(pglist_data, node_start_pfn);
1426 	VMCOREINFO_OFFSET(pglist_data, node_spanned_pages);
1427 	VMCOREINFO_OFFSET(pglist_data, node_id);
1428 	VMCOREINFO_OFFSET(zone, free_area);
1429 	VMCOREINFO_OFFSET(zone, vm_stat);
1430 	VMCOREINFO_OFFSET(zone, spanned_pages);
1431 	VMCOREINFO_OFFSET(free_area, free_list);
1432 	VMCOREINFO_OFFSET(list_head, next);
1433 	VMCOREINFO_OFFSET(list_head, prev);
1434 	VMCOREINFO_OFFSET(vmap_area, va_start);
1435 	VMCOREINFO_OFFSET(vmap_area, list);
1436 	VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
1437 	log_buf_kexec_setup();
1438 	VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
1439 	VMCOREINFO_NUMBER(NR_FREE_PAGES);
1440 	VMCOREINFO_NUMBER(PG_lru);
1441 	VMCOREINFO_NUMBER(PG_private);
1442 	VMCOREINFO_NUMBER(PG_swapcache);
1443 	VMCOREINFO_NUMBER(PG_slab);
1444 #ifdef CONFIG_MEMORY_FAILURE
1445 	VMCOREINFO_NUMBER(PG_hwpoison);
1446 #endif
1447 	VMCOREINFO_NUMBER(PG_head_mask);
1448 	VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
1449 #ifdef CONFIG_X86
1450 	VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE);
1451 #endif
1452 #ifdef CONFIG_HUGETLB_PAGE
1453 	VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR);
1454 #endif
1455 
1456 	arch_crash_save_vmcoreinfo();
1457 	update_vmcoreinfo_note();
1458 
1459 	return 0;
1460 }
1461 
1462 subsys_initcall(crash_save_vmcoreinfo_init);
1463 
1464 /*
1465  * Move into place and start executing a preloaded standalone
1466  * executable.  If nothing was preloaded return an error.
1467  */
1468 int kernel_kexec(void)
1469 {
1470 	int error = 0;
1471 
1472 	if (!mutex_trylock(&kexec_mutex))
1473 		return -EBUSY;
1474 	if (!kexec_image) {
1475 		error = -EINVAL;
1476 		goto Unlock;
1477 	}
1478 
1479 #ifdef CONFIG_KEXEC_JUMP
1480 	if (kexec_image->preserve_context) {
1481 		lock_system_sleep();
1482 		pm_prepare_console();
1483 		error = freeze_processes();
1484 		if (error) {
1485 			error = -EBUSY;
1486 			goto Restore_console;
1487 		}
1488 		suspend_console();
1489 		error = dpm_suspend_start(PMSG_FREEZE);
1490 		if (error)
1491 			goto Resume_console;
1492 		/* At this point, dpm_suspend_start() has been called,
1493 		 * but *not* dpm_suspend_end(). We *must* call
1494 		 * dpm_suspend_end() now.  Otherwise, drivers for
1495 		 * some devices (e.g. interrupt controllers) become
1496 		 * desynchronized with the actual state of the
1497 		 * hardware at resume time, and evil weirdness ensues.
1498 		 */
1499 		error = dpm_suspend_end(PMSG_FREEZE);
1500 		if (error)
1501 			goto Resume_devices;
1502 		error = disable_nonboot_cpus();
1503 		if (error)
1504 			goto Enable_cpus;
1505 		local_irq_disable();
1506 		error = syscore_suspend();
1507 		if (error)
1508 			goto Enable_irqs;
1509 	} else
1510 #endif
1511 	{
1512 		kexec_in_progress = true;
1513 		kernel_restart_prepare(NULL);
1514 		migrate_to_reboot_cpu();
1515 
1516 		/*
1517 		 * migrate_to_reboot_cpu() disables CPU hotplug assuming that
1518 		 * no further code needs to use CPU hotplug (which is true in
1519 		 * the reboot case). However, the kexec path depends on using
1520 		 * CPU hotplug again; so re-enable it here.
1521 		 */
1522 		cpu_hotplug_enable();
1523 		pr_emerg("Starting new kernel\n");
1524 		machine_shutdown();
1525 	}
1526 
1527 	machine_kexec(kexec_image);
1528 
1529 #ifdef CONFIG_KEXEC_JUMP
1530 	if (kexec_image->preserve_context) {
1531 		syscore_resume();
1532  Enable_irqs:
1533 		local_irq_enable();
1534  Enable_cpus:
1535 		enable_nonboot_cpus();
1536 		dpm_resume_start(PMSG_RESTORE);
1537  Resume_devices:
1538 		dpm_resume_end(PMSG_RESTORE);
1539  Resume_console:
1540 		resume_console();
1541 		thaw_processes();
1542  Restore_console:
1543 		pm_restore_console();
1544 		unlock_system_sleep();
1545 	}
1546 #endif
1547 
1548  Unlock:
1549 	mutex_unlock(&kexec_mutex);
1550 	return error;
1551 }
1552 
1553 /*
1554  * Protection mechanism for crashkernel reserved memory after
1555  * the kdump kernel is loaded.
1556  *
1557  * Provide an empty default implementation here -- architecture
1558  * code may override this
1559  */
1560 void __weak arch_kexec_protect_crashkres(void)
1561 {}
1562 
1563 void __weak arch_kexec_unprotect_crashkres(void)
1564 {}
1565