xref: /openbmc/linux/kernel/kexec.c (revision d5cb9783536a41df9f9cba5b0a1d78047ed787f7)
1 /*
2  * kexec.c - kexec system call
3  * Copyright (C) 2002-2004 Eric Biederman  <ebiederm@xmission.com>
4  *
5  * This source code is licensed under the GNU General Public License,
6  * Version 2.  See the file COPYING for more details.
7  */
8 
9 #include <linux/mm.h>
10 #include <linux/file.h>
11 #include <linux/slab.h>
12 #include <linux/fs.h>
13 #include <linux/kexec.h>
14 #include <linux/spinlock.h>
15 #include <linux/list.h>
16 #include <linux/highmem.h>
17 #include <linux/syscalls.h>
18 #include <linux/reboot.h>
20 #include <linux/ioport.h>
21 #include <linux/hardirq.h>
22 
23 #include <asm/page.h>
24 #include <asm/uaccess.h>
25 #include <asm/io.h>
26 #include <asm/system.h>
27 #include <asm/semaphore.h>
28 
29 /* Location of the reserved area for the crash kernel */
30 struct resource crashk_res = {
31 	.name  = "Crash kernel",
32 	.start = 0,
33 	.end   = 0,
34 	.flags = IORESOURCE_BUSY | IORESOURCE_MEM
35 };
36 
37 int kexec_should_crash(struct task_struct *p)
38 {
39 	if (in_interrupt() || !p->pid || p->pid == 1 || panic_on_oops)
40 		return 1;
41 	return 0;
42 }
43 
44 /*
45  * When kexec transitions to the new kernel there is a one-to-one
46  * mapping between physical and virtual addresses.  On processors
47  * where you can disable the MMU this is trivial.  For
48  * others it is still a simple, predictable page table to set up.
49  *
50  * In that environment kexec copies the new kernel to its final
51  * resting place.  This means I can only support memory whose
52  * physical address can fit in an unsigned long.  In particular
53  * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
54  * If the assembly stub has more restrictive requirements
55  * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
56  * defined more restrictively in <asm/kexec.h>.
57  *
58  * The code for the transition from the current kernel to the
59  * new kernel is placed in the control_code_buffer, whose size
60  * is given by KEXEC_CONTROL_CODE_SIZE.  In the best case only a single
61  * page of memory is necessary, but some architectures require more.
62  * Because this memory must be identity mapped in the transition from
63  * virtual to physical addresses it must live in the range
64  * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
65  * modifiable.
66  *
67  * The assembly stub in the control code buffer is passed a linked list
68  * of descriptor pages detailing the source pages of the new kernel,
69  * and the destination addresses of those source pages.  As this data
70  * structure is not used in the context of the current OS, it must
71  * be self-contained.
72  *
73  * The code has been made to work with highmem pages and will use a
74  * destination page in its final resting place (if it happens
75  * to allocate it).  The end product of this is that most of the
76  * physical address space, and most of RAM can be used.
77  *
78  * Future directions include:
79  *  - allocating a page table with the control code buffer identity
80  *    mapped, to simplify machine_kexec and make kexec_on_panic more
81  *    reliable.
82  */
83 
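/*
 * Editorial sketch (not part of the original file): the page list handed
 * to the reboot code is a chain of kimage_entry_t values, each a physical
 * address tagged with one of the IND_* flags in its low bits.  The
 * hypothetical walker below shows how a consumer of that chain would
 * interpret it; for_each_kimage_entry() further down in this file is the
 * real, macro-based equivalent.
 */
#if 0	/* illustration only, not compiled */
static void example_walk_entries(kimage_entry_t *head)
{
	kimage_entry_t *ptr = head;
	unsigned long dest = 0;

	while (*ptr && !(*ptr & IND_DONE)) {
		kimage_entry_t entry = *ptr;

		if (entry & IND_DESTINATION) {
			/* Set the cursor where the following sources land */
			dest = entry & PAGE_MASK;
			ptr++;
		} else if (entry & IND_SOURCE) {
			/* One source page destined for dest, then advance */
			dest += PAGE_SIZE;
			ptr++;
		} else if (entry & IND_INDIRECTION) {
			/* Continue in the next array of entries */
			ptr = phys_to_virt(entry & PAGE_MASK);
		} else {
			break;	/* malformed entry */
		}
	}
}
#endif
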
84 /*
85  * KIMAGE_NO_DEST is an impossible destination address, used for
86  * allocating pages whose destination address we do not care about.
87  */
88 #define KIMAGE_NO_DEST (-1UL)
89 
90 static int kimage_is_destination_range(struct kimage *image,
91 				       unsigned long start, unsigned long end);
92 static struct page *kimage_alloc_page(struct kimage *image,
93 				       gfp_t gfp_mask,
94 				       unsigned long dest);
95 
96 static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
97 	                    unsigned long nr_segments,
98                             struct kexec_segment __user *segments)
99 {
100 	size_t segment_bytes;
101 	struct kimage *image;
102 	unsigned long i;
103 	int result;
104 
105 	/* Allocate a controlling structure */
106 	result = -ENOMEM;
107 	image = kmalloc(sizeof(*image), GFP_KERNEL);
108 	if (!image)
109 		goto out;
110 
111 	memset(image, 0, sizeof(*image));
112 	image->head = 0;
113 	image->entry = &image->head;
114 	image->last_entry = &image->head;
115 	image->control_page = ~0; /* By default this does not apply */
116 	image->start = entry;
117 	image->type = KEXEC_TYPE_DEFAULT;
118 
119 	/* Initialize the list of control pages */
120 	INIT_LIST_HEAD(&image->control_pages);
121 
122 	/* Initialize the list of destination pages */
123 	INIT_LIST_HEAD(&image->dest_pages);
124 
125 	/* Initialize the list of unuseable pages */
126 	INIT_LIST_HEAD(&image->unuseable_pages);
127 
128 	/* Read in the segments */
129 	image->nr_segments = nr_segments;
130 	segment_bytes = nr_segments * sizeof(*segments);
131 	result = copy_from_user(image->segment, segments, segment_bytes);
132 	if (result)
133 		goto out;
134 
135 	/*
136 	 * Verify we have good destination addresses.  The caller is
137 	 * responsible for making certain we don't attempt to load
138 	 * the new image into invalid or reserved areas of RAM.  This
139 	 * just verifies it is an address we can use.
140 	 *
141 	 * Since the kernel does everything in page size chunks, ensure
142 	 * the destination addresses are page aligned.  Too many
143 	 * special cases crop up when we don't do this.  The most
144 	 * insidious is getting overlapping destination addresses
145 	 * simply because addresses are changed to page size
146 	 * granularity.
147 	 */
148 	result = -EADDRNOTAVAIL;
149 	for (i = 0; i < nr_segments; i++) {
150 		unsigned long mstart, mend;
151 
152 		mstart = image->segment[i].mem;
153 		mend   = mstart + image->segment[i].memsz;
154 		if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
155 			goto out;
156 		if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
157 			goto out;
158 	}
159 
160 	/* Verify our destination addresses do not overlap.
161 	 * If we allowed overlapping destination addresses
162 	 * through, very weird things can happen with no
163 	 * easy explanation as one segment stomps on another.
164 	 */
165 	result = -EINVAL;
166 	for (i = 0; i < nr_segments; i++) {
167 		unsigned long mstart, mend;
168 		unsigned long j;
169 
170 		mstart = image->segment[i].mem;
171 		mend   = mstart + image->segment[i].memsz;
172 		for (j = 0; j < i; j++) {
173 			unsigned long pstart, pend;
174 			pstart = image->segment[j].mem;
175 			pend   = pstart + image->segment[j].memsz;
176 			/* Do the segments overlap ? */
177 			if ((mend > pstart) && (mstart < pend))
178 				goto out;
179 		}
180 	}
181 
182 	/* Ensure our buffer sizes are strictly less than
183 	 * our memory sizes.  This should always be the case,
184 	 * and it is easier to check up front than to be surprised
185 	 * later on.
186 	 */
187 	result = -EINVAL;
188 	for (i = 0; i < nr_segments; i++) {
189 		if (image->segment[i].bufsz > image->segment[i].memsz)
190 			goto out;
191 	}
192 
193 	result = 0;
194 out:
195 	if (result == 0)
196 		*rimage = image;
197 	else
198 		kfree(image);
199 
200 	return result;
201 
202 }
203 
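/*
 * Editorial sketch (not in the original source): the overlap check above
 * treats each segment as the half-open range [mem, mem + memsz).  Two such
 * ranges intersect exactly when each one starts before the other ends,
 * which is the (mend > pstart) && (mstart < pend) test in do_kimage_alloc().
 */
#if 0	/* illustration only, not compiled */
static int example_segments_overlap(const struct kexec_segment *a,
				    const struct kexec_segment *b)
{
	unsigned long a_end = a->mem + a->memsz;
	unsigned long b_end = b->mem + b->memsz;

	return (a_end > b->mem) && (a->mem < b_end);
}
#endif
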
204 static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
205 				unsigned long nr_segments,
206 				struct kexec_segment __user *segments)
207 {
208 	int result;
209 	struct kimage *image;
210 
211 	/* Allocate and initialize a controlling structure */
212 	image = NULL;
213 	result = do_kimage_alloc(&image, entry, nr_segments, segments);
214 	if (result)
215 		goto out;
216 
217 	*rimage = image;
218 
219 	/*
220 	 * Find a location for the control code buffer, and add it to
221 	 * the vector of segments so that its pages will also be
222 	 * counted as destination pages.
223 	 */
224 	result = -ENOMEM;
225 	image->control_code_page = kimage_alloc_control_pages(image,
226 					   get_order(KEXEC_CONTROL_CODE_SIZE));
227 	if (!image->control_code_page) {
228 		printk(KERN_ERR "Could not allocate control_code_buffer\n");
229 		goto out;
230 	}
231 
232 	result = 0;
233  out:
234 	if (result == 0)
235 		*rimage = image;
236 	else
237 		kfree(image);
238 
239 	return result;
240 }
241 
242 static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
243 				unsigned long nr_segments,
244 				struct kexec_segment __user *segments)
245 {
246 	int result;
247 	struct kimage *image;
248 	unsigned long i;
249 
250 	image = NULL;
251 	/* Verify we have a valid entry point */
252 	if ((entry < crashk_res.start) || (entry > crashk_res.end)) {
253 		result = -EADDRNOTAVAIL;
254 		goto out;
255 	}
256 
257 	/* Allocate and initialize a controlling structure */
258 	result = do_kimage_alloc(&image, entry, nr_segments, segments);
259 	if (result)
260 		goto out;
261 
262 	/* Enable the special crash kernel control page
263 	 * allocation policy.
264 	 */
265 	image->control_page = crashk_res.start;
266 	image->type = KEXEC_TYPE_CRASH;
267 
268 	/*
269 	 * Verify we have good destination addresses.  Normally
270 	 * the caller is responsible for making certain we don't
271 	 * attempt to load the new image into invalid or reserved
272 	 * areas of RAM.  But crash kernels are preloaded into a
273 	 * reserved area of ram.  We must ensure the addresses
274 	 * are in the reserved area otherwise preloading the
275 	 * kernel could corrupt things.
276 	 */
277 	result = -EADDRNOTAVAIL;
278 	for (i = 0; i < nr_segments; i++) {
279 		unsigned long mstart, mend;
280 
281 		mstart = image->segment[i].mem;
282 		mend = mstart + image->segment[i].memsz - 1;
283 		/* Ensure we are within the crash kernel limits */
284 		if ((mstart < crashk_res.start) || (mend > crashk_res.end))
285 			goto out;
286 	}
287 
288 	/*
289 	 * Find a location for the control code buffer, and add it to
290 	 * the vector of segments so that its pages will also be
291 	 * counted as destination pages.
292 	 */
293 	result = -ENOMEM;
294 	image->control_code_page = kimage_alloc_control_pages(image,
295 					   get_order(KEXEC_CONTROL_CODE_SIZE));
296 	if (!image->control_code_page) {
297 		printk(KERN_ERR "Could not allocate control_code_buffer\n");
298 		goto out;
299 	}
300 
301 	result = 0;
302 out:
303 	if (result == 0)
304 		*rimage = image;
305 	else
306 		kfree(image);
307 
308 	return result;
309 }
310 
311 static int kimage_is_destination_range(struct kimage *image,
312 					unsigned long start,
313 					unsigned long end)
314 {
315 	unsigned long i;
316 
317 	for (i = 0; i < image->nr_segments; i++) {
318 		unsigned long mstart, mend;
319 
320 		mstart = image->segment[i].mem;
321 		mend = mstart + image->segment[i].memsz;
322 		if ((end > mstart) && (start < mend))
323 			return 1;
324 	}
325 
326 	return 0;
327 }
328 
329 static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
330 {
331 	struct page *pages;
332 
333 	pages = alloc_pages(gfp_mask, order);
334 	if (pages) {
335 		unsigned int count, i;
336 		pages->mapping = NULL;
337 		set_page_private(pages, order);
338 		count = 1 << order;
339 		for (i = 0; i < count; i++)
340 			SetPageReserved(pages + i);
341 	}
342 
343 	return pages;
344 }
345 
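/*
 * Editorial note (not in the original source): kimage_alloc_pages() stashes
 * the allocation order in page_private() and marks every page in the block
 * reserved; kimage_free_pages() below reads that order back so it can clear
 * the reserved bits and return the whole block with __free_pages().
 */
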
346 static void kimage_free_pages(struct page *page)
347 {
348 	unsigned int order, count, i;
349 
350 	order = page_private(page);
351 	count = 1 << order;
352 	for (i = 0; i < count; i++)
353 		ClearPageReserved(page + i);
354 	__free_pages(page, order);
355 }
356 
357 static void kimage_free_page_list(struct list_head *list)
358 {
359 	struct list_head *pos, *next;
360 
361 	list_for_each_safe(pos, next, list) {
362 		struct page *page;
363 
364 		page = list_entry(pos, struct page, lru);
365 		list_del(&page->lru);
366 		kimage_free_pages(page);
367 	}
368 }
369 
370 static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
371 							unsigned int order)
372 {
373 	/* Control pages are special; they are the intermediaries
374 	 * that are needed while we copy the rest of the pages
375 	 * to their final resting place.  As such they must
376 	 * not conflict with either the destination addresses
377 	 * or memory the kernel is already using.
378 	 *
379 	 * The only case where we really need more than one of
380 	 * these is on architectures where we cannot disable
381 	 * the MMU and must instead generate an identity mapped
382 	 * page table for all of the memory.
383 	 *
384 	 * At worst this runs in O(N) of the image size.
385 	 */
386 	struct list_head extra_pages;
387 	struct page *pages;
388 	unsigned int count;
389 
390 	count = 1 << order;
391 	INIT_LIST_HEAD(&extra_pages);
392 
393 	/* Loop while I can allocate a page and the page allocated
394 	 * is a destination page.
395 	 */
396 	do {
397 		unsigned long pfn, epfn, addr, eaddr;
398 
399 		pages = kimage_alloc_pages(GFP_KERNEL, order);
400 		if (!pages)
401 			break;
402 		pfn   = page_to_pfn(pages);
403 		epfn  = pfn + count;
404 		addr  = pfn << PAGE_SHIFT;
405 		eaddr = epfn << PAGE_SHIFT;
406 		if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
407 			      kimage_is_destination_range(image, addr, eaddr)) {
408 			list_add(&pages->lru, &extra_pages);
409 			pages = NULL;
410 		}
411 	} while (!pages);
412 
413 	if (pages) {
414 		/* Remember the allocated page... */
415 		list_add(&pages->lru, &image->control_pages);
416 
417 		/* Because the page is already in its destination
418 		 * location we will never allocate another page at
419 		 * that address.  Therefore kimage_alloc_pages
420 		 * will not return it (again) and we don't need
421 		 * to give it an entry in image->segment[].
422 		 */
423 	}
424 	/* Deal with the destination pages I have inadvertently allocated.
425 	 *
426 	 * Ideally I would convert multi-page allocations into single
427 	 * page allocations, and add everything to image->dest_pages.
428 	 *
429 	 * For now it is simpler to just free the pages.
430 	 */
431 	kimage_free_page_list(&extra_pages);
432 
433 	return pages;
434 }
435 
436 static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
437 						      unsigned int order)
438 {
439 	/* Control pages are special; they are the intermediaries
440 	 * that are needed while we copy the rest of the pages
441 	 * to their final resting place.  As such they must
442 	 * not conflict with either the destination addresses
443 	 * or memory the kernel is already using.
444 	 *
445 	 * Control pages are also the only pages we must allocate
446 	 * when loading a crash kernel.  All of the other pages
447 	 * are specified by the segments and we just memcpy
448 	 * into them directly.
449 	 *
450 	 * The only case where we really need more than one of
451 	 * these is on architectures where we cannot disable
452 	 * the MMU and must instead generate an identity mapped
453 	 * page table for all of the memory.
454 	 *
455 	 * Given the low demand this implements a very simple
456 	 * allocator that finds the first hole of the appropriate
457 	 * size in the reserved memory region, and allocates all
458 	 * of the memory up to and including the hole.
459 	 */
460 	unsigned long hole_start, hole_end, size;
461 	struct page *pages;
462 
463 	pages = NULL;
464 	size = (1 << order) << PAGE_SHIFT;
465 	hole_start = (image->control_page + (size - 1)) & ~(size - 1);
466 	hole_end   = hole_start + size - 1;
467 	while (hole_end <= crashk_res.end) {
468 		unsigned long i;
469 
470 		if (hole_end > KEXEC_CONTROL_MEMORY_LIMIT)
471 			break;
472 		if (hole_end > crashk_res.end)
473 			break;
474 		/* See if I overlap any of the segments */
475 		for (i = 0; i < image->nr_segments; i++) {
476 			unsigned long mstart, mend;
477 
478 			mstart = image->segment[i].mem;
479 			mend   = mstart + image->segment[i].memsz - 1;
480 			if ((hole_end >= mstart) && (hole_start <= mend)) {
481 				/* Advance the hole to the end of the segment */
482 				hole_start = (mend + (size - 1)) & ~(size - 1);
483 				hole_end   = hole_start + size - 1;
484 				break;
485 			}
486 		}
487 		/* If I don't overlap any segments I have found my hole! */
488 		if (i == image->nr_segments) {
489 			pages = pfn_to_page(hole_start >> PAGE_SHIFT);
490 			break;
491 		}
492 	}
493 	if (pages)
494 		image->control_page = hole_end;
495 
496 	return pages;
497 }
498 
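/*
 * Editorial sketch (hypothetical numbers, not in the original source):
 * with crashk_res spanning [16M, 80M), order 1 (an 8K hole on 4K pages)
 * and a single segment loaded at [16M, 32M), the search above first tries
 * the hole [16M, 16M + 8K), notices it overlaps the segment, rounds
 * hole_start up to the segment's end (32M), finds no further overlap and
 * returns the pages at 32M.  image->control_page is left at the hole's
 * last byte, so the next allocation resumes just above it.
 */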
499 
500 struct page *kimage_alloc_control_pages(struct kimage *image,
501 					 unsigned int order)
502 {
503 	struct page *pages = NULL;
504 
505 	switch (image->type) {
506 	case KEXEC_TYPE_DEFAULT:
507 		pages = kimage_alloc_normal_control_pages(image, order);
508 		break;
509 	case KEXEC_TYPE_CRASH:
510 		pages = kimage_alloc_crash_control_pages(image, order);
511 		break;
512 	}
513 
514 	return pages;
515 }
516 
517 static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
518 {
519 	if (*image->entry != 0)
520 		image->entry++;
521 
522 	if (image->entry == image->last_entry) {
523 		kimage_entry_t *ind_page;
524 		struct page *page;
525 
526 		page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
527 		if (!page)
528 			return -ENOMEM;
529 
530 		ind_page = page_address(page);
531 		*image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
532 		image->entry = ind_page;
533 		image->last_entry = ind_page +
534 				      ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
535 	}
536 	*image->entry = entry;
537 	image->entry++;
538 	*image->entry = 0;
539 
540 	return 0;
541 }
542 
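/*
 * Editorial note (not in the original source): kimage_add_entry() grows the
 * entry list one indirection page at a time.  last_entry points at the
 * final slot of the current page, so when that slot is reached it is still
 * free to hold the IND_INDIRECTION link to the next page:
 *
 *   [ entry | entry | ... | entry | IND_INDIRECTION -> next page ]
 *
 * The list is kept zero-terminated after every addition, which is what lets
 * kimage_terminate() simply overwrite the trailing zero with IND_DONE.
 */
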
543 static int kimage_set_destination(struct kimage *image,
544 				   unsigned long destination)
545 {
546 	int result;
547 
548 	destination &= PAGE_MASK;
549 	result = kimage_add_entry(image, destination | IND_DESTINATION);
550 	if (result == 0)
551 		image->destination = destination;
552 
553 	return result;
554 }
555 
556 
557 static int kimage_add_page(struct kimage *image, unsigned long page)
558 {
559 	int result;
560 
561 	page &= PAGE_MASK;
562 	result = kimage_add_entry(image, page | IND_SOURCE);
563 	if (result == 0)
564 		image->destination += PAGE_SIZE;
565 
566 	return result;
567 }
568 
569 
570 static void kimage_free_extra_pages(struct kimage *image)
571 {
572 	/* Walk through and free any extra destination pages I may have */
573 	kimage_free_page_list(&image->dest_pages);
574 
575 	/* Walk through and free any unuseable pages I have cached */
576 	kimage_free_page_list(&image->unuseable_pages);
577 
578 }
579 static int kimage_terminate(struct kimage *image)
580 {
581 	if (*image->entry != 0)
582 		image->entry++;
583 
584 	*image->entry = IND_DONE;
585 
586 	return 0;
587 }
588 
589 #define for_each_kimage_entry(image, ptr, entry) \
590 	for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
591 		ptr = (entry & IND_INDIRECTION)? \
592 			phys_to_virt((entry & PAGE_MASK)): ptr +1)
593 
594 static void kimage_free_entry(kimage_entry_t entry)
595 {
596 	struct page *page;
597 
598 	page = pfn_to_page(entry >> PAGE_SHIFT);
599 	kimage_free_pages(page);
600 }
601 
602 static void kimage_free(struct kimage *image)
603 {
604 	kimage_entry_t *ptr, entry;
605 	kimage_entry_t ind = 0;
606 
607 	if (!image)
608 		return;
609 
610 	kimage_free_extra_pages(image);
611 	for_each_kimage_entry(image, ptr, entry) {
612 		if (entry & IND_INDIRECTION) {
613 			/* Free the previous indirection page */
614 			if (ind & IND_INDIRECTION)
615 				kimage_free_entry(ind);
616 			/* Save this indirection page until we are
617 			 * done with it.
618 			 */
619 			ind = entry;
620 		}
621 		else if (entry & IND_SOURCE)
622 			kimage_free_entry(entry);
623 	}
624 	/* Free the final indirection page */
625 	if (ind & IND_INDIRECTION)
626 		kimage_free_entry(ind);
627 
628 	/* Handle any machine specific cleanup */
629 	machine_kexec_cleanup(image);
630 
631 	/* Free the kexec control pages... */
632 	kimage_free_page_list(&image->control_pages);
633 	kfree(image);
634 }
635 
636 static kimage_entry_t *kimage_dst_used(struct kimage *image,
637 					unsigned long page)
638 {
639 	kimage_entry_t *ptr, entry;
640 	unsigned long destination = 0;
641 
642 	for_each_kimage_entry(image, ptr, entry) {
643 		if (entry & IND_DESTINATION)
644 			destination = entry & PAGE_MASK;
645 		else if (entry & IND_SOURCE) {
646 			if (page == destination)
647 				return ptr;
648 			destination += PAGE_SIZE;
649 		}
650 	}
651 
652 	return NULL;
653 }
654 
655 static struct page *kimage_alloc_page(struct kimage *image,
656 					gfp_t gfp_mask,
657 					unsigned long destination)
658 {
659 	/*
660 	 * Here we implement safeguards to ensure that a source page
661 	 * is not copied to its destination page before the data on
662 	 * the destination page is no longer useful.
663 	 *
664 	 * To do this we maintain the invariant that a source page is
665 	 * either its own destination page, or it is not a
666 	 * destination page at all.
667 	 *
668 	 * That is slightly stronger than required, but the proof
669 	 * that no problems will occur is trivial, and the
670 	 * implementation is simple to verify.
671 	 *
672 	 * When allocating all pages normally this algorithm will run
673 	 * in O(N) time, but in the worst case it will run in O(N^2)
674 	 * time.   If the runtime is a problem the data structures can
675 	 * be fixed.
676 	 */
677 	struct page *page;
678 	unsigned long addr;
679 
680 	/*
681 	 * Walk through the list of destination pages, and see if I
682 	 * have a match.
683 	 */
684 	list_for_each_entry(page, &image->dest_pages, lru) {
685 		addr = page_to_pfn(page) << PAGE_SHIFT;
686 		if (addr == destination) {
687 			list_del(&page->lru);
688 			return page;
689 		}
690 	}
691 	page = NULL;
692 	while (1) {
693 		kimage_entry_t *old;
694 
695 		/* Allocate a page, if we run out of memory give up */
696 		page = kimage_alloc_pages(gfp_mask, 0);
697 		if (!page)
698 			return NULL;
699 		/* If the page cannot be used, file it away */
700 		if (page_to_pfn(page) >
701 				(KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
702 			list_add(&page->lru, &image->unuseable_pages);
703 			continue;
704 		}
705 		addr = page_to_pfn(page) << PAGE_SHIFT;
706 
707 		/* If it is the destination page we want, use it */
708 		if (addr == destination)
709 			break;
710 
711 		/* If the page is not a destination page use it */
712 		if (!kimage_is_destination_range(image, addr,
713 						  addr + PAGE_SIZE))
714 			break;
715 
716 		/*
717 		 * I know that the page is someone's destination page.
718 		 * See if there is already a source page for this
719 		 * destination page.  And if so swap the source pages.
720 		 */
721 		old = kimage_dst_used(image, addr);
722 		if (old) {
723 			/* If so move it */
724 			unsigned long old_addr;
725 			struct page *old_page;
726 
727 			old_addr = *old & PAGE_MASK;
728 			old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
729 			copy_highpage(page, old_page);
730 			*old = addr | (*old & ~PAGE_MASK);
731 
732 			/* The old page I have found cannot be a
733 			 * destination page, so return it.
734 			 */
735 			addr = old_addr;
736 			page = old_page;
737 			break;
738 		}
739 		else {
740 			/* Place the page on the destination list; I
741 			 * will use it later.
742 			 */
743 			list_add(&page->lru, &image->dest_pages);
744 		}
745 	}
746 
747 	return page;
748 }
749 
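/*
 * Editorial note (hypothetical scenario, not in the original source):
 * suppose kimage_alloc_page() is asked for destination D but the allocator
 * hands back the page that sits at address A, where A is the destination
 * of a source page S placed earlier.  The loop above copies S's contents
 * into the page at A (which is then its own destination and can stay put),
 * rewrites S's IND_SOURCE entry so that A is recorded as the source for
 * that slot, and returns S's old page for D; by the invariant, that old
 * page cannot itself be anyone's destination.
 */
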
750 static int kimage_load_normal_segment(struct kimage *image,
751 					 struct kexec_segment *segment)
752 {
753 	unsigned long maddr;
754 	unsigned long ubytes, mbytes;
755 	int result;
756 	unsigned char __user *buf;
757 
758 	result = 0;
759 	buf = segment->buf;
760 	ubytes = segment->bufsz;
761 	mbytes = segment->memsz;
762 	maddr = segment->mem;
763 
764 	result = kimage_set_destination(image, maddr);
765 	if (result < 0)
766 		goto out;
767 
768 	while (mbytes) {
769 		struct page *page;
770 		char *ptr;
771 		size_t uchunk, mchunk;
772 
773 		page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
774 		if (!page) {
775 			result  = -ENOMEM;
776 			goto out;
777 		}
778 		result = kimage_add_page(image, page_to_pfn(page)
779 								<< PAGE_SHIFT);
780 		if (result < 0)
781 			goto out;
782 
783 		ptr = kmap(page);
784 		/* Start with a clear page */
785 		memset(ptr, 0, PAGE_SIZE);
786 		ptr += maddr & ~PAGE_MASK;
787 		mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
788 		if (mchunk > mbytes)
789 			mchunk = mbytes;
790 
791 		uchunk = mchunk;
792 		if (uchunk > ubytes)
793 			uchunk = ubytes;
794 
795 		result = copy_from_user(ptr, buf, uchunk);
796 		kunmap(page);
797 		if (result) {
798 			result = (result < 0) ? result : -EIO;
799 			goto out;
800 		}
801 		ubytes -= uchunk;
802 		maddr  += mchunk;
803 		buf    += mchunk;
804 		mbytes -= mchunk;
805 	}
806 out:
807 	return result;
808 }
809 
810 static int kimage_load_crash_segment(struct kimage *image,
811 					struct kexec_segment *segment)
812 {
813 	/* For crash dump kernels we simply copy the data from
814 	 * user space to its destination.
815 	 * We do things a page at a time for the sake of kmap.
816 	 */
817 	unsigned long maddr;
818 	unsigned long ubytes, mbytes;
819 	int result;
820 	unsigned char __user *buf;
821 
822 	result = 0;
823 	buf = segment->buf;
824 	ubytes = segment->bufsz;
825 	mbytes = segment->memsz;
826 	maddr = segment->mem;
827 	while (mbytes) {
828 		struct page *page;
829 		char *ptr;
830 		size_t uchunk, mchunk;
831 
832 		page = pfn_to_page(maddr >> PAGE_SHIFT);
833 		if (!page) {
834 			result  = -ENOMEM;
835 			goto out;
836 		}
837 		ptr = kmap(page);
838 		ptr += maddr & ~PAGE_MASK;
839 		mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
840 		if (mchunk > mbytes)
841 			mchunk = mbytes;
842 
843 		uchunk = mchunk;
844 		if (uchunk > ubytes) {
845 			uchunk = ubytes;
846 			/* Zero the trailing part of the page */
847 			memset(ptr + uchunk, 0, mchunk - uchunk);
848 		}
849 		result = copy_from_user(ptr, buf, uchunk);
850 		kunmap(page);
851 		if (result) {
852 			result = (result < 0) ? result : -EIO;
853 			goto out;
854 		}
855 		ubytes -= uchunk;
856 		maddr  += mchunk;
857 		buf    += mchunk;
858 		mbytes -= mchunk;
859 	}
860 out:
861 	return result;
862 }
863 
864 static int kimage_load_segment(struct kimage *image,
865 				struct kexec_segment *segment)
866 {
867 	int result = -ENOMEM;
868 
869 	switch (image->type) {
870 	case KEXEC_TYPE_DEFAULT:
871 		result = kimage_load_normal_segment(image, segment);
872 		break;
873 	case KEXEC_TYPE_CRASH:
874 		result = kimage_load_crash_segment(image, segment);
875 		break;
876 	}
877 
878 	return result;
879 }
880 
881 /*
882  * Exec Kernel system call: for obvious reasons only root may call it.
883  *
884  * This call breaks up into three pieces.
885  * - A generic part which loads the new kernel from the current
886  *   address space, and very carefully places the data in the
887  *   allocated pages.
888  *
889  * - A generic part that interacts with the kernel and tells all of
890  *   the devices to shut down, preventing on-going DMAs and placing
891  *   the devices in a consistent state so a later kernel can
892  *   reinitialize them.
893  *
894  * - A machine specific part that includes the syscall number,
895  *   copies the image to its final destination, and
896  *   jumps into the image at entry.
897  *
898  * kexec does not sync or unmount filesystems, so if you need
899  * that to happen you need to do it yourself.
900  */
901 struct kimage *kexec_image = NULL;
902 static struct kimage *kexec_crash_image = NULL;
903 /*
904  * A home grown binary mutex.
905  * Nothing can wait so this mutex is safe to use
906  * in interrupt context :)
907  */
908 static int kexec_lock = 0;
909 
910 asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
911 				struct kexec_segment __user *segments,
912 				unsigned long flags)
913 {
914 	struct kimage **dest_image, *image;
915 	int locked;
916 	int result;
917 
918 	/* We only trust the superuser with rebooting the system. */
919 	if (!capable(CAP_SYS_BOOT))
920 		return -EPERM;
921 
922 	/*
923 	 * Verify we have a legal set of flags
924 	 * This leaves us room for future extensions.
925 	 */
926 	if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK))
927 		return -EINVAL;
928 
929 	/* Verify we are on the appropriate architecture */
930 	if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) &&
931 		((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT))
932 		return -EINVAL;
933 
934 	/* Put an artificial cap on the number
935 	 * of segments passed to kexec_load.
936 	 */
937 	if (nr_segments > KEXEC_SEGMENT_MAX)
938 		return -EINVAL;
939 
940 	image = NULL;
941 	result = 0;
942 
943 	/* Because we write directly to the reserved memory
944 	 * region when loading crash kernels we need a mutex here to
945 	 * prevent multiple crash kernels from attempting to load
946 	 * simultaneously, and to prevent a crash kernel from loading
947 	 * over the top of an in-use crash kernel.
948 	 *
949 	 * KISS: always take the mutex.
950 	 */
951 	locked = xchg(&kexec_lock, 1);
952 	if (locked)
953 		return -EBUSY;
954 
955 	dest_image = &kexec_image;
956 	if (flags & KEXEC_ON_CRASH)
957 		dest_image = &kexec_crash_image;
958 	if (nr_segments > 0) {
959 		unsigned long i;
960 
961 		/* Loading another kernel to reboot into */
962 		if ((flags & KEXEC_ON_CRASH) == 0)
963 			result = kimage_normal_alloc(&image, entry,
964 							nr_segments, segments);
965 		/* Loading another kernel to switch to if this one crashes */
966 		else if (flags & KEXEC_ON_CRASH) {
967 			/* Free any current crash dump kernel before
968 			 * we corrupt it.
969 			 */
970 			kimage_free(xchg(&kexec_crash_image, NULL));
971 			result = kimage_crash_alloc(&image, entry,
972 						     nr_segments, segments);
973 		}
974 		if (result)
975 			goto out;
976 
977 		result = machine_kexec_prepare(image);
978 		if (result)
979 			goto out;
980 
981 		for (i = 0; i < nr_segments; i++) {
982 			result = kimage_load_segment(image, &image->segment[i]);
983 			if (result)
984 				goto out;
985 		}
986 		result = kimage_terminate(image);
987 		if (result)
988 			goto out;
989 	}
990 	/* Install the new kernel and uninstall the old */
991 	image = xchg(dest_image, image);
992 
993 out:
994 	xchg(&kexec_lock, 0); /* Release the mutex */
995 	kimage_free(image);
996 
997 	return result;
998 }
999 
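/*
 * Editorial sketch (user-space view, not part of this file): a minimal
 * caller builds an array of kexec_segment descriptors and invokes the
 * syscall directly.  The buffer, addresses, and the user-visible layout of
 * struct kexec_segment are taken from <linux/kexec.h> and are assumptions
 * of this illustration, not something defined here.
 */
#if 0	/* illustration only; user-space code, not kernel code */
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/kexec.h>

static long example_load(const void *kernel_buf, size_t kernel_len,
			 unsigned long entry, unsigned long dest_phys)
{
	struct kexec_segment seg = {
		.buf   = kernel_buf,		/* source in this process */
		.bufsz = kernel_len,
		.mem   = (void *)dest_phys,	/* page-aligned destination */
		.memsz = (kernel_len + 4095) & ~4095UL,	/* rounded to 4K */
	};

	/* nr_segments = 1, no KEXEC_ON_CRASH, native architecture */
	return syscall(__NR_kexec_load, entry, 1UL, &seg, KEXEC_ARCH_DEFAULT);
}
#endif
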
1000 #ifdef CONFIG_COMPAT
1001 asmlinkage long compat_sys_kexec_load(unsigned long entry,
1002 				unsigned long nr_segments,
1003 				struct compat_kexec_segment __user *segments,
1004 				unsigned long flags)
1005 {
1006 	struct compat_kexec_segment in;
1007 	struct kexec_segment out, __user *ksegments;
1008 	unsigned long i, result;
1009 
1010 	/* Don't allow clients that don't understand the native
1011 	 * architecture to do anything.
1012 	 */
1013 	if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT)
1014 		return -EINVAL;
1015 
1016 	if (nr_segments > KEXEC_SEGMENT_MAX)
1017 		return -EINVAL;
1018 
1019 	ksegments = compat_alloc_user_space(nr_segments * sizeof(out));
1020 	for (i = 0; i < nr_segments; i++) {
1021 		result = copy_from_user(&in, &segments[i], sizeof(in));
1022 		if (result)
1023 			return -EFAULT;
1024 
1025 		out.buf   = compat_ptr(in.buf);
1026 		out.bufsz = in.bufsz;
1027 		out.mem   = in.mem;
1028 		out.memsz = in.memsz;
1029 
1030 		result = copy_to_user(&ksegments[i], &out, sizeof(out));
1031 		if (result)
1032 			return -EFAULT;
1033 	}
1034 
1035 	return sys_kexec_load(entry, nr_segments, ksegments, flags);
1036 }
1037 #endif
1038 
1039 void crash_kexec(struct pt_regs *regs)
1040 {
1041 	struct kimage *image;
1042 	int locked;
1043 
1044 
1045 	/* Take the kexec_lock here to prevent sys_kexec_load
1046 	 * running on one cpu from replacing the crash kernel
1047 	 * we are using after a panic on a different cpu.
1048 	 *
1049 	 * If the crash kernel was not located in a fixed area
1050 	 * of memory the xchg(&kexec_crash_image) would be
1051 	 * sufficient.  But since I reuse the memory...
1052 	 */
1053 	locked = xchg(&kexec_lock, 1);
1054 	if (!locked) {
1055 		image = xchg(&kexec_crash_image, NULL);
1056 		if (image) {
1057 			machine_crash_shutdown(regs);
1058 			machine_kexec(image);
1059 		}
1060 		xchg(&kexec_lock, 0);
1061 	}
1062 }
1063