/*
 * kexec.c - kexec system call core code.
 * Copyright (C) 2002-2004 Eric Biederman  <ebiederm@xmission.com>
 *
 * This source code is licensed under the GNU General Public License,
 * Version 2.  See the file COPYING for more details.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/mm.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/kexec.h>
#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/highmem.h>
#include <linux/syscalls.h>
#include <linux/reboot.h>
#include <linux/ioport.h>
#include <linux/hardirq.h>
#include <linux/elf.h>
#include <linux/elfcore.h>
#include <linux/utsname.h>
#include <linux/numa.h>
#include <linux/suspend.h>
#include <linux/device.h>
#include <linux/freezer.h>
#include <linux/pm.h>
#include <linux/cpu.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/console.h>
#include <linux/vmalloc.h>
#include <linux/swap.h>
#include <linux/syscore_ops.h>
#include <linux/compiler.h>
#include <linux/hugetlb.h>

#include <asm/page.h>
#include <asm/sections.h>

#include <crypto/hash.h>
#include <crypto/sha.h>
#include "kexec_internal.h"

DEFINE_MUTEX(kexec_mutex);

/* Per cpu memory for storing cpu states in case of system crash. */
note_buf_t __percpu *crash_notes;

/* Flag to indicate we are going to kexec a new kernel */
bool kexec_in_progress = false;


/* Location of the reserved area for the crash kernel */
struct resource crashk_res = {
	.name  = "Crash kernel",
	.start = 0,
	.end   = 0,
	.flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
	.desc  = IORES_DESC_CRASH_KERNEL
};
struct resource crashk_low_res = {
	.name  = "Crash kernel",
	.start = 0,
	.end   = 0,
	.flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
	.desc  = IORES_DESC_CRASH_KERNEL
};

int kexec_should_crash(struct task_struct *p)
{
	/*
	 * If crash_kexec_post_notifiers is enabled, don't run
	 * crash_kexec() here yet, which must be run after panic
	 * notifiers in panic().
	 */
	if (crash_kexec_post_notifiers)
		return 0;
	/*
	 * There are 4 panic() calls in do_exit() path, each of which
	 * corresponds to each of these 4 conditions.
	 */
	if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops)
		return 1;
	return 0;
}

int kexec_crash_loaded(void)
{
	return !!kexec_crash_image;
}
EXPORT_SYMBOL_GPL(kexec_crash_loaded);

/*
 * When kexec transitions to the new kernel there is a one-to-one
 * mapping between physical and virtual addresses.  On processors
 * where you can disable the MMU this is trivial, and easy.  For
 * others it is still a simple predictable page table to set up.
 *
 * In that environment kexec copies the new kernel to its final
 * resting place.  This means I can only support memory whose
 * physical address can fit in an unsigned long.  In particular
 * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
 * If the assembly stub has more restrictive requirements
 * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
 * defined more restrictively in <asm/kexec.h>.
 *
 * The code for the transition from the current kernel to the
 * new kernel is placed in the control_code_buffer, whose size
 * is given by KEXEC_CONTROL_PAGE_SIZE.  In the best case only a single
 * page of memory is necessary, but some architectures require more.
 * Because this memory must be identity mapped in the transition from
 * virtual to physical addresses it must live in the range
 * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
 * modifiable.
 *
 * The assembly stub in the control code buffer is passed a linked list
 * of descriptor pages detailing the source pages of the new kernel,
 * and the destination addresses of those source pages.  As this data
 * structure is not used in the context of the current OS, it must
 * be self-contained.
 *
 * The code has been made to work with highmem pages and will use a
 * destination page in its final resting place (if it happens
 * to allocate it).  The end product of this is that most of the
 * physical address space, and most of RAM can be used.
 *
 * Future directions include:
 *  - allocating a page table with the control code buffer identity
 *    mapped, to simplify machine_kexec and make kexec_on_panic more
 *    reliable.
 */

/*
 * KIMAGE_NO_DEST is an impossible destination address..., for
 * allocating pages whose destination address we do not care about.
 */
#define KIMAGE_NO_DEST (-1UL)
#define PAGE_COUNT(x) (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)

static struct page *kimage_alloc_page(struct kimage *image,
				       gfp_t gfp_mask,
				       unsigned long dest);

int sanity_check_segment_list(struct kimage *image)
{
	int i;
	unsigned long nr_segments = image->nr_segments;
	unsigned long total_pages = 0;

	/*
	 * Verify we have good destination addresses.  The caller is
	 * responsible for making certain we don't attempt to load
	 * the new image into invalid or reserved areas of RAM.  This
	 * just verifies it is an address we can use.
	 *
	 * Since the kernel does everything in page size chunks ensure
	 * the destination addresses are page aligned.  Too many
	 * special cases crop up when we don't do this.  The most
	 * insidious is getting overlapping destination addresses
	 * simply because addresses are changed to page size
	 * granularity.
	 */
	for (i = 0; i < nr_segments; i++) {
		unsigned long mstart, mend;

		mstart = image->segment[i].mem;
		mend   = mstart + image->segment[i].memsz;
		if (mstart > mend)
			return -EADDRNOTAVAIL;
		if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
			return -EADDRNOTAVAIL;
		if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
			return -EADDRNOTAVAIL;
	}

	/* Verify our destination addresses do not overlap.
	 * If we allowed overlapping destination addresses
	 * through, very weird things can happen with no
	 * easy explanation as one segment stops on another.
	 */
	for (i = 0; i < nr_segments; i++) {
		unsigned long mstart, mend;
		unsigned long j;

		mstart = image->segment[i].mem;
		mend   = mstart + image->segment[i].memsz;
		for (j = 0; j < i; j++) {
			unsigned long pstart, pend;

			pstart = image->segment[j].mem;
			pend   = pstart + image->segment[j].memsz;
			/* Do the segments overlap ? */
			if ((mend > pstart) && (mstart < pend))
				return -EINVAL;
		}
	}

	/* Ensure our buffer sizes do not exceed
	 * our memory sizes.  This should always be the case,
	 * and it is easier to check up front than to be surprised
	 * later on.
	 */
	for (i = 0; i < nr_segments; i++) {
		if (image->segment[i].bufsz > image->segment[i].memsz)
			return -EINVAL;
	}

	/*
	 * Verify that no more than half of memory will be consumed. If the
	 * request from userspace is too large, a large amount of time will be
	 * wasted allocating pages, which can cause a soft lockup.
	 */
	for (i = 0; i < nr_segments; i++) {
		if (PAGE_COUNT(image->segment[i].memsz) > totalram_pages / 2)
			return -EINVAL;

		total_pages += PAGE_COUNT(image->segment[i].memsz);
	}

	if (total_pages > totalram_pages / 2)
		return -EINVAL;

	/*
	 * Verify we have good destination addresses.  Normally
	 * the caller is responsible for making certain we don't
	 * attempt to load the new image into invalid or reserved
	 * areas of RAM.  But crash kernels are preloaded into a
	 * reserved area of ram.  We must ensure the addresses
	 * are in the reserved area otherwise preloading the
	 * kernel could corrupt things.
	 */

	if (image->type == KEXEC_TYPE_CRASH) {
		for (i = 0; i < nr_segments; i++) {
			unsigned long mstart, mend;

			mstart = image->segment[i].mem;
			mend = mstart + image->segment[i].memsz - 1;
			/* Ensure we are within the crash kernel limits */
			if ((mstart < phys_to_boot_phys(crashk_res.start)) ||
			    (mend > phys_to_boot_phys(crashk_res.end)))
				return -EADDRNOTAVAIL;
		}
	}

	return 0;
}

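/*
 * Allocate a zeroed struct kimage and initialize the pieces the
 * loaders rely on: the entry list pointers (head/entry/last_entry),
 * a "not applicable" control_page placeholder, and the control,
 * destination and unusable page lists used while loading.
 */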
struct kimage *do_kimage_alloc_init(void)
{
	struct kimage *image;

	/* Allocate a controlling structure */
	image = kzalloc(sizeof(*image), GFP_KERNEL);
	if (!image)
		return NULL;

	image->head = 0;
	image->entry = &image->head;
	image->last_entry = &image->head;
	image->control_page = ~0; /* By default this does not apply */
	image->type = KEXEC_TYPE_DEFAULT;

	/* Initialize the list of control pages */
	INIT_LIST_HEAD(&image->control_pages);

	/* Initialize the list of destination pages */
	INIT_LIST_HEAD(&image->dest_pages);

	/* Initialize the list of unusable pages */
	INIT_LIST_HEAD(&image->unusable_pages);

	return image;
}

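/*
 * Return 1 if any part of [start, end) overlaps the destination
 * range of one of the image's segments, 0 otherwise.
 */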
int kimage_is_destination_range(struct kimage *image,
					unsigned long start,
					unsigned long end)
{
	unsigned long i;

	for (i = 0; i < image->nr_segments; i++) {
		unsigned long mstart, mend;

		mstart = image->segment[i].mem;
		mend = mstart + image->segment[i].memsz;
		if ((end > mstart) && (start < mend))
			return 1;
	}

	return 0;
}

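/*
 * kexec page allocation helpers: allocated pages are marked Reserved
 * and the allocation order is stashed in page_private() so that
 * kimage_free_pages() can undo both when the image is torn down.
 */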
static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
{
	struct page *pages;

	pages = alloc_pages(gfp_mask, order);
	if (pages) {
		unsigned int count, i;

		pages->mapping = NULL;
		set_page_private(pages, order);
		count = 1 << order;
		for (i = 0; i < count; i++)
			SetPageReserved(pages + i);
	}

	return pages;
}

static void kimage_free_pages(struct page *page)
{
	unsigned int order, count, i;

	order = page_private(page);
	count = 1 << order;
	for (i = 0; i < count; i++)
		ClearPageReserved(page + i);
	__free_pages(page, order);
}

void kimage_free_page_list(struct list_head *list)
{
	struct page *page, *next;

	list_for_each_entry_safe(page, next, list, lru) {
		list_del(&page->lru);
		kimage_free_pages(page);
	}
}

static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
							unsigned int order)
{
	/* Control pages are special, they are the intermediaries
	 * that are needed while we copy the rest of the pages
	 * to their final resting place.  As such they must
	 * not conflict with either the destination addresses
	 * or memory the kernel is already using.
	 *
	 * The only case where we really need more than one of
	 * these are for architectures where we cannot disable
	 * the MMU and must instead generate an identity mapped
	 * page table for all of the memory.
	 *
	 * At worst this runs in O(N) of the image size.
	 */
	struct list_head extra_pages;
	struct page *pages;
	unsigned int count;

	count = 1 << order;
	INIT_LIST_HEAD(&extra_pages);

	/* Loop while I can allocate a page and the page allocated
	 * is a destination page.
	 */
	do {
		unsigned long pfn, epfn, addr, eaddr;

		pages = kimage_alloc_pages(KEXEC_CONTROL_MEMORY_GFP, order);
		if (!pages)
			break;
		pfn   = page_to_boot_pfn(pages);
		epfn  = pfn + count;
		addr  = pfn << PAGE_SHIFT;
		eaddr = epfn << PAGE_SHIFT;
		if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
			      kimage_is_destination_range(image, addr, eaddr)) {
			list_add(&pages->lru, &extra_pages);
			pages = NULL;
		}
	} while (!pages);

	if (pages) {
		/* Remember the allocated page... */
		list_add(&pages->lru, &image->control_pages);

		/* Because the page is already in its destination
		 * location we will never allocate another page at
		 * that address.  Therefore kimage_alloc_pages
		 * will not return it (again) and we don't need
		 * to give it an entry in image->segment[].
		 */
	}
	/* Deal with the destination pages I have inadvertently allocated.
	 *
	 * Ideally I would convert multi-page allocations into single
	 * page allocations, and add everything to image->dest_pages.
	 *
	 * For now it is simpler to just free the pages.
	 */
	kimage_free_page_list(&extra_pages);

	return pages;
}

static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
						      unsigned int order)
{
	/* Control pages are special, they are the intermediaries
	 * that are needed while we copy the rest of the pages
	 * to their final resting place.  As such they must
	 * not conflict with either the destination addresses
	 * or memory the kernel is already using.
	 *
	 * Control pages are also the only pages we must allocate
	 * when loading a crash kernel.  All of the other pages
	 * are specified by the segments and we just memcpy
	 * into them directly.
	 *
	 * The only case where we really need more than one of
	 * these are for architectures where we cannot disable
	 * the MMU and must instead generate an identity mapped
	 * page table for all of the memory.
	 *
	 * Given the low demand this implements a very simple
	 * allocator that finds the first hole of the appropriate
	 * size in the reserved memory region, and allocates all
	 * of the memory up to and including the hole.
	 */
	unsigned long hole_start, hole_end, size;
	struct page *pages;

	pages = NULL;
	size = (1 << order) << PAGE_SHIFT;
	hole_start = (image->control_page + (size - 1)) & ~(size - 1);
	hole_end   = hole_start + size - 1;
	while (hole_end <= crashk_res.end) {
		unsigned long i;

		cond_resched();

		if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT)
			break;
		/* See if I overlap any of the segments */
		for (i = 0; i < image->nr_segments; i++) {
			unsigned long mstart, mend;

			mstart = image->segment[i].mem;
			mend   = mstart + image->segment[i].memsz - 1;
			if ((hole_end >= mstart) && (hole_start <= mend)) {
				/* Advance the hole to the end of the segment */
				hole_start = (mend + (size - 1)) & ~(size - 1);
				hole_end   = hole_start + size - 1;
				break;
			}
		}
		/* If I don't overlap any segments I have found my hole! */
		if (i == image->nr_segments) {
			pages = pfn_to_page(hole_start >> PAGE_SHIFT);
			image->control_page = hole_end;
			break;
		}
	}

	return pages;
}


struct page *kimage_alloc_control_pages(struct kimage *image,
					 unsigned int order)
{
	struct page *pages = NULL;

	switch (image->type) {
	case KEXEC_TYPE_DEFAULT:
		pages = kimage_alloc_normal_control_pages(image, order);
		break;
	case KEXEC_TYPE_CRASH:
		pages = kimage_alloc_crash_control_pages(image, order);
		break;
	}

	return pages;
}

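/*
 * The kimage entry list is a flat array of kimage_entry_t values,
 * each a physical address with an IND_* flag in its low bits:
 * IND_DESTINATION sets the destination address for the source pages
 * that follow, IND_SOURCE names one source page, IND_INDIRECTION
 * points to the next page of entries, and IND_DONE terminates the
 * list.  kimage_add_entry() appends one entry, allocating a fresh
 * indirection page whenever the current one fills up.
 */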
static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
{
	if (*image->entry != 0)
		image->entry++;

	if (image->entry == image->last_entry) {
		kimage_entry_t *ind_page;
		struct page *page;

		page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
		if (!page)
			return -ENOMEM;

		ind_page = page_address(page);
		*image->entry = virt_to_boot_phys(ind_page) | IND_INDIRECTION;
		image->entry = ind_page;
		image->last_entry = ind_page +
				      ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
	}
	*image->entry = entry;
	image->entry++;
	*image->entry = 0;

	return 0;
}

static int kimage_set_destination(struct kimage *image,
				   unsigned long destination)
{
	int result;

	destination &= PAGE_MASK;
	result = kimage_add_entry(image, destination | IND_DESTINATION);

	return result;
}


static int kimage_add_page(struct kimage *image, unsigned long page)
{
	int result;

	page &= PAGE_MASK;
	result = kimage_add_entry(image, page | IND_SOURCE);

	return result;
}


static void kimage_free_extra_pages(struct kimage *image)
{
	/* Walk through and free any extra destination pages I may have */
	kimage_free_page_list(&image->dest_pages);

	/* Walk through and free any unusable pages I have cached */
	kimage_free_page_list(&image->unusable_pages);

}

void kimage_terminate(struct kimage *image)
{
	if (*image->entry != 0)
		image->entry++;

	*image->entry = IND_DONE;
}

#define for_each_kimage_entry(image, ptr, entry) \
	for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
		ptr = (entry & IND_INDIRECTION) ? \
			boot_phys_to_virt((entry & PAGE_MASK)) : ptr + 1)

static void kimage_free_entry(kimage_entry_t entry)
{
	struct page *page;

	page = boot_pfn_to_page(entry >> PAGE_SHIFT);
	kimage_free_pages(page);
}

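/*
 * Tear down a kimage: free every source and indirection page recorded
 * in the entry list, the cached destination/unusable pages, the
 * control pages, any file-mode buffers, and finally the image
 * structure itself.
 */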
void kimage_free(struct kimage *image)
{
	kimage_entry_t *ptr, entry;
	kimage_entry_t ind = 0;

	if (!image)
		return;

	kimage_free_extra_pages(image);
	for_each_kimage_entry(image, ptr, entry) {
		if (entry & IND_INDIRECTION) {
			/* Free the previous indirection page */
			if (ind & IND_INDIRECTION)
				kimage_free_entry(ind);
			/* Save this indirection page until we are
			 * done with it.
			 */
			ind = entry;
		} else if (entry & IND_SOURCE)
			kimage_free_entry(entry);
	}
	/* Free the final indirection page */
	if (ind & IND_INDIRECTION)
		kimage_free_entry(ind);

	/* Handle any machine specific cleanup */
	machine_kexec_cleanup(image);

	/* Free the kexec control pages... */
	kimage_free_page_list(&image->control_pages);

	/*
	 * Free up any temporary buffers allocated. This might hit if
	 * an error occurred much later after buffer allocation.
	 */
	if (image->file_mode)
		kimage_file_post_load_cleanup(image);

	kfree(image);
}

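/*
 * Scan the entry list and return a pointer to the IND_SOURCE entry
 * whose destination address is @page, or NULL if no source page has
 * been assigned to that destination yet.  The destination of each
 * source entry is the most recent IND_DESTINATION address plus
 * PAGE_SIZE for every source entry seen since.
 */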
static kimage_entry_t *kimage_dst_used(struct kimage *image,
					unsigned long page)
{
	kimage_entry_t *ptr, entry;
	unsigned long destination = 0;

	for_each_kimage_entry(image, ptr, entry) {
		if (entry & IND_DESTINATION)
			destination = entry & PAGE_MASK;
		else if (entry & IND_SOURCE) {
			if (page == destination)
				return ptr;
			destination += PAGE_SIZE;
		}
	}

	return NULL;
}

static struct page *kimage_alloc_page(struct kimage *image,
					gfp_t gfp_mask,
					unsigned long destination)
{
	/*
	 * Here we implement safeguards to ensure that a source page
	 * is not copied to its destination page before the data on
	 * the destination page is no longer useful.
	 *
	 * To do this we maintain the invariant that a source page is
	 * either its own destination page, or it is not a
	 * destination page at all.
	 *
	 * That is slightly stronger than required, but it makes the
	 * proof that no problems will occur trivial, and the
	 * implementation is simple to verify.
	 *
	 * When allocating all pages normally this algorithm will run
	 * in O(N) time, but in the worst case it will run in O(N^2)
	 * time.  If the runtime is a problem the data structures can
	 * be fixed.
	 */
	struct page *page;
	unsigned long addr;

	/*
	 * Walk through the list of destination pages, and see if I
	 * have a match.
	 */
	list_for_each_entry(page, &image->dest_pages, lru) {
		addr = page_to_boot_pfn(page) << PAGE_SHIFT;
		if (addr == destination) {
			list_del(&page->lru);
			return page;
		}
	}
	page = NULL;
	while (1) {
		kimage_entry_t *old;

		/* Allocate a page, if we run out of memory give up */
		page = kimage_alloc_pages(gfp_mask, 0);
		if (!page)
			return NULL;
		/* If the page cannot be used file it away */
		if (page_to_boot_pfn(page) >
				(KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
			list_add(&page->lru, &image->unusable_pages);
			continue;
		}
		addr = page_to_boot_pfn(page) << PAGE_SHIFT;

		/* If it is the destination page we want use it */
		if (addr == destination)
			break;

		/* If the page is not a destination page use it */
		if (!kimage_is_destination_range(image, addr,
						  addr + PAGE_SIZE))
			break;

		/*
		 * I know that the page is someone's destination page.
		 * See if there is already a source page for this
		 * destination page.  And if so swap the source pages.
		 */
		old = kimage_dst_used(image, addr);
		if (old) {
			/* If so move it */
			unsigned long old_addr;
			struct page *old_page;

			old_addr = *old & PAGE_MASK;
			old_page = boot_pfn_to_page(old_addr >> PAGE_SHIFT);
			copy_highpage(page, old_page);
			*old = addr | (*old & ~PAGE_MASK);

			/* The old page I have found cannot be a
			 * destination page, so return it if its
			 * gfp_flags honor the ones passed in.
			 */
			if (!(gfp_mask & __GFP_HIGHMEM) &&
			    PageHighMem(old_page)) {
				kimage_free_pages(old_page);
				continue;
			}
			addr = old_addr;
			page = old_page;
			break;
		}
		/* Place the page on the destination list, to be used later */
		list_add(&page->lru, &image->dest_pages);
	}

	return page;
}

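/*
 * Copy a segment into freshly allocated pages for a normal (non-crash)
 * kexec load.  Each destination page is recorded in the entry list via
 * kimage_add_page(); the first bufsz bytes come from the user (or, for
 * file mode, kernel) buffer and the remainder of memsz stays zeroed
 * because every page is cleared before the copy.
 */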
static int kimage_load_normal_segment(struct kimage *image,
					 struct kexec_segment *segment)
{
	unsigned long maddr;
	size_t ubytes, mbytes;
	int result;
	unsigned char __user *buf = NULL;
	unsigned char *kbuf = NULL;

	result = 0;
	if (image->file_mode)
		kbuf = segment->kbuf;
	else
		buf = segment->buf;
	ubytes = segment->bufsz;
	mbytes = segment->memsz;
	maddr = segment->mem;

	result = kimage_set_destination(image, maddr);
	if (result < 0)
		goto out;

	while (mbytes) {
		struct page *page;
		char *ptr;
		size_t uchunk, mchunk;

		page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
		if (!page) {
			result  = -ENOMEM;
			goto out;
		}
		result = kimage_add_page(image, page_to_boot_pfn(page)
								<< PAGE_SHIFT);
		if (result < 0)
			goto out;

		ptr = kmap(page);
		/* Start with a clear page */
		clear_page(ptr);
		ptr += maddr & ~PAGE_MASK;
		mchunk = min_t(size_t, mbytes,
				PAGE_SIZE - (maddr & ~PAGE_MASK));
		uchunk = min(ubytes, mchunk);

		/* For file based kexec, source pages are in kernel memory */
		if (image->file_mode)
			memcpy(ptr, kbuf, uchunk);
		else
			result = copy_from_user(ptr, buf, uchunk);
		kunmap(page);
		if (result) {
			result = -EFAULT;
			goto out;
		}
		ubytes -= uchunk;
		maddr  += mchunk;
		if (image->file_mode)
			kbuf += mchunk;
		else
			buf += mchunk;
		mbytes -= mchunk;
	}
out:
	return result;
}

static int kimage_load_crash_segment(struct kimage *image,
					struct kexec_segment *segment)
{
	/* For crash dump kernels we simply copy the data from
	 * user space to its destination.
	 * We do things a page at a time for the sake of kmap.
	 */
	unsigned long maddr;
	size_t ubytes, mbytes;
	int result;
	unsigned char __user *buf = NULL;
	unsigned char *kbuf = NULL;

	result = 0;
	if (image->file_mode)
		kbuf = segment->kbuf;
	else
		buf = segment->buf;
	ubytes = segment->bufsz;
	mbytes = segment->memsz;
	maddr = segment->mem;
	while (mbytes) {
		struct page *page;
		char *ptr;
		size_t uchunk, mchunk;

		page = boot_pfn_to_page(maddr >> PAGE_SHIFT);
		if (!page) {
			result  = -ENOMEM;
			goto out;
		}
		ptr = kmap(page);
		ptr += maddr & ~PAGE_MASK;
		mchunk = min_t(size_t, mbytes,
				PAGE_SIZE - (maddr & ~PAGE_MASK));
		uchunk = min(ubytes, mchunk);
		if (mchunk > uchunk) {
			/* Zero the trailing part of the page */
			memset(ptr + uchunk, 0, mchunk - uchunk);
		}

		/* For file based kexec, source pages are in kernel memory */
		if (image->file_mode)
			memcpy(ptr, kbuf, uchunk);
		else
			result = copy_from_user(ptr, buf, uchunk);
		kexec_flush_icache_page(page);
		kunmap(page);
		if (result) {
			result = -EFAULT;
			goto out;
		}
		ubytes -= uchunk;
		maddr  += mchunk;
		if (image->file_mode)
			kbuf += mchunk;
		else
			buf += mchunk;
		mbytes -= mchunk;
	}
out:
	return result;
}

int kimage_load_segment(struct kimage *image,
				struct kexec_segment *segment)
{
	int result = -ENOMEM;

	switch (image->type) {
	case KEXEC_TYPE_DEFAULT:
		result = kimage_load_normal_segment(image, segment);
		break;
	case KEXEC_TYPE_CRASH:
		result = kimage_load_crash_segment(image, segment);
		break;
	}

	return result;
}

struct kimage *kexec_image;
struct kimage *kexec_crash_image;
int kexec_load_disabled;

/*
 * No panic_cpu check version of crash_kexec().  This function is called
 * only when panic_cpu holds the current CPU number; this is the only CPU
 * which processes crash_kexec routines.
 */
void __crash_kexec(struct pt_regs *regs)
{
	/* Take the kexec_mutex here to prevent sys_kexec_load
	 * running on one cpu from replacing the crash kernel
	 * we are using after a panic on a different cpu.
	 *
	 * If the crash kernel was not located in a fixed area
	 * of memory the xchg(&kexec_crash_image) would be
	 * sufficient.  But since I reuse the memory...
	 */
	if (mutex_trylock(&kexec_mutex)) {
		if (kexec_crash_image) {
			struct pt_regs fixed_regs;

			crash_setup_regs(&fixed_regs, regs);
			crash_save_vmcoreinfo();
			machine_crash_shutdown(&fixed_regs);
			machine_kexec(kexec_crash_image);
		}
		mutex_unlock(&kexec_mutex);
	}
}

void crash_kexec(struct pt_regs *regs)
{
	int old_cpu, this_cpu;

	/*
	 * Only one CPU is allowed to execute the crash_kexec() code as with
	 * panic().  Otherwise parallel calls of panic() and crash_kexec()
	 * may stop each other.  To exclude them, we use panic_cpu here too.
	 */
	this_cpu = raw_smp_processor_id();
	old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu);
	if (old_cpu == PANIC_CPU_INVALID) {
		/* This is the 1st CPU which comes here, so go ahead. */
		printk_safe_flush_on_panic();
		__crash_kexec(regs);

		/*
		 * Reset panic_cpu to allow another panic()/crash_kexec()
		 * call.
		 */
		atomic_set(&panic_cpu, PANIC_CPU_INVALID);
	}
}

size_t crash_get_memory_size(void)
{
	size_t size = 0;

	mutex_lock(&kexec_mutex);
	if (crashk_res.end != crashk_res.start)
		size = resource_size(&crashk_res);
	mutex_unlock(&kexec_mutex);
	return size;
}

void __weak crash_free_reserved_phys_range(unsigned long begin,
					   unsigned long end)
{
	unsigned long addr;

	for (addr = begin; addr < end; addr += PAGE_SIZE)
		free_reserved_page(boot_pfn_to_page(addr >> PAGE_SHIFT));
}

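/*
 * Shrink the crashkernel reservation to new_size bytes (rounded to
 * KEXEC_CRASH_MEM_ALIGN): the trimmed tail is handed back to the page
 * allocator and re-registered as "System RAM".  Fails with -ENOENT if
 * a crash kernel is already loaded, and with -EINVAL if new_size would
 * grow the reservation.
 */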
int crash_shrink_memory(unsigned long new_size)
{
	int ret = 0;
	unsigned long start, end;
	unsigned long old_size;
	struct resource *ram_res;

	mutex_lock(&kexec_mutex);

	if (kexec_crash_image) {
		ret = -ENOENT;
		goto unlock;
	}
	start = crashk_res.start;
	end = crashk_res.end;
	old_size = (end == 0) ? 0 : end - start + 1;
	if (new_size >= old_size) {
		ret = (new_size == old_size) ? 0 : -EINVAL;
		goto unlock;
	}

	ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL);
	if (!ram_res) {
		ret = -ENOMEM;
		goto unlock;
	}

	start = roundup(start, KEXEC_CRASH_MEM_ALIGN);
	end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN);

	crash_free_reserved_phys_range(end, crashk_res.end);

	if ((start == end) && (crashk_res.parent != NULL))
		release_resource(&crashk_res);

	ram_res->start = end;
	ram_res->end = crashk_res.end;
	ram_res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM;
	ram_res->name = "System RAM";

	crashk_res.end = end - 1;

	insert_resource(&iomem_resource, ram_res);

unlock:
	mutex_unlock(&kexec_mutex);
	return ret;
}

void crash_save_cpu(struct pt_regs *regs, int cpu)
{
	struct elf_prstatus prstatus;
	u32 *buf;

	if ((cpu < 0) || (cpu >= nr_cpu_ids))
		return;

	/* Using ELF notes here is opportunistic.
	 * I need a well defined structure format
	 * for the data I pass, and I need tags
	 * on the data to indicate what information I have
	 * squirrelled away.  ELF notes happen to provide
	 * all of that, so there is no need to invent something new.
	 */
	buf = (u32 *)per_cpu_ptr(crash_notes, cpu);
	if (!buf)
		return;
	memset(&prstatus, 0, sizeof(prstatus));
	prstatus.pr_pid = current->pid;
	elf_core_copy_kernel_regs(&prstatus.pr_reg, regs);
	buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
			      &prstatus, sizeof(prstatus));
	final_note(buf);
}

static int __init crash_notes_memory_init(void)
{
	/* Allocate memory for saving cpu registers. */
	size_t size, align;

	/*
	 * crash_notes could be allocated across 2 vmalloc pages when percpu
	 * is vmalloc based.  vmalloc doesn't guarantee that 2 contiguous
	 * vmalloc pages are also on 2 contiguous physical pages.  In this
	 * case the 2nd part of crash_notes in the 2nd page could be lost
	 * since only the starting address and size of crash_notes are
	 * exported through sysfs.  Here round up the size of crash_notes
	 * to the nearest power of two and pass it to __alloc_percpu as the
	 * align value.  This can make sure crash_notes is allocated inside
	 * one physical page.
	 */
	size = sizeof(note_buf_t);
	align = min(roundup_pow_of_two(sizeof(note_buf_t)), PAGE_SIZE);

	/*
	 * Break compile if size is bigger than PAGE_SIZE since crash_notes
	 * definitely will be in 2 pages with that.
	 */
	BUILD_BUG_ON(size > PAGE_SIZE);

	crash_notes = __alloc_percpu(size, align);
	if (!crash_notes) {
		pr_warn("Memory allocation for saving cpu register states failed\n");
		return -ENOMEM;
	}
	return 0;
}
subsys_initcall(crash_notes_memory_init);


/*
 * Move into place and start executing a preloaded standalone
 * executable.  If nothing was preloaded return an error.
 */
int kernel_kexec(void)
{
	int error = 0;

	if (!mutex_trylock(&kexec_mutex))
		return -EBUSY;
	if (!kexec_image) {
		error = -EINVAL;
		goto Unlock;
	}

#ifdef CONFIG_KEXEC_JUMP
	if (kexec_image->preserve_context) {
		lock_system_sleep();
		pm_prepare_console();
		error = freeze_processes();
		if (error) {
			error = -EBUSY;
			goto Restore_console;
		}
		suspend_console();
		error = dpm_suspend_start(PMSG_FREEZE);
		if (error)
			goto Resume_console;
		/* At this point, dpm_suspend_start() has been called,
		 * but *not* dpm_suspend_end(). We *must* call
		 * dpm_suspend_end() now.  Otherwise, drivers for
		 * some devices (e.g. interrupt controllers) become
		 * desynchronized with the actual state of the
		 * hardware at resume time, and evil weirdness ensues.
		 */
		error = dpm_suspend_end(PMSG_FREEZE);
		if (error)
			goto Resume_devices;
		error = disable_nonboot_cpus();
		if (error)
			goto Enable_cpus;
		local_irq_disable();
		error = syscore_suspend();
		if (error)
			goto Enable_irqs;
	} else
#endif
	{
		kexec_in_progress = true;
		kernel_restart_prepare(NULL);
		migrate_to_reboot_cpu();

		/*
		 * migrate_to_reboot_cpu() disables CPU hotplug assuming that
		 * no further code needs to use CPU hotplug (which is true in
		 * the reboot case). However, the kexec path depends on using
		 * CPU hotplug again; so re-enable it here.
		 */
		cpu_hotplug_enable();
		pr_emerg("Starting new kernel\n");
		machine_shutdown();
	}

	machine_kexec(kexec_image);

#ifdef CONFIG_KEXEC_JUMP
	if (kexec_image->preserve_context) {
		syscore_resume();
 Enable_irqs:
		local_irq_enable();
 Enable_cpus:
		enable_nonboot_cpus();
		dpm_resume_start(PMSG_RESTORE);
 Resume_devices:
		dpm_resume_end(PMSG_RESTORE);
 Resume_console:
		resume_console();
		thaw_processes();
 Restore_console:
		pm_restore_console();
		unlock_system_sleep();
	}
#endif

 Unlock:
	mutex_unlock(&kexec_mutex);
	return error;
}

/*
 * Protection mechanism for crashkernel reserved memory after
 * the kdump kernel is loaded.
 *
 * Provide an empty default implementation here -- architecture
 * code may override this
 */
void __weak arch_kexec_protect_crashkres(void)
{}

void __weak arch_kexec_unprotect_crashkres(void)
{}