// SPDX-License-Identifier: GPL-2.0-only
/*
 * kexec.c - kexec system call core code.
 * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
 */

#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/mm.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/kexec.h>
#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/highmem.h>
#include <linux/syscalls.h>
#include <linux/reboot.h>
#include <linux/ioport.h>
#include <linux/hardirq.h>
#include <linux/elf.h>
#include <linux/elfcore.h>
#include <linux/utsname.h>
#include <linux/numa.h>
#include <linux/suspend.h>
#include <linux/device.h>
#include <linux/freezer.h>
#include <linux/pm.h>
#include <linux/cpu.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/console.h>
#include <linux/vmalloc.h>
#include <linux/swap.h>
#include <linux/syscore_ops.h>
#include <linux/compiler.h>
#include <linux/hugetlb.h>
#include <linux/objtool.h>

#include <asm/page.h>
#include <asm/sections.h>

#include <crypto/hash.h>
#include "kexec_internal.h"

DEFINE_MUTEX(kexec_mutex);

/* Per cpu memory for storing cpu states in case of system crash. */
note_buf_t __percpu *crash_notes;

/* Flag to indicate we are going to kexec a new kernel */
bool kexec_in_progress = false;

/* Location of the reserved area for the crash kernel */
struct resource crashk_res = {
	.name  = "Crash kernel",
	.start = 0,
	.end   = 0,
	.flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
	.desc  = IORES_DESC_CRASH_KERNEL
};
struct resource crashk_low_res = {
	.name  = "Crash kernel",
	.start = 0,
	.end   = 0,
	.flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
	.desc  = IORES_DESC_CRASH_KERNEL
};

int kexec_should_crash(struct task_struct *p)
{
	/*
	 * If crash_kexec_post_notifiers is enabled, don't run
	 * crash_kexec() here yet, which must be run after panic
	 * notifiers in panic().
	 */
	if (crash_kexec_post_notifiers)
		return 0;
	/*
	 * There are 4 panic() calls in do_exit() path, each of which
	 * corresponds to each of these 4 conditions.
	 */
	if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops)
		return 1;
	return 0;
}

int kexec_crash_loaded(void)
{
	return !!kexec_crash_image;
}
EXPORT_SYMBOL_GPL(kexec_crash_loaded);

/*
 * When kexec transitions to the new kernel there is a one-to-one
 * mapping between physical and virtual addresses.  On processors
 * where you can disable the MMU this is trivial, and easy.  For
 * others it is still a simple predictable page table to setup.
 *
 * In that environment kexec copies the new kernel to its final
 * resting place.  This means I can only support memory whose
 * physical address can fit in an unsigned long.  In particular
 * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
 * If the assembly stub has more restrictive requirements
 * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
 * defined more restrictively in <asm/kexec.h>.
 *
 * The code for the transition from the current kernel to the
 * new kernel is placed in the control_code_buffer, whose size
 * is given by KEXEC_CONTROL_PAGE_SIZE.  In the best case only a single
 * page of memory is necessary, but some architectures require more.
 * Because this memory must be identity mapped in the transition from
 * virtual to physical addresses it must live in the range
 * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
 * modifiable.
 *
 * The assembly stub in the control code buffer is passed a linked list
 * of descriptor pages detailing the source pages of the new kernel,
 * and the destination addresses of those source pages.  As this data
 * structure is not used in the context of the current OS, it must
 * be self-contained.
 *
 * The code has been made to work with highmem pages and will use a
 * destination page in its final resting place (if it happens
 * to allocate it).  The end product of this is that most of the
 * physical address space, and most of RAM can be used.
 *
 * Future directions include:
 *  - allocating a page table with the control code buffer identity
 *    mapped, to simplify machine_kexec and make kexec_on_panic more
 *    reliable.
 */
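
/*
 * For illustration, the "linked list of descriptor pages" mentioned above
 * is a flat array of kimage_entry_t values (see <linux/kexec.h>): each
 * entry is a page-aligned physical address tagged in its low bits with
 * IND_DESTINATION, IND_SOURCE, IND_INDIRECTION or IND_DONE.  A simplified
 * chain, as built by kimage_add_entry() and friends below, might look
 * roughly like (dest0, src0, src1 and next are made-up addresses):
 *
 *	dest0 | IND_DESTINATION		start copying at dest0
 *	src0  | IND_SOURCE		copy this page to dest0
 *	src1  | IND_SOURCE		copy this page to dest0 + PAGE_SIZE
 *	next  | IND_INDIRECTION		the list continues in the page "next"
 *	...
 *	IND_DONE			end of list
 *
 * The assembly stub only has to walk this array, which keeps the data
 * structure self-contained as required.
 */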

/*
 * KIMAGE_NO_DEST is an impossible destination address..., for
 * allocating pages whose destination address we do not care about.
 */
#define KIMAGE_NO_DEST (-1UL)
#define PAGE_COUNT(x) (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)

static struct page *kimage_alloc_page(struct kimage *image,
				       gfp_t gfp_mask,
				       unsigned long dest);

int sanity_check_segment_list(struct kimage *image)
{
	int i;
	unsigned long nr_segments = image->nr_segments;
	unsigned long total_pages = 0;
	unsigned long nr_pages = totalram_pages();

	/*
	 * Verify we have good destination addresses.  The caller is
	 * responsible for making certain we don't attempt to load
	 * the new image into invalid or reserved areas of RAM.  This
	 * just verifies it is an address we can use.
	 *
	 * Since the kernel does everything in page size chunks ensure
	 * the destination addresses are page aligned.  Too many
	 * special cases crop up when we don't do this.  The most
	 * insidious is getting overlapping destination addresses
	 * simply because addresses are changed to page size
	 * granularity.
	 */
	for (i = 0; i < nr_segments; i++) {
		unsigned long mstart, mend;

		mstart = image->segment[i].mem;
		mend   = mstart + image->segment[i].memsz;
		if (mstart > mend)
			return -EADDRNOTAVAIL;
		if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
			return -EADDRNOTAVAIL;
		if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
			return -EADDRNOTAVAIL;
	}

	/* Verify our destination addresses do not overlap.
	 * If we allowed overlapping destination addresses
	 * through, very weird things can happen with no
	 * easy explanation as one segment stops on another.
	 */
	for (i = 0; i < nr_segments; i++) {
		unsigned long mstart, mend;
		unsigned long j;

		mstart = image->segment[i].mem;
		mend   = mstart + image->segment[i].memsz;
		for (j = 0; j < i; j++) {
			unsigned long pstart, pend;

			pstart = image->segment[j].mem;
			pend   = pstart + image->segment[j].memsz;
			/* Do the segments overlap ? */
			if ((mend > pstart) && (mstart < pend))
				return -EINVAL;
		}
	}

	/* Ensure our buffer sizes do not exceed
	 * our memory sizes.  This should always be the case,
	 * and it is easier to check up front than to be surprised
	 * later on.
	 */
	for (i = 0; i < nr_segments; i++) {
		if (image->segment[i].bufsz > image->segment[i].memsz)
			return -EINVAL;
	}

	/*
	 * Verify that no more than half of memory will be consumed. If the
	 * request from userspace is too large, a large amount of time will be
	 * wasted allocating pages, which can cause a soft lockup.
	 */
	for (i = 0; i < nr_segments; i++) {
		if (PAGE_COUNT(image->segment[i].memsz) > nr_pages / 2)
			return -EINVAL;

		total_pages += PAGE_COUNT(image->segment[i].memsz);
	}

	if (total_pages > nr_pages / 2)
		return -EINVAL;

	/*
	 * Verify we have good destination addresses.  Normally
	 * the caller is responsible for making certain we don't
	 * attempt to load the new image into invalid or reserved
	 * areas of RAM.  But crash kernels are preloaded into a
	 * reserved area of ram.  We must ensure the addresses
	 * are in the reserved area otherwise preloading the
	 * kernel could corrupt things.
	 */

	if (image->type == KEXEC_TYPE_CRASH) {
		for (i = 0; i < nr_segments; i++) {
			unsigned long mstart, mend;

			mstart = image->segment[i].mem;
			mend = mstart + image->segment[i].memsz - 1;
			/* Ensure we are within the crash kernel limits */
			if ((mstart < phys_to_boot_phys(crashk_res.start)) ||
			    (mend > phys_to_boot_phys(crashk_res.end)))
				return -EADDRNOTAVAIL;
		}
	}

	return 0;
}
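
/*
 * A worked example of the overlap test above, with made-up addresses:
 * segments are treated as half-open ranges [mem, mem + memsz), so
 * [0x1000, 0x3000) and [0x3000, 0x4000) do not overlap (mend > pstart
 * fails because 0x3000 > 0x3000 is false), while [0x1000, 0x3000) and
 * [0x2000, 0x4000) do.
 */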

struct kimage *do_kimage_alloc_init(void)
{
	struct kimage *image;

	/* Allocate a controlling structure */
	image = kzalloc(sizeof(*image), GFP_KERNEL);
	if (!image)
		return NULL;

	image->head = 0;
	image->entry = &image->head;
	image->last_entry = &image->head;
	image->control_page = ~0; /* By default this does not apply */
	image->type = KEXEC_TYPE_DEFAULT;

	/* Initialize the list of control pages */
	INIT_LIST_HEAD(&image->control_pages);

	/* Initialize the list of destination pages */
	INIT_LIST_HEAD(&image->dest_pages);

	/* Initialize the list of unusable pages */
	INIT_LIST_HEAD(&image->unusable_pages);

	return image;
}

int kimage_is_destination_range(struct kimage *image,
					unsigned long start,
					unsigned long end)
{
	unsigned long i;

	for (i = 0; i < image->nr_segments; i++) {
		unsigned long mstart, mend;

		mstart = image->segment[i].mem;
		mend = mstart + image->segment[i].memsz;
		if ((end > mstart) && (start < mend))
			return 1;
	}

	return 0;
}

static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
{
	struct page *pages;

	if (fatal_signal_pending(current))
		return NULL;
	pages = alloc_pages(gfp_mask & ~__GFP_ZERO, order);
	if (pages) {
		unsigned int count, i;

		pages->mapping = NULL;
		set_page_private(pages, order);
		count = 1 << order;
		for (i = 0; i < count; i++)
			SetPageReserved(pages + i);

		arch_kexec_post_alloc_pages(page_address(pages), count,
					    gfp_mask);

		if (gfp_mask & __GFP_ZERO)
			for (i = 0; i < count; i++)
				clear_highpage(pages + i);
	}

	return pages;
}

static void kimage_free_pages(struct page *page)
{
	unsigned int order, count, i;

	order = page_private(page);
	count = 1 << order;

	arch_kexec_pre_free_pages(page_address(page), count);

	for (i = 0; i < count; i++)
		ClearPageReserved(page + i);
	__free_pages(page, order);
}
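
/*
 * Note on the pairing above: kimage_alloc_pages() stashes the allocation
 * order in page_private() of the head page, so kimage_free_pages() can
 * recover how many pages to unreserve and free without extra bookkeeping.
 * A minimal usage sketch (purely illustrative):
 *
 *	struct page *p = kimage_alloc_pages(GFP_KERNEL, 1);	(two pages)
 *	if (p)
 *		kimage_free_pages(p);	(order read back via page_private())
 */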

void kimage_free_page_list(struct list_head *list)
{
	struct page *page, *next;

	list_for_each_entry_safe(page, next, list, lru) {
		list_del(&page->lru);
		kimage_free_pages(page);
	}
}

static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
							unsigned int order)
{
	/* Control pages are special, they are the intermediaries
	 * that are needed while we copy the rest of the pages
	 * to their final resting place.  As such they must
	 * not conflict with either the destination addresses
	 * or memory the kernel is already using.
	 *
	 * The only case where we really need more than one of
	 * these is for architectures where we cannot disable
	 * the MMU and must instead generate an identity mapped
	 * page table for all of the memory.
	 *
	 * At worst this runs in O(N) of the image size.
	 */
	struct list_head extra_pages;
	struct page *pages;
	unsigned int count;

	count = 1 << order;
	INIT_LIST_HEAD(&extra_pages);

	/* Loop while I can allocate a page and the page allocated
	 * is a destination page.
	 */
	do {
		unsigned long pfn, epfn, addr, eaddr;

		pages = kimage_alloc_pages(KEXEC_CONTROL_MEMORY_GFP, order);
		if (!pages)
			break;
		pfn   = page_to_boot_pfn(pages);
		epfn  = pfn + count;
		addr  = pfn << PAGE_SHIFT;
		eaddr = epfn << PAGE_SHIFT;
		if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
			      kimage_is_destination_range(image, addr, eaddr)) {
			list_add(&pages->lru, &extra_pages);
			pages = NULL;
		}
	} while (!pages);

	if (pages) {
		/* Remember the allocated page... */
		list_add(&pages->lru, &image->control_pages);

		/* Because the page is already in its destination
		 * location we will never allocate another page at
		 * that address.  Therefore kimage_alloc_pages
		 * will not return it (again) and we don't need
		 * to give it an entry in image->segment[].
		 */
	}
	/* Deal with the destination pages I have inadvertently allocated.
	 *
	 * Ideally I would convert multi-page allocations into single
	 * page allocations, and add everything to image->dest_pages.
	 *
	 * For now it is simpler to just free the pages.
	 */
	kimage_free_page_list(&extra_pages);

	return pages;
}
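
/*
 * The conflicting pages above are parked on a local list rather than freed
 * immediately, presumably so the page allocator cannot hand the same
 * conflicting pages straight back on the next iteration; they are only
 * released once a usable control page has been found (or the allocation
 * has failed for good).
 */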

static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
						      unsigned int order)
{
	/* Control pages are special, they are the intermediaries
	 * that are needed while we copy the rest of the pages
	 * to their final resting place.  As such they must
	 * not conflict with either the destination addresses
	 * or memory the kernel is already using.
	 *
	 * Control pages are also the only pages we must allocate
	 * when loading a crash kernel.  All of the other pages
	 * are specified by the segments and we just memcpy
	 * into them directly.
	 *
	 * The only case where we really need more than one of
	 * these is for architectures where we cannot disable
	 * the MMU and must instead generate an identity mapped
	 * page table for all of the memory.
	 *
	 * Given the low demand this implements a very simple
	 * allocator that finds the first hole of the appropriate
	 * size in the reserved memory region, and allocates all
	 * of the memory up to and including the hole.
	 */
	unsigned long hole_start, hole_end, size;
	struct page *pages;

	pages = NULL;
	size = (1 << order) << PAGE_SHIFT;
	hole_start = (image->control_page + (size - 1)) & ~(size - 1);
	hole_end   = hole_start + size - 1;
	while (hole_end <= crashk_res.end) {
		unsigned long i;

		cond_resched();

		if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT)
			break;
		/* See if I overlap any of the segments */
		for (i = 0; i < image->nr_segments; i++) {
			unsigned long mstart, mend;

			mstart = image->segment[i].mem;
			mend   = mstart + image->segment[i].memsz - 1;
			if ((hole_end >= mstart) && (hole_start <= mend)) {
				/* Advance the hole to the end of the segment */
				hole_start = (mend + (size - 1)) & ~(size - 1);
				hole_end   = hole_start + size - 1;
				break;
			}
		}
		/* If I don't overlap any segments I have found my hole! */
		if (i == image->nr_segments) {
			pages = pfn_to_page(hole_start >> PAGE_SHIFT);
			image->control_page = hole_end;
			break;
		}
	}

	/* Ensure that these pages are decrypted if SME is enabled. */
	if (pages)
		arch_kexec_post_alloc_pages(page_address(pages), 1 << order, 0);

	return pages;
}

struct page *kimage_alloc_control_pages(struct kimage *image,
					 unsigned int order)
{
	struct page *pages = NULL;

	switch (image->type) {
	case KEXEC_TYPE_DEFAULT:
		pages = kimage_alloc_normal_control_pages(image, order);
		break;
	case KEXEC_TYPE_CRASH:
		pages = kimage_alloc_crash_control_pages(image, order);
		break;
	}

	return pages;
}
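
/*
 * The hole search in kimage_alloc_crash_control_pages() uses the usual
 * power-of-two round-up idiom: with size = (1 << order) << PAGE_SHIFT,
 * (x + (size - 1)) & ~(size - 1) rounds x up to the next multiple of size.
 * For example (made-up numbers), with a 4 KiB size 0x1234 rounds up to
 * 0x2000 while 0x2000 stays at 0x2000.
 */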

int kimage_crash_copy_vmcoreinfo(struct kimage *image)
{
	struct page *vmcoreinfo_page;
	void *safecopy;

	if (image->type != KEXEC_TYPE_CRASH)
		return 0;

	/*
	 * For kdump, allocate one vmcoreinfo safe copy from the
	 * crash memory.  As we have arch_kexec_protect_crashkres()
	 * after the kexec syscall, it is naturally protected from
	 * write (even read) access under the kernel direct mapping.
	 * But on the other hand, we still need to operate on it when
	 * a crash happens, to generate the vmcoreinfo note, hence we
	 * rely on vmap for this purpose.
	 */
	vmcoreinfo_page = kimage_alloc_control_pages(image, 0);
	if (!vmcoreinfo_page) {
		pr_warn("Could not allocate vmcoreinfo buffer\n");
		return -ENOMEM;
	}
	safecopy = vmap(&vmcoreinfo_page, 1, VM_MAP, PAGE_KERNEL);
	if (!safecopy) {
		pr_warn("Could not vmap vmcoreinfo buffer\n");
		return -ENOMEM;
	}

	image->vmcoreinfo_data_copy = safecopy;
	crash_update_vmcoreinfo_safecopy(safecopy);

	return 0;
}

static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
{
	if (*image->entry != 0)
		image->entry++;

	if (image->entry == image->last_entry) {
		kimage_entry_t *ind_page;
		struct page *page;

		page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
		if (!page)
			return -ENOMEM;

		ind_page = page_address(page);
		*image->entry = virt_to_boot_phys(ind_page) | IND_INDIRECTION;
		image->entry = ind_page;
		image->last_entry = ind_page +
				      ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
	}
	*image->entry = entry;
	image->entry++;
	*image->entry = 0;

	return 0;
}

static int kimage_set_destination(struct kimage *image,
				   unsigned long destination)
{
	int result;

	destination &= PAGE_MASK;
	result = kimage_add_entry(image, destination | IND_DESTINATION);

	return result;
}

static int kimage_add_page(struct kimage *image, unsigned long page)
{
	int result;

	page &= PAGE_MASK;
	result = kimage_add_entry(image, page | IND_SOURCE);

	return result;
}

static void kimage_free_extra_pages(struct kimage *image)
{
	/* Walk through and free any extra destination pages I may have */
	kimage_free_page_list(&image->dest_pages);

	/* Walk through and free any unusable pages I have cached */
	kimage_free_page_list(&image->unusable_pages);

}

int __weak machine_kexec_post_load(struct kimage *image)
{
	return 0;
}

void kimage_terminate(struct kimage *image)
{
	if (*image->entry != 0)
		image->entry++;

	*image->entry = IND_DONE;
}
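
/*
 * Putting the primitives above together, loading one segment is roughly
 * (a simplified sketch of what kimage_load_normal_segment() does below;
 * the final kimage_terminate() call is made by the load path once every
 * segment has been added):
 *
 *	kimage_set_destination(image, segment->mem);
 *	for each page of the segment {
 *		page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
 *		copy the next chunk of data into page;
 *		kimage_add_page(image, page_to_boot_pfn(page) << PAGE_SHIFT);
 *	}
 *	...
 *	kimage_terminate(image);	appends IND_DONE
 */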

#define for_each_kimage_entry(image, ptr, entry) \
	for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
		ptr = (entry & IND_INDIRECTION) ? \
			boot_phys_to_virt((entry & PAGE_MASK)) : ptr + 1)

static void kimage_free_entry(kimage_entry_t entry)
{
	struct page *page;

	page = boot_pfn_to_page(entry >> PAGE_SHIFT);
	kimage_free_pages(page);
}

void kimage_free(struct kimage *image)
{
	kimage_entry_t *ptr, entry;
	kimage_entry_t ind = 0;

	if (!image)
		return;

	if (image->vmcoreinfo_data_copy) {
		crash_update_vmcoreinfo_safecopy(NULL);
		vunmap(image->vmcoreinfo_data_copy);
	}

	kimage_free_extra_pages(image);
	for_each_kimage_entry(image, ptr, entry) {
		if (entry & IND_INDIRECTION) {
			/* Free the previous indirection page */
			if (ind & IND_INDIRECTION)
				kimage_free_entry(ind);
			/* Save this indirection page until we are
			 * done with it.
			 */
			ind = entry;
		} else if (entry & IND_SOURCE)
			kimage_free_entry(entry);
	}
	/* Free the final indirection page */
	if (ind & IND_INDIRECTION)
		kimage_free_entry(ind);

	/* Handle any machine specific cleanup */
	machine_kexec_cleanup(image);

	/* Free the kexec control pages... */
	kimage_free_page_list(&image->control_pages);

	/*
	 * Free up any temporary buffers allocated.  This might be hit if
	 * an error occurred much later, after buffer allocation.
	 */
	if (image->file_mode)
		kimage_file_post_load_cleanup(image);

	kfree(image);
}

static kimage_entry_t *kimage_dst_used(struct kimage *image,
					unsigned long page)
{
	kimage_entry_t *ptr, entry;
	unsigned long destination = 0;

	for_each_kimage_entry(image, ptr, entry) {
		if (entry & IND_DESTINATION)
			destination = entry & PAGE_MASK;
		else if (entry & IND_SOURCE) {
			if (page == destination)
				return ptr;
			destination += PAGE_SIZE;
		}
	}

	return NULL;
}

static struct page *kimage_alloc_page(struct kimage *image,
					gfp_t gfp_mask,
					unsigned long destination)
{
	/*
	 * Here we implement safeguards to ensure that a source page
	 * is not copied to its destination page before the data on
	 * the destination page is no longer useful.
	 *
	 * To do this we maintain the invariant that a source page is
	 * either its own destination page, or it is not a
	 * destination page at all.
	 *
	 * That is slightly stronger than required, but the proof
	 * that no problems will occur is trivial, and the
	 * implementation is simple to verify.
	 *
	 * When allocating all pages normally this algorithm will run
	 * in O(N) time, but in the worst case it will run in O(N^2)
	 * time.  If the runtime is a problem, the data structures can
	 * be fixed.
	 */
	struct page *page;
	unsigned long addr;

	/*
	 * Walk through the list of destination pages, and see if I
	 * have a match.
	 */
	list_for_each_entry(page, &image->dest_pages, lru) {
		addr = page_to_boot_pfn(page) << PAGE_SHIFT;
		if (addr == destination) {
			list_del(&page->lru);
			return page;
		}
	}
	page = NULL;
	while (1) {
		kimage_entry_t *old;

		/* Allocate a page, if we run out of memory give up */
		page = kimage_alloc_pages(gfp_mask, 0);
		if (!page)
			return NULL;
		/* If the page cannot be used, file it away */
		if (page_to_boot_pfn(page) >
				(KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
			list_add(&page->lru, &image->unusable_pages);
			continue;
		}
		addr = page_to_boot_pfn(page) << PAGE_SHIFT;

		/* If it is the destination page we want, use it */
		if (addr == destination)
			break;

		/* If the page is not a destination page use it */
		if (!kimage_is_destination_range(image, addr,
						  addr + PAGE_SIZE))
			break;

		/*
		 * I know that the page is someone's destination page.
		 * See if there is already a source page for this
		 * destination page.  And if so swap the source pages.
		 */
		old = kimage_dst_used(image, addr);
		if (old) {
			/* If so move it */
			unsigned long old_addr;
			struct page *old_page;

			old_addr = *old & PAGE_MASK;
			old_page = boot_pfn_to_page(old_addr >> PAGE_SHIFT);
			copy_highpage(page, old_page);
			*old = addr | (*old & ~PAGE_MASK);

			/* The old page I have found cannot be a
			 * destination page, so return it if its
			 * gfp_flags honor the ones passed in.
			 */
			if (!(gfp_mask & __GFP_HIGHMEM) &&
			    PageHighMem(old_page)) {
				kimage_free_pages(old_page);
				continue;
			}
			addr = old_addr;
			page = old_page;
			break;
		}
		/* Place the page on the destination list, to be used later */
		list_add(&page->lru, &image->dest_pages);
	}

	return page;
}
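
/*
 * A concrete illustration of the swap above, with hypothetical addresses:
 * suppose we need destination 0x5000 and the allocator hands back the page
 * at 0x7000, but 0x7000 is itself someone's destination for which a source
 * page has already been queued.  The contents of that queued source page
 * are copied into the new page (which sits at its own destination), the
 * existing IND_SOURCE entry is repointed at 0x7000, and the old source
 * page is reused as the source page for 0x5000 - provided its gfp flags
 * honor the ones passed in.
 */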

static int kimage_load_normal_segment(struct kimage *image,
					 struct kexec_segment *segment)
{
	unsigned long maddr;
	size_t ubytes, mbytes;
	int result;
	unsigned char __user *buf = NULL;
	unsigned char *kbuf = NULL;

	result = 0;
	if (image->file_mode)
		kbuf = segment->kbuf;
	else
		buf = segment->buf;
	ubytes = segment->bufsz;
	mbytes = segment->memsz;
	maddr = segment->mem;

	result = kimage_set_destination(image, maddr);
	if (result < 0)
		goto out;

	while (mbytes) {
		struct page *page;
		char *ptr;
		size_t uchunk, mchunk;

		page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
		if (!page) {
			result  = -ENOMEM;
			goto out;
		}
		result = kimage_add_page(image, page_to_boot_pfn(page)
								<< PAGE_SHIFT);
		if (result < 0)
			goto out;

		ptr = kmap(page);
		/* Start with a clear page */
		clear_page(ptr);
		ptr += maddr & ~PAGE_MASK;
		mchunk = min_t(size_t, mbytes,
				PAGE_SIZE - (maddr & ~PAGE_MASK));
		uchunk = min(ubytes, mchunk);

		/* For file based kexec, source pages are in kernel memory */
		if (image->file_mode)
			memcpy(ptr, kbuf, uchunk);
		else
			result = copy_from_user(ptr, buf, uchunk);
		kunmap(page);
		if (result) {
			result = -EFAULT;
			goto out;
		}
		ubytes -= uchunk;
		maddr  += mchunk;
		if (image->file_mode)
			kbuf += mchunk;
		else
			buf += mchunk;
		mbytes -= mchunk;

		cond_resched();
	}
out:
	return result;
}
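
/*
 * In the copy loop above, mchunk tracks how far we advance through the
 * destination (memsz) while uchunk tracks how much of the user-supplied
 * buffer (bufsz) is actually copied; once ubytes is exhausted, uchunk
 * drops to zero and the remaining destination pages are left as the
 * zeroed pages produced by clear_page(), which is how a segment with
 * memsz > bufsz gets its zero-filled tail.
 */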

static int kimage_load_crash_segment(struct kimage *image,
					struct kexec_segment *segment)
{
	/* For crash dump kernels we simply copy the data from
	 * user space to its destination.
	 * We do things a page at a time for the sake of kmap.
	 */
	unsigned long maddr;
	size_t ubytes, mbytes;
	int result;
	unsigned char __user *buf = NULL;
	unsigned char *kbuf = NULL;

	result = 0;
	if (image->file_mode)
		kbuf = segment->kbuf;
	else
		buf = segment->buf;
	ubytes = segment->bufsz;
	mbytes = segment->memsz;
	maddr = segment->mem;
	while (mbytes) {
		struct page *page;
		char *ptr;
		size_t uchunk, mchunk;

		page = boot_pfn_to_page(maddr >> PAGE_SHIFT);
		if (!page) {
			result  = -ENOMEM;
			goto out;
		}
		arch_kexec_post_alloc_pages(page_address(page), 1, 0);
		ptr = kmap(page);
		ptr += maddr & ~PAGE_MASK;
		mchunk = min_t(size_t, mbytes,
				PAGE_SIZE - (maddr & ~PAGE_MASK));
		uchunk = min(ubytes, mchunk);
		if (mchunk > uchunk) {
			/* Zero the trailing part of the page */
			memset(ptr + uchunk, 0, mchunk - uchunk);
		}

		/* For file based kexec, source pages are in kernel memory */
		if (image->file_mode)
			memcpy(ptr, kbuf, uchunk);
		else
			result = copy_from_user(ptr, buf, uchunk);
		kexec_flush_icache_page(page);
		kunmap(page);
		arch_kexec_pre_free_pages(page_address(page), 1);
		if (result) {
			result = -EFAULT;
			goto out;
		}
		ubytes -= uchunk;
		maddr  += mchunk;
		if (image->file_mode)
			kbuf += mchunk;
		else
			buf += mchunk;
		mbytes -= mchunk;

		cond_resched();
	}
out:
	return result;
}

int kimage_load_segment(struct kimage *image,
				struct kexec_segment *segment)
{
	int result = -ENOMEM;

	switch (image->type) {
	case KEXEC_TYPE_DEFAULT:
		result = kimage_load_normal_segment(image, segment);
		break;
	case KEXEC_TYPE_CRASH:
		result = kimage_load_crash_segment(image, segment);
		break;
	}

	return result;
}

struct kimage *kexec_image;
struct kimage *kexec_crash_image;
int kexec_load_disabled;

/*
 * No panic_cpu check version of crash_kexec().  This function is called
 * only when panic_cpu holds the current CPU number; this is the only CPU
 * which processes crash_kexec routines.
 */
void __noclone __crash_kexec(struct pt_regs *regs)
{
	/* Take the kexec_mutex here to prevent sys_kexec_load
	 * running on one cpu from replacing the crash kernel
	 * we are using after a panic on a different cpu.
	 *
	 * If the crash kernel was not located in a fixed area
	 * of memory the xchg(&kexec_crash_image) would be
	 * sufficient.  But since I reuse the memory...
	 */
	if (mutex_trylock(&kexec_mutex)) {
		if (kexec_crash_image) {
			struct pt_regs fixed_regs;

			crash_setup_regs(&fixed_regs, regs);
			crash_save_vmcoreinfo();
			machine_crash_shutdown(&fixed_regs);
			machine_kexec(kexec_crash_image);
		}
		mutex_unlock(&kexec_mutex);
	}
}
STACK_FRAME_NON_STANDARD(__crash_kexec);

void crash_kexec(struct pt_regs *regs)
{
	int old_cpu, this_cpu;

	/*
	 * Only one CPU is allowed to execute the crash_kexec() code as with
	 * panic().  Otherwise parallel calls of panic() and crash_kexec()
	 * may stop each other.  To exclude them, we use panic_cpu here too.
	 */
	this_cpu = raw_smp_processor_id();
	old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu);
	if (old_cpu == PANIC_CPU_INVALID) {
		/* This is the 1st CPU which comes here, so go ahead. */
		printk_safe_flush_on_panic();
		__crash_kexec(regs);

		/*
		 * Reset panic_cpu to allow another panic()/crash_kexec()
		 * call.
		 */
		atomic_set(&panic_cpu, PANIC_CPU_INVALID);
	}
}
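
/*
 * The atomic_cmpxchg() above mirrors the panic_cpu handling in panic():
 * whichever CPU first swaps its id into panic_cpu performs the crash
 * kexec, and any other CPU that races in here simply returns, so
 * __crash_kexec() is never entered from two CPUs at once.
 */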

size_t crash_get_memory_size(void)
{
	size_t size = 0;

	mutex_lock(&kexec_mutex);
	if (crashk_res.end != crashk_res.start)
		size = resource_size(&crashk_res);
	mutex_unlock(&kexec_mutex);
	return size;
}

void __weak crash_free_reserved_phys_range(unsigned long begin,
					   unsigned long end)
{
	unsigned long addr;

	for (addr = begin; addr < end; addr += PAGE_SIZE)
		free_reserved_page(boot_pfn_to_page(addr >> PAGE_SHIFT));
}

int crash_shrink_memory(unsigned long new_size)
{
	int ret = 0;
	unsigned long start, end;
	unsigned long old_size;
	struct resource *ram_res;

	mutex_lock(&kexec_mutex);

	if (kexec_crash_image) {
		ret = -ENOENT;
		goto unlock;
	}
	start = crashk_res.start;
	end = crashk_res.end;
	old_size = (end == 0) ? 0 : end - start + 1;
	if (new_size >= old_size) {
		ret = (new_size == old_size) ? 0 : -EINVAL;
		goto unlock;
	}

	ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL);
	if (!ram_res) {
		ret = -ENOMEM;
		goto unlock;
	}

	start = roundup(start, KEXEC_CRASH_MEM_ALIGN);
	end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN);

	crash_free_reserved_phys_range(end, crashk_res.end);

	if ((start == end) && (crashk_res.parent != NULL))
		release_resource(&crashk_res);

	ram_res->start = end;
	ram_res->end = crashk_res.end;
	ram_res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM;
	ram_res->name = "System RAM";

	crashk_res.end = end - 1;

	insert_resource(&iomem_resource, ram_res);

unlock:
	mutex_unlock(&kexec_mutex);
	return ret;
}

void crash_save_cpu(struct pt_regs *regs, int cpu)
{
	struct elf_prstatus prstatus;
	u32 *buf;

	if ((cpu < 0) || (cpu >= nr_cpu_ids))
		return;

	/* Using ELF notes here is opportunistic.
	 * I need a well defined structure format
	 * for the data I pass, and I need tags
	 * on the data to indicate what information I have
	 * squirrelled away.  ELF notes happen to provide
	 * all of that, so there is no need to invent something new.
	 */
	buf = (u32 *)per_cpu_ptr(crash_notes, cpu);
	if (!buf)
		return;
	memset(&prstatus, 0, sizeof(prstatus));
	prstatus.common.pr_pid = current->pid;
	elf_core_copy_kernel_regs(&prstatus.pr_reg, regs);
	buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
			      &prstatus, sizeof(prstatus));
	final_note(buf);
}

static int __init crash_notes_memory_init(void)
{
	/* Allocate memory for saving cpu registers. */
	size_t size, align;

	/*
	 * crash_notes could be allocated across 2 vmalloc pages when percpu
	 * is vmalloc based.  vmalloc doesn't guarantee that 2 contiguous
	 * vmalloc pages are also on 2 contiguous physical pages.  In that
	 * case the 2nd part of crash_notes in the 2nd page could be lost,
	 * since only the starting address and size of crash_notes are
	 * exported through sysfs.  Here round up the size of crash_notes
	 * to the nearest power of two and pass it to __alloc_percpu as the
	 * align value.  This makes sure crash_notes is allocated inside one
	 * physical page.
	 */
	size = sizeof(note_buf_t);
	align = min(roundup_pow_of_two(sizeof(note_buf_t)), PAGE_SIZE);

	/*
	 * Break the compile if size is bigger than PAGE_SIZE, since
	 * crash_notes would definitely span 2 pages in that case.
	 */
	BUILD_BUG_ON(size > PAGE_SIZE);

	crash_notes = __alloc_percpu(size, align);
	if (!crash_notes) {
		pr_warn("Memory allocation for saving cpu register states failed\n");
		return -ENOMEM;
	}
	return 0;
}
subsys_initcall(crash_notes_memory_init);
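
/*
 * Example of the sizing logic above, with a made-up size: if
 * sizeof(note_buf_t) were 420 bytes, align would round up to 512; since
 * 512 divides PAGE_SIZE evenly, a 420-byte allocation aligned to 512 can
 * never straddle a page boundary, which is exactly the property the
 * sysfs export of the (address, size) pair relies on.
 */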

/*
 * Move into place and start executing a preloaded standalone
 * executable.  If nothing was preloaded return an error.
 */
int kernel_kexec(void)
{
	int error = 0;

	if (!mutex_trylock(&kexec_mutex))
		return -EBUSY;
	if (!kexec_image) {
		error = -EINVAL;
		goto Unlock;
	}

#ifdef CONFIG_KEXEC_JUMP
	if (kexec_image->preserve_context) {
		pm_prepare_console();
		error = freeze_processes();
		if (error) {
			error = -EBUSY;
			goto Restore_console;
		}
		suspend_console();
		error = dpm_suspend_start(PMSG_FREEZE);
		if (error)
			goto Resume_console;
		/* At this point, dpm_suspend_start() has been called,
		 * but *not* dpm_suspend_end().  We *must* call
		 * dpm_suspend_end() now.  Otherwise, drivers for
		 * some devices (e.g. interrupt controllers) become
		 * desynchronized with the actual state of the
		 * hardware at resume time, and evil weirdness ensues.
		 */
		error = dpm_suspend_end(PMSG_FREEZE);
		if (error)
			goto Resume_devices;
		error = suspend_disable_secondary_cpus();
		if (error)
			goto Enable_cpus;
		local_irq_disable();
		error = syscore_suspend();
		if (error)
			goto Enable_irqs;
	} else
#endif
	{
		kexec_in_progress = true;
		kernel_restart_prepare(NULL);
		migrate_to_reboot_cpu();

		/*
		 * migrate_to_reboot_cpu() disables CPU hotplug assuming that
		 * no further code needs to use CPU hotplug (which is true in
		 * the reboot case). However, the kexec path depends on using
		 * CPU hotplug again; so re-enable it here.
		 */
		cpu_hotplug_enable();
		pr_notice("Starting new kernel\n");
		machine_shutdown();
	}

	machine_kexec(kexec_image);

#ifdef CONFIG_KEXEC_JUMP
	if (kexec_image->preserve_context) {
		syscore_resume();
 Enable_irqs:
		local_irq_enable();
 Enable_cpus:
		suspend_enable_secondary_cpus();
		dpm_resume_start(PMSG_RESTORE);
 Resume_devices:
		dpm_resume_end(PMSG_RESTORE);
 Resume_console:
		resume_console();
		thaw_processes();
 Restore_console:
		pm_restore_console();
	}
#endif

 Unlock:
	mutex_unlock(&kexec_mutex);
	return error;
}

/*
 * Protection mechanism for crashkernel reserved memory after
 * the kdump kernel is loaded.
 *
 * Provide an empty default implementation here -- architecture
 * code may override this
 */
void __weak arch_kexec_protect_crashkres(void)
{}

void __weak arch_kexec_unprotect_crashkres(void)
{}