/*
 * kexec.c - kexec system call
 * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
 *
 * This source code is licensed under the GNU General Public License,
 * Version 2.  See the file COPYING for more details.
 */

#include <linux/capability.h>
#include <linux/mm.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/kexec.h>
#include <linux/spinlock.h>
#include <linux/list.h>
#include <linux/highmem.h>
#include <linux/syscalls.h>
#include <linux/reboot.h>
#include <linux/ioport.h>
#include <linux/hardirq.h>
#include <linux/elf.h>
#include <linux/elfcore.h>
#include <linux/utsrelease.h>
#include <linux/utsname.h>
#include <linux/numa.h>
#include <linux/suspend.h>
#include <linux/device.h>
#include <linux/freezer.h>
#include <linux/pm.h>
#include <linux/cpu.h>
#include <linux/console.h>

#include <asm/page.h>
#include <asm/uaccess.h>
#include <asm/io.h>
#include <asm/system.h>
#include <asm/sections.h>

/* Per cpu memory for storing cpu states in case of system crash. */
note_buf_t *crash_notes;

/* vmcoreinfo stuff */
unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
size_t vmcoreinfo_size;
size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);

/* Location of the reserved area for the crash kernel */
struct resource crashk_res = {
	.name  = "Crash kernel",
	.start = 0,
	.end   = 0,
	.flags = IORESOURCE_BUSY | IORESOURCE_MEM
};

int kexec_should_crash(struct task_struct *p)
{
	if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops)
		return 1;
	return 0;
}

/*
 * When kexec transitions to the new kernel there is a one-to-one
 * mapping between physical and virtual addresses.  On processors
 * where you can disable the MMU this is trivial, and easy.  For
 * others it is still a simple predictable page table to setup.
 *
 * In that environment kexec copies the new kernel to its final
 * resting place.  This means I can only support memory whose
 * physical address can fit in an unsigned long.  In particular
 * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
 * If the assembly stub has more restrictive requirements
 * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
 * defined more restrictively in <asm/kexec.h>.
 *
 * The code for the transition from the current kernel to the new
 * kernel is placed in the control_code_buffer, whose size is given
 * by KEXEC_CONTROL_CODE_SIZE.  In the best case only a single page
 * of memory is necessary, but some architectures require more.
 * Because this memory must be identity mapped in the transition from
 * virtual to physical addresses it must live in the range
 * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
 * modifiable.
 *
 * The assembly stub in the control code buffer is passed a linked list
 * of descriptor pages detailing the source pages of the new kernel,
 * and the destination addresses of those source pages.  As this data
 * structure is not used in the context of the current OS, it must
 * be self-contained.
 *
 * The code has been made to work with highmem pages and will use a
 * destination page in its final resting place (if it happens
 * to allocate it).  The end product of this is that most of the
 * physical address space, and most of RAM can be used.
 *
 * Future directions include:
 *  - allocating a page table with the control code buffer identity
 *    mapped, to simplify machine_kexec and make kexec_on_panic more
 *    reliable.
 */
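
/*
 * A rough sketch of the descriptor list mentioned above, as built by
 * kimage_add_entry() below (the names "dest", "src" and "next" are
 * illustrative placeholders): the list is a flat stream of
 * kimage_entry_t values, each a page-aligned physical address tagged
 * in its low bits.
 *
 *   dest | IND_DESTINATION   set the current destination address
 *   src  | IND_SOURCE        copy this source page to the current
 *                            destination, then advance the
 *                            destination by PAGE_SIZE
 *   next | IND_INDIRECTION   continue reading entries from the page
 *                            at "next"
 *   IND_DONE                 end of the list
 *
 * for_each_kimage_entry() and the assembly stub walk this same
 * structure.
 */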

/*
 * KIMAGE_NO_DEST is an impossible destination address..., for
 * allocating pages whose destination address we do not care about.
 */
#define KIMAGE_NO_DEST (-1UL)

static int kimage_is_destination_range(struct kimage *image,
				       unsigned long start, unsigned long end);
static struct page *kimage_alloc_page(struct kimage *image,
				      gfp_t gfp_mask,
				      unsigned long dest);

static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
			   unsigned long nr_segments,
			   struct kexec_segment __user *segments)
{
	size_t segment_bytes;
	struct kimage *image;
	unsigned long i;
	int result;

	/* Allocate a controlling structure */
	result = -ENOMEM;
	image = kzalloc(sizeof(*image), GFP_KERNEL);
	if (!image)
		goto out;

	image->head = 0;
	image->entry = &image->head;
	image->last_entry = &image->head;
	image->control_page = ~0; /* By default this does not apply */
	image->start = entry;
	image->type = KEXEC_TYPE_DEFAULT;

	/* Initialize the list of control pages */
	INIT_LIST_HEAD(&image->control_pages);

	/* Initialize the list of destination pages */
	INIT_LIST_HEAD(&image->dest_pages);

	/* Initialize the list of unusable pages */
	INIT_LIST_HEAD(&image->unuseable_pages);

	/* Read in the segments */
	image->nr_segments = nr_segments;
	segment_bytes = nr_segments * sizeof(*segments);
	result = copy_from_user(image->segment, segments, segment_bytes);
	if (result)
		goto out;

	/*
	 * Verify we have good destination addresses.  The caller is
	 * responsible for making certain we don't attempt to load
	 * the new image into invalid or reserved areas of RAM.  This
	 * just verifies it is an address we can use.
	 *
	 * Since the kernel does everything in page size chunks ensure
	 * the destination addresses are page aligned.  Too many
	 * special cases crop up when we don't do this.  The most
	 * insidious is getting overlapping destination addresses
	 * simply because addresses are changed to page size
	 * granularity.
	 */
	result = -EADDRNOTAVAIL;
	for (i = 0; i < nr_segments; i++) {
		unsigned long mstart, mend;

		mstart = image->segment[i].mem;
		mend = mstart + image->segment[i].memsz;
		if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
			goto out;
		if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
			goto out;
	}

	/* Verify our destination addresses do not overlap.
	 * If we allowed overlapping destination addresses
	 * through, very weird things can happen with no
	 * easy explanation as one segment stops on another.
	 */
	result = -EINVAL;
	for (i = 0; i < nr_segments; i++) {
		unsigned long mstart, mend;
		unsigned long j;

		mstart = image->segment[i].mem;
		mend = mstart + image->segment[i].memsz;
		for (j = 0; j < i; j++) {
			unsigned long pstart, pend;
			pstart = image->segment[j].mem;
			pend = pstart + image->segment[j].memsz;
			/* Do the segments overlap ? */
			if ((mend > pstart) && (mstart < pend))
				goto out;
		}
	}

	/* Ensure our buffer sizes are strictly less than
	 * our memory sizes.  This should always be the case,
	 * and it is easier to check up front than to be surprised
	 * later on.
	 */
	result = -EINVAL;
	for (i = 0; i < nr_segments; i++) {
		if (image->segment[i].bufsz > image->segment[i].memsz)
			goto out;
	}

	result = 0;
out:
	if (result == 0)
		*rimage = image;
	else
		kfree(image);

	return result;

}

static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
				unsigned long nr_segments,
				struct kexec_segment __user *segments)
{
	int result;
	struct kimage *image;

	/* Allocate and initialize a controlling structure */
	image = NULL;
	result = do_kimage_alloc(&image, entry, nr_segments, segments);
	if (result)
		goto out;

	*rimage = image;

	/*
	 * Find a location for the control code buffer, and add it
	 * to the vector of segments so that its pages will also be
	 * counted as destination pages.
	 */
	result = -ENOMEM;
	image->control_code_page = kimage_alloc_control_pages(image,
					get_order(KEXEC_CONTROL_CODE_SIZE));
	if (!image->control_code_page) {
		printk(KERN_ERR "Could not allocate control_code_buffer\n");
		goto out;
	}

	image->swap_page = kimage_alloc_control_pages(image, 0);
	if (!image->swap_page) {
		printk(KERN_ERR "Could not allocate swap buffer\n");
		goto out;
	}

	result = 0;
out:
	if (result == 0)
		*rimage = image;
	else
		kfree(image);

	return result;
}

static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
				unsigned long nr_segments,
				struct kexec_segment __user *segments)
{
	int result;
	struct kimage *image;
	unsigned long i;

	image = NULL;
	/* Verify we have a valid entry point */
	if ((entry < crashk_res.start) || (entry > crashk_res.end)) {
		result = -EADDRNOTAVAIL;
		goto out;
	}

	/* Allocate and initialize a controlling structure */
	result = do_kimage_alloc(&image, entry, nr_segments, segments);
	if (result)
		goto out;

	/* Enable the special crash kernel control page
	 * allocation policy.
	 */
	image->control_page = crashk_res.start;
	image->type = KEXEC_TYPE_CRASH;

	/*
	 * Verify we have good destination addresses.  Normally
	 * the caller is responsible for making certain we don't
	 * attempt to load the new image into invalid or reserved
	 * areas of RAM.  But crash kernels are preloaded into a
	 * reserved area of RAM.  We must ensure the addresses
	 * are in the reserved area otherwise preloading the
	 * kernel could corrupt things.
	 */
	result = -EADDRNOTAVAIL;
	for (i = 0; i < nr_segments; i++) {
		unsigned long mstart, mend;

		mstart = image->segment[i].mem;
		mend = mstart + image->segment[i].memsz - 1;
		/* Ensure we are within the crash kernel limits */
		if ((mstart < crashk_res.start) || (mend > crashk_res.end))
			goto out;
	}

	/*
	 * Find a location for the control code buffer, and add
	 * it to the vector of segments so that its pages will also
	 * be counted as destination pages.
	 */
	result = -ENOMEM;
	image->control_code_page = kimage_alloc_control_pages(image,
					get_order(KEXEC_CONTROL_CODE_SIZE));
	if (!image->control_code_page) {
		printk(KERN_ERR "Could not allocate control_code_buffer\n");
		goto out;
	}

	result = 0;
out:
	if (result == 0)
		*rimage = image;
	else
		kfree(image);

	return result;
}

static int kimage_is_destination_range(struct kimage *image,
					unsigned long start,
					unsigned long end)
{
	unsigned long i;

	for (i = 0; i < image->nr_segments; i++) {
		unsigned long mstart, mend;

		mstart = image->segment[i].mem;
		mend = mstart + image->segment[i].memsz;
		if ((end > mstart) && (start < mend))
			return 1;
	}

	return 0;
}

static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
{
	struct page *pages;

	pages = alloc_pages(gfp_mask, order);
	if (pages) {
		unsigned int count, i;
		pages->mapping = NULL;
		set_page_private(pages, order);
		count = 1 << order;
		for (i = 0; i < count; i++)
			SetPageReserved(pages + i);
	}

	return pages;
}

static void kimage_free_pages(struct page *page)
{
	unsigned int order, count, i;

	order = page_private(page);
	count = 1 << order;
	for (i = 0; i < count; i++)
		ClearPageReserved(page + i);
	__free_pages(page, order);
}

static void kimage_free_page_list(struct list_head *list)
{
	struct list_head *pos, *next;

	list_for_each_safe(pos, next, list) {
		struct page *page;

		page = list_entry(pos, struct page, lru);
		list_del(&page->lru);
		kimage_free_pages(page);
	}
}

static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
							unsigned int order)
{
	/* Control pages are special, they are the intermediaries
	 * that are needed while we copy the rest of the pages
	 * to their final resting place.  As such they must
	 * not conflict with either the destination addresses
	 * or memory the kernel is already using.
	 *
	 * The only case where we really need more than one of
	 * these are for architectures where we cannot disable
	 * the MMU and must instead generate an identity mapped
	 * page table for all of the memory.
	 *
	 * At worst this runs in O(N) of the image size.
	 */
	struct list_head extra_pages;
	struct page *pages;
	unsigned int count;

	count = 1 << order;
	INIT_LIST_HEAD(&extra_pages);

	/* Loop while I can allocate a page and the page allocated
	 * is a destination page.
	 */
	do {
		unsigned long pfn, epfn, addr, eaddr;

		pages = kimage_alloc_pages(GFP_KERNEL, order);
		if (!pages)
			break;
		pfn   = page_to_pfn(pages);
		epfn  = pfn + count;
		addr  = pfn << PAGE_SHIFT;
		eaddr = epfn << PAGE_SHIFT;
		if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
			      kimage_is_destination_range(image, addr, eaddr)) {
			list_add(&pages->lru, &extra_pages);
			pages = NULL;
		}
	} while (!pages);

	if (pages) {
		/* Remember the allocated page... */
		list_add(&pages->lru, &image->control_pages);

		/* Because the page is already in its destination
		 * location we will never allocate another page at
		 * that address.  Therefore kimage_alloc_pages
		 * will not return it (again) and we don't need
		 * to give it an entry in image->segment[].
		 */
	}
	/* Deal with the destination pages I have inadvertently allocated.
	 *
	 * Ideally I would convert multi-page allocations into single
	 * page allocations, and add everything to image->dest_pages.
	 *
	 * For now it is simpler to just free the pages.
	 */
	kimage_free_page_list(&extra_pages);

	return pages;
}

static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
						      unsigned int order)
{
	/* Control pages are special, they are the intermediaries
	 * that are needed while we copy the rest of the pages
	 * to their final resting place.  As such they must
	 * not conflict with either the destination addresses
	 * or memory the kernel is already using.
	 *
	 * Control pages are also the only pages we must allocate
	 * when loading a crash kernel.  All of the other pages
	 * are specified by the segments and we just memcpy
	 * into them directly.
	 *
	 * The only case where we really need more than one of
	 * these are for architectures where we cannot disable
	 * the MMU and must instead generate an identity mapped
	 * page table for all of the memory.
	 *
	 * Given the low demand this implements a very simple
	 * allocator that finds the first hole of the appropriate
	 * size in the reserved memory region, and allocates all
	 * of the memory up to and including the hole.
	 */
	unsigned long hole_start, hole_end, size;
	struct page *pages;

	pages = NULL;
	size = (1 << order) << PAGE_SHIFT;
	hole_start = (image->control_page + (size - 1)) & ~(size - 1);
	hole_end   = hole_start + size - 1;
	while (hole_end <= crashk_res.end) {
		unsigned long i;

		if (hole_end > KEXEC_CONTROL_MEMORY_LIMIT)
			break;
		if (hole_end > crashk_res.end)
			break;
		/* See if I overlap any of the segments */
		for (i = 0; i < image->nr_segments; i++) {
			unsigned long mstart, mend;

			mstart = image->segment[i].mem;
			mend   = mstart + image->segment[i].memsz - 1;
			if ((hole_end >= mstart) && (hole_start <= mend)) {
				/* Advance the hole to the end of the segment */
				hole_start = (mend + (size - 1)) & ~(size - 1);
				hole_end   = hole_start + size - 1;
				break;
			}
		}
		/* If I don't overlap any segments I have found my hole!
		 */
		if (i == image->nr_segments) {
			pages = pfn_to_page(hole_start >> PAGE_SHIFT);
			break;
		}
	}
	if (pages)
		image->control_page = hole_end;

	return pages;
}


struct page *kimage_alloc_control_pages(struct kimage *image,
					 unsigned int order)
{
	struct page *pages = NULL;

	switch (image->type) {
	case KEXEC_TYPE_DEFAULT:
		pages = kimage_alloc_normal_control_pages(image, order);
		break;
	case KEXEC_TYPE_CRASH:
		pages = kimage_alloc_crash_control_pages(image, order);
		break;
	}

	return pages;
}

static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
{
	if (*image->entry != 0)
		image->entry++;

	if (image->entry == image->last_entry) {
		kimage_entry_t *ind_page;
		struct page *page;

		page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
		if (!page)
			return -ENOMEM;

		ind_page = page_address(page);
		*image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
		image->entry = ind_page;
		image->last_entry = ind_page +
				((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
	}
	*image->entry = entry;
	image->entry++;
	*image->entry = 0;

	return 0;
}

static int kimage_set_destination(struct kimage *image,
				   unsigned long destination)
{
	int result;

	destination &= PAGE_MASK;
	result = kimage_add_entry(image, destination | IND_DESTINATION);
	if (result == 0)
		image->destination = destination;

	return result;
}


static int kimage_add_page(struct kimage *image, unsigned long page)
{
	int result;

	page &= PAGE_MASK;
	result = kimage_add_entry(image, page | IND_SOURCE);
	if (result == 0)
		image->destination += PAGE_SIZE;

	return result;
}


static void kimage_free_extra_pages(struct kimage *image)
{
	/* Walk through and free any extra destination pages I may have */
	kimage_free_page_list(&image->dest_pages);

	/* Walk through and free any unusable pages I have cached */
	kimage_free_page_list(&image->unuseable_pages);

}

static void kimage_terminate(struct kimage *image)
{
	if (*image->entry != 0)
		image->entry++;

	*image->entry = IND_DONE;
}

#define for_each_kimage_entry(image, ptr, entry) \
	for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
		ptr = (entry & IND_INDIRECTION)? \
			phys_to_virt((entry & PAGE_MASK)): ptr +1)

static void kimage_free_entry(kimage_entry_t entry)
{
	struct page *page;

	page = pfn_to_page(entry >> PAGE_SHIFT);
	kimage_free_pages(page);
}

static void kimage_free(struct kimage *image)
{
	kimage_entry_t *ptr, entry;
	kimage_entry_t ind = 0;

	if (!image)
		return;

	kimage_free_extra_pages(image);
	for_each_kimage_entry(image, ptr, entry) {
		if (entry & IND_INDIRECTION) {
			/* Free the previous indirection page */
			if (ind & IND_INDIRECTION)
				kimage_free_entry(ind);
			/* Save this indirection page until we are
			 * done with it.
			 */
			ind = entry;
		}
		else if (entry & IND_SOURCE)
			kimage_free_entry(entry);
	}
	/* Free the final indirection page */
	if (ind & IND_INDIRECTION)
		kimage_free_entry(ind);

	/* Handle any machine specific cleanup */
	machine_kexec_cleanup(image);

	/* Free the kexec control pages...
	 */
	kimage_free_page_list(&image->control_pages);
	kfree(image);
}

static kimage_entry_t *kimage_dst_used(struct kimage *image,
					unsigned long page)
{
	kimage_entry_t *ptr, entry;
	unsigned long destination = 0;

	for_each_kimage_entry(image, ptr, entry) {
		if (entry & IND_DESTINATION)
			destination = entry & PAGE_MASK;
		else if (entry & IND_SOURCE) {
			if (page == destination)
				return ptr;
			destination += PAGE_SIZE;
		}
	}

	return NULL;
}

static struct page *kimage_alloc_page(struct kimage *image,
					gfp_t gfp_mask,
					unsigned long destination)
{
	/*
	 * Here we implement safeguards to ensure that a source page
	 * is not copied to its destination page before the data on
	 * the destination page is no longer useful.
	 *
	 * To do this we maintain the invariant that a source page is
	 * either its own destination page, or it is not a
	 * destination page at all.
	 *
	 * That is slightly stronger than required, but the proof
	 * that no problems will occur is trivial, and the
	 * implementation is simple to verify.
	 *
	 * When allocating all pages normally this algorithm will run
	 * in O(N) time, but in the worst case it will run in O(N^2)
	 * time.  If the runtime is a problem the data structures can
	 * be fixed.
	 */
	struct page *page;
	unsigned long addr;

	/*
	 * Walk through the list of destination pages, and see if I
	 * have a match.
	 */
	list_for_each_entry(page, &image->dest_pages, lru) {
		addr = page_to_pfn(page) << PAGE_SHIFT;
		if (addr == destination) {
			list_del(&page->lru);
			return page;
		}
	}
	page = NULL;
	while (1) {
		kimage_entry_t *old;

		/* Allocate a page, if we run out of memory give up */
		page = kimage_alloc_pages(gfp_mask, 0);
		if (!page)
			return NULL;
		/* If the page cannot be used file it away */
		if (page_to_pfn(page) >
				(KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
			list_add(&page->lru, &image->unuseable_pages);
			continue;
		}
		addr = page_to_pfn(page) << PAGE_SHIFT;

		/* If it is the destination page we want, use it */
		if (addr == destination)
			break;

		/* If the page is not a destination page use it */
		if (!kimage_is_destination_range(image, addr,
						  addr + PAGE_SIZE))
			break;

		/*
		 * I know that the page is someone's destination page.
		 * See if there is already a source page for this
		 * destination page.  And if so swap the source pages.
		 */
		old = kimage_dst_used(image, addr);
		if (old) {
			/* If so move it */
			unsigned long old_addr;
			struct page *old_page;

			old_addr = *old & PAGE_MASK;
			old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
			copy_highpage(page, old_page);
			*old = addr | (*old & ~PAGE_MASK);

			/* The old page I have found cannot be a
			 * destination page, so return it.
			 */
			addr = old_addr;
			page = old_page;
			break;
		}
		else {
			/* Place the page on the destination list; I
			 * will use it later.
			 */
			list_add(&page->lru, &image->dest_pages);
		}
	}

	return page;
}

static int kimage_load_normal_segment(struct kimage *image,
					 struct kexec_segment *segment)
{
	unsigned long maddr;
	unsigned long ubytes, mbytes;
	int result;
	unsigned char __user *buf;

	result = 0;
	buf = segment->buf;
	ubytes = segment->bufsz;
	mbytes = segment->memsz;
	maddr = segment->mem;

	result = kimage_set_destination(image, maddr);
	if (result < 0)
		goto out;

	while (mbytes) {
		struct page *page;
		char *ptr;
		size_t uchunk, mchunk;

		page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
		if (!page) {
			result = -ENOMEM;
			goto out;
		}
		result = kimage_add_page(image, page_to_pfn(page)
								<< PAGE_SHIFT);
		if (result < 0)
			goto out;

		ptr = kmap(page);
		/* Start with a clear page */
		memset(ptr, 0, PAGE_SIZE);
		ptr += maddr & ~PAGE_MASK;
		mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
		if (mchunk > mbytes)
			mchunk = mbytes;

		uchunk = mchunk;
		if (uchunk > ubytes)
			uchunk = ubytes;

		result = copy_from_user(ptr, buf, uchunk);
		kunmap(page);
		if (result) {
			result = (result < 0) ? result : -EIO;
			goto out;
		}
		ubytes -= uchunk;
		maddr  += mchunk;
		buf    += mchunk;
		mbytes -= mchunk;
	}
out:
	return result;
}

static int kimage_load_crash_segment(struct kimage *image,
					struct kexec_segment *segment)
{
	/* For crash dump kernels we simply copy the data from
	 * user space to its destination.
	 * We do things a page at a time for the sake of kmap.
	 */
	unsigned long maddr;
	unsigned long ubytes, mbytes;
	int result;
	unsigned char __user *buf;

	result = 0;
	buf = segment->buf;
	ubytes = segment->bufsz;
	mbytes = segment->memsz;
	maddr = segment->mem;
	while (mbytes) {
		struct page *page;
		char *ptr;
		size_t uchunk, mchunk;

		page = pfn_to_page(maddr >> PAGE_SHIFT);
		if (!page) {
			result = -ENOMEM;
			goto out;
		}
		ptr = kmap(page);
		ptr += maddr & ~PAGE_MASK;
		mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
		if (mchunk > mbytes)
			mchunk = mbytes;

		uchunk = mchunk;
		if (uchunk > ubytes) {
			uchunk = ubytes;
			/* Zero the trailing part of the page */
			memset(ptr + uchunk, 0, mchunk - uchunk);
		}
		result = copy_from_user(ptr, buf, uchunk);
		kexec_flush_icache_page(page);
		kunmap(page);
		if (result) {
			result = (result < 0) ? result : -EIO;
			goto out;
		}
		ubytes -= uchunk;
		maddr  += mchunk;
		buf    += mchunk;
		mbytes -= mchunk;
	}
out:
	return result;
}

static int kimage_load_segment(struct kimage *image,
				struct kexec_segment *segment)
{
	int result = -ENOMEM;

	switch (image->type) {
	case KEXEC_TYPE_DEFAULT:
		result = kimage_load_normal_segment(image, segment);
		break;
	case KEXEC_TYPE_CRASH:
		result = kimage_load_crash_segment(image, segment);
		break;
	}

	return result;
}

/*
 * Exec Kernel system call: for obvious reasons only root may call it.
 *
 * This call breaks up into three pieces.
 *  - A generic part which loads the new kernel from the current
 *    address space, and very carefully places the data in the
 *    allocated pages.
 *
 *  - A generic part that interacts with the kernel and tells all of
 *    the devices to shut down, preventing ongoing DMAs and placing
 *    the devices in a consistent state so a later kernel can
 *    reinitialize them.
 *
 *  - A machine specific part that includes the syscall number and
 *    then copies the image to its final destination and jumps into
 *    the image at entry.
 *
 * kexec does not sync, or unmount filesystems so if you need
 * that to happen you need to do that yourself.
 */
struct kimage *kexec_image;
struct kimage *kexec_crash_image;
/*
 * A home grown binary mutex.
 * Nothing can wait so this mutex is safe to use
 * in interrupt context :)
 */
static int kexec_lock;

asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
				struct kexec_segment __user *segments,
				unsigned long flags)
{
	struct kimage **dest_image, *image;
	int locked;
	int result;

	/* We only trust the superuser with rebooting the system. */
	if (!capable(CAP_SYS_BOOT))
		return -EPERM;

	/*
	 * Verify we have a legal set of flags
	 * This leaves us room for future extensions.
	 */
	if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK))
		return -EINVAL;

	/* Verify we are on the appropriate architecture */
	if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) &&
		((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT))
		return -EINVAL;

	/* Put an artificial cap on the number
	 * of segments passed to kexec_load.
	 */
	if (nr_segments > KEXEC_SEGMENT_MAX)
		return -EINVAL;

	image = NULL;
	result = 0;

	/* Because we write directly to the reserved memory
	 * region when loading crash kernels we need a mutex here to
	 * prevent multiple crash kernels from attempting to load
	 * simultaneously, and to prevent a crash kernel from loading
	 * over the top of an in-use crash kernel.
	 *
	 * KISS: always take the mutex.
	 */
	locked = xchg(&kexec_lock, 1);
	if (locked)
		return -EBUSY;

	dest_image = &kexec_image;
	if (flags & KEXEC_ON_CRASH)
		dest_image = &kexec_crash_image;
	if (nr_segments > 0) {
		unsigned long i;

		/* Loading another kernel to reboot into */
		if ((flags & KEXEC_ON_CRASH) == 0)
			result = kimage_normal_alloc(&image, entry,
							nr_segments, segments);
		/* Loading another kernel to switch to if this one crashes */
		else if (flags & KEXEC_ON_CRASH) {
			/* Free any current crash dump kernel before
			 * we corrupt it.
			 */
			kimage_free(xchg(&kexec_crash_image, NULL));
			result = kimage_crash_alloc(&image, entry,
						     nr_segments, segments);
		}
		if (result)
			goto out;

		if (flags & KEXEC_PRESERVE_CONTEXT)
			image->preserve_context = 1;
		result = machine_kexec_prepare(image);
		if (result)
			goto out;

		for (i = 0; i < nr_segments; i++) {
			result = kimage_load_segment(image, &image->segment[i]);
			if (result)
				goto out;
		}
		kimage_terminate(image);
	}
	/* Install the new kernel, and uninstall the old */
	image = xchg(dest_image, image);

out:
	locked = xchg(&kexec_lock, 0); /* Release the mutex */
	BUG_ON(!locked);
	kimage_free(image);

	return result;
}

#ifdef CONFIG_COMPAT
asmlinkage long compat_sys_kexec_load(unsigned long entry,
				unsigned long nr_segments,
				struct compat_kexec_segment __user *segments,
				unsigned long flags)
{
	struct compat_kexec_segment in;
	struct kexec_segment out, __user *ksegments;
	unsigned long i, result;

	/* Don't allow clients that don't understand the native
	 * architecture to do anything.
	 */
	if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT)
		return -EINVAL;

	if (nr_segments > KEXEC_SEGMENT_MAX)
		return -EINVAL;

	ksegments = compat_alloc_user_space(nr_segments * sizeof(out));
	for (i = 0; i < nr_segments; i++) {
		result = copy_from_user(&in, &segments[i], sizeof(in));
		if (result)
			return -EFAULT;

		out.buf   = compat_ptr(in.buf);
		out.bufsz = in.bufsz;
		out.mem   = in.mem;
		out.memsz = in.memsz;

		result = copy_to_user(&ksegments[i], &out, sizeof(out));
		if (result)
			return -EFAULT;
	}

	return sys_kexec_load(entry, nr_segments, ksegments, flags);
}
#endif

void crash_kexec(struct pt_regs *regs)
{
	int locked;

	/* Take the kexec_lock here to prevent sys_kexec_load
	 * running on one cpu from replacing the crash kernel
	 * we are using after a panic on a different cpu.
	 *
	 * If the crash kernel was not located in a fixed area
	 * of memory the xchg(&kexec_crash_image) would be
	 * sufficient.  But since I reuse the memory...
	 */
	locked = xchg(&kexec_lock, 1);
	if (!locked) {
		if (kexec_crash_image) {
			struct pt_regs fixed_regs;
			crash_setup_regs(&fixed_regs, regs);
			crash_save_vmcoreinfo();
			machine_crash_shutdown(&fixed_regs);
			machine_kexec(kexec_crash_image);
		}
		locked = xchg(&kexec_lock, 0);
		BUG_ON(!locked);
	}
}

static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
			    size_t data_len)
{
	struct elf_note note;

	note.n_namesz = strlen(name) + 1;
	note.n_descsz = data_len;
	note.n_type   = type;
	memcpy(buf, &note, sizeof(note));
	buf += (sizeof(note) + 3)/4;
	memcpy(buf, name, note.n_namesz);
	buf += (note.n_namesz + 3)/4;
	memcpy(buf, data, note.n_descsz);
	buf += (note.n_descsz + 3)/4;

	return buf;
}

static void final_note(u32 *buf)
{
	struct elf_note note;

	note.n_namesz = 0;
	note.n_descsz = 0;
	note.n_type   = 0;
	memcpy(buf, &note, sizeof(note));
}

void crash_save_cpu(struct pt_regs *regs, int cpu)
{
	struct elf_prstatus prstatus;
	u32 *buf;

	if ((cpu < 0) || (cpu >= NR_CPUS))
		return;

	/* Using ELF notes here is opportunistic.
	 * I need a well defined structure format
	 * for the data I pass, and I need tags
	 * on the data to indicate what information I have
	 * squirrelled away.  ELF notes happen to provide
	 * all of that, so there is no need to invent something new.
	 */
	buf = (u32 *)per_cpu_ptr(crash_notes, cpu);
	if (!buf)
		return;
	memset(&prstatus, 0, sizeof(prstatus));
	prstatus.pr_pid = current->pid;
	elf_core_copy_regs(&prstatus.pr_reg, regs);
	buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
			      &prstatus, sizeof(prstatus));
	final_note(buf);
}

static int __init crash_notes_memory_init(void)
{
	/* Allocate memory for saving cpu registers. */
	crash_notes = alloc_percpu(note_buf_t);
	if (!crash_notes) {
		printk("Kexec: Memory allocation for saving cpu register"
		       " states failed\n");
		return -ENOMEM;
	}
	return 0;
}
module_init(crash_notes_memory_init)


/*
 * parsing the "crashkernel" commandline
 *
 * this code is intended to be called from architecture specific code
 */


/*
 * This function parses command lines in the format
 *
 *   crashkernel=ramsize-range:size[,...][@offset]
 *
 * The function returns 0 on success and -EINVAL on failure.
 */
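/*
 * An illustrative example (the sizes below are arbitrary):
 *
 *   crashkernel=512M-2G:64M,2G-:128M
 *
 * reserves 64M when the system has at least 512M but less than 2G of
 * RAM, and 128M when it has 2G or more; an open-ended range like
 * "2G-" matches any amount of RAM from 2G upward.  An optional
 * "@offset" suffix fixes the physical base address of the
 * reservation.
 */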
static int __init parse_crashkernel_mem(char *cmdline,
					unsigned long long system_ram,
					unsigned long long *crash_size,
					unsigned long long *crash_base)
{
	char *cur = cmdline, *tmp;

	/* for each entry of the comma-separated list */
	do {
		unsigned long long start, end = ULLONG_MAX, size;

		/* get the start of the range */
		start = memparse(cur, &tmp);
		if (cur == tmp) {
			pr_warning("crashkernel: Memory value expected\n");
			return -EINVAL;
		}
		cur = tmp;
		if (*cur != '-') {
			pr_warning("crashkernel: '-' expected\n");
			return -EINVAL;
		}
		cur++;

		/* if no ':' is here, then we read the end */
		if (*cur != ':') {
			end = memparse(cur, &tmp);
			if (cur == tmp) {
				pr_warning("crashkernel: Memory "
						"value expected\n");
				return -EINVAL;
			}
			cur = tmp;
			if (end <= start) {
				pr_warning("crashkernel: end <= start\n");
				return -EINVAL;
			}
		}

		if (*cur != ':') {
			pr_warning("crashkernel: ':' expected\n");
			return -EINVAL;
		}
		cur++;

		size = memparse(cur, &tmp);
		if (cur == tmp) {
			pr_warning("Memory value expected\n");
			return -EINVAL;
		}
		cur = tmp;
		if (size >= system_ram) {
			pr_warning("crashkernel: invalid size\n");
			return -EINVAL;
		}

		/* match ? */
		if (system_ram >= start && system_ram < end) {
			*crash_size = size;
			break;
		}
	} while (*cur++ == ',');

	if (*crash_size > 0) {
		while (*cur != ' ' && *cur != '@')
			cur++;
		if (*cur == '@') {
			cur++;
			*crash_base = memparse(cur, &tmp);
			if (cur == tmp) {
				pr_warning("Memory value expected "
						"after '@'\n");
				return -EINVAL;
			}
		}
	}

	return 0;
}

/*
 * This function parses "simple" (old) crashkernel command lines like
 *
 *	crashkernel=size[@offset]
 *
 * It returns 0 on success and -EINVAL on failure.
 */
static int __init parse_crashkernel_simple(char *cmdline,
					   unsigned long long *crash_size,
					   unsigned long long *crash_base)
{
	char *cur = cmdline;

	*crash_size = memparse(cmdline, &cur);
	if (cmdline == cur) {
		pr_warning("crashkernel: memory value expected\n");
		return -EINVAL;
	}

	if (*cur == '@')
		*crash_base = memparse(cur+1, &cur);

	return 0;
}

/*
 * This function is the entry point for command line parsing and should be
 * called from the arch-specific code.
 */
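/*
 * As a rough illustration (the values below are arbitrary examples):
 * "crashkernel=128M@16M" contains no ':' and is handled by
 * parse_crashkernel_simple(), while "crashkernel=512M-2G:64M,2G-:128M"
 * contains a ':' before any space and is handed to
 * parse_crashkernel_mem().  If "crashkernel=" appears more than once
 * on the command line, the last occurrence wins.
 */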
int __init parse_crashkernel(char *cmdline,
			     unsigned long long system_ram,
			     unsigned long long *crash_size,
			     unsigned long long *crash_base)
{
	char *p = cmdline, *ck_cmdline = NULL;
	char *first_colon, *first_space;

	BUG_ON(!crash_size || !crash_base);
	*crash_size = 0;
	*crash_base = 0;

	/* find crashkernel and use the last one if there are more */
	p = strstr(p, "crashkernel=");
	while (p) {
		ck_cmdline = p;
		p = strstr(p+1, "crashkernel=");
	}

	if (!ck_cmdline)
		return -EINVAL;

	ck_cmdline += 12; /* strlen("crashkernel=") */

	/*
	 * if the commandline contains a ':', then that's the extended
	 * syntax -- if not, it must be the classic syntax
	 */
	first_colon = strchr(ck_cmdline, ':');
	first_space = strchr(ck_cmdline, ' ');
	if (first_colon && (!first_space || first_colon < first_space))
		return parse_crashkernel_mem(ck_cmdline, system_ram,
				crash_size, crash_base);
	else
		return parse_crashkernel_simple(ck_cmdline, crash_size,
				crash_base);

	return 0;
}



void crash_save_vmcoreinfo(void)
{
	u32 *buf;

	if (!vmcoreinfo_size)
		return;

	vmcoreinfo_append_str("CRASHTIME=%ld", get_seconds());

	buf = (u32 *)vmcoreinfo_note;

	buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data,
			      vmcoreinfo_size);

	final_note(buf);
}

void vmcoreinfo_append_str(const char *fmt, ...)
{
	va_list args;
	char buf[0x50];
	int r;

	va_start(args, fmt);
	r = vsnprintf(buf, sizeof(buf), fmt, args);
	va_end(args);

	if (r + vmcoreinfo_size > vmcoreinfo_max_size)
		r = vmcoreinfo_max_size - vmcoreinfo_size;

	memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r);

	vmcoreinfo_size += r;
}

/*
 * provide an empty default implementation here -- architecture
 * code may override this
 */
void __attribute__ ((weak)) arch_crash_save_vmcoreinfo(void)
{}

unsigned long __attribute__ ((weak)) paddr_vmcoreinfo_note(void)
{
	return __pa((unsigned long)(char *)&vmcoreinfo_note);
}

static int __init crash_save_vmcoreinfo_init(void)
{
	VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
	VMCOREINFO_PAGESIZE(PAGE_SIZE);

	VMCOREINFO_SYMBOL(init_uts_ns);
	VMCOREINFO_SYMBOL(node_online_map);
	VMCOREINFO_SYMBOL(swapper_pg_dir);
	VMCOREINFO_SYMBOL(_stext);

#ifndef CONFIG_NEED_MULTIPLE_NODES
	VMCOREINFO_SYMBOL(mem_map);
	VMCOREINFO_SYMBOL(contig_page_data);
#endif
#ifdef CONFIG_SPARSEMEM
	VMCOREINFO_SYMBOL(mem_section);
	VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
	VMCOREINFO_STRUCT_SIZE(mem_section);
	VMCOREINFO_OFFSET(mem_section, section_mem_map);
#endif
	VMCOREINFO_STRUCT_SIZE(page);
	VMCOREINFO_STRUCT_SIZE(pglist_data);
	VMCOREINFO_STRUCT_SIZE(zone);
	VMCOREINFO_STRUCT_SIZE(free_area);
	VMCOREINFO_STRUCT_SIZE(list_head);
	VMCOREINFO_SIZE(nodemask_t);
	VMCOREINFO_OFFSET(page, flags);
	VMCOREINFO_OFFSET(page, _count);
	VMCOREINFO_OFFSET(page, mapping);
	VMCOREINFO_OFFSET(page, lru);
	VMCOREINFO_OFFSET(pglist_data, node_zones);
	VMCOREINFO_OFFSET(pglist_data, nr_zones);
#ifdef CONFIG_FLAT_NODE_MEM_MAP
	VMCOREINFO_OFFSET(pglist_data, node_mem_map);
#endif
	VMCOREINFO_OFFSET(pglist_data, node_start_pfn);
	VMCOREINFO_OFFSET(pglist_data, node_spanned_pages);
	VMCOREINFO_OFFSET(pglist_data, node_id);
	VMCOREINFO_OFFSET(zone, free_area);
	VMCOREINFO_OFFSET(zone, vm_stat);
	VMCOREINFO_OFFSET(zone, spanned_pages);
	VMCOREINFO_OFFSET(free_area, free_list);
	VMCOREINFO_OFFSET(list_head, next);
	VMCOREINFO_OFFSET(list_head, prev);
	VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
	VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
	VMCOREINFO_NUMBER(NR_FREE_PAGES);
	VMCOREINFO_NUMBER(PG_lru);
	VMCOREINFO_NUMBER(PG_private);
	VMCOREINFO_NUMBER(PG_swapcache);

	arch_crash_save_vmcoreinfo();

	return 0;
}

module_init(crash_save_vmcoreinfo_init)

/**
 * kernel_kexec - reboot the system
 *
 * Move into place and start executing a preloaded standalone
 * executable.  If nothing was preloaded return an error.
 */
int kernel_kexec(void)
{
	int error = 0;

	if (xchg(&kexec_lock, 1))
		return -EBUSY;
	if (!kexec_image) {
		error = -EINVAL;
		goto Unlock;
	}

	if (kexec_image->preserve_context) {
#ifdef CONFIG_KEXEC_JUMP
		mutex_lock(&pm_mutex);
		pm_prepare_console();
		error = freeze_processes();
		if (error) {
			error = -EBUSY;
			goto Restore_console;
		}
		suspend_console();
		error = device_suspend(PMSG_FREEZE);
		if (error)
			goto Resume_console;
		error = disable_nonboot_cpus();
		if (error)
			goto Resume_devices;
		local_irq_disable();
		/* At this point, device_suspend() has been called,
		 * but *not* device_power_down().  We *must*
		 * device_power_down() now.  Otherwise, drivers for
		 * some devices (e.g. interrupt controllers) become
		 * desynchronized with the actual state of the
		 * hardware at resume time, and evil weirdness ensues.
		 */
		error = device_power_down(PMSG_FREEZE);
		if (error)
			goto Enable_irqs;
		save_processor_state();
#endif
	} else {
		blocking_notifier_call_chain(&reboot_notifier_list,
					     SYS_RESTART, NULL);
		system_state = SYSTEM_RESTART;
		device_shutdown();
		sysdev_shutdown();
		printk(KERN_EMERG "Starting new kernel\n");
		machine_shutdown();
	}

	machine_kexec(kexec_image);

	if (kexec_image->preserve_context) {
#ifdef CONFIG_KEXEC_JUMP
		restore_processor_state();
		device_power_up(PMSG_RESTORE);
 Enable_irqs:
		local_irq_enable();
		enable_nonboot_cpus();
 Resume_devices:
		device_resume(PMSG_RESTORE);
 Resume_console:
		resume_console();
		thaw_processes();
 Restore_console:
		pm_restore_console();
		mutex_unlock(&pm_mutex);
#endif
	}

 Unlock:
	xchg(&kexec_lock, 0);

	return error;
}