/*
 * kexec.c - kexec system call
 * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
 *
 * This source code is licensed under the GNU General Public License,
 * Version 2.  See the file COPYING for more details.
 */

#include <linux/capability.h>
#include <linux/mm.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/kexec.h>
#include <linux/spinlock.h>
#include <linux/list.h>
#include <linux/highmem.h>
#include <linux/syscalls.h>
#include <linux/reboot.h>
#include <linux/ioport.h>
#include <linux/hardirq.h>
#include <linux/elf.h>
#include <linux/elfcore.h>
#include <linux/utsrelease.h>
#include <linux/utsname.h>
#include <linux/numa.h>

#include <asm/page.h>
#include <asm/uaccess.h>
#include <asm/io.h>
#include <asm/system.h>
#include <asm/sections.h>

/* Per cpu memory for storing cpu states in case of system crash. */
note_buf_t *crash_notes;

/* vmcoreinfo stuff */
unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
size_t vmcoreinfo_size;
size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);

/* Location of the reserved area for the crash kernel */
struct resource crashk_res = {
	.name  = "Crash kernel",
	.start = 0,
	.end   = 0,
	.flags = IORESOURCE_BUSY | IORESOURCE_MEM
};
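
/*
 * Decide whether a failing task should trigger a switch to the crash
 * kernel: an oops in interrupt context, in the idle task, in init, or
 * with panic_on_oops set is treated as fatal to the whole system.
 */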
int kexec_should_crash(struct task_struct *p)
{
	if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops)
		return 1;
	return 0;
}

/*
 * When kexec transitions to the new kernel there is a one-to-one
 * mapping between physical and virtual addresses.  On processors
 * where you can disable the MMU this is trivial, and easy.  For
 * others it is still a simple predictable page table to setup.
 *
 * In that environment kexec copies the new kernel to its final
 * resting place.  This means I can only support memory whose
 * physical address can fit in an unsigned long.  In particular
 * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
 * If the assembly stub has more restrictive requirements
 * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
 * defined more restrictively in <asm/kexec.h>.
 *
 * The code for the transition from the current kernel to the new
 * kernel is placed in the control_code_buffer, whose size
 * is given by KEXEC_CONTROL_CODE_SIZE.  In the best case only a single
 * page of memory is necessary, but some architectures require more.
 * Because this memory must be identity mapped in the transition from
 * virtual to physical addresses it must live in the range
 * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
 * modifiable.
 *
 * The assembly stub in the control code buffer is passed a linked list
 * of descriptor pages detailing the source pages of the new kernel,
 * and the destination addresses of those source pages.  As this data
 * structure is not used in the context of the current OS, it must
 * be self-contained.
 *
 * The code has been made to work with highmem pages and will use a
 * destination page in its final resting place (if it happens
 * to allocate it).  The end product of this is that most of the
 * physical address space, and most of RAM can be used.
 *
 * Future directions include:
 *  - allocating a page table with the control code buffer identity
 *    mapped, to simplify machine_kexec and make kexec_on_panic more
 *    reliable.
 */

/*
 * KIMAGE_NO_DEST is an impossible destination address..., for
 * allocating pages whose destination address we do not care about.
 */
#define KIMAGE_NO_DEST (-1UL)

static int kimage_is_destination_range(struct kimage *image,
				       unsigned long start, unsigned long end);
static struct page *kimage_alloc_page(struct kimage *image,
				      gfp_t gfp_mask,
				      unsigned long dest);

static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
			   unsigned long nr_segments,
			   struct kexec_segment __user *segments)
{
	size_t segment_bytes;
	struct kimage *image;
	unsigned long i;
	int result;

	/* Allocate a controlling structure */
	result = -ENOMEM;
	image = kzalloc(sizeof(*image), GFP_KERNEL);
	if (!image)
		goto out;

	image->head = 0;
	image->entry = &image->head;
	image->last_entry = &image->head;
	image->control_page = ~0; /* By default this does not apply */
	image->start = entry;
	image->type = KEXEC_TYPE_DEFAULT;

	/* Initialize the list of control pages */
	INIT_LIST_HEAD(&image->control_pages);

	/* Initialize the list of destination pages */
	INIT_LIST_HEAD(&image->dest_pages);

	/* Initialize the list of unuseable pages */
	INIT_LIST_HEAD(&image->unuseable_pages);

	/* Read in the segments */
	image->nr_segments = nr_segments;
	segment_bytes = nr_segments * sizeof(*segments);
	/* copy_from_user returns the number of bytes left uncopied */
	result = copy_from_user(image->segment, segments, segment_bytes);
	if (result) {
		result = -EFAULT;
		goto out;
	}

	/*
	 * Verify we have good destination addresses.  The caller is
	 * responsible for making certain we don't attempt to load
	 * the new image into invalid or reserved areas of RAM.  This
	 * just verifies it is an address we can use.
	 *
	 * Since the kernel does everything in page size chunks ensure
	 * the destination addresses are page aligned.  Too many
	 * special cases crop up when we don't do this.  The most
	 * insidious is getting overlapping destination addresses
	 * simply because addresses are changed to page size
	 * granularity.
	 */
	result = -EADDRNOTAVAIL;
	for (i = 0; i < nr_segments; i++) {
		unsigned long mstart, mend;

		mstart = image->segment[i].mem;
		mend   = mstart + image->segment[i].memsz;
		if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
			goto out;
		if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
			goto out;
	}

	/* Verify our destination addresses do not overlap.
	 * If we allowed overlapping destination addresses
	 * through, very weird things can happen with no
	 * easy explanation as one segment stops on another.
	 */
	result = -EINVAL;
	for (i = 0; i < nr_segments; i++) {
		unsigned long mstart, mend;
		unsigned long j;

		mstart = image->segment[i].mem;
		mend   = mstart + image->segment[i].memsz;
		for (j = 0; j < i; j++) {
			unsigned long pstart, pend;
			pstart = image->segment[j].mem;
			pend   = pstart + image->segment[j].memsz;
			/* Do the segments overlap ? */
			if ((mend > pstart) && (mstart < pend))
				goto out;
		}
	}

	/* Ensure our buffer sizes are no larger than
	 * our memory sizes.  This should always be the case,
	 * and it is easier to check up front than to be surprised
	 * later on.
	 */
	result = -EINVAL;
	for (i = 0; i < nr_segments; i++) {
		if (image->segment[i].bufsz > image->segment[i].memsz)
			goto out;
	}

	result = 0;
out:
	if (result == 0)
		*rimage = image;
	else
		kfree(image);

	return result;
}
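
/*
 * Allocate a kimage for a normal (non-crash) kexec and reserve the
 * control code pages it will need for the final transition.
 */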
static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
			       unsigned long nr_segments,
			       struct kexec_segment __user *segments)
{
	int result;
	struct kimage *image;

	/* Allocate and initialize a controlling structure */
	image = NULL;
	result = do_kimage_alloc(&image, entry, nr_segments, segments);
	if (result)
		goto out;

	*rimage = image;

	/*
	 * Find a location for the control code buffer, and add it to
	 * the vector of segments so that its pages will also be
	 * counted as destination pages.
	 */
	result = -ENOMEM;
	image->control_code_page = kimage_alloc_control_pages(image,
					get_order(KEXEC_CONTROL_CODE_SIZE));
	if (!image->control_code_page) {
		printk(KERN_ERR "Could not allocate control_code_buffer\n");
		goto out;
	}

	result = 0;
out:
	if (result == 0)
		*rimage = image;
	else
		kfree(image);

	return result;
}

static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
			      unsigned long nr_segments,
			      struct kexec_segment __user *segments)
{
	int result;
	struct kimage *image;
	unsigned long i;

	image = NULL;
	/* Verify we have a valid entry point */
	if ((entry < crashk_res.start) || (entry > crashk_res.end)) {
		result = -EADDRNOTAVAIL;
		goto out;
	}

	/* Allocate and initialize a controlling structure */
	result = do_kimage_alloc(&image, entry, nr_segments, segments);
	if (result)
		goto out;

	/* Enable the special crash kernel control page
	 * allocation policy.
	 */
	image->control_page = crashk_res.start;
	image->type = KEXEC_TYPE_CRASH;

	/*
	 * Verify we have good destination addresses.  Normally
	 * the caller is responsible for making certain we don't
	 * attempt to load the new image into invalid or reserved
	 * areas of RAM.  But crash kernels are preloaded into a
	 * reserved area of ram.  We must ensure the addresses
	 * are in the reserved area otherwise preloading the
	 * kernel could corrupt things.
	 */
	result = -EADDRNOTAVAIL;
	for (i = 0; i < nr_segments; i++) {
		unsigned long mstart, mend;

		mstart = image->segment[i].mem;
		mend = mstart + image->segment[i].memsz - 1;
		/* Ensure we are within the crash kernel limits */
		if ((mstart < crashk_res.start) || (mend > crashk_res.end))
			goto out;
	}

	/*
	 * Find a location for the control code buffer, and add it to
	 * the vector of segments so that its pages will also be
	 * counted as destination pages.
	 */
	result = -ENOMEM;
	image->control_code_page = kimage_alloc_control_pages(image,
					get_order(KEXEC_CONTROL_CODE_SIZE));
	if (!image->control_code_page) {
		printk(KERN_ERR "Could not allocate control_code_buffer\n");
		goto out;
	}

	result = 0;
out:
	if (result == 0)
		*rimage = image;
	else
		kfree(image);

	return result;
}
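
/*
 * Return 1 if the range [start, end) overlaps the destination range of
 * any segment in the image, 0 otherwise.
 */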
static int kimage_is_destination_range(struct kimage *image,
				       unsigned long start,
				       unsigned long end)
{
	unsigned long i;

	for (i = 0; i < image->nr_segments; i++) {
		unsigned long mstart, mend;

		mstart = image->segment[i].mem;
		mend = mstart + image->segment[i].memsz;
		if ((end > mstart) && (start < mend))
			return 1;
	}

	return 0;
}

static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
{
	struct page *pages;

	pages = alloc_pages(gfp_mask, order);
	if (pages) {
		unsigned int count, i;
		pages->mapping = NULL;
		set_page_private(pages, order);
		count = 1 << order;
		for (i = 0; i < count; i++)
			SetPageReserved(pages + i);
	}

	return pages;
}

static void kimage_free_pages(struct page *page)
{
	unsigned int order, count, i;

	order = page_private(page);
	count = 1 << order;
	for (i = 0; i < count; i++)
		ClearPageReserved(page + i);
	__free_pages(page, order);
}

static void kimage_free_page_list(struct list_head *list)
{
	struct list_head *pos, *next;

	list_for_each_safe(pos, next, list) {
		struct page *page;

		page = list_entry(pos, struct page, lru);
		list_del(&page->lru);
		kimage_free_pages(page);
	}
}

static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
						      unsigned int order)
{
	/* Control pages are special, they are the intermediaries
	 * that are needed while we copy the rest of the pages
	 * to their final resting place.  As such they must
	 * not conflict with either the destination addresses
	 * or memory the kernel is already using.
	 *
	 * The only case where we really need more than one of
	 * these are for architectures where we cannot disable
	 * the MMU and must instead generate an identity mapped
	 * page table for all of the memory.
	 *
	 * At worst this runs in O(N) of the image size.
	 */
	struct list_head extra_pages;
	struct page *pages;
	unsigned int count;

	count = 1 << order;
	INIT_LIST_HEAD(&extra_pages);

	/* Loop while I can allocate a page and the page allocated
	 * is a destination page.
	 */
	do {
		unsigned long pfn, epfn, addr, eaddr;

		pages = kimage_alloc_pages(GFP_KERNEL, order);
		if (!pages)
			break;
		pfn   = page_to_pfn(pages);
		epfn  = pfn + count;
		addr  = pfn << PAGE_SHIFT;
		eaddr = epfn << PAGE_SHIFT;
		if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
		    kimage_is_destination_range(image, addr, eaddr)) {
			list_add(&pages->lru, &extra_pages);
			pages = NULL;
		}
	} while (!pages);

	if (pages) {
		/* Remember the allocated page... */
		list_add(&pages->lru, &image->control_pages);

		/* Because the page is already in its destination
		 * location we will never allocate another page at
		 * that address.  Therefore kimage_alloc_pages
		 * will not return it (again) and we don't need
		 * to give it an entry in image->segment[].
		 */
	}
	/* Deal with the destination pages I have inadvertently allocated.
	 *
	 * Ideally I would convert multi-page allocations into single
	 * page allocations, and add everything to image->dest_pages.
	 *
	 * For now it is simpler to just free the pages.
	 */
	kimage_free_page_list(&extra_pages);

	return pages;
}

static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
						     unsigned int order)
{
	/* Control pages are special, they are the intermediaries
	 * that are needed while we copy the rest of the pages
	 * to their final resting place.  As such they must
	 * not conflict with either the destination addresses
	 * or memory the kernel is already using.
	 *
	 * Control pages are also the only pages we must allocate
	 * when loading a crash kernel.  All of the other pages
	 * are specified by the segments and we just memcpy
	 * into them directly.
	 *
	 * The only case where we really need more than one of
	 * these are for architectures where we cannot disable
	 * the MMU and must instead generate an identity mapped
	 * page table for all of the memory.
	 *
	 * Given the low demand this implements a very simple
	 * allocator that finds the first hole of the appropriate
	 * size in the reserved memory region, and allocates all
	 * of the memory up to and including the hole.
	 */
	unsigned long hole_start, hole_end, size;
	struct page *pages;

	pages = NULL;
	size = (1 << order) << PAGE_SHIFT;
	hole_start = (image->control_page + (size - 1)) & ~(size - 1);
	hole_end   = hole_start + size - 1;
	while (hole_end <= crashk_res.end) {
		unsigned long i;

		if (hole_end > KEXEC_CONTROL_MEMORY_LIMIT)
			break;
		if (hole_end > crashk_res.end)
			break;
		/* See if I overlap any of the segments */
		for (i = 0; i < image->nr_segments; i++) {
			unsigned long mstart, mend;

			mstart = image->segment[i].mem;
			mend   = mstart + image->segment[i].memsz - 1;
			if ((hole_end >= mstart) && (hole_start <= mend)) {
				/* Advance the hole to the end of the segment */
				hole_start = (mend + (size - 1)) & ~(size - 1);
				hole_end   = hole_start + size - 1;
				break;
			}
		}
		/* If I don't overlap any segments I have found my hole! */
		if (i == image->nr_segments) {
			pages = pfn_to_page(hole_start >> PAGE_SHIFT);
			break;
		}
	}
	if (pages)
		image->control_page = hole_end;

	return pages;
}
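
/*
 * Allocate control pages using the policy that matches the image type:
 * the normal allocator for a regular kexec, or the reserved-region
 * allocator for a crash kernel.
 */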
struct page *kimage_alloc_control_pages(struct kimage *image,
					unsigned int order)
{
	struct page *pages = NULL;

	switch (image->type) {
	case KEXEC_TYPE_DEFAULT:
		pages = kimage_alloc_normal_control_pages(image, order);
		break;
	case KEXEC_TYPE_CRASH:
		pages = kimage_alloc_crash_control_pages(image, order);
		break;
	}

	return pages;
}
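
/*
 * Append an entry to the kimage entry list, allocating and chaining in a
 * new indirection page whenever the current one fills up.
 */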
static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
{
	if (*image->entry != 0)
		image->entry++;

	if (image->entry == image->last_entry) {
		kimage_entry_t *ind_page;
		struct page *page;

		page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
		if (!page)
			return -ENOMEM;

		ind_page = page_address(page);
		*image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
		image->entry = ind_page;
		image->last_entry = ind_page +
				((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
	}
	*image->entry = entry;
	image->entry++;
	*image->entry = 0;

	return 0;
}

static int kimage_set_destination(struct kimage *image,
				  unsigned long destination)
{
	int result;

	destination &= PAGE_MASK;
	result = kimage_add_entry(image, destination | IND_DESTINATION);
	if (result == 0)
		image->destination = destination;

	return result;
}


static int kimage_add_page(struct kimage *image, unsigned long page)
{
	int result;

	page &= PAGE_MASK;
	result = kimage_add_entry(image, page | IND_SOURCE);
	if (result == 0)
		image->destination += PAGE_SIZE;

	return result;
}


static void kimage_free_extra_pages(struct kimage *image)
{
	/* Walk through and free any extra destination pages I may have */
	kimage_free_page_list(&image->dest_pages);

	/* Walk through and free any unuseable pages I have cached */
	kimage_free_page_list(&image->unuseable_pages);
}

static int kimage_terminate(struct kimage *image)
{
	if (*image->entry != 0)
		image->entry++;

	*image->entry = IND_DONE;

	return 0;
}
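
/*
 * Walk every entry in the image list, transparently following
 * indirection pages and stopping at the IND_DONE terminator.
 */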
#define for_each_kimage_entry(image, ptr, entry) \
	for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
		ptr = (entry & IND_INDIRECTION) ? \
			phys_to_virt(entry & PAGE_MASK) : ptr + 1)

static void kimage_free_entry(kimage_entry_t entry)
{
	struct page *page;

	page = pfn_to_page(entry >> PAGE_SHIFT);
	kimage_free_pages(page);
}

static void kimage_free(struct kimage *image)
{
	kimage_entry_t *ptr, entry;
	kimage_entry_t ind = 0;

	if (!image)
		return;

	kimage_free_extra_pages(image);
	for_each_kimage_entry(image, ptr, entry) {
		if (entry & IND_INDIRECTION) {
			/* Free the previous indirection page */
			if (ind & IND_INDIRECTION)
				kimage_free_entry(ind);
			/* Save this indirection page until we are
			 * done with it.
			 */
			ind = entry;
		} else if (entry & IND_SOURCE)
			kimage_free_entry(entry);
	}
	/* Free the final indirection page */
	if (ind & IND_INDIRECTION)
		kimage_free_entry(ind);

	/* Handle any machine specific cleanup */
	machine_kexec_cleanup(image);

	/* Free the kexec control pages... */
	kimage_free_page_list(&image->control_pages);
	kfree(image);
}
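
/*
 * Find the IND_SOURCE entry whose destination address is the given page,
 * or return NULL if no source page has been assigned that destination.
 */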
static kimage_entry_t *kimage_dst_used(struct kimage *image,
				       unsigned long page)
{
	kimage_entry_t *ptr, entry;
	unsigned long destination = 0;

	for_each_kimage_entry(image, ptr, entry) {
		if (entry & IND_DESTINATION)
			destination = entry & PAGE_MASK;
		else if (entry & IND_SOURCE) {
			if (page == destination)
				return ptr;
			destination += PAGE_SIZE;
		}
	}

	return NULL;
}

static struct page *kimage_alloc_page(struct kimage *image,
				      gfp_t gfp_mask,
				      unsigned long destination)
{
	/*
	 * Here we implement safeguards to ensure that a source page
	 * is not copied to its destination page before the data on
	 * the destination page is no longer useful.
	 *
	 * To do this we maintain the invariant that a source page is
	 * either its own destination page, or it is not a
	 * destination page at all.
	 *
	 * That is slightly stronger than required, but the proof
	 * that no problems will occur is trivial, and the
	 * implementation is simple to verify.
	 *
	 * When allocating all pages normally this algorithm will run
	 * in O(N) time, but in the worst case it will run in O(N^2)
	 * time.  If the runtime is a problem the data structures can
	 * be fixed.
	 */
	struct page *page;
	unsigned long addr;

	/*
	 * Walk through the list of destination pages, and see if I
	 * have a match.
	 */
	list_for_each_entry(page, &image->dest_pages, lru) {
		addr = page_to_pfn(page) << PAGE_SHIFT;
		if (addr == destination) {
			list_del(&page->lru);
			return page;
		}
	}
	page = NULL;
	while (1) {
		kimage_entry_t *old;

		/* Allocate a page, if we run out of memory give up */
		page = kimage_alloc_pages(gfp_mask, 0);
		if (!page)
			return NULL;
		/* If the page cannot be used file it away */
		if (page_to_pfn(page) >
				(KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
			list_add(&page->lru, &image->unuseable_pages);
			continue;
		}
		addr = page_to_pfn(page) << PAGE_SHIFT;

		/* If it is the destination page we want use it */
		if (addr == destination)
			break;

		/* If the page is not a destination page use it */
		if (!kimage_is_destination_range(image, addr,
						 addr + PAGE_SIZE))
			break;

		/*
		 * I know that the page is someone's destination page.
		 * See if there is already a source page for this
		 * destination page.  And if so swap the source pages.
		 */
		old = kimage_dst_used(image, addr);
		if (old) {
			/* If so move it */
			unsigned long old_addr;
			struct page *old_page;

			old_addr = *old & PAGE_MASK;
			old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
			copy_highpage(page, old_page);
			*old = addr | (*old & ~PAGE_MASK);

			/* The old page I have found cannot be a
			 * destination page, so return it.
			 */
			addr = old_addr;
			page = old_page;
			break;
		} else {
			/* Place the page on the destination list; I
			 * will use it later.
			 */
			list_add(&page->lru, &image->dest_pages);
		}
	}

	return page;
}
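
/*
 * Copy a segment from user space into freshly allocated pages, one page
 * at a time, zero-filling any space beyond bufsz up to memsz.
 */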
static int kimage_load_normal_segment(struct kimage *image,
				      struct kexec_segment *segment)
{
	unsigned long maddr;
	unsigned long ubytes, mbytes;
	int result;
	unsigned char __user *buf;

	result = 0;
	buf = segment->buf;
	ubytes = segment->bufsz;
	mbytes = segment->memsz;
	maddr = segment->mem;

	result = kimage_set_destination(image, maddr);
	if (result < 0)
		goto out;

	while (mbytes) {
		struct page *page;
		char *ptr;
		size_t uchunk, mchunk;

		page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
		if (!page) {
			result = -ENOMEM;
			goto out;
		}
		result = kimage_add_page(image, page_to_pfn(page)
						<< PAGE_SHIFT);
		if (result < 0)
			goto out;

		ptr = kmap(page);
		/* Start with a clear page */
		memset(ptr, 0, PAGE_SIZE);
		ptr += maddr & ~PAGE_MASK;
		mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
		if (mchunk > mbytes)
			mchunk = mbytes;

		uchunk = mchunk;
		if (uchunk > ubytes)
			uchunk = ubytes;

		result = copy_from_user(ptr, buf, uchunk);
		kunmap(page);
		if (result) {
			result = (result < 0) ? result : -EIO;
			goto out;
		}
		ubytes -= uchunk;
		maddr  += mchunk;
		buf    += mchunk;
		mbytes -= mchunk;
	}
out:
	return result;
}

static int kimage_load_crash_segment(struct kimage *image,
				     struct kexec_segment *segment)
{
	/* For crash dump kernels we simply copy the data from
	 * user space to its destination.
	 * We do things a page at a time for the sake of kmap.
	 */
	unsigned long maddr;
	unsigned long ubytes, mbytes;
	int result;
	unsigned char __user *buf;

	result = 0;
	buf = segment->buf;
	ubytes = segment->bufsz;
	mbytes = segment->memsz;
	maddr = segment->mem;
	while (mbytes) {
		struct page *page;
		char *ptr;
		size_t uchunk, mchunk;

		page = pfn_to_page(maddr >> PAGE_SHIFT);
		if (!page) {
			result = -ENOMEM;
			goto out;
		}
		ptr = kmap(page);
		ptr += maddr & ~PAGE_MASK;
		mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
		if (mchunk > mbytes)
			mchunk = mbytes;

		uchunk = mchunk;
		if (uchunk > ubytes) {
			uchunk = ubytes;
			/* Zero the trailing part of the page */
			memset(ptr + uchunk, 0, mchunk - uchunk);
		}
		result = copy_from_user(ptr, buf, uchunk);
		kexec_flush_icache_page(page);
		kunmap(page);
		if (result) {
			result = (result < 0) ? result : -EIO;
			goto out;
		}
		ubytes -= uchunk;
		maddr  += mchunk;
		buf    += mchunk;
		mbytes -= mchunk;
	}
out:
	return result;
}
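
/*
 * Load one segment into memory using the method that matches the image
 * type (normal page allocation vs. direct copy into the crash region).
 */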
static int kimage_load_segment(struct kimage *image,
			       struct kexec_segment *segment)
{
	int result = -ENOMEM;

	switch (image->type) {
	case KEXEC_TYPE_DEFAULT:
		result = kimage_load_normal_segment(image, segment);
		break;
	case KEXEC_TYPE_CRASH:
		result = kimage_load_crash_segment(image, segment);
		break;
	}

	return result;
}

/*
 * Exec Kernel system call: for obvious reasons only root may call it.
 *
 * This call breaks up into three pieces.
 * - A generic part which loads the new kernel from the current
 *   address space, and very carefully places the data in the
 *   allocated pages.
 *
 * - A generic part that interacts with the kernel and tells all of
 *   the devices to shut down, preventing ongoing DMAs and placing
 *   the devices in a consistent state so a later kernel can
 *   reinitialize them.
 *
 * - A machine specific part that isolates the processor and copies
 *   the image to its final destination, and jumps into the image
 *   at entry.
 *
 * kexec does not sync, or unmount filesystems so if you need
 * that to happen you need to do that yourself.
 */
struct kimage *kexec_image;
struct kimage *kexec_crash_image;
/*
 * A home grown binary mutex.
 * Nothing can wait so this mutex is safe to use
 * in interrupt context :)
 */
static int kexec_lock;

asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
			       struct kexec_segment __user *segments,
			       unsigned long flags)
{
	struct kimage **dest_image, *image;
	int locked;
	int result;

	/* We only trust the superuser with rebooting the system. */
	if (!capable(CAP_SYS_BOOT))
		return -EPERM;

	/*
	 * Verify we have a legal set of flags
	 * This leaves us room for future extensions.
	 */
	if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK))
		return -EINVAL;

	/* Verify we are on the appropriate architecture */
	if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) &&
	    ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT))
		return -EINVAL;

	/* Put an artificial cap on the number
	 * of segments passed to kexec_load.
	 */
	if (nr_segments > KEXEC_SEGMENT_MAX)
		return -EINVAL;

	image = NULL;
	result = 0;

	/* Because we write directly to the reserved memory
	 * region when loading crash kernels we need a mutex here to
	 * prevent multiple crash kernels from attempting to load
	 * simultaneously, and to prevent a crash kernel from loading
	 * over the top of an in-use crash kernel.
	 *
	 * KISS: always take the mutex.
	 */
	locked = xchg(&kexec_lock, 1);
	if (locked)
		return -EBUSY;

	dest_image = &kexec_image;
	if (flags & KEXEC_ON_CRASH)
		dest_image = &kexec_crash_image;
	if (nr_segments > 0) {
		unsigned long i;

		/* Loading another kernel to reboot into */
		if ((flags & KEXEC_ON_CRASH) == 0)
			result = kimage_normal_alloc(&image, entry,
						     nr_segments, segments);
		/* Loading another kernel to switch to if this one crashes */
		else if (flags & KEXEC_ON_CRASH) {
			/* Free any current crash dump kernel before
			 * we corrupt it.
			 */
			kimage_free(xchg(&kexec_crash_image, NULL));
			result = kimage_crash_alloc(&image, entry,
						    nr_segments, segments);
		}
		if (result)
			goto out;

		result = machine_kexec_prepare(image);
		if (result)
			goto out;

		for (i = 0; i < nr_segments; i++) {
			result = kimage_load_segment(image, &image->segment[i]);
			if (result)
				goto out;
		}
		result = kimage_terminate(image);
		if (result)
			goto out;
	}
	/* Install the new kernel, and uninstall the old */
	image = xchg(dest_image, image);

out:
	locked = xchg(&kexec_lock, 0); /* Release the mutex */
	BUG_ON(!locked);
	kimage_free(image);

	return result;
}

#ifdef CONFIG_COMPAT
asmlinkage long compat_sys_kexec_load(unsigned long entry,
				unsigned long nr_segments,
				struct compat_kexec_segment __user *segments,
				unsigned long flags)
{
	struct compat_kexec_segment in;
	struct kexec_segment out, __user *ksegments;
	unsigned long i, result;

	/* Don't allow clients that don't understand the native
	 * architecture to do anything.
	 */
	if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT)
		return -EINVAL;

	if (nr_segments > KEXEC_SEGMENT_MAX)
		return -EINVAL;

	ksegments = compat_alloc_user_space(nr_segments * sizeof(out));
	for (i = 0; i < nr_segments; i++) {
		result = copy_from_user(&in, &segments[i], sizeof(in));
		if (result)
			return -EFAULT;

		out.buf   = compat_ptr(in.buf);
		out.bufsz = in.bufsz;
		out.mem   = in.mem;
		out.memsz = in.memsz;

		result = copy_to_user(&ksegments[i], &out, sizeof(out));
		if (result)
			return -EFAULT;
	}

	return sys_kexec_load(entry, nr_segments, ksegments, flags);
}
#endif

void crash_kexec(struct pt_regs *regs)
{
	int locked;

	/* Take the kexec_lock here to prevent sys_kexec_load
	 * running on one cpu from replacing the crash kernel
	 * we are using after a panic on a different cpu.
	 *
	 * If the crash kernel was not located in a fixed area
	 * of memory the xchg(&kexec_crash_image) would be
	 * sufficient.  But since I reuse the memory...
	 */
	locked = xchg(&kexec_lock, 1);
	if (!locked) {
		if (kexec_crash_image) {
			struct pt_regs fixed_regs;
			crash_setup_regs(&fixed_regs, regs);
			crash_save_vmcoreinfo();
			machine_crash_shutdown(&fixed_regs);
			machine_kexec(kexec_crash_image);
		}
		locked = xchg(&kexec_lock, 0);
		BUG_ON(!locked);
	}
}
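
/*
 * Append a single ELF note (header, name, then descriptor data) to buf,
 * rounding each piece up to 4-byte alignment, and return the advanced
 * buffer position.
 */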
static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
			    size_t data_len)
{
	struct elf_note note;

	note.n_namesz = strlen(name) + 1;
	note.n_descsz = data_len;
	note.n_type   = type;
	memcpy(buf, &note, sizeof(note));
	buf += (sizeof(note) + 3)/4;
	memcpy(buf, name, note.n_namesz);
	buf += (note.n_namesz + 3)/4;
	memcpy(buf, data, note.n_descsz);
	buf += (note.n_descsz + 3)/4;

	return buf;
}

static void final_note(u32 *buf)
{
	struct elf_note note;

	note.n_namesz = 0;
	note.n_descsz = 0;
	note.n_type   = 0;
	memcpy(buf, &note, sizeof(note));
}

void crash_save_cpu(struct pt_regs *regs, int cpu)
{
	struct elf_prstatus prstatus;
	u32 *buf;

	if ((cpu < 0) || (cpu >= NR_CPUS))
		return;

	/* Using ELF notes here is opportunistic.
	 * I need a well defined structure format
	 * for the data I pass, and I need tags
	 * on the data to indicate what information I have
	 * squirrelled away.  ELF notes happen to provide
	 * all of that, so there is no need to invent something new.
	 */
	buf = (u32 *)per_cpu_ptr(crash_notes, cpu);
	if (!buf)
		return;
	memset(&prstatus, 0, sizeof(prstatus));
	prstatus.pr_pid = current->pid;
	elf_core_copy_regs(&prstatus.pr_reg, regs);
	buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
			      &prstatus, sizeof(prstatus));
	final_note(buf);
}

static int __init crash_notes_memory_init(void)
{
	/* Allocate memory for saving cpu registers. */
	crash_notes = alloc_percpu(note_buf_t);
	if (!crash_notes) {
		printk("Kexec: Memory allocation for saving cpu register"
		       " states failed\n");
		return -ENOMEM;
	}
	return 0;
}
module_init(crash_notes_memory_init)


/*
 * parsing the "crashkernel" commandline
 *
 * this code is intended to be called from architecture specific code
 */


/*
 * This function parses command lines in the format
 *
 *	crashkernel=ramsize-range:size[,...][@offset]
 *
 * The function returns 0 on success and -EINVAL on failure.
 */
static int __init parse_crashkernel_mem(char *cmdline,
					unsigned long long system_ram,
					unsigned long long *crash_size,
					unsigned long long *crash_base)
{
	char *cur = cmdline, *tmp;

	/* for each entry of the comma-separated list */
	do {
		unsigned long long start, end = ULLONG_MAX, size;

		/* get the start of the range */
		start = memparse(cur, &tmp);
		if (cur == tmp) {
			pr_warning("crashkernel: Memory value expected\n");
			return -EINVAL;
		}
		cur = tmp;
		if (*cur != '-') {
			pr_warning("crashkernel: '-' expected\n");
			return -EINVAL;
		}
		cur++;

		/* if no ':' is here, then we read the end */
		if (*cur != ':') {
			end = memparse(cur, &tmp);
			if (cur == tmp) {
				pr_warning("crashkernel: Memory "
					   "value expected\n");
				return -EINVAL;
			}
			cur = tmp;
			if (end <= start) {
				pr_warning("crashkernel: end <= start\n");
				return -EINVAL;
			}
		}

		if (*cur != ':') {
			pr_warning("crashkernel: ':' expected\n");
			return -EINVAL;
		}
		cur++;

		size = memparse(cur, &tmp);
		if (cur == tmp) {
			pr_warning("Memory value expected\n");
			return -EINVAL;
		}
		cur = tmp;
		if (size >= system_ram) {
			pr_warning("crashkernel: invalid size\n");
			return -EINVAL;
		}

		/* match ? */
		if (system_ram >= start && system_ram < end) {
			*crash_size = size;
			break;
		}
	} while (*cur++ == ',');

	if (*crash_size > 0) {
		while (*cur != ' ' && *cur != '@')
			cur++;
		if (*cur == '@') {
			cur++;
			*crash_base = memparse(cur, &tmp);
			if (cur == tmp) {
				pr_warning("Memory value expected "
					   "after '@'\n");
				return -EINVAL;
			}
		}
	}

	return 0;
}

/*
 * This function parses "simple" (old) crashkernel command lines like
 *
 *	crashkernel=size[@offset]
 *
 * It returns 0 on success and -EINVAL on failure.
 */
static int __init parse_crashkernel_simple(char *cmdline,
					   unsigned long long *crash_size,
					   unsigned long long *crash_base)
{
	char *cur = cmdline;

	*crash_size = memparse(cmdline, &cur);
	if (cmdline == cur) {
		pr_warning("crashkernel: memory value expected\n");
		return -EINVAL;
	}

	if (*cur == '@')
		*crash_base = memparse(cur+1, &cur);

	return 0;
}

/*
 * This function is the entry point for command line parsing and should be
 * called from the arch-specific code.
 */
int __init parse_crashkernel(char *cmdline,
			     unsigned long long system_ram,
			     unsigned long long *crash_size,
			     unsigned long long *crash_base)
{
	char *p = cmdline, *ck_cmdline = NULL;
	char *first_colon, *first_space;

	BUG_ON(!crash_size || !crash_base);
	*crash_size = 0;
	*crash_base = 0;

	/* find crashkernel and use the last one if there are more */
	p = strstr(p, "crashkernel=");
	while (p) {
		ck_cmdline = p;
		p = strstr(p+1, "crashkernel=");
	}

	if (!ck_cmdline)
		return -EINVAL;

	ck_cmdline += 12; /* strlen("crashkernel=") */

	/*
	 * if the commandline contains a ':', then that's the extended
	 * syntax -- if not, it must be the classic syntax
	 */
	first_colon = strchr(ck_cmdline, ':');
	first_space = strchr(ck_cmdline, ' ');
	if (first_colon && (!first_space || first_colon < first_space))
		return parse_crashkernel_mem(ck_cmdline, system_ram,
					     crash_size, crash_base);
	else
		return parse_crashkernel_simple(ck_cmdline, crash_size,
						crash_base);
}
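
/*
 * Finalize the vmcoreinfo note at crash time: record the crash time and
 * wrap the accumulated vmcoreinfo data in an ELF note.
 */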
void crash_save_vmcoreinfo(void)
{
	u32 *buf;

	if (!vmcoreinfo_size)
		return;

	vmcoreinfo_append_str("CRASHTIME=%ld", get_seconds());

	buf = (u32 *)vmcoreinfo_note;

	buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data,
			      vmcoreinfo_size);

	final_note(buf);
}
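
/*
 * Append a formatted string to the vmcoreinfo data, truncating it if the
 * buffer would overflow.
 */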
1332 { 1333 va_list args; 1334 char buf[0x50]; 1335 int r; 1336 1337 va_start(args, fmt); 1338 r = vsnprintf(buf, sizeof(buf), fmt, args); 1339 va_end(args); 1340 1341 if (r + vmcoreinfo_size > vmcoreinfo_max_size) 1342 r = vmcoreinfo_max_size - vmcoreinfo_size; 1343 1344 memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r); 1345 1346 vmcoreinfo_size += r; 1347 } 1348 1349 /* 1350 * provide an empty default implementation here -- architecture 1351 * code may override this 1352 */ 1353 void __attribute__ ((weak)) arch_crash_save_vmcoreinfo(void) 1354 {} 1355 1356 unsigned long __attribute__ ((weak)) paddr_vmcoreinfo_note(void) 1357 { 1358 return __pa((unsigned long)(char *)&vmcoreinfo_note); 1359 } 1360 1361 static int __init crash_save_vmcoreinfo_init(void) 1362 { 1363 VMCOREINFO_OSRELEASE(init_uts_ns.name.release); 1364 VMCOREINFO_PAGESIZE(PAGE_SIZE); 1365 1366 VMCOREINFO_SYMBOL(init_uts_ns); 1367 VMCOREINFO_SYMBOL(node_online_map); 1368 VMCOREINFO_SYMBOL(swapper_pg_dir); 1369 VMCOREINFO_SYMBOL(_stext); 1370 1371 #ifndef CONFIG_NEED_MULTIPLE_NODES 1372 VMCOREINFO_SYMBOL(mem_map); 1373 VMCOREINFO_SYMBOL(contig_page_data); 1374 #endif 1375 #ifdef CONFIG_SPARSEMEM 1376 VMCOREINFO_SYMBOL(mem_section); 1377 VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS); 1378 VMCOREINFO_STRUCT_SIZE(mem_section); 1379 VMCOREINFO_OFFSET(mem_section, section_mem_map); 1380 #endif 1381 VMCOREINFO_STRUCT_SIZE(page); 1382 VMCOREINFO_STRUCT_SIZE(pglist_data); 1383 VMCOREINFO_STRUCT_SIZE(zone); 1384 VMCOREINFO_STRUCT_SIZE(free_area); 1385 VMCOREINFO_STRUCT_SIZE(list_head); 1386 VMCOREINFO_SIZE(nodemask_t); 1387 VMCOREINFO_OFFSET(page, flags); 1388 VMCOREINFO_OFFSET(page, _count); 1389 VMCOREINFO_OFFSET(page, mapping); 1390 VMCOREINFO_OFFSET(page, lru); 1391 VMCOREINFO_OFFSET(pglist_data, node_zones); 1392 VMCOREINFO_OFFSET(pglist_data, nr_zones); 1393 #ifdef CONFIG_FLAT_NODE_MEM_MAP 1394 VMCOREINFO_OFFSET(pglist_data, node_mem_map); 1395 #endif 1396 VMCOREINFO_OFFSET(pglist_data, node_start_pfn); 1397 VMCOREINFO_OFFSET(pglist_data, node_spanned_pages); 1398 VMCOREINFO_OFFSET(pglist_data, node_id); 1399 VMCOREINFO_OFFSET(zone, free_area); 1400 VMCOREINFO_OFFSET(zone, vm_stat); 1401 VMCOREINFO_OFFSET(zone, spanned_pages); 1402 VMCOREINFO_OFFSET(free_area, free_list); 1403 VMCOREINFO_OFFSET(list_head, next); 1404 VMCOREINFO_OFFSET(list_head, prev); 1405 VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); 1406 VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); 1407 VMCOREINFO_NUMBER(NR_FREE_PAGES); 1408 VMCOREINFO_NUMBER(PG_lru); 1409 VMCOREINFO_NUMBER(PG_private); 1410 VMCOREINFO_NUMBER(PG_swapcache); 1411 1412 arch_crash_save_vmcoreinfo(); 1413 1414 return 0; 1415 } 1416 1417 module_init(crash_save_vmcoreinfo_init) 1418