/*
 * kexec.c - kexec system call
 * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
 *
 * This source code is licensed under the GNU General Public License,
 * Version 2.  See the file COPYING for more details.
 */

#include <linux/capability.h>
#include <linux/mm.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/kexec.h>
#include <linux/spinlock.h>
#include <linux/list.h>
#include <linux/highmem.h>
#include <linux/syscalls.h>
#include <linux/reboot.h>
#include <linux/ioport.h>
#include <linux/hardirq.h>
#include <linux/elf.h>
#include <linux/elfcore.h>
#include <linux/utsrelease.h>
#include <linux/utsname.h>
#include <linux/numa.h>

#include <asm/page.h>
#include <asm/uaccess.h>
#include <asm/io.h>
#include <asm/system.h>
#include <asm/semaphore.h>
#include <asm/sections.h>

/* Per cpu memory for storing cpu states in case of system crash. */
note_buf_t* crash_notes;

/* vmcoreinfo stuff */
unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
size_t vmcoreinfo_size;
size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);

/* Location of the reserved area for the crash kernel */
struct resource crashk_res = {
        .name  = "Crash kernel",
        .start = 0,
        .end   = 0,
        .flags = IORESOURCE_BUSY | IORESOURCE_MEM
};

int kexec_should_crash(struct task_struct *p)
{
        if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops)
                return 1;
        return 0;
}

/*
 * When kexec transitions to the new kernel there is a one-to-one
 * mapping between physical and virtual addresses.  On processors
 * where you can disable the MMU this is trivial, and easy.  For
 * others it is still a simple predictable page table to setup.
 *
 * In that environment kexec copies the new kernel to its final
 * resting place.  This means I can only support memory whose
 * physical address can fit in an unsigned long.  In particular
 * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
 * If the assembly stub has more restrictive requirements
 * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
 * defined more restrictively in <asm/kexec.h>.
 *
 * The code for the transition from the current kernel to the
 * new kernel is placed in the control_code_buffer, whose size
 * is given by KEXEC_CONTROL_CODE_SIZE.  In the best case only a single
 * page of memory is necessary, but some architectures require more.
 * Because this memory must be identity mapped in the transition from
 * virtual to physical addresses it must live in the range
 * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
 * modifiable.
 *
 * The assembly stub in the control code buffer is passed a linked list
 * of descriptor pages detailing the source pages of the new kernel,
 * and the destination addresses of those source pages.  As this data
 * structure is not used in the context of the current OS, it must
 * be self-contained.
 *
 * The code has been made to work with highmem pages and will use a
 * destination page in its final resting place (if it happens
 * to allocate it).  The end product of this is that most of the
 * physical address space, and most of RAM can be used.
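 *
 * The descriptor list itself is built out of kimage_entry_t values:
 * each entry is a page-aligned physical address whose low bits carry a
 * type flag -- IND_DESTINATION starts a new destination range,
 * IND_SOURCE supplies the next source page, IND_INDIRECTION points to
 * the page holding the next batch of entries, and IND_DONE terminates
 * the list (see kimage_add_entry() and for_each_kimage_entry() below).
 * A loaded image therefore looks roughly like:
 *
 *   DESTINATION(d), SOURCE(s0), SOURCE(s1), ..., INDIRECTION(next page),
 *   ..., DONE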
 *
 * Future directions include:
 *  - allocating a page table with the control code buffer identity
 *    mapped, to simplify machine_kexec and make kexec_on_panic more
 *    reliable.
 */

/*
 * KIMAGE_NO_DEST is an impossible destination address..., for
 * allocating pages whose destination address we do not care about.
 */
#define KIMAGE_NO_DEST (-1UL)

static int kimage_is_destination_range(struct kimage *image,
                                       unsigned long start, unsigned long end);
static struct page *kimage_alloc_page(struct kimage *image,
                                       gfp_t gfp_mask,
                                       unsigned long dest);

static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
                           unsigned long nr_segments,
                           struct kexec_segment __user *segments)
{
        size_t segment_bytes;
        struct kimage *image;
        unsigned long i;
        int result;

        /* Allocate a controlling structure */
        result = -ENOMEM;
        image = kzalloc(sizeof(*image), GFP_KERNEL);
        if (!image)
                goto out;

        image->head = 0;
        image->entry = &image->head;
        image->last_entry = &image->head;
        image->control_page = ~0; /* By default this does not apply */
        image->start = entry;
        image->type = KEXEC_TYPE_DEFAULT;

        /* Initialize the list of control pages */
        INIT_LIST_HEAD(&image->control_pages);

        /* Initialize the list of destination pages */
        INIT_LIST_HEAD(&image->dest_pages);

        /* Initialize the list of unusable pages */
        INIT_LIST_HEAD(&image->unuseable_pages);

        /* Read in the segments */
        image->nr_segments = nr_segments;
        segment_bytes = nr_segments * sizeof(*segments);
        result = copy_from_user(image->segment, segments, segment_bytes);
        if (result)
                goto out;

        /*
         * Verify we have good destination addresses.  The caller is
         * responsible for making certain we don't attempt to load
         * the new image into invalid or reserved areas of RAM.  This
         * just verifies it is an address we can use.
         *
         * Since the kernel does everything in page size chunks ensure
         * the destination addresses are page aligned.  Too many
         * special cases crop up when we don't do this.  The most
         * insidious is getting overlapping destination addresses
         * simply because addresses are changed to page size
         * granularity.
         */
        result = -EADDRNOTAVAIL;
        for (i = 0; i < nr_segments; i++) {
                unsigned long mstart, mend;

                mstart = image->segment[i].mem;
                mend   = mstart + image->segment[i].memsz;
                if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
                        goto out;
                if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
                        goto out;
        }

        /* Verify our destination addresses do not overlap.
         * If we allowed overlapping destination addresses
         * through, very weird things can happen with no
         * easy explanation as one segment stomps on another.
         */
        result = -EINVAL;
        for (i = 0; i < nr_segments; i++) {
                unsigned long mstart, mend;
                unsigned long j;

                mstart = image->segment[i].mem;
                mend   = mstart + image->segment[i].memsz;
                for (j = 0; j < i; j++) {
                        unsigned long pstart, pend;
                        pstart = image->segment[j].mem;
                        pend   = pstart + image->segment[j].memsz;
                        /* Do the segments overlap ? */
                        if ((mend > pstart) && (mstart < pend))
                                goto out;
                }
        }

        /* Ensure our buffer sizes do not exceed
         * our memory sizes.  This should always be the case,
         * and it is easier to check up front than to be surprised
         * later on.
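         *
         * (bufsz bytes are copied in from the user buffer; whatever
         * remains of memsz is zero-filled when the segment is loaded,
         * so bufsz == memsz is fine but bufsz > memsz is rejected.)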
         */
        result = -EINVAL;
        for (i = 0; i < nr_segments; i++) {
                if (image->segment[i].bufsz > image->segment[i].memsz)
                        goto out;
        }

        result = 0;
out:
        if (result == 0)
                *rimage = image;
        else
                kfree(image);

        return result;

}

static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
                               unsigned long nr_segments,
                               struct kexec_segment __user *segments)
{
        int result;
        struct kimage *image;

        /* Allocate and initialize a controlling structure */
        image = NULL;
        result = do_kimage_alloc(&image, entry, nr_segments, segments);
        if (result)
                goto out;

        *rimage = image;

        /*
         * Find a location for the control code buffer, and add it
         * to the vector of segments so that its pages will also be
         * counted as destination pages.
         */
        result = -ENOMEM;
        image->control_code_page = kimage_alloc_control_pages(image,
                                        get_order(KEXEC_CONTROL_CODE_SIZE));
        if (!image->control_code_page) {
                printk(KERN_ERR "Could not allocate control_code_buffer\n");
                goto out;
        }

        result = 0;
out:
        if (result == 0)
                *rimage = image;
        else
                kfree(image);

        return result;
}

static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
                              unsigned long nr_segments,
                              struct kexec_segment __user *segments)
{
        int result;
        struct kimage *image;
        unsigned long i;

        image = NULL;
        /* Verify we have a valid entry point */
        if ((entry < crashk_res.start) || (entry > crashk_res.end)) {
                result = -EADDRNOTAVAIL;
                goto out;
        }

        /* Allocate and initialize a controlling structure */
        result = do_kimage_alloc(&image, entry, nr_segments, segments);
        if (result)
                goto out;

        /* Enable the special crash kernel control page
         * allocation policy.
         */
        image->control_page = crashk_res.start;
        image->type = KEXEC_TYPE_CRASH;

        /*
         * Verify we have good destination addresses.  Normally
         * the caller is responsible for making certain we don't
         * attempt to load the new image into invalid or reserved
         * areas of RAM.  But crash kernels are preloaded into a
         * reserved area of RAM.  We must ensure the addresses
         * are in the reserved area, otherwise preloading the
         * kernel could corrupt things.
         */
        result = -EADDRNOTAVAIL;
        for (i = 0; i < nr_segments; i++) {
                unsigned long mstart, mend;

                mstart = image->segment[i].mem;
                mend = mstart + image->segment[i].memsz - 1;
                /* Ensure we are within the crash kernel limits */
                if ((mstart < crashk_res.start) || (mend > crashk_res.end))
                        goto out;
        }

        /*
         * Find a location for the control code buffer, and add it
         * to the vector of segments so that its pages will also be
         * counted as destination pages.
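         *
         * (For a crash image this request is satisfied by the first-fit
         * hole allocator in kimage_alloc_crash_control_pages(), so the
         * control pages themselves also come out of crashk_res.)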
         */
        result = -ENOMEM;
        image->control_code_page = kimage_alloc_control_pages(image,
                                        get_order(KEXEC_CONTROL_CODE_SIZE));
        if (!image->control_code_page) {
                printk(KERN_ERR "Could not allocate control_code_buffer\n");
                goto out;
        }

        result = 0;
out:
        if (result == 0)
                *rimage = image;
        else
                kfree(image);

        return result;
}

static int kimage_is_destination_range(struct kimage *image,
                                       unsigned long start,
                                       unsigned long end)
{
        unsigned long i;

        for (i = 0; i < image->nr_segments; i++) {
                unsigned long mstart, mend;

                mstart = image->segment[i].mem;
                mend = mstart + image->segment[i].memsz;
                if ((end > mstart) && (start < mend))
                        return 1;
        }

        return 0;
}

static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
{
        struct page *pages;

        pages = alloc_pages(gfp_mask, order);
        if (pages) {
                unsigned int count, i;
                pages->mapping = NULL;
                set_page_private(pages, order);
                count = 1 << order;
                for (i = 0; i < count; i++)
                        SetPageReserved(pages + i);
        }

        return pages;
}

static void kimage_free_pages(struct page *page)
{
        unsigned int order, count, i;

        order = page_private(page);
        count = 1 << order;
        for (i = 0; i < count; i++)
                ClearPageReserved(page + i);
        __free_pages(page, order);
}

static void kimage_free_page_list(struct list_head *list)
{
        struct list_head *pos, *next;

        list_for_each_safe(pos, next, list) {
                struct page *page;

                page = list_entry(pos, struct page, lru);
                list_del(&page->lru);
                kimage_free_pages(page);
        }
}

static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
                                                      unsigned int order)
{
        /* Control pages are special, they are the intermediaries
         * that are needed while we copy the rest of the pages
         * to their final resting place.  As such they must
         * not conflict with either the destination addresses
         * or memory the kernel is already using.
         *
         * The only case where we really need more than one of
         * these are for architectures where we cannot disable
         * the MMU and must instead generate an identity mapped
         * page table for all of the memory.
         *
         * At worst this runs in O(N) of the image size.
         */
        struct list_head extra_pages;
        struct page *pages;
        unsigned int count;

        count = 1 << order;
        INIT_LIST_HEAD(&extra_pages);

        /* Loop while I can allocate a page and the page allocated
         * is a destination page.
         */
        do {
                unsigned long pfn, epfn, addr, eaddr;

                pages = kimage_alloc_pages(GFP_KERNEL, order);
                if (!pages)
                        break;
                pfn   = page_to_pfn(pages);
                epfn  = pfn + count;
                addr  = pfn << PAGE_SHIFT;
                eaddr = epfn << PAGE_SHIFT;
                if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
                              kimage_is_destination_range(image, addr, eaddr)) {
                        list_add(&pages->lru, &extra_pages);
                        pages = NULL;
                }
        } while (!pages);

        if (pages) {
                /* Remember the allocated page... */
                list_add(&pages->lru, &image->control_pages);

                /* Because the page is already in its destination
                 * location we will never allocate another page at
                 * that address.  Therefore kimage_alloc_pages
                 * will not return it (again) and we don't need
                 * to give it an entry in image->segment[].
                 */
        }
        /* Deal with the destination pages I have inadvertently allocated.
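         *
         * (They were set aside in extra_pages above because they fell
         * inside a segment's destination range or at or beyond
         * KEXEC_CONTROL_MEMORY_LIMIT.)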
         *
         * Ideally I would convert multi-page allocations into single
         * page allocations, and add everything to image->dest_pages.
         *
         * For now it is simpler to just free the pages.
         */
        kimage_free_page_list(&extra_pages);

        return pages;
}

static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
                                                     unsigned int order)
{
        /* Control pages are special, they are the intermediaries
         * that are needed while we copy the rest of the pages
         * to their final resting place.  As such they must
         * not conflict with either the destination addresses
         * or memory the kernel is already using.
         *
         * Control pages are also the only pages we must allocate
         * when loading a crash kernel.  All of the other pages
         * are specified by the segments and we just memcpy
         * into them directly.
         *
         * The only case where we really need more than one of
         * these are for architectures where we cannot disable
         * the MMU and must instead generate an identity mapped
         * page table for all of the memory.
         *
         * Given the low demand this implements a very simple
         * allocator that finds the first hole of the appropriate
         * size in the reserved memory region, and allocates all
         * of the memory up to and including the hole.
         */
        unsigned long hole_start, hole_end, size;
        struct page *pages;

        pages = NULL;
        size = (1 << order) << PAGE_SHIFT;
        hole_start = (image->control_page + (size - 1)) & ~(size - 1);
        hole_end   = hole_start + size - 1;
        while (hole_end <= crashk_res.end) {
                unsigned long i;

                if (hole_end > KEXEC_CONTROL_MEMORY_LIMIT)
                        break;
                if (hole_end > crashk_res.end)
                        break;
                /* See if I overlap any of the segments */
                for (i = 0; i < image->nr_segments; i++) {
                        unsigned long mstart, mend;

                        mstart = image->segment[i].mem;
                        mend   = mstart + image->segment[i].memsz - 1;
                        if ((hole_end >= mstart) && (hole_start <= mend)) {
                                /* Advance the hole to the end of the segment */
                                hole_start = (mend + (size - 1)) & ~(size - 1);
                                hole_end   = hole_start + size - 1;
                                break;
                        }
                }
                /* If I don't overlap any segments I have found my hole!
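                 *
                 * (For example, an order-1 request on 4K pages needs an
                 * 8K hole, so hole_start advances through crashk_res in
                 * 8K-aligned steps until a slot clears every segment.)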
                 */
                if (i == image->nr_segments) {
                        pages = pfn_to_page(hole_start >> PAGE_SHIFT);
                        break;
                }
        }
        if (pages)
                image->control_page = hole_end;

        return pages;
}


struct page *kimage_alloc_control_pages(struct kimage *image,
                                        unsigned int order)
{
        struct page *pages = NULL;

        switch (image->type) {
        case KEXEC_TYPE_DEFAULT:
                pages = kimage_alloc_normal_control_pages(image, order);
                break;
        case KEXEC_TYPE_CRASH:
                pages = kimage_alloc_crash_control_pages(image, order);
                break;
        }

        return pages;
}

static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
{
        if (*image->entry != 0)
                image->entry++;

        if (image->entry == image->last_entry) {
                kimage_entry_t *ind_page;
                struct page *page;

                page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
                if (!page)
                        return -ENOMEM;

                ind_page = page_address(page);
                *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
                image->entry = ind_page;
                image->last_entry = ind_page +
                                ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
        }
        *image->entry = entry;
        image->entry++;
        *image->entry = 0;

        return 0;
}

static int kimage_set_destination(struct kimage *image,
                                  unsigned long destination)
{
        int result;

        destination &= PAGE_MASK;
        result = kimage_add_entry(image, destination | IND_DESTINATION);
        if (result == 0)
                image->destination = destination;

        return result;
}


static int kimage_add_page(struct kimage *image, unsigned long page)
{
        int result;

        page &= PAGE_MASK;
        result = kimage_add_entry(image, page | IND_SOURCE);
        if (result == 0)
                image->destination += PAGE_SIZE;

        return result;
}


static void kimage_free_extra_pages(struct kimage *image)
{
        /* Walk through and free any extra destination pages I may have */
        kimage_free_page_list(&image->dest_pages);

        /* Walk through and free any unusable pages I have cached */
        kimage_free_page_list(&image->unuseable_pages);

}
static int kimage_terminate(struct kimage *image)
{
        if (*image->entry != 0)
                image->entry++;

        *image->entry = IND_DONE;

        return 0;
}

#define for_each_kimage_entry(image, ptr, entry) \
        for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
                ptr = (entry & IND_INDIRECTION) ? \
                        phys_to_virt((entry & PAGE_MASK)) : ptr + 1)

static void kimage_free_entry(kimage_entry_t entry)
{
        struct page *page;

        page = pfn_to_page(entry >> PAGE_SHIFT);
        kimage_free_pages(page);
}

static void kimage_free(struct kimage *image)
{
        kimage_entry_t *ptr, entry;
        kimage_entry_t ind = 0;

        if (!image)
                return;

        kimage_free_extra_pages(image);
        for_each_kimage_entry(image, ptr, entry) {
                if (entry & IND_INDIRECTION) {
                        /* Free the previous indirection page */
                        if (ind & IND_INDIRECTION)
                                kimage_free_entry(ind);
                        /* Save this indirection page until we are
                         * done with it.
                         */
                        ind = entry;
                }
                else if (entry & IND_SOURCE)
                        kimage_free_entry(entry);
        }
        /* Free the final indirection page */
        if (ind & IND_INDIRECTION)
                kimage_free_entry(ind);

        /* Handle any machine specific cleanup */
        machine_kexec_cleanup(image);

        /* Free the kexec control pages...
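         *
         * (Only normal images put anything on this list; the crash
         * allocator hands back pages inside crashk_res without linking
         * them here, so for a crash image this is a no-op.)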
         */
        kimage_free_page_list(&image->control_pages);
        kfree(image);
}

static kimage_entry_t *kimage_dst_used(struct kimage *image,
                                       unsigned long page)
{
        kimage_entry_t *ptr, entry;
        unsigned long destination = 0;

        for_each_kimage_entry(image, ptr, entry) {
                if (entry & IND_DESTINATION)
                        destination = entry & PAGE_MASK;
                else if (entry & IND_SOURCE) {
                        if (page == destination)
                                return ptr;
                        destination += PAGE_SIZE;
                }
        }

        return NULL;
}

static struct page *kimage_alloc_page(struct kimage *image,
                                      gfp_t gfp_mask,
                                      unsigned long destination)
{
        /*
         * Here we implement safeguards to ensure that a source page
         * is not copied to its destination page before the data on
         * the destination page is no longer useful.
         *
         * To do this we maintain the invariant that a source page is
         * either its own destination page, or it is not a
         * destination page at all.
         *
         * That is slightly stronger than required, but the proof
         * that no problems will occur is trivial, and the
         * implementation is simple to verify.
         *
         * When allocating all pages normally this algorithm will run
         * in O(N) time, but in the worst case it will run in O(N^2)
         * time.  If the runtime is a problem the data structures can
         * be fixed.
         */
        struct page *page;
        unsigned long addr;

        /*
         * Walk through the list of destination pages, and see if I
         * have a match.
         */
        list_for_each_entry(page, &image->dest_pages, lru) {
                addr = page_to_pfn(page) << PAGE_SHIFT;
                if (addr == destination) {
                        list_del(&page->lru);
                        return page;
                }
        }
        page = NULL;
        while (1) {
                kimage_entry_t *old;

                /* Allocate a page, if we run out of memory give up */
                page = kimage_alloc_pages(gfp_mask, 0);
                if (!page)
                        return NULL;
                /* If the page cannot be used file it away */
                if (page_to_pfn(page) >
                                (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
                        list_add(&page->lru, &image->unuseable_pages);
                        continue;
                }
                addr = page_to_pfn(page) << PAGE_SHIFT;

                /* If it is the destination page we want, use it */
                if (addr == destination)
                        break;

                /* If the page is not a destination page use it */
                if (!kimage_is_destination_range(image, addr,
                                                 addr + PAGE_SIZE))
                        break;

                /*
                 * I know that the page is someone's destination page.
                 * See if there is already a source page for this
                 * destination page.  And if so swap the source pages.
                 */
                old = kimage_dst_used(image, addr);
                if (old) {
                        /* If so move it */
                        unsigned long old_addr;
                        struct page *old_page;

                        old_addr = *old & PAGE_MASK;
                        old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
                        copy_highpage(page, old_page);
                        *old = addr | (*old & ~PAGE_MASK);

                        /* The old page I have found cannot be a
                         * destination page, so return it.
                         */
                        addr = old_addr;
                        page = old_page;
                        break;
                }
                else {
                        /* Place the page on the destination list; I
                         * will use it later.
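                         *
                         * (It stays allocated but unused for now; a later
                         * call asking for exactly this destination will
                         * find it on image->dest_pages and return it,
                         * preserving the invariant described above.)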
                         */
                        list_add(&page->lru, &image->dest_pages);
                }
        }

        return page;
}

static int kimage_load_normal_segment(struct kimage *image,
                                      struct kexec_segment *segment)
{
        unsigned long maddr;
        unsigned long ubytes, mbytes;
        int result;
        unsigned char __user *buf;

        result = 0;
        buf = segment->buf;
        ubytes = segment->bufsz;
        mbytes = segment->memsz;
        maddr = segment->mem;

        result = kimage_set_destination(image, maddr);
        if (result < 0)
                goto out;

        while (mbytes) {
                struct page *page;
                char *ptr;
                size_t uchunk, mchunk;

                page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
                if (!page) {
                        result = -ENOMEM;
                        goto out;
                }
                result = kimage_add_page(image, page_to_pfn(page)
                                                        << PAGE_SHIFT);
                if (result < 0)
                        goto out;

                ptr = kmap(page);
                /* Start with a clear page */
                memset(ptr, 0, PAGE_SIZE);
                ptr += maddr & ~PAGE_MASK;
                mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
                if (mchunk > mbytes)
                        mchunk = mbytes;

                uchunk = mchunk;
                if (uchunk > ubytes)
                        uchunk = ubytes;

                result = copy_from_user(ptr, buf, uchunk);
                kunmap(page);
                if (result) {
                        result = (result < 0) ? result : -EIO;
                        goto out;
                }
                ubytes -= uchunk;
                maddr  += mchunk;
                buf    += mchunk;
                mbytes -= mchunk;
        }
out:
        return result;
}

static int kimage_load_crash_segment(struct kimage *image,
                                     struct kexec_segment *segment)
{
        /* For crash dump kernels we simply copy the data from
         * user space to its destination.
         * We do things a page at a time for the sake of kmap.
         */
        unsigned long maddr;
        unsigned long ubytes, mbytes;
        int result;
        unsigned char __user *buf;

        result = 0;
        buf = segment->buf;
        ubytes = segment->bufsz;
        mbytes = segment->memsz;
        maddr = segment->mem;
        while (mbytes) {
                struct page *page;
                char *ptr;
                size_t uchunk, mchunk;

                page = pfn_to_page(maddr >> PAGE_SHIFT);
                if (!page) {
                        result = -ENOMEM;
                        goto out;
                }
                ptr = kmap(page);
                ptr += maddr & ~PAGE_MASK;
                mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
                if (mchunk > mbytes)
                        mchunk = mbytes;

                uchunk = mchunk;
                if (uchunk > ubytes) {
                        uchunk = ubytes;
                        /* Zero the trailing part of the page */
                        memset(ptr + uchunk, 0, mchunk - uchunk);
                }
                result = copy_from_user(ptr, buf, uchunk);
                kexec_flush_icache_page(page);
                kunmap(page);
                if (result) {
                        result = (result < 0) ? result : -EIO;
                        goto out;
                }
                ubytes -= uchunk;
                maddr  += mchunk;
                buf    += mchunk;
                mbytes -= mchunk;
        }
out:
        return result;
}

static int kimage_load_segment(struct kimage *image,
                               struct kexec_segment *segment)
{
        int result = -ENOMEM;

        switch (image->type) {
        case KEXEC_TYPE_DEFAULT:
                result = kimage_load_normal_segment(image, segment);
                break;
        case KEXEC_TYPE_CRASH:
                result = kimage_load_crash_segment(image, segment);
                break;
        }

        return result;
}

/*
 * Exec Kernel system call: for obvious reasons only root may call it.
 *
 * This call breaks up into three pieces.
 * - A generic part which loads the new kernel from the current
 *   address space, and very carefully places the data in the
 *   allocated pages.
 *
 * - A generic part that interacts with the kernel and tells all of
 *   the devices to shut down, preventing ongoing DMAs and placing
 *   the devices in a consistent state so a later kernel can
 *   reinitialize them.
 *
 * - A machine specific part that includes the syscall number
 *   and then copies the image to its final destination and
 *   jumps into the image at entry.
 *
 * kexec does not sync or unmount filesystems, so if you need
 * that to happen you need to do it yourself.
 */
struct kimage *kexec_image;
struct kimage *kexec_crash_image;
/*
 * A home-grown binary mutex.
 * Nothing can wait so this mutex is safe to use
 * in interrupt context :)
 */
static int kexec_lock;

asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
                               struct kexec_segment __user *segments,
                               unsigned long flags)
{
        struct kimage **dest_image, *image;
        int locked;
        int result;

        /* We only trust the superuser with rebooting the system. */
        if (!capable(CAP_SYS_BOOT))
                return -EPERM;

        /*
         * Verify we have a legal set of flags
         * This leaves us room for future extensions.
         */
        if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK))
                return -EINVAL;

        /* Verify we are on the appropriate architecture */
        if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) &&
                ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT))
                return -EINVAL;

        /* Put an artificial cap on the number
         * of segments passed to kexec_load.
         */
        if (nr_segments > KEXEC_SEGMENT_MAX)
                return -EINVAL;

        image = NULL;
        result = 0;

        /* Because we write directly to the reserved memory
         * region when loading crash kernels we need a mutex here to
         * prevent multiple crash kernels from attempting to load
         * simultaneously, and to prevent a crash kernel from loading
         * over the top of an in-use crash kernel.
         *
         * KISS: always take the mutex.
         */
        locked = xchg(&kexec_lock, 1);
        if (locked)
                return -EBUSY;

        dest_image = &kexec_image;
        if (flags & KEXEC_ON_CRASH)
                dest_image = &kexec_crash_image;
        if (nr_segments > 0) {
                unsigned long i;

                /* Loading another kernel to reboot into */
                if ((flags & KEXEC_ON_CRASH) == 0)
                        result = kimage_normal_alloc(&image, entry,
                                                     nr_segments, segments);
                /* Loading another kernel to switch to if this one crashes */
                else if (flags & KEXEC_ON_CRASH) {
                        /* Free any current crash dump kernel before
                         * we corrupt it.
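                         *
                         * (kimage_free() tolerates a NULL image, so this
                         * is safe even when no crash kernel has been
                         * loaded yet.)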
                         */
                        kimage_free(xchg(&kexec_crash_image, NULL));
                        result = kimage_crash_alloc(&image, entry,
                                                    nr_segments, segments);
                }
                if (result)
                        goto out;

                result = machine_kexec_prepare(image);
                if (result)
                        goto out;

                for (i = 0; i < nr_segments; i++) {
                        result = kimage_load_segment(image, &image->segment[i]);
                        if (result)
                                goto out;
                }
                result = kimage_terminate(image);
                if (result)
                        goto out;
        }
        /* Install the new kernel and uninstall the old */
        image = xchg(dest_image, image);

out:
        locked = xchg(&kexec_lock, 0); /* Release the mutex */
        BUG_ON(!locked);
        kimage_free(image);

        return result;
}

#ifdef CONFIG_COMPAT
asmlinkage long compat_sys_kexec_load(unsigned long entry,
                                unsigned long nr_segments,
                                struct compat_kexec_segment __user *segments,
                                unsigned long flags)
{
        struct compat_kexec_segment in;
        struct kexec_segment out, __user *ksegments;
        unsigned long i, result;

        /* Don't allow clients that don't understand the native
         * architecture to do anything.
         */
        if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT)
                return -EINVAL;

        if (nr_segments > KEXEC_SEGMENT_MAX)
                return -EINVAL;

        ksegments = compat_alloc_user_space(nr_segments * sizeof(out));
        for (i = 0; i < nr_segments; i++) {
                result = copy_from_user(&in, &segments[i], sizeof(in));
                if (result)
                        return -EFAULT;

                out.buf   = compat_ptr(in.buf);
                out.bufsz = in.bufsz;
                out.mem   = in.mem;
                out.memsz = in.memsz;

                result = copy_to_user(&ksegments[i], &out, sizeof(out));
                if (result)
                        return -EFAULT;
        }

        return sys_kexec_load(entry, nr_segments, ksegments, flags);
}
#endif

void crash_kexec(struct pt_regs *regs)
{
        int locked;


        /* Take the kexec_lock here to prevent sys_kexec_load
         * running on one cpu from replacing the crash kernel
         * we are using after a panic on a different cpu.
         *
         * If the crash kernel was not located in a fixed area
         * of memory the xchg(&kexec_crash_image) would be
         * sufficient.  But since I reuse the memory...
         */
        locked = xchg(&kexec_lock, 1);
        if (!locked) {
                if (kexec_crash_image) {
                        struct pt_regs fixed_regs;
                        crash_setup_regs(&fixed_regs, regs);
                        crash_save_vmcoreinfo();
                        machine_crash_shutdown(&fixed_regs);
                        machine_kexec(kexec_crash_image);
                }
                locked = xchg(&kexec_lock, 0);
                BUG_ON(!locked);
        }
}

static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
                            size_t data_len)
{
        struct elf_note note;

        note.n_namesz = strlen(name) + 1;
        note.n_descsz = data_len;
        note.n_type   = type;
        memcpy(buf, &note, sizeof(note));
        buf += (sizeof(note) + 3)/4;
        memcpy(buf, name, note.n_namesz);
        buf += (note.n_namesz + 3)/4;
        memcpy(buf, data, note.n_descsz);
        buf += (note.n_descsz + 3)/4;

        return buf;
}

static void final_note(u32 *buf)
{
        struct elf_note note;

        note.n_namesz = 0;
        note.n_descsz = 0;
        note.n_type   = 0;
        memcpy(buf, &note, sizeof(note));
}

void crash_save_cpu(struct pt_regs *regs, int cpu)
{
        struct elf_prstatus prstatus;
        u32 *buf;

        if ((cpu < 0) || (cpu >= NR_CPUS))
                return;

        /* Using ELF notes here is opportunistic.
         * I need a well defined structure format
         * for the data I pass, and I need tags
         * on the data to indicate what information I have
         * squirrelled away.  ELF notes happen to provide
         * all of that, so there is no need to invent something new.
         */
        buf = (u32*)per_cpu_ptr(crash_notes, cpu);
        if (!buf)
                return;
        memset(&prstatus, 0, sizeof(prstatus));
        prstatus.pr_pid = current->pid;
        elf_core_copy_regs(&prstatus.pr_reg, regs);
        buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
                              &prstatus, sizeof(prstatus));
        final_note(buf);
}

static int __init crash_notes_memory_init(void)
{
        /* Allocate memory for saving cpu registers. */
        crash_notes = alloc_percpu(note_buf_t);
        if (!crash_notes) {
                printk("Kexec: Memory allocation for saving cpu register"
                       " states failed\n");
                return -ENOMEM;
        }
        return 0;
}
module_init(crash_notes_memory_init)


/*
 * Parsing of the "crashkernel" command line.
 *
 * This code is intended to be called from architecture specific code.
 */


/*
 * This function parses command lines in the format
 *
 *     crashkernel=ramsize-range:size[,...][@offset]
 *
 * The function returns 0 on success and -EINVAL on failure.
 */
static int __init parse_crashkernel_mem(char *cmdline,
                                        unsigned long long system_ram,
                                        unsigned long long *crash_size,
                                        unsigned long long *crash_base)
{
        char *cur = cmdline, *tmp;

        /* for each entry of the comma-separated list */
        do {
                unsigned long long start, end = ULLONG_MAX, size;

                /* get the start of the range */
                start = memparse(cur, &tmp);
                if (cur == tmp) {
                        pr_warning("crashkernel: Memory value expected\n");
                        return -EINVAL;
                }
                cur = tmp;
                if (*cur != '-') {
                        pr_warning("crashkernel: '-' expected\n");
                        return -EINVAL;
                }
                cur++;

                /* if no ':' is here, then we read the end */
                if (*cur != ':') {
                        end = memparse(cur, &tmp);
                        if (cur == tmp) {
                                pr_warning("crashkernel: Memory "
                                                "value expected\n");
                                return -EINVAL;
                        }
                        cur = tmp;
                        if (end <= start) {
                                pr_warning("crashkernel: end <= start\n");
                                return -EINVAL;
                        }
                }

                if (*cur != ':') {
                        pr_warning("crashkernel: ':' expected\n");
                        return -EINVAL;
                }
                cur++;

                size = memparse(cur, &tmp);
                if (cur == tmp) {
                        pr_warning("Memory value expected\n");
                        return -EINVAL;
                }
                cur = tmp;
                if (size >= system_ram) {
                        pr_warning("crashkernel: invalid size\n");
                        return -EINVAL;
                }

                /* match ? */
                if (system_ram >= start && system_ram <= end) {
                        *crash_size = size;
                        break;
                }
        } while (*cur++ == ',');

        if (*crash_size > 0) {
                while (*cur != ' ' && *cur != '@')
                        cur++;
                if (*cur == '@') {
                        cur++;
                        *crash_base = memparse(cur, &tmp);
                        if (cur == tmp) {
                                pr_warning("Memory value expected "
                                                "after '@'\n");
                                return -EINVAL;
                        }
                }
        }

        return 0;
}

/*
 * This function parses "simple" (old) crashkernel command lines like
 *
 *     crashkernel=size[@offset]
 *
 * It returns 0 on success and -EINVAL on failure.
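 *
 * For example, crashkernel=64M@16M reserves 64M of RAM for the crash
 * kernel, starting at physical address 16M.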
 */
static int __init parse_crashkernel_simple(char *cmdline,
                                           unsigned long long *crash_size,
                                           unsigned long long *crash_base)
{
        char *cur = cmdline;

        *crash_size = memparse(cmdline, &cur);
        if (cmdline == cur) {
                pr_warning("crashkernel: memory value expected\n");
                return -EINVAL;
        }

        if (*cur == '@')
                *crash_base = memparse(cur+1, &cur);

        return 0;
}

/*
 * This function is the entry point for command line parsing and should be
 * called from the arch-specific code.
 */
int __init parse_crashkernel(char *cmdline,
                             unsigned long long system_ram,
                             unsigned long long *crash_size,
                             unsigned long long *crash_base)
{
        char *p = cmdline, *ck_cmdline = NULL;
        char *first_colon, *first_space;

        BUG_ON(!crash_size || !crash_base);
        *crash_size = 0;
        *crash_base = 0;

        /* find crashkernel and use the last one if there are more */
        p = strstr(p, "crashkernel=");
        while (p) {
                ck_cmdline = p;
                p = strstr(p+1, "crashkernel=");
        }

        if (!ck_cmdline)
                return -EINVAL;

        ck_cmdline += 12; /* strlen("crashkernel=") */

        /*
         * if the commandline contains a ':', then that's the extended
         * syntax -- if not, it must be the classic syntax
         */
        first_colon = strchr(ck_cmdline, ':');
        first_space = strchr(ck_cmdline, ' ');
        if (first_colon && (!first_space || first_colon < first_space))
                return parse_crashkernel_mem(ck_cmdline, system_ram,
                                crash_size, crash_base);
        else
                return parse_crashkernel_simple(ck_cmdline, crash_size,
                                crash_base);

        return 0;
}



void crash_save_vmcoreinfo(void)
{
        u32 *buf;

        if (!vmcoreinfo_size)
                return;

        vmcoreinfo_append_str("CRASHTIME=%ld", get_seconds());

        buf = (u32 *)vmcoreinfo_note;

        buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data,
                              vmcoreinfo_size);

        final_note(buf);
}

void vmcoreinfo_append_str(const char *fmt, ...)
1333 { 1334 va_list args; 1335 char buf[0x50]; 1336 int r; 1337 1338 va_start(args, fmt); 1339 r = vsnprintf(buf, sizeof(buf), fmt, args); 1340 va_end(args); 1341 1342 if (r + vmcoreinfo_size > vmcoreinfo_max_size) 1343 r = vmcoreinfo_max_size - vmcoreinfo_size; 1344 1345 memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r); 1346 1347 vmcoreinfo_size += r; 1348 } 1349 1350 /* 1351 * provide an empty default implementation here -- architecture 1352 * code may override this 1353 */ 1354 void __attribute__ ((weak)) arch_crash_save_vmcoreinfo(void) 1355 {} 1356 1357 unsigned long __attribute__ ((weak)) paddr_vmcoreinfo_note(void) 1358 { 1359 return __pa((unsigned long)(char *)&vmcoreinfo_note); 1360 } 1361 1362 static int __init crash_save_vmcoreinfo_init(void) 1363 { 1364 vmcoreinfo_append_str("OSRELEASE=%s\n", init_uts_ns.name.release); 1365 vmcoreinfo_append_str("PAGESIZE=%ld\n", PAGE_SIZE); 1366 1367 VMCOREINFO_SYMBOL(init_uts_ns); 1368 VMCOREINFO_SYMBOL(node_online_map); 1369 VMCOREINFO_SYMBOL(swapper_pg_dir); 1370 VMCOREINFO_SYMBOL(_stext); 1371 1372 #ifndef CONFIG_NEED_MULTIPLE_NODES 1373 VMCOREINFO_SYMBOL(mem_map); 1374 VMCOREINFO_SYMBOL(contig_page_data); 1375 #endif 1376 #ifdef CONFIG_SPARSEMEM 1377 VMCOREINFO_SYMBOL(mem_section); 1378 VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS); 1379 VMCOREINFO_SIZE(mem_section); 1380 VMCOREINFO_OFFSET(mem_section, section_mem_map); 1381 #endif 1382 VMCOREINFO_SIZE(page); 1383 VMCOREINFO_SIZE(pglist_data); 1384 VMCOREINFO_SIZE(zone); 1385 VMCOREINFO_SIZE(free_area); 1386 VMCOREINFO_SIZE(list_head); 1387 VMCOREINFO_TYPEDEF_SIZE(nodemask_t); 1388 VMCOREINFO_OFFSET(page, flags); 1389 VMCOREINFO_OFFSET(page, _count); 1390 VMCOREINFO_OFFSET(page, mapping); 1391 VMCOREINFO_OFFSET(page, lru); 1392 VMCOREINFO_OFFSET(pglist_data, node_zones); 1393 VMCOREINFO_OFFSET(pglist_data, nr_zones); 1394 #ifdef CONFIG_FLAT_NODE_MEM_MAP 1395 VMCOREINFO_OFFSET(pglist_data, node_mem_map); 1396 #endif 1397 VMCOREINFO_OFFSET(pglist_data, node_start_pfn); 1398 VMCOREINFO_OFFSET(pglist_data, node_spanned_pages); 1399 VMCOREINFO_OFFSET(pglist_data, node_id); 1400 VMCOREINFO_OFFSET(zone, free_area); 1401 VMCOREINFO_OFFSET(zone, vm_stat); 1402 VMCOREINFO_OFFSET(zone, spanned_pages); 1403 VMCOREINFO_OFFSET(free_area, free_list); 1404 VMCOREINFO_OFFSET(list_head, next); 1405 VMCOREINFO_OFFSET(list_head, prev); 1406 VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); 1407 VMCOREINFO_NUMBER(NR_FREE_PAGES); 1408 1409 arch_crash_save_vmcoreinfo(); 1410 1411 return 0; 1412 } 1413 1414 module_init(crash_save_vmcoreinfo_init) 1415