1 /* 2 * kexec.c - kexec system call 3 * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com> 4 * 5 * This source code is licensed under the GNU General Public License, 6 * Version 2. See the file COPYING for more details. 7 */ 8 9 #include <linux/capability.h> 10 #include <linux/mm.h> 11 #include <linux/file.h> 12 #include <linux/slab.h> 13 #include <linux/fs.h> 14 #include <linux/kexec.h> 15 #include <linux/spinlock.h> 16 #include <linux/list.h> 17 #include <linux/highmem.h> 18 #include <linux/syscalls.h> 19 #include <linux/reboot.h> 20 #include <linux/syscalls.h> 21 #include <linux/ioport.h> 22 #include <linux/hardirq.h> 23 #include <linux/elf.h> 24 #include <linux/elfcore.h> 25 26 #include <asm/page.h> 27 #include <asm/uaccess.h> 28 #include <asm/io.h> 29 #include <asm/system.h> 30 #include <asm/semaphore.h> 31 32 /* Per cpu memory for storing cpu states in case of system crash. */ 33 note_buf_t* crash_notes; 34 35 /* Location of the reserved area for the crash kernel */ 36 struct resource crashk_res = { 37 .name = "Crash kernel", 38 .start = 0, 39 .end = 0, 40 .flags = IORESOURCE_BUSY | IORESOURCE_MEM 41 }; 42 43 int kexec_should_crash(struct task_struct *p) 44 { 45 if (in_interrupt() || !p->pid || is_init(p) || panic_on_oops) 46 return 1; 47 return 0; 48 } 49 50 /* 51 * When kexec transitions to the new kernel there is a one-to-one 52 * mapping between physical and virtual addresses. On processors 53 * where you can disable the MMU this is trivial, and easy. For 54 * others it is still a simple predictable page table to setup. 55 * 56 * In that environment kexec copies the new kernel to its final 57 * resting place. This means I can only support memory whose 58 * physical address can fit in an unsigned long. In particular 59 * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled. 60 * If the assembly stub has more restrictive requirements 61 * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be 62 * defined more restrictively in <asm/kexec.h>. 63 * 64 * The code for the transition from the current kernel to the 65 * the new kernel is placed in the control_code_buffer, whose size 66 * is given by KEXEC_CONTROL_CODE_SIZE. In the best case only a single 67 * page of memory is necessary, but some architectures require more. 68 * Because this memory must be identity mapped in the transition from 69 * virtual to physical addresses it must live in the range 70 * 0 - TASK_SIZE, as only the user space mappings are arbitrarily 71 * modifiable. 72 * 73 * The assembly stub in the control code buffer is passed a linked list 74 * of descriptor pages detailing the source pages of the new kernel, 75 * and the destination addresses of those source pages. As this data 76 * structure is not used in the context of the current OS, it must 77 * be self-contained. 78 * 79 * The code has been made to work with highmem pages and will use a 80 * destination page in its final resting place (if it happens 81 * to allocate it). The end product of this is that most of the 82 * physical address space, and most of RAM can be used. 83 * 84 * Future directions include: 85 * - allocating a page table with the control code buffer identity 86 * mapped, to simplify machine_kexec and make kexec_on_panic more 87 * reliable. 88 */ 89 90 /* 91 * KIMAGE_NO_DEST is an impossible destination address..., for 92 * allocating pages whose destination address we do not care about. 93 */ 94 #define KIMAGE_NO_DEST (-1UL) 95 96 static int kimage_is_destination_range(struct kimage *image, 97 unsigned long start, unsigned long end); 98 static struct page *kimage_alloc_page(struct kimage *image, 99 gfp_t gfp_mask, 100 unsigned long dest); 101 102 static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, 103 unsigned long nr_segments, 104 struct kexec_segment __user *segments) 105 { 106 size_t segment_bytes; 107 struct kimage *image; 108 unsigned long i; 109 int result; 110 111 /* Allocate a controlling structure */ 112 result = -ENOMEM; 113 image = kzalloc(sizeof(*image), GFP_KERNEL); 114 if (!image) 115 goto out; 116 117 image->head = 0; 118 image->entry = &image->head; 119 image->last_entry = &image->head; 120 image->control_page = ~0; /* By default this does not apply */ 121 image->start = entry; 122 image->type = KEXEC_TYPE_DEFAULT; 123 124 /* Initialize the list of control pages */ 125 INIT_LIST_HEAD(&image->control_pages); 126 127 /* Initialize the list of destination pages */ 128 INIT_LIST_HEAD(&image->dest_pages); 129 130 /* Initialize the list of unuseable pages */ 131 INIT_LIST_HEAD(&image->unuseable_pages); 132 133 /* Read in the segments */ 134 image->nr_segments = nr_segments; 135 segment_bytes = nr_segments * sizeof(*segments); 136 result = copy_from_user(image->segment, segments, segment_bytes); 137 if (result) 138 goto out; 139 140 /* 141 * Verify we have good destination addresses. The caller is 142 * responsible for making certain we don't attempt to load 143 * the new image into invalid or reserved areas of RAM. This 144 * just verifies it is an address we can use. 145 * 146 * Since the kernel does everything in page size chunks ensure 147 * the destination addreses are page aligned. Too many 148 * special cases crop of when we don't do this. The most 149 * insidious is getting overlapping destination addresses 150 * simply because addresses are changed to page size 151 * granularity. 152 */ 153 result = -EADDRNOTAVAIL; 154 for (i = 0; i < nr_segments; i++) { 155 unsigned long mstart, mend; 156 157 mstart = image->segment[i].mem; 158 mend = mstart + image->segment[i].memsz; 159 if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK)) 160 goto out; 161 if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT) 162 goto out; 163 } 164 165 /* Verify our destination addresses do not overlap. 166 * If we alloed overlapping destination addresses 167 * through very weird things can happen with no 168 * easy explanation as one segment stops on another. 169 */ 170 result = -EINVAL; 171 for (i = 0; i < nr_segments; i++) { 172 unsigned long mstart, mend; 173 unsigned long j; 174 175 mstart = image->segment[i].mem; 176 mend = mstart + image->segment[i].memsz; 177 for (j = 0; j < i; j++) { 178 unsigned long pstart, pend; 179 pstart = image->segment[j].mem; 180 pend = pstart + image->segment[j].memsz; 181 /* Do the segments overlap ? */ 182 if ((mend > pstart) && (mstart < pend)) 183 goto out; 184 } 185 } 186 187 /* Ensure our buffer sizes are strictly less than 188 * our memory sizes. This should always be the case, 189 * and it is easier to check up front than to be surprised 190 * later on. 191 */ 192 result = -EINVAL; 193 for (i = 0; i < nr_segments; i++) { 194 if (image->segment[i].bufsz > image->segment[i].memsz) 195 goto out; 196 } 197 198 result = 0; 199 out: 200 if (result == 0) 201 *rimage = image; 202 else 203 kfree(image); 204 205 return result; 206 207 } 208 209 static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, 210 unsigned long nr_segments, 211 struct kexec_segment __user *segments) 212 { 213 int result; 214 struct kimage *image; 215 216 /* Allocate and initialize a controlling structure */ 217 image = NULL; 218 result = do_kimage_alloc(&image, entry, nr_segments, segments); 219 if (result) 220 goto out; 221 222 *rimage = image; 223 224 /* 225 * Find a location for the control code buffer, and add it 226 * the vector of segments so that it's pages will also be 227 * counted as destination pages. 228 */ 229 result = -ENOMEM; 230 image->control_code_page = kimage_alloc_control_pages(image, 231 get_order(KEXEC_CONTROL_CODE_SIZE)); 232 if (!image->control_code_page) { 233 printk(KERN_ERR "Could not allocate control_code_buffer\n"); 234 goto out; 235 } 236 237 result = 0; 238 out: 239 if (result == 0) 240 *rimage = image; 241 else 242 kfree(image); 243 244 return result; 245 } 246 247 static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry, 248 unsigned long nr_segments, 249 struct kexec_segment __user *segments) 250 { 251 int result; 252 struct kimage *image; 253 unsigned long i; 254 255 image = NULL; 256 /* Verify we have a valid entry point */ 257 if ((entry < crashk_res.start) || (entry > crashk_res.end)) { 258 result = -EADDRNOTAVAIL; 259 goto out; 260 } 261 262 /* Allocate and initialize a controlling structure */ 263 result = do_kimage_alloc(&image, entry, nr_segments, segments); 264 if (result) 265 goto out; 266 267 /* Enable the special crash kernel control page 268 * allocation policy. 269 */ 270 image->control_page = crashk_res.start; 271 image->type = KEXEC_TYPE_CRASH; 272 273 /* 274 * Verify we have good destination addresses. Normally 275 * the caller is responsible for making certain we don't 276 * attempt to load the new image into invalid or reserved 277 * areas of RAM. But crash kernels are preloaded into a 278 * reserved area of ram. We must ensure the addresses 279 * are in the reserved area otherwise preloading the 280 * kernel could corrupt things. 281 */ 282 result = -EADDRNOTAVAIL; 283 for (i = 0; i < nr_segments; i++) { 284 unsigned long mstart, mend; 285 286 mstart = image->segment[i].mem; 287 mend = mstart + image->segment[i].memsz - 1; 288 /* Ensure we are within the crash kernel limits */ 289 if ((mstart < crashk_res.start) || (mend > crashk_res.end)) 290 goto out; 291 } 292 293 /* 294 * Find a location for the control code buffer, and add 295 * the vector of segments so that it's pages will also be 296 * counted as destination pages. 297 */ 298 result = -ENOMEM; 299 image->control_code_page = kimage_alloc_control_pages(image, 300 get_order(KEXEC_CONTROL_CODE_SIZE)); 301 if (!image->control_code_page) { 302 printk(KERN_ERR "Could not allocate control_code_buffer\n"); 303 goto out; 304 } 305 306 result = 0; 307 out: 308 if (result == 0) 309 *rimage = image; 310 else 311 kfree(image); 312 313 return result; 314 } 315 316 static int kimage_is_destination_range(struct kimage *image, 317 unsigned long start, 318 unsigned long end) 319 { 320 unsigned long i; 321 322 for (i = 0; i < image->nr_segments; i++) { 323 unsigned long mstart, mend; 324 325 mstart = image->segment[i].mem; 326 mend = mstart + image->segment[i].memsz; 327 if ((end > mstart) && (start < mend)) 328 return 1; 329 } 330 331 return 0; 332 } 333 334 static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order) 335 { 336 struct page *pages; 337 338 pages = alloc_pages(gfp_mask, order); 339 if (pages) { 340 unsigned int count, i; 341 pages->mapping = NULL; 342 set_page_private(pages, order); 343 count = 1 << order; 344 for (i = 0; i < count; i++) 345 SetPageReserved(pages + i); 346 } 347 348 return pages; 349 } 350 351 static void kimage_free_pages(struct page *page) 352 { 353 unsigned int order, count, i; 354 355 order = page_private(page); 356 count = 1 << order; 357 for (i = 0; i < count; i++) 358 ClearPageReserved(page + i); 359 __free_pages(page, order); 360 } 361 362 static void kimage_free_page_list(struct list_head *list) 363 { 364 struct list_head *pos, *next; 365 366 list_for_each_safe(pos, next, list) { 367 struct page *page; 368 369 page = list_entry(pos, struct page, lru); 370 list_del(&page->lru); 371 kimage_free_pages(page); 372 } 373 } 374 375 static struct page *kimage_alloc_normal_control_pages(struct kimage *image, 376 unsigned int order) 377 { 378 /* Control pages are special, they are the intermediaries 379 * that are needed while we copy the rest of the pages 380 * to their final resting place. As such they must 381 * not conflict with either the destination addresses 382 * or memory the kernel is already using. 383 * 384 * The only case where we really need more than one of 385 * these are for architectures where we cannot disable 386 * the MMU and must instead generate an identity mapped 387 * page table for all of the memory. 388 * 389 * At worst this runs in O(N) of the image size. 390 */ 391 struct list_head extra_pages; 392 struct page *pages; 393 unsigned int count; 394 395 count = 1 << order; 396 INIT_LIST_HEAD(&extra_pages); 397 398 /* Loop while I can allocate a page and the page allocated 399 * is a destination page. 400 */ 401 do { 402 unsigned long pfn, epfn, addr, eaddr; 403 404 pages = kimage_alloc_pages(GFP_KERNEL, order); 405 if (!pages) 406 break; 407 pfn = page_to_pfn(pages); 408 epfn = pfn + count; 409 addr = pfn << PAGE_SHIFT; 410 eaddr = epfn << PAGE_SHIFT; 411 if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) || 412 kimage_is_destination_range(image, addr, eaddr)) { 413 list_add(&pages->lru, &extra_pages); 414 pages = NULL; 415 } 416 } while (!pages); 417 418 if (pages) { 419 /* Remember the allocated page... */ 420 list_add(&pages->lru, &image->control_pages); 421 422 /* Because the page is already in it's destination 423 * location we will never allocate another page at 424 * that address. Therefore kimage_alloc_pages 425 * will not return it (again) and we don't need 426 * to give it an entry in image->segment[]. 427 */ 428 } 429 /* Deal with the destination pages I have inadvertently allocated. 430 * 431 * Ideally I would convert multi-page allocations into single 432 * page allocations, and add everyting to image->dest_pages. 433 * 434 * For now it is simpler to just free the pages. 435 */ 436 kimage_free_page_list(&extra_pages); 437 438 return pages; 439 } 440 441 static struct page *kimage_alloc_crash_control_pages(struct kimage *image, 442 unsigned int order) 443 { 444 /* Control pages are special, they are the intermediaries 445 * that are needed while we copy the rest of the pages 446 * to their final resting place. As such they must 447 * not conflict with either the destination addresses 448 * or memory the kernel is already using. 449 * 450 * Control pages are also the only pags we must allocate 451 * when loading a crash kernel. All of the other pages 452 * are specified by the segments and we just memcpy 453 * into them directly. 454 * 455 * The only case where we really need more than one of 456 * these are for architectures where we cannot disable 457 * the MMU and must instead generate an identity mapped 458 * page table for all of the memory. 459 * 460 * Given the low demand this implements a very simple 461 * allocator that finds the first hole of the appropriate 462 * size in the reserved memory region, and allocates all 463 * of the memory up to and including the hole. 464 */ 465 unsigned long hole_start, hole_end, size; 466 struct page *pages; 467 468 pages = NULL; 469 size = (1 << order) << PAGE_SHIFT; 470 hole_start = (image->control_page + (size - 1)) & ~(size - 1); 471 hole_end = hole_start + size - 1; 472 while (hole_end <= crashk_res.end) { 473 unsigned long i; 474 475 if (hole_end > KEXEC_CONTROL_MEMORY_LIMIT) 476 break; 477 if (hole_end > crashk_res.end) 478 break; 479 /* See if I overlap any of the segments */ 480 for (i = 0; i < image->nr_segments; i++) { 481 unsigned long mstart, mend; 482 483 mstart = image->segment[i].mem; 484 mend = mstart + image->segment[i].memsz - 1; 485 if ((hole_end >= mstart) && (hole_start <= mend)) { 486 /* Advance the hole to the end of the segment */ 487 hole_start = (mend + (size - 1)) & ~(size - 1); 488 hole_end = hole_start + size - 1; 489 break; 490 } 491 } 492 /* If I don't overlap any segments I have found my hole! */ 493 if (i == image->nr_segments) { 494 pages = pfn_to_page(hole_start >> PAGE_SHIFT); 495 break; 496 } 497 } 498 if (pages) 499 image->control_page = hole_end; 500 501 return pages; 502 } 503 504 505 struct page *kimage_alloc_control_pages(struct kimage *image, 506 unsigned int order) 507 { 508 struct page *pages = NULL; 509 510 switch (image->type) { 511 case KEXEC_TYPE_DEFAULT: 512 pages = kimage_alloc_normal_control_pages(image, order); 513 break; 514 case KEXEC_TYPE_CRASH: 515 pages = kimage_alloc_crash_control_pages(image, order); 516 break; 517 } 518 519 return pages; 520 } 521 522 static int kimage_add_entry(struct kimage *image, kimage_entry_t entry) 523 { 524 if (*image->entry != 0) 525 image->entry++; 526 527 if (image->entry == image->last_entry) { 528 kimage_entry_t *ind_page; 529 struct page *page; 530 531 page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST); 532 if (!page) 533 return -ENOMEM; 534 535 ind_page = page_address(page); 536 *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION; 537 image->entry = ind_page; 538 image->last_entry = ind_page + 539 ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1); 540 } 541 *image->entry = entry; 542 image->entry++; 543 *image->entry = 0; 544 545 return 0; 546 } 547 548 static int kimage_set_destination(struct kimage *image, 549 unsigned long destination) 550 { 551 int result; 552 553 destination &= PAGE_MASK; 554 result = kimage_add_entry(image, destination | IND_DESTINATION); 555 if (result == 0) 556 image->destination = destination; 557 558 return result; 559 } 560 561 562 static int kimage_add_page(struct kimage *image, unsigned long page) 563 { 564 int result; 565 566 page &= PAGE_MASK; 567 result = kimage_add_entry(image, page | IND_SOURCE); 568 if (result == 0) 569 image->destination += PAGE_SIZE; 570 571 return result; 572 } 573 574 575 static void kimage_free_extra_pages(struct kimage *image) 576 { 577 /* Walk through and free any extra destination pages I may have */ 578 kimage_free_page_list(&image->dest_pages); 579 580 /* Walk through and free any unuseable pages I have cached */ 581 kimage_free_page_list(&image->unuseable_pages); 582 583 } 584 static int kimage_terminate(struct kimage *image) 585 { 586 if (*image->entry != 0) 587 image->entry++; 588 589 *image->entry = IND_DONE; 590 591 return 0; 592 } 593 594 #define for_each_kimage_entry(image, ptr, entry) \ 595 for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \ 596 ptr = (entry & IND_INDIRECTION)? \ 597 phys_to_virt((entry & PAGE_MASK)): ptr +1) 598 599 static void kimage_free_entry(kimage_entry_t entry) 600 { 601 struct page *page; 602 603 page = pfn_to_page(entry >> PAGE_SHIFT); 604 kimage_free_pages(page); 605 } 606 607 static void kimage_free(struct kimage *image) 608 { 609 kimage_entry_t *ptr, entry; 610 kimage_entry_t ind = 0; 611 612 if (!image) 613 return; 614 615 kimage_free_extra_pages(image); 616 for_each_kimage_entry(image, ptr, entry) { 617 if (entry & IND_INDIRECTION) { 618 /* Free the previous indirection page */ 619 if (ind & IND_INDIRECTION) 620 kimage_free_entry(ind); 621 /* Save this indirection page until we are 622 * done with it. 623 */ 624 ind = entry; 625 } 626 else if (entry & IND_SOURCE) 627 kimage_free_entry(entry); 628 } 629 /* Free the final indirection page */ 630 if (ind & IND_INDIRECTION) 631 kimage_free_entry(ind); 632 633 /* Handle any machine specific cleanup */ 634 machine_kexec_cleanup(image); 635 636 /* Free the kexec control pages... */ 637 kimage_free_page_list(&image->control_pages); 638 kfree(image); 639 } 640 641 static kimage_entry_t *kimage_dst_used(struct kimage *image, 642 unsigned long page) 643 { 644 kimage_entry_t *ptr, entry; 645 unsigned long destination = 0; 646 647 for_each_kimage_entry(image, ptr, entry) { 648 if (entry & IND_DESTINATION) 649 destination = entry & PAGE_MASK; 650 else if (entry & IND_SOURCE) { 651 if (page == destination) 652 return ptr; 653 destination += PAGE_SIZE; 654 } 655 } 656 657 return NULL; 658 } 659 660 static struct page *kimage_alloc_page(struct kimage *image, 661 gfp_t gfp_mask, 662 unsigned long destination) 663 { 664 /* 665 * Here we implement safeguards to ensure that a source page 666 * is not copied to its destination page before the data on 667 * the destination page is no longer useful. 668 * 669 * To do this we maintain the invariant that a source page is 670 * either its own destination page, or it is not a 671 * destination page at all. 672 * 673 * That is slightly stronger than required, but the proof 674 * that no problems will not occur is trivial, and the 675 * implementation is simply to verify. 676 * 677 * When allocating all pages normally this algorithm will run 678 * in O(N) time, but in the worst case it will run in O(N^2) 679 * time. If the runtime is a problem the data structures can 680 * be fixed. 681 */ 682 struct page *page; 683 unsigned long addr; 684 685 /* 686 * Walk through the list of destination pages, and see if I 687 * have a match. 688 */ 689 list_for_each_entry(page, &image->dest_pages, lru) { 690 addr = page_to_pfn(page) << PAGE_SHIFT; 691 if (addr == destination) { 692 list_del(&page->lru); 693 return page; 694 } 695 } 696 page = NULL; 697 while (1) { 698 kimage_entry_t *old; 699 700 /* Allocate a page, if we run out of memory give up */ 701 page = kimage_alloc_pages(gfp_mask, 0); 702 if (!page) 703 return NULL; 704 /* If the page cannot be used file it away */ 705 if (page_to_pfn(page) > 706 (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) { 707 list_add(&page->lru, &image->unuseable_pages); 708 continue; 709 } 710 addr = page_to_pfn(page) << PAGE_SHIFT; 711 712 /* If it is the destination page we want use it */ 713 if (addr == destination) 714 break; 715 716 /* If the page is not a destination page use it */ 717 if (!kimage_is_destination_range(image, addr, 718 addr + PAGE_SIZE)) 719 break; 720 721 /* 722 * I know that the page is someones destination page. 723 * See if there is already a source page for this 724 * destination page. And if so swap the source pages. 725 */ 726 old = kimage_dst_used(image, addr); 727 if (old) { 728 /* If so move it */ 729 unsigned long old_addr; 730 struct page *old_page; 731 732 old_addr = *old & PAGE_MASK; 733 old_page = pfn_to_page(old_addr >> PAGE_SHIFT); 734 copy_highpage(page, old_page); 735 *old = addr | (*old & ~PAGE_MASK); 736 737 /* The old page I have found cannot be a 738 * destination page, so return it. 739 */ 740 addr = old_addr; 741 page = old_page; 742 break; 743 } 744 else { 745 /* Place the page on the destination list I 746 * will use it later. 747 */ 748 list_add(&page->lru, &image->dest_pages); 749 } 750 } 751 752 return page; 753 } 754 755 static int kimage_load_normal_segment(struct kimage *image, 756 struct kexec_segment *segment) 757 { 758 unsigned long maddr; 759 unsigned long ubytes, mbytes; 760 int result; 761 unsigned char __user *buf; 762 763 result = 0; 764 buf = segment->buf; 765 ubytes = segment->bufsz; 766 mbytes = segment->memsz; 767 maddr = segment->mem; 768 769 result = kimage_set_destination(image, maddr); 770 if (result < 0) 771 goto out; 772 773 while (mbytes) { 774 struct page *page; 775 char *ptr; 776 size_t uchunk, mchunk; 777 778 page = kimage_alloc_page(image, GFP_HIGHUSER, maddr); 779 if (page == 0) { 780 result = -ENOMEM; 781 goto out; 782 } 783 result = kimage_add_page(image, page_to_pfn(page) 784 << PAGE_SHIFT); 785 if (result < 0) 786 goto out; 787 788 ptr = kmap(page); 789 /* Start with a clear page */ 790 memset(ptr, 0, PAGE_SIZE); 791 ptr += maddr & ~PAGE_MASK; 792 mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK); 793 if (mchunk > mbytes) 794 mchunk = mbytes; 795 796 uchunk = mchunk; 797 if (uchunk > ubytes) 798 uchunk = ubytes; 799 800 result = copy_from_user(ptr, buf, uchunk); 801 kunmap(page); 802 if (result) { 803 result = (result < 0) ? result : -EIO; 804 goto out; 805 } 806 ubytes -= uchunk; 807 maddr += mchunk; 808 buf += mchunk; 809 mbytes -= mchunk; 810 } 811 out: 812 return result; 813 } 814 815 static int kimage_load_crash_segment(struct kimage *image, 816 struct kexec_segment *segment) 817 { 818 /* For crash dumps kernels we simply copy the data from 819 * user space to it's destination. 820 * We do things a page at a time for the sake of kmap. 821 */ 822 unsigned long maddr; 823 unsigned long ubytes, mbytes; 824 int result; 825 unsigned char __user *buf; 826 827 result = 0; 828 buf = segment->buf; 829 ubytes = segment->bufsz; 830 mbytes = segment->memsz; 831 maddr = segment->mem; 832 while (mbytes) { 833 struct page *page; 834 char *ptr; 835 size_t uchunk, mchunk; 836 837 page = pfn_to_page(maddr >> PAGE_SHIFT); 838 if (page == 0) { 839 result = -ENOMEM; 840 goto out; 841 } 842 ptr = kmap(page); 843 ptr += maddr & ~PAGE_MASK; 844 mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK); 845 if (mchunk > mbytes) 846 mchunk = mbytes; 847 848 uchunk = mchunk; 849 if (uchunk > ubytes) { 850 uchunk = ubytes; 851 /* Zero the trailing part of the page */ 852 memset(ptr + uchunk, 0, mchunk - uchunk); 853 } 854 result = copy_from_user(ptr, buf, uchunk); 855 kexec_flush_icache_page(page); 856 kunmap(page); 857 if (result) { 858 result = (result < 0) ? result : -EIO; 859 goto out; 860 } 861 ubytes -= uchunk; 862 maddr += mchunk; 863 buf += mchunk; 864 mbytes -= mchunk; 865 } 866 out: 867 return result; 868 } 869 870 static int kimage_load_segment(struct kimage *image, 871 struct kexec_segment *segment) 872 { 873 int result = -ENOMEM; 874 875 switch (image->type) { 876 case KEXEC_TYPE_DEFAULT: 877 result = kimage_load_normal_segment(image, segment); 878 break; 879 case KEXEC_TYPE_CRASH: 880 result = kimage_load_crash_segment(image, segment); 881 break; 882 } 883 884 return result; 885 } 886 887 /* 888 * Exec Kernel system call: for obvious reasons only root may call it. 889 * 890 * This call breaks up into three pieces. 891 * - A generic part which loads the new kernel from the current 892 * address space, and very carefully places the data in the 893 * allocated pages. 894 * 895 * - A generic part that interacts with the kernel and tells all of 896 * the devices to shut down. Preventing on-going dmas, and placing 897 * the devices in a consistent state so a later kernel can 898 * reinitialize them. 899 * 900 * - A machine specific part that includes the syscall number 901 * and the copies the image to it's final destination. And 902 * jumps into the image at entry. 903 * 904 * kexec does not sync, or unmount filesystems so if you need 905 * that to happen you need to do that yourself. 906 */ 907 struct kimage *kexec_image; 908 struct kimage *kexec_crash_image; 909 /* 910 * A home grown binary mutex. 911 * Nothing can wait so this mutex is safe to use 912 * in interrupt context :) 913 */ 914 static int kexec_lock; 915 916 asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, 917 struct kexec_segment __user *segments, 918 unsigned long flags) 919 { 920 struct kimage **dest_image, *image; 921 int locked; 922 int result; 923 924 /* We only trust the superuser with rebooting the system. */ 925 if (!capable(CAP_SYS_BOOT)) 926 return -EPERM; 927 928 /* 929 * Verify we have a legal set of flags 930 * This leaves us room for future extensions. 931 */ 932 if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK)) 933 return -EINVAL; 934 935 /* Verify we are on the appropriate architecture */ 936 if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) && 937 ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT)) 938 return -EINVAL; 939 940 /* Put an artificial cap on the number 941 * of segments passed to kexec_load. 942 */ 943 if (nr_segments > KEXEC_SEGMENT_MAX) 944 return -EINVAL; 945 946 image = NULL; 947 result = 0; 948 949 /* Because we write directly to the reserved memory 950 * region when loading crash kernels we need a mutex here to 951 * prevent multiple crash kernels from attempting to load 952 * simultaneously, and to prevent a crash kernel from loading 953 * over the top of a in use crash kernel. 954 * 955 * KISS: always take the mutex. 956 */ 957 locked = xchg(&kexec_lock, 1); 958 if (locked) 959 return -EBUSY; 960 961 dest_image = &kexec_image; 962 if (flags & KEXEC_ON_CRASH) 963 dest_image = &kexec_crash_image; 964 if (nr_segments > 0) { 965 unsigned long i; 966 967 /* Loading another kernel to reboot into */ 968 if ((flags & KEXEC_ON_CRASH) == 0) 969 result = kimage_normal_alloc(&image, entry, 970 nr_segments, segments); 971 /* Loading another kernel to switch to if this one crashes */ 972 else if (flags & KEXEC_ON_CRASH) { 973 /* Free any current crash dump kernel before 974 * we corrupt it. 975 */ 976 kimage_free(xchg(&kexec_crash_image, NULL)); 977 result = kimage_crash_alloc(&image, entry, 978 nr_segments, segments); 979 } 980 if (result) 981 goto out; 982 983 result = machine_kexec_prepare(image); 984 if (result) 985 goto out; 986 987 for (i = 0; i < nr_segments; i++) { 988 result = kimage_load_segment(image, &image->segment[i]); 989 if (result) 990 goto out; 991 } 992 result = kimage_terminate(image); 993 if (result) 994 goto out; 995 } 996 /* Install the new kernel, and Uninstall the old */ 997 image = xchg(dest_image, image); 998 999 out: 1000 locked = xchg(&kexec_lock, 0); /* Release the mutex */ 1001 BUG_ON(!locked); 1002 kimage_free(image); 1003 1004 return result; 1005 } 1006 1007 #ifdef CONFIG_COMPAT 1008 asmlinkage long compat_sys_kexec_load(unsigned long entry, 1009 unsigned long nr_segments, 1010 struct compat_kexec_segment __user *segments, 1011 unsigned long flags) 1012 { 1013 struct compat_kexec_segment in; 1014 struct kexec_segment out, __user *ksegments; 1015 unsigned long i, result; 1016 1017 /* Don't allow clients that don't understand the native 1018 * architecture to do anything. 1019 */ 1020 if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT) 1021 return -EINVAL; 1022 1023 if (nr_segments > KEXEC_SEGMENT_MAX) 1024 return -EINVAL; 1025 1026 ksegments = compat_alloc_user_space(nr_segments * sizeof(out)); 1027 for (i=0; i < nr_segments; i++) { 1028 result = copy_from_user(&in, &segments[i], sizeof(in)); 1029 if (result) 1030 return -EFAULT; 1031 1032 out.buf = compat_ptr(in.buf); 1033 out.bufsz = in.bufsz; 1034 out.mem = in.mem; 1035 out.memsz = in.memsz; 1036 1037 result = copy_to_user(&ksegments[i], &out, sizeof(out)); 1038 if (result) 1039 return -EFAULT; 1040 } 1041 1042 return sys_kexec_load(entry, nr_segments, ksegments, flags); 1043 } 1044 #endif 1045 1046 void crash_kexec(struct pt_regs *regs) 1047 { 1048 int locked; 1049 1050 1051 /* Take the kexec_lock here to prevent sys_kexec_load 1052 * running on one cpu from replacing the crash kernel 1053 * we are using after a panic on a different cpu. 1054 * 1055 * If the crash kernel was not located in a fixed area 1056 * of memory the xchg(&kexec_crash_image) would be 1057 * sufficient. But since I reuse the memory... 1058 */ 1059 locked = xchg(&kexec_lock, 1); 1060 if (!locked) { 1061 if (kexec_crash_image) { 1062 struct pt_regs fixed_regs; 1063 crash_setup_regs(&fixed_regs, regs); 1064 machine_crash_shutdown(&fixed_regs); 1065 machine_kexec(kexec_crash_image); 1066 } 1067 locked = xchg(&kexec_lock, 0); 1068 BUG_ON(!locked); 1069 } 1070 } 1071 1072 static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data, 1073 size_t data_len) 1074 { 1075 struct elf_note note; 1076 1077 note.n_namesz = strlen(name) + 1; 1078 note.n_descsz = data_len; 1079 note.n_type = type; 1080 memcpy(buf, ¬e, sizeof(note)); 1081 buf += (sizeof(note) + 3)/4; 1082 memcpy(buf, name, note.n_namesz); 1083 buf += (note.n_namesz + 3)/4; 1084 memcpy(buf, data, note.n_descsz); 1085 buf += (note.n_descsz + 3)/4; 1086 1087 return buf; 1088 } 1089 1090 static void final_note(u32 *buf) 1091 { 1092 struct elf_note note; 1093 1094 note.n_namesz = 0; 1095 note.n_descsz = 0; 1096 note.n_type = 0; 1097 memcpy(buf, ¬e, sizeof(note)); 1098 } 1099 1100 void crash_save_cpu(struct pt_regs *regs, int cpu) 1101 { 1102 struct elf_prstatus prstatus; 1103 u32 *buf; 1104 1105 if ((cpu < 0) || (cpu >= NR_CPUS)) 1106 return; 1107 1108 /* Using ELF notes here is opportunistic. 1109 * I need a well defined structure format 1110 * for the data I pass, and I need tags 1111 * on the data to indicate what information I have 1112 * squirrelled away. ELF notes happen to provide 1113 * all of that, so there is no need to invent something new. 1114 */ 1115 buf = (u32*)per_cpu_ptr(crash_notes, cpu); 1116 if (!buf) 1117 return; 1118 memset(&prstatus, 0, sizeof(prstatus)); 1119 prstatus.pr_pid = current->pid; 1120 elf_core_copy_regs(&prstatus.pr_reg, regs); 1121 buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, 1122 &prstatus, sizeof(prstatus)); 1123 final_note(buf); 1124 } 1125 1126 static int __init crash_notes_memory_init(void) 1127 { 1128 /* Allocate memory for saving cpu registers. */ 1129 crash_notes = alloc_percpu(note_buf_t); 1130 if (!crash_notes) { 1131 printk("Kexec: Memory allocation for saving cpu register" 1132 " states failed\n"); 1133 return -ENOMEM; 1134 } 1135 return 0; 1136 } 1137 module_init(crash_notes_memory_init) 1138