1 /* 2 * kexec.c - kexec system call 3 * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com> 4 * 5 * This source code is licensed under the GNU General Public License, 6 * Version 2. See the file COPYING for more details. 7 */ 8 9 #include <linux/mm.h> 10 #include <linux/file.h> 11 #include <linux/slab.h> 12 #include <linux/fs.h> 13 #include <linux/kexec.h> 14 #include <linux/spinlock.h> 15 #include <linux/list.h> 16 #include <linux/highmem.h> 17 #include <linux/syscalls.h> 18 #include <linux/reboot.h> 19 #include <linux/syscalls.h> 20 #include <linux/ioport.h> 21 #include <linux/hardirq.h> 22 23 #include <asm/page.h> 24 #include <asm/uaccess.h> 25 #include <asm/io.h> 26 #include <asm/system.h> 27 #include <asm/semaphore.h> 28 29 /* Location of the reserved area for the crash kernel */ 30 struct resource crashk_res = { 31 .name = "Crash kernel", 32 .start = 0, 33 .end = 0, 34 .flags = IORESOURCE_BUSY | IORESOURCE_MEM 35 }; 36 37 int kexec_should_crash(struct task_struct *p) 38 { 39 if (in_interrupt() || !p->pid || p->pid == 1 || panic_on_oops) 40 return 1; 41 return 0; 42 } 43 44 /* 45 * When kexec transitions to the new kernel there is a one-to-one 46 * mapping between physical and virtual addresses. On processors 47 * where you can disable the MMU this is trivial, and easy. For 48 * others it is still a simple predictable page table to setup. 49 * 50 * In that environment kexec copies the new kernel to its final 51 * resting place. This means I can only support memory whose 52 * physical address can fit in an unsigned long. In particular 53 * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled. 54 * If the assembly stub has more restrictive requirements 55 * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be 56 * defined more restrictively in <asm/kexec.h>. 57 * 58 * The code for the transition from the current kernel to the 59 * the new kernel is placed in the control_code_buffer, whose size 60 * is given by KEXEC_CONTROL_CODE_SIZE. In the best case only a single 61 * page of memory is necessary, but some architectures require more. 62 * Because this memory must be identity mapped in the transition from 63 * virtual to physical addresses it must live in the range 64 * 0 - TASK_SIZE, as only the user space mappings are arbitrarily 65 * modifiable. 66 * 67 * The assembly stub in the control code buffer is passed a linked list 68 * of descriptor pages detailing the source pages of the new kernel, 69 * and the destination addresses of those source pages. As this data 70 * structure is not used in the context of the current OS, it must 71 * be self-contained. 72 * 73 * The code has been made to work with highmem pages and will use a 74 * destination page in its final resting place (if it happens 75 * to allocate it). The end product of this is that most of the 76 * physical address space, and most of RAM can be used. 77 * 78 * Future directions include: 79 * - allocating a page table with the control code buffer identity 80 * mapped, to simplify machine_kexec and make kexec_on_panic more 81 * reliable. 82 */ 83 84 /* 85 * KIMAGE_NO_DEST is an impossible destination address..., for 86 * allocating pages whose destination address we do not care about. 87 */ 88 #define KIMAGE_NO_DEST (-1UL) 89 90 static int kimage_is_destination_range(struct kimage *image, 91 unsigned long start, unsigned long end); 92 static struct page *kimage_alloc_page(struct kimage *image, 93 gfp_t gfp_mask, 94 unsigned long dest); 95 96 static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, 97 unsigned long nr_segments, 98 struct kexec_segment __user *segments) 99 { 100 size_t segment_bytes; 101 struct kimage *image; 102 unsigned long i; 103 int result; 104 105 /* Allocate a controlling structure */ 106 result = -ENOMEM; 107 image = kmalloc(sizeof(*image), GFP_KERNEL); 108 if (!image) 109 goto out; 110 111 memset(image, 0, sizeof(*image)); 112 image->head = 0; 113 image->entry = &image->head; 114 image->last_entry = &image->head; 115 image->control_page = ~0; /* By default this does not apply */ 116 image->start = entry; 117 image->type = KEXEC_TYPE_DEFAULT; 118 119 /* Initialize the list of control pages */ 120 INIT_LIST_HEAD(&image->control_pages); 121 122 /* Initialize the list of destination pages */ 123 INIT_LIST_HEAD(&image->dest_pages); 124 125 /* Initialize the list of unuseable pages */ 126 INIT_LIST_HEAD(&image->unuseable_pages); 127 128 /* Read in the segments */ 129 image->nr_segments = nr_segments; 130 segment_bytes = nr_segments * sizeof(*segments); 131 result = copy_from_user(image->segment, segments, segment_bytes); 132 if (result) 133 goto out; 134 135 /* 136 * Verify we have good destination addresses. The caller is 137 * responsible for making certain we don't attempt to load 138 * the new image into invalid or reserved areas of RAM. This 139 * just verifies it is an address we can use. 140 * 141 * Since the kernel does everything in page size chunks ensure 142 * the destination addreses are page aligned. Too many 143 * special cases crop of when we don't do this. The most 144 * insidious is getting overlapping destination addresses 145 * simply because addresses are changed to page size 146 * granularity. 147 */ 148 result = -EADDRNOTAVAIL; 149 for (i = 0; i < nr_segments; i++) { 150 unsigned long mstart, mend; 151 152 mstart = image->segment[i].mem; 153 mend = mstart + image->segment[i].memsz; 154 if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK)) 155 goto out; 156 if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT) 157 goto out; 158 } 159 160 /* Verify our destination addresses do not overlap. 161 * If we alloed overlapping destination addresses 162 * through very weird things can happen with no 163 * easy explanation as one segment stops on another. 164 */ 165 result = -EINVAL; 166 for (i = 0; i < nr_segments; i++) { 167 unsigned long mstart, mend; 168 unsigned long j; 169 170 mstart = image->segment[i].mem; 171 mend = mstart + image->segment[i].memsz; 172 for (j = 0; j < i; j++) { 173 unsigned long pstart, pend; 174 pstart = image->segment[j].mem; 175 pend = pstart + image->segment[j].memsz; 176 /* Do the segments overlap ? */ 177 if ((mend > pstart) && (mstart < pend)) 178 goto out; 179 } 180 } 181 182 /* Ensure our buffer sizes are strictly less than 183 * our memory sizes. This should always be the case, 184 * and it is easier to check up front than to be surprised 185 * later on. 186 */ 187 result = -EINVAL; 188 for (i = 0; i < nr_segments; i++) { 189 if (image->segment[i].bufsz > image->segment[i].memsz) 190 goto out; 191 } 192 193 result = 0; 194 out: 195 if (result == 0) 196 *rimage = image; 197 else 198 kfree(image); 199 200 return result; 201 202 } 203 204 static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, 205 unsigned long nr_segments, 206 struct kexec_segment __user *segments) 207 { 208 int result; 209 struct kimage *image; 210 211 /* Allocate and initialize a controlling structure */ 212 image = NULL; 213 result = do_kimage_alloc(&image, entry, nr_segments, segments); 214 if (result) 215 goto out; 216 217 *rimage = image; 218 219 /* 220 * Find a location for the control code buffer, and add it 221 * the vector of segments so that it's pages will also be 222 * counted as destination pages. 223 */ 224 result = -ENOMEM; 225 image->control_code_page = kimage_alloc_control_pages(image, 226 get_order(KEXEC_CONTROL_CODE_SIZE)); 227 if (!image->control_code_page) { 228 printk(KERN_ERR "Could not allocate control_code_buffer\n"); 229 goto out; 230 } 231 232 result = 0; 233 out: 234 if (result == 0) 235 *rimage = image; 236 else 237 kfree(image); 238 239 return result; 240 } 241 242 static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry, 243 unsigned long nr_segments, 244 struct kexec_segment __user *segments) 245 { 246 int result; 247 struct kimage *image; 248 unsigned long i; 249 250 image = NULL; 251 /* Verify we have a valid entry point */ 252 if ((entry < crashk_res.start) || (entry > crashk_res.end)) { 253 result = -EADDRNOTAVAIL; 254 goto out; 255 } 256 257 /* Allocate and initialize a controlling structure */ 258 result = do_kimage_alloc(&image, entry, nr_segments, segments); 259 if (result) 260 goto out; 261 262 /* Enable the special crash kernel control page 263 * allocation policy. 264 */ 265 image->control_page = crashk_res.start; 266 image->type = KEXEC_TYPE_CRASH; 267 268 /* 269 * Verify we have good destination addresses. Normally 270 * the caller is responsible for making certain we don't 271 * attempt to load the new image into invalid or reserved 272 * areas of RAM. But crash kernels are preloaded into a 273 * reserved area of ram. We must ensure the addresses 274 * are in the reserved area otherwise preloading the 275 * kernel could corrupt things. 276 */ 277 result = -EADDRNOTAVAIL; 278 for (i = 0; i < nr_segments; i++) { 279 unsigned long mstart, mend; 280 281 mstart = image->segment[i].mem; 282 mend = mstart + image->segment[i].memsz - 1; 283 /* Ensure we are within the crash kernel limits */ 284 if ((mstart < crashk_res.start) || (mend > crashk_res.end)) 285 goto out; 286 } 287 288 /* 289 * Find a location for the control code buffer, and add 290 * the vector of segments so that it's pages will also be 291 * counted as destination pages. 292 */ 293 result = -ENOMEM; 294 image->control_code_page = kimage_alloc_control_pages(image, 295 get_order(KEXEC_CONTROL_CODE_SIZE)); 296 if (!image->control_code_page) { 297 printk(KERN_ERR "Could not allocate control_code_buffer\n"); 298 goto out; 299 } 300 301 result = 0; 302 out: 303 if (result == 0) 304 *rimage = image; 305 else 306 kfree(image); 307 308 return result; 309 } 310 311 static int kimage_is_destination_range(struct kimage *image, 312 unsigned long start, 313 unsigned long end) 314 { 315 unsigned long i; 316 317 for (i = 0; i < image->nr_segments; i++) { 318 unsigned long mstart, mend; 319 320 mstart = image->segment[i].mem; 321 mend = mstart + image->segment[i].memsz; 322 if ((end > mstart) && (start < mend)) 323 return 1; 324 } 325 326 return 0; 327 } 328 329 static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order) 330 { 331 struct page *pages; 332 333 pages = alloc_pages(gfp_mask, order); 334 if (pages) { 335 unsigned int count, i; 336 pages->mapping = NULL; 337 set_page_private(pages, order); 338 count = 1 << order; 339 for (i = 0; i < count; i++) 340 SetPageReserved(pages + i); 341 } 342 343 return pages; 344 } 345 346 static void kimage_free_pages(struct page *page) 347 { 348 unsigned int order, count, i; 349 350 order = page_private(page); 351 count = 1 << order; 352 for (i = 0; i < count; i++) 353 ClearPageReserved(page + i); 354 __free_pages(page, order); 355 } 356 357 static void kimage_free_page_list(struct list_head *list) 358 { 359 struct list_head *pos, *next; 360 361 list_for_each_safe(pos, next, list) { 362 struct page *page; 363 364 page = list_entry(pos, struct page, lru); 365 list_del(&page->lru); 366 kimage_free_pages(page); 367 } 368 } 369 370 static struct page *kimage_alloc_normal_control_pages(struct kimage *image, 371 unsigned int order) 372 { 373 /* Control pages are special, they are the intermediaries 374 * that are needed while we copy the rest of the pages 375 * to their final resting place. As such they must 376 * not conflict with either the destination addresses 377 * or memory the kernel is already using. 378 * 379 * The only case where we really need more than one of 380 * these are for architectures where we cannot disable 381 * the MMU and must instead generate an identity mapped 382 * page table for all of the memory. 383 * 384 * At worst this runs in O(N) of the image size. 385 */ 386 struct list_head extra_pages; 387 struct page *pages; 388 unsigned int count; 389 390 count = 1 << order; 391 INIT_LIST_HEAD(&extra_pages); 392 393 /* Loop while I can allocate a page and the page allocated 394 * is a destination page. 395 */ 396 do { 397 unsigned long pfn, epfn, addr, eaddr; 398 399 pages = kimage_alloc_pages(GFP_KERNEL, order); 400 if (!pages) 401 break; 402 pfn = page_to_pfn(pages); 403 epfn = pfn + count; 404 addr = pfn << PAGE_SHIFT; 405 eaddr = epfn << PAGE_SHIFT; 406 if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) || 407 kimage_is_destination_range(image, addr, eaddr)) { 408 list_add(&pages->lru, &extra_pages); 409 pages = NULL; 410 } 411 } while (!pages); 412 413 if (pages) { 414 /* Remember the allocated page... */ 415 list_add(&pages->lru, &image->control_pages); 416 417 /* Because the page is already in it's destination 418 * location we will never allocate another page at 419 * that address. Therefore kimage_alloc_pages 420 * will not return it (again) and we don't need 421 * to give it an entry in image->segment[]. 422 */ 423 } 424 /* Deal with the destination pages I have inadvertently allocated. 425 * 426 * Ideally I would convert multi-page allocations into single 427 * page allocations, and add everyting to image->dest_pages. 428 * 429 * For now it is simpler to just free the pages. 430 */ 431 kimage_free_page_list(&extra_pages); 432 433 return pages; 434 } 435 436 static struct page *kimage_alloc_crash_control_pages(struct kimage *image, 437 unsigned int order) 438 { 439 /* Control pages are special, they are the intermediaries 440 * that are needed while we copy the rest of the pages 441 * to their final resting place. As such they must 442 * not conflict with either the destination addresses 443 * or memory the kernel is already using. 444 * 445 * Control pages are also the only pags we must allocate 446 * when loading a crash kernel. All of the other pages 447 * are specified by the segments and we just memcpy 448 * into them directly. 449 * 450 * The only case where we really need more than one of 451 * these are for architectures where we cannot disable 452 * the MMU and must instead generate an identity mapped 453 * page table for all of the memory. 454 * 455 * Given the low demand this implements a very simple 456 * allocator that finds the first hole of the appropriate 457 * size in the reserved memory region, and allocates all 458 * of the memory up to and including the hole. 459 */ 460 unsigned long hole_start, hole_end, size; 461 struct page *pages; 462 463 pages = NULL; 464 size = (1 << order) << PAGE_SHIFT; 465 hole_start = (image->control_page + (size - 1)) & ~(size - 1); 466 hole_end = hole_start + size - 1; 467 while (hole_end <= crashk_res.end) { 468 unsigned long i; 469 470 if (hole_end > KEXEC_CONTROL_MEMORY_LIMIT) 471 break; 472 if (hole_end > crashk_res.end) 473 break; 474 /* See if I overlap any of the segments */ 475 for (i = 0; i < image->nr_segments; i++) { 476 unsigned long mstart, mend; 477 478 mstart = image->segment[i].mem; 479 mend = mstart + image->segment[i].memsz - 1; 480 if ((hole_end >= mstart) && (hole_start <= mend)) { 481 /* Advance the hole to the end of the segment */ 482 hole_start = (mend + (size - 1)) & ~(size - 1); 483 hole_end = hole_start + size - 1; 484 break; 485 } 486 } 487 /* If I don't overlap any segments I have found my hole! */ 488 if (i == image->nr_segments) { 489 pages = pfn_to_page(hole_start >> PAGE_SHIFT); 490 break; 491 } 492 } 493 if (pages) 494 image->control_page = hole_end; 495 496 return pages; 497 } 498 499 500 struct page *kimage_alloc_control_pages(struct kimage *image, 501 unsigned int order) 502 { 503 struct page *pages = NULL; 504 505 switch (image->type) { 506 case KEXEC_TYPE_DEFAULT: 507 pages = kimage_alloc_normal_control_pages(image, order); 508 break; 509 case KEXEC_TYPE_CRASH: 510 pages = kimage_alloc_crash_control_pages(image, order); 511 break; 512 } 513 514 return pages; 515 } 516 517 static int kimage_add_entry(struct kimage *image, kimage_entry_t entry) 518 { 519 if (*image->entry != 0) 520 image->entry++; 521 522 if (image->entry == image->last_entry) { 523 kimage_entry_t *ind_page; 524 struct page *page; 525 526 page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST); 527 if (!page) 528 return -ENOMEM; 529 530 ind_page = page_address(page); 531 *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION; 532 image->entry = ind_page; 533 image->last_entry = ind_page + 534 ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1); 535 } 536 *image->entry = entry; 537 image->entry++; 538 *image->entry = 0; 539 540 return 0; 541 } 542 543 static int kimage_set_destination(struct kimage *image, 544 unsigned long destination) 545 { 546 int result; 547 548 destination &= PAGE_MASK; 549 result = kimage_add_entry(image, destination | IND_DESTINATION); 550 if (result == 0) 551 image->destination = destination; 552 553 return result; 554 } 555 556 557 static int kimage_add_page(struct kimage *image, unsigned long page) 558 { 559 int result; 560 561 page &= PAGE_MASK; 562 result = kimage_add_entry(image, page | IND_SOURCE); 563 if (result == 0) 564 image->destination += PAGE_SIZE; 565 566 return result; 567 } 568 569 570 static void kimage_free_extra_pages(struct kimage *image) 571 { 572 /* Walk through and free any extra destination pages I may have */ 573 kimage_free_page_list(&image->dest_pages); 574 575 /* Walk through and free any unuseable pages I have cached */ 576 kimage_free_page_list(&image->unuseable_pages); 577 578 } 579 static int kimage_terminate(struct kimage *image) 580 { 581 if (*image->entry != 0) 582 image->entry++; 583 584 *image->entry = IND_DONE; 585 586 return 0; 587 } 588 589 #define for_each_kimage_entry(image, ptr, entry) \ 590 for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \ 591 ptr = (entry & IND_INDIRECTION)? \ 592 phys_to_virt((entry & PAGE_MASK)): ptr +1) 593 594 static void kimage_free_entry(kimage_entry_t entry) 595 { 596 struct page *page; 597 598 page = pfn_to_page(entry >> PAGE_SHIFT); 599 kimage_free_pages(page); 600 } 601 602 static void kimage_free(struct kimage *image) 603 { 604 kimage_entry_t *ptr, entry; 605 kimage_entry_t ind = 0; 606 607 if (!image) 608 return; 609 610 kimage_free_extra_pages(image); 611 for_each_kimage_entry(image, ptr, entry) { 612 if (entry & IND_INDIRECTION) { 613 /* Free the previous indirection page */ 614 if (ind & IND_INDIRECTION) 615 kimage_free_entry(ind); 616 /* Save this indirection page until we are 617 * done with it. 618 */ 619 ind = entry; 620 } 621 else if (entry & IND_SOURCE) 622 kimage_free_entry(entry); 623 } 624 /* Free the final indirection page */ 625 if (ind & IND_INDIRECTION) 626 kimage_free_entry(ind); 627 628 /* Handle any machine specific cleanup */ 629 machine_kexec_cleanup(image); 630 631 /* Free the kexec control pages... */ 632 kimage_free_page_list(&image->control_pages); 633 kfree(image); 634 } 635 636 static kimage_entry_t *kimage_dst_used(struct kimage *image, 637 unsigned long page) 638 { 639 kimage_entry_t *ptr, entry; 640 unsigned long destination = 0; 641 642 for_each_kimage_entry(image, ptr, entry) { 643 if (entry & IND_DESTINATION) 644 destination = entry & PAGE_MASK; 645 else if (entry & IND_SOURCE) { 646 if (page == destination) 647 return ptr; 648 destination += PAGE_SIZE; 649 } 650 } 651 652 return NULL; 653 } 654 655 static struct page *kimage_alloc_page(struct kimage *image, 656 gfp_t gfp_mask, 657 unsigned long destination) 658 { 659 /* 660 * Here we implement safeguards to ensure that a source page 661 * is not copied to its destination page before the data on 662 * the destination page is no longer useful. 663 * 664 * To do this we maintain the invariant that a source page is 665 * either its own destination page, or it is not a 666 * destination page at all. 667 * 668 * That is slightly stronger than required, but the proof 669 * that no problems will not occur is trivial, and the 670 * implementation is simply to verify. 671 * 672 * When allocating all pages normally this algorithm will run 673 * in O(N) time, but in the worst case it will run in O(N^2) 674 * time. If the runtime is a problem the data structures can 675 * be fixed. 676 */ 677 struct page *page; 678 unsigned long addr; 679 680 /* 681 * Walk through the list of destination pages, and see if I 682 * have a match. 683 */ 684 list_for_each_entry(page, &image->dest_pages, lru) { 685 addr = page_to_pfn(page) << PAGE_SHIFT; 686 if (addr == destination) { 687 list_del(&page->lru); 688 return page; 689 } 690 } 691 page = NULL; 692 while (1) { 693 kimage_entry_t *old; 694 695 /* Allocate a page, if we run out of memory give up */ 696 page = kimage_alloc_pages(gfp_mask, 0); 697 if (!page) 698 return NULL; 699 /* If the page cannot be used file it away */ 700 if (page_to_pfn(page) > 701 (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) { 702 list_add(&page->lru, &image->unuseable_pages); 703 continue; 704 } 705 addr = page_to_pfn(page) << PAGE_SHIFT; 706 707 /* If it is the destination page we want use it */ 708 if (addr == destination) 709 break; 710 711 /* If the page is not a destination page use it */ 712 if (!kimage_is_destination_range(image, addr, 713 addr + PAGE_SIZE)) 714 break; 715 716 /* 717 * I know that the page is someones destination page. 718 * See if there is already a source page for this 719 * destination page. And if so swap the source pages. 720 */ 721 old = kimage_dst_used(image, addr); 722 if (old) { 723 /* If so move it */ 724 unsigned long old_addr; 725 struct page *old_page; 726 727 old_addr = *old & PAGE_MASK; 728 old_page = pfn_to_page(old_addr >> PAGE_SHIFT); 729 copy_highpage(page, old_page); 730 *old = addr | (*old & ~PAGE_MASK); 731 732 /* The old page I have found cannot be a 733 * destination page, so return it. 734 */ 735 addr = old_addr; 736 page = old_page; 737 break; 738 } 739 else { 740 /* Place the page on the destination list I 741 * will use it later. 742 */ 743 list_add(&page->lru, &image->dest_pages); 744 } 745 } 746 747 return page; 748 } 749 750 static int kimage_load_normal_segment(struct kimage *image, 751 struct kexec_segment *segment) 752 { 753 unsigned long maddr; 754 unsigned long ubytes, mbytes; 755 int result; 756 unsigned char __user *buf; 757 758 result = 0; 759 buf = segment->buf; 760 ubytes = segment->bufsz; 761 mbytes = segment->memsz; 762 maddr = segment->mem; 763 764 result = kimage_set_destination(image, maddr); 765 if (result < 0) 766 goto out; 767 768 while (mbytes) { 769 struct page *page; 770 char *ptr; 771 size_t uchunk, mchunk; 772 773 page = kimage_alloc_page(image, GFP_HIGHUSER, maddr); 774 if (page == 0) { 775 result = -ENOMEM; 776 goto out; 777 } 778 result = kimage_add_page(image, page_to_pfn(page) 779 << PAGE_SHIFT); 780 if (result < 0) 781 goto out; 782 783 ptr = kmap(page); 784 /* Start with a clear page */ 785 memset(ptr, 0, PAGE_SIZE); 786 ptr += maddr & ~PAGE_MASK; 787 mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK); 788 if (mchunk > mbytes) 789 mchunk = mbytes; 790 791 uchunk = mchunk; 792 if (uchunk > ubytes) 793 uchunk = ubytes; 794 795 result = copy_from_user(ptr, buf, uchunk); 796 kunmap(page); 797 if (result) { 798 result = (result < 0) ? result : -EIO; 799 goto out; 800 } 801 ubytes -= uchunk; 802 maddr += mchunk; 803 buf += mchunk; 804 mbytes -= mchunk; 805 } 806 out: 807 return result; 808 } 809 810 static int kimage_load_crash_segment(struct kimage *image, 811 struct kexec_segment *segment) 812 { 813 /* For crash dumps kernels we simply copy the data from 814 * user space to it's destination. 815 * We do things a page at a time for the sake of kmap. 816 */ 817 unsigned long maddr; 818 unsigned long ubytes, mbytes; 819 int result; 820 unsigned char __user *buf; 821 822 result = 0; 823 buf = segment->buf; 824 ubytes = segment->bufsz; 825 mbytes = segment->memsz; 826 maddr = segment->mem; 827 while (mbytes) { 828 struct page *page; 829 char *ptr; 830 size_t uchunk, mchunk; 831 832 page = pfn_to_page(maddr >> PAGE_SHIFT); 833 if (page == 0) { 834 result = -ENOMEM; 835 goto out; 836 } 837 ptr = kmap(page); 838 ptr += maddr & ~PAGE_MASK; 839 mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK); 840 if (mchunk > mbytes) 841 mchunk = mbytes; 842 843 uchunk = mchunk; 844 if (uchunk > ubytes) { 845 uchunk = ubytes; 846 /* Zero the trailing part of the page */ 847 memset(ptr + uchunk, 0, mchunk - uchunk); 848 } 849 result = copy_from_user(ptr, buf, uchunk); 850 kunmap(page); 851 if (result) { 852 result = (result < 0) ? result : -EIO; 853 goto out; 854 } 855 ubytes -= uchunk; 856 maddr += mchunk; 857 buf += mchunk; 858 mbytes -= mchunk; 859 } 860 out: 861 return result; 862 } 863 864 static int kimage_load_segment(struct kimage *image, 865 struct kexec_segment *segment) 866 { 867 int result = -ENOMEM; 868 869 switch (image->type) { 870 case KEXEC_TYPE_DEFAULT: 871 result = kimage_load_normal_segment(image, segment); 872 break; 873 case KEXEC_TYPE_CRASH: 874 result = kimage_load_crash_segment(image, segment); 875 break; 876 } 877 878 return result; 879 } 880 881 /* 882 * Exec Kernel system call: for obvious reasons only root may call it. 883 * 884 * This call breaks up into three pieces. 885 * - A generic part which loads the new kernel from the current 886 * address space, and very carefully places the data in the 887 * allocated pages. 888 * 889 * - A generic part that interacts with the kernel and tells all of 890 * the devices to shut down. Preventing on-going dmas, and placing 891 * the devices in a consistent state so a later kernel can 892 * reinitialize them. 893 * 894 * - A machine specific part that includes the syscall number 895 * and the copies the image to it's final destination. And 896 * jumps into the image at entry. 897 * 898 * kexec does not sync, or unmount filesystems so if you need 899 * that to happen you need to do that yourself. 900 */ 901 struct kimage *kexec_image = NULL; 902 static struct kimage *kexec_crash_image = NULL; 903 /* 904 * A home grown binary mutex. 905 * Nothing can wait so this mutex is safe to use 906 * in interrupt context :) 907 */ 908 static int kexec_lock = 0; 909 910 asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, 911 struct kexec_segment __user *segments, 912 unsigned long flags) 913 { 914 struct kimage **dest_image, *image; 915 int locked; 916 int result; 917 918 /* We only trust the superuser with rebooting the system. */ 919 if (!capable(CAP_SYS_BOOT)) 920 return -EPERM; 921 922 /* 923 * Verify we have a legal set of flags 924 * This leaves us room for future extensions. 925 */ 926 if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK)) 927 return -EINVAL; 928 929 /* Verify we are on the appropriate architecture */ 930 if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) && 931 ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT)) 932 return -EINVAL; 933 934 /* Put an artificial cap on the number 935 * of segments passed to kexec_load. 936 */ 937 if (nr_segments > KEXEC_SEGMENT_MAX) 938 return -EINVAL; 939 940 image = NULL; 941 result = 0; 942 943 /* Because we write directly to the reserved memory 944 * region when loading crash kernels we need a mutex here to 945 * prevent multiple crash kernels from attempting to load 946 * simultaneously, and to prevent a crash kernel from loading 947 * over the top of a in use crash kernel. 948 * 949 * KISS: always take the mutex. 950 */ 951 locked = xchg(&kexec_lock, 1); 952 if (locked) 953 return -EBUSY; 954 955 dest_image = &kexec_image; 956 if (flags & KEXEC_ON_CRASH) 957 dest_image = &kexec_crash_image; 958 if (nr_segments > 0) { 959 unsigned long i; 960 961 /* Loading another kernel to reboot into */ 962 if ((flags & KEXEC_ON_CRASH) == 0) 963 result = kimage_normal_alloc(&image, entry, 964 nr_segments, segments); 965 /* Loading another kernel to switch to if this one crashes */ 966 else if (flags & KEXEC_ON_CRASH) { 967 /* Free any current crash dump kernel before 968 * we corrupt it. 969 */ 970 kimage_free(xchg(&kexec_crash_image, NULL)); 971 result = kimage_crash_alloc(&image, entry, 972 nr_segments, segments); 973 } 974 if (result) 975 goto out; 976 977 result = machine_kexec_prepare(image); 978 if (result) 979 goto out; 980 981 for (i = 0; i < nr_segments; i++) { 982 result = kimage_load_segment(image, &image->segment[i]); 983 if (result) 984 goto out; 985 } 986 result = kimage_terminate(image); 987 if (result) 988 goto out; 989 } 990 /* Install the new kernel, and Uninstall the old */ 991 image = xchg(dest_image, image); 992 993 out: 994 xchg(&kexec_lock, 0); /* Release the mutex */ 995 kimage_free(image); 996 997 return result; 998 } 999 1000 #ifdef CONFIG_COMPAT 1001 asmlinkage long compat_sys_kexec_load(unsigned long entry, 1002 unsigned long nr_segments, 1003 struct compat_kexec_segment __user *segments, 1004 unsigned long flags) 1005 { 1006 struct compat_kexec_segment in; 1007 struct kexec_segment out, __user *ksegments; 1008 unsigned long i, result; 1009 1010 /* Don't allow clients that don't understand the native 1011 * architecture to do anything. 1012 */ 1013 if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT) 1014 return -EINVAL; 1015 1016 if (nr_segments > KEXEC_SEGMENT_MAX) 1017 return -EINVAL; 1018 1019 ksegments = compat_alloc_user_space(nr_segments * sizeof(out)); 1020 for (i=0; i < nr_segments; i++) { 1021 result = copy_from_user(&in, &segments[i], sizeof(in)); 1022 if (result) 1023 return -EFAULT; 1024 1025 out.buf = compat_ptr(in.buf); 1026 out.bufsz = in.bufsz; 1027 out.mem = in.mem; 1028 out.memsz = in.memsz; 1029 1030 result = copy_to_user(&ksegments[i], &out, sizeof(out)); 1031 if (result) 1032 return -EFAULT; 1033 } 1034 1035 return sys_kexec_load(entry, nr_segments, ksegments, flags); 1036 } 1037 #endif 1038 1039 void crash_kexec(struct pt_regs *regs) 1040 { 1041 struct kimage *image; 1042 int locked; 1043 1044 1045 /* Take the kexec_lock here to prevent sys_kexec_load 1046 * running on one cpu from replacing the crash kernel 1047 * we are using after a panic on a different cpu. 1048 * 1049 * If the crash kernel was not located in a fixed area 1050 * of memory the xchg(&kexec_crash_image) would be 1051 * sufficient. But since I reuse the memory... 1052 */ 1053 locked = xchg(&kexec_lock, 1); 1054 if (!locked) { 1055 image = xchg(&kexec_crash_image, NULL); 1056 if (image) { 1057 machine_crash_shutdown(regs); 1058 machine_kexec(image); 1059 } 1060 xchg(&kexec_lock, 0); 1061 } 1062 } 1063