/*
 * kexec.c - kexec system call
 * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
 *
 * This source code is licensed under the GNU General Public License,
 * Version 2.  See the file COPYING for more details.
 */

#include <linux/mm.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/kexec.h>
#include <linux/spinlock.h>
#include <linux/list.h>
#include <linux/highmem.h>
#include <linux/syscalls.h>
#include <linux/reboot.h>
#include <linux/ioport.h>
#include <linux/hardirq.h>

#include <asm/page.h>
#include <asm/uaccess.h>
#include <asm/io.h>
#include <asm/system.h>
#include <asm/semaphore.h>

/* Per cpu memory for storing cpu states in case of system crash. */
note_buf_t *crash_notes;

/* Location of the reserved area for the crash kernel */
struct resource crashk_res = {
	.name  = "Crash kernel",
	.start = 0,
	.end   = 0,
	.flags = IORESOURCE_BUSY | IORESOURCE_MEM
};

int kexec_should_crash(struct task_struct *p)
{
	if (in_interrupt() || !p->pid || p->pid == 1 || panic_on_oops)
		return 1;
	return 0;
}

/*
 * When kexec transitions to the new kernel there is a one-to-one
 * mapping between physical and virtual addresses.  On processors
 * where you can disable the MMU this is trivial and easy.  For
 * others it is still a simple, predictable page table to set up.
 *
 * In that environment kexec copies the new kernel to its final
 * resting place.  This means I can only support memory whose
 * physical address can fit in an unsigned long.  In particular
 * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
 * If the assembly stub has more restrictive requirements
 * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
 * defined more restrictively in <asm/kexec.h>.
 *
 * The code for the transition from the current kernel to the
 * new kernel is placed in the control_code_buffer, whose size
 * is given by KEXEC_CONTROL_CODE_SIZE.  In the best case only a single
 * page of memory is necessary, but some architectures require more.
 * Because this memory must be identity mapped in the transition from
 * virtual to physical addresses it must live in the range
 * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
 * modifiable.
 *
 * The assembly stub in the control code buffer is passed a linked list
 * of descriptor pages detailing the source pages of the new kernel,
 * and the destination addresses of those source pages.  As this data
 * structure is not used in the context of the current OS, it must
 * be self-contained.
 *
 * The code has been made to work with highmem pages and will use a
 * destination page in its final resting place (if it happens
 * to allocate it).  The end product of this is that most of the
 * physical address space, and most of RAM, can be used.
 *
 * Future directions include:
 *  - allocating a page table with the control code buffer identity
 *    mapped, to simplify machine_kexec and make kexec_on_panic more
 *    reliable.
 */
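
/*
 * Illustrative sketch (not from the original source): the descriptor
 * list handed to the assembly stub is an array of kimage_entry_t
 * values, each a physical address tagged with a type bit.  A loaded
 * two-page image might look roughly like this (addresses made up):
 *
 *	0x00800000 | IND_DESTINATION	copy the following pages to 8 MB
 *	0x37a41000 | IND_SOURCE		first source page
 *	0x37a42000 | IND_SOURCE		second source page (dest += 4 KB)
 *	0x00000000 | IND_DONE		end of the list
 *
 * When an entry page fills up, an IND_INDIRECTION entry chains to the
 * next one.  kimage_add_entry() below builds this structure one entry
 * at a time.
 */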

/*
 * KIMAGE_NO_DEST is an impossible destination address, for
 * allocating pages whose destination address we do not care about.
 */
#define KIMAGE_NO_DEST (-1UL)

static int kimage_is_destination_range(struct kimage *image,
				       unsigned long start, unsigned long end);
static struct page *kimage_alloc_page(struct kimage *image,
				      gfp_t gfp_mask,
				      unsigned long dest);

static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
			   unsigned long nr_segments,
			   struct kexec_segment __user *segments)
{
	size_t segment_bytes;
	struct kimage *image;
	unsigned long i;
	int result;

	/* Allocate a controlling structure */
	result = -ENOMEM;
	image = kmalloc(sizeof(*image), GFP_KERNEL);
	if (!image)
		goto out;

	memset(image, 0, sizeof(*image));
	image->head = 0;
	image->entry = &image->head;
	image->last_entry = &image->head;
	image->control_page = ~0; /* By default this does not apply */
	image->start = entry;
	image->type = KEXEC_TYPE_DEFAULT;

	/* Initialize the list of control pages */
	INIT_LIST_HEAD(&image->control_pages);

	/* Initialize the list of destination pages */
	INIT_LIST_HEAD(&image->dest_pages);

	/* Initialize the list of unusable pages */
	INIT_LIST_HEAD(&image->unuseable_pages);

	/* Read in the segments */
	image->nr_segments = nr_segments;
	segment_bytes = nr_segments * sizeof(*segments);
	result = copy_from_user(image->segment, segments, segment_bytes);
	if (result)
		goto out;

	/*
	 * Verify we have good destination addresses.  The caller is
	 * responsible for making certain we don't attempt to load
	 * the new image into invalid or reserved areas of RAM.  This
	 * just verifies it is an address we can use.
	 *
	 * Since the kernel does everything in page size chunks, ensure
	 * the destination addresses are page aligned.  Too many
	 * special cases crop up when we don't do this.  The most
	 * insidious is getting overlapping destination addresses
	 * simply because addresses are rounded to page size
	 * granularity.
	 */
	result = -EADDRNOTAVAIL;
	for (i = 0; i < nr_segments; i++) {
		unsigned long mstart, mend;

		mstart = image->segment[i].mem;
		mend = mstart + image->segment[i].memsz;
		if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
			goto out;
		if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
			goto out;
	}

	/* Verify our destination addresses do not overlap.
	 * If we allowed overlapping destination addresses
	 * through, very weird things can happen with no
	 * easy explanation as one segment stomps on another.
	 */
	result = -EINVAL;
	for (i = 0; i < nr_segments; i++) {
		unsigned long mstart, mend;
		unsigned long j;

		mstart = image->segment[i].mem;
		mend = mstart + image->segment[i].memsz;
		for (j = 0; j < i; j++) {
			unsigned long pstart, pend;
			pstart = image->segment[j].mem;
			pend = pstart + image->segment[j].memsz;
			/* Do the segments overlap ? */
			if ((mend > pstart) && (mstart < pend))
				goto out;
		}
	}

	/* Ensure our buffer sizes do not exceed
	 * our memory sizes.  This should always be the case,
	 * and it is easier to check up front than to be surprised
	 * later on.
	 */
	result = -EINVAL;
	for (i = 0; i < nr_segments; i++) {
		if (image->segment[i].bufsz > image->segment[i].memsz)
			goto out;
	}

	result = 0;
out:
	if (result == 0)
		*rimage = image;
	else
		kfree(image);

	return result;
}
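
/*
 * Worked example of the checks above (hypothetical numbers, added for
 * illustration only).  With PAGE_SIZE = 4096:
 *
 *	seg[0].mem = 0x100000, memsz = 0x3000	-> accepted
 *	seg[1].mem = 0x102000, memsz = 0x2000	-> rejected, overlaps
 *						   seg[0] (0x100000-0x103000)
 *	seg[1].mem = 0x103800, memsz = 0x1000	-> rejected, not page aligned
 *
 * A bufsz larger than memsz is also rejected, since the loader would
 * have nowhere to put the extra bytes.
 */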

static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
			       unsigned long nr_segments,
			       struct kexec_segment __user *segments)
{
	int result;
	struct kimage *image;

	/* Allocate and initialize a controlling structure */
	image = NULL;
	result = do_kimage_alloc(&image, entry, nr_segments, segments);
	if (result)
		goto out;

	*rimage = image;

	/*
	 * Find a location for the control code buffer, and add it
	 * to the vector of segments so that its pages will also be
	 * counted as destination pages.
	 */
	result = -ENOMEM;
	image->control_code_page = kimage_alloc_control_pages(image,
					get_order(KEXEC_CONTROL_CODE_SIZE));
	if (!image->control_code_page) {
		printk(KERN_ERR "Could not allocate control_code_buffer\n");
		goto out;
	}

	result = 0;
out:
	if (result == 0)
		*rimage = image;
	else
		kfree(image);

	return result;
}
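
/*
 * Note on the crash variant below (illustration only; the numbers are
 * hypothetical).  A crash kernel is loaded into the region reserved at
 * boot with the crashkernel= parameter and described by crashk_res.
 * With crashkernel=64M@16M, crashk_res spans 0x1000000-0x4ffffff and
 * every segment, as well as the entry point, must fall inside that
 * window; anything outside is rejected with -EADDRNOTAVAIL.
 */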

static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
			      unsigned long nr_segments,
			      struct kexec_segment __user *segments)
{
	int result;
	struct kimage *image;
	unsigned long i;

	image = NULL;
	/* Verify we have a valid entry point */
	if ((entry < crashk_res.start) || (entry > crashk_res.end)) {
		result = -EADDRNOTAVAIL;
		goto out;
	}

	/* Allocate and initialize a controlling structure */
	result = do_kimage_alloc(&image, entry, nr_segments, segments);
	if (result)
		goto out;

	/* Enable the special crash kernel control page
	 * allocation policy.
	 */
	image->control_page = crashk_res.start;
	image->type = KEXEC_TYPE_CRASH;

	/*
	 * Verify we have good destination addresses.  Normally
	 * the caller is responsible for making certain we don't
	 * attempt to load the new image into invalid or reserved
	 * areas of RAM.  But crash kernels are preloaded into a
	 * reserved area of RAM.  We must ensure the addresses
	 * are in the reserved area, otherwise preloading the
	 * kernel could corrupt things.
	 */
	result = -EADDRNOTAVAIL;
	for (i = 0; i < nr_segments; i++) {
		unsigned long mstart, mend;

		mstart = image->segment[i].mem;
		mend = mstart + image->segment[i].memsz - 1;
		/* Ensure we are within the crash kernel limits */
		if ((mstart < crashk_res.start) || (mend > crashk_res.end))
			goto out;
	}

	/*
	 * Find a location for the control code buffer, and add it
	 * to the vector of segments so that its pages will also be
	 * counted as destination pages.
	 */
	result = -ENOMEM;
	image->control_code_page = kimage_alloc_control_pages(image,
					get_order(KEXEC_CONTROL_CODE_SIZE));
	if (!image->control_code_page) {
		printk(KERN_ERR "Could not allocate control_code_buffer\n");
		goto out;
	}

	result = 0;
out:
	if (result == 0)
		*rimage = image;
	else
		kfree(image);

	return result;
}

static int kimage_is_destination_range(struct kimage *image,
				       unsigned long start,
				       unsigned long end)
{
	unsigned long i;

	for (i = 0; i < image->nr_segments; i++) {
		unsigned long mstart, mend;

		mstart = image->segment[i].mem;
		mend = mstart + image->segment[i].memsz;
		if ((end > mstart) && (start < mend))
			return 1;
	}

	return 0;
}

static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
{
	struct page *pages;

	pages = alloc_pages(gfp_mask, order);
	if (pages) {
		unsigned int count, i;
		pages->mapping = NULL;
		set_page_private(pages, order);
		count = 1 << order;
		for (i = 0; i < count; i++)
			SetPageReserved(pages + i);
	}

	return pages;
}

static void kimage_free_pages(struct page *page)
{
	unsigned int order, count, i;

	order = page_private(page);
	count = 1 << order;
	for (i = 0; i < count; i++)
		ClearPageReserved(page + i);
	__free_pages(page, order);
}

static void kimage_free_page_list(struct list_head *list)
{
	struct list_head *pos, *next;

	list_for_each_safe(pos, next, list) {
		struct page *page;

		page = list_entry(pos, struct page, lru);
		list_del(&page->lru);
		kimage_free_pages(page);
	}
}

static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
						      unsigned int order)
{
	/* Control pages are special, they are the intermediaries
	 * that are needed while we copy the rest of the pages
	 * to their final resting place.  As such they must
	 * not conflict with either the destination addresses
	 * or memory the kernel is already using.
	 *
	 * The only case where we really need more than one of
	 * these is for architectures where we cannot disable
	 * the MMU and must instead generate an identity mapped
	 * page table for all of the memory.
	 *
	 * At worst this runs in O(N) of the image size.
	 */
	struct list_head extra_pages;
	struct page *pages;
	unsigned int count;

	count = 1 << order;
	INIT_LIST_HEAD(&extra_pages);

	/* Loop while I can allocate a page and the page allocated
	 * is a destination page.
	 */
	do {
		unsigned long pfn, epfn, addr, eaddr;

		pages = kimage_alloc_pages(GFP_KERNEL, order);
		if (!pages)
			break;
		pfn   = page_to_pfn(pages);
		epfn  = pfn + count;
		addr  = pfn << PAGE_SHIFT;
		eaddr = epfn << PAGE_SHIFT;
		if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
		    kimage_is_destination_range(image, addr, eaddr)) {
			list_add(&pages->lru, &extra_pages);
			pages = NULL;
		}
	} while (!pages);

	if (pages) {
		/* Remember the allocated page... */
		list_add(&pages->lru, &image->control_pages);

		/* Because the page is already in its destination
		 * location we will never allocate another page at
		 * that address.  Therefore kimage_alloc_pages
		 * will not return it (again) and we don't need
		 * to give it an entry in image->segment[].
		 */
	}
	/* Deal with the destination pages I have inadvertently allocated.
	 *
	 * Ideally I would convert multi-page allocations into single
	 * page allocations, and add everything to image->dest_pages.
	 *
	 * For now it is simpler to just free the pages.
	 */
	kimage_free_page_list(&extra_pages);

	return pages;
}
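
/*
 * Illustration of the retry loop above (hypothetical addresses).  If
 * the image's segments cover 1 MB - 5 MB and alloc_pages() happens to
 * hand back the page at 2 MB, that page lies inside a destination
 * range, so it is parked on extra_pages and another allocation is
 * attempted.  Once a page outside every destination range (and below
 * KEXEC_CONTROL_MEMORY_LIMIT) is found, the parked pages are freed
 * again.
 */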

static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
						     unsigned int order)
{
	/* Control pages are special, they are the intermediaries
	 * that are needed while we copy the rest of the pages
	 * to their final resting place.  As such they must
	 * not conflict with either the destination addresses
	 * or memory the kernel is already using.
	 *
	 * Control pages are also the only pages we must allocate
	 * when loading a crash kernel.  All of the other pages
	 * are specified by the segments and we just memcpy
	 * into them directly.
	 *
	 * The only case where we really need more than one of
	 * these is for architectures where we cannot disable
	 * the MMU and must instead generate an identity mapped
	 * page table for all of the memory.
	 *
	 * Given the low demand this implements a very simple
	 * allocator that finds the first hole of the appropriate
	 * size in the reserved memory region, and allocates all
	 * of the memory up to and including the hole.
	 */
	unsigned long hole_start, hole_end, size;
	struct page *pages;

	pages = NULL;
	size = (1 << order) << PAGE_SHIFT;
	hole_start = (image->control_page + (size - 1)) & ~(size - 1);
	hole_end = hole_start + size - 1;
	while (hole_end <= crashk_res.end) {
		unsigned long i;

		if (hole_end > KEXEC_CONTROL_MEMORY_LIMIT)
			break;
		if (hole_end > crashk_res.end)
			break;
		/* See if I overlap any of the segments */
		for (i = 0; i < image->nr_segments; i++) {
			unsigned long mstart, mend;

			mstart = image->segment[i].mem;
			mend = mstart + image->segment[i].memsz - 1;
			if ((hole_end >= mstart) && (hole_start <= mend)) {
				/* Advance the hole to the end of the segment */
				hole_start = (mend + (size - 1)) & ~(size - 1);
				hole_end = hole_start + size - 1;
				break;
			}
		}
		/* If I don't overlap any segments I have found my hole! */
		if (i == image->nr_segments) {
			pages = pfn_to_page(hole_start >> PAGE_SHIFT);
			break;
		}
	}
	if (pages)
		image->control_page = hole_end;

	return pages;
}
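
/*
 * Worked example of the hole-finding allocator above (hypothetical
 * layout).  Suppose crashk_res covers 16 MB - 80 MB, one segment is
 * loaded at 16 MB - 20 MB, and a single 4 KB control page is needed.
 * hole_start begins at image->control_page (16 MB), collides with the
 * segment, and is advanced to 20 MB; that spot is free, so the page at
 * 20 MB is returned and image->control_page is moved to the end of the
 * hole so the next request continues from there.
 */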

struct page *kimage_alloc_control_pages(struct kimage *image,
					unsigned int order)
{
	struct page *pages = NULL;

	switch (image->type) {
	case KEXEC_TYPE_DEFAULT:
		pages = kimage_alloc_normal_control_pages(image, order);
		break;
	case KEXEC_TYPE_CRASH:
		pages = kimage_alloc_crash_control_pages(image, order);
		break;
	}

	return pages;
}

static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
{
	if (*image->entry != 0)
		image->entry++;

	if (image->entry == image->last_entry) {
		kimage_entry_t *ind_page;
		struct page *page;

		page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
		if (!page)
			return -ENOMEM;

		ind_page = page_address(page);
		*image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
		image->entry = ind_page;
		image->last_entry = ind_page +
				((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
	}
	*image->entry = entry;
	image->entry++;
	*image->entry = 0;

	return 0;
}

static int kimage_set_destination(struct kimage *image,
				  unsigned long destination)
{
	int result;

	destination &= PAGE_MASK;
	result = kimage_add_entry(image, destination | IND_DESTINATION);
	if (result == 0)
		image->destination = destination;

	return result;
}

static int kimage_add_page(struct kimage *image, unsigned long page)
{
	int result;

	page &= PAGE_MASK;
	result = kimage_add_entry(image, page | IND_SOURCE);
	if (result == 0)
		image->destination += PAGE_SIZE;

	return result;
}

static void kimage_free_extra_pages(struct kimage *image)
{
	/* Walk through and free any extra destination pages I may have */
	kimage_free_page_list(&image->dest_pages);

	/* Walk through and free any unusable pages I have cached */
	kimage_free_page_list(&image->unuseable_pages);
}

static int kimage_terminate(struct kimage *image)
{
	if (*image->entry != 0)
		image->entry++;

	*image->entry = IND_DONE;

	return 0;
}

#define for_each_kimage_entry(image, ptr, entry) \
	for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
		ptr = (entry & IND_INDIRECTION) ? \
			phys_to_virt((entry & PAGE_MASK)) : ptr + 1)

static void kimage_free_entry(kimage_entry_t entry)
{
	struct page *page;

	page = pfn_to_page(entry >> PAGE_SHIFT);
	kimage_free_pages(page);
}

static void kimage_free(struct kimage *image)
{
	kimage_entry_t *ptr, entry;
	kimage_entry_t ind = 0;

	if (!image)
		return;

	kimage_free_extra_pages(image);
	for_each_kimage_entry(image, ptr, entry) {
		if (entry & IND_INDIRECTION) {
			/* Free the previous indirection page */
			if (ind & IND_INDIRECTION)
				kimage_free_entry(ind);
			/* Save this indirection page until we are
			 * done with it.
			 */
			ind = entry;
		} else if (entry & IND_SOURCE)
			kimage_free_entry(entry);
	}
	/* Free the final indirection page */
	if (ind & IND_INDIRECTION)
		kimage_free_entry(ind);

	/* Handle any machine specific cleanup */
	machine_kexec_cleanup(image);

	/* Free the kexec control pages... */
	kimage_free_page_list(&image->control_pages);
	kfree(image);
}
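
/*
 * A brief note on kimage_free() above (commentary added for clarity):
 * the walk is reading entries out of the indirection pages themselves,
 * so an indirection page cannot be freed the moment it is encountered;
 * it is remembered in 'ind' and only released after the walk has moved
 * on to the next indirection page (or finished).
 */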

static kimage_entry_t *kimage_dst_used(struct kimage *image,
				       unsigned long page)
{
	kimage_entry_t *ptr, entry;
	unsigned long destination = 0;

	for_each_kimage_entry(image, ptr, entry) {
		if (entry & IND_DESTINATION)
			destination = entry & PAGE_MASK;
		else if (entry & IND_SOURCE) {
			if (page == destination)
				return ptr;
			destination += PAGE_SIZE;
		}
	}

	return NULL;
}

static struct page *kimage_alloc_page(struct kimage *image,
				      gfp_t gfp_mask,
				      unsigned long destination)
{
	/*
	 * Here we implement safeguards to ensure that a source page
	 * is not copied to its destination page before the data on
	 * the destination page is no longer useful.
	 *
	 * To do this we maintain the invariant that a source page is
	 * either its own destination page, or it is not a
	 * destination page at all.
	 *
	 * That is slightly stronger than required, but the proof
	 * that no problems will occur is trivial, and the
	 * implementation is simple to verify.
	 *
	 * When allocating all pages normally this algorithm will run
	 * in O(N) time, but in the worst case it will run in O(N^2)
	 * time.  If the runtime is a problem the data structures can
	 * be fixed.
	 */
	struct page *page;
	unsigned long addr;

	/*
	 * Walk through the list of destination pages, and see if I
	 * have a match.
	 */
	list_for_each_entry(page, &image->dest_pages, lru) {
		addr = page_to_pfn(page) << PAGE_SHIFT;
		if (addr == destination) {
			list_del(&page->lru);
			return page;
		}
	}
	page = NULL;
	while (1) {
		kimage_entry_t *old;

		/* Allocate a page, if we run out of memory give up */
		page = kimage_alloc_pages(gfp_mask, 0);
		if (!page)
			return NULL;
		/* If the page cannot be used, file it away */
		if (page_to_pfn(page) >
				(KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
			list_add(&page->lru, &image->unuseable_pages);
			continue;
		}
		addr = page_to_pfn(page) << PAGE_SHIFT;

		/* If it is the destination page we want, use it */
		if (addr == destination)
			break;

		/* If the page is not a destination page use it */
		if (!kimage_is_destination_range(image, addr,
						 addr + PAGE_SIZE))
			break;

		/*
		 * I know that the page is someone's destination page.
		 * See if there is already a source page for this
		 * destination page.  And if so swap the source pages.
		 */
		old = kimage_dst_used(image, addr);
		if (old) {
			/* If so move it */
			unsigned long old_addr;
			struct page *old_page;

			old_addr = *old & PAGE_MASK;
			old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
			copy_highpage(page, old_page);
			*old = addr | (*old & ~PAGE_MASK);

			/* The old page I have found cannot be a
			 * destination page, so return it.
			 */
			addr = old_addr;
			page = old_page;
			break;
		} else {
			/* Place the page on the destination list; I
			 * will use it later.
			 */
			list_add(&page->lru, &image->dest_pages);
		}
	}

	return page;
}
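
/*
 * Illustration of the swap case in kimage_alloc_page() (hypothetical
 * addresses).  Say the page just allocated sits at 3 MB, and 3 MB is
 * the destination of a source page that was previously allocated at
 * 7 MB.  The 7 MB page's contents are copied into the 3 MB page, the
 * existing IND_SOURCE entry is rewritten to point at 3 MB, and the
 * 7 MB page, which by the invariant cannot itself be a destination
 * page, is handed back to the caller.  The invariant described above
 * therefore still holds.
 */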

static int kimage_load_normal_segment(struct kimage *image,
				      struct kexec_segment *segment)
{
	unsigned long maddr;
	unsigned long ubytes, mbytes;
	int result;
	unsigned char __user *buf;

	result = 0;
	buf = segment->buf;
	ubytes = segment->bufsz;
	mbytes = segment->memsz;
	maddr = segment->mem;

	result = kimage_set_destination(image, maddr);
	if (result < 0)
		goto out;

	while (mbytes) {
		struct page *page;
		char *ptr;
		size_t uchunk, mchunk;

		page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
		if (!page) {
			result = -ENOMEM;
			goto out;
		}
		result = kimage_add_page(image, page_to_pfn(page)
						<< PAGE_SHIFT);
		if (result < 0)
			goto out;

		ptr = kmap(page);
		/* Start with a clear page */
		memset(ptr, 0, PAGE_SIZE);
		ptr += maddr & ~PAGE_MASK;
		mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
		if (mchunk > mbytes)
			mchunk = mbytes;

		uchunk = mchunk;
		if (uchunk > ubytes)
			uchunk = ubytes;

		result = copy_from_user(ptr, buf, uchunk);
		kunmap(page);
		if (result) {
			result = (result < 0) ? result : -EIO;
			goto out;
		}
		ubytes -= uchunk;
		maddr += mchunk;
		buf += mchunk;
		mbytes -= mchunk;
	}
out:
	return result;
}

static int kimage_load_crash_segment(struct kimage *image,
				     struct kexec_segment *segment)
{
	/* For crash dump kernels we simply copy the data from
	 * user space to its destination.
	 * We do things a page at a time for the sake of kmap.
	 */
	unsigned long maddr;
	unsigned long ubytes, mbytes;
	int result;
	unsigned char __user *buf;

	result = 0;
	buf = segment->buf;
	ubytes = segment->bufsz;
	mbytes = segment->memsz;
	maddr = segment->mem;
	while (mbytes) {
		struct page *page;
		char *ptr;
		size_t uchunk, mchunk;

		page = pfn_to_page(maddr >> PAGE_SHIFT);
		if (!page) {
			result = -ENOMEM;
			goto out;
		}
		ptr = kmap(page);
		ptr += maddr & ~PAGE_MASK;
		mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
		if (mchunk > mbytes)
			mchunk = mbytes;

		uchunk = mchunk;
		if (uchunk > ubytes) {
			uchunk = ubytes;
			/* Zero the trailing part of the page */
			memset(ptr + uchunk, 0, mchunk - uchunk);
		}
		result = copy_from_user(ptr, buf, uchunk);
		kunmap(page);
		if (result) {
			result = (result < 0) ? result : -EIO;
			goto out;
		}
		ubytes -= uchunk;
		maddr += mchunk;
		buf += mchunk;
		mbytes -= mchunk;
	}
out:
	return result;
}

static int kimage_load_segment(struct kimage *image,
			       struct kexec_segment *segment)
{
	int result = -ENOMEM;

	switch (image->type) {
	case KEXEC_TYPE_DEFAULT:
		result = kimage_load_normal_segment(image, segment);
		break;
	case KEXEC_TYPE_CRASH:
		result = kimage_load_crash_segment(image, segment);
		break;
	}

	return result;
}
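
/*
 * Example of the chunking done by the two loaders above (hypothetical
 * sizes).  For a segment with memsz = 3 pages and bufsz = 2.5 pages,
 * the first two iterations copy a full page from user space; the third
 * copies half a page and leaves the rest zeroed (the normal loader
 * clears every page first, the crash loader zeroes the tail
 * explicitly), so memsz - bufsz always ends up as zero-filled memory.
 */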

/*
 * Exec Kernel system call: for obvious reasons only root may call it.
 *
 * This call breaks up into three pieces.
 * - A generic part which loads the new kernel from the current
 *   address space, and very carefully places the data in the
 *   allocated pages.
 *
 * - A generic part that interacts with the kernel and tells all of
 *   the devices to shut down, preventing on-going DMAs and placing
 *   the devices in a consistent state so a later kernel can
 *   reinitialize them.
 *
 * - A machine specific part that includes the syscall number and
 *   then copies the image to its final destination and jumps into
 *   the image at entry.
 *
 * kexec does not sync or unmount filesystems, so if you need that
 * to happen you need to do it yourself.
 */
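
/*
 * Rough sketch of how user space drives this interface (illustration
 * only; real users should rely on kexec-tools and the uapi headers,
 * and error handling is omitted):
 *
 *	struct kexec_segment seg = {
 *		.buf   = image_buf,			// bytes to load
 *		.bufsz = image_len,
 *		.mem   = (void *)0x100000,		// page-aligned destination
 *		.memsz = (image_len + 4095) & ~4095UL,
 *	};
 *	syscall(__NR_kexec_load, entry, 1, &seg, KEXEC_ARCH_DEFAULT);
 *
 * A later reboot(LINUX_REBOOT_CMD_KEXEC) (or a panic, for an image
 * loaded with KEXEC_ON_CRASH) then jumps into the loaded image.
 */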

struct kimage *kexec_image = NULL;
static struct kimage *kexec_crash_image = NULL;
/*
 * A home-grown binary mutex.
 * Nothing can wait so this mutex is safe to use
 * in interrupt context :)
 */
static int kexec_lock = 0;

asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
			       struct kexec_segment __user *segments,
			       unsigned long flags)
{
	struct kimage **dest_image, *image;
	int locked;
	int result;

	/* We only trust the superuser with rebooting the system. */
	if (!capable(CAP_SYS_BOOT))
		return -EPERM;

	/*
	 * Verify we have a legal set of flags.
	 * This leaves us room for future extensions: with the
	 * architecture field masked off, only bits in KEXEC_FLAGS
	 * may be set.
	 */
	if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK))
		return -EINVAL;

	/* Verify we are on the appropriate architecture */
	if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) &&
	    ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT))
		return -EINVAL;

	/* Put an artificial cap on the number
	 * of segments passed to kexec_load.
	 */
	if (nr_segments > KEXEC_SEGMENT_MAX)
		return -EINVAL;

	image = NULL;
	result = 0;

	/* Because we write directly to the reserved memory
	 * region when loading crash kernels we need a mutex here to
	 * prevent multiple crash kernels from attempting to load
	 * simultaneously, and to prevent a crash kernel from loading
	 * over the top of an in-use crash kernel.
	 *
	 * KISS: always take the mutex.
	 */
	locked = xchg(&kexec_lock, 1);
	if (locked)
		return -EBUSY;

	dest_image = &kexec_image;
	if (flags & KEXEC_ON_CRASH)
		dest_image = &kexec_crash_image;
	if (nr_segments > 0) {
		unsigned long i;

		/* Loading another kernel to reboot into */
		if ((flags & KEXEC_ON_CRASH) == 0)
			result = kimage_normal_alloc(&image, entry,
						     nr_segments, segments);
		/* Loading another kernel to switch to if this one crashes */
		else if (flags & KEXEC_ON_CRASH) {
			/* Free any current crash dump kernel before
			 * we corrupt it.
			 */
			kimage_free(xchg(&kexec_crash_image, NULL));
			result = kimage_crash_alloc(&image, entry,
						    nr_segments, segments);
		}
		if (result)
			goto out;

		result = machine_kexec_prepare(image);
		if (result)
			goto out;

		for (i = 0; i < nr_segments; i++) {
			result = kimage_load_segment(image, &image->segment[i]);
			if (result)
				goto out;
		}
		result = kimage_terminate(image);
		if (result)
			goto out;
	}
	/* Install the new kernel and uninstall the old */
	image = xchg(dest_image, image);

out:
	xchg(&kexec_lock, 0); /* Release the mutex */
	kimage_free(image);

	return result;
}

#ifdef CONFIG_COMPAT
asmlinkage long compat_sys_kexec_load(unsigned long entry,
				      unsigned long nr_segments,
				      struct compat_kexec_segment __user *segments,
				      unsigned long flags)
{
	struct compat_kexec_segment in;
	struct kexec_segment out, __user *ksegments;
	unsigned long i, result;

	/* Don't allow clients that don't understand the native
	 * architecture to do anything.
	 */
	if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT)
		return -EINVAL;

	if (nr_segments > KEXEC_SEGMENT_MAX)
		return -EINVAL;

	ksegments = compat_alloc_user_space(nr_segments * sizeof(out));
	for (i = 0; i < nr_segments; i++) {
		result = copy_from_user(&in, &segments[i], sizeof(in));
		if (result)
			return -EFAULT;

		out.buf = compat_ptr(in.buf);
		out.bufsz = in.bufsz;
		out.mem = in.mem;
		out.memsz = in.memsz;

		result = copy_to_user(&ksegments[i], &out, sizeof(out));
		if (result)
			return -EFAULT;
	}

	return sys_kexec_load(entry, nr_segments, ksegments, flags);
}
#endif

void crash_kexec(struct pt_regs *regs)
{
	struct kimage *image;
	int locked;

	/* Take the kexec_lock here to prevent sys_kexec_load
	 * running on one cpu from replacing the crash kernel
	 * we are using after a panic on a different cpu.
	 *
	 * If the crash kernel was not located in a fixed area
	 * of memory the xchg(&kexec_crash_image) would be
	 * sufficient.  But since I reuse the memory...
	 */
	locked = xchg(&kexec_lock, 1);
	if (!locked) {
		image = xchg(&kexec_crash_image, NULL);
		if (image) {
			struct pt_regs fixed_regs;
			crash_setup_regs(&fixed_regs, regs);
			machine_crash_shutdown(&fixed_regs);
			machine_kexec(image);
		}
		xchg(&kexec_lock, 0);
	}
}

static int __init crash_notes_memory_init(void)
{
	/* Allocate memory for saving cpu registers. */
	crash_notes = alloc_percpu(note_buf_t);
	if (!crash_notes) {
		printk(KERN_ERR "Kexec: Memory allocation for saving cpu register"
		       " states failed\n");
		return -ENOMEM;
	}
	return 0;
}
module_init(crash_notes_memory_init)