/*
 * Copyright (c) 2012, Microsoft Corporation.
 *
 * Author:
 *   K. Y. Srinivasan <kys@microsoft.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 * NON INFRINGEMENT.  See the GNU General Public License for more
 * details.
 *
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kernel.h>
#include <linux/mman.h>
#include <linux/delay.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/kthread.h>
#include <linux/completion.h>
#include <linux/memory_hotplug.h>
#include <linux/memory.h>
#include <linux/notifier.h>
#include <linux/percpu_counter.h>

#include <linux/hyperv.h>

/*
 * We begin with definitions supporting the Dynamic Memory protocol
 * with the host.
 *
 * Begin protocol definitions.
 */


/*
 * Protocol versions. The low word is the minor version, the high word is
 * the major version.
 *
 * History:
 * Initial version 1.0
 * Changed to 0.1 on 2009/03/25
 * Changed to 0.2 on 2009/05/14
 * Changed to 0.3 on 2009/12/03
 * Changed to 1.0 on 2011/04/05
 */

#define DYNMEM_MAKE_VERSION(Major, Minor) ((__u32)(((Major) << 16) | (Minor)))
#define DYNMEM_MAJOR_VERSION(Version) ((__u32)(Version) >> 16)
#define DYNMEM_MINOR_VERSION(Version) ((__u32)(Version) & 0xff)

enum {
	DYNMEM_PROTOCOL_VERSION_1 = DYNMEM_MAKE_VERSION(0, 3),
	DYNMEM_PROTOCOL_VERSION_2 = DYNMEM_MAKE_VERSION(1, 0),

	DYNMEM_PROTOCOL_VERSION_WIN7 = DYNMEM_PROTOCOL_VERSION_1,
	DYNMEM_PROTOCOL_VERSION_WIN8 = DYNMEM_PROTOCOL_VERSION_2,

	DYNMEM_PROTOCOL_VERSION_CURRENT = DYNMEM_PROTOCOL_VERSION_WIN8
};


/*
 * Message Types
 */

enum dm_message_type {
	/*
	 * Version 0.3
	 */
	DM_ERROR = 0,
	DM_VERSION_REQUEST = 1,
	DM_VERSION_RESPONSE = 2,
	DM_CAPABILITIES_REPORT = 3,
	DM_CAPABILITIES_RESPONSE = 4,
	DM_STATUS_REPORT = 5,
	DM_BALLOON_REQUEST = 6,
	DM_BALLOON_RESPONSE = 7,
	DM_UNBALLOON_REQUEST = 8,
	DM_UNBALLOON_RESPONSE = 9,
	DM_MEM_HOT_ADD_REQUEST = 10,
	DM_MEM_HOT_ADD_RESPONSE = 11,
	DM_VERSION_03_MAX = 11,
	/*
	 * Version 1.0.
	 */
	DM_INFO_MESSAGE = 12,
	DM_VERSION_1_MAX = 12
};


/*
 * Structures defining the dynamic memory management
 * protocol.
 */

union dm_version {
	struct {
		__u16 minor_version;
		__u16 major_version;
	};
	__u32 version;
} __packed;


union dm_caps {
	struct {
		__u64 balloon:1;
		__u64 hot_add:1;
		/*
		 * To support guests that may have alignment
		 * limitations on hot-add, the guest can specify
		 * its alignment requirements; a value of n
		 * represents an alignment of 2^n in megabytes.
		 */
		__u64 hot_add_alignment:4;
		__u64 reservedz:58;
	} cap_bits;
	__u64 caps;
} __packed;

union dm_mem_page_range {
	struct {
		/*
		 * The PFN number of the first page in the range.
		 * 40 bits is the architectural limit of a PFN
		 * number for AMD64.
		 */
		__u64 start_page:40;
		/*
		 * The number of pages in the range.
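		 * 24 bits lets a single range describe up to 2^24 pages,
		 * i.e. 64 GiB with 4 KiB pages.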
		 */
		__u64 page_cnt:24;
	} finfo;
	__u64 page_range;
} __packed;


/*
 * The header for all dynamic memory messages:
 *
 * type: Type of the message.
 * size: Size of the message in bytes; including the header.
 * trans_id: The guest is responsible for manufacturing this ID.
 */

struct dm_header {
	__u16 type;
	__u16 size;
	__u32 trans_id;
} __packed;

/*
 * A generic message format for dynamic memory.
 * Specific message formats are defined later in the file.
 */

struct dm_message {
	struct dm_header hdr;
	__u8 data[]; /* enclosed message */
} __packed;


/*
 * Specific message types supporting the dynamic memory protocol.
 */

/*
 * Version negotiation message. Sent from the guest to the host.
 * The guest is free to try different versions until the host
 * accepts the version.
 *
 * dm_version: The protocol version requested.
 * is_last_attempt: If TRUE, this is the last version the guest will request.
 * reservedz: Reserved field, set to zero.
 */

struct dm_version_request {
	struct dm_header hdr;
	union dm_version version;
	__u32 is_last_attempt:1;
	__u32 reservedz:31;
} __packed;

/*
 * Version response message; sent from the host to the guest and
 * indicates if the host has accepted the version sent by the guest.
 *
 * is_accepted: If TRUE, the host has accepted the version and the guest
 * should proceed to the next stage of the protocol. FALSE indicates that
 * the guest should re-try with a different version.
 *
 * reservedz: Reserved field, set to zero.
 */

struct dm_version_response {
	struct dm_header hdr;
	__u64 is_accepted:1;
	__u64 reservedz:63;
} __packed;

/*
 * Message reporting capabilities. This is sent from the guest to the
 * host.
 */

struct dm_capabilities {
	struct dm_header hdr;
	union dm_caps caps;
	__u64 min_page_cnt;
	__u64 max_page_number;
} __packed;

/*
 * Response to the capabilities message. This is sent from the host to the
 * guest. This message notifies if the host has accepted the guest's
 * capabilities. If the host has not accepted, the guest must shut down
 * the service.
 *
 * is_accepted: Indicates if the host has accepted the guest's capabilities.
 * reservedz: Must be 0.
 */

struct dm_capabilities_resp_msg {
	struct dm_header hdr;
	__u64 is_accepted:1;
	__u64 reservedz:63;
} __packed;

/*
 * This message is used to report memory pressure from the guest.
 * This message is not part of any transaction and there is no
 * response to this message.
 *
 * num_avail: Available memory in pages.
 * num_committed: Committed memory in pages.
 * page_file_size: The accumulated size of all page files
 * in the system in pages.
 * zero_free: The number of zero and free pages.
 * page_file_writes: The writes to the page file in pages.
 * io_diff: An indicator of file cache efficiency or page file activity,
 * calculated as File Cache Page Fault Count - Page Read Count.
 * This value is in pages.
 *
 * Some of these metrics are Windows specific and fortunately
 * the algorithm on the host side that computes the guest memory
 * pressure only uses the num_committed value.
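 * (post_status() below folds the ballooned page count and the balloon
 * floor into num_committed before reporting it.)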
 */

struct dm_status {
	struct dm_header hdr;
	__u64 num_avail;
	__u64 num_committed;
	__u64 page_file_size;
	__u64 zero_free;
	__u32 page_file_writes;
	__u32 io_diff;
} __packed;


/*
 * Message to ask the guest to allocate memory - balloon up message.
 * This message is sent from the host to the guest. The guest may not be
 * able to allocate as much memory as requested.
 *
 * num_pages: number of pages to allocate.
 */

struct dm_balloon {
	struct dm_header hdr;
	__u32 num_pages;
	__u32 reservedz;
} __packed;


/*
 * Balloon response message; this message is sent from the guest
 * to the host in response to the balloon message.
 *
 * reservedz: Reserved; must be set to zero.
 * more_pages: If FALSE, this is the last message of the transaction.
 * If TRUE, there will be at least one more message from the guest.
 *
 * range_count: The number of ranges in the range array.
 *
 * range_array: An array of page ranges returned to the host.
 *
 */

struct dm_balloon_response {
	struct dm_header hdr;
	__u32 reservedz;
	__u32 more_pages:1;
	__u32 range_count:31;
	union dm_mem_page_range range_array[];
} __packed;

/*
 * Un-balloon message; this message is sent from the host
 * to the guest to give the guest more memory.
 *
 * more_pages: If FALSE, this is the last message of the transaction.
 * If TRUE, there will be at least one more message from the guest.
 *
 * reservedz: Reserved; must be set to zero.
 *
 * range_count: The number of ranges in the range array.
 *
 * range_array: An array of page ranges returned to the host.
 *
 */

struct dm_unballoon_request {
	struct dm_header hdr;
	__u32 more_pages:1;
	__u32 reservedz:31;
	__u32 range_count;
	union dm_mem_page_range range_array[];
} __packed;

/*
 * Un-balloon response message; this message is sent from the guest
 * to the host in response to an unballoon request.
 *
 */

struct dm_unballoon_response {
	struct dm_header hdr;
} __packed;


/*
 * Hot add request message. Message sent from the host to the guest.
 *
 * mem_range: Memory range to hot add.
 *
 * On Linux we currently don't support this since we cannot hot add
 * arbitrary granularity of memory.
 */

struct dm_hot_add {
	struct dm_header hdr;
	union dm_mem_page_range range;
} __packed;

/*
 * Hot add response message.
 * This message is sent by the guest to report the status of a hot add request.
 * If page_count is less than the requested page count, then the host should
 * assume all further hot add requests will fail, since this indicates that
 * the guest has hit an upper physical memory barrier.
 *
 * Hot adds may also fail due to low resources; in this case, the guest must
 * not complete this message until the hot add can succeed, and the host must
 * not send a new hot add request until the response is sent.
 * If the VSC fails to hot add memory DYNMEM_NUMBER_OF_UNSUCCESSFUL_HOTADD_ATTEMPTS
 * times, it fails the request.
 *
 *
 * page_count: number of pages that were successfully hot added.
 *
 * result: result of the operation 1: success, 0: failure.
 *
 */

struct dm_hot_add_response {
	struct dm_header hdr;
	__u32 page_count;
	__u32 result;
} __packed;

/*
 * Types of information sent from host to the guest.
 */

enum dm_info_type {
	INFO_TYPE_MAX_PAGE_CNT = 0,
	MAX_INFO_TYPE
};


/*
 * Header for the information message.
 */

struct dm_info_header {
	enum dm_info_type type;
	__u32 data_size;
} __packed;

/*
 * This message is sent from the host to the guest to pass
 * some relevant information (win8 addition).
 *
 * reserved: not used.
 * info_size: size of the information blob.
 * info: information blob.
 */

struct dm_info_msg {
	struct dm_header hdr;
	__u32 reserved;
	__u32 info_size;
	__u8  info[];
};

/*
 * End protocol definitions.
 */

/*
 * State to manage hot adding memory into the guest.
 * The range start_pfn : end_pfn specifies the range
 * that the host has asked us to hot add. The range
 * start_pfn : ha_end_pfn specifies the range that we have
 * currently hot added. We hot add in multiples of 128M
 * chunks; it is possible that we may not be able to bring
 * online all the pages in the region. The range
 * covered_start_pfn : covered_end_pfn defines the pages that can
 * be brought online.
 */

struct hv_hotadd_state {
	struct list_head list;
	unsigned long start_pfn;
	unsigned long covered_start_pfn;
	unsigned long covered_end_pfn;
	unsigned long ha_end_pfn;
	unsigned long end_pfn;
};

struct balloon_state {
	__u32 num_pages;
	struct work_struct wrk;
};

struct hot_add_wrk {
	union dm_mem_page_range ha_page_range;
	union dm_mem_page_range ha_region_range;
	struct work_struct wrk;
};

static bool hot_add = true;
static bool do_hot_add;
/*
 * Delay reporting memory pressure by
 * the specified number of seconds.
 */
static uint pressure_report_delay = 45;

module_param(hot_add, bool, (S_IRUGO | S_IWUSR));
MODULE_PARM_DESC(hot_add, "If set attempt memory hot_add");

module_param(pressure_report_delay, uint, (S_IRUGO | S_IWUSR));
MODULE_PARM_DESC(pressure_report_delay, "Delay in secs in reporting pressure");
static atomic_t trans_id = ATOMIC_INIT(0);

static int dm_ring_size = (5 * PAGE_SIZE);

/*
 * Driver specific state.
 */

enum hv_dm_state {
	DM_INITIALIZING = 0,
	DM_INITIALIZED,
	DM_BALLOON_UP,
	DM_BALLOON_DOWN,
	DM_HOT_ADD,
	DM_INIT_ERROR
};


static __u8 recv_buffer[PAGE_SIZE];
static __u8 *send_buffer;
#define PAGES_IN_2M 512
#define HA_CHUNK (32 * 1024)

struct hv_dynmem_device {
	struct hv_device *dev;
	enum hv_dm_state state;
	struct completion host_event;
	struct completion config_event;

	/*
	 * Number of pages we have currently ballooned out.
	 */
	unsigned int num_pages_ballooned;

	/*
	 * State to manage the ballooning (up) operation.
	 */
	struct balloon_state balloon_wrk;

	/*
	 * State to execute the "hot-add" operation.
	 */
	struct hot_add_wrk ha_wrk;

	/*
	 * This state tracks if the host has specified a hot-add
	 * region.
	 */
	bool host_specified_ha_region;

	/*
	 * State to synchronize hot-add.
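	 * hv_mem_hot_add() sets ha_waiting and blocks on ol_waitevent until
	 * hv_online_page() reports that the newly added block is coming online.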
	 */
	struct completion ol_waitevent;
	bool ha_waiting;
	/*
	 * This thread handles hot-add
	 * requests from the host as well as notifying
	 * the host with regards to memory pressure in
	 * the guest.
	 */
	struct task_struct *thread;

	/*
	 * A list of hot-add regions.
	 */
	struct list_head ha_region_list;

	/*
	 * We start with the highest version we can support
	 * and downgrade based on the host; we save here the
	 * next version to try.
	 */
	__u32 next_version;
};

static struct hv_dynmem_device dm_device;

#ifdef CONFIG_MEMORY_HOTPLUG

static void hv_bring_pgs_online(unsigned long start_pfn, unsigned long size)
{
	int i;

	for (i = 0; i < size; i++) {
		struct page *pg;
		pg = pfn_to_page(start_pfn + i);
		__online_page_set_limits(pg);
		__online_page_increment_counters(pg);
		__online_page_free(pg);
	}
}

static void hv_mem_hot_add(unsigned long start, unsigned long size,
				unsigned long pfn_count,
				struct hv_hotadd_state *has)
{
	int ret = 0;
	int i, nid, t;
	unsigned long start_pfn;
	unsigned long processed_pfn;
	unsigned long total_pfn = pfn_count;

	for (i = 0; i < (size/HA_CHUNK); i++) {
		start_pfn = start + (i * HA_CHUNK);
		has->ha_end_pfn += HA_CHUNK;

		if (total_pfn > HA_CHUNK) {
			processed_pfn = HA_CHUNK;
			total_pfn -= HA_CHUNK;
		} else {
			processed_pfn = total_pfn;
			total_pfn = 0;
		}

		has->covered_end_pfn += processed_pfn;

		init_completion(&dm_device.ol_waitevent);
		dm_device.ha_waiting = true;

		nid = memory_add_physaddr_to_nid(PFN_PHYS(start_pfn));
		ret = add_memory(nid, PFN_PHYS((start_pfn)),
				(HA_CHUNK << PAGE_SHIFT));

		if (ret) {
			pr_info("hot_add memory failed, error is %d\n", ret);
			if (ret == -EEXIST) {
				/*
				 * This error indicates that the failure
				 * is not transient. This is the
				 * case where the guest's physical address map
				 * precludes hot adding memory. Stop all further
				 * memory hot-add.
				 */
				do_hot_add = false;
			}
			has->ha_end_pfn -= HA_CHUNK;
			has->covered_end_pfn -= processed_pfn;
			break;
		}

		/*
		 * Wait for the memory block to be onlined.
		 */
		t = wait_for_completion_timeout(&dm_device.ol_waitevent, 5*HZ);
		if (t == 0) {
			pr_info("hot_add memory timed out\n");
			has->ha_end_pfn -= HA_CHUNK;
			has->covered_end_pfn -= processed_pfn;
			break;
		}

	}

	return;
}

static void hv_online_page(struct page *pg)
{
	struct list_head *cur;
	struct hv_hotadd_state *has;
	unsigned long cur_start_pgp;
	unsigned long cur_end_pgp;

	if (dm_device.ha_waiting) {
		dm_device.ha_waiting = false;
		complete(&dm_device.ol_waitevent);
	}

	list_for_each(cur, &dm_device.ha_region_list) {
		has = list_entry(cur, struct hv_hotadd_state, list);
		cur_start_pgp = (unsigned long)
			pfn_to_page(has->covered_start_pfn);
		cur_end_pgp = (unsigned long)pfn_to_page(has->covered_end_pfn);

		if (((unsigned long)pg >= cur_start_pgp) &&
		    ((unsigned long)pg < cur_end_pgp)) {
			/*
			 * This frame is currently backed; online the
			 * page.
			 */
			__online_page_set_limits(pg);
			__online_page_increment_counters(pg);
			__online_page_free(pg);
			has->covered_start_pfn++;
		}
	}
}

static bool pfn_covered(unsigned long start_pfn, unsigned long pfn_cnt)
{
	struct list_head *cur;
	struct hv_hotadd_state *has;
	unsigned long residual, new_inc;

	if (list_empty(&dm_device.ha_region_list))
		return false;

	list_for_each(cur, &dm_device.ha_region_list) {
		has = list_entry(cur, struct hv_hotadd_state, list);

		/*
		 * If the pfn range we are dealing with is not in the current
		 * "hot add block", move on.
		 */
		if ((start_pfn >= has->end_pfn))
			continue;
		/*
		 * If the current hot-add request extends beyond
		 * our current limit, extend it.
		 */
		if ((start_pfn + pfn_cnt) > has->end_pfn) {
			residual = (start_pfn + pfn_cnt - has->end_pfn);
			/*
			 * Extend the region by multiples of HA_CHUNK.
			 */
			new_inc = (residual / HA_CHUNK) * HA_CHUNK;
			if (residual % HA_CHUNK)
				new_inc += HA_CHUNK;

			has->end_pfn += new_inc;
		}

		/*
		 * If the current start pfn is not where the covered_end
		 * is, update it.
		 */

		if (has->covered_end_pfn != start_pfn) {
			has->covered_end_pfn = start_pfn;
			has->covered_start_pfn = start_pfn;
		}
		return true;

	}

	return false;
}

static unsigned long handle_pg_range(unsigned long pg_start,
					unsigned long pg_count)
{
	unsigned long start_pfn = pg_start;
	unsigned long pfn_cnt = pg_count;
	unsigned long size;
	struct list_head *cur;
	struct hv_hotadd_state *has;
	unsigned long pgs_ol = 0;
	unsigned long old_covered_state;

	if (list_empty(&dm_device.ha_region_list))
		return 0;

	list_for_each(cur, &dm_device.ha_region_list) {
		has = list_entry(cur, struct hv_hotadd_state, list);

		/*
		 * If the pfn range we are dealing with is not in the current
		 * "hot add block", move on.
		 */
		if ((start_pfn >= has->end_pfn))
			continue;

		old_covered_state = has->covered_end_pfn;

		if (start_pfn < has->ha_end_pfn) {
			/*
			 * This is the case where we are backing pages
			 * in an already hot added region. Bring
			 * these pages online first.
			 */
			pgs_ol = has->ha_end_pfn - start_pfn;
			if (pgs_ol > pfn_cnt)
				pgs_ol = pfn_cnt;
			hv_bring_pgs_online(start_pfn, pgs_ol);
			has->covered_end_pfn += pgs_ol;
			has->covered_start_pfn += pgs_ol;
			pfn_cnt -= pgs_ol;
		}

		if ((has->ha_end_pfn < has->end_pfn) && (pfn_cnt > 0)) {
			/*
			 * We have some residual hot add range
			 * that needs to be hot added; hot add
			 * it now. Hot add a multiple of
			 * HA_CHUNK that fully covers the pages
			 * we have.
			 */
			size = (has->end_pfn - has->ha_end_pfn);
			if (pfn_cnt <= size) {
				size = ((pfn_cnt / HA_CHUNK) * HA_CHUNK);
				if (pfn_cnt % HA_CHUNK)
					size += HA_CHUNK;
			} else {
				pfn_cnt = size;
			}
			hv_mem_hot_add(has->ha_end_pfn, size, pfn_cnt, has);
		}
		/*
		 * If we managed to online any pages that were given to us,
		 * we declare success.
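		 * The delta in covered_end_pfn accounts for both the pages
		 * brought online above and any chunks hot added just now.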
		 */
		return has->covered_end_pfn - old_covered_state;

	}

	return 0;
}

static unsigned long process_hot_add(unsigned long pg_start,
					unsigned long pfn_cnt,
					unsigned long rg_start,
					unsigned long rg_size)
{
	struct hv_hotadd_state *ha_region = NULL;

	if (pfn_cnt == 0)
		return 0;

	if (!dm_device.host_specified_ha_region)
		if (pfn_covered(pg_start, pfn_cnt))
			goto do_pg_range;

	/*
	 * If the host has specified a hot-add range, deal with it first.
	 */

	if (rg_size != 0) {
		ha_region = kzalloc(sizeof(struct hv_hotadd_state), GFP_KERNEL);
		if (!ha_region)
			return 0;

		INIT_LIST_HEAD(&ha_region->list);

		list_add_tail(&ha_region->list, &dm_device.ha_region_list);
		ha_region->start_pfn = rg_start;
		ha_region->ha_end_pfn = rg_start;
		ha_region->covered_start_pfn = pg_start;
		ha_region->covered_end_pfn = pg_start;
		ha_region->end_pfn = rg_start + rg_size;
	}

do_pg_range:
	/*
	 * Process the page range specified; bringing them
	 * online if possible.
	 */
	return handle_pg_range(pg_start, pfn_cnt);
}

#endif

static void hot_add_req(struct work_struct *dummy)
{
	struct dm_hot_add_response resp;
#ifdef CONFIG_MEMORY_HOTPLUG
	unsigned long pg_start, pfn_cnt;
	unsigned long rg_start, rg_sz;
#endif
	struct hv_dynmem_device *dm = &dm_device;

	memset(&resp, 0, sizeof(struct dm_hot_add_response));
	resp.hdr.type = DM_MEM_HOT_ADD_RESPONSE;
	resp.hdr.size = sizeof(struct dm_hot_add_response);
	resp.hdr.trans_id = atomic_inc_return(&trans_id);

#ifdef CONFIG_MEMORY_HOTPLUG
	pg_start = dm->ha_wrk.ha_page_range.finfo.start_page;
	pfn_cnt = dm->ha_wrk.ha_page_range.finfo.page_cnt;

	rg_start = dm->ha_wrk.ha_region_range.finfo.start_page;
	rg_sz = dm->ha_wrk.ha_region_range.finfo.page_cnt;

	if ((rg_start == 0) && (!dm->host_specified_ha_region)) {
		unsigned long region_size;
		unsigned long region_start;

		/*
		 * The host has not specified the hot-add region.
		 * Based on the hot-add page range being specified,
		 * compute a hot-add region that can cover the pages
		 * that need to be hot-added while ensuring the alignment
		 * and size requirements of Linux as they relate to hot-add.
		 */
		region_start = pg_start;
		region_size = (pfn_cnt / HA_CHUNK) * HA_CHUNK;
		if (pfn_cnt % HA_CHUNK)
			region_size += HA_CHUNK;

		region_start = (pg_start / HA_CHUNK) * HA_CHUNK;

		rg_start = region_start;
		rg_sz = region_size;
	}

	if (do_hot_add)
		resp.page_count = process_hot_add(pg_start, pfn_cnt,
						rg_start, rg_sz);
#endif
	/*
	 * The result field of the response structure has the
	 * following semantics:
	 *
	 * 1. If all or some pages hot-added: Guest should return success.
	 *
	 * 2. If no pages could be hot-added:
	 *
	 * If the guest returns success, then the host
	 * will not attempt any further hot-add operations. This
	 * signifies a permanent failure.
	 *
	 * If the guest returns failure, then this failure will be
	 * treated as a transient failure and the host may retry the
	 * hot-add operation after some delay.
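	 *
	 * Note that we also report success when hot-add is administratively
	 * disabled (do_hot_add is false), so the host does not keep retrying.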
	 */
	if (resp.page_count > 0)
		resp.result = 1;
	else if (!do_hot_add)
		resp.result = 1;
	else
		resp.result = 0;

	if (!do_hot_add || (resp.page_count == 0))
		pr_info("Memory hot add failed\n");

	dm->state = DM_INITIALIZED;
	vmbus_sendpacket(dm->dev->channel, &resp,
			sizeof(struct dm_hot_add_response),
			(unsigned long)NULL,
			VM_PKT_DATA_INBAND, 0);
}

static void process_info(struct hv_dynmem_device *dm, struct dm_info_msg *msg)
{
	struct dm_info_header *info_hdr;

	info_hdr = (struct dm_info_header *)msg->info;

	switch (info_hdr->type) {
	case INFO_TYPE_MAX_PAGE_CNT:
		pr_info("Received INFO_TYPE_MAX_PAGE_CNT\n");
		pr_info("Data Size is %d\n", info_hdr->data_size);
		break;
	default:
		pr_info("Received Unknown type: %d\n", info_hdr->type);
	}
}

static unsigned long compute_balloon_floor(void)
{
	unsigned long min_pages;
#define MB2PAGES(mb) ((mb) << (20 - PAGE_SHIFT))
	/* Simple continuous piecewise linear function:
	 *  max MiB -> min MiB  gradient
	 *       0         0
	 *      16        16
	 *      32        24
	 *     128        72    (1/2)
	 *     512       168    (1/4)
	 *    2048       360    (1/8)
	 *    8192       552    (1/32)
	 *   32768      1320
	 *  131072      4392
	 */
	if (totalram_pages < MB2PAGES(128))
		min_pages = MB2PAGES(8) + (totalram_pages >> 1);
	else if (totalram_pages < MB2PAGES(512))
		min_pages = MB2PAGES(40) + (totalram_pages >> 2);
	else if (totalram_pages < MB2PAGES(2048))
		min_pages = MB2PAGES(104) + (totalram_pages >> 3);
	else
		min_pages = MB2PAGES(296) + (totalram_pages >> 5);
#undef MB2PAGES
	return min_pages;
}

/*
 * Post our status as it relates to memory pressure to the
 * host. The host expects the guests to post this status
 * periodically at 1 second intervals.
 *
 * The metrics specified in this protocol are very Windows
 * specific and so we cook up numbers here to convey our memory
 * pressure.
 */

static void post_status(struct hv_dynmem_device *dm)
{
	struct dm_status status;
	struct sysinfo val;

	if (pressure_report_delay > 0) {
		--pressure_report_delay;
		return;
	}
	si_meminfo(&val);
	memset(&status, 0, sizeof(struct dm_status));
	status.hdr.type = DM_STATUS_REPORT;
	status.hdr.size = sizeof(struct dm_status);
	status.hdr.trans_id = atomic_inc_return(&trans_id);

	/*
	 * The host expects the guest to report free memory.
	 * Further, the host expects the pressure information to
	 * include the ballooned out pages.
	 * For a given amount of memory that we are managing, we
	 * need to compute a floor below which we should not balloon.
	 * Compute this and add it to the pressure report.
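	 * For example, per the table in compute_balloon_floor(), a guest with
	 * 2048 MiB of RAM gets a floor of MB2PAGES(296) + totalram_pages / 32,
	 * i.e. roughly 360 MiB that we will never balloon out.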
	 */
	status.num_avail = val.freeram;
	status.num_committed = vm_memory_committed() +
				dm->num_pages_ballooned +
				compute_balloon_floor();

	vmbus_sendpacket(dm->dev->channel, &status,
				sizeof(struct dm_status),
				(unsigned long)NULL,
				VM_PKT_DATA_INBAND, 0);

}

static void free_balloon_pages(struct hv_dynmem_device *dm,
			union dm_mem_page_range *range_array)
{
	int num_pages = range_array->finfo.page_cnt;
	__u64 start_frame = range_array->finfo.start_page;
	struct page *pg;
	int i;

	for (i = 0; i < num_pages; i++) {
		pg = pfn_to_page(i + start_frame);
		__free_page(pg);
		dm->num_pages_ballooned--;
	}
}



static int alloc_balloon_pages(struct hv_dynmem_device *dm, int num_pages,
			struct dm_balloon_response *bl_resp, int alloc_unit,
			bool *alloc_error)
{
	int i = 0;
	struct page *pg;

	if (num_pages < alloc_unit)
		return 0;

	for (i = 0; (i * alloc_unit) < num_pages; i++) {
		if (bl_resp->hdr.size + sizeof(union dm_mem_page_range) >
			PAGE_SIZE)
			return i * alloc_unit;

		/*
		 * We execute this code in a thread context. Furthermore,
		 * we don't want the kernel to try too hard.
		 */
		pg = alloc_pages(GFP_HIGHUSER | __GFP_NORETRY |
				__GFP_NOMEMALLOC | __GFP_NOWARN,
				get_order(alloc_unit << PAGE_SHIFT));

		if (!pg) {
			*alloc_error = true;
			return i * alloc_unit;
		}


		dm->num_pages_ballooned += alloc_unit;

		/*
		 * If we allocated 2M pages, split them so we
		 * can free them in any order we get.
		 */

		if (alloc_unit != 1)
			split_page(pg, get_order(alloc_unit << PAGE_SHIFT));

		bl_resp->range_count++;
		bl_resp->range_array[i].finfo.start_page =
			page_to_pfn(pg);
		bl_resp->range_array[i].finfo.page_cnt = alloc_unit;
		bl_resp->hdr.size += sizeof(union dm_mem_page_range);

	}

	return num_pages;
}



static void balloon_up(struct work_struct *dummy)
{
	int num_pages = dm_device.balloon_wrk.num_pages;
	int num_ballooned = 0;
	struct dm_balloon_response *bl_resp;
	int alloc_unit;
	int ret;
	bool alloc_error = false;
	bool done = false;
	int i;


	/*
	 * We will attempt 2M allocations. However, if we fail to
	 * allocate 2M chunks, we will go back to 4k allocations.
	 */
	alloc_unit = 512;

	while (!done) {
		bl_resp = (struct dm_balloon_response *)send_buffer;
		memset(send_buffer, 0, PAGE_SIZE);
		bl_resp->hdr.type = DM_BALLOON_RESPONSE;
		bl_resp->hdr.trans_id = atomic_inc_return(&trans_id);
		bl_resp->hdr.size = sizeof(struct dm_balloon_response);
		bl_resp->more_pages = 1;


		num_pages -= num_ballooned;
		num_ballooned = alloc_balloon_pages(&dm_device, num_pages,
						bl_resp, alloc_unit,
						&alloc_error);

		if ((alloc_error) && (alloc_unit != 1)) {
			alloc_unit = 1;
			continue;
		}

		if ((alloc_error) || (num_ballooned == num_pages)) {
			bl_resp->more_pages = 0;
			done = true;
			dm_device.state = DM_INITIALIZED;
		}

		/*
		 * We are pushing a lot of data through the channel;
		 * deal with transient failures caused by the
		 * lack of space in the ring buffer.
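		 * A return of -EAGAIN from vmbus_sendpacket() indicates the
		 * ring is temporarily full; back off for 20 ms and retry.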
		 */

		do {
			ret = vmbus_sendpacket(dm_device.dev->channel,
						bl_resp,
						bl_resp->hdr.size,
						(unsigned long)NULL,
						VM_PKT_DATA_INBAND, 0);

			if (ret == -EAGAIN)
				msleep(20);

		} while (ret == -EAGAIN);

		if (ret) {
			/*
			 * Free up the memory we allocated.
			 */
			pr_info("Balloon response failed\n");

			for (i = 0; i < bl_resp->range_count; i++)
				free_balloon_pages(&dm_device,
						&bl_resp->range_array[i]);

			done = true;
		}
	}

}

static void balloon_down(struct hv_dynmem_device *dm,
			struct dm_unballoon_request *req)
{
	union dm_mem_page_range *range_array = req->range_array;
	int range_count = req->range_count;
	struct dm_unballoon_response resp;
	int i;

	for (i = 0; i < range_count; i++)
		free_balloon_pages(dm, &range_array[i]);

	if (req->more_pages == 1)
		return;

	memset(&resp, 0, sizeof(struct dm_unballoon_response));
	resp.hdr.type = DM_UNBALLOON_RESPONSE;
	resp.hdr.trans_id = atomic_inc_return(&trans_id);
	resp.hdr.size = sizeof(struct dm_unballoon_response);

	vmbus_sendpacket(dm_device.dev->channel, &resp,
				sizeof(struct dm_unballoon_response),
				(unsigned long)NULL,
				VM_PKT_DATA_INBAND, 0);

	dm->state = DM_INITIALIZED;
}

static void balloon_onchannelcallback(void *context);

static int dm_thread_func(void *dm_dev)
{
	struct hv_dynmem_device *dm = dm_dev;
	int t;

	while (!kthread_should_stop()) {
		t = wait_for_completion_timeout(&dm_device.config_event, 1*HZ);
		/*
		 * The host expects us to post information on the memory
		 * pressure every second.
		 */

		if (t == 0)
			post_status(dm);

	}

	return 0;
}


static void version_resp(struct hv_dynmem_device *dm,
			struct dm_version_response *vresp)
{
	struct dm_version_request version_req;
	int ret;

	if (vresp->is_accepted) {
		/*
		 * We are done; wake up the
		 * context waiting for version
		 * negotiation.
		 */
		complete(&dm->host_event);
		return;
	}
	/*
	 * If there are more versions to try, continue
	 * with negotiations; if not,
	 * shut down the service since we are not able
	 * to negotiate a suitable version number
	 * with the host.
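	 *
	 * There is exactly one fallback version (WIN7); next_version == 0
	 * means we have already tried it.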
	 */
	if (dm->next_version == 0)
		goto version_error;

	dm->next_version = 0;
	memset(&version_req, 0, sizeof(struct dm_version_request));
	version_req.hdr.type = DM_VERSION_REQUEST;
	version_req.hdr.size = sizeof(struct dm_version_request);
	version_req.hdr.trans_id = atomic_inc_return(&trans_id);
	version_req.version.version = DYNMEM_PROTOCOL_VERSION_WIN7;
	version_req.is_last_attempt = 1;

	ret = vmbus_sendpacket(dm->dev->channel, &version_req,
				sizeof(struct dm_version_request),
				(unsigned long)NULL,
				VM_PKT_DATA_INBAND, 0);

	if (ret)
		goto version_error;

	return;

version_error:
	dm->state = DM_INIT_ERROR;
	complete(&dm->host_event);
}

static void cap_resp(struct hv_dynmem_device *dm,
			struct dm_capabilities_resp_msg *cap_resp)
{
	if (!cap_resp->is_accepted) {
		pr_info("Capabilities not accepted by host\n");
		dm->state = DM_INIT_ERROR;
	}
	complete(&dm->host_event);
}

static void balloon_onchannelcallback(void *context)
{
	struct hv_device *dev = context;
	u32 recvlen;
	u64 requestid;
	struct dm_message *dm_msg;
	struct dm_header *dm_hdr;
	struct hv_dynmem_device *dm = hv_get_drvdata(dev);
	struct dm_balloon *bal_msg;
	struct dm_hot_add *ha_msg;
	union dm_mem_page_range *ha_pg_range;
	union dm_mem_page_range *ha_region;

	memset(recv_buffer, 0, sizeof(recv_buffer));
	vmbus_recvpacket(dev->channel, recv_buffer,
			 PAGE_SIZE, &recvlen, &requestid);

	if (recvlen > 0) {
		dm_msg = (struct dm_message *)recv_buffer;
		dm_hdr = &dm_msg->hdr;

		switch (dm_hdr->type) {
		case DM_VERSION_RESPONSE:
			version_resp(dm,
				(struct dm_version_response *)dm_msg);
			break;

		case DM_CAPABILITIES_RESPONSE:
			cap_resp(dm,
				(struct dm_capabilities_resp_msg *)dm_msg);
			break;

		case DM_BALLOON_REQUEST:
			if (dm->state == DM_BALLOON_UP)
				pr_warn("Currently ballooning\n");
			bal_msg = (struct dm_balloon *)recv_buffer;
			dm->state = DM_BALLOON_UP;
			dm_device.balloon_wrk.num_pages = bal_msg->num_pages;
			schedule_work(&dm_device.balloon_wrk.wrk);
			break;

		case DM_UNBALLOON_REQUEST:
			dm->state = DM_BALLOON_DOWN;
			balloon_down(dm,
				(struct dm_unballoon_request *)recv_buffer);
			break;

		case DM_MEM_HOT_ADD_REQUEST:
			if (dm->state == DM_HOT_ADD)
				pr_warn("Currently hot-adding\n");
			dm->state = DM_HOT_ADD;
			ha_msg = (struct dm_hot_add *)recv_buffer;
			if (ha_msg->hdr.size == sizeof(struct dm_hot_add)) {
				/*
				 * This is a normal hot-add request specifying
				 * hot-add memory.
				 */
				ha_pg_range = &ha_msg->range;
				dm->ha_wrk.ha_page_range = *ha_pg_range;
				dm->ha_wrk.ha_region_range.page_range = 0;
			} else {
				/*
				 * Host is specifying that we first hot-add
				 * a region and then partially populate this
				 * region.
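				 * In this case the packet carries two page
				 * ranges back to back: the range to populate,
				 * followed by the enclosing hot-add region.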
				 */
				dm->host_specified_ha_region = true;
				ha_pg_range = &ha_msg->range;
				ha_region = &ha_pg_range[1];
				dm->ha_wrk.ha_page_range = *ha_pg_range;
				dm->ha_wrk.ha_region_range = *ha_region;
			}
			schedule_work(&dm_device.ha_wrk.wrk);
			break;

		case DM_INFO_MESSAGE:
			process_info(dm, (struct dm_info_msg *)dm_msg);
			break;

		default:
			pr_err("Unhandled message: type: %d\n", dm_hdr->type);

		}
	}

}

static int balloon_probe(struct hv_device *dev,
			const struct hv_vmbus_device_id *dev_id)
{
	int ret, t;
	struct dm_version_request version_req;
	struct dm_capabilities cap_msg;

	do_hot_add = hot_add;

	/*
	 * First allocate a send buffer.
	 */

	send_buffer = kmalloc(PAGE_SIZE, GFP_KERNEL);
	if (!send_buffer)
		return -ENOMEM;

	ret = vmbus_open(dev->channel, dm_ring_size, dm_ring_size, NULL, 0,
			balloon_onchannelcallback, dev);

	if (ret)
		goto probe_error0;

	dm_device.dev = dev;
	dm_device.state = DM_INITIALIZING;
	dm_device.next_version = DYNMEM_PROTOCOL_VERSION_WIN7;
	init_completion(&dm_device.host_event);
	init_completion(&dm_device.config_event);
	INIT_LIST_HEAD(&dm_device.ha_region_list);
	INIT_WORK(&dm_device.balloon_wrk.wrk, balloon_up);
	INIT_WORK(&dm_device.ha_wrk.wrk, hot_add_req);
	dm_device.host_specified_ha_region = false;

	dm_device.thread =
		kthread_run(dm_thread_func, &dm_device, "hv_balloon");
	if (IS_ERR(dm_device.thread)) {
		ret = PTR_ERR(dm_device.thread);
		goto probe_error1;
	}

#ifdef CONFIG_MEMORY_HOTPLUG
	set_online_page_callback(&hv_online_page);
#endif

	hv_set_drvdata(dev, &dm_device);
	/*
	 * Initiate the handshake with the host and negotiate
	 * a version that the host can support. We start with the
	 * highest version number and go down if the host cannot
	 * support it.
	 */
	memset(&version_req, 0, sizeof(struct dm_version_request));
	version_req.hdr.type = DM_VERSION_REQUEST;
	version_req.hdr.size = sizeof(struct dm_version_request);
	version_req.hdr.trans_id = atomic_inc_return(&trans_id);
	version_req.version.version = DYNMEM_PROTOCOL_VERSION_WIN8;
	version_req.is_last_attempt = 0;

	ret = vmbus_sendpacket(dev->channel, &version_req,
				sizeof(struct dm_version_request),
				(unsigned long)NULL,
				VM_PKT_DATA_INBAND, 0);
	if (ret)
		goto probe_error2;

	t = wait_for_completion_timeout(&dm_device.host_event, 5*HZ);
	if (t == 0) {
		ret = -ETIMEDOUT;
		goto probe_error2;
	}

	/*
	 * If we could not negotiate a compatible version with the host,
	 * fail the probe function.
	 */
	if (dm_device.state == DM_INIT_ERROR) {
		ret = -ETIMEDOUT;
		goto probe_error2;
	}
	/*
	 * Now submit our capabilities to the host.
	 */
	memset(&cap_msg, 0, sizeof(struct dm_capabilities));
	cap_msg.hdr.type = DM_CAPABILITIES_REPORT;
	cap_msg.hdr.size = sizeof(struct dm_capabilities);
	cap_msg.hdr.trans_id = atomic_inc_return(&trans_id);

	cap_msg.caps.cap_bits.balloon = 1;
	cap_msg.caps.cap_bits.hot_add = 1;

	/*
	 * Specify our alignment requirements as they relate to
	 * memory hot-add. Specify 128MB alignment.
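	 * hot_add_alignment is log2 of the alignment in megabytes (see
	 * union dm_caps), so a value of 7 means 2^7 MB = 128 MB.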
	 */
	cap_msg.caps.cap_bits.hot_add_alignment = 7;

	/*
	 * Currently the host does not use these
	 * values and we set them to what is done in the
	 * Windows driver.
	 */
	cap_msg.min_page_cnt = 0;
	cap_msg.max_page_number = -1;

	ret = vmbus_sendpacket(dev->channel, &cap_msg,
				sizeof(struct dm_capabilities),
				(unsigned long)NULL,
				VM_PKT_DATA_INBAND, 0);
	if (ret)
		goto probe_error2;

	t = wait_for_completion_timeout(&dm_device.host_event, 5*HZ);
	if (t == 0) {
		ret = -ETIMEDOUT;
		goto probe_error2;
	}

	/*
	 * If the host does not like our capabilities,
	 * fail the probe function.
	 */
	if (dm_device.state == DM_INIT_ERROR) {
		ret = -ETIMEDOUT;
		goto probe_error2;
	}

	dm_device.state = DM_INITIALIZED;

	return 0;

probe_error2:
#ifdef CONFIG_MEMORY_HOTPLUG
	restore_online_page_callback(&hv_online_page);
#endif
	kthread_stop(dm_device.thread);

probe_error1:
	vmbus_close(dev->channel);
probe_error0:
	kfree(send_buffer);
	return ret;
}

static int balloon_remove(struct hv_device *dev)
{
	struct hv_dynmem_device *dm = hv_get_drvdata(dev);
	struct list_head *cur, *tmp;
	struct hv_hotadd_state *has;

	if (dm->num_pages_ballooned != 0)
		pr_warn("Ballooned pages: %d\n", dm->num_pages_ballooned);

	cancel_work_sync(&dm->balloon_wrk.wrk);
	cancel_work_sync(&dm->ha_wrk.wrk);

	vmbus_close(dev->channel);
	kthread_stop(dm->thread);
	kfree(send_buffer);
#ifdef CONFIG_MEMORY_HOTPLUG
	restore_online_page_callback(&hv_online_page);
#endif
	list_for_each_safe(cur, tmp, &dm->ha_region_list) {
		has = list_entry(cur, struct hv_hotadd_state, list);
		list_del(&has->list);
		kfree(has);
	}

	return 0;
}

static const struct hv_vmbus_device_id id_table[] = {
	/* Dynamic Memory Class ID */
	/* 525074DC-8985-46e2-8057-A307DC18A502 */
	{ HV_DM_GUID, },
	{ },
};

MODULE_DEVICE_TABLE(vmbus, id_table);

static struct hv_driver balloon_drv = {
	.name = "hv_balloon",
	.id_table = id_table,
	.probe = balloon_probe,
	.remove = balloon_remove,
};

static int __init init_balloon_drv(void)
{

	return vmbus_driver_register(&balloon_drv);
}

module_init(init_balloon_drv);

MODULE_DESCRIPTION("Hyper-V Balloon");
MODULE_VERSION(HV_DRV_VERSION);
MODULE_LICENSE("GPL");