1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright (c) 2012, Microsoft Corporation. 4 * 5 * Author: 6 * K. Y. Srinivasan <kys@microsoft.com> 7 */ 8 9 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 10 11 #include <linux/kernel.h> 12 #include <linux/jiffies.h> 13 #include <linux/mman.h> 14 #include <linux/delay.h> 15 #include <linux/init.h> 16 #include <linux/module.h> 17 #include <linux/slab.h> 18 #include <linux/kthread.h> 19 #include <linux/completion.h> 20 #include <linux/memory_hotplug.h> 21 #include <linux/memory.h> 22 #include <linux/notifier.h> 23 #include <linux/percpu_counter.h> 24 25 #include <linux/hyperv.h> 26 #include <asm/hyperv-tlfs.h> 27 28 #include <asm/mshyperv.h> 29 30 #define CREATE_TRACE_POINTS 31 #include "hv_trace_balloon.h" 32 33 /* 34 * We begin with definitions supporting the Dynamic Memory protocol 35 * with the host. 36 * 37 * Begin protocol definitions. 38 */ 39 40 41 42 /* 43 * Protocol versions. The low word is the minor version, the high word the major 44 * version. 45 * 46 * History: 47 * Initial version 1.0 48 * Changed to 0.1 on 2009/03/25 49 * Changes to 0.2 on 2009/05/14 50 * Changes to 0.3 on 2009/12/03 51 * Changed to 1.0 on 2011/04/05 52 */ 53 54 #define DYNMEM_MAKE_VERSION(Major, Minor) ((__u32)(((Major) << 16) | (Minor))) 55 #define DYNMEM_MAJOR_VERSION(Version) ((__u32)(Version) >> 16) 56 #define DYNMEM_MINOR_VERSION(Version) ((__u32)(Version) & 0xff) 57 58 enum { 59 DYNMEM_PROTOCOL_VERSION_1 = DYNMEM_MAKE_VERSION(0, 3), 60 DYNMEM_PROTOCOL_VERSION_2 = DYNMEM_MAKE_VERSION(1, 0), 61 DYNMEM_PROTOCOL_VERSION_3 = DYNMEM_MAKE_VERSION(2, 0), 62 63 DYNMEM_PROTOCOL_VERSION_WIN7 = DYNMEM_PROTOCOL_VERSION_1, 64 DYNMEM_PROTOCOL_VERSION_WIN8 = DYNMEM_PROTOCOL_VERSION_2, 65 DYNMEM_PROTOCOL_VERSION_WIN10 = DYNMEM_PROTOCOL_VERSION_3, 66 67 DYNMEM_PROTOCOL_VERSION_CURRENT = DYNMEM_PROTOCOL_VERSION_WIN10 68 }; 69 70 71 72 /* 73 * Message Types 74 */ 75 76 enum dm_message_type { 77 /* 78 * Version 0.3 79 */ 80 DM_ERROR = 0, 81 DM_VERSION_REQUEST = 1, 82 DM_VERSION_RESPONSE = 2, 83 DM_CAPABILITIES_REPORT = 3, 84 DM_CAPABILITIES_RESPONSE = 4, 85 DM_STATUS_REPORT = 5, 86 DM_BALLOON_REQUEST = 6, 87 DM_BALLOON_RESPONSE = 7, 88 DM_UNBALLOON_REQUEST = 8, 89 DM_UNBALLOON_RESPONSE = 9, 90 DM_MEM_HOT_ADD_REQUEST = 10, 91 DM_MEM_HOT_ADD_RESPONSE = 11, 92 DM_VERSION_03_MAX = 11, 93 /* 94 * Version 1.0. 95 */ 96 DM_INFO_MESSAGE = 12, 97 DM_VERSION_1_MAX = 12 98 }; 99 100 101 /* 102 * Structures defining the dynamic memory management 103 * protocol. 104 */ 105 106 union dm_version { 107 struct { 108 __u16 minor_version; 109 __u16 major_version; 110 }; 111 __u32 version; 112 } __packed; 113 114 115 union dm_caps { 116 struct { 117 __u64 balloon:1; 118 __u64 hot_add:1; 119 /* 120 * To support guests that may have alignment 121 * limitations on hot-add, the guest can specify 122 * its alignment requirements; a value of n 123 * represents an alignment of 2^n in mega bytes. 124 */ 125 __u64 hot_add_alignment:4; 126 __u64 reservedz:58; 127 } cap_bits; 128 __u64 caps; 129 } __packed; 130 131 union dm_mem_page_range { 132 struct { 133 /* 134 * The PFN number of the first page in the range. 135 * 40 bits is the architectural limit of a PFN 136 * number for AMD64. 137 */ 138 __u64 start_page:40; 139 /* 140 * The number of pages in the range. 141 */ 142 __u64 page_cnt:24; 143 } finfo; 144 __u64 page_range; 145 } __packed; 146 147 148 149 /* 150 * The header for all dynamic memory messages: 151 * 152 * type: Type of the message. 
 * size: Size of the message in bytes; including the header.
 * trans_id: The guest is responsible for manufacturing this ID.
 */

struct dm_header {
	__u16 type;
	__u16 size;
	__u32 trans_id;
} __packed;

/*
 * A generic message format for dynamic memory.
 * Specific message formats are defined later in the file.
 */

struct dm_message {
	struct dm_header hdr;
	__u8 data[]; /* enclosed message */
} __packed;


/*
 * Specific message types supporting the dynamic memory protocol.
 */

/*
 * Version negotiation message. Sent from the guest to the host.
 * The guest is free to try different versions until the host
 * accepts the version.
 *
 * dm_version: The protocol version requested.
 * is_last_attempt: If TRUE, this is the last version the guest will request.
 * reservedz: Reserved field, set to zero.
 */

struct dm_version_request {
	struct dm_header hdr;
	union dm_version version;
	__u32 is_last_attempt:1;
	__u32 reservedz:31;
} __packed;

/*
 * Version response message; sent from the host to the guest and indicates
 * whether the host has accepted the version sent by the guest.
 *
 * is_accepted: If TRUE, the host has accepted the version and the guest
 * should proceed to the next stage of the protocol. FALSE indicates that
 * the guest should re-try with a different version.
 *
 * reservedz: Reserved field, set to zero.
 */

struct dm_version_response {
	struct dm_header hdr;
	__u64 is_accepted:1;
	__u64 reservedz:63;
} __packed;

/*
 * Message reporting capabilities. This is sent from the guest to the
 * host.
 */

struct dm_capabilities {
	struct dm_header hdr;
	union dm_caps caps;
	__u64 min_page_cnt;
	__u64 max_page_number;
} __packed;

/*
 * Response to the capabilities message. This is sent from the host to the
 * guest. This message notifies if the host has accepted the guest's
 * capabilities. If the host has not accepted, the guest must shut down
 * the service.
 *
 * is_accepted: Indicates if the host has accepted the guest's capabilities.
 * reservedz: Must be 0.
 */

struct dm_capabilities_resp_msg {
	struct dm_header hdr;
	__u64 is_accepted:1;
	__u64 reservedz:63;
} __packed;

/*
 * This message is used to report memory pressure from the guest.
 * This message is not part of any transaction and there is no
 * response to this message.
 *
 * num_avail: Available memory in pages.
 * num_committed: Committed memory in pages.
 * page_file_size: The accumulated size of all page files
 *		   in the system in pages.
 * zero_free: The number of zero and free pages.
 * page_file_writes: The writes to the page file in pages.
 * io_diff: An indicator of file cache efficiency or page file activity,
 *	    calculated as File Cache Page Fault Count - Page Read Count.
 *	    This value is in pages.
 *
 * Some of these metrics are Windows specific and fortunately
 * the algorithm on the host side that computes the guest memory
 * pressure only uses the num_committed value.
 */

struct dm_status {
	struct dm_header hdr;
	__u64 num_avail;
	__u64 num_committed;
	__u64 page_file_size;
	__u64 zero_free;
	__u32 page_file_writes;
	__u32 io_diff;
} __packed;

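/*
 * Illustrative note (assumes a little-endian build, which Hyper-V guests
 * are; the variable names are only for the example): the balloon and
 * unballoon messages below carry arrays of union dm_mem_page_range, where
 * the 40-bit start_page occupies the low bits of the __u64 and the 24-bit
 * page_cnt the high bits. For example, a 512-page range starting at
 * pfn 0x100000 packs as:
 *
 *	union dm_mem_page_range range;
 *
 *	range.finfo.start_page = 0x100000;
 *	range.finfo.page_cnt   = 512;
 *	// range.page_range == (512ULL << 40) | 0x100000
 */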
/*
 * Message to ask the guest to allocate memory - balloon up message.
 * This message is sent from the host to the guest. The guest may not be
 * able to allocate as much memory as requested.
 *
 * num_pages: number of pages to allocate.
 */

struct dm_balloon {
	struct dm_header hdr;
	__u32 num_pages;
	__u32 reservedz;
} __packed;


/*
 * Balloon response message; this message is sent from the guest
 * to the host in response to the balloon message.
 *
 * reservedz: Reserved; must be set to zero.
 * more_pages: If FALSE, this is the last message of the transaction.
 * If TRUE, there will be at least one more message from the guest.
 *
 * range_count: The number of ranges in the range array.
 *
 * range_array: An array of page ranges returned to the host.
 *
 */

struct dm_balloon_response {
	struct dm_header hdr;
	__u32 reservedz;
	__u32 more_pages:1;
	__u32 range_count:31;
	union dm_mem_page_range range_array[];
} __packed;

/*
 * Un-balloon message; this message is sent from the host
 * to the guest to give the guest more memory.
 *
 * more_pages: If FALSE, this is the last message of the transaction.
 * If TRUE, there will be at least one more message from the guest.
 *
 * reservedz: Reserved; must be set to zero.
 *
 * range_count: The number of ranges in the range array.
 *
 * range_array: An array of page ranges returned to the host.
 *
 */

struct dm_unballoon_request {
	struct dm_header hdr;
	__u32 more_pages:1;
	__u32 reservedz:31;
	__u32 range_count;
	union dm_mem_page_range range_array[];
} __packed;

/*
 * Un-balloon response message; this message is sent from the guest
 * to the host in response to an unballoon request.
 *
 */

struct dm_unballoon_response {
	struct dm_header hdr;
} __packed;


/*
 * Hot add request message. Message sent from the host to the guest.
 *
 * mem_range: Memory range to hot add.
 *
 */

struct dm_hot_add {
	struct dm_header hdr;
	union dm_mem_page_range range;
} __packed;

/*
 * Hot add response message.
 * This message is sent by the guest to report the status of a hot add request.
 * If page_count is less than the requested page count, then the host should
 * assume all further hot add requests will fail, since this indicates that
 * the guest has hit an upper physical memory barrier.
 *
 * Hot adds may also fail due to low resources; in this case, the guest must
 * not complete this message until the hot add can succeed, and the host must
 * not send a new hot add request until the response is sent.
 * If VSC fails to hot add memory DYNMEM_NUMBER_OF_UNSUCCESSFUL_HOTADD_ATTEMPTS
 * times it fails the request.
 *
 *
 * page_count: number of pages that were successfully hot added.
 *
 * result: result of the operation: 1 for success, 0 for failure.
 *
 */

struct dm_hot_add_response {
	struct dm_header hdr;
	__u32 page_count;
	__u32 result;
} __packed;

/*
 * Types of information sent from host to the guest.
 */

enum dm_info_type {
	INFO_TYPE_MAX_PAGE_CNT = 0,
	MAX_INFO_TYPE
};


/*
 * Header for the information message.
 */

struct dm_info_header {
	enum dm_info_type type;
	__u32 data_size;
} __packed;

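/*
 * Illustrative sketch (this mirrors what process_info() later in the file
 * does; ih and max_pages are names used only for the example): an
 * information blob is a dm_info_header followed by data_size bytes of
 * payload, so a max-page-count notification can be read as:
 *
 *	struct dm_info_header *ih = (struct dm_info_header *)msg->info;
 *
 *	if (ih->type == INFO_TYPE_MAX_PAGE_CNT &&
 *	    ih->data_size == sizeof(__u64))
 *		max_pages = *(__u64 *)&ih[1];	// payload follows the header
 */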
/*
 * This message is sent from the host to the guest to pass
 * some relevant information (win8 addition).
 *
 * reserved: not used.
 * info_size: size of the information blob.
 * info: information blob.
 */

struct dm_info_msg {
	struct dm_header hdr;
	__u32 reserved;
	__u32 info_size;
	__u8  info[];
};

/*
 * End protocol definitions.
 */

/*
 * State to manage hot adding memory into the guest.
 * The range start_pfn : end_pfn specifies the range
 * that the host has asked us to hot add. The range
 * start_pfn : ha_end_pfn specifies the range that we have
 * currently hot added. We hot add in multiples of 128M
 * chunks; it is possible that we may not be able to bring
 * online all the pages in the region. The range
 * covered_start_pfn:covered_end_pfn defines the pages that can
 * be brought online.
 */

struct hv_hotadd_state {
	struct list_head list;
	unsigned long start_pfn;
	unsigned long covered_start_pfn;
	unsigned long covered_end_pfn;
	unsigned long ha_end_pfn;
	unsigned long end_pfn;
	/*
	 * A list of gaps.
	 */
	struct list_head gap_list;
};

struct hv_hotadd_gap {
	struct list_head list;
	unsigned long start_pfn;
	unsigned long end_pfn;
};

struct balloon_state {
	__u32 num_pages;
	struct work_struct wrk;
};

struct hot_add_wrk {
	union dm_mem_page_range ha_page_range;
	union dm_mem_page_range ha_region_range;
	struct work_struct wrk;
};

static bool allow_hibernation;
static bool hot_add = true;
static bool do_hot_add;
/*
 * Delay reporting memory pressure by
 * the specified number of seconds.
 */
static uint pressure_report_delay = 45;

/*
 * The last time we posted a pressure report to the host.
 */
static unsigned long last_post_time;

module_param(hot_add, bool, (S_IRUGO | S_IWUSR));
MODULE_PARM_DESC(hot_add, "If set attempt memory hot_add");

module_param(pressure_report_delay, uint, (S_IRUGO | S_IWUSR));
MODULE_PARM_DESC(pressure_report_delay, "Delay in secs in reporting pressure");
static atomic_t trans_id = ATOMIC_INIT(0);

static int dm_ring_size = 20 * 1024;

/*
 * Driver specific state.
 */

enum hv_dm_state {
	DM_INITIALIZING = 0,
	DM_INITIALIZED,
	DM_BALLOON_UP,
	DM_BALLOON_DOWN,
	DM_HOT_ADD,
	DM_INIT_ERROR
};


static __u8 recv_buffer[HV_HYP_PAGE_SIZE];
static __u8 balloon_up_send_buffer[HV_HYP_PAGE_SIZE];
#define PAGES_IN_2M	(2 * 1024 * 1024 / PAGE_SIZE)
#define HA_CHUNK	(128 * 1024 * 1024 / PAGE_SIZE)

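/*
 * Illustrative note (assuming a 4 KiB PAGE_SIZE): PAGES_IN_2M is 512 and
 * HA_CHUNK is 32768 pfns, i.e. hot-add is always carried out in 128 MiB
 * units. A request that is not a multiple of HA_CHUNK is rounded up, in
 * the style used by handle_pg_range() below:
 *
 *	size = (pfn_cnt / HA_CHUNK) * HA_CHUNK;
 *	if (pfn_cnt % HA_CHUNK)
 *		size += HA_CHUNK;	// e.g. 40000 pfns -> 65536 pfns (256 MiB)
 */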
struct hv_dynmem_device {
	struct hv_device *dev;
	enum hv_dm_state state;
	struct completion host_event;
	struct completion config_event;

	/*
	 * Number of pages we have currently ballooned out.
	 */
	unsigned int num_pages_ballooned;
	unsigned int num_pages_onlined;
	unsigned int num_pages_added;

	/*
	 * State to manage the ballooning (up) operation.
	 */
	struct balloon_state balloon_wrk;

	/*
	 * State to execute the "hot-add" operation.
	 */
	struct hot_add_wrk ha_wrk;

	/*
	 * This state tracks if the host has specified a hot-add
	 * region.
	 */
	bool host_specified_ha_region;

	/*
	 * State to synchronize hot-add.
	 */
	struct completion ol_waitevent;
	bool ha_waiting;
	/*
	 * This thread handles hot-add
	 * requests from the host as well as notifying
	 * the host with regards to memory pressure in
	 * the guest.
	 */
	struct task_struct *thread;

	/*
	 * Protects ha_region_list, num_pages_onlined counter and individual
	 * regions from ha_region_list.
	 */
	spinlock_t ha_lock;

	/*
	 * A list of hot-add regions.
	 */
	struct list_head ha_region_list;

	/*
	 * We start with the highest version we can support
	 * and downgrade based on the host; we save here the
	 * next version to try.
	 */
	__u32 next_version;

	/*
	 * The negotiated version agreed by host.
	 */
	__u32 version;
};

static struct hv_dynmem_device dm_device;

static void post_status(struct hv_dynmem_device *dm);

#ifdef CONFIG_MEMORY_HOTPLUG
static inline bool has_pfn_is_backed(struct hv_hotadd_state *has,
				     unsigned long pfn)
{
	struct hv_hotadd_gap *gap;

	/* The page is not backed. */
	if ((pfn < has->covered_start_pfn) || (pfn >= has->covered_end_pfn))
		return false;

	/* Check for gaps. */
	list_for_each_entry(gap, &has->gap_list, list) {
		if ((pfn >= gap->start_pfn) && (pfn < gap->end_pfn))
			return false;
	}

	return true;
}

static unsigned long hv_page_offline_check(unsigned long start_pfn,
					   unsigned long nr_pages)
{
	unsigned long pfn = start_pfn, count = 0;
	struct hv_hotadd_state *has;
	bool found;

	while (pfn < start_pfn + nr_pages) {
		/*
		 * Search for HAS which covers the pfn and when we find one
		 * count how many consecutive PFNs are covered.
		 */
		found = false;
		list_for_each_entry(has, &dm_device.ha_region_list, list) {
			while ((pfn >= has->start_pfn) &&
			       (pfn < has->end_pfn) &&
			       (pfn < start_pfn + nr_pages)) {
				found = true;
				if (has_pfn_is_backed(has, pfn))
					count++;
				pfn++;
			}
		}

		/*
		 * This PFN is not in any HAS (e.g. we're offlining a region
		 * which was present at boot), no need to account for it. Go
		 * to the next one.
		 */
		if (!found)
			pfn++;
	}

	return count;
}

static int hv_memory_notifier(struct notifier_block *nb, unsigned long val,
			      void *v)
{
	struct memory_notify *mem = (struct memory_notify *)v;
	unsigned long flags, pfn_count;

	switch (val) {
	case MEM_ONLINE:
	case MEM_CANCEL_ONLINE:
		if (dm_device.ha_waiting) {
			dm_device.ha_waiting = false;
			complete(&dm_device.ol_waitevent);
		}
		break;

	case MEM_OFFLINE:
		spin_lock_irqsave(&dm_device.ha_lock, flags);
		pfn_count = hv_page_offline_check(mem->start_pfn,
						  mem->nr_pages);
		if (pfn_count <= dm_device.num_pages_onlined) {
			dm_device.num_pages_onlined -= pfn_count;
		} else {
			/*
			 * We're offlining more pages than we managed to online.
			 * This is unexpected. In any case don't let
			 * num_pages_onlined wrap around zero.
			 */
			WARN_ON_ONCE(1);
			dm_device.num_pages_onlined = 0;
		}
		spin_unlock_irqrestore(&dm_device.ha_lock, flags);
		break;
	case MEM_GOING_ONLINE:
	case MEM_GOING_OFFLINE:
	case MEM_CANCEL_OFFLINE:
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block hv_memory_nb = {
	.notifier_call = hv_memory_notifier,
	.priority = 0
};

/* Check if the particular page is backed and can be onlined and online it.
*/ 674 static void hv_page_online_one(struct hv_hotadd_state *has, struct page *pg) 675 { 676 if (!has_pfn_is_backed(has, page_to_pfn(pg))) { 677 if (!PageOffline(pg)) 678 __SetPageOffline(pg); 679 return; 680 } 681 if (PageOffline(pg)) 682 __ClearPageOffline(pg); 683 684 /* This frame is currently backed; online the page. */ 685 generic_online_page(pg, 0); 686 687 lockdep_assert_held(&dm_device.ha_lock); 688 dm_device.num_pages_onlined++; 689 } 690 691 static void hv_bring_pgs_online(struct hv_hotadd_state *has, 692 unsigned long start_pfn, unsigned long size) 693 { 694 int i; 695 696 pr_debug("Online %lu pages starting at pfn 0x%lx\n", size, start_pfn); 697 for (i = 0; i < size; i++) 698 hv_page_online_one(has, pfn_to_page(start_pfn + i)); 699 } 700 701 static void hv_mem_hot_add(unsigned long start, unsigned long size, 702 unsigned long pfn_count, 703 struct hv_hotadd_state *has) 704 { 705 int ret = 0; 706 int i, nid; 707 unsigned long start_pfn; 708 unsigned long processed_pfn; 709 unsigned long total_pfn = pfn_count; 710 unsigned long flags; 711 712 for (i = 0; i < (size/HA_CHUNK); i++) { 713 start_pfn = start + (i * HA_CHUNK); 714 715 spin_lock_irqsave(&dm_device.ha_lock, flags); 716 has->ha_end_pfn += HA_CHUNK; 717 718 if (total_pfn > HA_CHUNK) { 719 processed_pfn = HA_CHUNK; 720 total_pfn -= HA_CHUNK; 721 } else { 722 processed_pfn = total_pfn; 723 total_pfn = 0; 724 } 725 726 has->covered_end_pfn += processed_pfn; 727 spin_unlock_irqrestore(&dm_device.ha_lock, flags); 728 729 init_completion(&dm_device.ol_waitevent); 730 dm_device.ha_waiting = !memhp_auto_online; 731 732 nid = memory_add_physaddr_to_nid(PFN_PHYS(start_pfn)); 733 ret = add_memory(nid, PFN_PHYS((start_pfn)), 734 (HA_CHUNK << PAGE_SHIFT)); 735 736 if (ret) { 737 pr_err("hot_add memory failed error is %d\n", ret); 738 if (ret == -EEXIST) { 739 /* 740 * This error indicates that the error 741 * is not a transient failure. This is the 742 * case where the guest's physical address map 743 * precludes hot adding memory. Stop all further 744 * memory hot-add. 745 */ 746 do_hot_add = false; 747 } 748 spin_lock_irqsave(&dm_device.ha_lock, flags); 749 has->ha_end_pfn -= HA_CHUNK; 750 has->covered_end_pfn -= processed_pfn; 751 spin_unlock_irqrestore(&dm_device.ha_lock, flags); 752 break; 753 } 754 755 /* 756 * Wait for the memory block to be onlined when memory onlining 757 * is done outside of kernel (memhp_auto_online). Since the hot 758 * add has succeeded, it is ok to proceed even if the pages in 759 * the hot added region have not been "onlined" within the 760 * allowed time. 761 */ 762 if (dm_device.ha_waiting) 763 wait_for_completion_timeout(&dm_device.ol_waitevent, 764 5*HZ); 765 post_status(&dm_device); 766 } 767 } 768 769 static void hv_online_page(struct page *pg, unsigned int order) 770 { 771 struct hv_hotadd_state *has; 772 unsigned long flags; 773 unsigned long pfn = page_to_pfn(pg); 774 775 spin_lock_irqsave(&dm_device.ha_lock, flags); 776 list_for_each_entry(has, &dm_device.ha_region_list, list) { 777 /* The page belongs to a different HAS. 
*/ 778 if ((pfn < has->start_pfn) || 779 (pfn + (1UL << order) > has->end_pfn)) 780 continue; 781 782 hv_bring_pgs_online(has, pfn, 1UL << order); 783 break; 784 } 785 spin_unlock_irqrestore(&dm_device.ha_lock, flags); 786 } 787 788 static int pfn_covered(unsigned long start_pfn, unsigned long pfn_cnt) 789 { 790 struct hv_hotadd_state *has; 791 struct hv_hotadd_gap *gap; 792 unsigned long residual, new_inc; 793 int ret = 0; 794 unsigned long flags; 795 796 spin_lock_irqsave(&dm_device.ha_lock, flags); 797 list_for_each_entry(has, &dm_device.ha_region_list, list) { 798 /* 799 * If the pfn range we are dealing with is not in the current 800 * "hot add block", move on. 801 */ 802 if (start_pfn < has->start_pfn || start_pfn >= has->end_pfn) 803 continue; 804 805 /* 806 * If the current start pfn is not where the covered_end 807 * is, create a gap and update covered_end_pfn. 808 */ 809 if (has->covered_end_pfn != start_pfn) { 810 gap = kzalloc(sizeof(struct hv_hotadd_gap), GFP_ATOMIC); 811 if (!gap) { 812 ret = -ENOMEM; 813 break; 814 } 815 816 INIT_LIST_HEAD(&gap->list); 817 gap->start_pfn = has->covered_end_pfn; 818 gap->end_pfn = start_pfn; 819 list_add_tail(&gap->list, &has->gap_list); 820 821 has->covered_end_pfn = start_pfn; 822 } 823 824 /* 825 * If the current hot add-request extends beyond 826 * our current limit; extend it. 827 */ 828 if ((start_pfn + pfn_cnt) > has->end_pfn) { 829 residual = (start_pfn + pfn_cnt - has->end_pfn); 830 /* 831 * Extend the region by multiples of HA_CHUNK. 832 */ 833 new_inc = (residual / HA_CHUNK) * HA_CHUNK; 834 if (residual % HA_CHUNK) 835 new_inc += HA_CHUNK; 836 837 has->end_pfn += new_inc; 838 } 839 840 ret = 1; 841 break; 842 } 843 spin_unlock_irqrestore(&dm_device.ha_lock, flags); 844 845 return ret; 846 } 847 848 static unsigned long handle_pg_range(unsigned long pg_start, 849 unsigned long pg_count) 850 { 851 unsigned long start_pfn = pg_start; 852 unsigned long pfn_cnt = pg_count; 853 unsigned long size; 854 struct hv_hotadd_state *has; 855 unsigned long pgs_ol = 0; 856 unsigned long old_covered_state; 857 unsigned long res = 0, flags; 858 859 pr_debug("Hot adding %lu pages starting at pfn 0x%lx.\n", pg_count, 860 pg_start); 861 862 spin_lock_irqsave(&dm_device.ha_lock, flags); 863 list_for_each_entry(has, &dm_device.ha_region_list, list) { 864 /* 865 * If the pfn range we are dealing with is not in the current 866 * "hot add block", move on. 867 */ 868 if (start_pfn < has->start_pfn || start_pfn >= has->end_pfn) 869 continue; 870 871 old_covered_state = has->covered_end_pfn; 872 873 if (start_pfn < has->ha_end_pfn) { 874 /* 875 * This is the case where we are backing pages 876 * in an already hot added region. Bring 877 * these pages online first. 878 */ 879 pgs_ol = has->ha_end_pfn - start_pfn; 880 if (pgs_ol > pfn_cnt) 881 pgs_ol = pfn_cnt; 882 883 has->covered_end_pfn += pgs_ol; 884 pfn_cnt -= pgs_ol; 885 /* 886 * Check if the corresponding memory block is already 887 * online. It is possible to observe struct pages still 888 * being uninitialized here so check section instead. 889 * In case the section is online we need to bring the 890 * rest of pfns (which were not backed previously) 891 * online too. 892 */ 893 if (start_pfn > has->start_pfn && 894 online_section_nr(pfn_to_section_nr(start_pfn))) 895 hv_bring_pgs_online(has, start_pfn, pgs_ol); 896 897 } 898 899 if ((has->ha_end_pfn < has->end_pfn) && (pfn_cnt > 0)) { 900 /* 901 * We have some residual hot add range 902 * that needs to be hot added; hot add 903 * it now. 
Hot add a multiple of
			 * HA_CHUNK that fully covers the pages
			 * we have.
			 */
			size = (has->end_pfn - has->ha_end_pfn);
			if (pfn_cnt <= size) {
				size = ((pfn_cnt / HA_CHUNK) * HA_CHUNK);
				if (pfn_cnt % HA_CHUNK)
					size += HA_CHUNK;
			} else {
				pfn_cnt = size;
			}
			spin_unlock_irqrestore(&dm_device.ha_lock, flags);
			hv_mem_hot_add(has->ha_end_pfn, size, pfn_cnt, has);
			spin_lock_irqsave(&dm_device.ha_lock, flags);
		}
		/*
		 * If we managed to online any pages that were given to us,
		 * we declare success.
		 */
		res = has->covered_end_pfn - old_covered_state;
		break;
	}
	spin_unlock_irqrestore(&dm_device.ha_lock, flags);

	return res;
}

static unsigned long process_hot_add(unsigned long pg_start,
				     unsigned long pfn_cnt,
				     unsigned long rg_start,
				     unsigned long rg_size)
{
	struct hv_hotadd_state *ha_region = NULL;
	int covered;
	unsigned long flags;

	if (pfn_cnt == 0)
		return 0;

	if (!dm_device.host_specified_ha_region) {
		covered = pfn_covered(pg_start, pfn_cnt);
		if (covered < 0)
			return 0;

		if (covered)
			goto do_pg_range;
	}

	/*
	 * If the host has specified a hot-add range; deal with it first.
	 */

	if (rg_size != 0) {
		ha_region = kzalloc(sizeof(struct hv_hotadd_state), GFP_KERNEL);
		if (!ha_region)
			return 0;

		INIT_LIST_HEAD(&ha_region->list);
		INIT_LIST_HEAD(&ha_region->gap_list);

		ha_region->start_pfn = rg_start;
		ha_region->ha_end_pfn = rg_start;
		ha_region->covered_start_pfn = pg_start;
		ha_region->covered_end_pfn = pg_start;
		ha_region->end_pfn = rg_start + rg_size;

		spin_lock_irqsave(&dm_device.ha_lock, flags);
		list_add_tail(&ha_region->list, &dm_device.ha_region_list);
		spin_unlock_irqrestore(&dm_device.ha_lock, flags);
	}

do_pg_range:
	/*
	 * Process the page range specified; bringing them
	 * online if possible.
	 */
	return handle_pg_range(pg_start, pfn_cnt);
}

#endif

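/*
 * Illustrative sketch (assuming a 4 KiB PAGE_SIZE, so HA_CHUNK == 32768
 * pfns; the numbers are only an example): when the host does not name a
 * hot-add region, hot_add_req() below derives one by aligning the start
 * down to HA_CHUNK and rounding the requested page count up to HA_CHUNK:
 *
 *	pg_start = 0x48200, pfn_cnt = 40000
 *	rg_start = (0x48200 / HA_CHUNK) * HA_CHUNK = 0x48000
 *	rg_sz    = (40000 / HA_CHUNK + 1) * HA_CHUNK = 65536  (two 128 MiB chunks)
 */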
static void hot_add_req(struct work_struct *dummy)
{
	struct dm_hot_add_response resp;
#ifdef CONFIG_MEMORY_HOTPLUG
	unsigned long pg_start, pfn_cnt;
	unsigned long rg_start, rg_sz;
#endif
	struct hv_dynmem_device *dm = &dm_device;

	memset(&resp, 0, sizeof(struct dm_hot_add_response));
	resp.hdr.type = DM_MEM_HOT_ADD_RESPONSE;
	resp.hdr.size = sizeof(struct dm_hot_add_response);

#ifdef CONFIG_MEMORY_HOTPLUG
	pg_start = dm->ha_wrk.ha_page_range.finfo.start_page;
	pfn_cnt = dm->ha_wrk.ha_page_range.finfo.page_cnt;

	rg_start = dm->ha_wrk.ha_region_range.finfo.start_page;
	rg_sz = dm->ha_wrk.ha_region_range.finfo.page_cnt;

	if ((rg_start == 0) && (!dm->host_specified_ha_region)) {
		unsigned long region_size;
		unsigned long region_start;

		/*
		 * The host has not specified the hot-add region.
		 * Based on the hot-add page range being specified,
		 * compute a hot-add region that can cover the pages
		 * that need to be hot-added while ensuring the alignment
		 * and size requirements of Linux as it relates to hot-add.
		 */
		region_start = pg_start;
		region_size = (pfn_cnt / HA_CHUNK) * HA_CHUNK;
		if (pfn_cnt % HA_CHUNK)
			region_size += HA_CHUNK;

		region_start = (pg_start / HA_CHUNK) * HA_CHUNK;

		rg_start = region_start;
		rg_sz = region_size;
	}

	if (do_hot_add)
		resp.page_count = process_hot_add(pg_start, pfn_cnt,
						  rg_start, rg_sz);

	dm->num_pages_added += resp.page_count;
#endif
	/*
	 * The result field of the response structure has the
	 * following semantics:
	 *
	 * 1. If all or some pages hot-added: Guest should return success.
	 *
	 * 2. If no pages could be hot-added:
	 *
	 * If the guest returns success, then the host
	 * will not attempt any further hot-add operations. This
	 * signifies a permanent failure.
	 *
	 * If the guest returns failure, then this failure will be
	 * treated as a transient failure and the host may retry the
	 * hot-add operation after some delay.
	 */
	if (resp.page_count > 0)
		resp.result = 1;
	else if (!do_hot_add)
		resp.result = 1;
	else
		resp.result = 0;

	if (!do_hot_add || resp.page_count == 0) {
		if (!allow_hibernation)
			pr_err("Memory hot add failed\n");
		else
			pr_info("Ignore hot-add request!\n");
	}

	dm->state = DM_INITIALIZED;
	resp.hdr.trans_id = atomic_inc_return(&trans_id);
	vmbus_sendpacket(dm->dev->channel, &resp,
			 sizeof(struct dm_hot_add_response),
			 (unsigned long)NULL,
			 VM_PKT_DATA_INBAND, 0);
}

static void process_info(struct hv_dynmem_device *dm, struct dm_info_msg *msg)
{
	struct dm_info_header *info_hdr;

	info_hdr = (struct dm_info_header *)msg->info;

	switch (info_hdr->type) {
	case INFO_TYPE_MAX_PAGE_CNT:
		if (info_hdr->data_size == sizeof(__u64)) {
			__u64 *max_page_count = (__u64 *)&info_hdr[1];

			pr_info("Max. dynamic memory size: %llu MB\n",
				(*max_page_count) >> (20 - HV_HYP_PAGE_SHIFT));
		}

		break;
	default:
		pr_warn("Received Unknown type: %d\n", info_hdr->type);
	}
}

static unsigned long compute_balloon_floor(void)
{
	unsigned long min_pages;
	unsigned long nr_pages = totalram_pages();
#define MB2PAGES(mb) ((mb) << (20 - PAGE_SHIFT))
	/* Simple continuous piecewise linear function:
	 *  max MiB -> min MiB	gradient
	 *       0         0
	 *      16        16
	 *      32        24
	 *     128        72	(1/2)
	 *     512       168	(1/4)
	 *    2048       360	(1/8)
	 *    8192       744	(1/16)
	 *   32768      1512	(1/32)
	 */
	if (nr_pages < MB2PAGES(128))
		min_pages = MB2PAGES(8) + (nr_pages >> 1);
	else if (nr_pages < MB2PAGES(512))
		min_pages = MB2PAGES(40) + (nr_pages >> 2);
	else if (nr_pages < MB2PAGES(2048))
		min_pages = MB2PAGES(104) + (nr_pages >> 3);
	else if (nr_pages < MB2PAGES(8192))
		min_pages = MB2PAGES(232) + (nr_pages >> 4);
	else
		min_pages = MB2PAGES(488) + (nr_pages >> 5);
#undef MB2PAGES
	return min_pages;
}

/*
 * Post our status as it relates to memory pressure to the
 * host. The host expects the guests to post this status
 * periodically at 1 second intervals.
 *
 * The metrics specified in this protocol are very Windows
 * specific and so we cook up numbers here to convey our memory
 * pressure.
1130 */ 1131 1132 static void post_status(struct hv_dynmem_device *dm) 1133 { 1134 struct dm_status status; 1135 unsigned long now = jiffies; 1136 unsigned long last_post = last_post_time; 1137 1138 if (pressure_report_delay > 0) { 1139 --pressure_report_delay; 1140 return; 1141 } 1142 1143 if (!time_after(now, (last_post_time + HZ))) 1144 return; 1145 1146 memset(&status, 0, sizeof(struct dm_status)); 1147 status.hdr.type = DM_STATUS_REPORT; 1148 status.hdr.size = sizeof(struct dm_status); 1149 status.hdr.trans_id = atomic_inc_return(&trans_id); 1150 1151 /* 1152 * The host expects the guest to report free and committed memory. 1153 * Furthermore, the host expects the pressure information to include 1154 * the ballooned out pages. For a given amount of memory that we are 1155 * managing we need to compute a floor below which we should not 1156 * balloon. Compute this and add it to the pressure report. 1157 * We also need to report all offline pages (num_pages_added - 1158 * num_pages_onlined) as committed to the host, otherwise it can try 1159 * asking us to balloon them out. 1160 */ 1161 status.num_avail = si_mem_available(); 1162 status.num_committed = vm_memory_committed() + 1163 dm->num_pages_ballooned + 1164 (dm->num_pages_added > dm->num_pages_onlined ? 1165 dm->num_pages_added - dm->num_pages_onlined : 0) + 1166 compute_balloon_floor(); 1167 1168 trace_balloon_status(status.num_avail, status.num_committed, 1169 vm_memory_committed(), dm->num_pages_ballooned, 1170 dm->num_pages_added, dm->num_pages_onlined); 1171 /* 1172 * If our transaction ID is no longer current, just don't 1173 * send the status. This can happen if we were interrupted 1174 * after we picked our transaction ID. 1175 */ 1176 if (status.hdr.trans_id != atomic_read(&trans_id)) 1177 return; 1178 1179 /* 1180 * If the last post time that we sampled has changed, 1181 * we have raced, don't post the status. 1182 */ 1183 if (last_post != last_post_time) 1184 return; 1185 1186 last_post_time = jiffies; 1187 vmbus_sendpacket(dm->dev->channel, &status, 1188 sizeof(struct dm_status), 1189 (unsigned long)NULL, 1190 VM_PKT_DATA_INBAND, 0); 1191 1192 } 1193 1194 static void free_balloon_pages(struct hv_dynmem_device *dm, 1195 union dm_mem_page_range *range_array) 1196 { 1197 int num_pages = range_array->finfo.page_cnt; 1198 __u64 start_frame = range_array->finfo.start_page; 1199 struct page *pg; 1200 int i; 1201 1202 for (i = 0; i < num_pages; i++) { 1203 pg = pfn_to_page(i + start_frame); 1204 __ClearPageOffline(pg); 1205 __free_page(pg); 1206 dm->num_pages_ballooned--; 1207 } 1208 } 1209 1210 1211 1212 static unsigned int alloc_balloon_pages(struct hv_dynmem_device *dm, 1213 unsigned int num_pages, 1214 struct dm_balloon_response *bl_resp, 1215 int alloc_unit) 1216 { 1217 unsigned int i, j; 1218 struct page *pg; 1219 1220 if (num_pages < alloc_unit) 1221 return 0; 1222 1223 for (i = 0; (i * alloc_unit) < num_pages; i++) { 1224 if (bl_resp->hdr.size + sizeof(union dm_mem_page_range) > 1225 HV_HYP_PAGE_SIZE) 1226 return i * alloc_unit; 1227 1228 /* 1229 * We execute this code in a thread context. Furthermore, 1230 * we don't want the kernel to try too hard. 1231 */ 1232 pg = alloc_pages(GFP_HIGHUSER | __GFP_NORETRY | 1233 __GFP_NOMEMALLOC | __GFP_NOWARN, 1234 get_order(alloc_unit << PAGE_SHIFT)); 1235 1236 if (!pg) 1237 return i * alloc_unit; 1238 1239 dm->num_pages_ballooned += alloc_unit; 1240 1241 /* 1242 * If we allocatted 2M pages; split them so we 1243 * can free them in any order we get. 
1244 */ 1245 1246 if (alloc_unit != 1) 1247 split_page(pg, get_order(alloc_unit << PAGE_SHIFT)); 1248 1249 /* mark all pages offline */ 1250 for (j = 0; j < (1 << get_order(alloc_unit << PAGE_SHIFT)); j++) 1251 __SetPageOffline(pg + j); 1252 1253 bl_resp->range_count++; 1254 bl_resp->range_array[i].finfo.start_page = 1255 page_to_pfn(pg); 1256 bl_resp->range_array[i].finfo.page_cnt = alloc_unit; 1257 bl_resp->hdr.size += sizeof(union dm_mem_page_range); 1258 1259 } 1260 1261 return num_pages; 1262 } 1263 1264 static void balloon_up(struct work_struct *dummy) 1265 { 1266 unsigned int num_pages = dm_device.balloon_wrk.num_pages; 1267 unsigned int num_ballooned = 0; 1268 struct dm_balloon_response *bl_resp; 1269 int alloc_unit; 1270 int ret; 1271 bool done = false; 1272 int i; 1273 long avail_pages; 1274 unsigned long floor; 1275 1276 /* The host balloons pages in 2M granularity. */ 1277 WARN_ON_ONCE(num_pages % PAGES_IN_2M != 0); 1278 1279 /* 1280 * We will attempt 2M allocations. However, if we fail to 1281 * allocate 2M chunks, we will go back to PAGE_SIZE allocations. 1282 */ 1283 alloc_unit = PAGES_IN_2M; 1284 1285 avail_pages = si_mem_available(); 1286 floor = compute_balloon_floor(); 1287 1288 /* Refuse to balloon below the floor, keep the 2M granularity. */ 1289 if (avail_pages < num_pages || avail_pages - num_pages < floor) { 1290 pr_warn("Balloon request will be partially fulfilled. %s\n", 1291 avail_pages < num_pages ? "Not enough memory." : 1292 "Balloon floor reached."); 1293 1294 num_pages = avail_pages > floor ? (avail_pages - floor) : 0; 1295 num_pages -= num_pages % PAGES_IN_2M; 1296 } 1297 1298 while (!done) { 1299 memset(balloon_up_send_buffer, 0, HV_HYP_PAGE_SIZE); 1300 bl_resp = (struct dm_balloon_response *)balloon_up_send_buffer; 1301 bl_resp->hdr.type = DM_BALLOON_RESPONSE; 1302 bl_resp->hdr.size = sizeof(struct dm_balloon_response); 1303 bl_resp->more_pages = 1; 1304 1305 num_pages -= num_ballooned; 1306 num_ballooned = alloc_balloon_pages(&dm_device, num_pages, 1307 bl_resp, alloc_unit); 1308 1309 if (alloc_unit != 1 && num_ballooned == 0) { 1310 alloc_unit = 1; 1311 continue; 1312 } 1313 1314 if (num_ballooned == 0 || num_ballooned == num_pages) { 1315 pr_debug("Ballooned %u out of %u requested pages.\n", 1316 num_pages, dm_device.balloon_wrk.num_pages); 1317 1318 bl_resp->more_pages = 0; 1319 done = true; 1320 dm_device.state = DM_INITIALIZED; 1321 } 1322 1323 /* 1324 * We are pushing a lot of data through the channel; 1325 * deal with transient failures caused because of the 1326 * lack of space in the ring buffer. 1327 */ 1328 1329 do { 1330 bl_resp->hdr.trans_id = atomic_inc_return(&trans_id); 1331 ret = vmbus_sendpacket(dm_device.dev->channel, 1332 bl_resp, 1333 bl_resp->hdr.size, 1334 (unsigned long)NULL, 1335 VM_PKT_DATA_INBAND, 0); 1336 1337 if (ret == -EAGAIN) 1338 msleep(20); 1339 post_status(&dm_device); 1340 } while (ret == -EAGAIN); 1341 1342 if (ret) { 1343 /* 1344 * Free up the memory we allocatted. 
1345 */ 1346 pr_err("Balloon response failed\n"); 1347 1348 for (i = 0; i < bl_resp->range_count; i++) 1349 free_balloon_pages(&dm_device, 1350 &bl_resp->range_array[i]); 1351 1352 done = true; 1353 } 1354 } 1355 1356 } 1357 1358 static void balloon_down(struct hv_dynmem_device *dm, 1359 struct dm_unballoon_request *req) 1360 { 1361 union dm_mem_page_range *range_array = req->range_array; 1362 int range_count = req->range_count; 1363 struct dm_unballoon_response resp; 1364 int i; 1365 unsigned int prev_pages_ballooned = dm->num_pages_ballooned; 1366 1367 for (i = 0; i < range_count; i++) { 1368 free_balloon_pages(dm, &range_array[i]); 1369 complete(&dm_device.config_event); 1370 } 1371 1372 pr_debug("Freed %u ballooned pages.\n", 1373 prev_pages_ballooned - dm->num_pages_ballooned); 1374 1375 if (req->more_pages == 1) 1376 return; 1377 1378 memset(&resp, 0, sizeof(struct dm_unballoon_response)); 1379 resp.hdr.type = DM_UNBALLOON_RESPONSE; 1380 resp.hdr.trans_id = atomic_inc_return(&trans_id); 1381 resp.hdr.size = sizeof(struct dm_unballoon_response); 1382 1383 vmbus_sendpacket(dm_device.dev->channel, &resp, 1384 sizeof(struct dm_unballoon_response), 1385 (unsigned long)NULL, 1386 VM_PKT_DATA_INBAND, 0); 1387 1388 dm->state = DM_INITIALIZED; 1389 } 1390 1391 static void balloon_onchannelcallback(void *context); 1392 1393 static int dm_thread_func(void *dm_dev) 1394 { 1395 struct hv_dynmem_device *dm = dm_dev; 1396 1397 while (!kthread_should_stop()) { 1398 wait_for_completion_interruptible_timeout( 1399 &dm_device.config_event, 1*HZ); 1400 /* 1401 * The host expects us to post information on the memory 1402 * pressure every second. 1403 */ 1404 reinit_completion(&dm_device.config_event); 1405 post_status(dm); 1406 } 1407 1408 return 0; 1409 } 1410 1411 1412 static void version_resp(struct hv_dynmem_device *dm, 1413 struct dm_version_response *vresp) 1414 { 1415 struct dm_version_request version_req; 1416 int ret; 1417 1418 if (vresp->is_accepted) { 1419 /* 1420 * We are done; wakeup the 1421 * context waiting for version 1422 * negotiation. 1423 */ 1424 complete(&dm->host_event); 1425 return; 1426 } 1427 /* 1428 * If there are more versions to try, continue 1429 * with negotiations; if not 1430 * shutdown the service since we are not able 1431 * to negotiate a suitable version number 1432 * with the host. 1433 */ 1434 if (dm->next_version == 0) 1435 goto version_error; 1436 1437 memset(&version_req, 0, sizeof(struct dm_version_request)); 1438 version_req.hdr.type = DM_VERSION_REQUEST; 1439 version_req.hdr.size = sizeof(struct dm_version_request); 1440 version_req.hdr.trans_id = atomic_inc_return(&trans_id); 1441 version_req.version.version = dm->next_version; 1442 dm->version = version_req.version.version; 1443 1444 /* 1445 * Set the next version to try in case current version fails. 1446 * Win7 protocol ought to be the last one to try. 
1447 */ 1448 switch (version_req.version.version) { 1449 case DYNMEM_PROTOCOL_VERSION_WIN8: 1450 dm->next_version = DYNMEM_PROTOCOL_VERSION_WIN7; 1451 version_req.is_last_attempt = 0; 1452 break; 1453 default: 1454 dm->next_version = 0; 1455 version_req.is_last_attempt = 1; 1456 } 1457 1458 ret = vmbus_sendpacket(dm->dev->channel, &version_req, 1459 sizeof(struct dm_version_request), 1460 (unsigned long)NULL, 1461 VM_PKT_DATA_INBAND, 0); 1462 1463 if (ret) 1464 goto version_error; 1465 1466 return; 1467 1468 version_error: 1469 dm->state = DM_INIT_ERROR; 1470 complete(&dm->host_event); 1471 } 1472 1473 static void cap_resp(struct hv_dynmem_device *dm, 1474 struct dm_capabilities_resp_msg *cap_resp) 1475 { 1476 if (!cap_resp->is_accepted) { 1477 pr_err("Capabilities not accepted by host\n"); 1478 dm->state = DM_INIT_ERROR; 1479 } 1480 complete(&dm->host_event); 1481 } 1482 1483 static void balloon_onchannelcallback(void *context) 1484 { 1485 struct hv_device *dev = context; 1486 u32 recvlen; 1487 u64 requestid; 1488 struct dm_message *dm_msg; 1489 struct dm_header *dm_hdr; 1490 struct hv_dynmem_device *dm = hv_get_drvdata(dev); 1491 struct dm_balloon *bal_msg; 1492 struct dm_hot_add *ha_msg; 1493 union dm_mem_page_range *ha_pg_range; 1494 union dm_mem_page_range *ha_region; 1495 1496 memset(recv_buffer, 0, sizeof(recv_buffer)); 1497 vmbus_recvpacket(dev->channel, recv_buffer, 1498 HV_HYP_PAGE_SIZE, &recvlen, &requestid); 1499 1500 if (recvlen > 0) { 1501 dm_msg = (struct dm_message *)recv_buffer; 1502 dm_hdr = &dm_msg->hdr; 1503 1504 switch (dm_hdr->type) { 1505 case DM_VERSION_RESPONSE: 1506 version_resp(dm, 1507 (struct dm_version_response *)dm_msg); 1508 break; 1509 1510 case DM_CAPABILITIES_RESPONSE: 1511 cap_resp(dm, 1512 (struct dm_capabilities_resp_msg *)dm_msg); 1513 break; 1514 1515 case DM_BALLOON_REQUEST: 1516 if (allow_hibernation) { 1517 pr_info("Ignore balloon-up request!\n"); 1518 break; 1519 } 1520 1521 if (dm->state == DM_BALLOON_UP) 1522 pr_warn("Currently ballooning\n"); 1523 bal_msg = (struct dm_balloon *)recv_buffer; 1524 dm->state = DM_BALLOON_UP; 1525 dm_device.balloon_wrk.num_pages = bal_msg->num_pages; 1526 schedule_work(&dm_device.balloon_wrk.wrk); 1527 break; 1528 1529 case DM_UNBALLOON_REQUEST: 1530 if (allow_hibernation) { 1531 pr_info("Ignore balloon-down request!\n"); 1532 break; 1533 } 1534 1535 dm->state = DM_BALLOON_DOWN; 1536 balloon_down(dm, 1537 (struct dm_unballoon_request *)recv_buffer); 1538 break; 1539 1540 case DM_MEM_HOT_ADD_REQUEST: 1541 if (dm->state == DM_HOT_ADD) 1542 pr_warn("Currently hot-adding\n"); 1543 dm->state = DM_HOT_ADD; 1544 ha_msg = (struct dm_hot_add *)recv_buffer; 1545 if (ha_msg->hdr.size == sizeof(struct dm_hot_add)) { 1546 /* 1547 * This is a normal hot-add request specifying 1548 * hot-add memory. 1549 */ 1550 dm->host_specified_ha_region = false; 1551 ha_pg_range = &ha_msg->range; 1552 dm->ha_wrk.ha_page_range = *ha_pg_range; 1553 dm->ha_wrk.ha_region_range.page_range = 0; 1554 } else { 1555 /* 1556 * Host is specifying that we first hot-add 1557 * a region and then partially populate this 1558 * region. 
1559 */ 1560 dm->host_specified_ha_region = true; 1561 ha_pg_range = &ha_msg->range; 1562 ha_region = &ha_pg_range[1]; 1563 dm->ha_wrk.ha_page_range = *ha_pg_range; 1564 dm->ha_wrk.ha_region_range = *ha_region; 1565 } 1566 schedule_work(&dm_device.ha_wrk.wrk); 1567 break; 1568 1569 case DM_INFO_MESSAGE: 1570 process_info(dm, (struct dm_info_msg *)dm_msg); 1571 break; 1572 1573 default: 1574 pr_warn("Unhandled message: type: %d\n", dm_hdr->type); 1575 1576 } 1577 } 1578 1579 } 1580 1581 static int balloon_connect_vsp(struct hv_device *dev) 1582 { 1583 struct dm_version_request version_req; 1584 struct dm_capabilities cap_msg; 1585 unsigned long t; 1586 int ret; 1587 1588 ret = vmbus_open(dev->channel, dm_ring_size, dm_ring_size, NULL, 0, 1589 balloon_onchannelcallback, dev); 1590 if (ret) 1591 return ret; 1592 1593 /* 1594 * Initiate the hand shake with the host and negotiate 1595 * a version that the host can support. We start with the 1596 * highest version number and go down if the host cannot 1597 * support it. 1598 */ 1599 memset(&version_req, 0, sizeof(struct dm_version_request)); 1600 version_req.hdr.type = DM_VERSION_REQUEST; 1601 version_req.hdr.size = sizeof(struct dm_version_request); 1602 version_req.hdr.trans_id = atomic_inc_return(&trans_id); 1603 version_req.version.version = DYNMEM_PROTOCOL_VERSION_WIN10; 1604 version_req.is_last_attempt = 0; 1605 dm_device.version = version_req.version.version; 1606 1607 ret = vmbus_sendpacket(dev->channel, &version_req, 1608 sizeof(struct dm_version_request), 1609 (unsigned long)NULL, VM_PKT_DATA_INBAND, 0); 1610 if (ret) 1611 goto out; 1612 1613 t = wait_for_completion_timeout(&dm_device.host_event, 5*HZ); 1614 if (t == 0) { 1615 ret = -ETIMEDOUT; 1616 goto out; 1617 } 1618 1619 /* 1620 * If we could not negotiate a compatible version with the host 1621 * fail the probe function. 1622 */ 1623 if (dm_device.state == DM_INIT_ERROR) { 1624 ret = -EPROTO; 1625 goto out; 1626 } 1627 1628 pr_info("Using Dynamic Memory protocol version %u.%u\n", 1629 DYNMEM_MAJOR_VERSION(dm_device.version), 1630 DYNMEM_MINOR_VERSION(dm_device.version)); 1631 1632 /* 1633 * Now submit our capabilities to the host. 1634 */ 1635 memset(&cap_msg, 0, sizeof(struct dm_capabilities)); 1636 cap_msg.hdr.type = DM_CAPABILITIES_REPORT; 1637 cap_msg.hdr.size = sizeof(struct dm_capabilities); 1638 cap_msg.hdr.trans_id = atomic_inc_return(&trans_id); 1639 1640 /* 1641 * When hibernation (i.e. virtual ACPI S4 state) is enabled, the host 1642 * currently still requires the bits to be set, so we have to add code 1643 * to fail the host's hot-add and balloon up/down requests, if any. 1644 */ 1645 cap_msg.caps.cap_bits.balloon = 1; 1646 cap_msg.caps.cap_bits.hot_add = 1; 1647 1648 /* 1649 * Specify our alignment requirements as it relates 1650 * memory hot-add. Specify 128MB alignment. 1651 */ 1652 cap_msg.caps.cap_bits.hot_add_alignment = 7; 1653 1654 /* 1655 * Currently the host does not use these 1656 * values and we set them to what is done in the 1657 * Windows driver. 1658 */ 1659 cap_msg.min_page_cnt = 0; 1660 cap_msg.max_page_number = -1; 1661 1662 ret = vmbus_sendpacket(dev->channel, &cap_msg, 1663 sizeof(struct dm_capabilities), 1664 (unsigned long)NULL, VM_PKT_DATA_INBAND, 0); 1665 if (ret) 1666 goto out; 1667 1668 t = wait_for_completion_timeout(&dm_device.host_event, 5*HZ); 1669 if (t == 0) { 1670 ret = -ETIMEDOUT; 1671 goto out; 1672 } 1673 1674 /* 1675 * If the host does not like our capabilities, 1676 * fail the probe function. 
1677 */ 1678 if (dm_device.state == DM_INIT_ERROR) { 1679 ret = -EPROTO; 1680 goto out; 1681 } 1682 1683 return 0; 1684 out: 1685 vmbus_close(dev->channel); 1686 return ret; 1687 } 1688 1689 static int balloon_probe(struct hv_device *dev, 1690 const struct hv_vmbus_device_id *dev_id) 1691 { 1692 int ret; 1693 1694 allow_hibernation = hv_is_hibernation_supported(); 1695 if (allow_hibernation) 1696 hot_add = false; 1697 1698 #ifdef CONFIG_MEMORY_HOTPLUG 1699 do_hot_add = hot_add; 1700 #else 1701 do_hot_add = false; 1702 #endif 1703 dm_device.dev = dev; 1704 dm_device.state = DM_INITIALIZING; 1705 dm_device.next_version = DYNMEM_PROTOCOL_VERSION_WIN8; 1706 init_completion(&dm_device.host_event); 1707 init_completion(&dm_device.config_event); 1708 INIT_LIST_HEAD(&dm_device.ha_region_list); 1709 spin_lock_init(&dm_device.ha_lock); 1710 INIT_WORK(&dm_device.balloon_wrk.wrk, balloon_up); 1711 INIT_WORK(&dm_device.ha_wrk.wrk, hot_add_req); 1712 dm_device.host_specified_ha_region = false; 1713 1714 #ifdef CONFIG_MEMORY_HOTPLUG 1715 set_online_page_callback(&hv_online_page); 1716 register_memory_notifier(&hv_memory_nb); 1717 #endif 1718 1719 hv_set_drvdata(dev, &dm_device); 1720 1721 ret = balloon_connect_vsp(dev); 1722 if (ret != 0) 1723 return ret; 1724 1725 dm_device.state = DM_INITIALIZED; 1726 1727 dm_device.thread = 1728 kthread_run(dm_thread_func, &dm_device, "hv_balloon"); 1729 if (IS_ERR(dm_device.thread)) { 1730 ret = PTR_ERR(dm_device.thread); 1731 goto probe_error; 1732 } 1733 1734 return 0; 1735 1736 probe_error: 1737 dm_device.state = DM_INIT_ERROR; 1738 dm_device.thread = NULL; 1739 vmbus_close(dev->channel); 1740 #ifdef CONFIG_MEMORY_HOTPLUG 1741 unregister_memory_notifier(&hv_memory_nb); 1742 restore_online_page_callback(&hv_online_page); 1743 #endif 1744 return ret; 1745 } 1746 1747 static int balloon_remove(struct hv_device *dev) 1748 { 1749 struct hv_dynmem_device *dm = hv_get_drvdata(dev); 1750 struct hv_hotadd_state *has, *tmp; 1751 struct hv_hotadd_gap *gap, *tmp_gap; 1752 unsigned long flags; 1753 1754 if (dm->num_pages_ballooned != 0) 1755 pr_warn("Ballooned pages: %d\n", dm->num_pages_ballooned); 1756 1757 cancel_work_sync(&dm->balloon_wrk.wrk); 1758 cancel_work_sync(&dm->ha_wrk.wrk); 1759 1760 kthread_stop(dm->thread); 1761 vmbus_close(dev->channel); 1762 #ifdef CONFIG_MEMORY_HOTPLUG 1763 unregister_memory_notifier(&hv_memory_nb); 1764 restore_online_page_callback(&hv_online_page); 1765 #endif 1766 spin_lock_irqsave(&dm_device.ha_lock, flags); 1767 list_for_each_entry_safe(has, tmp, &dm->ha_region_list, list) { 1768 list_for_each_entry_safe(gap, tmp_gap, &has->gap_list, list) { 1769 list_del(&gap->list); 1770 kfree(gap); 1771 } 1772 list_del(&has->list); 1773 kfree(has); 1774 } 1775 spin_unlock_irqrestore(&dm_device.ha_lock, flags); 1776 1777 return 0; 1778 } 1779 1780 static int balloon_suspend(struct hv_device *hv_dev) 1781 { 1782 struct hv_dynmem_device *dm = hv_get_drvdata(hv_dev); 1783 1784 tasklet_disable(&hv_dev->channel->callback_event); 1785 1786 cancel_work_sync(&dm->balloon_wrk.wrk); 1787 cancel_work_sync(&dm->ha_wrk.wrk); 1788 1789 if (dm->thread) { 1790 kthread_stop(dm->thread); 1791 dm->thread = NULL; 1792 vmbus_close(hv_dev->channel); 1793 } 1794 1795 tasklet_enable(&hv_dev->channel->callback_event); 1796 1797 return 0; 1798 1799 } 1800 1801 static int balloon_resume(struct hv_device *dev) 1802 { 1803 int ret; 1804 1805 dm_device.state = DM_INITIALIZING; 1806 1807 ret = balloon_connect_vsp(dev); 1808 1809 if (ret != 0) 1810 goto out; 1811 1812 
dm_device.thread = 1813 kthread_run(dm_thread_func, &dm_device, "hv_balloon"); 1814 if (IS_ERR(dm_device.thread)) { 1815 ret = PTR_ERR(dm_device.thread); 1816 dm_device.thread = NULL; 1817 goto close_channel; 1818 } 1819 1820 dm_device.state = DM_INITIALIZED; 1821 return 0; 1822 close_channel: 1823 vmbus_close(dev->channel); 1824 out: 1825 dm_device.state = DM_INIT_ERROR; 1826 #ifdef CONFIG_MEMORY_HOTPLUG 1827 unregister_memory_notifier(&hv_memory_nb); 1828 restore_online_page_callback(&hv_online_page); 1829 #endif 1830 return ret; 1831 } 1832 1833 static const struct hv_vmbus_device_id id_table[] = { 1834 /* Dynamic Memory Class ID */ 1835 /* 525074DC-8985-46e2-8057-A307DC18A502 */ 1836 { HV_DM_GUID, }, 1837 { }, 1838 }; 1839 1840 MODULE_DEVICE_TABLE(vmbus, id_table); 1841 1842 static struct hv_driver balloon_drv = { 1843 .name = "hv_balloon", 1844 .id_table = id_table, 1845 .probe = balloon_probe, 1846 .remove = balloon_remove, 1847 .suspend = balloon_suspend, 1848 .resume = balloon_resume, 1849 .driver = { 1850 .probe_type = PROBE_PREFER_ASYNCHRONOUS, 1851 }, 1852 }; 1853 1854 static int __init init_balloon_drv(void) 1855 { 1856 1857 return vmbus_driver_register(&balloon_drv); 1858 } 1859 1860 module_init(init_balloon_drv); 1861 1862 MODULE_DESCRIPTION("Hyper-V Balloon"); 1863 MODULE_LICENSE("GPL"); 1864
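/*
 * Usage note (illustrative, not part of the original source): the module
 * parameters declared above are exposed under
 * /sys/module/hv_balloon/parameters/, so hot-add and pressure reporting
 * can be tuned at load time or at runtime, for example:
 *
 *	modprobe hv_balloon hot_add=0 pressure_report_delay=30
 *	echo 1 > /sys/module/hv_balloon/parameters/hot_add
 */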