1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright (c) 2012, Microsoft Corporation. 4 * 5 * Author: 6 * K. Y. Srinivasan <kys@microsoft.com> 7 */ 8 9 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 10 11 #include <linux/kernel.h> 12 #include <linux/jiffies.h> 13 #include <linux/mman.h> 14 #include <linux/delay.h> 15 #include <linux/init.h> 16 #include <linux/module.h> 17 #include <linux/slab.h> 18 #include <linux/kthread.h> 19 #include <linux/completion.h> 20 #include <linux/memory_hotplug.h> 21 #include <linux/memory.h> 22 #include <linux/notifier.h> 23 #include <linux/percpu_counter.h> 24 25 #include <linux/hyperv.h> 26 #include <asm/hyperv-tlfs.h> 27 28 #include <asm/mshyperv.h> 29 30 #define CREATE_TRACE_POINTS 31 #include "hv_trace_balloon.h" 32 33 /* 34 * We begin with definitions supporting the Dynamic Memory protocol 35 * with the host. 36 * 37 * Begin protocol definitions. 38 */ 39 40 41 42 /* 43 * Protocol versions. The low word is the minor version, the high word the major 44 * version. 
 *
 * History:
 *  Initial version 1.0
 *  Changed to 0.1 on 2009/03/25
 *  Changes to 0.2 on 2009/05/14
 *  Changes to 0.3 on 2009/12/03
 *  Changed to 1.0 on 2011/04/05
 */

#define DYNMEM_MAKE_VERSION(Major, Minor) ((__u32)(((Major) << 16) | (Minor)))
#define DYNMEM_MAJOR_VERSION(Version) ((__u32)(Version) >> 16)
/*
 * NOTE(review): union dm_version below gives the minor version the full
 * low 16 bits, but this mask keeps only the low 8. Harmless for the
 * minor numbers actually used here (0 and 3) -- confirm before relying
 * on it for larger minor versions.
 */
#define DYNMEM_MINOR_VERSION(Version) ((__u32)(Version) & 0xff)

/* Protocol versions negotiated with the host, newest preferred. */
enum {
	DYNMEM_PROTOCOL_VERSION_1 = DYNMEM_MAKE_VERSION(0, 3),
	DYNMEM_PROTOCOL_VERSION_2 = DYNMEM_MAKE_VERSION(1, 0),
	DYNMEM_PROTOCOL_VERSION_3 = DYNMEM_MAKE_VERSION(2, 0),

	DYNMEM_PROTOCOL_VERSION_WIN7 = DYNMEM_PROTOCOL_VERSION_1,
	DYNMEM_PROTOCOL_VERSION_WIN8 = DYNMEM_PROTOCOL_VERSION_2,
	DYNMEM_PROTOCOL_VERSION_WIN10 = DYNMEM_PROTOCOL_VERSION_3,

	DYNMEM_PROTOCOL_VERSION_CURRENT = DYNMEM_PROTOCOL_VERSION_WIN10
};



/*
 * Message Types
 */

enum dm_message_type {
	/*
	 * Version 0.3
	 */
	DM_ERROR = 0,
	DM_VERSION_REQUEST = 1,
	DM_VERSION_RESPONSE = 2,
	DM_CAPABILITIES_REPORT = 3,
	DM_CAPABILITIES_RESPONSE = 4,
	DM_STATUS_REPORT = 5,
	DM_BALLOON_REQUEST = 6,
	DM_BALLOON_RESPONSE = 7,
	DM_UNBALLOON_REQUEST = 8,
	DM_UNBALLOON_RESPONSE = 9,
	DM_MEM_HOT_ADD_REQUEST = 10,
	DM_MEM_HOT_ADD_RESPONSE = 11,
	DM_VERSION_03_MAX = 11,
	/*
	 * Version 1.0.
	 */
	DM_INFO_MESSAGE = 12,
	DM_VERSION_1_MAX = 12
};


/*
 * Structures defining the dynamic memory management
 * protocol. These are wire formats shared with the host, hence
 * the fixed-width types and __packed.
 */

union dm_version {
	struct {
		__u16 minor_version;
		__u16 major_version;
	};
	__u32 version;
} __packed;


union dm_caps {
	struct {
		__u64 balloon:1;
		__u64 hot_add:1;
		/*
		 * To support guests that may have alignment
		 * limitations on hot-add, the guest can specify
		 * its alignment requirements; a value of n
		 * represents an alignment of 2^n in mega bytes.
		 */
		__u64 hot_add_alignment:4;
		__u64 reservedz:58;
	} cap_bits;
	__u64 caps;
} __packed;

union dm_mem_page_range {
	struct {
		/*
		 * The PFN number of the first page in the range.
		 * 40 bits is the architectural limit of a PFN
		 * number for AMD64.
		 */
		__u64 start_page:40;
		/*
		 * The number of pages in the range.
		 */
		__u64 page_cnt:24;
	} finfo;
	__u64 page_range;
} __packed;



/*
 * The header for all dynamic memory messages:
 *
 * type: Type of the message.
 * size: Size of the message in bytes; including the header.
 * trans_id: The guest is responsible for manufacturing this ID.
 */

struct dm_header {
	__u16 type;
	__u16 size;
	__u32 trans_id;
} __packed;

/*
 * A generic message format for dynamic memory.
 * Specific message formats are defined later in the file.
 */

struct dm_message {
	struct dm_header hdr;
	__u8 data[]; /* enclosed message */
} __packed;


/*
 * Specific message types supporting the dynamic memory protocol.
 */

/*
 * Version negotiation message. Sent from the guest to the host.
 * The guest is free to try different versions until the host
 * accepts the version.
 *
 * dm_version: The protocol version requested.
 * is_last_attempt: If TRUE, this is the last version guest will request.
 * reservedz: Reserved field, set to zero.
 */

struct dm_version_request {
	struct dm_header hdr;
	union dm_version version;
	__u32 is_last_attempt:1;
	__u32 reservedz:31;
} __packed;

/*
 * Version response message; Host to Guest and indicates
 * if the host has accepted the version sent by the guest.
 *
 * is_accepted: If TRUE, host has accepted the version and the guest
 * should proceed to the next stage of the protocol. FALSE indicates that
 * guest should re-try with a different version.
 *
 * reservedz: Reserved field, set to zero.
 */

struct dm_version_response {
	struct dm_header hdr;
	__u64 is_accepted:1;
	__u64 reservedz:63;
} __packed;

/*
 * Message reporting capabilities. This is sent from the guest to the
 * host.
 */

struct dm_capabilities {
	struct dm_header hdr;
	union dm_caps caps;
	__u64 min_page_cnt;
	__u64 max_page_number;
} __packed;

/*
 * Response to the capabilities message. This is sent from the host to the
 * guest. This message notifies if the host has accepted the guest's
 * capabilities. If the host has not accepted, the guest must shutdown
 * the service.
 *
 * is_accepted: Indicates if the host has accepted guest's capabilities.
 * reservedz: Must be 0.
 */

struct dm_capabilities_resp_msg {
	struct dm_header hdr;
	__u64 is_accepted:1;
	__u64 reservedz:63;
} __packed;

/*
 * This message is used to report memory pressure from the guest.
 * This message is not part of any transaction and there is no
 * response to this message.
 *
 * num_avail: Available memory in pages.
 * num_committed: Committed memory in pages.
 * page_file_size: The accumulated size of all page files
 * in the system in pages.
 * zero_free: The number of zero and free pages.
 * page_file_writes: The writes to the page file in pages.
 * io_diff: An indicator of file cache efficiency or page file activity,
 * calculated as File Cache Page Fault Count - Page Read Count.
 * This value is in pages.
 *
 * Some of these metrics are Windows specific and fortunately
 * the algorithm on the host side that computes the guest memory
 * pressure only uses num_committed value.
 */

struct dm_status {
	struct dm_header hdr;
	__u64 num_avail;
	__u64 num_committed;
	__u64 page_file_size;
	__u64 zero_free;
	__u32 page_file_writes;
	__u32 io_diff;
} __packed;


/*
 * Message to ask the guest to allocate memory - balloon up message.
 * This message is sent from the host to the guest. The guest may not be
 * able to allocate as much memory as requested.
 *
 * num_pages: number of pages to allocate.
 */

struct dm_balloon {
	struct dm_header hdr;
	__u32 num_pages;
	__u32 reservedz;
} __packed;


/*
 * Balloon response message; this message is sent from the guest
 * to the host in response to the balloon message.
 *
 * reservedz: Reserved; must be set to zero.
 * more_pages: If FALSE, this is the last message of the transaction.
 * if TRUE there will be at least one more message from the guest.
 *
 * range_count: The number of ranges in the range array.
 *
 * range_array: An array of page ranges returned to the host.
 *
 */

struct dm_balloon_response {
	struct dm_header hdr;
	__u32 reservedz;
	__u32 more_pages:1;
	__u32 range_count:31;
	union dm_mem_page_range range_array[];
} __packed;

/*
 * Un-balloon message; this message is sent from the host
 * to the guest to give guest more memory.
 *
 * more_pages: If FALSE, this is the last message of the transaction.
 * if TRUE there will be at least one more message from the guest.
 *
 * reservedz: Reserved; must be set to zero.
 *
 * range_count: The number of ranges in the range array.
 *
 * range_array: An array of page ranges returned to the host.
 *
 */

struct dm_unballoon_request {
	struct dm_header hdr;
	__u32 more_pages:1;
	__u32 reservedz:31;
	__u32 range_count;
	union dm_mem_page_range range_array[];
} __packed;

/*
 * Un-balloon response message; this message is sent from the guest
 * to the host in response to an unballoon request.
 *
 */

struct dm_unballoon_response {
	struct dm_header hdr;
} __packed;


/*
 * Hot add request message. Message sent from the host to the guest.
 *
 * mem_range: Memory range to hot add.
 *
 */

struct dm_hot_add {
	struct dm_header hdr;
	union dm_mem_page_range range;
} __packed;

/*
 * Hot add response message.
 * This message is sent by the guest to report the status of a hot add request.
 * If page_count is less than the requested page count, then the host should
 * assume all further hot add requests will fail, since this indicates that
 * the guest has hit an upper physical memory barrier.
 *
 * Hot adds may also fail due to low resources; in this case, the guest must
 * not complete this message until the hot add can succeed, and the host must
 * not send a new hot add request until the response is sent.
 * If VSC fails to hot add memory DYNMEM_NUMBER_OF_UNSUCCESSFUL_HOTADD_ATTEMPTS
 * times it fails the request.
 *
 *
 * page_count: number of pages that were successfully hot added.
 *
 * result: result of the operation 1: success, 0: failure.
 *
 */

struct dm_hot_add_response {
	struct dm_header hdr;
	__u32 page_count;
	__u32 result;
} __packed;

/*
 * Types of information sent from host to the guest.
 */

enum dm_info_type {
	INFO_TYPE_MAX_PAGE_CNT = 0,
	MAX_INFO_TYPE
};


/*
 * Header for the information message.
 */

struct dm_info_header {
	enum dm_info_type type;
	__u32 data_size;
} __packed;

/*
 * This message is sent from the host to the guest to pass
 * some relevant information (win8 addition).
 *
 * reserved: not used.
 * info_size: size of the information blob.
 * info: information blob.
 *
 * NOTE(review): unlike the other wire structs in this file this one is
 * not marked __packed -- confirm whether that is intentional.
 */

struct dm_info_msg {
	struct dm_header hdr;
	__u32 reserved;
	__u32 info_size;
	__u8  info[];
};

/*
 * End protocol definitions.
 */

/*
 * State to manage hot adding memory into the guest.
 * The range start_pfn : end_pfn specifies the range
 * that the host has asked us to hot add. The range
 * start_pfn : ha_end_pfn specifies the range that we have
 * currently hot added. We hot add in multiples of 128M
 * chunks; it is possible that we may not be able to bring
 * online all the pages in the region. The range
 * covered_start_pfn:covered_end_pfn defines the pages that can
 * be brought online.
 */

struct hv_hotadd_state {
	struct list_head list;
	unsigned long start_pfn;
	unsigned long covered_start_pfn;
	unsigned long covered_end_pfn;
	unsigned long ha_end_pfn;
	unsigned long end_pfn;
	/*
	 * A list of gaps.
	 */
	struct list_head gap_list;
};

/* A hole inside a hot-add region: PFNs in [start_pfn, end_pfn) are not backed. */
struct hv_hotadd_gap {
	struct list_head list;
	unsigned long start_pfn;
	unsigned long end_pfn;
};

/* Deferred-work state for a balloon-up request from the host. */
struct balloon_state {
	__u32 num_pages;
	struct work_struct wrk;
};

/* Deferred-work state for a hot-add request from the host. */
struct hot_add_wrk {
	union dm_mem_page_range ha_page_range;
	union dm_mem_page_range ha_region_range;
	struct work_struct wrk;
};

static bool allow_hibernation;
static bool hot_add = true;
static bool do_hot_add;
/*
 * Delay reporting memory pressure by
 * the specified number of seconds.
 */
static uint pressure_report_delay = 45;

/*
 * The last time we posted a pressure report to host.
 */
static unsigned long last_post_time;

module_param(hot_add, bool, (S_IRUGO | S_IWUSR));
MODULE_PARM_DESC(hot_add, "If set attempt memory hot_add");

module_param(pressure_report_delay, uint, (S_IRUGO | S_IWUSR));
MODULE_PARM_DESC(pressure_report_delay, "Delay in secs in reporting pressure");
/* Monotonic ID stamped into every message we send to the host. */
static atomic_t trans_id = ATOMIC_INIT(0);

static int dm_ring_size = 20 * 1024;

/*
 * Driver specific state.
 */

enum hv_dm_state {
	DM_INITIALIZING = 0,
	DM_INITIALIZED,
	DM_BALLOON_UP,
	DM_BALLOON_DOWN,
	DM_HOT_ADD,
	DM_INIT_ERROR
};


static __u8 recv_buffer[HV_HYP_PAGE_SIZE];
static __u8 balloon_up_send_buffer[HV_HYP_PAGE_SIZE];
#define PAGES_IN_2M (2 * 1024 * 1024 / PAGE_SIZE)
/* Hot-add granularity: 128M expressed in pages. */
#define HA_CHUNK (128 * 1024 * 1024 / PAGE_SIZE)

struct hv_dynmem_device {
	struct hv_device *dev;
	enum hv_dm_state state;
	struct completion host_event;
	struct completion config_event;

	/*
	 * Number of pages we have currently ballooned out.
	 */
	unsigned int num_pages_ballooned;
	unsigned int num_pages_onlined;
	unsigned int num_pages_added;

	/*
	 * State to manage the ballooning (up) operation.
	 */
	struct balloon_state balloon_wrk;

	/*
	 * State to execute the "hot-add" operation.
	 */
	struct hot_add_wrk ha_wrk;

	/*
	 * This state tracks if the host has specified a hot-add
	 * region.
	 */
	bool host_specified_ha_region;

	/*
	 * State to synchronize hot-add.
	 */
	struct completion ol_waitevent;
	bool ha_waiting;
	/*
	 * This thread handles hot-add
	 * requests from the host as well as notifying
	 * the host with regards to memory pressure in
	 * the guest.
	 */
	struct task_struct *thread;

	/*
	 * Protects ha_region_list, num_pages_onlined counter and individual
	 * regions from ha_region_list.
	 */
	spinlock_t ha_lock;

	/*
	 * A list of hot-add regions.
	 */
	struct list_head ha_region_list;

	/*
	 * We start with the highest version we can support
	 * and downgrade based on the host; we save here the
	 * next version to try.
	 */
	__u32 next_version;

	/*
	 * The negotiated version agreed by host.
	 */
	__u32 version;
};

static struct hv_dynmem_device dm_device;

static void post_status(struct hv_dynmem_device *dm);

#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * Return true if @pfn is backed within hot-add region @has, i.e. it lies
 * in the covered range and does not fall into any recorded gap.
 * Caller must hold dm_device.ha_lock (protects @has and its gap_list).
 */
static inline bool has_pfn_is_backed(struct hv_hotadd_state *has,
				     unsigned long pfn)
{
	struct hv_hotadd_gap *gap;

	/* The page is not backed. */
	if ((pfn < has->covered_start_pfn) || (pfn >= has->covered_end_pfn))
		return false;

	/* Check for gaps. */
	list_for_each_entry(gap, &has->gap_list, list) {
		if ((pfn >= gap->start_pfn) && (pfn < gap->end_pfn))
			return false;
	}

	return true;
}

/*
 * Count how many PFNs in [start_pfn, start_pfn + nr_pages) belong to a
 * hot-add region AND are backed. PFNs outside every region (e.g. boot
 * memory being offlined) are skipped without being counted.
 */
static unsigned long hv_page_offline_check(unsigned long start_pfn,
					   unsigned long nr_pages)
{
	unsigned long pfn = start_pfn, count = 0;
	struct hv_hotadd_state *has;
	bool found;

	while (pfn < start_pfn + nr_pages) {
		/*
		 * Search for HAS which covers the pfn and when we find one
		 * count how many consecutive PFNs are covered.
		 */
		found = false;
		list_for_each_entry(has, &dm_device.ha_region_list, list) {
			while ((pfn >= has->start_pfn) &&
			       (pfn < has->end_pfn) &&
			       (pfn < start_pfn + nr_pages)) {
				found = true;
				if (has_pfn_is_backed(has, pfn))
					count++;
				pfn++;
			}
		}

		/*
		 * This PFN is not in any HAS (e.g. we're offlining a region
		 * which was present at boot), no need to account for it. Go
		 * to the next one.
		 */
		if (!found)
			pfn++;
	}

	return count;
}

/*
 * Memory-hotplug notifier callback. On online completion (or cancel),
 * wake the waiter in hv_mem_hot_add(); on offline, deduct the offlined
 * backed pages from num_pages_onlined under ha_lock.
 */
static int hv_memory_notifier(struct notifier_block *nb, unsigned long val,
			      void *v)
{
	struct memory_notify *mem = (struct memory_notify *)v;
	unsigned long flags, pfn_count;

	switch (val) {
	case MEM_ONLINE:
	case MEM_CANCEL_ONLINE:
		if (dm_device.ha_waiting) {
			dm_device.ha_waiting = false;
			complete(&dm_device.ol_waitevent);
		}
		break;

	case MEM_OFFLINE:
		spin_lock_irqsave(&dm_device.ha_lock, flags);
		pfn_count = hv_page_offline_check(mem->start_pfn,
						  mem->nr_pages);
		if (pfn_count <= dm_device.num_pages_onlined) {
			dm_device.num_pages_onlined -= pfn_count;
		} else {
			/*
			 * We're offlining more pages than we managed to online.
			 * This is unexpected. In any case don't let
			 * num_pages_onlined wrap around zero.
			 */
			WARN_ON_ONCE(1);
			dm_device.num_pages_onlined = 0;
		}
		spin_unlock_irqrestore(&dm_device.ha_lock, flags);
		break;
	case MEM_GOING_ONLINE:
	case MEM_GOING_OFFLINE:
	case MEM_CANCEL_OFFLINE:
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block hv_memory_nb = {
	.notifier_call = hv_memory_notifier,
	.priority = 0
};

/* Check if the particular page is backed and can be onlined and online it. */
static void hv_page_online_one(struct hv_hotadd_state *has, struct page *pg)
{
	if (!has_pfn_is_backed(has, page_to_pfn(pg))) {
		if (!PageOffline(pg))
			__SetPageOffline(pg);
		return;
	}
	if (PageOffline(pg))
		__ClearPageOffline(pg);

	/* This frame is currently backed; online the page. */
	generic_online_page(pg, 0);

	/* num_pages_onlined is protected by ha_lock; caller must hold it. */
	lockdep_assert_held(&dm_device.ha_lock);
	dm_device.num_pages_onlined++;
}

/* Online @size pages starting at @start_pfn within region @has. */
static void hv_bring_pgs_online(struct hv_hotadd_state *has,
				unsigned long start_pfn, unsigned long size)
{
	int i;

	pr_debug("Online %lu pages starting at pfn 0x%lx\n", size, start_pfn);
	for (i = 0; i < size; i++)
		hv_page_online_one(has, pfn_to_page(start_pfn + i));
}

/*
 * Hot add @size pages (in HA_CHUNK multiples) starting at PFN @start,
 * of which @pfn_count are to be backed, updating @has accordingly.
 * Called without ha_lock held; takes it only around the state updates
 * since add_memory() may sleep.
 */
static void hv_mem_hot_add(unsigned long start, unsigned long size,
			   unsigned long pfn_count,
			   struct hv_hotadd_state *has)
{
	int ret = 0;
	int i, nid;
	unsigned long start_pfn;
	unsigned long processed_pfn;
	unsigned long total_pfn = pfn_count;
	unsigned long flags;

	for (i = 0; i < (size/HA_CHUNK); i++) {
		start_pfn = start + (i * HA_CHUNK);

		/* Optimistically advance the region state before adding. */
		spin_lock_irqsave(&dm_device.ha_lock, flags);
		has->ha_end_pfn += HA_CHUNK;

		if (total_pfn > HA_CHUNK) {
			processed_pfn = HA_CHUNK;
			total_pfn -= HA_CHUNK;
		} else {
			processed_pfn = total_pfn;
			total_pfn = 0;
		}

		has->covered_end_pfn += processed_pfn;
		spin_unlock_irqrestore(&dm_device.ha_lock, flags);

		init_completion(&dm_device.ol_waitevent);
		dm_device.ha_waiting = !memhp_auto_online;

		nid = memory_add_physaddr_to_nid(PFN_PHYS(start_pfn));
		ret = add_memory(nid, PFN_PHYS((start_pfn)),
				(HA_CHUNK << PAGE_SHIFT));

		if (ret) {
			pr_err("hot_add memory failed error is %d\n", ret);
			if (ret == -EEXIST) {
				/*
				 * This error indicates that the error
				 * is not a transient failure. This is the
				 * case where the guest's physical address map
				 * precludes hot adding memory. Stop all further
				 * memory hot-add.
				 */
				do_hot_add = false;
			}
			/* Roll back the optimistic state advance. */
			spin_lock_irqsave(&dm_device.ha_lock, flags);
			has->ha_end_pfn -= HA_CHUNK;
			has->covered_end_pfn -= processed_pfn;
			spin_unlock_irqrestore(&dm_device.ha_lock, flags);
			break;
		}

		/*
		 * Wait for the memory block to be onlined when memory onlining
		 * is done outside of kernel (memhp_auto_online). Since the hot
		 * add has succeeded, it is ok to proceed even if the pages in
		 * the hot added region have not been "onlined" within the
		 * allowed time.
		 */
		if (dm_device.ha_waiting)
			wait_for_completion_timeout(&dm_device.ol_waitevent,
						    5*HZ);
		post_status(&dm_device);
	}
}

/*
 * generic_online_page() replacement: online the 2^order pages at @pg
 * only via the hot-add region that covers them, so gaps stay offline
 * and num_pages_onlined stays accurate.
 */
static void hv_online_page(struct page *pg, unsigned int order)
{
	struct hv_hotadd_state *has;
	unsigned long flags;
	unsigned long pfn = page_to_pfn(pg);

	spin_lock_irqsave(&dm_device.ha_lock, flags);
	list_for_each_entry(has, &dm_device.ha_region_list, list) {
		/* The page belongs to a different HAS. */
		if ((pfn < has->start_pfn) ||
		    (pfn + (1UL << order) > has->end_pfn))
			continue;

		hv_bring_pgs_online(has, pfn, 1UL << order);
		break;
	}
	spin_unlock_irqrestore(&dm_device.ha_lock, flags);
}

/*
 * Check whether [start_pfn, start_pfn + pfn_cnt) falls inside an existing
 * hot-add region, recording a gap and/or extending the region as needed.
 * Returns 1 if covered, 0 if no region matches, -ENOMEM on allocation
 * failure.
 */
static int pfn_covered(unsigned long start_pfn, unsigned long pfn_cnt)
{
	struct hv_hotadd_state *has;
	struct hv_hotadd_gap *gap;
	unsigned long residual, new_inc;
	int ret = 0;
	unsigned long flags;

	spin_lock_irqsave(&dm_device.ha_lock, flags);
	list_for_each_entry(has, &dm_device.ha_region_list, list) {
		/*
		 * If the pfn range we are dealing with is not in the current
		 * "hot add block", move on.
		 */
		if (start_pfn < has->start_pfn || start_pfn >= has->end_pfn)
			continue;

		/*
		 * If the current start pfn is not where the covered_end
		 * is, create a gap and update covered_end_pfn.
		 */
		if (has->covered_end_pfn != start_pfn) {
			/* GFP_ATOMIC: we are under a spinlock here. */
			gap = kzalloc(sizeof(struct hv_hotadd_gap), GFP_ATOMIC);
			if (!gap) {
				ret = -ENOMEM;
				break;
			}

			INIT_LIST_HEAD(&gap->list);
			gap->start_pfn = has->covered_end_pfn;
			gap->end_pfn = start_pfn;
			list_add_tail(&gap->list, &has->gap_list);

			has->covered_end_pfn = start_pfn;
		}

		/*
		 * If the current hot add-request extends beyond
		 * our current limit; extend it.
		 */
		if ((start_pfn + pfn_cnt) > has->end_pfn) {
			residual = (start_pfn + pfn_cnt - has->end_pfn);
			/*
			 * Extend the region by multiples of HA_CHUNK.
			 */
			new_inc = (residual / HA_CHUNK) * HA_CHUNK;
			if (residual % HA_CHUNK)
				new_inc += HA_CHUNK;

			has->end_pfn += new_inc;
		}

		ret = 1;
		break;
	}
	spin_unlock_irqrestore(&dm_device.ha_lock, flags);

	return ret;
}

/*
 * Back (and online where possible) @pg_count pages starting at @pg_start
 * within the matching hot-add region. Returns the number of newly covered
 * pages (0 if no region matches).
 */
static unsigned long handle_pg_range(unsigned long pg_start,
					unsigned long pg_count)
{
	unsigned long start_pfn = pg_start;
	unsigned long pfn_cnt = pg_count;
	unsigned long size;
	struct hv_hotadd_state *has;
	unsigned long pgs_ol = 0;
	unsigned long old_covered_state;
	unsigned long res = 0, flags;

	pr_debug("Hot adding %lu pages starting at pfn 0x%lx.\n", pg_count,
		pg_start);

	spin_lock_irqsave(&dm_device.ha_lock, flags);
	list_for_each_entry(has, &dm_device.ha_region_list, list) {
		/*
		 * If the pfn range we are dealing with is not in the current
		 * "hot add block", move on.
		 */
		if (start_pfn < has->start_pfn || start_pfn >= has->end_pfn)
			continue;

		old_covered_state = has->covered_end_pfn;

		if (start_pfn < has->ha_end_pfn) {
			/*
			 * This is the case where we are backing pages
			 * in an already hot added region. Bring
			 * these pages online first.
			 */
			pgs_ol = has->ha_end_pfn - start_pfn;
			if (pgs_ol > pfn_cnt)
				pgs_ol = pfn_cnt;

			has->covered_end_pfn +=  pgs_ol;
			pfn_cnt -= pgs_ol;
			/*
			 * Check if the corresponding memory block is already
			 * online. It is possible to observe struct pages still
			 * being uninitialized here so check section instead.
			 * In case the section is online we need to bring the
			 * rest of pfns (which were not backed previously)
			 * online too.
			 */
			if (start_pfn > has->start_pfn &&
			    online_section_nr(pfn_to_section_nr(start_pfn)))
				hv_bring_pgs_online(has, start_pfn, pgs_ol);

		}

		if ((has->ha_end_pfn < has->end_pfn) && (pfn_cnt > 0)) {
			/*
			 * We have some residual hot add range
			 * that needs to be hot added; hot add
			 * it now. Hot add a multiple of
			 * HA_CHUNK that fully covers the pages
			 * we have.
			 */
			size = (has->end_pfn - has->ha_end_pfn);
			if (pfn_cnt <= size) {
				size = ((pfn_cnt / HA_CHUNK) * HA_CHUNK);
				if (pfn_cnt % HA_CHUNK)
					size += HA_CHUNK;
			} else {
				pfn_cnt = size;
			}
			/*
			 * Drop the lock: hv_mem_hot_add() calls add_memory(),
			 * which may sleep.
			 */
			spin_unlock_irqrestore(&dm_device.ha_lock, flags);
			hv_mem_hot_add(has->ha_end_pfn, size, pfn_cnt, has);
			spin_lock_irqsave(&dm_device.ha_lock, flags);
		}
		/*
		 * If we managed to online any pages that were given to us,
		 * we declare success.
		 */
		res = has->covered_end_pfn - old_covered_state;
		break;
	}
	spin_unlock_irqrestore(&dm_device.ha_lock, flags);

	return res;
}

/*
 * Entry point for a hot-add request: try to satisfy it from an existing
 * region; otherwise create a new region covering [rg_start, rg_start +
 * rg_size) and then back/online the requested pages. Returns the number
 * of pages handled (0 on failure).
 */
static unsigned long process_hot_add(unsigned long pg_start,
					unsigned long pfn_cnt,
					unsigned long rg_start,
					unsigned long rg_size)
{
	struct hv_hotadd_state *ha_region = NULL;
	int covered;
	unsigned long flags;

	if (pfn_cnt == 0)
		return 0;

	if (!dm_device.host_specified_ha_region) {
		covered = pfn_covered(pg_start, pfn_cnt);
		if (covered < 0)
			return 0;

		if (covered)
			goto do_pg_range;
	}

	/*
	 * If the host has specified a hot-add range; deal with it first.
	 */

	if (rg_size != 0) {
		ha_region = kzalloc(sizeof(struct hv_hotadd_state), GFP_KERNEL);
		if (!ha_region)
			return 0;

		INIT_LIST_HEAD(&ha_region->list);
		INIT_LIST_HEAD(&ha_region->gap_list);

		ha_region->start_pfn = rg_start;
		ha_region->ha_end_pfn = rg_start;
		ha_region->covered_start_pfn = pg_start;
		ha_region->covered_end_pfn = pg_start;
		ha_region->end_pfn = rg_start + rg_size;

		spin_lock_irqsave(&dm_device.ha_lock, flags);
		list_add_tail(&ha_region->list, &dm_device.ha_region_list);
		spin_unlock_irqrestore(&dm_device.ha_lock, flags);
	}

do_pg_range:
	/*
	 * Process the page range specified; bringing them
	 * online if possible.
	 */
	return handle_pg_range(pg_start, pfn_cnt);
}

#endif

/*
 * Work-queue handler for a DM_MEM_HOT_ADD_REQUEST from the host.
 * Performs the hot add (when CONFIG_MEMORY_HOTPLUG and the hot_add
 * module parameter allow it) and sends DM_MEM_HOT_ADD_RESPONSE back.
 */
static void hot_add_req(struct work_struct *dummy)
{
	struct dm_hot_add_response resp;
#ifdef CONFIG_MEMORY_HOTPLUG
	unsigned long pg_start, pfn_cnt;
	unsigned long rg_start, rg_sz;
#endif
	struct hv_dynmem_device *dm = &dm_device;

	memset(&resp, 0, sizeof(struct dm_hot_add_response));
	resp.hdr.type = DM_MEM_HOT_ADD_RESPONSE;
	resp.hdr.size = sizeof(struct dm_hot_add_response);

#ifdef CONFIG_MEMORY_HOTPLUG
	pg_start = dm->ha_wrk.ha_page_range.finfo.start_page;
	pfn_cnt = dm->ha_wrk.ha_page_range.finfo.page_cnt;

	rg_start = dm->ha_wrk.ha_region_range.finfo.start_page;
	rg_sz = dm->ha_wrk.ha_region_range.finfo.page_cnt;

	if ((rg_start == 0) && (!dm->host_specified_ha_region)) {
		unsigned long region_size;
		unsigned long region_start;

		/*
		 * The host has not specified the hot-add region.
		 * Based on the hot-add page range being specified,
		 * compute a hot-add region that can cover the pages
		 * that need to be hot-added while ensuring the alignment
		 * and size requirements of Linux as it relates to hot-add.
		 */
		region_start = pg_start;
		region_size = (pfn_cnt / HA_CHUNK) * HA_CHUNK;
		if (pfn_cnt % HA_CHUNK)
			region_size += HA_CHUNK;

		/* Round the region start down to a HA_CHUNK boundary. */
		region_start = (pg_start / HA_CHUNK) * HA_CHUNK;

		rg_start = region_start;
		rg_sz = region_size;
	}

	if (do_hot_add)
		resp.page_count = process_hot_add(pg_start, pfn_cnt,
						rg_start, rg_sz);

	dm->num_pages_added += resp.page_count;
#endif
	/*
	 * The result field of the response structure has the
	 * following semantics:
	 *
	 * 1. If all or some pages hot-added: Guest should return success.
	 *
	 * 2. If no pages could be hot-added:
	 *
	 * If the guest returns success, then the host
	 * will not attempt any further hot-add operations. This
	 * signifies a permanent failure.
	 *
	 * If the guest returns failure, then this failure will be
	 * treated as a transient failure and the host may retry the
	 * hot-add operation after some delay.
	 */
	if (resp.page_count > 0)
		resp.result = 1;
	else if (!do_hot_add)
		resp.result = 1;
	else
		resp.result = 0;

	if (!do_hot_add || resp.page_count == 0) {
		if (!allow_hibernation)
			pr_err("Memory hot add failed\n");
		else
			pr_info("Ignore hot-add request!\n");
	}

	dm->state = DM_INITIALIZED;
	resp.hdr.trans_id = atomic_inc_return(&trans_id);
	vmbus_sendpacket(dm->dev->channel, &resp,
			sizeof(struct dm_hot_add_response),
			(unsigned long)NULL,
			VM_PKT_DATA_INBAND, 0);
}

/*
 * Handle a DM_INFO_MESSAGE from the host; currently only the maximum
 * dynamic memory size (INFO_TYPE_MAX_PAGE_CNT) is recognized, and it is
 * merely logged.
 */
static void process_info(struct hv_dynmem_device *dm, struct dm_info_msg *msg)
{
	struct dm_info_header *info_hdr;

	info_hdr = (struct dm_info_header *)msg->info;

	switch (info_hdr->type) {
	case INFO_TYPE_MAX_PAGE_CNT:
		if (info_hdr->data_size == sizeof(__u64)) {
			/* Payload immediately follows the info header. */
			__u64 *max_page_count = (__u64 *)&info_hdr[1];

			pr_info("Max. dynamic memory size: %llu MB\n",
				(*max_page_count) >> (20 - HV_HYP_PAGE_SHIFT));
		}

		break;
	default:
		pr_warn("Received Unknown type: %d\n", info_hdr->type);
	}
}

/*
 * Compute the minimum number of pages we refuse to balloon below,
 * as a piecewise linear function of total RAM.
 */
static unsigned long compute_balloon_floor(void)
{
	unsigned long min_pages;
	unsigned long nr_pages = totalram_pages();
#define MB2PAGES(mb) ((mb) << (20 - PAGE_SHIFT))
	/* Simple continuous piecewise linear function:
	 *  max MiB -> min MiB  gradient
	 *       0         0
	 *      16        16
	 *      32        24
	 *     128        72    (1/2)
	 *     512       168    (1/4)
	 *    2048       360    (1/8)
	 *    8192       744    (1/16)
	 *   32768      1512	(1/32)
	 */
	if (nr_pages < MB2PAGES(128))
		min_pages = MB2PAGES(8) + (nr_pages >> 1);
	else if (nr_pages < MB2PAGES(512))
		min_pages = MB2PAGES(40) + (nr_pages >> 2);
	else if (nr_pages < MB2PAGES(2048))
		min_pages = MB2PAGES(104) + (nr_pages >> 3);
	else if (nr_pages < MB2PAGES(8192))
		min_pages = MB2PAGES(232) + (nr_pages >> 4);
	else
		min_pages = MB2PAGES(488) + (nr_pages >> 5);
#undef MB2PAGES
	return min_pages;
}

/*
 * Post our status as it relates memory pressure to the
 * host. Host expects the guests to post this status
 * periodically at 1 second intervals.
 *
 * The metrics specified in this protocol are very Windows
 * specific and so we cook up numbers here to convey our memory
 * pressure.
 */

static void post_status(struct hv_dynmem_device *dm)
{
	struct dm_status status;
	unsigned long now = jiffies;
	unsigned long last_post = last_post_time;

	/* Honor the module-parameter startup delay (decremented per call). */
	if (pressure_report_delay > 0) {
		--pressure_report_delay;
		return;
	}

	/* Rate-limit reports to roughly one per second. */
	if (!time_after(now, (last_post_time + HZ)))
		return;

	memset(&status, 0, sizeof(struct dm_status));
	status.hdr.type = DM_STATUS_REPORT;
	status.hdr.size = sizeof(struct dm_status);
	status.hdr.trans_id = atomic_inc_return(&trans_id);

	/*
	 * The host expects the guest to report free and committed memory.
	 * Furthermore, the host expects the pressure information to include
	 * the ballooned out pages. For a given amount of memory that we are
	 * managing we need to compute a floor below which we should not
	 * balloon. Compute this and add it to the pressure report.
	 * We also need to report all offline pages (num_pages_added -
	 * num_pages_onlined) as committed to the host, otherwise it can try
	 * asking us to balloon them out.
	 */
	status.num_avail = si_mem_available();
	status.num_committed = vm_memory_committed() +
		dm->num_pages_ballooned +
		(dm->num_pages_added > dm->num_pages_onlined ?
		 dm->num_pages_added - dm->num_pages_onlined : 0) +
		compute_balloon_floor();

	trace_balloon_status(status.num_avail, status.num_committed,
			     vm_memory_committed(), dm->num_pages_ballooned,
			     dm->num_pages_added, dm->num_pages_onlined);
	/*
	 * If our transaction ID is no longer current, just don't
	 * send the status. This can happen if we were interrupted
	 * after we picked our transaction ID.
	 */
	if (status.hdr.trans_id != atomic_read(&trans_id))
		return;

	/*
	 * If the last post time that we sampled has changed,
	 * we have raced, don't post the status.
1182 */ 1183 if (last_post != last_post_time) 1184 return; 1185 1186 last_post_time = jiffies; 1187 vmbus_sendpacket(dm->dev->channel, &status, 1188 sizeof(struct dm_status), 1189 (unsigned long)NULL, 1190 VM_PKT_DATA_INBAND, 0); 1191 1192 } 1193 1194 static void free_balloon_pages(struct hv_dynmem_device *dm, 1195 union dm_mem_page_range *range_array) 1196 { 1197 int num_pages = range_array->finfo.page_cnt; 1198 __u64 start_frame = range_array->finfo.start_page; 1199 struct page *pg; 1200 int i; 1201 1202 for (i = 0; i < num_pages; i++) { 1203 pg = pfn_to_page(i + start_frame); 1204 __ClearPageOffline(pg); 1205 __free_page(pg); 1206 dm->num_pages_ballooned--; 1207 } 1208 } 1209 1210 1211 1212 static unsigned int alloc_balloon_pages(struct hv_dynmem_device *dm, 1213 unsigned int num_pages, 1214 struct dm_balloon_response *bl_resp, 1215 int alloc_unit) 1216 { 1217 unsigned int i, j; 1218 struct page *pg; 1219 1220 for (i = 0; i < num_pages / alloc_unit; i++) { 1221 if (bl_resp->hdr.size + sizeof(union dm_mem_page_range) > 1222 HV_HYP_PAGE_SIZE) 1223 return i * alloc_unit; 1224 1225 /* 1226 * We execute this code in a thread context. Furthermore, 1227 * we don't want the kernel to try too hard. 1228 */ 1229 pg = alloc_pages(GFP_HIGHUSER | __GFP_NORETRY | 1230 __GFP_NOMEMALLOC | __GFP_NOWARN, 1231 get_order(alloc_unit << PAGE_SHIFT)); 1232 1233 if (!pg) 1234 return i * alloc_unit; 1235 1236 dm->num_pages_ballooned += alloc_unit; 1237 1238 /* 1239 * If we allocatted 2M pages; split them so we 1240 * can free them in any order we get. 
		 */

		if (alloc_unit != 1)
			split_page(pg, get_order(alloc_unit << PAGE_SHIFT));

		/* mark all pages offline */
		for (j = 0; j < (1 << get_order(alloc_unit << PAGE_SHIFT)); j++)
			__SetPageOffline(pg + j);

		bl_resp->range_count++;
		bl_resp->range_array[i].finfo.start_page =
			page_to_pfn(pg);
		bl_resp->range_array[i].finfo.page_cnt = alloc_unit;
		bl_resp->hdr.size += sizeof(union dm_mem_page_range);

	}

	return i * alloc_unit;
}

/*
 * Work-queue handler for a DM_BALLOON_REQUEST. Allocates the number of
 * pages the host asked for (capped so available memory does not drop
 * below the balloon floor) and reports them to the host in one or more
 * DM_BALLOON_RESPONSE packets built in balloon_up_send_buffer.
 */
static void balloon_up(struct work_struct *dummy)
{
	unsigned int num_pages = dm_device.balloon_wrk.num_pages;
	unsigned int num_ballooned = 0;
	struct dm_balloon_response *bl_resp;
	int alloc_unit;
	int ret;
	bool done = false;
	int i;
	long avail_pages;
	unsigned long floor;

	/*
	 * We will attempt 2M allocations. However, if we fail to
	 * allocate 2M chunks, we will go back to PAGE_SIZE allocations.
	 */
	alloc_unit = PAGES_IN_2M;

	avail_pages = si_mem_available();
	floor = compute_balloon_floor();

	/* Refuse to balloon below the floor. */
	if (avail_pages < num_pages || avail_pages - num_pages < floor) {
		pr_warn("Balloon request will be partially fulfilled. %s\n",
			avail_pages < num_pages ? "Not enough memory." :
			"Balloon floor reached.");

		/* Clamp the request to what can be given without crossing the floor. */
		num_pages = avail_pages > floor ?
				(avail_pages - floor) : 0;
	}

	while (!done) {
		memset(balloon_up_send_buffer, 0, HV_HYP_PAGE_SIZE);
		bl_resp = (struct dm_balloon_response *)balloon_up_send_buffer;
		bl_resp->hdr.type = DM_BALLOON_RESPONSE;
		bl_resp->hdr.size = sizeof(struct dm_balloon_response);
		/* Assume more responses follow; cleared below on the last one. */
		bl_resp->more_pages = 1;

		num_pages -= num_ballooned;
		num_ballooned = alloc_balloon_pages(&dm_device, num_pages,
						    bl_resp, alloc_unit);

		/* 2M allocation failed outright: retry with single pages. */
		if (alloc_unit != 1 && num_ballooned == 0) {
			alloc_unit = 1;
			continue;
		}

		if (num_ballooned == 0 || num_ballooned == num_pages) {
			pr_debug("Ballooned %u out of %u requested pages.\n",
				 num_pages, dm_device.balloon_wrk.num_pages);

			bl_resp->more_pages = 0;
			done = true;
			dm_device.state = DM_INITIALIZED;
		}

		/*
		 * We are pushing a lot of data through the channel;
		 * deal with transient failures caused because of the
		 * lack of space in the ring buffer.
		 */

		do {
			bl_resp->hdr.trans_id = atomic_inc_return(&trans_id);
			ret = vmbus_sendpacket(dm_device.dev->channel,
					       bl_resp,
					       bl_resp->hdr.size,
					       (unsigned long)NULL,
					       VM_PKT_DATA_INBAND, 0);

			if (ret == -EAGAIN)
				msleep(20);
			/* Keep the 1-second status heartbeat alive while retrying. */
			post_status(&dm_device);
		} while (ret == -EAGAIN);

		if (ret) {
			/*
			 * Free up the memory we allocated, since the host
			 * never learned about these ranges.
			 */
			pr_err("Balloon response failed\n");

			for (i = 0; i < bl_resp->range_count; i++)
				free_balloon_pages(&dm_device,
						   &bl_resp->range_array[i]);

			done = true;
		}
	}

}

/*
 * Handle a DM_UNBALLOON_REQUEST: return the listed page ranges to the
 * guest and, once the host signals the final request of the series
 * (more_pages == 0), acknowledge with a DM_UNBALLOON_RESPONSE and go
 * back to the DM_INITIALIZED state.
 */
static void balloon_down(struct hv_dynmem_device *dm,
			 struct dm_unballoon_request *req)
{
	union dm_mem_page_range *range_array = req->range_array;
	int range_count = req->range_count;
	struct dm_unballoon_response resp;
	int i;
	unsigned int prev_pages_ballooned = dm->num_pages_ballooned;

	for (i = 0; i < range_count; i++) {
		free_balloon_pages(dm, &range_array[i]);
		/* Wake dm_thread_func() so it can post status promptly. */
		complete(&dm_device.config_event);
	}

	pr_debug("Freed %u ballooned pages.\n",
		 prev_pages_ballooned - dm->num_pages_ballooned);

	if (req->more_pages == 1)
		return;

	memset(&resp, 0, sizeof(struct dm_unballoon_response));
	resp.hdr.type = DM_UNBALLOON_RESPONSE;
	resp.hdr.trans_id = atomic_inc_return(&trans_id);
	resp.hdr.size = sizeof(struct dm_unballoon_response);

	vmbus_sendpacket(dm_device.dev->channel, &resp,
			 sizeof(struct dm_unballoon_response),
			 (unsigned long)NULL,
			 VM_PKT_DATA_INBAND, 0);

	dm->state = DM_INITIALIZED;
}

static void balloon_onchannelcallback(void *context);

/*
 * Kernel thread that posts a memory-pressure status report to the host
 * about once a second (woken early via config_event when there is
 * fresh information to report).
 */
static int dm_thread_func(void *dm_dev)
{
	struct hv_dynmem_device *dm = dm_dev;

	while (!kthread_should_stop()) {
		wait_for_completion_interruptible_timeout(
						&dm_device.config_event, 1*HZ);
		/*
		 * The host expects us to post information on the memory
		 * pressure every second.
		 */
		reinit_completion(&dm_device.config_event);
		post_status(dm);
	}

	return 0;
}


/*
 * Handle the host's DM_VERSION_RESPONSE. If the proposed version was
 * accepted, wake the thread waiting in balloon_connect_vsp(). If it was
 * rejected, retry with the next lower protocol version; when no version
 * is left to try, mark the device DM_INIT_ERROR and give up.
 */
static void version_resp(struct hv_dynmem_device *dm,
			 struct dm_version_response *vresp)
{
	struct dm_version_request version_req;
	int ret;

	if (vresp->is_accepted) {
		/*
		 * We are done; wakeup the
		 * context waiting for version
		 * negotiation.
		 */
		complete(&dm->host_event);
		return;
	}
	/*
	 * If there are more versions to try, continue
	 * with negotiations; if not
	 * shutdown the service since we are not able
	 * to negotiate a suitable version number
	 * with the host.
	 */
	if (dm->next_version == 0)
		goto version_error;

	memset(&version_req, 0, sizeof(struct dm_version_request));
	version_req.hdr.type = DM_VERSION_REQUEST;
	version_req.hdr.size = sizeof(struct dm_version_request);
	version_req.hdr.trans_id = atomic_inc_return(&trans_id);
	version_req.version.version = dm->next_version;
	dm->version = version_req.version.version;

	/*
	 * Set the next version to try in case current version fails.
	 * Win7 protocol ought to be the last one to try.
	 */
	switch (version_req.version.version) {
	case DYNMEM_PROTOCOL_VERSION_WIN8:
		dm->next_version = DYNMEM_PROTOCOL_VERSION_WIN7;
		version_req.is_last_attempt = 0;
		break;
	default:
		dm->next_version = 0;
		version_req.is_last_attempt = 1;
	}

	ret = vmbus_sendpacket(dm->dev->channel, &version_req,
			       sizeof(struct dm_version_request),
			       (unsigned long)NULL,
			       VM_PKT_DATA_INBAND, 0);

	if (ret)
		goto version_error;

	return;

version_error:
	dm->state = DM_INIT_ERROR;
	complete(&dm->host_event);
}

/*
 * Handle the host's DM_CAPABILITIES_RESPONSE: record rejection as an
 * init error, then wake the thread waiting in balloon_connect_vsp().
 */
static void cap_resp(struct hv_dynmem_device *dm,
		     struct dm_capabilities_resp_msg *cap_resp)
{
	if (!cap_resp->is_accepted) {
		pr_err("Capabilities not accepted by host\n");
		dm->state = DM_INIT_ERROR;
	}
	complete(&dm->host_event);
}

/*
 * VMBus channel callback: read one Dynamic Memory protocol message from
 * the host into recv_buffer and dispatch it by header type. Long-running
 * operations (balloon up, hot add) are deferred to work-queue items so
 * the channel callback itself stays short.
 */
static void balloon_onchannelcallback(void *context)
{
	struct hv_device *dev = context;
	u32 recvlen;
	u64 requestid;
	struct dm_message *dm_msg;
	struct dm_header *dm_hdr;
	struct hv_dynmem_device *dm = hv_get_drvdata(dev);
	struct dm_balloon *bal_msg;
	struct dm_hot_add *ha_msg;
	union dm_mem_page_range *ha_pg_range;
	union dm_mem_page_range *ha_region;

	memset(recv_buffer, 0, sizeof(recv_buffer));
	vmbus_recvpacket(dev->channel, recv_buffer,
			 HV_HYP_PAGE_SIZE, &recvlen, &requestid);

	if (recvlen > 0) {
		dm_msg = (struct dm_message *)recv_buffer;
		dm_hdr = &dm_msg->hdr;

		switch (dm_hdr->type) {
		case DM_VERSION_RESPONSE:
			version_resp(dm,
				     (struct dm_version_response *)dm_msg);
			break;

		case DM_CAPABILITIES_RESPONSE:
			cap_resp(dm,
				 (struct dm_capabilities_resp_msg *)dm_msg);
			break;

		case DM_BALLOON_REQUEST:
			if (allow_hibernation) {
				pr_info("Ignore balloon-up request!\n");
				break;
			}

			if (dm->state == DM_BALLOON_UP)
				pr_warn("Currently ballooning\n");
			bal_msg = (struct dm_balloon *)recv_buffer;
			dm->state = DM_BALLOON_UP;
			dm_device.balloon_wrk.num_pages = bal_msg->num_pages;
			/* Heavy lifting happens in balloon_up() on a workqueue. */
			schedule_work(&dm_device.balloon_wrk.wrk);
			break;

		case DM_UNBALLOON_REQUEST:
			if (allow_hibernation) {
				pr_info("Ignore balloon-down request!\n");
				break;
			}

			dm->state = DM_BALLOON_DOWN;
			balloon_down(dm,
				     (struct dm_unballoon_request *)recv_buffer);
			break;

		case DM_MEM_HOT_ADD_REQUEST:
			if (dm->state == DM_HOT_ADD)
				pr_warn("Currently hot-adding\n");
			dm->state = DM_HOT_ADD;
			ha_msg = (struct dm_hot_add *)recv_buffer;
			if (ha_msg->hdr.size == sizeof(struct dm_hot_add)) {
				/*
				 * This is a normal hot-add request specifying
				 * hot-add memory.
				 */
				dm->host_specified_ha_region = false;
				ha_pg_range = &ha_msg->range;
				dm->ha_wrk.ha_page_range = *ha_pg_range;
				dm->ha_wrk.ha_region_range.page_range = 0;
			} else {
				/*
				 * Host is specifying that we first hot-add
				 * a region and then partially populate this
				 * region.
				 */
				dm->host_specified_ha_region = true;
				ha_pg_range = &ha_msg->range;
				/* The region descriptor immediately follows the page range. */
				ha_region = &ha_pg_range[1];
				dm->ha_wrk.ha_page_range = *ha_pg_range;
				dm->ha_wrk.ha_region_range = *ha_region;
			}
			schedule_work(&dm_device.ha_wrk.wrk);
			break;

		case DM_INFO_MESSAGE:
			process_info(dm, (struct dm_info_msg *)dm_msg);
			break;

		default:
			pr_warn("Unhandled message: type: %d\n", dm_hdr->type);

		}
	}

}

/*
 * Open the VMBus channel, negotiate a Dynamic Memory protocol version
 * (highest first, see version_resp() for the fallback chain) and report
 * our capabilities to the host. Returns 0 on success; on any failure
 * the channel is closed and a negative errno is returned.
 */
static int balloon_connect_vsp(struct hv_device *dev)
{
	struct dm_version_request version_req;
	struct dm_capabilities cap_msg;
	unsigned long t;
	int ret;

	ret = vmbus_open(dev->channel, dm_ring_size, dm_ring_size, NULL, 0,
			 balloon_onchannelcallback, dev);
	if (ret)
		return ret;

	/*
	 * Initiate the hand shake with the host and negotiate
	 * a version that the host can support. We start with the
	 * highest version number and go down if the host cannot
	 * support it.
	 */
	memset(&version_req, 0, sizeof(struct dm_version_request));
	version_req.hdr.type = DM_VERSION_REQUEST;
	version_req.hdr.size = sizeof(struct dm_version_request);
	version_req.hdr.trans_id = atomic_inc_return(&trans_id);
	version_req.version.version = DYNMEM_PROTOCOL_VERSION_WIN10;
	version_req.is_last_attempt = 0;
	dm_device.version = version_req.version.version;

	ret = vmbus_sendpacket(dev->channel, &version_req,
			       sizeof(struct dm_version_request),
			       (unsigned long)NULL, VM_PKT_DATA_INBAND, 0);
	if (ret)
		goto out;

	/* version_resp()/cap_resp() complete host_event from the channel callback. */
	t = wait_for_completion_timeout(&dm_device.host_event, 5*HZ);
	if (t == 0) {
		ret = -ETIMEDOUT;
		goto out;
	}

	/*
	 * If we could not negotiate a compatible version with the host
	 * fail the probe function.
	 */
	if (dm_device.state == DM_INIT_ERROR) {
		ret = -EPROTO;
		goto out;
	}

	pr_info("Using Dynamic Memory protocol version %u.%u\n",
		DYNMEM_MAJOR_VERSION(dm_device.version),
		DYNMEM_MINOR_VERSION(dm_device.version));

	/*
	 * Now submit our capabilities to the host.
	 */
	memset(&cap_msg, 0, sizeof(struct dm_capabilities));
	cap_msg.hdr.type = DM_CAPABILITIES_REPORT;
	cap_msg.hdr.size = sizeof(struct dm_capabilities);
	cap_msg.hdr.trans_id = atomic_inc_return(&trans_id);

	/*
	 * When hibernation (i.e. virtual ACPI S4 state) is enabled, the host
	 * currently still requires the bits to be set, so we have to add code
	 * to fail the host's hot-add and balloon up/down requests, if any.
	 */
	cap_msg.caps.cap_bits.balloon = 1;
	cap_msg.caps.cap_bits.hot_add = 1;

	/*
	 * Specify our alignment requirements as it relates
	 * memory hot-add. Specify 128MB alignment.
	 */
	cap_msg.caps.cap_bits.hot_add_alignment = 7;

	/*
	 * Currently the host does not use these
	 * values and we set them to what is done in the
	 * Windows driver.
	 */
	cap_msg.min_page_cnt = 0;
	cap_msg.max_page_number = -1;

	ret = vmbus_sendpacket(dev->channel, &cap_msg,
			       sizeof(struct dm_capabilities),
			       (unsigned long)NULL, VM_PKT_DATA_INBAND, 0);
	if (ret)
		goto out;

	t = wait_for_completion_timeout(&dm_device.host_event, 5*HZ);
	if (t == 0) {
		ret = -ETIMEDOUT;
		goto out;
	}

	/*
	 * If the host does not like our capabilities,
	 * fail the probe function.
1670 */ 1671 if (dm_device.state == DM_INIT_ERROR) { 1672 ret = -EPROTO; 1673 goto out; 1674 } 1675 1676 return 0; 1677 out: 1678 vmbus_close(dev->channel); 1679 return ret; 1680 } 1681 1682 static int balloon_probe(struct hv_device *dev, 1683 const struct hv_vmbus_device_id *dev_id) 1684 { 1685 int ret; 1686 1687 allow_hibernation = hv_is_hibernation_supported(); 1688 if (allow_hibernation) 1689 hot_add = false; 1690 1691 #ifdef CONFIG_MEMORY_HOTPLUG 1692 do_hot_add = hot_add; 1693 #else 1694 do_hot_add = false; 1695 #endif 1696 dm_device.dev = dev; 1697 dm_device.state = DM_INITIALIZING; 1698 dm_device.next_version = DYNMEM_PROTOCOL_VERSION_WIN8; 1699 init_completion(&dm_device.host_event); 1700 init_completion(&dm_device.config_event); 1701 INIT_LIST_HEAD(&dm_device.ha_region_list); 1702 spin_lock_init(&dm_device.ha_lock); 1703 INIT_WORK(&dm_device.balloon_wrk.wrk, balloon_up); 1704 INIT_WORK(&dm_device.ha_wrk.wrk, hot_add_req); 1705 dm_device.host_specified_ha_region = false; 1706 1707 #ifdef CONFIG_MEMORY_HOTPLUG 1708 set_online_page_callback(&hv_online_page); 1709 register_memory_notifier(&hv_memory_nb); 1710 #endif 1711 1712 hv_set_drvdata(dev, &dm_device); 1713 1714 ret = balloon_connect_vsp(dev); 1715 if (ret != 0) 1716 return ret; 1717 1718 dm_device.state = DM_INITIALIZED; 1719 1720 dm_device.thread = 1721 kthread_run(dm_thread_func, &dm_device, "hv_balloon"); 1722 if (IS_ERR(dm_device.thread)) { 1723 ret = PTR_ERR(dm_device.thread); 1724 goto probe_error; 1725 } 1726 1727 return 0; 1728 1729 probe_error: 1730 dm_device.state = DM_INIT_ERROR; 1731 dm_device.thread = NULL; 1732 vmbus_close(dev->channel); 1733 #ifdef CONFIG_MEMORY_HOTPLUG 1734 unregister_memory_notifier(&hv_memory_nb); 1735 restore_online_page_callback(&hv_online_page); 1736 #endif 1737 return ret; 1738 } 1739 1740 static int balloon_remove(struct hv_device *dev) 1741 { 1742 struct hv_dynmem_device *dm = hv_get_drvdata(dev); 1743 struct hv_hotadd_state *has, *tmp; 1744 struct hv_hotadd_gap 
*gap, *tmp_gap; 1745 unsigned long flags; 1746 1747 if (dm->num_pages_ballooned != 0) 1748 pr_warn("Ballooned pages: %d\n", dm->num_pages_ballooned); 1749 1750 cancel_work_sync(&dm->balloon_wrk.wrk); 1751 cancel_work_sync(&dm->ha_wrk.wrk); 1752 1753 kthread_stop(dm->thread); 1754 vmbus_close(dev->channel); 1755 #ifdef CONFIG_MEMORY_HOTPLUG 1756 unregister_memory_notifier(&hv_memory_nb); 1757 restore_online_page_callback(&hv_online_page); 1758 #endif 1759 spin_lock_irqsave(&dm_device.ha_lock, flags); 1760 list_for_each_entry_safe(has, tmp, &dm->ha_region_list, list) { 1761 list_for_each_entry_safe(gap, tmp_gap, &has->gap_list, list) { 1762 list_del(&gap->list); 1763 kfree(gap); 1764 } 1765 list_del(&has->list); 1766 kfree(has); 1767 } 1768 spin_unlock_irqrestore(&dm_device.ha_lock, flags); 1769 1770 return 0; 1771 } 1772 1773 static int balloon_suspend(struct hv_device *hv_dev) 1774 { 1775 struct hv_dynmem_device *dm = hv_get_drvdata(hv_dev); 1776 1777 tasklet_disable(&hv_dev->channel->callback_event); 1778 1779 cancel_work_sync(&dm->balloon_wrk.wrk); 1780 cancel_work_sync(&dm->ha_wrk.wrk); 1781 1782 if (dm->thread) { 1783 kthread_stop(dm->thread); 1784 dm->thread = NULL; 1785 vmbus_close(hv_dev->channel); 1786 } 1787 1788 tasklet_enable(&hv_dev->channel->callback_event); 1789 1790 return 0; 1791 1792 } 1793 1794 static int balloon_resume(struct hv_device *dev) 1795 { 1796 int ret; 1797 1798 dm_device.state = DM_INITIALIZING; 1799 1800 ret = balloon_connect_vsp(dev); 1801 1802 if (ret != 0) 1803 goto out; 1804 1805 dm_device.thread = 1806 kthread_run(dm_thread_func, &dm_device, "hv_balloon"); 1807 if (IS_ERR(dm_device.thread)) { 1808 ret = PTR_ERR(dm_device.thread); 1809 dm_device.thread = NULL; 1810 goto close_channel; 1811 } 1812 1813 dm_device.state = DM_INITIALIZED; 1814 return 0; 1815 close_channel: 1816 vmbus_close(dev->channel); 1817 out: 1818 dm_device.state = DM_INIT_ERROR; 1819 #ifdef CONFIG_MEMORY_HOTPLUG 1820 unregister_memory_notifier(&hv_memory_nb); 
1821 restore_online_page_callback(&hv_online_page); 1822 #endif 1823 return ret; 1824 } 1825 1826 static const struct hv_vmbus_device_id id_table[] = { 1827 /* Dynamic Memory Class ID */ 1828 /* 525074DC-8985-46e2-8057-A307DC18A502 */ 1829 { HV_DM_GUID, }, 1830 { }, 1831 }; 1832 1833 MODULE_DEVICE_TABLE(vmbus, id_table); 1834 1835 static struct hv_driver balloon_drv = { 1836 .name = "hv_balloon", 1837 .id_table = id_table, 1838 .probe = balloon_probe, 1839 .remove = balloon_remove, 1840 .suspend = balloon_suspend, 1841 .resume = balloon_resume, 1842 .driver = { 1843 .probe_type = PROBE_PREFER_ASYNCHRONOUS, 1844 }, 1845 }; 1846 1847 static int __init init_balloon_drv(void) 1848 { 1849 1850 return vmbus_driver_register(&balloon_drv); 1851 } 1852 1853 module_init(init_balloon_drv); 1854 1855 MODULE_DESCRIPTION("Hyper-V Balloon"); 1856 MODULE_LICENSE("GPL"); 1857