1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright (c) 2012, Microsoft Corporation. 4 * 5 * Author: 6 * K. Y. Srinivasan <kys@microsoft.com> 7 */ 8 9 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 10 11 #include <linux/kernel.h> 12 #include <linux/jiffies.h> 13 #include <linux/mman.h> 14 #include <linux/delay.h> 15 #include <linux/init.h> 16 #include <linux/module.h> 17 #include <linux/slab.h> 18 #include <linux/kthread.h> 19 #include <linux/completion.h> 20 #include <linux/memory_hotplug.h> 21 #include <linux/memory.h> 22 #include <linux/notifier.h> 23 #include <linux/percpu_counter.h> 24 25 #include <linux/hyperv.h> 26 #include <asm/hyperv-tlfs.h> 27 28 #include <asm/mshyperv.h> 29 30 #define CREATE_TRACE_POINTS 31 #include "hv_trace_balloon.h" 32 33 /* 34 * We begin with definitions supporting the Dynamic Memory protocol 35 * with the host. 36 * 37 * Begin protocol definitions. 38 */ 39 40 41 42 /* 43 * Protocol versions. The low word is the minor version, the high word the major 44 * version. 
45 * 46 * History: 47 * Initial version 1.0 48 * Changed to 0.1 on 2009/03/25 49 * Changes to 0.2 on 2009/05/14 50 * Changes to 0.3 on 2009/12/03 51 * Changed to 1.0 on 2011/04/05 52 */ 53 54 #define DYNMEM_MAKE_VERSION(Major, Minor) ((__u32)(((Major) << 16) | (Minor))) 55 #define DYNMEM_MAJOR_VERSION(Version) ((__u32)(Version) >> 16) 56 #define DYNMEM_MINOR_VERSION(Version) ((__u32)(Version) & 0xff) 57 58 enum { 59 DYNMEM_PROTOCOL_VERSION_1 = DYNMEM_MAKE_VERSION(0, 3), 60 DYNMEM_PROTOCOL_VERSION_2 = DYNMEM_MAKE_VERSION(1, 0), 61 DYNMEM_PROTOCOL_VERSION_3 = DYNMEM_MAKE_VERSION(2, 0), 62 63 DYNMEM_PROTOCOL_VERSION_WIN7 = DYNMEM_PROTOCOL_VERSION_1, 64 DYNMEM_PROTOCOL_VERSION_WIN8 = DYNMEM_PROTOCOL_VERSION_2, 65 DYNMEM_PROTOCOL_VERSION_WIN10 = DYNMEM_PROTOCOL_VERSION_3, 66 67 DYNMEM_PROTOCOL_VERSION_CURRENT = DYNMEM_PROTOCOL_VERSION_WIN10 68 }; 69 70 71 72 /* 73 * Message Types 74 */ 75 76 enum dm_message_type { 77 /* 78 * Version 0.3 79 */ 80 DM_ERROR = 0, 81 DM_VERSION_REQUEST = 1, 82 DM_VERSION_RESPONSE = 2, 83 DM_CAPABILITIES_REPORT = 3, 84 DM_CAPABILITIES_RESPONSE = 4, 85 DM_STATUS_REPORT = 5, 86 DM_BALLOON_REQUEST = 6, 87 DM_BALLOON_RESPONSE = 7, 88 DM_UNBALLOON_REQUEST = 8, 89 DM_UNBALLOON_RESPONSE = 9, 90 DM_MEM_HOT_ADD_REQUEST = 10, 91 DM_MEM_HOT_ADD_RESPONSE = 11, 92 DM_VERSION_03_MAX = 11, 93 /* 94 * Version 1.0. 95 */ 96 DM_INFO_MESSAGE = 12, 97 DM_VERSION_1_MAX = 12 98 }; 99 100 101 /* 102 * Structures defining the dynamic memory management 103 * protocol. 104 */ 105 106 union dm_version { 107 struct { 108 __u16 minor_version; 109 __u16 major_version; 110 }; 111 __u32 version; 112 } __packed; 113 114 115 union dm_caps { 116 struct { 117 __u64 balloon:1; 118 __u64 hot_add:1; 119 /* 120 * To support guests that may have alignment 121 * limitations on hot-add, the guest can specify 122 * its alignment requirements; a value of n 123 * represents an alignment of 2^n in mega bytes. 
124 */ 125 __u64 hot_add_alignment:4; 126 __u64 reservedz:58; 127 } cap_bits; 128 __u64 caps; 129 } __packed; 130 131 union dm_mem_page_range { 132 struct { 133 /* 134 * The PFN number of the first page in the range. 135 * 40 bits is the architectural limit of a PFN 136 * number for AMD64. 137 */ 138 __u64 start_page:40; 139 /* 140 * The number of pages in the range. 141 */ 142 __u64 page_cnt:24; 143 } finfo; 144 __u64 page_range; 145 } __packed; 146 147 148 149 /* 150 * The header for all dynamic memory messages: 151 * 152 * type: Type of the message. 153 * size: Size of the message in bytes; including the header. 154 * trans_id: The guest is responsible for manufacturing this ID. 155 */ 156 157 struct dm_header { 158 __u16 type; 159 __u16 size; 160 __u32 trans_id; 161 } __packed; 162 163 /* 164 * A generic message format for dynamic memory. 165 * Specific message formats are defined later in the file. 166 */ 167 168 struct dm_message { 169 struct dm_header hdr; 170 __u8 data[]; /* enclosed message */ 171 } __packed; 172 173 174 /* 175 * Specific message types supporting the dynamic memory protocol. 176 */ 177 178 /* 179 * Version negotiation message. Sent from the guest to the host. 180 * The guest is free to try different versions until the host 181 * accepts the version. 182 * 183 * dm_version: The protocol version requested. 184 * is_last_attempt: If TRUE, this is the last version guest will request. 185 * reservedz: Reserved field, set to zero. 186 */ 187 188 struct dm_version_request { 189 struct dm_header hdr; 190 union dm_version version; 191 __u32 is_last_attempt:1; 192 __u32 reservedz:31; 193 } __packed; 194 195 /* 196 * Version response message; Host to Guest and indicates 197 * if the host has accepted the version sent by the guest. 198 * 199 * is_accepted: If TRUE, host has accepted the version and the guest 200 * should proceed to the next stage of the protocol. FALSE indicates that 201 * guest should re-try with a different version. 
 *
 * reservedz: Reserved field, set to zero.
 */

struct dm_version_response {
	struct dm_header hdr;
	__u64 is_accepted:1;
	__u64 reservedz:63;
} __packed;

/*
 * Message reporting capabilities. This is sent from the guest to the
 * host.
 *
 * caps: The capability bits of the guest (see union dm_caps).
 * min_page_cnt, max_page_number: NOTE(review): not described in this
 * file; confirm their meaning against the protocol specification.
 */

struct dm_capabilities {
	struct dm_header hdr;
	union dm_caps caps;
	__u64 min_page_cnt;
	__u64 max_page_number;
} __packed;

/*
 * Response to the capabilities message. This is sent from the host to the
 * guest. This message notifies if the host has accepted the guest's
 * capabilities. If the host has not accepted, the guest must shutdown
 * the service.
 *
 * is_accepted: Indicates if the host has accepted guest's capabilities.
 * reservedz: Must be 0.
 */

struct dm_capabilities_resp_msg {
	struct dm_header hdr;
	__u64 is_accepted:1;
	__u64 reservedz:63;
} __packed;

/*
 * This message is used to report memory pressure from the guest.
 * This message is not part of any transaction and there is no
 * response to this message.
 *
 * num_avail: Available memory in pages.
 * num_committed: Committed memory in pages.
 * page_file_size: The accumulated size of all page files
 *		   in the system in pages.
 * zero_free: The number of zero and free pages.
 * page_file_writes: The writes to the page file in pages.
 * io_diff: An indicator of file cache efficiency or page file activity,
 *	    calculated as File Cache Page Fault Count - Page Read Count.
 *	    This value is in pages.
 *
 * Some of these metrics are Windows specific and fortunately
 * the algorithm on the host side that computes the guest memory
 * pressure only uses num_committed value.
 */

struct dm_status {
	struct dm_header hdr;
	__u64 num_avail;
	__u64 num_committed;
	__u64 page_file_size;
	__u64 zero_free;
	__u32 page_file_writes;
	__u32 io_diff;
} __packed;


/*
 * Message to ask the guest to allocate memory - balloon up message.
 * This message is sent from the host to the guest. The guest may not be
 * able to allocate as much memory as requested.
 *
 * num_pages: number of pages to allocate.
 * reservedz: Reserved field.
 */

struct dm_balloon {
	struct dm_header hdr;
	__u32 num_pages;
	__u32 reservedz;
} __packed;


/*
 * Balloon response message; this message is sent from the guest
 * to the host in response to the balloon message.
 *
 * reservedz: Reserved; must be set to zero.
 * more_pages: If FALSE, this is the last message of the transaction.
 * If TRUE there will be at least one more message from the guest.
 *
 * range_count: The number of ranges in the range array.
 *
 * range_array: An array of page ranges returned to the host.
 *
 */

struct dm_balloon_response {
	struct dm_header hdr;
	__u32 reservedz;
	__u32 more_pages:1;
	__u32 range_count:31;
	union dm_mem_page_range range_array[];
} __packed;

/*
 * Un-balloon message; this message is sent from the host
 * to the guest to give guest more memory.
 *
 * more_pages: If FALSE, this is the last message of the transaction.
 * If TRUE there will be at least one more message from the guest.
 *
 * reservedz: Reserved; must be set to zero.
 *
 * range_count: The number of ranges in the range array.
 *
 * range_array: An array of page ranges returned to the host.
 *
 */

struct dm_unballoon_request {
	struct dm_header hdr;
	__u32 more_pages:1;
	__u32 reservedz:31;
	__u32 range_count;
	union dm_mem_page_range range_array[];
} __packed;

/*
 * Un-balloon response message; this message is sent from the guest
 * to the host in response to an unballoon request.
 *
 * The message consists of the common header only.
 */

struct dm_unballoon_response {
	struct dm_header hdr;
} __packed;


/*
 * Hot add request message. Message sent from the host to the guest.
 *
 * range: Memory range to hot add.
 *
 */

struct dm_hot_add {
	struct dm_header hdr;
	union dm_mem_page_range range;
} __packed;

/*
 * Hot add response message.
 * This message is sent by the guest to report the status of a hot add request.
 * If page_count is less than the requested page count, then the host should
 * assume all further hot add requests will fail, since this indicates that
 * the guest has hit an upper physical memory barrier.
 *
 * Hot adds may also fail due to low resources; in this case, the guest must
 * not complete this message until the hot add can succeed, and the host must
 * not send a new hot add request until the response is sent.
 * If VSC fails to hot add memory DYNMEM_NUMBER_OF_UNSUCCESSFUL_HOTADD_ATTEMPTS
 * times it fails the request.
 *
 *
 * page_count: number of pages that were successfully hot added.
 *
 * result: result of the operation 1: success, 0: failure.
 *
 */

struct dm_hot_add_response {
	struct dm_header hdr;
	__u32 page_count;
	__u32 result;
} __packed;

/*
 * Types of information sent from host to the guest.
 */

enum dm_info_type {
	INFO_TYPE_MAX_PAGE_CNT = 0,
	MAX_INFO_TYPE
};


/*
 * Header for the information message.
 *
 * type: The kind of information that follows (enum dm_info_type).
 * data_size: Size of the data following this header.
 */

struct dm_info_header {
	enum dm_info_type type;
	__u32 data_size;
} __packed;

/*
 * This message is sent from the host to the guest to pass
 * some relevant information (win8 addition).
 *
 * reserved: not used.
 * info_size: size of the information blob.
 * info: information blob.
 */

struct dm_info_msg {
	struct dm_header hdr;
	__u32 reserved;
	__u32 info_size;
	__u8  info[];
};

/*
 * End protocol definitions.
 */

/*
 * State to manage hot adding memory into the guest.
 * The range start_pfn : end_pfn specifies the range
 * that the host has asked us to hot add. The range
 * start_pfn : ha_end_pfn specifies the range that we have
 * currently hot added. We hot add in multiples of 128M
 * chunks; it is possible that we may not be able to bring
 * online all the pages in the region. The range
 * covered_start_pfn:covered_end_pfn defines the pages that can
 * be brought online.
 */

struct hv_hotadd_state {
	struct list_head list;
	unsigned long start_pfn;
	unsigned long covered_start_pfn;
	unsigned long covered_end_pfn;
	unsigned long ha_end_pfn;
	unsigned long end_pfn;
	/*
	 * A list of gaps.
	 */
	struct list_head gap_list;
};

/* One gap inside a hot-add region, covering [start_pfn, end_pfn). */
struct hv_hotadd_gap {
	struct list_head list;
	unsigned long start_pfn;
	unsigned long end_pfn;
};

/* Work item carrying the number of pages of a balloon-up request. */
struct balloon_state {
	__u32 num_pages;
	struct work_struct wrk;
};

/* Work item carrying the page range and region range of a hot-add request. */
struct hot_add_wrk {
	union dm_mem_page_range ha_page_range;
	union dm_mem_page_range ha_region_range;
	struct work_struct wrk;
};

static bool allow_hibernation;
static bool hot_add = true;
static bool do_hot_add;
/*
 * Delay reporting memory pressure by
 * the specified number of seconds.
 */
static uint pressure_report_delay = 45;

/*
 * The last time we posted a pressure report to host.
472 */ 473 static unsigned long last_post_time; 474 475 module_param(hot_add, bool, (S_IRUGO | S_IWUSR)); 476 MODULE_PARM_DESC(hot_add, "If set attempt memory hot_add"); 477 478 module_param(pressure_report_delay, uint, (S_IRUGO | S_IWUSR)); 479 MODULE_PARM_DESC(pressure_report_delay, "Delay in secs in reporting pressure"); 480 static atomic_t trans_id = ATOMIC_INIT(0); 481 482 static int dm_ring_size = 20 * 1024; 483 484 /* 485 * Driver specific state. 486 */ 487 488 enum hv_dm_state { 489 DM_INITIALIZING = 0, 490 DM_INITIALIZED, 491 DM_BALLOON_UP, 492 DM_BALLOON_DOWN, 493 DM_HOT_ADD, 494 DM_INIT_ERROR 495 }; 496 497 498 static __u8 recv_buffer[HV_HYP_PAGE_SIZE]; 499 static __u8 balloon_up_send_buffer[HV_HYP_PAGE_SIZE]; 500 #define PAGES_IN_2M (2 * 1024 * 1024 / PAGE_SIZE) 501 #define HA_CHUNK (128 * 1024 * 1024 / PAGE_SIZE) 502 503 struct hv_dynmem_device { 504 struct hv_device *dev; 505 enum hv_dm_state state; 506 struct completion host_event; 507 struct completion config_event; 508 509 /* 510 * Number of pages we have currently ballooned out. 511 */ 512 unsigned int num_pages_ballooned; 513 unsigned int num_pages_onlined; 514 unsigned int num_pages_added; 515 516 /* 517 * State to manage the ballooning (up) operation. 518 */ 519 struct balloon_state balloon_wrk; 520 521 /* 522 * State to execute the "hot-add" operation. 523 */ 524 struct hot_add_wrk ha_wrk; 525 526 /* 527 * This state tracks if the host has specified a hot-add 528 * region. 529 */ 530 bool host_specified_ha_region; 531 532 /* 533 * State to synchronize hot-add. 534 */ 535 struct completion ol_waitevent; 536 /* 537 * This thread handles hot-add 538 * requests from the host as well as notifying 539 * the host with regards to memory pressure in 540 * the guest. 541 */ 542 struct task_struct *thread; 543 544 /* 545 * Protects ha_region_list, num_pages_onlined counter and individual 546 * regions from ha_region_list. 547 */ 548 spinlock_t ha_lock; 549 550 /* 551 * A list of hot-add regions. 
552 */ 553 struct list_head ha_region_list; 554 555 /* 556 * We start with the highest version we can support 557 * and downgrade based on the host; we save here the 558 * next version to try. 559 */ 560 __u32 next_version; 561 562 /* 563 * The negotiated version agreed by host. 564 */ 565 __u32 version; 566 }; 567 568 static struct hv_dynmem_device dm_device; 569 570 static void post_status(struct hv_dynmem_device *dm); 571 572 #ifdef CONFIG_MEMORY_HOTPLUG 573 static inline bool has_pfn_is_backed(struct hv_hotadd_state *has, 574 unsigned long pfn) 575 { 576 struct hv_hotadd_gap *gap; 577 578 /* The page is not backed. */ 579 if ((pfn < has->covered_start_pfn) || (pfn >= has->covered_end_pfn)) 580 return false; 581 582 /* Check for gaps. */ 583 list_for_each_entry(gap, &has->gap_list, list) { 584 if ((pfn >= gap->start_pfn) && (pfn < gap->end_pfn)) 585 return false; 586 } 587 588 return true; 589 } 590 591 static unsigned long hv_page_offline_check(unsigned long start_pfn, 592 unsigned long nr_pages) 593 { 594 unsigned long pfn = start_pfn, count = 0; 595 struct hv_hotadd_state *has; 596 bool found; 597 598 while (pfn < start_pfn + nr_pages) { 599 /* 600 * Search for HAS which covers the pfn and when we find one 601 * count how many consequitive PFNs are covered. 602 */ 603 found = false; 604 list_for_each_entry(has, &dm_device.ha_region_list, list) { 605 while ((pfn >= has->start_pfn) && 606 (pfn < has->end_pfn) && 607 (pfn < start_pfn + nr_pages)) { 608 found = true; 609 if (has_pfn_is_backed(has, pfn)) 610 count++; 611 pfn++; 612 } 613 } 614 615 /* 616 * This PFN is not in any HAS (e.g. we're offlining a region 617 * which was present at boot), no need to account for it. Go 618 * to the next one. 
619 */ 620 if (!found) 621 pfn++; 622 } 623 624 return count; 625 } 626 627 static int hv_memory_notifier(struct notifier_block *nb, unsigned long val, 628 void *v) 629 { 630 struct memory_notify *mem = (struct memory_notify *)v; 631 unsigned long flags, pfn_count; 632 633 switch (val) { 634 case MEM_ONLINE: 635 case MEM_CANCEL_ONLINE: 636 complete(&dm_device.ol_waitevent); 637 break; 638 639 case MEM_OFFLINE: 640 spin_lock_irqsave(&dm_device.ha_lock, flags); 641 pfn_count = hv_page_offline_check(mem->start_pfn, 642 mem->nr_pages); 643 if (pfn_count <= dm_device.num_pages_onlined) { 644 dm_device.num_pages_onlined -= pfn_count; 645 } else { 646 /* 647 * We're offlining more pages than we managed to online. 648 * This is unexpected. In any case don't let 649 * num_pages_onlined wrap around zero. 650 */ 651 WARN_ON_ONCE(1); 652 dm_device.num_pages_onlined = 0; 653 } 654 spin_unlock_irqrestore(&dm_device.ha_lock, flags); 655 break; 656 case MEM_GOING_ONLINE: 657 case MEM_GOING_OFFLINE: 658 case MEM_CANCEL_OFFLINE: 659 break; 660 } 661 return NOTIFY_OK; 662 } 663 664 static struct notifier_block hv_memory_nb = { 665 .notifier_call = hv_memory_notifier, 666 .priority = 0 667 }; 668 669 /* Check if the particular page is backed and can be onlined and online it. */ 670 static void hv_page_online_one(struct hv_hotadd_state *has, struct page *pg) 671 { 672 if (!has_pfn_is_backed(has, page_to_pfn(pg))) { 673 if (!PageOffline(pg)) 674 __SetPageOffline(pg); 675 return; 676 } 677 if (PageOffline(pg)) 678 __ClearPageOffline(pg); 679 680 /* This frame is currently backed; online the page. 
*/ 681 generic_online_page(pg, 0); 682 683 lockdep_assert_held(&dm_device.ha_lock); 684 dm_device.num_pages_onlined++; 685 } 686 687 static void hv_bring_pgs_online(struct hv_hotadd_state *has, 688 unsigned long start_pfn, unsigned long size) 689 { 690 int i; 691 692 pr_debug("Online %lu pages starting at pfn 0x%lx\n", size, start_pfn); 693 for (i = 0; i < size; i++) 694 hv_page_online_one(has, pfn_to_page(start_pfn + i)); 695 } 696 697 static void hv_mem_hot_add(unsigned long start, unsigned long size, 698 unsigned long pfn_count, 699 struct hv_hotadd_state *has) 700 { 701 int ret = 0; 702 int i, nid; 703 unsigned long start_pfn; 704 unsigned long processed_pfn; 705 unsigned long total_pfn = pfn_count; 706 unsigned long flags; 707 708 for (i = 0; i < (size/HA_CHUNK); i++) { 709 start_pfn = start + (i * HA_CHUNK); 710 711 spin_lock_irqsave(&dm_device.ha_lock, flags); 712 has->ha_end_pfn += HA_CHUNK; 713 714 if (total_pfn > HA_CHUNK) { 715 processed_pfn = HA_CHUNK; 716 total_pfn -= HA_CHUNK; 717 } else { 718 processed_pfn = total_pfn; 719 total_pfn = 0; 720 } 721 722 has->covered_end_pfn += processed_pfn; 723 spin_unlock_irqrestore(&dm_device.ha_lock, flags); 724 725 reinit_completion(&dm_device.ol_waitevent); 726 727 nid = memory_add_physaddr_to_nid(PFN_PHYS(start_pfn)); 728 ret = add_memory(nid, PFN_PHYS((start_pfn)), 729 (HA_CHUNK << PAGE_SHIFT), MEMHP_MERGE_RESOURCE); 730 731 if (ret) { 732 pr_err("hot_add memory failed error is %d\n", ret); 733 if (ret == -EEXIST) { 734 /* 735 * This error indicates that the error 736 * is not a transient failure. This is the 737 * case where the guest's physical address map 738 * precludes hot adding memory. Stop all further 739 * memory hot-add. 740 */ 741 do_hot_add = false; 742 } 743 spin_lock_irqsave(&dm_device.ha_lock, flags); 744 has->ha_end_pfn -= HA_CHUNK; 745 has->covered_end_pfn -= processed_pfn; 746 spin_unlock_irqrestore(&dm_device.ha_lock, flags); 747 break; 748 } 749 750 /* 751 * Wait for memory to get onlined. 
If the kernel onlined the 752 * memory when adding it, this will return directly. Otherwise, 753 * it will wait for user space to online the memory. This helps 754 * to avoid adding memory faster than it is getting onlined. As 755 * adding succeeded, it is ok to proceed even if the memory was 756 * not onlined in time. 757 */ 758 wait_for_completion_timeout(&dm_device.ol_waitevent, 5 * HZ); 759 post_status(&dm_device); 760 } 761 } 762 763 static void hv_online_page(struct page *pg, unsigned int order) 764 { 765 struct hv_hotadd_state *has; 766 unsigned long flags; 767 unsigned long pfn = page_to_pfn(pg); 768 769 spin_lock_irqsave(&dm_device.ha_lock, flags); 770 list_for_each_entry(has, &dm_device.ha_region_list, list) { 771 /* The page belongs to a different HAS. */ 772 if ((pfn < has->start_pfn) || 773 (pfn + (1UL << order) > has->end_pfn)) 774 continue; 775 776 hv_bring_pgs_online(has, pfn, 1UL << order); 777 break; 778 } 779 spin_unlock_irqrestore(&dm_device.ha_lock, flags); 780 } 781 782 static int pfn_covered(unsigned long start_pfn, unsigned long pfn_cnt) 783 { 784 struct hv_hotadd_state *has; 785 struct hv_hotadd_gap *gap; 786 unsigned long residual, new_inc; 787 int ret = 0; 788 unsigned long flags; 789 790 spin_lock_irqsave(&dm_device.ha_lock, flags); 791 list_for_each_entry(has, &dm_device.ha_region_list, list) { 792 /* 793 * If the pfn range we are dealing with is not in the current 794 * "hot add block", move on. 795 */ 796 if (start_pfn < has->start_pfn || start_pfn >= has->end_pfn) 797 continue; 798 799 /* 800 * If the current start pfn is not where the covered_end 801 * is, create a gap and update covered_end_pfn. 
802 */ 803 if (has->covered_end_pfn != start_pfn) { 804 gap = kzalloc(sizeof(struct hv_hotadd_gap), GFP_ATOMIC); 805 if (!gap) { 806 ret = -ENOMEM; 807 break; 808 } 809 810 INIT_LIST_HEAD(&gap->list); 811 gap->start_pfn = has->covered_end_pfn; 812 gap->end_pfn = start_pfn; 813 list_add_tail(&gap->list, &has->gap_list); 814 815 has->covered_end_pfn = start_pfn; 816 } 817 818 /* 819 * If the current hot add-request extends beyond 820 * our current limit; extend it. 821 */ 822 if ((start_pfn + pfn_cnt) > has->end_pfn) { 823 residual = (start_pfn + pfn_cnt - has->end_pfn); 824 /* 825 * Extend the region by multiples of HA_CHUNK. 826 */ 827 new_inc = (residual / HA_CHUNK) * HA_CHUNK; 828 if (residual % HA_CHUNK) 829 new_inc += HA_CHUNK; 830 831 has->end_pfn += new_inc; 832 } 833 834 ret = 1; 835 break; 836 } 837 spin_unlock_irqrestore(&dm_device.ha_lock, flags); 838 839 return ret; 840 } 841 842 static unsigned long handle_pg_range(unsigned long pg_start, 843 unsigned long pg_count) 844 { 845 unsigned long start_pfn = pg_start; 846 unsigned long pfn_cnt = pg_count; 847 unsigned long size; 848 struct hv_hotadd_state *has; 849 unsigned long pgs_ol = 0; 850 unsigned long old_covered_state; 851 unsigned long res = 0, flags; 852 853 pr_debug("Hot adding %lu pages starting at pfn 0x%lx.\n", pg_count, 854 pg_start); 855 856 spin_lock_irqsave(&dm_device.ha_lock, flags); 857 list_for_each_entry(has, &dm_device.ha_region_list, list) { 858 /* 859 * If the pfn range we are dealing with is not in the current 860 * "hot add block", move on. 861 */ 862 if (start_pfn < has->start_pfn || start_pfn >= has->end_pfn) 863 continue; 864 865 old_covered_state = has->covered_end_pfn; 866 867 if (start_pfn < has->ha_end_pfn) { 868 /* 869 * This is the case where we are backing pages 870 * in an already hot added region. Bring 871 * these pages online first. 
872 */ 873 pgs_ol = has->ha_end_pfn - start_pfn; 874 if (pgs_ol > pfn_cnt) 875 pgs_ol = pfn_cnt; 876 877 has->covered_end_pfn += pgs_ol; 878 pfn_cnt -= pgs_ol; 879 /* 880 * Check if the corresponding memory block is already 881 * online. It is possible to observe struct pages still 882 * being uninitialized here so check section instead. 883 * In case the section is online we need to bring the 884 * rest of pfns (which were not backed previously) 885 * online too. 886 */ 887 if (start_pfn > has->start_pfn && 888 online_section_nr(pfn_to_section_nr(start_pfn))) 889 hv_bring_pgs_online(has, start_pfn, pgs_ol); 890 891 } 892 893 if ((has->ha_end_pfn < has->end_pfn) && (pfn_cnt > 0)) { 894 /* 895 * We have some residual hot add range 896 * that needs to be hot added; hot add 897 * it now. Hot add a multiple of 898 * of HA_CHUNK that fully covers the pages 899 * we have. 900 */ 901 size = (has->end_pfn - has->ha_end_pfn); 902 if (pfn_cnt <= size) { 903 size = ((pfn_cnt / HA_CHUNK) * HA_CHUNK); 904 if (pfn_cnt % HA_CHUNK) 905 size += HA_CHUNK; 906 } else { 907 pfn_cnt = size; 908 } 909 spin_unlock_irqrestore(&dm_device.ha_lock, flags); 910 hv_mem_hot_add(has->ha_end_pfn, size, pfn_cnt, has); 911 spin_lock_irqsave(&dm_device.ha_lock, flags); 912 } 913 /* 914 * If we managed to online any pages that were given to us, 915 * we declare success. 
916 */ 917 res = has->covered_end_pfn - old_covered_state; 918 break; 919 } 920 spin_unlock_irqrestore(&dm_device.ha_lock, flags); 921 922 return res; 923 } 924 925 static unsigned long process_hot_add(unsigned long pg_start, 926 unsigned long pfn_cnt, 927 unsigned long rg_start, 928 unsigned long rg_size) 929 { 930 struct hv_hotadd_state *ha_region = NULL; 931 int covered; 932 unsigned long flags; 933 934 if (pfn_cnt == 0) 935 return 0; 936 937 if (!dm_device.host_specified_ha_region) { 938 covered = pfn_covered(pg_start, pfn_cnt); 939 if (covered < 0) 940 return 0; 941 942 if (covered) 943 goto do_pg_range; 944 } 945 946 /* 947 * If the host has specified a hot-add range; deal with it first. 948 */ 949 950 if (rg_size != 0) { 951 ha_region = kzalloc(sizeof(struct hv_hotadd_state), GFP_KERNEL); 952 if (!ha_region) 953 return 0; 954 955 INIT_LIST_HEAD(&ha_region->list); 956 INIT_LIST_HEAD(&ha_region->gap_list); 957 958 ha_region->start_pfn = rg_start; 959 ha_region->ha_end_pfn = rg_start; 960 ha_region->covered_start_pfn = pg_start; 961 ha_region->covered_end_pfn = pg_start; 962 ha_region->end_pfn = rg_start + rg_size; 963 964 spin_lock_irqsave(&dm_device.ha_lock, flags); 965 list_add_tail(&ha_region->list, &dm_device.ha_region_list); 966 spin_unlock_irqrestore(&dm_device.ha_lock, flags); 967 } 968 969 do_pg_range: 970 /* 971 * Process the page range specified; bringing them 972 * online if possible. 
973 */ 974 return handle_pg_range(pg_start, pfn_cnt); 975 } 976 977 #endif 978 979 static void hot_add_req(struct work_struct *dummy) 980 { 981 struct dm_hot_add_response resp; 982 #ifdef CONFIG_MEMORY_HOTPLUG 983 unsigned long pg_start, pfn_cnt; 984 unsigned long rg_start, rg_sz; 985 #endif 986 struct hv_dynmem_device *dm = &dm_device; 987 988 memset(&resp, 0, sizeof(struct dm_hot_add_response)); 989 resp.hdr.type = DM_MEM_HOT_ADD_RESPONSE; 990 resp.hdr.size = sizeof(struct dm_hot_add_response); 991 992 #ifdef CONFIG_MEMORY_HOTPLUG 993 pg_start = dm->ha_wrk.ha_page_range.finfo.start_page; 994 pfn_cnt = dm->ha_wrk.ha_page_range.finfo.page_cnt; 995 996 rg_start = dm->ha_wrk.ha_region_range.finfo.start_page; 997 rg_sz = dm->ha_wrk.ha_region_range.finfo.page_cnt; 998 999 if ((rg_start == 0) && (!dm->host_specified_ha_region)) { 1000 unsigned long region_size; 1001 unsigned long region_start; 1002 1003 /* 1004 * The host has not specified the hot-add region. 1005 * Based on the hot-add page range being specified, 1006 * compute a hot-add region that can cover the pages 1007 * that need to be hot-added while ensuring the alignment 1008 * and size requirements of Linux as it relates to hot-add. 1009 */ 1010 region_start = pg_start; 1011 region_size = (pfn_cnt / HA_CHUNK) * HA_CHUNK; 1012 if (pfn_cnt % HA_CHUNK) 1013 region_size += HA_CHUNK; 1014 1015 region_start = (pg_start / HA_CHUNK) * HA_CHUNK; 1016 1017 rg_start = region_start; 1018 rg_sz = region_size; 1019 } 1020 1021 if (do_hot_add) 1022 resp.page_count = process_hot_add(pg_start, pfn_cnt, 1023 rg_start, rg_sz); 1024 1025 dm->num_pages_added += resp.page_count; 1026 #endif 1027 /* 1028 * The result field of the response structure has the 1029 * following semantics: 1030 * 1031 * 1. If all or some pages hot-added: Guest should return success. 1032 * 1033 * 2. If no pages could be hot-added: 1034 * 1035 * If the guest returns success, then the host 1036 * will not attempt any further hot-add operations. 
This 1037 * signifies a permanent failure. 1038 * 1039 * If the guest returns failure, then this failure will be 1040 * treated as a transient failure and the host may retry the 1041 * hot-add operation after some delay. 1042 */ 1043 if (resp.page_count > 0) 1044 resp.result = 1; 1045 else if (!do_hot_add) 1046 resp.result = 1; 1047 else 1048 resp.result = 0; 1049 1050 if (!do_hot_add || resp.page_count == 0) { 1051 if (!allow_hibernation) 1052 pr_err("Memory hot add failed\n"); 1053 else 1054 pr_info("Ignore hot-add request!\n"); 1055 } 1056 1057 dm->state = DM_INITIALIZED; 1058 resp.hdr.trans_id = atomic_inc_return(&trans_id); 1059 vmbus_sendpacket(dm->dev->channel, &resp, 1060 sizeof(struct dm_hot_add_response), 1061 (unsigned long)NULL, 1062 VM_PKT_DATA_INBAND, 0); 1063 } 1064 1065 static void process_info(struct hv_dynmem_device *dm, struct dm_info_msg *msg) 1066 { 1067 struct dm_info_header *info_hdr; 1068 1069 info_hdr = (struct dm_info_header *)msg->info; 1070 1071 switch (info_hdr->type) { 1072 case INFO_TYPE_MAX_PAGE_CNT: 1073 if (info_hdr->data_size == sizeof(__u64)) { 1074 __u64 *max_page_count = (__u64 *)&info_hdr[1]; 1075 1076 pr_info("Max. 
dynamic memory size: %llu MB\n", 1077 (*max_page_count) >> (20 - HV_HYP_PAGE_SHIFT)); 1078 } 1079 1080 break; 1081 default: 1082 pr_warn("Received Unknown type: %d\n", info_hdr->type); 1083 } 1084 } 1085 1086 static unsigned long compute_balloon_floor(void) 1087 { 1088 unsigned long min_pages; 1089 unsigned long nr_pages = totalram_pages(); 1090 #define MB2PAGES(mb) ((mb) << (20 - PAGE_SHIFT)) 1091 /* Simple continuous piecewiese linear function: 1092 * max MiB -> min MiB gradient 1093 * 0 0 1094 * 16 16 1095 * 32 24 1096 * 128 72 (1/2) 1097 * 512 168 (1/4) 1098 * 2048 360 (1/8) 1099 * 8192 744 (1/16) 1100 * 32768 1512 (1/32) 1101 */ 1102 if (nr_pages < MB2PAGES(128)) 1103 min_pages = MB2PAGES(8) + (nr_pages >> 1); 1104 else if (nr_pages < MB2PAGES(512)) 1105 min_pages = MB2PAGES(40) + (nr_pages >> 2); 1106 else if (nr_pages < MB2PAGES(2048)) 1107 min_pages = MB2PAGES(104) + (nr_pages >> 3); 1108 else if (nr_pages < MB2PAGES(8192)) 1109 min_pages = MB2PAGES(232) + (nr_pages >> 4); 1110 else 1111 min_pages = MB2PAGES(488) + (nr_pages >> 5); 1112 #undef MB2PAGES 1113 return min_pages; 1114 } 1115 1116 /* 1117 * Post our status as it relates memory pressure to the 1118 * host. Host expects the guests to post this status 1119 * periodically at 1 second intervals. 1120 * 1121 * The metrics specified in this protocol are very Windows 1122 * specific and so we cook up numbers here to convey our memory 1123 * pressure. 
 */

static void post_status(struct hv_dynmem_device *dm)
{
	struct dm_status status;
	unsigned long now = jiffies;
	unsigned long last_post = last_post_time;

	/*
	 * While a reporting delay is pending, each call consumes one unit
	 * of it and sends nothing.
	 */
	if (pressure_report_delay > 0) {
		--pressure_report_delay;
		return;
	}

	/* Post only once more than HZ jiffies have passed since the last post. */
	if (!time_after(now, (last_post_time + HZ)))
		return;

	memset(&status, 0, sizeof(struct dm_status));
	status.hdr.type = DM_STATUS_REPORT;
	status.hdr.size = sizeof(struct dm_status);
	status.hdr.trans_id = atomic_inc_return(&trans_id);

	/*
	 * The host expects the guest to report free and committed memory.
	 * Furthermore, the host expects the pressure information to include
	 * the ballooned out pages. For a given amount of memory that we are
	 * managing we need to compute a floor below which we should not
	 * balloon. Compute this and add it to the pressure report.
	 * We also need to report all offline pages (num_pages_added -
	 * num_pages_onlined) as committed to the host, otherwise it can try
	 * asking us to balloon them out.
	 */
	status.num_avail = si_mem_available();
	status.num_committed = vm_memory_committed() +
		dm->num_pages_ballooned +
		(dm->num_pages_added > dm->num_pages_onlined ?
		 dm->num_pages_added - dm->num_pages_onlined : 0) +
		compute_balloon_floor();

	trace_balloon_status(status.num_avail, status.num_committed,
			     vm_memory_committed(), dm->num_pages_ballooned,
			     dm->num_pages_added, dm->num_pages_onlined);
	/*
	 * If our transaction ID is no longer current, just don't
	 * send the status. This can happen if we were interrupted
	 * after we picked our transaction ID.
	 */
	if (status.hdr.trans_id != atomic_read(&trans_id))
		return;

	/*
	 * If the last post time that we sampled has changed,
	 * we have raced, don't post the status.
	 */
	if (last_post != last_post_time)
		return;

	last_post_time = jiffies;
	vmbus_sendpacket(dm->dev->channel, &status,
				sizeof(struct dm_status),
				(unsigned long)NULL,
				VM_PKT_DATA_INBAND, 0);

}

/*
 * Give the pages described by the single range @range_array points to
 * back to the kernel: each page has its offline flag cleared, is freed,
 * and is added back to the managed page count, while
 * dm->num_pages_ballooned is decremented once per page.
 */
static void free_balloon_pages(struct hv_dynmem_device *dm,
			 union dm_mem_page_range *range_array)
{
	int num_pages = range_array->finfo.page_cnt;
	__u64 start_frame = range_array->finfo.start_page;
	struct page *pg;
	int i;

	for (i = 0; i < num_pages; i++) {
		pg = pfn_to_page(i + start_frame);
		__ClearPageOffline(pg);
		__free_page(pg);
		dm->num_pages_ballooned--;
		adjust_managed_page_count(pg, 1);
	}
}



/*
 * Allocate up to @num_pages pages in units of @alloc_unit pages and
 * append one range per allocation to @bl_resp, growing bl_resp->hdr.size
 * accordingly.
 *
 * Stops early when another range would push the response past
 * HV_HYP_PAGE_SIZE or when an allocation fails. Returns the number of
 * pages allocated by this call.
 */
static unsigned int alloc_balloon_pages(struct hv_dynmem_device *dm,
					unsigned int num_pages,
					struct dm_balloon_response *bl_resp,
					int alloc_unit)
{
	unsigned int i, j;
	struct page *pg;

	for (i = 0; i < num_pages / alloc_unit; i++) {
		if (bl_resp->hdr.size + sizeof(union dm_mem_page_range) >
			HV_HYP_PAGE_SIZE)
			return i * alloc_unit;

		/*
		 * We execute this code in a thread context. Furthermore,
		 * we don't want the kernel to try too hard.
		 */
		pg = alloc_pages(GFP_HIGHUSER | __GFP_NORETRY |
				__GFP_NOMEMALLOC | __GFP_NOWARN,
				get_order(alloc_unit << PAGE_SHIFT));

		if (!pg)
			return i * alloc_unit;

		dm->num_pages_ballooned += alloc_unit;

		/*
		 * If we allocated 2M pages; split them so we
		 * can free them in any order we get.
		 */

		if (alloc_unit != 1)
			split_page(pg, get_order(alloc_unit << PAGE_SHIFT));

		/* mark all pages offline */
		for (j = 0; j < alloc_unit; j++) {
			__SetPageOffline(pg + j);
			adjust_managed_page_count(pg + j, -1);
		}

		bl_resp->range_count++;
		bl_resp->range_array[i].finfo.start_page =
			page_to_pfn(pg);
		bl_resp->range_array[i].finfo.page_cnt = alloc_unit;
		bl_resp->hdr.size += sizeof(union dm_mem_page_range);

	}

	return i * alloc_unit;
}

static void balloon_up(struct work_struct *dummy)
{
	unsigned int num_pages = dm_device.balloon_wrk.num_pages;
	unsigned int num_ballooned = 0;
	struct dm_balloon_response *bl_resp;
	int alloc_unit;
	int ret;
	bool done = false;
	int i;
	long avail_pages;
	unsigned long floor;

	/*
	 * We will attempt 2M allocations. However, if we fail to
	 * allocate 2M chunks, we will go back to PAGE_SIZE allocations.
	 */
	alloc_unit = PAGES_IN_2M;

	avail_pages = si_mem_available();
	floor = compute_balloon_floor();

	/* Refuse to balloon below the floor. */
	if (avail_pages < num_pages || avail_pages - num_pages < floor) {
		pr_info("Balloon request will be partially fulfilled. %s\n",
			avail_pages < num_pages ? "Not enough memory." :
			"Balloon floor reached.");

		num_pages = avail_pages > floor ?
(avail_pages - floor) : 0; 1286 } 1287 1288 while (!done) { 1289 memset(balloon_up_send_buffer, 0, HV_HYP_PAGE_SIZE); 1290 bl_resp = (struct dm_balloon_response *)balloon_up_send_buffer; 1291 bl_resp->hdr.type = DM_BALLOON_RESPONSE; 1292 bl_resp->hdr.size = sizeof(struct dm_balloon_response); 1293 bl_resp->more_pages = 1; 1294 1295 num_pages -= num_ballooned; 1296 num_ballooned = alloc_balloon_pages(&dm_device, num_pages, 1297 bl_resp, alloc_unit); 1298 1299 if (alloc_unit != 1 && num_ballooned == 0) { 1300 alloc_unit = 1; 1301 continue; 1302 } 1303 1304 if (num_ballooned == 0 || num_ballooned == num_pages) { 1305 pr_debug("Ballooned %u out of %u requested pages.\n", 1306 num_pages, dm_device.balloon_wrk.num_pages); 1307 1308 bl_resp->more_pages = 0; 1309 done = true; 1310 dm_device.state = DM_INITIALIZED; 1311 } 1312 1313 /* 1314 * We are pushing a lot of data through the channel; 1315 * deal with transient failures caused because of the 1316 * lack of space in the ring buffer. 1317 */ 1318 1319 do { 1320 bl_resp->hdr.trans_id = atomic_inc_return(&trans_id); 1321 ret = vmbus_sendpacket(dm_device.dev->channel, 1322 bl_resp, 1323 bl_resp->hdr.size, 1324 (unsigned long)NULL, 1325 VM_PKT_DATA_INBAND, 0); 1326 1327 if (ret == -EAGAIN) 1328 msleep(20); 1329 post_status(&dm_device); 1330 } while (ret == -EAGAIN); 1331 1332 if (ret) { 1333 /* 1334 * Free up the memory we allocatted. 
1335 */ 1336 pr_err("Balloon response failed\n"); 1337 1338 for (i = 0; i < bl_resp->range_count; i++) 1339 free_balloon_pages(&dm_device, 1340 &bl_resp->range_array[i]); 1341 1342 done = true; 1343 } 1344 } 1345 1346 } 1347 1348 static void balloon_down(struct hv_dynmem_device *dm, 1349 struct dm_unballoon_request *req) 1350 { 1351 union dm_mem_page_range *range_array = req->range_array; 1352 int range_count = req->range_count; 1353 struct dm_unballoon_response resp; 1354 int i; 1355 unsigned int prev_pages_ballooned = dm->num_pages_ballooned; 1356 1357 for (i = 0; i < range_count; i++) { 1358 free_balloon_pages(dm, &range_array[i]); 1359 complete(&dm_device.config_event); 1360 } 1361 1362 pr_debug("Freed %u ballooned pages.\n", 1363 prev_pages_ballooned - dm->num_pages_ballooned); 1364 1365 if (req->more_pages == 1) 1366 return; 1367 1368 memset(&resp, 0, sizeof(struct dm_unballoon_response)); 1369 resp.hdr.type = DM_UNBALLOON_RESPONSE; 1370 resp.hdr.trans_id = atomic_inc_return(&trans_id); 1371 resp.hdr.size = sizeof(struct dm_unballoon_response); 1372 1373 vmbus_sendpacket(dm_device.dev->channel, &resp, 1374 sizeof(struct dm_unballoon_response), 1375 (unsigned long)NULL, 1376 VM_PKT_DATA_INBAND, 0); 1377 1378 dm->state = DM_INITIALIZED; 1379 } 1380 1381 static void balloon_onchannelcallback(void *context); 1382 1383 static int dm_thread_func(void *dm_dev) 1384 { 1385 struct hv_dynmem_device *dm = dm_dev; 1386 1387 while (!kthread_should_stop()) { 1388 wait_for_completion_interruptible_timeout( 1389 &dm_device.config_event, 1*HZ); 1390 /* 1391 * The host expects us to post information on the memory 1392 * pressure every second. 
1393 */ 1394 reinit_completion(&dm_device.config_event); 1395 post_status(dm); 1396 } 1397 1398 return 0; 1399 } 1400 1401 1402 static void version_resp(struct hv_dynmem_device *dm, 1403 struct dm_version_response *vresp) 1404 { 1405 struct dm_version_request version_req; 1406 int ret; 1407 1408 if (vresp->is_accepted) { 1409 /* 1410 * We are done; wakeup the 1411 * context waiting for version 1412 * negotiation. 1413 */ 1414 complete(&dm->host_event); 1415 return; 1416 } 1417 /* 1418 * If there are more versions to try, continue 1419 * with negotiations; if not 1420 * shutdown the service since we are not able 1421 * to negotiate a suitable version number 1422 * with the host. 1423 */ 1424 if (dm->next_version == 0) 1425 goto version_error; 1426 1427 memset(&version_req, 0, sizeof(struct dm_version_request)); 1428 version_req.hdr.type = DM_VERSION_REQUEST; 1429 version_req.hdr.size = sizeof(struct dm_version_request); 1430 version_req.hdr.trans_id = atomic_inc_return(&trans_id); 1431 version_req.version.version = dm->next_version; 1432 dm->version = version_req.version.version; 1433 1434 /* 1435 * Set the next version to try in case current version fails. 1436 * Win7 protocol ought to be the last one to try. 
1437 */ 1438 switch (version_req.version.version) { 1439 case DYNMEM_PROTOCOL_VERSION_WIN8: 1440 dm->next_version = DYNMEM_PROTOCOL_VERSION_WIN7; 1441 version_req.is_last_attempt = 0; 1442 break; 1443 default: 1444 dm->next_version = 0; 1445 version_req.is_last_attempt = 1; 1446 } 1447 1448 ret = vmbus_sendpacket(dm->dev->channel, &version_req, 1449 sizeof(struct dm_version_request), 1450 (unsigned long)NULL, 1451 VM_PKT_DATA_INBAND, 0); 1452 1453 if (ret) 1454 goto version_error; 1455 1456 return; 1457 1458 version_error: 1459 dm->state = DM_INIT_ERROR; 1460 complete(&dm->host_event); 1461 } 1462 1463 static void cap_resp(struct hv_dynmem_device *dm, 1464 struct dm_capabilities_resp_msg *cap_resp) 1465 { 1466 if (!cap_resp->is_accepted) { 1467 pr_err("Capabilities not accepted by host\n"); 1468 dm->state = DM_INIT_ERROR; 1469 } 1470 complete(&dm->host_event); 1471 } 1472 1473 static void balloon_onchannelcallback(void *context) 1474 { 1475 struct hv_device *dev = context; 1476 u32 recvlen; 1477 u64 requestid; 1478 struct dm_message *dm_msg; 1479 struct dm_header *dm_hdr; 1480 struct hv_dynmem_device *dm = hv_get_drvdata(dev); 1481 struct dm_balloon *bal_msg; 1482 struct dm_hot_add *ha_msg; 1483 union dm_mem_page_range *ha_pg_range; 1484 union dm_mem_page_range *ha_region; 1485 1486 memset(recv_buffer, 0, sizeof(recv_buffer)); 1487 vmbus_recvpacket(dev->channel, recv_buffer, 1488 HV_HYP_PAGE_SIZE, &recvlen, &requestid); 1489 1490 if (recvlen > 0) { 1491 dm_msg = (struct dm_message *)recv_buffer; 1492 dm_hdr = &dm_msg->hdr; 1493 1494 switch (dm_hdr->type) { 1495 case DM_VERSION_RESPONSE: 1496 version_resp(dm, 1497 (struct dm_version_response *)dm_msg); 1498 break; 1499 1500 case DM_CAPABILITIES_RESPONSE: 1501 cap_resp(dm, 1502 (struct dm_capabilities_resp_msg *)dm_msg); 1503 break; 1504 1505 case DM_BALLOON_REQUEST: 1506 if (allow_hibernation) { 1507 pr_info("Ignore balloon-up request!\n"); 1508 break; 1509 } 1510 1511 if (dm->state == DM_BALLOON_UP) 1512 
pr_warn("Currently ballooning\n"); 1513 bal_msg = (struct dm_balloon *)recv_buffer; 1514 dm->state = DM_BALLOON_UP; 1515 dm_device.balloon_wrk.num_pages = bal_msg->num_pages; 1516 schedule_work(&dm_device.balloon_wrk.wrk); 1517 break; 1518 1519 case DM_UNBALLOON_REQUEST: 1520 if (allow_hibernation) { 1521 pr_info("Ignore balloon-down request!\n"); 1522 break; 1523 } 1524 1525 dm->state = DM_BALLOON_DOWN; 1526 balloon_down(dm, 1527 (struct dm_unballoon_request *)recv_buffer); 1528 break; 1529 1530 case DM_MEM_HOT_ADD_REQUEST: 1531 if (dm->state == DM_HOT_ADD) 1532 pr_warn("Currently hot-adding\n"); 1533 dm->state = DM_HOT_ADD; 1534 ha_msg = (struct dm_hot_add *)recv_buffer; 1535 if (ha_msg->hdr.size == sizeof(struct dm_hot_add)) { 1536 /* 1537 * This is a normal hot-add request specifying 1538 * hot-add memory. 1539 */ 1540 dm->host_specified_ha_region = false; 1541 ha_pg_range = &ha_msg->range; 1542 dm->ha_wrk.ha_page_range = *ha_pg_range; 1543 dm->ha_wrk.ha_region_range.page_range = 0; 1544 } else { 1545 /* 1546 * Host is specifying that we first hot-add 1547 * a region and then partially populate this 1548 * region. 
1549 */ 1550 dm->host_specified_ha_region = true; 1551 ha_pg_range = &ha_msg->range; 1552 ha_region = &ha_pg_range[1]; 1553 dm->ha_wrk.ha_page_range = *ha_pg_range; 1554 dm->ha_wrk.ha_region_range = *ha_region; 1555 } 1556 schedule_work(&dm_device.ha_wrk.wrk); 1557 break; 1558 1559 case DM_INFO_MESSAGE: 1560 process_info(dm, (struct dm_info_msg *)dm_msg); 1561 break; 1562 1563 default: 1564 pr_warn("Unhandled message: type: %d\n", dm_hdr->type); 1565 1566 } 1567 } 1568 1569 } 1570 1571 static int balloon_connect_vsp(struct hv_device *dev) 1572 { 1573 struct dm_version_request version_req; 1574 struct dm_capabilities cap_msg; 1575 unsigned long t; 1576 int ret; 1577 1578 ret = vmbus_open(dev->channel, dm_ring_size, dm_ring_size, NULL, 0, 1579 balloon_onchannelcallback, dev); 1580 if (ret) 1581 return ret; 1582 1583 /* 1584 * Initiate the hand shake with the host and negotiate 1585 * a version that the host can support. We start with the 1586 * highest version number and go down if the host cannot 1587 * support it. 1588 */ 1589 memset(&version_req, 0, sizeof(struct dm_version_request)); 1590 version_req.hdr.type = DM_VERSION_REQUEST; 1591 version_req.hdr.size = sizeof(struct dm_version_request); 1592 version_req.hdr.trans_id = atomic_inc_return(&trans_id); 1593 version_req.version.version = DYNMEM_PROTOCOL_VERSION_WIN10; 1594 version_req.is_last_attempt = 0; 1595 dm_device.version = version_req.version.version; 1596 1597 ret = vmbus_sendpacket(dev->channel, &version_req, 1598 sizeof(struct dm_version_request), 1599 (unsigned long)NULL, VM_PKT_DATA_INBAND, 0); 1600 if (ret) 1601 goto out; 1602 1603 t = wait_for_completion_timeout(&dm_device.host_event, 5*HZ); 1604 if (t == 0) { 1605 ret = -ETIMEDOUT; 1606 goto out; 1607 } 1608 1609 /* 1610 * If we could not negotiate a compatible version with the host 1611 * fail the probe function. 
1612 */ 1613 if (dm_device.state == DM_INIT_ERROR) { 1614 ret = -EPROTO; 1615 goto out; 1616 } 1617 1618 pr_info("Using Dynamic Memory protocol version %u.%u\n", 1619 DYNMEM_MAJOR_VERSION(dm_device.version), 1620 DYNMEM_MINOR_VERSION(dm_device.version)); 1621 1622 /* 1623 * Now submit our capabilities to the host. 1624 */ 1625 memset(&cap_msg, 0, sizeof(struct dm_capabilities)); 1626 cap_msg.hdr.type = DM_CAPABILITIES_REPORT; 1627 cap_msg.hdr.size = sizeof(struct dm_capabilities); 1628 cap_msg.hdr.trans_id = atomic_inc_return(&trans_id); 1629 1630 /* 1631 * When hibernation (i.e. virtual ACPI S4 state) is enabled, the host 1632 * currently still requires the bits to be set, so we have to add code 1633 * to fail the host's hot-add and balloon up/down requests, if any. 1634 */ 1635 cap_msg.caps.cap_bits.balloon = 1; 1636 cap_msg.caps.cap_bits.hot_add = 1; 1637 1638 /* 1639 * Specify our alignment requirements as it relates 1640 * memory hot-add. Specify 128MB alignment. 1641 */ 1642 cap_msg.caps.cap_bits.hot_add_alignment = 7; 1643 1644 /* 1645 * Currently the host does not use these 1646 * values and we set them to what is done in the 1647 * Windows driver. 1648 */ 1649 cap_msg.min_page_cnt = 0; 1650 cap_msg.max_page_number = -1; 1651 1652 ret = vmbus_sendpacket(dev->channel, &cap_msg, 1653 sizeof(struct dm_capabilities), 1654 (unsigned long)NULL, VM_PKT_DATA_INBAND, 0); 1655 if (ret) 1656 goto out; 1657 1658 t = wait_for_completion_timeout(&dm_device.host_event, 5*HZ); 1659 if (t == 0) { 1660 ret = -ETIMEDOUT; 1661 goto out; 1662 } 1663 1664 /* 1665 * If the host does not like our capabilities, 1666 * fail the probe function. 
1667 */ 1668 if (dm_device.state == DM_INIT_ERROR) { 1669 ret = -EPROTO; 1670 goto out; 1671 } 1672 1673 return 0; 1674 out: 1675 vmbus_close(dev->channel); 1676 return ret; 1677 } 1678 1679 static int balloon_probe(struct hv_device *dev, 1680 const struct hv_vmbus_device_id *dev_id) 1681 { 1682 int ret; 1683 1684 allow_hibernation = hv_is_hibernation_supported(); 1685 if (allow_hibernation) 1686 hot_add = false; 1687 1688 #ifdef CONFIG_MEMORY_HOTPLUG 1689 do_hot_add = hot_add; 1690 #else 1691 do_hot_add = false; 1692 #endif 1693 dm_device.dev = dev; 1694 dm_device.state = DM_INITIALIZING; 1695 dm_device.next_version = DYNMEM_PROTOCOL_VERSION_WIN8; 1696 init_completion(&dm_device.host_event); 1697 init_completion(&dm_device.config_event); 1698 INIT_LIST_HEAD(&dm_device.ha_region_list); 1699 spin_lock_init(&dm_device.ha_lock); 1700 INIT_WORK(&dm_device.balloon_wrk.wrk, balloon_up); 1701 INIT_WORK(&dm_device.ha_wrk.wrk, hot_add_req); 1702 dm_device.host_specified_ha_region = false; 1703 1704 #ifdef CONFIG_MEMORY_HOTPLUG 1705 set_online_page_callback(&hv_online_page); 1706 init_completion(&dm_device.ol_waitevent); 1707 register_memory_notifier(&hv_memory_nb); 1708 #endif 1709 1710 hv_set_drvdata(dev, &dm_device); 1711 1712 ret = balloon_connect_vsp(dev); 1713 if (ret != 0) 1714 return ret; 1715 1716 dm_device.state = DM_INITIALIZED; 1717 1718 dm_device.thread = 1719 kthread_run(dm_thread_func, &dm_device, "hv_balloon"); 1720 if (IS_ERR(dm_device.thread)) { 1721 ret = PTR_ERR(dm_device.thread); 1722 goto probe_error; 1723 } 1724 1725 return 0; 1726 1727 probe_error: 1728 dm_device.state = DM_INIT_ERROR; 1729 dm_device.thread = NULL; 1730 vmbus_close(dev->channel); 1731 #ifdef CONFIG_MEMORY_HOTPLUG 1732 unregister_memory_notifier(&hv_memory_nb); 1733 restore_online_page_callback(&hv_online_page); 1734 #endif 1735 return ret; 1736 } 1737 1738 static int balloon_remove(struct hv_device *dev) 1739 { 1740 struct hv_dynmem_device *dm = hv_get_drvdata(dev); 1741 struct 
hv_hotadd_state *has, *tmp; 1742 struct hv_hotadd_gap *gap, *tmp_gap; 1743 unsigned long flags; 1744 1745 if (dm->num_pages_ballooned != 0) 1746 pr_warn("Ballooned pages: %d\n", dm->num_pages_ballooned); 1747 1748 cancel_work_sync(&dm->balloon_wrk.wrk); 1749 cancel_work_sync(&dm->ha_wrk.wrk); 1750 1751 kthread_stop(dm->thread); 1752 vmbus_close(dev->channel); 1753 #ifdef CONFIG_MEMORY_HOTPLUG 1754 unregister_memory_notifier(&hv_memory_nb); 1755 restore_online_page_callback(&hv_online_page); 1756 #endif 1757 spin_lock_irqsave(&dm_device.ha_lock, flags); 1758 list_for_each_entry_safe(has, tmp, &dm->ha_region_list, list) { 1759 list_for_each_entry_safe(gap, tmp_gap, &has->gap_list, list) { 1760 list_del(&gap->list); 1761 kfree(gap); 1762 } 1763 list_del(&has->list); 1764 kfree(has); 1765 } 1766 spin_unlock_irqrestore(&dm_device.ha_lock, flags); 1767 1768 return 0; 1769 } 1770 1771 static int balloon_suspend(struct hv_device *hv_dev) 1772 { 1773 struct hv_dynmem_device *dm = hv_get_drvdata(hv_dev); 1774 1775 tasklet_disable(&hv_dev->channel->callback_event); 1776 1777 cancel_work_sync(&dm->balloon_wrk.wrk); 1778 cancel_work_sync(&dm->ha_wrk.wrk); 1779 1780 if (dm->thread) { 1781 kthread_stop(dm->thread); 1782 dm->thread = NULL; 1783 vmbus_close(hv_dev->channel); 1784 } 1785 1786 tasklet_enable(&hv_dev->channel->callback_event); 1787 1788 return 0; 1789 1790 } 1791 1792 static int balloon_resume(struct hv_device *dev) 1793 { 1794 int ret; 1795 1796 dm_device.state = DM_INITIALIZING; 1797 1798 ret = balloon_connect_vsp(dev); 1799 1800 if (ret != 0) 1801 goto out; 1802 1803 dm_device.thread = 1804 kthread_run(dm_thread_func, &dm_device, "hv_balloon"); 1805 if (IS_ERR(dm_device.thread)) { 1806 ret = PTR_ERR(dm_device.thread); 1807 dm_device.thread = NULL; 1808 goto close_channel; 1809 } 1810 1811 dm_device.state = DM_INITIALIZED; 1812 return 0; 1813 close_channel: 1814 vmbus_close(dev->channel); 1815 out: 1816 dm_device.state = DM_INIT_ERROR; 1817 #ifdef 
CONFIG_MEMORY_HOTPLUG 1818 unregister_memory_notifier(&hv_memory_nb); 1819 restore_online_page_callback(&hv_online_page); 1820 #endif 1821 return ret; 1822 } 1823 1824 static const struct hv_vmbus_device_id id_table[] = { 1825 /* Dynamic Memory Class ID */ 1826 /* 525074DC-8985-46e2-8057-A307DC18A502 */ 1827 { HV_DM_GUID, }, 1828 { }, 1829 }; 1830 1831 MODULE_DEVICE_TABLE(vmbus, id_table); 1832 1833 static struct hv_driver balloon_drv = { 1834 .name = "hv_balloon", 1835 .id_table = id_table, 1836 .probe = balloon_probe, 1837 .remove = balloon_remove, 1838 .suspend = balloon_suspend, 1839 .resume = balloon_resume, 1840 .driver = { 1841 .probe_type = PROBE_PREFER_ASYNCHRONOUS, 1842 }, 1843 }; 1844 1845 static int __init init_balloon_drv(void) 1846 { 1847 1848 return vmbus_driver_register(&balloon_drv); 1849 } 1850 1851 module_init(init_balloon_drv); 1852 1853 MODULE_DESCRIPTION("Hyper-V Balloon"); 1854 MODULE_LICENSE("GPL"); 1855