1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright (c) 2012, Microsoft Corporation. 4 * 5 * Author: 6 * K. Y. Srinivasan <kys@microsoft.com> 7 */ 8 9 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 10 11 #include <linux/kernel.h> 12 #include <linux/jiffies.h> 13 #include <linux/mman.h> 14 #include <linux/delay.h> 15 #include <linux/init.h> 16 #include <linux/module.h> 17 #include <linux/slab.h> 18 #include <linux/kthread.h> 19 #include <linux/completion.h> 20 #include <linux/memory_hotplug.h> 21 #include <linux/memory.h> 22 #include <linux/notifier.h> 23 #include <linux/percpu_counter.h> 24 25 #include <linux/hyperv.h> 26 #include <asm/hyperv-tlfs.h> 27 28 #include <asm/mshyperv.h> 29 30 #define CREATE_TRACE_POINTS 31 #include "hv_trace_balloon.h" 32 33 /* 34 * We begin with definitions supporting the Dynamic Memory protocol 35 * with the host. 36 * 37 * Begin protocol definitions. 38 */ 39 40 41 42 /* 43 * Protocol versions. The low word is the minor version, the high word the major 44 * version. 45 * 46 * History: 47 * Initial version 1.0 48 * Changed to 0.1 on 2009/03/25 49 * Changes to 0.2 on 2009/05/14 50 * Changes to 0.3 on 2009/12/03 51 * Changed to 1.0 on 2011/04/05 52 */ 53 54 #define DYNMEM_MAKE_VERSION(Major, Minor) ((__u32)(((Major) << 16) | (Minor))) 55 #define DYNMEM_MAJOR_VERSION(Version) ((__u32)(Version) >> 16) 56 #define DYNMEM_MINOR_VERSION(Version) ((__u32)(Version) & 0xff) 57 58 enum { 59 DYNMEM_PROTOCOL_VERSION_1 = DYNMEM_MAKE_VERSION(0, 3), 60 DYNMEM_PROTOCOL_VERSION_2 = DYNMEM_MAKE_VERSION(1, 0), 61 DYNMEM_PROTOCOL_VERSION_3 = DYNMEM_MAKE_VERSION(2, 0), 62 63 DYNMEM_PROTOCOL_VERSION_WIN7 = DYNMEM_PROTOCOL_VERSION_1, 64 DYNMEM_PROTOCOL_VERSION_WIN8 = DYNMEM_PROTOCOL_VERSION_2, 65 DYNMEM_PROTOCOL_VERSION_WIN10 = DYNMEM_PROTOCOL_VERSION_3, 66 67 DYNMEM_PROTOCOL_VERSION_CURRENT = DYNMEM_PROTOCOL_VERSION_WIN10 68 }; 69 70 71 72 /* 73 * Message Types 74 */ 75 76 enum dm_message_type { 77 /* 78 * Version 0.3 79 */ 80 DM_ERROR = 0, 81 DM_VERSION_REQUEST = 1, 82 DM_VERSION_RESPONSE = 2, 83 DM_CAPABILITIES_REPORT = 3, 84 DM_CAPABILITIES_RESPONSE = 4, 85 DM_STATUS_REPORT = 5, 86 DM_BALLOON_REQUEST = 6, 87 DM_BALLOON_RESPONSE = 7, 88 DM_UNBALLOON_REQUEST = 8, 89 DM_UNBALLOON_RESPONSE = 9, 90 DM_MEM_HOT_ADD_REQUEST = 10, 91 DM_MEM_HOT_ADD_RESPONSE = 11, 92 DM_VERSION_03_MAX = 11, 93 /* 94 * Version 1.0. 95 */ 96 DM_INFO_MESSAGE = 12, 97 DM_VERSION_1_MAX = 12 98 }; 99 100 101 /* 102 * Structures defining the dynamic memory management 103 * protocol. 104 */ 105 106 union dm_version { 107 struct { 108 __u16 minor_version; 109 __u16 major_version; 110 }; 111 __u32 version; 112 } __packed; 113 114 115 union dm_caps { 116 struct { 117 __u64 balloon:1; 118 __u64 hot_add:1; 119 /* 120 * To support guests that may have alignment 121 * limitations on hot-add, the guest can specify 122 * its alignment requirements; a value of n 123 * represents an alignment of 2^n in mega bytes. 124 */ 125 __u64 hot_add_alignment:4; 126 __u64 reservedz:58; 127 } cap_bits; 128 __u64 caps; 129 } __packed; 130 131 union dm_mem_page_range { 132 struct { 133 /* 134 * The PFN number of the first page in the range. 135 * 40 bits is the architectural limit of a PFN 136 * number for AMD64. 137 */ 138 __u64 start_page:40; 139 /* 140 * The number of pages in the range. 141 */ 142 __u64 page_cnt:24; 143 } finfo; 144 __u64 page_range; 145 } __packed; 146 147 148 149 /* 150 * The header for all dynamic memory messages: 151 * 152 * type: Type of the message. 153 * size: Size of the message in bytes; including the header. 154 * trans_id: The guest is responsible for manufacturing this ID. 155 */ 156 157 struct dm_header { 158 __u16 type; 159 __u16 size; 160 __u32 trans_id; 161 } __packed; 162 163 /* 164 * A generic message format for dynamic memory. 165 * Specific message formats are defined later in the file. 166 */ 167 168 struct dm_message { 169 struct dm_header hdr; 170 __u8 data[]; /* enclosed message */ 171 } __packed; 172 173 174 /* 175 * Specific message types supporting the dynamic memory protocol. 176 */ 177 178 /* 179 * Version negotiation message. Sent from the guest to the host. 180 * The guest is free to try different versions until the host 181 * accepts the version. 182 * 183 * dm_version: The protocol version requested. 184 * is_last_attempt: If TRUE, this is the last version guest will request. 185 * reservedz: Reserved field, set to zero. 186 */ 187 188 struct dm_version_request { 189 struct dm_header hdr; 190 union dm_version version; 191 __u32 is_last_attempt:1; 192 __u32 reservedz:31; 193 } __packed; 194 195 /* 196 * Version response message; Host to Guest and indicates 197 * if the host has accepted the version sent by the guest. 198 * 199 * is_accepted: If TRUE, host has accepted the version and the guest 200 * should proceed to the next stage of the protocol. FALSE indicates that 201 * guest should re-try with a different version. 202 * 203 * reservedz: Reserved field, set to zero. 204 */ 205 206 struct dm_version_response { 207 struct dm_header hdr; 208 __u64 is_accepted:1; 209 __u64 reservedz:63; 210 } __packed; 211 212 /* 213 * Message reporting capabilities. This is sent from the guest to the 214 * host. 215 */ 216 217 struct dm_capabilities { 218 struct dm_header hdr; 219 union dm_caps caps; 220 __u64 min_page_cnt; 221 __u64 max_page_number; 222 } __packed; 223 224 /* 225 * Response to the capabilities message. This is sent from the host to the 226 * guest. This message notifies if the host has accepted the guest's 227 * capabilities. If the host has not accepted, the guest must shutdown 228 * the service. 229 * 230 * is_accepted: Indicates if the host has accepted guest's capabilities. 231 * reservedz: Must be 0. 232 */ 233 234 struct dm_capabilities_resp_msg { 235 struct dm_header hdr; 236 __u64 is_accepted:1; 237 __u64 reservedz:63; 238 } __packed; 239 240 /* 241 * This message is used to report memory pressure from the guest. 242 * This message is not part of any transaction and there is no 243 * response to this message. 244 * 245 * num_avail: Available memory in pages. 246 * num_committed: Committed memory in pages. 247 * page_file_size: The accumulated size of all page files 248 * in the system in pages. 249 * zero_free: The nunber of zero and free pages. 250 * page_file_writes: The writes to the page file in pages. 251 * io_diff: An indicator of file cache efficiency or page file activity, 252 * calculated as File Cache Page Fault Count - Page Read Count. 253 * This value is in pages. 254 * 255 * Some of these metrics are Windows specific and fortunately 256 * the algorithm on the host side that computes the guest memory 257 * pressure only uses num_committed value. 258 */ 259 260 struct dm_status { 261 struct dm_header hdr; 262 __u64 num_avail; 263 __u64 num_committed; 264 __u64 page_file_size; 265 __u64 zero_free; 266 __u32 page_file_writes; 267 __u32 io_diff; 268 } __packed; 269 270 271 /* 272 * Message to ask the guest to allocate memory - balloon up message. 273 * This message is sent from the host to the guest. The guest may not be 274 * able to allocate as much memory as requested. 275 * 276 * num_pages: number of pages to allocate. 277 */ 278 279 struct dm_balloon { 280 struct dm_header hdr; 281 __u32 num_pages; 282 __u32 reservedz; 283 } __packed; 284 285 286 /* 287 * Balloon response message; this message is sent from the guest 288 * to the host in response to the balloon message. 289 * 290 * reservedz: Reserved; must be set to zero. 291 * more_pages: If FALSE, this is the last message of the transaction. 292 * if TRUE there will atleast one more message from the guest. 293 * 294 * range_count: The number of ranges in the range array. 295 * 296 * range_array: An array of page ranges returned to the host. 297 * 298 */ 299 300 struct dm_balloon_response { 301 struct dm_header hdr; 302 __u32 reservedz; 303 __u32 more_pages:1; 304 __u32 range_count:31; 305 union dm_mem_page_range range_array[]; 306 } __packed; 307 308 /* 309 * Un-balloon message; this message is sent from the host 310 * to the guest to give guest more memory. 311 * 312 * more_pages: If FALSE, this is the last message of the transaction. 313 * if TRUE there will atleast one more message from the guest. 314 * 315 * reservedz: Reserved; must be set to zero. 316 * 317 * range_count: The number of ranges in the range array. 318 * 319 * range_array: An array of page ranges returned to the host. 320 * 321 */ 322 323 struct dm_unballoon_request { 324 struct dm_header hdr; 325 __u32 more_pages:1; 326 __u32 reservedz:31; 327 __u32 range_count; 328 union dm_mem_page_range range_array[]; 329 } __packed; 330 331 /* 332 * Un-balloon response message; this message is sent from the guest 333 * to the host in response to an unballoon request. 334 * 335 */ 336 337 struct dm_unballoon_response { 338 struct dm_header hdr; 339 } __packed; 340 341 342 /* 343 * Hot add request message. Message sent from the host to the guest. 344 * 345 * mem_range: Memory range to hot add. 346 * 347 */ 348 349 struct dm_hot_add { 350 struct dm_header hdr; 351 union dm_mem_page_range range; 352 } __packed; 353 354 /* 355 * Hot add response message. 356 * This message is sent by the guest to report the status of a hot add request. 357 * If page_count is less than the requested page count, then the host should 358 * assume all further hot add requests will fail, since this indicates that 359 * the guest has hit an upper physical memory barrier. 360 * 361 * Hot adds may also fail due to low resources; in this case, the guest must 362 * not complete this message until the hot add can succeed, and the host must 363 * not send a new hot add request until the response is sent. 364 * If VSC fails to hot add memory DYNMEM_NUMBER_OF_UNSUCCESSFUL_HOTADD_ATTEMPTS 365 * times it fails the request. 366 * 367 * 368 * page_count: number of pages that were successfully hot added. 369 * 370 * result: result of the operation 1: success, 0: failure. 371 * 372 */ 373 374 struct dm_hot_add_response { 375 struct dm_header hdr; 376 __u32 page_count; 377 __u32 result; 378 } __packed; 379 380 /* 381 * Types of information sent from host to the guest. 382 */ 383 384 enum dm_info_type { 385 INFO_TYPE_MAX_PAGE_CNT = 0, 386 MAX_INFO_TYPE 387 }; 388 389 390 /* 391 * Header for the information message. 392 */ 393 394 struct dm_info_header { 395 enum dm_info_type type; 396 __u32 data_size; 397 } __packed; 398 399 /* 400 * This message is sent from the host to the guest to pass 401 * some relevant information (win8 addition). 402 * 403 * reserved: no used. 404 * info_size: size of the information blob. 405 * info: information blob. 406 */ 407 408 struct dm_info_msg { 409 struct dm_header hdr; 410 __u32 reserved; 411 __u32 info_size; 412 __u8 info[]; 413 }; 414 415 /* 416 * End protocol definitions. 417 */ 418 419 /* 420 * State to manage hot adding memory into the guest. 421 * The range start_pfn : end_pfn specifies the range 422 * that the host has asked us to hot add. The range 423 * start_pfn : ha_end_pfn specifies the range that we have 424 * currently hot added. We hot add in multiples of 128M 425 * chunks; it is possible that we may not be able to bring 426 * online all the pages in the region. The range 427 * covered_start_pfn:covered_end_pfn defines the pages that can 428 * be brough online. 429 */ 430 431 struct hv_hotadd_state { 432 struct list_head list; 433 unsigned long start_pfn; 434 unsigned long covered_start_pfn; 435 unsigned long covered_end_pfn; 436 unsigned long ha_end_pfn; 437 unsigned long end_pfn; 438 /* 439 * A list of gaps. 440 */ 441 struct list_head gap_list; 442 }; 443 444 struct hv_hotadd_gap { 445 struct list_head list; 446 unsigned long start_pfn; 447 unsigned long end_pfn; 448 }; 449 450 struct balloon_state { 451 __u32 num_pages; 452 struct work_struct wrk; 453 }; 454 455 struct hot_add_wrk { 456 union dm_mem_page_range ha_page_range; 457 union dm_mem_page_range ha_region_range; 458 struct work_struct wrk; 459 }; 460 461 static bool allow_hibernation; 462 static bool hot_add = true; 463 static bool do_hot_add; 464 /* 465 * Delay reporting memory pressure by 466 * the specified number of seconds. 467 */ 468 static uint pressure_report_delay = 45; 469 470 /* 471 * The last time we posted a pressure report to host. 472 */ 473 static unsigned long last_post_time; 474 475 module_param(hot_add, bool, (S_IRUGO | S_IWUSR)); 476 MODULE_PARM_DESC(hot_add, "If set attempt memory hot_add"); 477 478 module_param(pressure_report_delay, uint, (S_IRUGO | S_IWUSR)); 479 MODULE_PARM_DESC(pressure_report_delay, "Delay in secs in reporting pressure"); 480 static atomic_t trans_id = ATOMIC_INIT(0); 481 482 static int dm_ring_size = 20 * 1024; 483 484 /* 485 * Driver specific state. 486 */ 487 488 enum hv_dm_state { 489 DM_INITIALIZING = 0, 490 DM_INITIALIZED, 491 DM_BALLOON_UP, 492 DM_BALLOON_DOWN, 493 DM_HOT_ADD, 494 DM_INIT_ERROR 495 }; 496 497 498 static __u8 recv_buffer[HV_HYP_PAGE_SIZE]; 499 static __u8 balloon_up_send_buffer[HV_HYP_PAGE_SIZE]; 500 #define PAGES_IN_2M (2 * 1024 * 1024 / PAGE_SIZE) 501 #define HA_CHUNK (128 * 1024 * 1024 / PAGE_SIZE) 502 503 struct hv_dynmem_device { 504 struct hv_device *dev; 505 enum hv_dm_state state; 506 struct completion host_event; 507 struct completion config_event; 508 509 /* 510 * Number of pages we have currently ballooned out. 511 */ 512 unsigned int num_pages_ballooned; 513 unsigned int num_pages_onlined; 514 unsigned int num_pages_added; 515 516 /* 517 * State to manage the ballooning (up) operation. 518 */ 519 struct balloon_state balloon_wrk; 520 521 /* 522 * State to execute the "hot-add" operation. 523 */ 524 struct hot_add_wrk ha_wrk; 525 526 /* 527 * This state tracks if the host has specified a hot-add 528 * region. 529 */ 530 bool host_specified_ha_region; 531 532 /* 533 * State to synchronize hot-add. 534 */ 535 struct completion ol_waitevent; 536 /* 537 * This thread handles hot-add 538 * requests from the host as well as notifying 539 * the host with regards to memory pressure in 540 * the guest. 541 */ 542 struct task_struct *thread; 543 544 /* 545 * Protects ha_region_list, num_pages_onlined counter and individual 546 * regions from ha_region_list. 547 */ 548 spinlock_t ha_lock; 549 550 /* 551 * A list of hot-add regions. 552 */ 553 struct list_head ha_region_list; 554 555 /* 556 * We start with the highest version we can support 557 * and downgrade based on the host; we save here the 558 * next version to try. 559 */ 560 __u32 next_version; 561 562 /* 563 * The negotiated version agreed by host. 564 */ 565 __u32 version; 566 }; 567 568 static struct hv_dynmem_device dm_device; 569 570 static void post_status(struct hv_dynmem_device *dm); 571 572 #ifdef CONFIG_MEMORY_HOTPLUG 573 static inline bool has_pfn_is_backed(struct hv_hotadd_state *has, 574 unsigned long pfn) 575 { 576 struct hv_hotadd_gap *gap; 577 578 /* The page is not backed. */ 579 if ((pfn < has->covered_start_pfn) || (pfn >= has->covered_end_pfn)) 580 return false; 581 582 /* Check for gaps. */ 583 list_for_each_entry(gap, &has->gap_list, list) { 584 if ((pfn >= gap->start_pfn) && (pfn < gap->end_pfn)) 585 return false; 586 } 587 588 return true; 589 } 590 591 static unsigned long hv_page_offline_check(unsigned long start_pfn, 592 unsigned long nr_pages) 593 { 594 unsigned long pfn = start_pfn, count = 0; 595 struct hv_hotadd_state *has; 596 bool found; 597 598 while (pfn < start_pfn + nr_pages) { 599 /* 600 * Search for HAS which covers the pfn and when we find one 601 * count how many consequitive PFNs are covered. 602 */ 603 found = false; 604 list_for_each_entry(has, &dm_device.ha_region_list, list) { 605 while ((pfn >= has->start_pfn) && 606 (pfn < has->end_pfn) && 607 (pfn < start_pfn + nr_pages)) { 608 found = true; 609 if (has_pfn_is_backed(has, pfn)) 610 count++; 611 pfn++; 612 } 613 } 614 615 /* 616 * This PFN is not in any HAS (e.g. we're offlining a region 617 * which was present at boot), no need to account for it. Go 618 * to the next one. 619 */ 620 if (!found) 621 pfn++; 622 } 623 624 return count; 625 } 626 627 static int hv_memory_notifier(struct notifier_block *nb, unsigned long val, 628 void *v) 629 { 630 struct memory_notify *mem = (struct memory_notify *)v; 631 unsigned long flags, pfn_count; 632 633 switch (val) { 634 case MEM_ONLINE: 635 case MEM_CANCEL_ONLINE: 636 complete(&dm_device.ol_waitevent); 637 break; 638 639 case MEM_OFFLINE: 640 spin_lock_irqsave(&dm_device.ha_lock, flags); 641 pfn_count = hv_page_offline_check(mem->start_pfn, 642 mem->nr_pages); 643 if (pfn_count <= dm_device.num_pages_onlined) { 644 dm_device.num_pages_onlined -= pfn_count; 645 } else { 646 /* 647 * We're offlining more pages than we managed to online. 648 * This is unexpected. In any case don't let 649 * num_pages_onlined wrap around zero. 650 */ 651 WARN_ON_ONCE(1); 652 dm_device.num_pages_onlined = 0; 653 } 654 spin_unlock_irqrestore(&dm_device.ha_lock, flags); 655 break; 656 case MEM_GOING_ONLINE: 657 case MEM_GOING_OFFLINE: 658 case MEM_CANCEL_OFFLINE: 659 break; 660 } 661 return NOTIFY_OK; 662 } 663 664 static struct notifier_block hv_memory_nb = { 665 .notifier_call = hv_memory_notifier, 666 .priority = 0 667 }; 668 669 /* Check if the particular page is backed and can be onlined and online it. */ 670 static void hv_page_online_one(struct hv_hotadd_state *has, struct page *pg) 671 { 672 if (!has_pfn_is_backed(has, page_to_pfn(pg))) { 673 if (!PageOffline(pg)) 674 __SetPageOffline(pg); 675 return; 676 } 677 if (PageOffline(pg)) 678 __ClearPageOffline(pg); 679 680 /* This frame is currently backed; online the page. */ 681 generic_online_page(pg, 0); 682 683 lockdep_assert_held(&dm_device.ha_lock); 684 dm_device.num_pages_onlined++; 685 } 686 687 static void hv_bring_pgs_online(struct hv_hotadd_state *has, 688 unsigned long start_pfn, unsigned long size) 689 { 690 int i; 691 692 pr_debug("Online %lu pages starting at pfn 0x%lx\n", size, start_pfn); 693 for (i = 0; i < size; i++) 694 hv_page_online_one(has, pfn_to_page(start_pfn + i)); 695 } 696 697 static void hv_mem_hot_add(unsigned long start, unsigned long size, 698 unsigned long pfn_count, 699 struct hv_hotadd_state *has) 700 { 701 int ret = 0; 702 int i, nid; 703 unsigned long start_pfn; 704 unsigned long processed_pfn; 705 unsigned long total_pfn = pfn_count; 706 unsigned long flags; 707 708 for (i = 0; i < (size/HA_CHUNK); i++) { 709 start_pfn = start + (i * HA_CHUNK); 710 711 spin_lock_irqsave(&dm_device.ha_lock, flags); 712 has->ha_end_pfn += HA_CHUNK; 713 714 if (total_pfn > HA_CHUNK) { 715 processed_pfn = HA_CHUNK; 716 total_pfn -= HA_CHUNK; 717 } else { 718 processed_pfn = total_pfn; 719 total_pfn = 0; 720 } 721 722 has->covered_end_pfn += processed_pfn; 723 spin_unlock_irqrestore(&dm_device.ha_lock, flags); 724 725 reinit_completion(&dm_device.ol_waitevent); 726 727 nid = memory_add_physaddr_to_nid(PFN_PHYS(start_pfn)); 728 ret = add_memory(nid, PFN_PHYS((start_pfn)), 729 (HA_CHUNK << PAGE_SHIFT), MEMHP_MERGE_RESOURCE); 730 731 if (ret) { 732 pr_err("hot_add memory failed error is %d\n", ret); 733 if (ret == -EEXIST) { 734 /* 735 * This error indicates that the error 736 * is not a transient failure. This is the 737 * case where the guest's physical address map 738 * precludes hot adding memory. Stop all further 739 * memory hot-add. 740 */ 741 do_hot_add = false; 742 } 743 spin_lock_irqsave(&dm_device.ha_lock, flags); 744 has->ha_end_pfn -= HA_CHUNK; 745 has->covered_end_pfn -= processed_pfn; 746 spin_unlock_irqrestore(&dm_device.ha_lock, flags); 747 break; 748 } 749 750 /* 751 * Wait for memory to get onlined. If the kernel onlined the 752 * memory when adding it, this will return directly. Otherwise, 753 * it will wait for user space to online the memory. This helps 754 * to avoid adding memory faster than it is getting onlined. As 755 * adding succeeded, it is ok to proceed even if the memory was 756 * not onlined in time. 757 */ 758 wait_for_completion_timeout(&dm_device.ol_waitevent, 5 * HZ); 759 post_status(&dm_device); 760 } 761 } 762 763 static void hv_online_page(struct page *pg, unsigned int order) 764 { 765 struct hv_hotadd_state *has; 766 unsigned long flags; 767 unsigned long pfn = page_to_pfn(pg); 768 769 spin_lock_irqsave(&dm_device.ha_lock, flags); 770 list_for_each_entry(has, &dm_device.ha_region_list, list) { 771 /* The page belongs to a different HAS. */ 772 if ((pfn < has->start_pfn) || 773 (pfn + (1UL << order) > has->end_pfn)) 774 continue; 775 776 hv_bring_pgs_online(has, pfn, 1UL << order); 777 break; 778 } 779 spin_unlock_irqrestore(&dm_device.ha_lock, flags); 780 } 781 782 static int pfn_covered(unsigned long start_pfn, unsigned long pfn_cnt) 783 { 784 struct hv_hotadd_state *has; 785 struct hv_hotadd_gap *gap; 786 unsigned long residual, new_inc; 787 int ret = 0; 788 unsigned long flags; 789 790 spin_lock_irqsave(&dm_device.ha_lock, flags); 791 list_for_each_entry(has, &dm_device.ha_region_list, list) { 792 /* 793 * If the pfn range we are dealing with is not in the current 794 * "hot add block", move on. 795 */ 796 if (start_pfn < has->start_pfn || start_pfn >= has->end_pfn) 797 continue; 798 799 /* 800 * If the current start pfn is not where the covered_end 801 * is, create a gap and update covered_end_pfn. 802 */ 803 if (has->covered_end_pfn != start_pfn) { 804 gap = kzalloc(sizeof(struct hv_hotadd_gap), GFP_ATOMIC); 805 if (!gap) { 806 ret = -ENOMEM; 807 break; 808 } 809 810 INIT_LIST_HEAD(&gap->list); 811 gap->start_pfn = has->covered_end_pfn; 812 gap->end_pfn = start_pfn; 813 list_add_tail(&gap->list, &has->gap_list); 814 815 has->covered_end_pfn = start_pfn; 816 } 817 818 /* 819 * If the current hot add-request extends beyond 820 * our current limit; extend it. 821 */ 822 if ((start_pfn + pfn_cnt) > has->end_pfn) { 823 residual = (start_pfn + pfn_cnt - has->end_pfn); 824 /* 825 * Extend the region by multiples of HA_CHUNK. 826 */ 827 new_inc = (residual / HA_CHUNK) * HA_CHUNK; 828 if (residual % HA_CHUNK) 829 new_inc += HA_CHUNK; 830 831 has->end_pfn += new_inc; 832 } 833 834 ret = 1; 835 break; 836 } 837 spin_unlock_irqrestore(&dm_device.ha_lock, flags); 838 839 return ret; 840 } 841 842 static unsigned long handle_pg_range(unsigned long pg_start, 843 unsigned long pg_count) 844 { 845 unsigned long start_pfn = pg_start; 846 unsigned long pfn_cnt = pg_count; 847 unsigned long size; 848 struct hv_hotadd_state *has; 849 unsigned long pgs_ol = 0; 850 unsigned long old_covered_state; 851 unsigned long res = 0, flags; 852 853 pr_debug("Hot adding %lu pages starting at pfn 0x%lx.\n", pg_count, 854 pg_start); 855 856 spin_lock_irqsave(&dm_device.ha_lock, flags); 857 list_for_each_entry(has, &dm_device.ha_region_list, list) { 858 /* 859 * If the pfn range we are dealing with is not in the current 860 * "hot add block", move on. 861 */ 862 if (start_pfn < has->start_pfn || start_pfn >= has->end_pfn) 863 continue; 864 865 old_covered_state = has->covered_end_pfn; 866 867 if (start_pfn < has->ha_end_pfn) { 868 /* 869 * This is the case where we are backing pages 870 * in an already hot added region. Bring 871 * these pages online first. 872 */ 873 pgs_ol = has->ha_end_pfn - start_pfn; 874 if (pgs_ol > pfn_cnt) 875 pgs_ol = pfn_cnt; 876 877 has->covered_end_pfn += pgs_ol; 878 pfn_cnt -= pgs_ol; 879 /* 880 * Check if the corresponding memory block is already 881 * online. It is possible to observe struct pages still 882 * being uninitialized here so check section instead. 883 * In case the section is online we need to bring the 884 * rest of pfns (which were not backed previously) 885 * online too. 886 */ 887 if (start_pfn > has->start_pfn && 888 online_section_nr(pfn_to_section_nr(start_pfn))) 889 hv_bring_pgs_online(has, start_pfn, pgs_ol); 890 891 } 892 893 if ((has->ha_end_pfn < has->end_pfn) && (pfn_cnt > 0)) { 894 /* 895 * We have some residual hot add range 896 * that needs to be hot added; hot add 897 * it now. Hot add a multiple of 898 * of HA_CHUNK that fully covers the pages 899 * we have. 900 */ 901 size = (has->end_pfn - has->ha_end_pfn); 902 if (pfn_cnt <= size) { 903 size = ((pfn_cnt / HA_CHUNK) * HA_CHUNK); 904 if (pfn_cnt % HA_CHUNK) 905 size += HA_CHUNK; 906 } else { 907 pfn_cnt = size; 908 } 909 spin_unlock_irqrestore(&dm_device.ha_lock, flags); 910 hv_mem_hot_add(has->ha_end_pfn, size, pfn_cnt, has); 911 spin_lock_irqsave(&dm_device.ha_lock, flags); 912 } 913 /* 914 * If we managed to online any pages that were given to us, 915 * we declare success. 916 */ 917 res = has->covered_end_pfn - old_covered_state; 918 break; 919 } 920 spin_unlock_irqrestore(&dm_device.ha_lock, flags); 921 922 return res; 923 } 924 925 static unsigned long process_hot_add(unsigned long pg_start, 926 unsigned long pfn_cnt, 927 unsigned long rg_start, 928 unsigned long rg_size) 929 { 930 struct hv_hotadd_state *ha_region = NULL; 931 int covered; 932 unsigned long flags; 933 934 if (pfn_cnt == 0) 935 return 0; 936 937 if (!dm_device.host_specified_ha_region) { 938 covered = pfn_covered(pg_start, pfn_cnt); 939 if (covered < 0) 940 return 0; 941 942 if (covered) 943 goto do_pg_range; 944 } 945 946 /* 947 * If the host has specified a hot-add range; deal with it first. 948 */ 949 950 if (rg_size != 0) { 951 ha_region = kzalloc(sizeof(struct hv_hotadd_state), GFP_KERNEL); 952 if (!ha_region) 953 return 0; 954 955 INIT_LIST_HEAD(&ha_region->list); 956 INIT_LIST_HEAD(&ha_region->gap_list); 957 958 ha_region->start_pfn = rg_start; 959 ha_region->ha_end_pfn = rg_start; 960 ha_region->covered_start_pfn = pg_start; 961 ha_region->covered_end_pfn = pg_start; 962 ha_region->end_pfn = rg_start + rg_size; 963 964 spin_lock_irqsave(&dm_device.ha_lock, flags); 965 list_add_tail(&ha_region->list, &dm_device.ha_region_list); 966 spin_unlock_irqrestore(&dm_device.ha_lock, flags); 967 } 968 969 do_pg_range: 970 /* 971 * Process the page range specified; bringing them 972 * online if possible. 973 */ 974 return handle_pg_range(pg_start, pfn_cnt); 975 } 976 977 #endif 978 979 static void hot_add_req(struct work_struct *dummy) 980 { 981 struct dm_hot_add_response resp; 982 #ifdef CONFIG_MEMORY_HOTPLUG 983 unsigned long pg_start, pfn_cnt; 984 unsigned long rg_start, rg_sz; 985 #endif 986 struct hv_dynmem_device *dm = &dm_device; 987 988 memset(&resp, 0, sizeof(struct dm_hot_add_response)); 989 resp.hdr.type = DM_MEM_HOT_ADD_RESPONSE; 990 resp.hdr.size = sizeof(struct dm_hot_add_response); 991 992 #ifdef CONFIG_MEMORY_HOTPLUG 993 pg_start = dm->ha_wrk.ha_page_range.finfo.start_page; 994 pfn_cnt = dm->ha_wrk.ha_page_range.finfo.page_cnt; 995 996 rg_start = dm->ha_wrk.ha_region_range.finfo.start_page; 997 rg_sz = dm->ha_wrk.ha_region_range.finfo.page_cnt; 998 999 if ((rg_start == 0) && (!dm->host_specified_ha_region)) { 1000 unsigned long region_size; 1001 unsigned long region_start; 1002 1003 /* 1004 * The host has not specified the hot-add region. 1005 * Based on the hot-add page range being specified, 1006 * compute a hot-add region that can cover the pages 1007 * that need to be hot-added while ensuring the alignment 1008 * and size requirements of Linux as it relates to hot-add. 1009 */ 1010 region_start = pg_start; 1011 region_size = (pfn_cnt / HA_CHUNK) * HA_CHUNK; 1012 if (pfn_cnt % HA_CHUNK) 1013 region_size += HA_CHUNK; 1014 1015 region_start = (pg_start / HA_CHUNK) * HA_CHUNK; 1016 1017 rg_start = region_start; 1018 rg_sz = region_size; 1019 } 1020 1021 if (do_hot_add) 1022 resp.page_count = process_hot_add(pg_start, pfn_cnt, 1023 rg_start, rg_sz); 1024 1025 dm->num_pages_added += resp.page_count; 1026 #endif 1027 /* 1028 * The result field of the response structure has the 1029 * following semantics: 1030 * 1031 * 1. If all or some pages hot-added: Guest should return success. 1032 * 1033 * 2. If no pages could be hot-added: 1034 * 1035 * If the guest returns success, then the host 1036 * will not attempt any further hot-add operations. This 1037 * signifies a permanent failure. 1038 * 1039 * If the guest returns failure, then this failure will be 1040 * treated as a transient failure and the host may retry the 1041 * hot-add operation after some delay. 1042 */ 1043 if (resp.page_count > 0) 1044 resp.result = 1; 1045 else if (!do_hot_add) 1046 resp.result = 1; 1047 else 1048 resp.result = 0; 1049 1050 if (!do_hot_add || resp.page_count == 0) { 1051 if (!allow_hibernation) 1052 pr_err("Memory hot add failed\n"); 1053 else 1054 pr_info("Ignore hot-add request!\n"); 1055 } 1056 1057 dm->state = DM_INITIALIZED; 1058 resp.hdr.trans_id = atomic_inc_return(&trans_id); 1059 vmbus_sendpacket(dm->dev->channel, &resp, 1060 sizeof(struct dm_hot_add_response), 1061 (unsigned long)NULL, 1062 VM_PKT_DATA_INBAND, 0); 1063 } 1064 1065 static void process_info(struct hv_dynmem_device *dm, struct dm_info_msg *msg) 1066 { 1067 struct dm_info_header *info_hdr; 1068 1069 info_hdr = (struct dm_info_header *)msg->info; 1070 1071 switch (info_hdr->type) { 1072 case INFO_TYPE_MAX_PAGE_CNT: 1073 if (info_hdr->data_size == sizeof(__u64)) { 1074 __u64 *max_page_count = (__u64 *)&info_hdr[1]; 1075 1076 pr_info("Max. dynamic memory size: %llu MB\n", 1077 (*max_page_count) >> (20 - HV_HYP_PAGE_SHIFT)); 1078 } 1079 1080 break; 1081 default: 1082 pr_warn("Received Unknown type: %d\n", info_hdr->type); 1083 } 1084 } 1085 1086 static unsigned long compute_balloon_floor(void) 1087 { 1088 unsigned long min_pages; 1089 unsigned long nr_pages = totalram_pages(); 1090 #define MB2PAGES(mb) ((mb) << (20 - PAGE_SHIFT)) 1091 /* Simple continuous piecewiese linear function: 1092 * max MiB -> min MiB gradient 1093 * 0 0 1094 * 16 16 1095 * 32 24 1096 * 128 72 (1/2) 1097 * 512 168 (1/4) 1098 * 2048 360 (1/8) 1099 * 8192 744 (1/16) 1100 * 32768 1512 (1/32) 1101 */ 1102 if (nr_pages < MB2PAGES(128)) 1103 min_pages = MB2PAGES(8) + (nr_pages >> 1); 1104 else if (nr_pages < MB2PAGES(512)) 1105 min_pages = MB2PAGES(40) + (nr_pages >> 2); 1106 else if (nr_pages < MB2PAGES(2048)) 1107 min_pages = MB2PAGES(104) + (nr_pages >> 3); 1108 else if (nr_pages < MB2PAGES(8192)) 1109 min_pages = MB2PAGES(232) + (nr_pages >> 4); 1110 else 1111 min_pages = MB2PAGES(488) + (nr_pages >> 5); 1112 #undef MB2PAGES 1113 return min_pages; 1114 } 1115 1116 /* 1117 * Post our status as it relates memory pressure to the 1118 * host. Host expects the guests to post this status 1119 * periodically at 1 second intervals. 1120 * 1121 * The metrics specified in this protocol are very Windows 1122 * specific and so we cook up numbers here to convey our memory 1123 * pressure. 1124 */ 1125 1126 static void post_status(struct hv_dynmem_device *dm) 1127 { 1128 struct dm_status status; 1129 unsigned long now = jiffies; 1130 unsigned long last_post = last_post_time; 1131 1132 if (pressure_report_delay > 0) { 1133 --pressure_report_delay; 1134 return; 1135 } 1136 1137 if (!time_after(now, (last_post_time + HZ))) 1138 return; 1139 1140 memset(&status, 0, sizeof(struct dm_status)); 1141 status.hdr.type = DM_STATUS_REPORT; 1142 status.hdr.size = sizeof(struct dm_status); 1143 status.hdr.trans_id = atomic_inc_return(&trans_id); 1144 1145 /* 1146 * The host expects the guest to report free and committed memory. 1147 * Furthermore, the host expects the pressure information to include 1148 * the ballooned out pages. For a given amount of memory that we are 1149 * managing we need to compute a floor below which we should not 1150 * balloon. Compute this and add it to the pressure report. 1151 * We also need to report all offline pages (num_pages_added - 1152 * num_pages_onlined) as committed to the host, otherwise it can try 1153 * asking us to balloon them out. 1154 */ 1155 status.num_avail = si_mem_available(); 1156 status.num_committed = vm_memory_committed() + 1157 dm->num_pages_ballooned + 1158 (dm->num_pages_added > dm->num_pages_onlined ? 1159 dm->num_pages_added - dm->num_pages_onlined : 0) + 1160 compute_balloon_floor(); 1161 1162 trace_balloon_status(status.num_avail, status.num_committed, 1163 vm_memory_committed(), dm->num_pages_ballooned, 1164 dm->num_pages_added, dm->num_pages_onlined); 1165 /* 1166 * If our transaction ID is no longer current, just don't 1167 * send the status. This can happen if we were interrupted 1168 * after we picked our transaction ID. 1169 */ 1170 if (status.hdr.trans_id != atomic_read(&trans_id)) 1171 return; 1172 1173 /* 1174 * If the last post time that we sampled has changed, 1175 * we have raced, don't post the status. 1176 */ 1177 if (last_post != last_post_time) 1178 return; 1179 1180 last_post_time = jiffies; 1181 vmbus_sendpacket(dm->dev->channel, &status, 1182 sizeof(struct dm_status), 1183 (unsigned long)NULL, 1184 VM_PKT_DATA_INBAND, 0); 1185 1186 } 1187 1188 static void free_balloon_pages(struct hv_dynmem_device *dm, 1189 union dm_mem_page_range *range_array) 1190 { 1191 int num_pages = range_array->finfo.page_cnt; 1192 __u64 start_frame = range_array->finfo.start_page; 1193 struct page *pg; 1194 int i; 1195 1196 for (i = 0; i < num_pages; i++) { 1197 pg = pfn_to_page(i + start_frame); 1198 __ClearPageOffline(pg); 1199 __free_page(pg); 1200 dm->num_pages_ballooned--; 1201 } 1202 } 1203 1204 1205 1206 static unsigned int alloc_balloon_pages(struct hv_dynmem_device *dm, 1207 unsigned int num_pages, 1208 struct dm_balloon_response *bl_resp, 1209 int alloc_unit) 1210 { 1211 unsigned int i, j; 1212 struct page *pg; 1213 1214 for (i = 0; i < num_pages / alloc_unit; i++) { 1215 if (bl_resp->hdr.size + sizeof(union dm_mem_page_range) > 1216 HV_HYP_PAGE_SIZE) 1217 return i * alloc_unit; 1218 1219 /* 1220 * We execute this code in a thread context. Furthermore, 1221 * we don't want the kernel to try too hard. 1222 */ 1223 pg = alloc_pages(GFP_HIGHUSER | __GFP_NORETRY | 1224 __GFP_NOMEMALLOC | __GFP_NOWARN, 1225 get_order(alloc_unit << PAGE_SHIFT)); 1226 1227 if (!pg) 1228 return i * alloc_unit; 1229 1230 dm->num_pages_ballooned += alloc_unit; 1231 1232 /* 1233 * If we allocatted 2M pages; split them so we 1234 * can free them in any order we get. 1235 */ 1236 1237 if (alloc_unit != 1) 1238 split_page(pg, get_order(alloc_unit << PAGE_SHIFT)); 1239 1240 /* mark all pages offline */ 1241 for (j = 0; j < (1 << get_order(alloc_unit << PAGE_SHIFT)); j++) 1242 __SetPageOffline(pg + j); 1243 1244 bl_resp->range_count++; 1245 bl_resp->range_array[i].finfo.start_page = 1246 page_to_pfn(pg); 1247 bl_resp->range_array[i].finfo.page_cnt = alloc_unit; 1248 bl_resp->hdr.size += sizeof(union dm_mem_page_range); 1249 1250 } 1251 1252 return i * alloc_unit; 1253 } 1254 1255 static void balloon_up(struct work_struct *dummy) 1256 { 1257 unsigned int num_pages = dm_device.balloon_wrk.num_pages; 1258 unsigned int num_ballooned = 0; 1259 struct dm_balloon_response *bl_resp; 1260 int alloc_unit; 1261 int ret; 1262 bool done = false; 1263 int i; 1264 long avail_pages; 1265 unsigned long floor; 1266 1267 /* 1268 * We will attempt 2M allocations. However, if we fail to 1269 * allocate 2M chunks, we will go back to PAGE_SIZE allocations. 1270 */ 1271 alloc_unit = PAGES_IN_2M; 1272 1273 avail_pages = si_mem_available(); 1274 floor = compute_balloon_floor(); 1275 1276 /* Refuse to balloon below the floor. */ 1277 if (avail_pages < num_pages || avail_pages - num_pages < floor) { 1278 pr_info("Balloon request will be partially fulfilled. %s\n", 1279 avail_pages < num_pages ? "Not enough memory." : 1280 "Balloon floor reached."); 1281 1282 num_pages = avail_pages > floor ? (avail_pages - floor) : 0; 1283 } 1284 1285 while (!done) { 1286 memset(balloon_up_send_buffer, 0, HV_HYP_PAGE_SIZE); 1287 bl_resp = (struct dm_balloon_response *)balloon_up_send_buffer; 1288 bl_resp->hdr.type = DM_BALLOON_RESPONSE; 1289 bl_resp->hdr.size = sizeof(struct dm_balloon_response); 1290 bl_resp->more_pages = 1; 1291 1292 num_pages -= num_ballooned; 1293 num_ballooned = alloc_balloon_pages(&dm_device, num_pages, 1294 bl_resp, alloc_unit); 1295 1296 if (alloc_unit != 1 && num_ballooned == 0) { 1297 alloc_unit = 1; 1298 continue; 1299 } 1300 1301 if (num_ballooned == 0 || num_ballooned == num_pages) { 1302 pr_debug("Ballooned %u out of %u requested pages.\n", 1303 num_pages, dm_device.balloon_wrk.num_pages); 1304 1305 bl_resp->more_pages = 0; 1306 done = true; 1307 dm_device.state = DM_INITIALIZED; 1308 } 1309 1310 /* 1311 * We are pushing a lot of data through the channel; 1312 * deal with transient failures caused because of the 1313 * lack of space in the ring buffer. 1314 */ 1315 1316 do { 1317 bl_resp->hdr.trans_id = atomic_inc_return(&trans_id); 1318 ret = vmbus_sendpacket(dm_device.dev->channel, 1319 bl_resp, 1320 bl_resp->hdr.size, 1321 (unsigned long)NULL, 1322 VM_PKT_DATA_INBAND, 0); 1323 1324 if (ret == -EAGAIN) 1325 msleep(20); 1326 post_status(&dm_device); 1327 } while (ret == -EAGAIN); 1328 1329 if (ret) { 1330 /* 1331 * Free up the memory we allocatted. 1332 */ 1333 pr_err("Balloon response failed\n"); 1334 1335 for (i = 0; i < bl_resp->range_count; i++) 1336 free_balloon_pages(&dm_device, 1337 &bl_resp->range_array[i]); 1338 1339 done = true; 1340 } 1341 } 1342 1343 } 1344 1345 static void balloon_down(struct hv_dynmem_device *dm, 1346 struct dm_unballoon_request *req) 1347 { 1348 union dm_mem_page_range *range_array = req->range_array; 1349 int range_count = req->range_count; 1350 struct dm_unballoon_response resp; 1351 int i; 1352 unsigned int prev_pages_ballooned = dm->num_pages_ballooned; 1353 1354 for (i = 0; i < range_count; i++) { 1355 free_balloon_pages(dm, &range_array[i]); 1356 complete(&dm_device.config_event); 1357 } 1358 1359 pr_debug("Freed %u ballooned pages.\n", 1360 prev_pages_ballooned - dm->num_pages_ballooned); 1361 1362 if (req->more_pages == 1) 1363 return; 1364 1365 memset(&resp, 0, sizeof(struct dm_unballoon_response)); 1366 resp.hdr.type = DM_UNBALLOON_RESPONSE; 1367 resp.hdr.trans_id = atomic_inc_return(&trans_id); 1368 resp.hdr.size = sizeof(struct dm_unballoon_response); 1369 1370 vmbus_sendpacket(dm_device.dev->channel, &resp, 1371 sizeof(struct dm_unballoon_response), 1372 (unsigned long)NULL, 1373 VM_PKT_DATA_INBAND, 0); 1374 1375 dm->state = DM_INITIALIZED; 1376 } 1377 1378 static void balloon_onchannelcallback(void *context); 1379 1380 static int dm_thread_func(void *dm_dev) 1381 { 1382 struct hv_dynmem_device *dm = dm_dev; 1383 1384 while (!kthread_should_stop()) { 1385 wait_for_completion_interruptible_timeout( 1386 &dm_device.config_event, 1*HZ); 1387 /* 1388 * The host expects us to post information on the memory 1389 * pressure every second. 1390 */ 1391 reinit_completion(&dm_device.config_event); 1392 post_status(dm); 1393 } 1394 1395 return 0; 1396 } 1397 1398 1399 static void version_resp(struct hv_dynmem_device *dm, 1400 struct dm_version_response *vresp) 1401 { 1402 struct dm_version_request version_req; 1403 int ret; 1404 1405 if (vresp->is_accepted) { 1406 /* 1407 * We are done; wakeup the 1408 * context waiting for version 1409 * negotiation. 1410 */ 1411 complete(&dm->host_event); 1412 return; 1413 } 1414 /* 1415 * If there are more versions to try, continue 1416 * with negotiations; if not 1417 * shutdown the service since we are not able 1418 * to negotiate a suitable version number 1419 * with the host. 1420 */ 1421 if (dm->next_version == 0) 1422 goto version_error; 1423 1424 memset(&version_req, 0, sizeof(struct dm_version_request)); 1425 version_req.hdr.type = DM_VERSION_REQUEST; 1426 version_req.hdr.size = sizeof(struct dm_version_request); 1427 version_req.hdr.trans_id = atomic_inc_return(&trans_id); 1428 version_req.version.version = dm->next_version; 1429 dm->version = version_req.version.version; 1430 1431 /* 1432 * Set the next version to try in case current version fails. 1433 * Win7 protocol ought to be the last one to try. 1434 */ 1435 switch (version_req.version.version) { 1436 case DYNMEM_PROTOCOL_VERSION_WIN8: 1437 dm->next_version = DYNMEM_PROTOCOL_VERSION_WIN7; 1438 version_req.is_last_attempt = 0; 1439 break; 1440 default: 1441 dm->next_version = 0; 1442 version_req.is_last_attempt = 1; 1443 } 1444 1445 ret = vmbus_sendpacket(dm->dev->channel, &version_req, 1446 sizeof(struct dm_version_request), 1447 (unsigned long)NULL, 1448 VM_PKT_DATA_INBAND, 0); 1449 1450 if (ret) 1451 goto version_error; 1452 1453 return; 1454 1455 version_error: 1456 dm->state = DM_INIT_ERROR; 1457 complete(&dm->host_event); 1458 } 1459 1460 static void cap_resp(struct hv_dynmem_device *dm, 1461 struct dm_capabilities_resp_msg *cap_resp) 1462 { 1463 if (!cap_resp->is_accepted) { 1464 pr_err("Capabilities not accepted by host\n"); 1465 dm->state = DM_INIT_ERROR; 1466 } 1467 complete(&dm->host_event); 1468 } 1469 1470 static void balloon_onchannelcallback(void *context) 1471 { 1472 struct hv_device *dev = context; 1473 u32 recvlen; 1474 u64 requestid; 1475 struct dm_message *dm_msg; 1476 struct dm_header *dm_hdr; 1477 struct hv_dynmem_device *dm = hv_get_drvdata(dev); 1478 struct dm_balloon *bal_msg; 1479 struct dm_hot_add *ha_msg; 1480 union dm_mem_page_range *ha_pg_range; 1481 union dm_mem_page_range *ha_region; 1482 1483 memset(recv_buffer, 0, sizeof(recv_buffer)); 1484 vmbus_recvpacket(dev->channel, recv_buffer, 1485 HV_HYP_PAGE_SIZE, &recvlen, &requestid); 1486 1487 if (recvlen > 0) { 1488 dm_msg = (struct dm_message *)recv_buffer; 1489 dm_hdr = &dm_msg->hdr; 1490 1491 switch (dm_hdr->type) { 1492 case DM_VERSION_RESPONSE: 1493 version_resp(dm, 1494 (struct dm_version_response *)dm_msg); 1495 break; 1496 1497 case DM_CAPABILITIES_RESPONSE: 1498 cap_resp(dm, 1499 (struct dm_capabilities_resp_msg *)dm_msg); 1500 break; 1501 1502 case DM_BALLOON_REQUEST: 1503 if (allow_hibernation) { 1504 pr_info("Ignore balloon-up request!\n"); 1505 break; 1506 } 1507 1508 if (dm->state == DM_BALLOON_UP) 1509 pr_warn("Currently ballooning\n"); 1510 bal_msg = (struct dm_balloon *)recv_buffer; 1511 dm->state = DM_BALLOON_UP; 1512 dm_device.balloon_wrk.num_pages = bal_msg->num_pages; 1513 schedule_work(&dm_device.balloon_wrk.wrk); 1514 break; 1515 1516 case DM_UNBALLOON_REQUEST: 1517 if (allow_hibernation) { 1518 pr_info("Ignore balloon-down request!\n"); 1519 break; 1520 } 1521 1522 dm->state = DM_BALLOON_DOWN; 1523 balloon_down(dm, 1524 (struct dm_unballoon_request *)recv_buffer); 1525 break; 1526 1527 case DM_MEM_HOT_ADD_REQUEST: 1528 if (dm->state == DM_HOT_ADD) 1529 pr_warn("Currently hot-adding\n"); 1530 dm->state = DM_HOT_ADD; 1531 ha_msg = (struct dm_hot_add *)recv_buffer; 1532 if (ha_msg->hdr.size == sizeof(struct dm_hot_add)) { 1533 /* 1534 * This is a normal hot-add request specifying 1535 * hot-add memory. 1536 */ 1537 dm->host_specified_ha_region = false; 1538 ha_pg_range = &ha_msg->range; 1539 dm->ha_wrk.ha_page_range = *ha_pg_range; 1540 dm->ha_wrk.ha_region_range.page_range = 0; 1541 } else { 1542 /* 1543 * Host is specifying that we first hot-add 1544 * a region and then partially populate this 1545 * region. 1546 */ 1547 dm->host_specified_ha_region = true; 1548 ha_pg_range = &ha_msg->range; 1549 ha_region = &ha_pg_range[1]; 1550 dm->ha_wrk.ha_page_range = *ha_pg_range; 1551 dm->ha_wrk.ha_region_range = *ha_region; 1552 } 1553 schedule_work(&dm_device.ha_wrk.wrk); 1554 break; 1555 1556 case DM_INFO_MESSAGE: 1557 process_info(dm, (struct dm_info_msg *)dm_msg); 1558 break; 1559 1560 default: 1561 pr_warn("Unhandled message: type: %d\n", dm_hdr->type); 1562 1563 } 1564 } 1565 1566 } 1567 1568 static int balloon_connect_vsp(struct hv_device *dev) 1569 { 1570 struct dm_version_request version_req; 1571 struct dm_capabilities cap_msg; 1572 unsigned long t; 1573 int ret; 1574 1575 ret = vmbus_open(dev->channel, dm_ring_size, dm_ring_size, NULL, 0, 1576 balloon_onchannelcallback, dev); 1577 if (ret) 1578 return ret; 1579 1580 /* 1581 * Initiate the hand shake with the host and negotiate 1582 * a version that the host can support. We start with the 1583 * highest version number and go down if the host cannot 1584 * support it. 1585 */ 1586 memset(&version_req, 0, sizeof(struct dm_version_request)); 1587 version_req.hdr.type = DM_VERSION_REQUEST; 1588 version_req.hdr.size = sizeof(struct dm_version_request); 1589 version_req.hdr.trans_id = atomic_inc_return(&trans_id); 1590 version_req.version.version = DYNMEM_PROTOCOL_VERSION_WIN10; 1591 version_req.is_last_attempt = 0; 1592 dm_device.version = version_req.version.version; 1593 1594 ret = vmbus_sendpacket(dev->channel, &version_req, 1595 sizeof(struct dm_version_request), 1596 (unsigned long)NULL, VM_PKT_DATA_INBAND, 0); 1597 if (ret) 1598 goto out; 1599 1600 t = wait_for_completion_timeout(&dm_device.host_event, 5*HZ); 1601 if (t == 0) { 1602 ret = -ETIMEDOUT; 1603 goto out; 1604 } 1605 1606 /* 1607 * If we could not negotiate a compatible version with the host 1608 * fail the probe function. 1609 */ 1610 if (dm_device.state == DM_INIT_ERROR) { 1611 ret = -EPROTO; 1612 goto out; 1613 } 1614 1615 pr_info("Using Dynamic Memory protocol version %u.%u\n", 1616 DYNMEM_MAJOR_VERSION(dm_device.version), 1617 DYNMEM_MINOR_VERSION(dm_device.version)); 1618 1619 /* 1620 * Now submit our capabilities to the host. 1621 */ 1622 memset(&cap_msg, 0, sizeof(struct dm_capabilities)); 1623 cap_msg.hdr.type = DM_CAPABILITIES_REPORT; 1624 cap_msg.hdr.size = sizeof(struct dm_capabilities); 1625 cap_msg.hdr.trans_id = atomic_inc_return(&trans_id); 1626 1627 /* 1628 * When hibernation (i.e. virtual ACPI S4 state) is enabled, the host 1629 * currently still requires the bits to be set, so we have to add code 1630 * to fail the host's hot-add and balloon up/down requests, if any. 1631 */ 1632 cap_msg.caps.cap_bits.balloon = 1; 1633 cap_msg.caps.cap_bits.hot_add = 1; 1634 1635 /* 1636 * Specify our alignment requirements as it relates 1637 * memory hot-add. Specify 128MB alignment. 1638 */ 1639 cap_msg.caps.cap_bits.hot_add_alignment = 7; 1640 1641 /* 1642 * Currently the host does not use these 1643 * values and we set them to what is done in the 1644 * Windows driver. 1645 */ 1646 cap_msg.min_page_cnt = 0; 1647 cap_msg.max_page_number = -1; 1648 1649 ret = vmbus_sendpacket(dev->channel, &cap_msg, 1650 sizeof(struct dm_capabilities), 1651 (unsigned long)NULL, VM_PKT_DATA_INBAND, 0); 1652 if (ret) 1653 goto out; 1654 1655 t = wait_for_completion_timeout(&dm_device.host_event, 5*HZ); 1656 if (t == 0) { 1657 ret = -ETIMEDOUT; 1658 goto out; 1659 } 1660 1661 /* 1662 * If the host does not like our capabilities, 1663 * fail the probe function. 1664 */ 1665 if (dm_device.state == DM_INIT_ERROR) { 1666 ret = -EPROTO; 1667 goto out; 1668 } 1669 1670 return 0; 1671 out: 1672 vmbus_close(dev->channel); 1673 return ret; 1674 } 1675 1676 static int balloon_probe(struct hv_device *dev, 1677 const struct hv_vmbus_device_id *dev_id) 1678 { 1679 int ret; 1680 1681 allow_hibernation = hv_is_hibernation_supported(); 1682 if (allow_hibernation) 1683 hot_add = false; 1684 1685 #ifdef CONFIG_MEMORY_HOTPLUG 1686 do_hot_add = hot_add; 1687 #else 1688 do_hot_add = false; 1689 #endif 1690 dm_device.dev = dev; 1691 dm_device.state = DM_INITIALIZING; 1692 dm_device.next_version = DYNMEM_PROTOCOL_VERSION_WIN8; 1693 init_completion(&dm_device.host_event); 1694 init_completion(&dm_device.config_event); 1695 INIT_LIST_HEAD(&dm_device.ha_region_list); 1696 spin_lock_init(&dm_device.ha_lock); 1697 INIT_WORK(&dm_device.balloon_wrk.wrk, balloon_up); 1698 INIT_WORK(&dm_device.ha_wrk.wrk, hot_add_req); 1699 dm_device.host_specified_ha_region = false; 1700 1701 #ifdef CONFIG_MEMORY_HOTPLUG 1702 set_online_page_callback(&hv_online_page); 1703 init_completion(&dm_device.ol_waitevent); 1704 register_memory_notifier(&hv_memory_nb); 1705 #endif 1706 1707 hv_set_drvdata(dev, &dm_device); 1708 1709 ret = balloon_connect_vsp(dev); 1710 if (ret != 0) 1711 return ret; 1712 1713 dm_device.state = DM_INITIALIZED; 1714 1715 dm_device.thread = 1716 kthread_run(dm_thread_func, &dm_device, "hv_balloon"); 1717 if (IS_ERR(dm_device.thread)) { 1718 ret = PTR_ERR(dm_device.thread); 1719 goto probe_error; 1720 } 1721 1722 return 0; 1723 1724 probe_error: 1725 dm_device.state = DM_INIT_ERROR; 1726 dm_device.thread = NULL; 1727 vmbus_close(dev->channel); 1728 #ifdef CONFIG_MEMORY_HOTPLUG 1729 unregister_memory_notifier(&hv_memory_nb); 1730 restore_online_page_callback(&hv_online_page); 1731 #endif 1732 return ret; 1733 } 1734 1735 static int balloon_remove(struct hv_device *dev) 1736 { 1737 struct hv_dynmem_device *dm = hv_get_drvdata(dev); 1738 struct hv_hotadd_state *has, *tmp; 1739 struct hv_hotadd_gap *gap, *tmp_gap; 1740 unsigned long flags; 1741 1742 if (dm->num_pages_ballooned != 0) 1743 pr_warn("Ballooned pages: %d\n", dm->num_pages_ballooned); 1744 1745 cancel_work_sync(&dm->balloon_wrk.wrk); 1746 cancel_work_sync(&dm->ha_wrk.wrk); 1747 1748 kthread_stop(dm->thread); 1749 vmbus_close(dev->channel); 1750 #ifdef CONFIG_MEMORY_HOTPLUG 1751 unregister_memory_notifier(&hv_memory_nb); 1752 restore_online_page_callback(&hv_online_page); 1753 #endif 1754 spin_lock_irqsave(&dm_device.ha_lock, flags); 1755 list_for_each_entry_safe(has, tmp, &dm->ha_region_list, list) { 1756 list_for_each_entry_safe(gap, tmp_gap, &has->gap_list, list) { 1757 list_del(&gap->list); 1758 kfree(gap); 1759 } 1760 list_del(&has->list); 1761 kfree(has); 1762 } 1763 spin_unlock_irqrestore(&dm_device.ha_lock, flags); 1764 1765 return 0; 1766 } 1767 1768 static int balloon_suspend(struct hv_device *hv_dev) 1769 { 1770 struct hv_dynmem_device *dm = hv_get_drvdata(hv_dev); 1771 1772 tasklet_disable(&hv_dev->channel->callback_event); 1773 1774 cancel_work_sync(&dm->balloon_wrk.wrk); 1775 cancel_work_sync(&dm->ha_wrk.wrk); 1776 1777 if (dm->thread) { 1778 kthread_stop(dm->thread); 1779 dm->thread = NULL; 1780 vmbus_close(hv_dev->channel); 1781 } 1782 1783 tasklet_enable(&hv_dev->channel->callback_event); 1784 1785 return 0; 1786 1787 } 1788 1789 static int balloon_resume(struct hv_device *dev) 1790 { 1791 int ret; 1792 1793 dm_device.state = DM_INITIALIZING; 1794 1795 ret = balloon_connect_vsp(dev); 1796 1797 if (ret != 0) 1798 goto out; 1799 1800 dm_device.thread = 1801 kthread_run(dm_thread_func, &dm_device, "hv_balloon"); 1802 if (IS_ERR(dm_device.thread)) { 1803 ret = PTR_ERR(dm_device.thread); 1804 dm_device.thread = NULL; 1805 goto close_channel; 1806 } 1807 1808 dm_device.state = DM_INITIALIZED; 1809 return 0; 1810 close_channel: 1811 vmbus_close(dev->channel); 1812 out: 1813 dm_device.state = DM_INIT_ERROR; 1814 #ifdef CONFIG_MEMORY_HOTPLUG 1815 unregister_memory_notifier(&hv_memory_nb); 1816 restore_online_page_callback(&hv_online_page); 1817 #endif 1818 return ret; 1819 } 1820 1821 static const struct hv_vmbus_device_id id_table[] = { 1822 /* Dynamic Memory Class ID */ 1823 /* 525074DC-8985-46e2-8057-A307DC18A502 */ 1824 { HV_DM_GUID, }, 1825 { }, 1826 }; 1827 1828 MODULE_DEVICE_TABLE(vmbus, id_table); 1829 1830 static struct hv_driver balloon_drv = { 1831 .name = "hv_balloon", 1832 .id_table = id_table, 1833 .probe = balloon_probe, 1834 .remove = balloon_remove, 1835 .suspend = balloon_suspend, 1836 .resume = balloon_resume, 1837 .driver = { 1838 .probe_type = PROBE_PREFER_ASYNCHRONOUS, 1839 }, 1840 }; 1841 1842 static int __init init_balloon_drv(void) 1843 { 1844 1845 return vmbus_driver_register(&balloon_drv); 1846 } 1847 1848 module_init(init_balloon_drv); 1849 1850 MODULE_DESCRIPTION("Hyper-V Balloon"); 1851 MODULE_LICENSE("GPL"); 1852