xref: /openbmc/linux/drivers/hv/hv_balloon.c (revision c4ee0af3)
1 /*
2  * Copyright (c) 2012, Microsoft Corporation.
3  *
4  * Author:
5  *   K. Y. Srinivasan <kys@microsoft.com>
6  *
7  * This program is free software; you can redistribute it and/or modify it
8  * under the terms of the GNU General Public License version 2 as published
9  * by the Free Software Foundation.
10  *
11  * This program is distributed in the hope that it will be useful, but
12  * WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
14  * NON INFRINGEMENT.  See the GNU General Public License for more
15  * details.
16  *
17  */
18 
19 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
20 
21 #include <linux/kernel.h>
22 #include <linux/mman.h>
23 #include <linux/delay.h>
24 #include <linux/init.h>
25 #include <linux/module.h>
26 #include <linux/slab.h>
27 #include <linux/kthread.h>
28 #include <linux/completion.h>
29 #include <linux/memory_hotplug.h>
30 #include <linux/memory.h>
31 #include <linux/notifier.h>
32 #include <linux/percpu_counter.h>
33 
34 #include <linux/hyperv.h>
35 
36 /*
37  * We begin with definitions supporting the Dynamic Memory protocol
38  * with the host.
39  *
40  * Begin protocol definitions.
41  */
42 
43 
44 
45 /*
46  * Protocol versions. The low word is the minor version, the high word the major
47  * version.
48  *
49  * History:
50  * Initial version 1.0
51  * Changed to 0.1 on 2009/03/25
52  * Changed to 0.2 on 2009/05/14
53  * Changed to 0.3 on 2009/12/03
54  * Changed to 1.0 on 2011/04/05
55  */
56 
57 #define DYNMEM_MAKE_VERSION(Major, Minor) ((__u32)(((Major) << 16) | (Minor)))
58 #define DYNMEM_MAJOR_VERSION(Version) ((__u32)(Version) >> 16)
59 #define DYNMEM_MINOR_VERSION(Version) ((__u32)(Version) & 0xff)
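/*
 * For example, DYNMEM_MAKE_VERSION(0, 3) evaluates to 0x00000003 and
 * DYNMEM_MAKE_VERSION(1, 0) evaluates to 0x00010000.
 */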
60 
61 enum {
62 	DYNMEM_PROTOCOL_VERSION_1 = DYNMEM_MAKE_VERSION(0, 3),
63 	DYNMEM_PROTOCOL_VERSION_2 = DYNMEM_MAKE_VERSION(1, 0),
64 
65 	DYNMEM_PROTOCOL_VERSION_WIN7 = DYNMEM_PROTOCOL_VERSION_1,
66 	DYNMEM_PROTOCOL_VERSION_WIN8 = DYNMEM_PROTOCOL_VERSION_2,
67 
68 	DYNMEM_PROTOCOL_VERSION_CURRENT = DYNMEM_PROTOCOL_VERSION_WIN8
69 };
70 
71 
72 
73 /*
74  * Message Types
75  */
76 
77 enum dm_message_type {
78 	/*
79 	 * Version 0.3
80 	 */
81 	DM_ERROR			= 0,
82 	DM_VERSION_REQUEST		= 1,
83 	DM_VERSION_RESPONSE		= 2,
84 	DM_CAPABILITIES_REPORT		= 3,
85 	DM_CAPABILITIES_RESPONSE	= 4,
86 	DM_STATUS_REPORT		= 5,
87 	DM_BALLOON_REQUEST		= 6,
88 	DM_BALLOON_RESPONSE		= 7,
89 	DM_UNBALLOON_REQUEST		= 8,
90 	DM_UNBALLOON_RESPONSE		= 9,
91 	DM_MEM_HOT_ADD_REQUEST		= 10,
92 	DM_MEM_HOT_ADD_RESPONSE		= 11,
93 	DM_VERSION_03_MAX		= 11,
94 	/*
95 	 * Version 1.0.
96 	 */
97 	DM_INFO_MESSAGE			= 12,
98 	DM_VERSION_1_MAX		= 12
99 };
100 
101 
102 /*
103  * Structures defining the dynamic memory management
104  * protocol.
105  */
106 
107 union dm_version {
108 	struct {
109 		__u16 minor_version;
110 		__u16 major_version;
111 	};
112 	__u32 version;
113 } __packed;
114 
115 
116 union dm_caps {
117 	struct {
118 		__u64 balloon:1;
119 		__u64 hot_add:1;
120 		/*
121 		 * To support guests that may have alignment
122 		 * limitations on hot-add, the guest can specify
123 		 * its alignment requirements; a value of n
124 		 * represents an alignment of 2^n in megabytes.
125 		 */
126 		__u64 hot_add_alignment:4;
127 		__u64 reservedz:58;
128 	} cap_bits;
129 	__u64 caps;
130 } __packed;
131 
132 union dm_mem_page_range {
133 	struct  {
134 		/*
135 		 * The PFN of the first page in the range.
136 		 * 40 bits is the architectural limit of a PFN
137 		 * for AMD64.
138 		 */
139 		__u64 start_page:40;
140 		/*
141 		 * The number of pages in the range.
142 		 */
143 		__u64 page_cnt:24;
144 	} finfo;
145 	__u64  page_range;
146 } __packed;
147 
148 
149 
150 /*
151  * The header for all dynamic memory messages:
152  *
153  * type: Type of the message.
154  * size: Size of the message in bytes, including the header.
155  * trans_id: The guest is responsible for manufacturing this ID.
156  */
157 
158 struct dm_header {
159 	__u16 type;
160 	__u16 size;
161 	__u32 trans_id;
162 } __packed;
163 
164 /*
165  * A generic message format for dynamic memory.
166  * Specific message formats are defined later in the file.
167  */
168 
169 struct dm_message {
170 	struct dm_header hdr;
171 	__u8 data[]; /* enclosed message */
172 } __packed;
173 
174 
175 /*
176  * Specific message types supporting the dynamic memory protocol.
177  */
178 
179 /*
180  * Version negotiation message. Sent from the guest to the host.
181  * The guest is free to try different versions until the host
182  * accepts the version.
183  *
184  * dm_version: The protocol version requested.
185  * is_last_attempt: If TRUE, this is the last version the guest will request.
186  * reservedz: Reserved field, set to zero.
187  */
188 
189 struct dm_version_request {
190 	struct dm_header hdr;
191 	union dm_version version;
192 	__u32 is_last_attempt:1;
193 	__u32 reservedz:31;
194 } __packed;
195 
196 /*
197  * Version response message; sent from the host to the guest, indicating
198  * whether the host has accepted the version sent by the guest.
199  *
200  * is_accepted: If TRUE, the host has accepted the version and the guest
201  * should proceed to the next stage of the protocol. FALSE indicates that
202  * the guest should re-try with a different version.
203  *
204  * reservedz: Reserved field, set to zero.
205  */
206 
207 struct dm_version_response {
208 	struct dm_header hdr;
209 	__u64 is_accepted:1;
210 	__u64 reservedz:63;
211 } __packed;
212 
213 /*
214  * Message reporting capabilities. This is sent from the guest to the
215  * host.
216  */
217 
218 struct dm_capabilities {
219 	struct dm_header hdr;
220 	union dm_caps caps;
221 	__u64 min_page_cnt;
222 	__u64 max_page_number;
223 } __packed;
224 
225 /*
226  * Response to the capabilities message. This is sent from the host to the
227  * guest. This message indicates whether the host has accepted the guest's
228  * capabilities. If the host has not accepted, the guest must shut down
229  * the service.
230  *
231  * is_accepted: Indicates whether the host has accepted the guest's capabilities.
232  * reservedz: Must be 0.
233  */
234 
235 struct dm_capabilities_resp_msg {
236 	struct dm_header hdr;
237 	__u64 is_accepted:1;
238 	__u64 reservedz:63;
239 } __packed;
240 
241 /*
242  * This message is used to report memory pressure from the guest.
243  * This message is not part of any transaction and there is no
244  * response to this message.
245  *
246  * num_avail: Available memory in pages.
247  * num_committed: Committed memory in pages.
248  * page_file_size: The accumulated size of all page files
249  *		   in the system in pages.
250  * zero_free: The number of zero and free pages.
251  * page_file_writes: The writes to the page file in pages.
252  * io_diff: An indicator of file cache efficiency or page file activity,
253  *	    calculated as File Cache Page Fault Count - Page Read Count.
254  *	    This value is in pages.
255  *
256  * Some of these metrics are Windows specific and fortunately
257  * the algorithm on the host side that computes the guest memory
258  * pressure only uses num_committed value.
259  */
260 
261 struct dm_status {
262 	struct dm_header hdr;
263 	__u64 num_avail;
264 	__u64 num_committed;
265 	__u64 page_file_size;
266 	__u64 zero_free;
267 	__u32 page_file_writes;
268 	__u32 io_diff;
269 } __packed;
270 
271 
272 /*
273  * Message to ask the guest to allocate memory - balloon up message.
274  * This message is sent from the host to the guest. The guest may not be
275  * able to allocate as much memory as requested.
276  *
277  * num_pages: number of pages to allocate.
278  */
279 
280 struct dm_balloon {
281 	struct dm_header hdr;
282 	__u32 num_pages;
283 	__u32 reservedz;
284 } __packed;
285 
286 
287 /*
288  * Balloon response message; this message is sent from the guest
289  * to the host in response to the balloon message.
290  *
291  * reservedz: Reserved; must be set to zero.
292  * more_pages: If FALSE, this is the last message of the transaction.
293  * If TRUE, there will be at least one more message from the guest.
294  *
295  * range_count: The number of ranges in the range array.
296  *
297  * range_array: An array of page ranges returned to the host.
298  *
299  */
300 
301 struct dm_balloon_response {
302 	struct dm_header hdr;
303 	__u32 reservedz;
304 	__u32 more_pages:1;
305 	__u32 range_count:31;
306 	union dm_mem_page_range range_array[];
307 } __packed;
308 
309 /*
310  * Un-balloon message; this message is sent from the host
311  * to the guest to give guest more memory.
312  *
313  * more_pages: If FALSE, this is the last message of the transaction.
314  * If TRUE, there will be at least one more message from the host.
315  *
316  * reservedz: Reserved; must be set to zero.
317  *
318  * range_count: The number of ranges in the range array.
319  *
320  * range_array: An array of page ranges returned to the host.
321  *
322  */
323 
324 struct dm_unballoon_request {
325 	struct dm_header hdr;
326 	__u32 more_pages:1;
327 	__u32 reservedz:31;
328 	__u32 range_count;
329 	union dm_mem_page_range range_array[];
330 } __packed;
331 
332 /*
333  * Un-balloon response message; this message is sent from the guest
334  * to the host in response to an unballoon request.
335  *
336  */
337 
338 struct dm_unballoon_response {
339 	struct dm_header hdr;
340 } __packed;
341 
342 
343 /*
344  * Hot add request message. Message sent from the host to the guest.
345  *
346  * mem_range: Memory range to hot add.
347  *
348  * On Linux we currently don't support this since we cannot hot add
349  * memory at arbitrary granularity.
350  */
351 
352 struct dm_hot_add {
353 	struct dm_header hdr;
354 	union dm_mem_page_range range;
355 } __packed;
356 
357 /*
358  * Hot add response message.
359  * This message is sent by the guest to report the status of a hot add request.
360  * If page_count is less than the requested page count, then the host should
361  * assume all further hot add requests will fail, since this indicates that
362  * the guest has hit an upper physical memory barrier.
363  *
364  * Hot adds may also fail due to low resources; in this case, the guest must
365  * not complete this message until the hot add can succeed, and the host must
366  * not send a new hot add request until the response is sent.
367  * If the VSC fails to hot add memory DYNMEM_NUMBER_OF_UNSUCCESSFUL_HOTADD_ATTEMPTS
368  * times, it fails the request.
369  *
370  *
371  * page_count: number of pages that were successfully hot added.
372  *
373  * result: result of the operation 1: success, 0: failure.
374  *
375  */
376 
377 struct dm_hot_add_response {
378 	struct dm_header hdr;
379 	__u32 page_count;
380 	__u32 result;
381 } __packed;
382 
383 /*
384  * Types of information sent from host to the guest.
385  */
386 
387 enum dm_info_type {
388 	INFO_TYPE_MAX_PAGE_CNT = 0,
389 	MAX_INFO_TYPE
390 };
391 
392 
393 /*
394  * Header for the information message.
395  */
396 
397 struct dm_info_header {
398 	enum dm_info_type type;
399 	__u32 data_size;
400 } __packed;
401 
402 /*
403  * This message is sent from the host to the guest to pass
404  * some relevant information (win8 addition).
405  *
406  * reserved: not used.
407  * info_size: size of the information blob.
408  * info: information blob.
409  */
410 
411 struct dm_info_msg {
412 	struct dm_header hdr;
413 	__u32 reserved;
414 	__u32 info_size;
415 	__u8  info[];
416 };
417 
418 /*
419  * End protocol definitions.
420  */
421 
422 /*
423  * State to manage hot adding memory into the guest.
424  * The range start_pfn : end_pfn specifies the range
425  * that the host has asked us to hot add. The range
426  * start_pfn : ha_end_pfn specifies the range that we have
427  * currently hot added. We hot add in multiples of 128M
428  * chunks; it is possible that we may not be able to bring
429  * online all the pages in the region. The range
430  * covered_start_pfn : covered_end_pfn defines the pages that can
431  * be brought online.
432  */
433 
434 struct hv_hotadd_state {
435 	struct list_head list;
436 	unsigned long start_pfn;
437 	unsigned long covered_start_pfn;
438 	unsigned long covered_end_pfn;
439 	unsigned long ha_end_pfn;
440 	unsigned long end_pfn;
441 };
442 
443 struct balloon_state {
444 	__u32 num_pages;
445 	struct work_struct wrk;
446 };
447 
448 struct hot_add_wrk {
449 	union dm_mem_page_range ha_page_range;
450 	union dm_mem_page_range ha_region_range;
451 	struct work_struct wrk;
452 };
453 
454 static bool hot_add = true;
455 static bool do_hot_add;
456 /*
457  * Delay reporting memory pressure by
458  * the specified number of seconds.
459  */
460 static uint pressure_report_delay = 45;
461 
462 module_param(hot_add, bool, (S_IRUGO | S_IWUSR));
463 MODULE_PARM_DESC(hot_add, "If set attempt memory hot_add");
464 
465 module_param(pressure_report_delay, uint, (S_IRUGO | S_IWUSR));
466 MODULE_PARM_DESC(pressure_report_delay, "Delay in secs in reporting pressure");
467 static atomic_t trans_id = ATOMIC_INIT(0);
468 
469 static int dm_ring_size = (5 * PAGE_SIZE);
470 
471 /*
472  * Driver specific state.
473  */
474 
475 enum hv_dm_state {
476 	DM_INITIALIZING = 0,
477 	DM_INITIALIZED,
478 	DM_BALLOON_UP,
479 	DM_BALLOON_DOWN,
480 	DM_HOT_ADD,
481 	DM_INIT_ERROR
482 };
483 
484 
485 static __u8 recv_buffer[PAGE_SIZE];
486 static __u8 *send_buffer;
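/*
 * PAGES_IN_2M and HA_CHUNK are counts of pages: with 4 KiB pages,
 * 512 pages make up a 2 MiB allocation unit and 32 * 1024 pages make
 * up the 128 MiB hot-add chunk described above.
 */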
487 #define PAGES_IN_2M	512
488 #define HA_CHUNK (32 * 1024)
489 
490 struct hv_dynmem_device {
491 	struct hv_device *dev;
492 	enum hv_dm_state state;
493 	struct completion host_event;
494 	struct completion config_event;
495 
496 	/*
497 	 * Number of pages we have currently ballooned out.
498 	 */
499 	unsigned int num_pages_ballooned;
500 
501 	/*
502 	 * State to manage the ballooning (up) operation.
503 	 */
504 	struct balloon_state balloon_wrk;
505 
506 	/*
507 	 * State to execute the "hot-add" operation.
508 	 */
509 	struct hot_add_wrk ha_wrk;
510 
511 	/*
512 	 * This state tracks if the host has specified a hot-add
513 	 * region.
514 	 */
515 	bool host_specified_ha_region;
516 
517 	/*
518 	 * State to synchronize hot-add.
519 	 */
520 	struct completion  ol_waitevent;
521 	bool ha_waiting;
522 	/*
523 	 * This thread handles hot-add
524 	 * requests from the host as well as notifying
525 	 * the host with regards to memory pressure in
526 	 * the guest.
527 	 */
528 	struct task_struct *thread;
529 
530 	/*
531 	 * A list of hot-add regions.
532 	 */
533 	struct list_head ha_region_list;
534 
535 	/*
536 	 * We start with the highest version we can support
537 	 * and downgrade based on the host; we save here the
538 	 * next version to try.
539 	 */
540 	__u32 next_version;
541 };
542 
543 static struct hv_dynmem_device dm_device;
544 
545 #ifdef CONFIG_MEMORY_HOTPLUG
546 
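/*
 * Bring a contiguous range of PFNs online by handing each backing page
 * to the memory manager through the online-page helpers.
 */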
547 static void hv_bring_pgs_online(unsigned long start_pfn, unsigned long size)
548 {
549 	int i;
550 
551 	for (i = 0; i < size; i++) {
552 		struct page *pg;
553 		pg = pfn_to_page(start_pfn + i);
554 		__online_page_set_limits(pg);
555 		__online_page_increment_counters(pg);
556 		__online_page_free(pg);
557 	}
558 }
559 
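/*
 * Hot add memory in HA_CHUNK-sized blocks, waiting up to 5 seconds for
 * each block to come online before moving on. A permanent failure
 * (-EEXIST) disables all further hot-add attempts.
 */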
560 static void hv_mem_hot_add(unsigned long start, unsigned long size,
561 				unsigned long pfn_count,
562 				struct hv_hotadd_state *has)
563 {
564 	int ret = 0;
565 	int i, nid;
566 	unsigned long start_pfn;
567 	unsigned long processed_pfn;
568 	unsigned long total_pfn = pfn_count;
569 
570 	for (i = 0; i < (size/HA_CHUNK); i++) {
571 		start_pfn = start + (i * HA_CHUNK);
572 		has->ha_end_pfn +=  HA_CHUNK;
573 
574 		if (total_pfn > HA_CHUNK) {
575 			processed_pfn = HA_CHUNK;
576 			total_pfn -= HA_CHUNK;
577 		} else {
578 			processed_pfn = total_pfn;
579 			total_pfn = 0;
580 		}
581 
582 		has->covered_end_pfn +=  processed_pfn;
583 
584 		init_completion(&dm_device.ol_waitevent);
585 		dm_device.ha_waiting = true;
586 
587 		nid = memory_add_physaddr_to_nid(PFN_PHYS(start_pfn));
588 		ret = add_memory(nid, PFN_PHYS((start_pfn)),
589 				(HA_CHUNK << PAGE_SHIFT));
590 
591 		if (ret) {
592 			pr_info("hot_add memory failed error is %d\n", ret);
593 			if (ret == -EEXIST) {
594 				/*
595 				 * This error indicates that the failure
596 				 * is not transient. This is the
597 				 * case where the guest's physical address map
598 				 * precludes hot adding memory. Stop all further
599 				 * memory hot-add.
600 				 */
601 				do_hot_add = false;
602 			}
603 			has->ha_end_pfn -= HA_CHUNK;
604 			has->covered_end_pfn -=  processed_pfn;
605 			break;
606 		}
607 
608 		/*
609 		 * Wait for the memory block to be onlined.
610 		 * Since the hot add has succeeded, it is ok to
611 		 * proceed even if the pages in the hot added region
612 		 * have not been "onlined" within the allowed time.
613 		 */
614 		wait_for_completion_timeout(&dm_device.ol_waitevent, 5*HZ);
615 
616 	}
617 
618 	return;
619 }
620 
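/*
 * Online-page callback: wake up any waiter in hv_mem_hot_add() and
 * online the page if it falls inside a covered hot-add region.
 */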
621 static void hv_online_page(struct page *pg)
622 {
623 	struct list_head *cur;
624 	struct hv_hotadd_state *has;
625 	unsigned long cur_start_pgp;
626 	unsigned long cur_end_pgp;
627 
628 	if (dm_device.ha_waiting) {
629 		dm_device.ha_waiting = false;
630 		complete(&dm_device.ol_waitevent);
631 	}
632 
633 	list_for_each(cur, &dm_device.ha_region_list) {
634 		has = list_entry(cur, struct hv_hotadd_state, list);
635 		cur_start_pgp = (unsigned long)
636 				pfn_to_page(has->covered_start_pfn);
637 		cur_end_pgp = (unsigned long)pfn_to_page(has->covered_end_pfn);
638 
639 		if (((unsigned long)pg >= cur_start_pgp) &&
640 			((unsigned long)pg < cur_end_pgp)) {
641 			/*
642 			 * This frame is currently backed; online the
643 			 * page.
644 			 */
645 			__online_page_set_limits(pg);
646 			__online_page_increment_counters(pg);
647 			__online_page_free(pg);
648 			has->covered_start_pfn++;
649 		}
650 	}
651 }
652 
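/*
 * Check whether the requested PFN range falls within an existing
 * hot-add region, growing that region in HA_CHUNK multiples if the
 * request extends past its current end.
 */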
653 static bool pfn_covered(unsigned long start_pfn, unsigned long pfn_cnt)
654 {
655 	struct list_head *cur;
656 	struct hv_hotadd_state *has;
657 	unsigned long residual, new_inc;
658 
659 	if (list_empty(&dm_device.ha_region_list))
660 		return false;
661 
662 	list_for_each(cur, &dm_device.ha_region_list) {
663 		has = list_entry(cur, struct hv_hotadd_state, list);
664 
665 		/*
666 		 * If the pfn range we are dealing with is not in the current
667 		 * "hot add block", move on.
668 		 */
669 		if ((start_pfn >= has->end_pfn))
670 			continue;
671 		/*
672 		 * If the current hot-add request extends beyond
673 		 * our current limit, extend it.
674 		 */
675 		if ((start_pfn + pfn_cnt) > has->end_pfn) {
676 			residual = (start_pfn + pfn_cnt - has->end_pfn);
677 			/*
678 			 * Extend the region by multiples of HA_CHUNK.
679 			 */
680 			new_inc = (residual / HA_CHUNK) * HA_CHUNK;
681 			if (residual % HA_CHUNK)
682 				new_inc += HA_CHUNK;
683 
684 			has->end_pfn += new_inc;
685 		}
686 
687 		/*
688 		 * If the current start pfn is not where the covered_end
689 		 * is, update it.
690 		 */
691 
692 		if (has->covered_end_pfn != start_pfn) {
693 			has->covered_end_pfn = start_pfn;
694 			has->covered_start_pfn = start_pfn;
695 		}
696 		return true;
697 
698 	}
699 
700 	return false;
701 }
702 
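/*
 * Back the requested page range: pages falling inside an already
 * hot-added region are simply brought online, and any remainder is
 * hot added in HA_CHUNK multiples. Returns the number of newly
 * covered pages.
 */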
703 static unsigned long handle_pg_range(unsigned long pg_start,
704 					unsigned long pg_count)
705 {
706 	unsigned long start_pfn = pg_start;
707 	unsigned long pfn_cnt = pg_count;
708 	unsigned long size;
709 	struct list_head *cur;
710 	struct hv_hotadd_state *has;
711 	unsigned long pgs_ol = 0;
712 	unsigned long old_covered_state;
713 
714 	if (list_empty(&dm_device.ha_region_list))
715 		return 0;
716 
717 	list_for_each(cur, &dm_device.ha_region_list) {
718 		has = list_entry(cur, struct hv_hotadd_state, list);
719 
720 		/*
721 		 * If the pfn range we are dealing with is not in the current
722 		 * "hot add block", move on.
723 		 */
724 		if ((start_pfn >= has->end_pfn))
725 			continue;
726 
727 		old_covered_state = has->covered_end_pfn;
728 
729 		if (start_pfn < has->ha_end_pfn) {
730 			/*
731 			 * This is the case where we are backing pages
732 			 * in an already hot added region. Bring
733 			 * these pages online first.
734 			 */
735 			pgs_ol = has->ha_end_pfn - start_pfn;
736 			if (pgs_ol > pfn_cnt)
737 				pgs_ol = pfn_cnt;
738 			hv_bring_pgs_online(start_pfn, pgs_ol);
739 			has->covered_end_pfn +=  pgs_ol;
740 			has->covered_start_pfn +=  pgs_ol;
741 			pfn_cnt -= pgs_ol;
742 		}
743 
744 		if ((has->ha_end_pfn < has->end_pfn) && (pfn_cnt > 0)) {
745 			/*
746 			 * We have some residual hot add range
747 			 * that needs to be hot added; hot add
748 			 * it now. Hot add a multiple of
749 			 * it now. Hot add a multiple of
750 			 * HA_CHUNK that fully covers the pages
751 			 */
752 			size = (has->end_pfn - has->ha_end_pfn);
753 			if (pfn_cnt <= size) {
754 				size = ((pfn_cnt / HA_CHUNK) * HA_CHUNK);
755 				if (pfn_cnt % HA_CHUNK)
756 					size += HA_CHUNK;
757 			} else {
758 				pfn_cnt = size;
759 			}
760 			hv_mem_hot_add(has->ha_end_pfn, size, pfn_cnt, has);
761 		}
762 		/*
763 		 * If we managed to online any pages that were given to us,
764 		 * we declare success.
765 		 */
766 		return has->covered_end_pfn - old_covered_state;
767 
768 	}
769 
770 	return 0;
771 }
772 
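/*
 * Handle a hot-add request: if the pages are not already covered by an
 * existing region, create a new hot-add region from the host-supplied
 * range (when rg_size != 0), then bring the requested pages online.
 */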
773 static unsigned long process_hot_add(unsigned long pg_start,
774 					unsigned long pfn_cnt,
775 					unsigned long rg_start,
776 					unsigned long rg_size)
777 {
778 	struct hv_hotadd_state *ha_region = NULL;
779 
780 	if (pfn_cnt == 0)
781 		return 0;
782 
783 	if (!dm_device.host_specified_ha_region)
784 		if (pfn_covered(pg_start, pfn_cnt))
785 			goto do_pg_range;
786 
787 	/*
788 	 * If the host has specified a hot-add range; deal with it first.
789 	 */
790 
791 	if (rg_size != 0) {
792 		ha_region = kzalloc(sizeof(struct hv_hotadd_state), GFP_KERNEL);
793 		if (!ha_region)
794 			return 0;
795 
796 		INIT_LIST_HEAD(&ha_region->list);
797 
798 		list_add_tail(&ha_region->list, &dm_device.ha_region_list);
799 		ha_region->start_pfn = rg_start;
800 		ha_region->ha_end_pfn = rg_start;
801 		ha_region->covered_start_pfn = pg_start;
802 		ha_region->covered_end_pfn = pg_start;
803 		ha_region->end_pfn = rg_start + rg_size;
804 	}
805 
806 do_pg_range:
807 	/*
808 	 * Process the specified page range, bringing the pages
809 	 * online if possible.
810 	 */
811 	return handle_pg_range(pg_start, pfn_cnt);
812 }
813 
814 #endif
815 
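/*
 * Work handler for hot-add requests: compute a suitably aligned hot-add
 * region when the host did not specify one, perform the hot add and
 * report the result back on the channel.
 */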
816 static void hot_add_req(struct work_struct *dummy)
817 {
818 	struct dm_hot_add_response resp;
819 #ifdef CONFIG_MEMORY_HOTPLUG
820 	unsigned long pg_start, pfn_cnt;
821 	unsigned long rg_start, rg_sz;
822 #endif
823 	struct hv_dynmem_device *dm = &dm_device;
824 
825 	memset(&resp, 0, sizeof(struct dm_hot_add_response));
826 	resp.hdr.type = DM_MEM_HOT_ADD_RESPONSE;
827 	resp.hdr.size = sizeof(struct dm_hot_add_response);
828 
829 #ifdef CONFIG_MEMORY_HOTPLUG
830 	pg_start = dm->ha_wrk.ha_page_range.finfo.start_page;
831 	pfn_cnt = dm->ha_wrk.ha_page_range.finfo.page_cnt;
832 
833 	rg_start = dm->ha_wrk.ha_region_range.finfo.start_page;
834 	rg_sz = dm->ha_wrk.ha_region_range.finfo.page_cnt;
835 
836 	if ((rg_start == 0) && (!dm->host_specified_ha_region)) {
837 		unsigned long region_size;
838 		unsigned long region_start;
839 
840 		/*
841 		 * The host has not specified the hot-add region.
842 		 * Based on the hot-add page range being specified,
843 		 * compute a hot-add region that can cover the pages
844 		 * that need to be hot-added while ensuring the alignment
845 		 * and size requirements of Linux as they relate to hot-add.
846 		 */
847 		region_start = pg_start;
848 		region_size = (pfn_cnt / HA_CHUNK) * HA_CHUNK;
849 		if (pfn_cnt % HA_CHUNK)
850 			region_size += HA_CHUNK;
851 
852 		region_start = (pg_start / HA_CHUNK) * HA_CHUNK;
853 
854 		rg_start = region_start;
855 		rg_sz = region_size;
856 	}
857 
858 	if (do_hot_add)
859 		resp.page_count = process_hot_add(pg_start, pfn_cnt,
860 						rg_start, rg_sz);
861 #endif
862 	/*
863 	 * The result field of the response structure has the
864 	 * following semantics:
865 	 *
866 	 * 1. If all or some pages hot-added: Guest should return success.
867 	 *
868 	 * 2. If no pages could be hot-added:
869 	 *
870 	 * If the guest returns success, then the host
871 	 * will not attempt any further hot-add operations. This
872 	 * signifies a permanent failure.
873 	 *
874 	 * If the guest returns failure, then this failure will be
875 	 * treated as a transient failure and the host may retry the
876 	 * hot-add operation after some delay.
877 	 */
878 	if (resp.page_count > 0)
879 		resp.result = 1;
880 	else if (!do_hot_add)
881 		resp.result = 1;
882 	else
883 		resp.result = 0;
884 
885 	if (!do_hot_add || (resp.page_count == 0))
886 		pr_info("Memory hot add failed\n");
887 
888 	dm->state = DM_INITIALIZED;
889 	resp.hdr.trans_id = atomic_inc_return(&trans_id);
890 	vmbus_sendpacket(dm->dev->channel, &resp,
891 			sizeof(struct dm_hot_add_response),
892 			(unsigned long)NULL,
893 			VM_PKT_DATA_INBAND, 0);
894 }
895 
896 static void process_info(struct hv_dynmem_device *dm, struct dm_info_msg *msg)
897 {
898 	struct dm_info_header *info_hdr;
899 
900 	info_hdr = (struct dm_info_header *)msg->info;
901 
902 	switch (info_hdr->type) {
903 	case INFO_TYPE_MAX_PAGE_CNT:
904 		pr_info("Received INFO_TYPE_MAX_PAGE_CNT\n");
905 		pr_info("Data Size is %d\n", info_hdr->data_size);
906 		break;
907 	default:
908 		pr_info("Received Unknown type: %d\n", info_hdr->type);
909 	}
910 }
911 
912 static unsigned long compute_balloon_floor(void)
913 {
914 	unsigned long min_pages;
915 #define MB2PAGES(mb) ((mb) << (20 - PAGE_SHIFT))
916 	/* Simple continuous piecewise linear function:
917 	 *  max MiB -> min MiB  gradient
918 	 *       0         0
919 	 *      16        16
920 	 *      32        24
921 	 *     128        72    (1/2)
922 	 *     512       168    (1/4)
923 	 *    2048       360    (1/8)
924 	 *    8192       552    (1/32)
925 	 *   32768      1320
926 	 *  131072      4392
927 	 */
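	/*
	 * For example, a guest with 1024 MiB of RAM lands in the 1/8
	 * gradient bracket: MB2PAGES(104) + MB2PAGES(1024) / 8, i.e. a
	 * floor of 232 MiB.
	 */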
928 	if (totalram_pages < MB2PAGES(128))
929 		min_pages = MB2PAGES(8) + (totalram_pages >> 1);
930 	else if (totalram_pages < MB2PAGES(512))
931 		min_pages = MB2PAGES(40) + (totalram_pages >> 2);
932 	else if (totalram_pages < MB2PAGES(2048))
933 		min_pages = MB2PAGES(104) + (totalram_pages >> 3);
934 	else
935 		min_pages = MB2PAGES(296) + (totalram_pages >> 5);
936 #undef MB2PAGES
937 	return min_pages;
938 }
939 
940 /*
941  * Post our status, as it relates to memory pressure, to the
942  * host. The host expects the guest to post this status
943  * periodically at 1-second intervals.
944  *
945  * The metrics specified in this protocol are very Windows
946  * specific and so we cook up numbers here to convey our memory
947  * pressure.
948  */
949 
950 static void post_status(struct hv_dynmem_device *dm)
951 {
952 	struct dm_status status;
953 	struct sysinfo val;
954 
955 	if (pressure_report_delay > 0) {
956 		--pressure_report_delay;
957 		return;
958 	}
959 	si_meminfo(&val);
960 	memset(&status, 0, sizeof(struct dm_status));
961 	status.hdr.type = DM_STATUS_REPORT;
962 	status.hdr.size = sizeof(struct dm_status);
963 	status.hdr.trans_id = atomic_inc_return(&trans_id);
964 
965 	/*
966 	 * The host expects the guest to report free memory.
967 	 * Further, the host expects the pressure information to
968 	 * include the ballooned out pages.
969 	 * For a given amount of memory that we are managing, we
970 	 * need to compute a floor below which we should not balloon.
971 	 * Compute this and add it to the pressure report.
972 	 */
973 	status.num_avail = val.freeram;
974 	status.num_committed = vm_memory_committed() +
975 				dm->num_pages_ballooned +
976 				compute_balloon_floor();
977 
978 	/*
979 	 * If our transaction ID is no longer current, just don't
980 	 * send the status. This can happen if we were interrupted
981 	 * after we picked our transaction ID.
982 	 */
983 	if (status.hdr.trans_id != atomic_read(&trans_id))
984 		return;
985 
986 	vmbus_sendpacket(dm->dev->channel, &status,
987 				sizeof(struct dm_status),
988 				(unsigned long)NULL,
989 				VM_PKT_DATA_INBAND, 0);
990 
991 }
992 
993 static void free_balloon_pages(struct hv_dynmem_device *dm,
994 			 union dm_mem_page_range *range_array)
995 {
996 	int num_pages = range_array->finfo.page_cnt;
997 	__u64 start_frame = range_array->finfo.start_page;
998 	struct page *pg;
999 	int i;
1000 
1001 	for (i = 0; i < num_pages; i++) {
1002 		pg = pfn_to_page(i + start_frame);
1003 		__free_page(pg);
1004 		dm->num_pages_ballooned--;
1005 	}
1006 }
1007 
1008 
1009 
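/*
 * Allocate pages for ballooning in alloc_unit-sized chunks (2 MiB or a
 * single 4 KiB page), recording each chunk in the response's range
 * array. Stops early if an allocation fails or the response buffer
 * fills up, returning the number of pages ballooned so far.
 */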
1010 static int  alloc_balloon_pages(struct hv_dynmem_device *dm, int num_pages,
1011 			 struct dm_balloon_response *bl_resp, int alloc_unit,
1012 			 bool *alloc_error)
1013 {
1014 	int i = 0;
1015 	struct page *pg;
1016 
1017 	if (num_pages < alloc_unit)
1018 		return 0;
1019 
1020 	for (i = 0; (i * alloc_unit) < num_pages; i++) {
1021 		if (bl_resp->hdr.size + sizeof(union dm_mem_page_range) >
1022 			PAGE_SIZE)
1023 			return i * alloc_unit;
1024 
1025 		/*
1026 		 * We execute this code in a thread context. Furthermore,
1027 		 * we don't want the kernel to try too hard.
1028 		 */
1029 		pg = alloc_pages(GFP_HIGHUSER | __GFP_NORETRY |
1030 				__GFP_NOMEMALLOC | __GFP_NOWARN,
1031 				get_order(alloc_unit << PAGE_SHIFT));
1032 
1033 		if (!pg) {
1034 			*alloc_error = true;
1035 			return i * alloc_unit;
1036 		}
1037 
1038 
1039 		dm->num_pages_ballooned += alloc_unit;
1040 
1041 		/*
1042 		 * If we allocated 2M pages, split them so we
1043 		 * can free them in any order.
1044 		 */
1045 
1046 		if (alloc_unit != 1)
1047 			split_page(pg, get_order(alloc_unit << PAGE_SHIFT));
1048 
1049 		bl_resp->range_count++;
1050 		bl_resp->range_array[i].finfo.start_page =
1051 			page_to_pfn(pg);
1052 		bl_resp->range_array[i].finfo.page_cnt = alloc_unit;
1053 		bl_resp->hdr.size += sizeof(union dm_mem_page_range);
1054 
1055 	}
1056 
1057 	return num_pages;
1058 }
1059 
1060 
1061 
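/*
 * Work handler for balloon-up requests: allocate the requested number
 * of pages, preferring 2 MiB allocations and falling back to single
 * pages, and report the allocated ranges to the host, possibly across
 * several response messages.
 */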
1062 static void balloon_up(struct work_struct *dummy)
1063 {
1064 	int num_pages = dm_device.balloon_wrk.num_pages;
1065 	int num_ballooned = 0;
1066 	struct dm_balloon_response *bl_resp;
1067 	int alloc_unit;
1068 	int ret;
1069 	bool alloc_error = false;
1070 	bool done = false;
1071 	int i;
1072 
1073 
1074 	/*
1075 	 * We will attempt 2M allocations. However, if we fail to
1076 	 * allocate 2M chunks, we will go back to 4k allocations.
1077 	 */
1078 	alloc_unit = 512;
1079 
1080 	while (!done) {
1081 		bl_resp = (struct dm_balloon_response *)send_buffer;
1082 		memset(send_buffer, 0, PAGE_SIZE);
1083 		bl_resp->hdr.type = DM_BALLOON_RESPONSE;
1084 		bl_resp->hdr.size = sizeof(struct dm_balloon_response);
1085 		bl_resp->more_pages = 1;
1086 
1087 
1088 		num_pages -= num_ballooned;
1089 		num_ballooned = alloc_balloon_pages(&dm_device, num_pages,
1090 						bl_resp, alloc_unit,
1091 						 &alloc_error);
1092 
1093 		if ((alloc_error) && (alloc_unit != 1)) {
1094 			alloc_unit = 1;
1095 			continue;
1096 		}
1097 
1098 		if ((alloc_error) || (num_ballooned == num_pages)) {
1099 			bl_resp->more_pages = 0;
1100 			done = true;
1101 			dm_device.state = DM_INITIALIZED;
1102 		}
1103 
1104 		/*
1105 		 * We are pushing a lot of data through the channel;
1106 		 * deal with transient failures caused by the
1107 		 * lack of space in the ring buffer.
1108 		 */
1109 
1110 		do {
1111 			bl_resp->hdr.trans_id = atomic_inc_return(&trans_id);
1112 			ret = vmbus_sendpacket(dm_device.dev->channel,
1113 						bl_resp,
1114 						bl_resp->hdr.size,
1115 						(unsigned long)NULL,
1116 						VM_PKT_DATA_INBAND, 0);
1117 
1118 			if (ret == -EAGAIN)
1119 				msleep(20);
1120 
1121 		} while (ret == -EAGAIN);
1122 
1123 		if (ret) {
1124 			/*
1125 			 * Free up the memory we allocated.
1126 			 */
1127 			pr_info("Balloon response failed\n");
1128 
1129 			for (i = 0; i < bl_resp->range_count; i++)
1130 				free_balloon_pages(&dm_device,
1131 						 &bl_resp->range_array[i]);
1132 
1133 			done = true;
1134 		}
1135 	}
1136 
1137 }
1138 
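/*
 * Handle an un-balloon request: free the page ranges named by the host
 * and send the un-balloon response once the last message of the
 * transaction has been processed.
 */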
1139 static void balloon_down(struct hv_dynmem_device *dm,
1140 			struct dm_unballoon_request *req)
1141 {
1142 	union dm_mem_page_range *range_array = req->range_array;
1143 	int range_count = req->range_count;
1144 	struct dm_unballoon_response resp;
1145 	int i;
1146 
1147 	for (i = 0; i < range_count; i++)
1148 		free_balloon_pages(dm, &range_array[i]);
1149 
1150 	if (req->more_pages == 1)
1151 		return;
1152 
1153 	memset(&resp, 0, sizeof(struct dm_unballoon_response));
1154 	resp.hdr.type = DM_UNBALLOON_RESPONSE;
1155 	resp.hdr.trans_id = atomic_inc_return(&trans_id);
1156 	resp.hdr.size = sizeof(struct dm_unballoon_response);
1157 
1158 	vmbus_sendpacket(dm_device.dev->channel, &resp,
1159 				sizeof(struct dm_unballoon_response),
1160 				(unsigned long)NULL,
1161 				VM_PKT_DATA_INBAND, 0);
1162 
1163 	dm->state = DM_INITIALIZED;
1164 }
1165 
1166 static void balloon_onchannelcallback(void *context);
1167 
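/*
 * Main driver thread: post a memory pressure report to the host roughly
 * once per second.
 */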
1168 static int dm_thread_func(void *dm_dev)
1169 {
1170 	struct hv_dynmem_device *dm = dm_dev;
1171 	int t;
1172 
1173 	while (!kthread_should_stop()) {
1174 		t = wait_for_completion_timeout(&dm_device.config_event, 1*HZ);
1175 		/*
1176 		 * The host expects us to post information on the memory
1177 		 * pressure every second.
1178 		 */
1179 
1180 		if (t == 0)
1181 			post_status(dm);
1182 
1183 	}
1184 
1185 	return 0;
1186 }
1187 
1188 
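/*
 * Handle the host's response to our version request; if the proposed
 * version was rejected, retry with the next lower version or give up.
 */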
1189 static void version_resp(struct hv_dynmem_device *dm,
1190 			struct dm_version_response *vresp)
1191 {
1192 	struct dm_version_request version_req;
1193 	int ret;
1194 
1195 	if (vresp->is_accepted) {
1196 		/*
1197 		 * We are done; wakeup the
1198 		 * context waiting for version
1199 		 * negotiation.
1200 		 */
1201 		complete(&dm->host_event);
1202 		return;
1203 	}
1204 	/*
1205 	 * If there are more versions to try, continue
1206 	 * with negotiations; if not,
1207 	 * shut down the service since we are not able
1208 	 * to negotiate a suitable version number
1209 	 * with the host.
1210 	 */
1211 	if (dm->next_version == 0)
1212 		goto version_error;
1213 
1214 	dm->next_version = 0;
1215 	memset(&version_req, 0, sizeof(struct dm_version_request));
1216 	version_req.hdr.type = DM_VERSION_REQUEST;
1217 	version_req.hdr.size = sizeof(struct dm_version_request);
1218 	version_req.hdr.trans_id = atomic_inc_return(&trans_id);
1219 	version_req.version.version = DYNMEM_PROTOCOL_VERSION_WIN7;
1220 	version_req.is_last_attempt = 1;
1221 
1222 	ret = vmbus_sendpacket(dm->dev->channel, &version_req,
1223 				sizeof(struct dm_version_request),
1224 				(unsigned long)NULL,
1225 				VM_PKT_DATA_INBAND, 0);
1226 
1227 	if (ret)
1228 		goto version_error;
1229 
1230 	return;
1231 
1232 version_error:
1233 	dm->state = DM_INIT_ERROR;
1234 	complete(&dm->host_event);
1235 }
1236 
1237 static void cap_resp(struct hv_dynmem_device *dm,
1238 			struct dm_capabilities_resp_msg *cap_resp)
1239 {
1240 	if (!cap_resp->is_accepted) {
1241 		pr_info("Capabilities not accepted by host\n");
1242 		dm->state = DM_INIT_ERROR;
1243 	}
1244 	complete(&dm->host_event);
1245 }
1246 
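/*
 * Channel callback: dispatch incoming Dynamic Memory messages to the
 * appropriate handler. Balloon-up and hot-add requests are deferred to
 * work queue handlers.
 */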
1247 static void balloon_onchannelcallback(void *context)
1248 {
1249 	struct hv_device *dev = context;
1250 	u32 recvlen;
1251 	u64 requestid;
1252 	struct dm_message *dm_msg;
1253 	struct dm_header *dm_hdr;
1254 	struct hv_dynmem_device *dm = hv_get_drvdata(dev);
1255 	struct dm_balloon *bal_msg;
1256 	struct dm_hot_add *ha_msg;
1257 	union dm_mem_page_range *ha_pg_range;
1258 	union dm_mem_page_range *ha_region;
1259 
1260 	memset(recv_buffer, 0, sizeof(recv_buffer));
1261 	vmbus_recvpacket(dev->channel, recv_buffer,
1262 			 PAGE_SIZE, &recvlen, &requestid);
1263 
1264 	if (recvlen > 0) {
1265 		dm_msg = (struct dm_message *)recv_buffer;
1266 		dm_hdr = &dm_msg->hdr;
1267 
1268 		switch (dm_hdr->type) {
1269 		case DM_VERSION_RESPONSE:
1270 			version_resp(dm,
1271 				 (struct dm_version_response *)dm_msg);
1272 			break;
1273 
1274 		case DM_CAPABILITIES_RESPONSE:
1275 			cap_resp(dm,
1276 				 (struct dm_capabilities_resp_msg *)dm_msg);
1277 			break;
1278 
1279 		case DM_BALLOON_REQUEST:
1280 			if (dm->state == DM_BALLOON_UP)
1281 				pr_warn("Currently ballooning\n");
1282 			bal_msg = (struct dm_balloon *)recv_buffer;
1283 			dm->state = DM_BALLOON_UP;
1284 			dm_device.balloon_wrk.num_pages = bal_msg->num_pages;
1285 			schedule_work(&dm_device.balloon_wrk.wrk);
1286 			break;
1287 
1288 		case DM_UNBALLOON_REQUEST:
1289 			dm->state = DM_BALLOON_DOWN;
1290 			balloon_down(dm,
1291 				 (struct dm_unballoon_request *)recv_buffer);
1292 			break;
1293 
1294 		case DM_MEM_HOT_ADD_REQUEST:
1295 			if (dm->state == DM_HOT_ADD)
1296 				pr_warn("Currently hot-adding\n");
1297 			dm->state = DM_HOT_ADD;
1298 			ha_msg = (struct dm_hot_add *)recv_buffer;
1299 			if (ha_msg->hdr.size == sizeof(struct dm_hot_add)) {
1300 				/*
1301 				 * This is a normal hot-add request specifying
1302 				 * hot-add memory.
1303 				 */
1304 				ha_pg_range = &ha_msg->range;
1305 				dm->ha_wrk.ha_page_range = *ha_pg_range;
1306 				dm->ha_wrk.ha_region_range.page_range = 0;
1307 			} else {
1308 				/*
1309 				 * Host is specifying that we first hot-add
1310 				 * a region and then partially populate this
1311 				 * region.
1312 				 */
1313 				dm->host_specified_ha_region = true;
1314 				ha_pg_range = &ha_msg->range;
1315 				ha_region = &ha_pg_range[1];
1316 				dm->ha_wrk.ha_page_range = *ha_pg_range;
1317 				dm->ha_wrk.ha_region_range = *ha_region;
1318 			}
1319 			schedule_work(&dm_device.ha_wrk.wrk);
1320 			break;
1321 
1322 		case DM_INFO_MESSAGE:
1323 			process_info(dm, (struct dm_info_msg *)dm_msg);
1324 			break;
1325 
1326 		default:
1327 			pr_err("Unhandled message: type: %d\n", dm_hdr->type);
1328 
1329 		}
1330 	}
1331 
1332 }
1333 
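/*
 * Probe: open the VMBus channel, start the status-posting thread, then
 * negotiate a protocol version and report our capabilities to the host.
 */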
1334 static int balloon_probe(struct hv_device *dev,
1335 			const struct hv_vmbus_device_id *dev_id)
1336 {
1337 	int ret, t;
1338 	struct dm_version_request version_req;
1339 	struct dm_capabilities cap_msg;
1340 
1341 	do_hot_add = hot_add;
1342 
1343 	/*
1344 	 * First allocate a send buffer.
1345 	 */
1346 
1347 	send_buffer = kmalloc(PAGE_SIZE, GFP_KERNEL);
1348 	if (!send_buffer)
1349 		return -ENOMEM;
1350 
1351 	ret = vmbus_open(dev->channel, dm_ring_size, dm_ring_size, NULL, 0,
1352 			balloon_onchannelcallback, dev);
1353 
1354 	if (ret)
1355 		goto probe_error0;
1356 
1357 	dm_device.dev = dev;
1358 	dm_device.state = DM_INITIALIZING;
1359 	dm_device.next_version = DYNMEM_PROTOCOL_VERSION_WIN7;
1360 	init_completion(&dm_device.host_event);
1361 	init_completion(&dm_device.config_event);
1362 	INIT_LIST_HEAD(&dm_device.ha_region_list);
1363 	INIT_WORK(&dm_device.balloon_wrk.wrk, balloon_up);
1364 	INIT_WORK(&dm_device.ha_wrk.wrk, hot_add_req);
1365 	dm_device.host_specified_ha_region = false;
1366 
1367 	dm_device.thread =
1368 		 kthread_run(dm_thread_func, &dm_device, "hv_balloon");
1369 	if (IS_ERR(dm_device.thread)) {
1370 		ret = PTR_ERR(dm_device.thread);
1371 		goto probe_error1;
1372 	}
1373 
1374 #ifdef CONFIG_MEMORY_HOTPLUG
1375 	set_online_page_callback(&hv_online_page);
1376 #endif
1377 
1378 	hv_set_drvdata(dev, &dm_device);
1379 	/*
1380 	 * Initiate the handshake with the host and negotiate
1381 	 * a version that the host can support. We start with the
1382 	 * highest version number and go down if the host cannot
1383 	 * support it.
1384 	 */
1385 	memset(&version_req, 0, sizeof(struct dm_version_request));
1386 	version_req.hdr.type = DM_VERSION_REQUEST;
1387 	version_req.hdr.size = sizeof(struct dm_version_request);
1388 	version_req.hdr.trans_id = atomic_inc_return(&trans_id);
1389 	version_req.version.version = DYNMEM_PROTOCOL_VERSION_WIN8;
1390 	version_req.is_last_attempt = 0;
1391 
1392 	ret = vmbus_sendpacket(dev->channel, &version_req,
1393 				sizeof(struct dm_version_request),
1394 				(unsigned long)NULL,
1395 				VM_PKT_DATA_INBAND, 0);
1396 	if (ret)
1397 		goto probe_error2;
1398 
1399 	t = wait_for_completion_timeout(&dm_device.host_event, 5*HZ);
1400 	if (t == 0) {
1401 		ret = -ETIMEDOUT;
1402 		goto probe_error2;
1403 	}
1404 
1405 	/*
1406 	 * If we could not negotiate a compatible version with the host,
1407 	 * fail the probe function.
1408 	 */
1409 	if (dm_device.state == DM_INIT_ERROR) {
1410 		ret = -ETIMEDOUT;
1411 		goto probe_error2;
1412 	}
1413 	/*
1414 	 * Now submit our capabilities to the host.
1415 	 */
1416 	memset(&cap_msg, 0, sizeof(struct dm_capabilities));
1417 	cap_msg.hdr.type = DM_CAPABILITIES_REPORT;
1418 	cap_msg.hdr.size = sizeof(struct dm_capabilities);
1419 	cap_msg.hdr.trans_id = atomic_inc_return(&trans_id);
1420 
1421 	cap_msg.caps.cap_bits.balloon = 1;
1422 	cap_msg.caps.cap_bits.hot_add = 1;
1423 
1424 	/*
1425 	 * Specify our alignment requirements as they relate to
1426 	 * memory hot-add. Specify 128MB alignment.
1427 	 */
1428 	cap_msg.caps.cap_bits.hot_add_alignment = 7;
1429 
1430 	/*
1431 	 * Currently the host does not use these
1432 	 * values and we set them to what is done in the
1433 	 * Windows driver.
1434 	 */
1435 	cap_msg.min_page_cnt = 0;
1436 	cap_msg.max_page_number = -1;
1437 
1438 	ret = vmbus_sendpacket(dev->channel, &cap_msg,
1439 				sizeof(struct dm_capabilities),
1440 				(unsigned long)NULL,
1441 				VM_PKT_DATA_INBAND, 0);
1442 	if (ret)
1443 		goto probe_error2;
1444 
1445 	t = wait_for_completion_timeout(&dm_device.host_event, 5*HZ);
1446 	if (t == 0) {
1447 		ret = -ETIMEDOUT;
1448 		goto probe_error2;
1449 	}
1450 
1451 	/*
1452 	 * If the host does not like our capabilities,
1453 	 * fail the probe function.
1454 	 */
1455 	if (dm_device.state == DM_INIT_ERROR) {
1456 		ret = -ETIMEDOUT;
1457 		goto probe_error2;
1458 	}
1459 
1460 	dm_device.state = DM_INITIALIZED;
1461 
1462 	return 0;
1463 
1464 probe_error2:
1465 #ifdef CONFIG_MEMORY_HOTPLUG
1466 	restore_online_page_callback(&hv_online_page);
1467 #endif
1468 	kthread_stop(dm_device.thread);
1469 
1470 probe_error1:
1471 	vmbus_close(dev->channel);
1472 probe_error0:
1473 	kfree(send_buffer);
1474 	return ret;
1475 }
1476 
1477 static int balloon_remove(struct hv_device *dev)
1478 {
1479 	struct hv_dynmem_device *dm = hv_get_drvdata(dev);
1480 	struct list_head *cur, *tmp;
1481 	struct hv_hotadd_state *has;
1482 
1483 	if (dm->num_pages_ballooned != 0)
1484 		pr_warn("Ballooned pages: %d\n", dm->num_pages_ballooned);
1485 
1486 	cancel_work_sync(&dm->balloon_wrk.wrk);
1487 	cancel_work_sync(&dm->ha_wrk.wrk);
1488 
1489 	vmbus_close(dev->channel);
1490 	kthread_stop(dm->thread);
1491 	kfree(send_buffer);
1492 #ifdef CONFIG_MEMORY_HOTPLUG
1493 	restore_online_page_callback(&hv_online_page);
1494 #endif
1495 	list_for_each_safe(cur, tmp, &dm->ha_region_list) {
1496 		has = list_entry(cur, struct hv_hotadd_state, list);
1497 		list_del(&has->list);
1498 		kfree(has);
1499 	}
1500 
1501 	return 0;
1502 }
1503 
1504 static const struct hv_vmbus_device_id id_table[] = {
1505 	/* Dynamic Memory Class ID */
1506 	/* 525074DC-8985-46e2-8057-A307DC18A502 */
1507 	{ HV_DM_GUID, },
1508 	{ },
1509 };
1510 
1511 MODULE_DEVICE_TABLE(vmbus, id_table);
1512 
1513 static  struct hv_driver balloon_drv = {
1514 	.name = "hv_balloon",
1515 	.id_table = id_table,
1516 	.probe =  balloon_probe,
1517 	.remove =  balloon_remove,
1518 };
1519 
1520 static int __init init_balloon_drv(void)
1521 {
1522 
1523 	return vmbus_driver_register(&balloon_drv);
1524 }
1525 
1526 module_init(init_balloon_drv);
1527 
1528 MODULE_DESCRIPTION("Hyper-V Balloon");
1529 MODULE_LICENSE("GPL");
1530