// SPDX-License-Identifier: GPL-2.0
/*
 * VMware Balloon driver.
 *
 * Copyright (C) 2000-2018, VMware, Inc. All Rights Reserved.
 *
 * This is the VMware physical memory management driver for Linux. The
 * driver acts like a "balloon" that can be inflated to reclaim physical
 * pages by reserving them in the guest and invalidating them in the
 * monitor, freeing up the underlying machine pages so they can be
 * allocated to other guests. The balloon can also be deflated to allow
 * the guest to use more physical memory. Higher level policies can
 * control the sizes of balloons in VMs in order to manage physical
 * memory resources.
 */

//#define DEBUG
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/workqueue.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include <linux/vmw_vmci_defs.h>
#include <linux/vmw_vmci_api.h>
#include <asm/hypervisor.h>

MODULE_AUTHOR("VMware, Inc.");
MODULE_DESCRIPTION("VMware Memory Control (Balloon) Driver");
MODULE_VERSION("1.5.0.0-k");
MODULE_ALIAS("dmi:*:svnVMware*:*");
MODULE_ALIAS("vmware_vmmemctl");
MODULE_LICENSE("GPL");

/*
 * Use __GFP_HIGHMEM to allow pages from the HIGHMEM zone. We don't
 * allow waiting (__GFP_RECLAIM) for NOSLEEP page allocations. Use
 * __GFP_NOWARN to suppress page allocation failure warnings.
 */
#define VMW_PAGE_ALLOC_NOSLEEP		(__GFP_HIGHMEM|__GFP_NOWARN)

/*
 * Use GFP_HIGHUSER when executing in a separate kernel thread
 * context and the allocation can sleep.  This is less stressful to
 * the guest memory system, since it allows the thread to block
 * while memory is reclaimed, and won't take pages from emergency
 * low-memory pools.
 */
#define VMW_PAGE_ALLOC_CANSLEEP		(GFP_HIGHUSER)

/* Maximum number of refused pages we accumulate during an inflation cycle */
#define VMW_BALLOON_MAX_REFUSED		16

/*
 * Hypervisor communication port definitions.
 */
#define VMW_BALLOON_HV_PORT		0x5670
#define VMW_BALLOON_HV_MAGIC		0x456c6d6f
#define VMW_BALLOON_GUEST_ID		1	/* Linux */
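
/*
 * Note: commands are issued through the VMware "backdoor", an inl
 * instruction on I/O port VMW_BALLOON_HV_PORT with VMW_BALLOON_HV_MAGIC
 * loaded into %eax; the hypervisor intercepts the port access instead of
 * forwarding it to hardware (see VMWARE_BALLOON_CMD() below).
 */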

enum vmwballoon_capabilities {
	/*
	 * Bit 0 is reserved and not associated with any capability.
	 */
	VMW_BALLOON_BASIC_CMDS			= (1 << 1),
	VMW_BALLOON_BATCHED_CMDS		= (1 << 2),
	VMW_BALLOON_BATCHED_2M_CMDS		= (1 << 3),
	VMW_BALLOON_SIGNALLED_WAKEUP_CMD	= (1 << 4),
};

#define VMW_BALLOON_CAPABILITIES	(VMW_BALLOON_BASIC_CMDS \
					| VMW_BALLOON_BATCHED_CMDS \
					| VMW_BALLOON_BATCHED_2M_CMDS \
					| VMW_BALLOON_SIGNALLED_WAKEUP_CMD)

#define VMW_BALLOON_2M_SHIFT		(9)
#define VMW_BALLOON_NUM_PAGE_SIZES	(2)
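
/*
 * A 2MB huge page spans 1 << VMW_BALLOON_2M_SHIFT == 512 small (4k)
 * pages (on x86 with 4k base pages). Arrays sized
 * VMW_BALLOON_NUM_PAGE_SIZES are indexed by the boolean is_2m_pages:
 * index 0 (false) is 4k pages, index 1 (true) is 2MB pages.
 */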

/*
 * Backdoor commands availability:
 *
 * START, GET_TARGET and GUEST_ID are always available.
 *
 * VMW_BALLOON_BASIC_CMDS:
 *	LOCK and UNLOCK commands,
 * VMW_BALLOON_BATCHED_CMDS:
 *	BATCHED_LOCK and BATCHED_UNLOCK commands,
 * VMW_BALLOON_BATCHED_2M_CMDS:
 *	BATCHED_2M_LOCK and BATCHED_2M_UNLOCK commands,
 * VMW_BALLOON_SIGNALLED_WAKEUP_CMD:
 *	VMW_BALLOON_CMD_VMCI_DOORBELL_SET command.
 */
#define VMW_BALLOON_CMD_START			0
#define VMW_BALLOON_CMD_GET_TARGET		1
#define VMW_BALLOON_CMD_LOCK			2
#define VMW_BALLOON_CMD_UNLOCK			3
#define VMW_BALLOON_CMD_GUEST_ID		4
#define VMW_BALLOON_CMD_BATCHED_LOCK		6
#define VMW_BALLOON_CMD_BATCHED_UNLOCK		7
#define VMW_BALLOON_CMD_BATCHED_2M_LOCK		8
#define VMW_BALLOON_CMD_BATCHED_2M_UNLOCK	9
#define VMW_BALLOON_CMD_VMCI_DOORBELL_SET	10


/* error codes */
#define VMW_BALLOON_SUCCESS		        0
#define VMW_BALLOON_FAILURE		        -1
#define VMW_BALLOON_ERROR_CMD_INVALID	        1
#define VMW_BALLOON_ERROR_PPN_INVALID	        2
#define VMW_BALLOON_ERROR_PPN_LOCKED	        3
#define VMW_BALLOON_ERROR_PPN_UNLOCKED	        4
#define VMW_BALLOON_ERROR_PPN_PINNED	        5
#define VMW_BALLOON_ERROR_PPN_NOTNEEDED	        6
#define VMW_BALLOON_ERROR_RESET		        7
#define VMW_BALLOON_ERROR_BUSY		        8

#define VMW_BALLOON_SUCCESS_WITH_CAPABILITIES	(0x03000000)

/* Batch page description */

/*
 * Layout of a page in the batch page:
 *
 * +-------------+----------+--------+
 * |             |          |        |
 * | Page number | Reserved | Status |
 * |             |          |        |
 * +-------------+----------+--------+
 * 64  PAGE_SHIFT          6         0
 *
 * The reserved field should be set to 0.
 */
#define VMW_BALLOON_BATCH_MAX_PAGES	(PAGE_SIZE / sizeof(u64))
#define VMW_BALLOON_BATCH_STATUS_MASK	((1UL << 5) - 1)
#define VMW_BALLOON_BATCH_PAGE_MASK	(~((1UL << PAGE_SHIFT) - 1))

struct vmballoon_batch_page {
	u64 pages[VMW_BALLOON_BATCH_MAX_PAGES];
};
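
/*
 * Worked example (illustrative values only): a 4k page with PFN 0x1234
 * is staged into slot i of the batch page as
 *
 *	batch->pages[i] = 0x1234UL << PAGE_SHIFT;	(== 0x1234000)
 *
 * and, after the hypervisor call, the low bits of the same entry hold
 * the per-page status:
 *
 *	status = batch->pages[i] & VMW_BALLOON_BATCH_STATUS_MASK;
 */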

static u64 vmballoon_batch_get_pa(struct vmballoon_batch_page *batch, int idx)
{
	return batch->pages[idx] & VMW_BALLOON_BATCH_PAGE_MASK;
}

static int vmballoon_batch_get_status(struct vmballoon_batch_page *batch,
				int idx)
{
	return (int)(batch->pages[idx] & VMW_BALLOON_BATCH_STATUS_MASK);
}

static void vmballoon_batch_set_pa(struct vmballoon_batch_page *batch, int idx,
				u64 pa)
{
	batch->pages[idx] = pa;
}


#define VMWARE_BALLOON_CMD(cmd, arg1, arg2, result)		\
({								\
	unsigned long __status, __dummy1, __dummy2, __dummy3;	\
	__asm__ __volatile__ ("inl %%dx" :			\
		"=a"(__status),					\
		"=c"(__dummy1),					\
		"=d"(__dummy2),					\
		"=b"(result),					\
		"=S" (__dummy3) :				\
		"0"(VMW_BALLOON_HV_MAGIC),			\
		"1"(VMW_BALLOON_CMD_##cmd),			\
		"2"(VMW_BALLOON_HV_PORT),			\
		"3"(arg1),					\
		"4" (arg2) :					\
		"memory");					\
	if (VMW_BALLOON_CMD_##cmd == VMW_BALLOON_CMD_START)	\
		result = __dummy1;				\
	result &= -1UL;						\
	__status & -1UL;					\
})
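
/*
 * Usage sketch (mirroring vmballoon_send_get_target() below; variable
 * names here are illustrative only):
 *
 *	unsigned long status, target, dummy = 0;
 *
 *	status = VMWARE_BALLOON_CMD(GET_TARGET, limit, dummy, target);
 *	if (status == VMW_BALLOON_SUCCESS)
 *		... "target" now holds the host-requested balloon size ...
 *
 * The status comes back in %eax and the result in %ebx, except for the
 * START command, whose result (the host capabilities) is returned in
 * %ecx; that is why the macro copies __dummy1 into "result" for START.
 */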

#ifdef CONFIG_DEBUG_FS
struct vmballoon_stats {
	unsigned int timer;
	unsigned int doorbell;

	/* allocation statistics */
	unsigned int alloc[VMW_BALLOON_NUM_PAGE_SIZES];
	unsigned int alloc_fail[VMW_BALLOON_NUM_PAGE_SIZES];
	unsigned int sleep_alloc;
	unsigned int sleep_alloc_fail;
	unsigned int refused_alloc[VMW_BALLOON_NUM_PAGE_SIZES];
	unsigned int refused_free[VMW_BALLOON_NUM_PAGE_SIZES];
	unsigned int free[VMW_BALLOON_NUM_PAGE_SIZES];

	/* monitor operations */
	unsigned int lock[VMW_BALLOON_NUM_PAGE_SIZES];
	unsigned int lock_fail[VMW_BALLOON_NUM_PAGE_SIZES];
	unsigned int unlock[VMW_BALLOON_NUM_PAGE_SIZES];
	unsigned int unlock_fail[VMW_BALLOON_NUM_PAGE_SIZES];
	unsigned int target;
	unsigned int target_fail;
	unsigned int start;
	unsigned int start_fail;
	unsigned int guest_type;
	unsigned int guest_type_fail;
	unsigned int doorbell_set;
	unsigned int doorbell_unset;
};

#define STATS_INC(stat) (stat)++
#else
#define STATS_INC(stat)
#endif

struct vmballoon;

struct vmballoon_ops {
	void (*add_page)(struct vmballoon *b, int idx, struct page *p);
	int (*lock)(struct vmballoon *b, unsigned int num_pages,
			bool is_2m_pages, unsigned int *target);
	int (*unlock)(struct vmballoon *b, unsigned int num_pages,
			bool is_2m_pages, unsigned int *target);
};
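
/*
 * Two implementations of these ops are provided below:
 * vmballoon_basic_ops, which locks/unlocks a single 4k page per
 * hypervisor call, and vmballoon_batched_ops, which first stages up to
 * batch_max_pages entries in the batch page. The choice between them is
 * made in vmballoon_reset() based on the capabilities returned by the
 * START command.
 */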

struct vmballoon_page_size {
	/* list of reserved physical pages */
	struct list_head pages;

	/* transient list of non-balloonable pages */
	struct list_head refused_pages;
	unsigned int n_refused_pages;
};

struct vmballoon {
	struct vmballoon_page_size page_sizes[VMW_BALLOON_NUM_PAGE_SIZES];

	/* supported page sizes. 1 == 4k pages only, 2 == 4k and 2m pages */
	unsigned supported_page_sizes;

	/* balloon size in pages */
	unsigned int size;
	unsigned int target;

	/* reset flag */
	bool reset_required;

	unsigned long capabilities;

	struct vmballoon_batch_page *batch_page;
	unsigned int batch_max_pages;
	struct page *page;

	const struct vmballoon_ops *ops;

#ifdef CONFIG_DEBUG_FS
	/* statistics */
	struct vmballoon_stats stats;

	/* debugfs file exporting statistics */
	struct dentry *dbg_entry;
#endif

	struct sysinfo sysinfo;

	struct delayed_work dwork;

	struct vmci_handle vmci_doorbell;
};

static struct vmballoon balloon;

/*
 * Send the "start" command to the host, communicating the supported
 * version of the protocol.
 */
static bool vmballoon_send_start(struct vmballoon *b, unsigned long req_caps)
{
	unsigned long status, capabilities, dummy = 0;
	bool success;

	STATS_INC(b->stats.start);

	status = VMWARE_BALLOON_CMD(START, req_caps, dummy, capabilities);

	switch (status) {
	case VMW_BALLOON_SUCCESS_WITH_CAPABILITIES:
		b->capabilities = capabilities;
		success = true;
		break;
	case VMW_BALLOON_SUCCESS:
		b->capabilities = VMW_BALLOON_BASIC_CMDS;
		success = true;
		break;
	default:
		success = false;
	}

	/*
	 * 2MB pages are only supported with batching. If batching is for some
	 * reason disabled, do not use 2MB pages, since otherwise the legacy
	 * mechanism is used with 2MB pages, causing a failure.
	 */
	if ((b->capabilities & VMW_BALLOON_BATCHED_2M_CMDS) &&
	    (b->capabilities & VMW_BALLOON_BATCHED_CMDS))
		b->supported_page_sizes = 2;
	else
		b->supported_page_sizes = 1;

	if (!success) {
		pr_debug("%s - failed, hv returns %ld\n", __func__, status);
		STATS_INC(b->stats.start_fail);
	}
	return success;
}

static bool vmballoon_check_status(struct vmballoon *b, unsigned long status)
{
	switch (status) {
	case VMW_BALLOON_SUCCESS:
		return true;

	case VMW_BALLOON_ERROR_RESET:
		b->reset_required = true;
		/* fall through */

	default:
		return false;
	}
}
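
/*
 * Note that any command may return VMW_BALLOON_ERROR_RESET. Setting
 * reset_required here makes the next iteration of vmballoon_work()
 * re-run the full reset sequence (vmballoon_reset()) before any further
 * inflation or deflation is attempted.
 */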

/*
 * Communicate the guest type to the host so that it can adjust the
 * ballooning algorithm to the one most appropriate for the guest. This
 * command is normally issued after sending the "start" command and is
 * part of the standard reset sequence.
 */
static bool vmballoon_send_guest_id(struct vmballoon *b)
{
	unsigned long status, dummy = 0;

	status = VMWARE_BALLOON_CMD(GUEST_ID, VMW_BALLOON_GUEST_ID, dummy,
				dummy);

	STATS_INC(b->stats.guest_type);

	if (vmballoon_check_status(b, status))
		return true;

	pr_debug("%s - failed, hv returns %ld\n", __func__, status);
	STATS_INC(b->stats.guest_type_fail);
	return false;
}

static u16 vmballoon_page_size(bool is_2m_page)
{
	if (is_2m_page)
		return 1 << VMW_BALLOON_2M_SHIFT;

	return 1;
}
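
/*
 * vmballoon_page_size() returns the page size in units of small (4k)
 * pages: vmballoon_page_size(true) == 512 and
 * vmballoon_page_size(false) == 1, so all balloon size accounting
 * (b->size, b->target) is kept in 4k pages regardless of which page
 * size was ballooned.
 */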

/*
 * Retrieve the desired balloon size from the host.
 */
static bool vmballoon_send_get_target(struct vmballoon *b, u32 *new_target)
{
	unsigned long status;
	unsigned long target;
	unsigned long limit;
	unsigned long dummy = 0;
	u32 limit32;

	/*
	 * si_meminfo() is cheap. Moreover, we want to provide a dynamic
	 * max balloon size later. So let us call si_meminfo() every
	 * iteration.
	 */
	si_meminfo(&b->sysinfo);
	limit = b->sysinfo.totalram;

	/* Ensure limit fits in 32 bits */
	limit32 = (u32)limit;
	if (limit != limit32)
		return false;

	/* update stats */
	STATS_INC(b->stats.target);

	status = VMWARE_BALLOON_CMD(GET_TARGET, limit, dummy, target);
	if (vmballoon_check_status(b, status)) {
		*new_target = target;
		return true;
	}

	pr_debug("%s - failed, hv returns %ld\n", __func__, status);
	STATS_INC(b->stats.target_fail);
	return false;
}

/*
 * Notify the host about an allocated page so that the host can use it
 * without fear that the guest will need it. The host may reject some
 * pages; we need to check the return value and maybe submit a different
 * page.
 */
static int vmballoon_send_lock_page(struct vmballoon *b, unsigned long pfn,
				unsigned int *hv_status, unsigned int *target)
{
	unsigned long status, dummy = 0;
	u32 pfn32;

	pfn32 = (u32)pfn;
	if (pfn32 != pfn)
		return -EINVAL;

	STATS_INC(b->stats.lock[false]);

	*hv_status = status = VMWARE_BALLOON_CMD(LOCK, pfn, dummy, *target);
	if (vmballoon_check_status(b, status))
		return 0;

	pr_debug("%s - ppn %lx, hv returns %ld\n", __func__, pfn, status);
	STATS_INC(b->stats.lock_fail[false]);
	return -EIO;
}

static int vmballoon_send_batched_lock(struct vmballoon *b,
		unsigned int num_pages, bool is_2m_pages, unsigned int *target)
{
	unsigned long status;
	unsigned long pfn = PHYS_PFN(virt_to_phys(b->batch_page));

	STATS_INC(b->stats.lock[is_2m_pages]);

	if (is_2m_pages)
		status = VMWARE_BALLOON_CMD(BATCHED_2M_LOCK, pfn, num_pages,
				*target);
	else
		status = VMWARE_BALLOON_CMD(BATCHED_LOCK, pfn, num_pages,
				*target);

	if (vmballoon_check_status(b, status))
		return 0;

	pr_debug("%s - batch ppn %lx, hv returns %ld\n", __func__, pfn, status);
	STATS_INC(b->stats.lock_fail[is_2m_pages]);
	return 1;
}

/*
 * Notify the host that the guest intends to release the given page back
 * into the pool of pages available to the guest.
 */
static bool vmballoon_send_unlock_page(struct vmballoon *b, unsigned long pfn,
							unsigned int *target)
{
	unsigned long status, dummy = 0;
	u32 pfn32;

	pfn32 = (u32)pfn;
	if (pfn32 != pfn)
		return false;

	STATS_INC(b->stats.unlock[false]);

	status = VMWARE_BALLOON_CMD(UNLOCK, pfn, dummy, *target);
	if (vmballoon_check_status(b, status))
		return true;

	pr_debug("%s - ppn %lx, hv returns %ld\n", __func__, pfn, status);
	STATS_INC(b->stats.unlock_fail[false]);
	return false;
}

static bool vmballoon_send_batched_unlock(struct vmballoon *b,
		unsigned int num_pages, bool is_2m_pages, unsigned int *target)
{
	unsigned long status;
	unsigned long pfn = PHYS_PFN(virt_to_phys(b->batch_page));

	STATS_INC(b->stats.unlock[is_2m_pages]);

	if (is_2m_pages)
		status = VMWARE_BALLOON_CMD(BATCHED_2M_UNLOCK, pfn, num_pages,
				*target);
	else
		status = VMWARE_BALLOON_CMD(BATCHED_UNLOCK, pfn, num_pages,
				*target);

	if (vmballoon_check_status(b, status))
		return true;

	pr_debug("%s - batch ppn %lx, hv returns %ld\n", __func__, pfn, status);
	STATS_INC(b->stats.unlock_fail[is_2m_pages]);
	return false;
}

static struct page *vmballoon_alloc_page(gfp_t flags, bool is_2m_page)
{
	if (is_2m_page)
		return alloc_pages(flags, VMW_BALLOON_2M_SHIFT);

	return alloc_page(flags);
}

static void vmballoon_free_page(struct page *page, bool is_2m_page)
{
	if (is_2m_page)
		__free_pages(page, VMW_BALLOON_2M_SHIFT);
	else
		__free_page(page);
}
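
/*
 * For 2MB pages the helpers above allocate and free a higher-order
 * allocation of order VMW_BALLOON_2M_SHIFT (order 9, i.e. 512
 * contiguous 4k pages); 4k pages use plain alloc_page()/__free_page().
 */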

/*
 * Quickly release all pages allocated for the balloon. This function is
 * called when the host decides to "reset" the balloon for one reason or
 * another. Unlike a normal "deflate" we do not (shall not) notify the
 * host of the pages being released.
 */
static void vmballoon_pop(struct vmballoon *b)
{
	struct page *page, *next;
	unsigned is_2m_pages;

	for (is_2m_pages = 0; is_2m_pages < VMW_BALLOON_NUM_PAGE_SIZES;
			is_2m_pages++) {
		struct vmballoon_page_size *page_size =
				&b->page_sizes[is_2m_pages];
		u16 size_per_page = vmballoon_page_size(is_2m_pages);

		list_for_each_entry_safe(page, next, &page_size->pages, lru) {
			list_del(&page->lru);
			vmballoon_free_page(page, is_2m_pages);
			STATS_INC(b->stats.free[is_2m_pages]);
			b->size -= size_per_page;
			cond_resched();
		}
	}

	/* Clearing the batch_page unconditionally has no adverse effect */
	free_page((unsigned long)b->batch_page);
	b->batch_page = NULL;
}

/*
 * Notify the host of a ballooned page. If the host rejects the page,
 * put it on the list of refused pages; these refused pages are then
 * released at the end of the inflation cycle.
 */
static int vmballoon_lock_page(struct vmballoon *b, unsigned int num_pages,
				bool is_2m_pages, unsigned int *target)
{
	int locked, hv_status;
	struct page *page = b->page;
	struct vmballoon_page_size *page_size = &b->page_sizes[false];

	/* is_2m_pages can never happen as 2m page support implies batching */

	locked = vmballoon_send_lock_page(b, page_to_pfn(page), &hv_status,
								target);
	if (locked) {
		STATS_INC(b->stats.refused_alloc[false]);

		if (locked == -EIO &&
		    (hv_status == VMW_BALLOON_ERROR_RESET ||
		     hv_status == VMW_BALLOON_ERROR_PPN_NOTNEEDED)) {
			vmballoon_free_page(page, false);
			return -EIO;
		}

		/*
		 * Place the page on the list of non-balloonable pages
		 * and retry allocation, unless we already accumulated
		 * too many of them, in which case take a breather.
		 */
		if (page_size->n_refused_pages < VMW_BALLOON_MAX_REFUSED) {
			page_size->n_refused_pages++;
			list_add(&page->lru, &page_size->refused_pages);
		} else {
			vmballoon_free_page(page, false);
		}
		return locked;
	}

	/* track allocated page */
	list_add(&page->lru, &page_size->pages);

	/* update balloon size */
	b->size++;

	return 0;
}

static int vmballoon_lock_batched_page(struct vmballoon *b,
		unsigned int num_pages, bool is_2m_pages, unsigned int *target)
{
	int locked, i;
	u16 size_per_page = vmballoon_page_size(is_2m_pages);

	locked = vmballoon_send_batched_lock(b, num_pages, is_2m_pages,
			target);
	if (locked > 0) {
		for (i = 0; i < num_pages; i++) {
			u64 pa = vmballoon_batch_get_pa(b->batch_page, i);
			struct page *p = pfn_to_page(pa >> PAGE_SHIFT);

			vmballoon_free_page(p, is_2m_pages);
		}

		return -EIO;
	}

	for (i = 0; i < num_pages; i++) {
		u64 pa = vmballoon_batch_get_pa(b->batch_page, i);
		struct page *p = pfn_to_page(pa >> PAGE_SHIFT);
		struct vmballoon_page_size *page_size =
				&b->page_sizes[is_2m_pages];

		locked = vmballoon_batch_get_status(b->batch_page, i);

		switch (locked) {
		case VMW_BALLOON_SUCCESS:
			list_add(&p->lru, &page_size->pages);
			b->size += size_per_page;
			break;
		case VMW_BALLOON_ERROR_PPN_PINNED:
		case VMW_BALLOON_ERROR_PPN_INVALID:
			if (page_size->n_refused_pages
					< VMW_BALLOON_MAX_REFUSED) {
				list_add(&p->lru, &page_size->refused_pages);
				page_size->n_refused_pages++;
				break;
			}
			/* Fallthrough */
		case VMW_BALLOON_ERROR_RESET:
		case VMW_BALLOON_ERROR_PPN_NOTNEEDED:
			vmballoon_free_page(p, is_2m_pages);
			break;
		default:
			/* This should never happen */
			WARN_ON_ONCE(true);
		}
	}

	return 0;
}

/*
 * Release the page allocated for the balloon. Note that we first notify
 * the host so it can make sure the page will be available for the guest
 * to use, if needed.
 */
static int vmballoon_unlock_page(struct vmballoon *b, unsigned int num_pages,
		bool is_2m_pages, unsigned int *target)
{
	struct page *page = b->page;
	struct vmballoon_page_size *page_size = &b->page_sizes[false];

	/* is_2m_pages can never happen as 2m page support implies batching */

	if (!vmballoon_send_unlock_page(b, page_to_pfn(page), target)) {
		list_add(&page->lru, &page_size->pages);
		return -EIO;
	}

	/* deallocate page */
	vmballoon_free_page(page, false);
	STATS_INC(b->stats.free[false]);

	/* update balloon size */
	b->size--;

	return 0;
}

static int vmballoon_unlock_batched_page(struct vmballoon *b,
				unsigned int num_pages, bool is_2m_pages,
				unsigned int *target)
{
	int locked, i, ret = 0;
	bool hv_success;
	u16 size_per_page = vmballoon_page_size(is_2m_pages);

	hv_success = vmballoon_send_batched_unlock(b, num_pages, is_2m_pages,
			target);
	if (!hv_success)
		ret = -EIO;

	for (i = 0; i < num_pages; i++) {
		u64 pa = vmballoon_batch_get_pa(b->batch_page, i);
		struct page *p = pfn_to_page(pa >> PAGE_SHIFT);
		struct vmballoon_page_size *page_size =
				&b->page_sizes[is_2m_pages];

		locked = vmballoon_batch_get_status(b->batch_page, i);
		if (!hv_success || locked != VMW_BALLOON_SUCCESS) {
			/*
			 * That page wasn't successfully unlocked by the
			 * hypervisor; re-add it to the list of pages owned by
			 * the balloon driver.
			 */
			list_add(&p->lru, &page_size->pages);
		} else {
			/* deallocate page */
			vmballoon_free_page(p, is_2m_pages);
			STATS_INC(b->stats.free[is_2m_pages]);

			/* update balloon size */
			b->size -= size_per_page;
		}
	}

	return ret;
}

/*
 * Release pages that were allocated while attempting to inflate the
 * balloon but were refused by the host for one reason or another.
 */
static void vmballoon_release_refused_pages(struct vmballoon *b,
		bool is_2m_pages)
{
	struct page *page, *next;
	struct vmballoon_page_size *page_size =
			&b->page_sizes[is_2m_pages];

	list_for_each_entry_safe(page, next, &page_size->refused_pages, lru) {
		list_del(&page->lru);
		vmballoon_free_page(page, is_2m_pages);
		STATS_INC(b->stats.refused_free[is_2m_pages]);
	}

	page_size->n_refused_pages = 0;
}

static void vmballoon_add_page(struct vmballoon *b, int idx, struct page *p)
{
	b->page = p;
}

static void vmballoon_add_batched_page(struct vmballoon *b, int idx,
				struct page *p)
{
	vmballoon_batch_set_pa(b->batch_page, idx,
			(u64)page_to_pfn(p) << PAGE_SHIFT);
}
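
/*
 * With batching, add_page() only stages the page's physical address in
 * the batch page; nothing is sent to the host until ops->lock() or
 * ops->unlock() issues a single BATCHED_* command covering all staged
 * entries (at most b->batch_max_pages of them).
 */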

/*
 * Inflate the balloon towards its target size. Note that we try to limit
 * the rate of allocation to make sure we are not choking the rest of the
 * system.
 */
static void vmballoon_inflate(struct vmballoon *b)
{
	unsigned int num_pages = 0;
	int error = 0;
	gfp_t flags = VMW_PAGE_ALLOC_NOSLEEP;
	bool is_2m_pages;

	pr_debug("%s - size: %d, target %d\n", __func__, b->size, b->target);

	/*
	 * First try NOSLEEP page allocations to inflate balloon.
	 *
	 * If we do not throttle nosleep allocations, we can drain all
	 * free pages in the guest quickly (if the balloon target is high).
	 * As a side-effect, draining free pages helps to inform (force)
	 * the guest to start swapping if balloon target is not met yet,
	 * which is a desired behavior. However, the balloon driver can
	 * consume all available CPU cycles if too many pages are allocated
	 * in a second. Therefore, we throttle nosleep allocations even when
	 * the guest is not under memory pressure. OTOH, if we have already
	 * predicted that the guest is under memory pressure, then we
	 * slow down page allocations considerably.
	 */

	/*
	 * Start with the NOSLEEP allocation rate, which may be higher
	 * than the sleeping allocation rate.
	 */
	is_2m_pages = b->supported_page_sizes == VMW_BALLOON_NUM_PAGE_SIZES;

	pr_debug("%s - goal: %d\n", __func__, b->target - b->size);

	while (!b->reset_required &&
		b->size + num_pages * vmballoon_page_size(is_2m_pages)
		< b->target) {
		struct page *page;

		if (flags == VMW_PAGE_ALLOC_NOSLEEP)
			STATS_INC(b->stats.alloc[is_2m_pages]);
		else
			STATS_INC(b->stats.sleep_alloc);

		page = vmballoon_alloc_page(flags, is_2m_pages);
		if (!page) {
			STATS_INC(b->stats.alloc_fail[is_2m_pages]);

			if (is_2m_pages) {
				b->ops->lock(b, num_pages, true, &b->target);

				/*
				 * ignore errors from locking as we now switch
				 * to 4k pages and we might get different
				 * errors.
				 */

				num_pages = 0;
				is_2m_pages = false;
				continue;
			}

			if (flags == VMW_PAGE_ALLOC_CANSLEEP) {
				/*
				 * CANSLEEP page allocation failed, so the
				 * guest is under severe memory pressure. We
				 * just log the event and stop this inflation
				 * cycle; inflation is retried on the next
				 * timer iteration.
				 */
				STATS_INC(b->stats.sleep_alloc_fail);
				break;
			}

			/*
			 * NOSLEEP page allocation failed, so the guest is
			 * under memory pressure. Slowing down page allocations
			 * seems to be reasonable, but doing so might actually
			 * cause the hypervisor to throttle us down, resulting
			 * in degraded performance. We will count on the
			 * scheduler and standard memory management mechanisms
			 * for now.
			 */
			flags = VMW_PAGE_ALLOC_CANSLEEP;
			continue;
		}

		b->ops->add_page(b, num_pages++, page);
		if (num_pages == b->batch_max_pages) {
			error = b->ops->lock(b, num_pages, is_2m_pages,
					&b->target);
			num_pages = 0;
			if (error)
				break;
		}

		cond_resched();
	}

	if (num_pages > 0)
		b->ops->lock(b, num_pages, is_2m_pages, &b->target);

	vmballoon_release_refused_pages(b, true);
	vmballoon_release_refused_pages(b, false);
}

/*
 * Decrease the size of the balloon allowing the guest to use more memory.
 */
static void vmballoon_deflate(struct vmballoon *b)
{
	unsigned is_2m_pages;

	pr_debug("%s - size: %d, target %d\n", __func__, b->size, b->target);

	/* free pages to reach target */
	for (is_2m_pages = 0; is_2m_pages < b->supported_page_sizes;
			is_2m_pages++) {
		struct page *page, *next;
		unsigned int num_pages = 0;
		struct vmballoon_page_size *page_size =
				&b->page_sizes[is_2m_pages];

		list_for_each_entry_safe(page, next, &page_size->pages, lru) {
			if (b->reset_required ||
				(b->target > 0 &&
					b->size - num_pages
					* vmballoon_page_size(is_2m_pages)
				< b->target + vmballoon_page_size(true)))
				break;

			list_del(&page->lru);
			b->ops->add_page(b, num_pages++, page);

			if (num_pages == b->batch_max_pages) {
				int error;

				error = b->ops->unlock(b, num_pages,
						is_2m_pages, &b->target);
				num_pages = 0;
				if (error)
					return;
			}

			cond_resched();
		}

		if (num_pages > 0)
			b->ops->unlock(b, num_pages, is_2m_pages, &b->target);
	}
}

static const struct vmballoon_ops vmballoon_basic_ops = {
	.add_page = vmballoon_add_page,
	.lock = vmballoon_lock_page,
	.unlock = vmballoon_unlock_page
};

static const struct vmballoon_ops vmballoon_batched_ops = {
	.add_page = vmballoon_add_batched_page,
	.lock = vmballoon_lock_batched_page,
	.unlock = vmballoon_unlock_batched_page
};

static bool vmballoon_init_batching(struct vmballoon *b)
{
	struct page *page;

	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
	if (!page)
		return false;

	b->batch_page = page_address(page);
	return true;
}
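
/*
 * The batch page is a single zeroed kernel page, so with 4k pages it
 * holds PAGE_SIZE / sizeof(u64) == 512 entries, matching
 * VMW_BALLOON_BATCH_MAX_PAGES, the value assigned to batch_max_pages in
 * vmballoon_reset(). Zeroing it also keeps the reserved bits of unused
 * entries at 0, as required by the batch entry layout described above.
 */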

/*
 * Receive a notification and resize the balloon.
 */
static void vmballoon_doorbell(void *client_data)
{
	struct vmballoon *b = client_data;

	STATS_INC(b->stats.doorbell);

	mod_delayed_work(system_freezable_wq, &b->dwork, 0);
}

/*
 * Clean up the vmci doorbell.
 */
static void vmballoon_vmci_cleanup(struct vmballoon *b)
{
	int error;

	VMWARE_BALLOON_CMD(VMCI_DOORBELL_SET, VMCI_INVALID_ID,
			VMCI_INVALID_ID, error);
	STATS_INC(b->stats.doorbell_unset);

	if (!vmci_handle_is_invalid(b->vmci_doorbell)) {
		vmci_doorbell_destroy(b->vmci_doorbell);
		b->vmci_doorbell = VMCI_INVALID_HANDLE;
	}
}

/*
 * Initialize the vmci doorbell, to get notified as soon as the balloon
 * target changes.
 */
static int vmballoon_vmci_init(struct vmballoon *b)
{
	unsigned long error, dummy;

	if ((b->capabilities & VMW_BALLOON_SIGNALLED_WAKEUP_CMD) == 0)
		return 0;

	error = vmci_doorbell_create(&b->vmci_doorbell, VMCI_FLAG_DELAYED_CB,
				     VMCI_PRIVILEGE_FLAG_RESTRICTED,
				     vmballoon_doorbell, b);

	if (error != VMCI_SUCCESS)
		goto fail;

	error = VMWARE_BALLOON_CMD(VMCI_DOORBELL_SET, b->vmci_doorbell.context,
				   b->vmci_doorbell.resource, dummy);

	STATS_INC(b->stats.doorbell_set);

	if (error != VMW_BALLOON_SUCCESS)
		goto fail;

	return 0;
fail:
	vmballoon_vmci_cleanup(b);
	return -EIO;
}

/*
 * Perform the standard reset sequence by popping the balloon (in case it
 * is not empty) and then restarting the protocol. This operation normally
 * happens when the host responds with VMW_BALLOON_ERROR_RESET to a command.
 */
static void vmballoon_reset(struct vmballoon *b)
{
	int error;

	vmballoon_vmci_cleanup(b);

	/* free all pages, skipping monitor unlock */
	vmballoon_pop(b);

	if (!vmballoon_send_start(b, VMW_BALLOON_CAPABILITIES))
		return;

	if ((b->capabilities & VMW_BALLOON_BATCHED_CMDS) != 0) {
		b->ops = &vmballoon_batched_ops;
		b->batch_max_pages = VMW_BALLOON_BATCH_MAX_PAGES;
		if (!vmballoon_init_batching(b)) {
			/*
			 * We failed to initialize batching; inform the monitor
			 * about it by sending a null capability.
			 *
			 * The guest will retry in one second.
			 */
			vmballoon_send_start(b, 0);
			return;
		}
	} else if ((b->capabilities & VMW_BALLOON_BASIC_CMDS) != 0) {
		b->ops = &vmballoon_basic_ops;
		b->batch_max_pages = 1;
	}

	b->reset_required = false;

	error = vmballoon_vmci_init(b);
	if (error)
		pr_err("failed to initialize vmci doorbell\n");

	if (!vmballoon_send_guest_id(b))
		pr_err("failed to send guest ID to the host\n");
}

/*
 * Balloon work function: reset the protocol, if needed, get the new
 * target and adjust the balloon as needed. Repeat every second.
 */
static void vmballoon_work(struct work_struct *work)
{
	struct delayed_work *dwork = to_delayed_work(work);
	struct vmballoon *b = container_of(dwork, struct vmballoon, dwork);
	unsigned int target;

	STATS_INC(b->stats.timer);

	if (b->reset_required)
		vmballoon_reset(b);

	if (!b->reset_required && vmballoon_send_get_target(b, &target)) {
		/* update target, adjust size */
		b->target = target;

		if (b->size < target)
			vmballoon_inflate(b);
		else if (target == 0 ||
				b->size > target + vmballoon_page_size(true))
			vmballoon_deflate(b);
	}

	/*
	 * We are using a freezable workqueue so that balloon operations are
	 * stopped while the system transitions to/from sleep/hibernation.
	 */
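	/*
	 * round_jiffies_relative(HZ) rounds the ~1s delay up to a whole
	 * second boundary, letting this timer expire together with other
	 * rounded timers and so reducing the number of wakeups.
	 */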
	queue_delayed_work(system_freezable_wq,
			   dwork, round_jiffies_relative(HZ));
}

/*
 * DEBUGFS Interface
 */
#ifdef CONFIG_DEBUG_FS

static int vmballoon_debug_show(struct seq_file *f, void *offset)
{
	struct vmballoon *b = f->private;
	struct vmballoon_stats *stats = &b->stats;

	/* format capabilities info */
	seq_printf(f,
		   "balloon capabilities:   %#4x\n"
		   "used capabilities:      %#4lx\n"
		   "is resetting:           %c\n",
		   VMW_BALLOON_CAPABILITIES, b->capabilities,
		   b->reset_required ? 'y' : 'n');

	/* format size info */
	seq_printf(f,
		   "target:             %8d pages\n"
		   "current:            %8d pages\n",
		   b->target, b->size);

	seq_printf(f,
		   "\n"
		   "timer:              %8u\n"
		   "doorbell:           %8u\n"
		   "start:              %8u (%4u failed)\n"
		   "guestType:          %8u (%4u failed)\n"
		   "2m-lock:            %8u (%4u failed)\n"
		   "lock:               %8u (%4u failed)\n"
		   "2m-unlock:          %8u (%4u failed)\n"
		   "unlock:             %8u (%4u failed)\n"
		   "target:             %8u (%4u failed)\n"
		   "prim2mAlloc:        %8u (%4u failed)\n"
		   "primNoSleepAlloc:   %8u (%4u failed)\n"
		   "primCanSleepAlloc:  %8u (%4u failed)\n"
		   "prim2mFree:         %8u\n"
		   "primFree:           %8u\n"
		   "err2mAlloc:         %8u\n"
		   "errAlloc:           %8u\n"
		   "err2mFree:          %8u\n"
		   "errFree:            %8u\n"
		   "doorbellSet:        %8u\n"
		   "doorbellUnset:      %8u\n",
		   stats->timer,
		   stats->doorbell,
		   stats->start, stats->start_fail,
		   stats->guest_type, stats->guest_type_fail,
		   stats->lock[true],  stats->lock_fail[true],
		   stats->lock[false],  stats->lock_fail[false],
		   stats->unlock[true], stats->unlock_fail[true],
		   stats->unlock[false], stats->unlock_fail[false],
		   stats->target, stats->target_fail,
		   stats->alloc[true], stats->alloc_fail[true],
		   stats->alloc[false], stats->alloc_fail[false],
		   stats->sleep_alloc, stats->sleep_alloc_fail,
		   stats->free[true],
		   stats->free[false],
		   stats->refused_alloc[true], stats->refused_alloc[false],
		   stats->refused_free[true], stats->refused_free[false],
		   stats->doorbell_set, stats->doorbell_unset);

	return 0;
}

static int vmballoon_debug_open(struct inode *inode, struct file *file)
{
	return single_open(file, vmballoon_debug_show, inode->i_private);
}

static const struct file_operations vmballoon_debug_fops = {
	.owner		= THIS_MODULE,
	.open		= vmballoon_debug_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};

static int __init vmballoon_debugfs_init(struct vmballoon *b)
{
	int error;

	b->dbg_entry = debugfs_create_file("vmmemctl", S_IRUGO, NULL, b,
					   &vmballoon_debug_fops);
	if (IS_ERR(b->dbg_entry)) {
		error = PTR_ERR(b->dbg_entry);
		pr_err("failed to create debugfs entry, error: %d\n", error);
		return error;
	}

	return 0;
}

static void __exit vmballoon_debugfs_exit(struct vmballoon *b)
{
	debugfs_remove(b->dbg_entry);
}

#else

static inline int vmballoon_debugfs_init(struct vmballoon *b)
{
	return 0;
}

static inline void vmballoon_debugfs_exit(struct vmballoon *b)
{
}

#endif	/* CONFIG_DEBUG_FS */

static int __init vmballoon_init(void)
{
	int error;
	unsigned is_2m_pages;

	/*
	 * Check if we are running on VMware's hypervisor and bail out
	 * if we are not.
	 */
	if (x86_hyper_type != X86_HYPER_VMWARE)
		return -ENODEV;

	for (is_2m_pages = 0; is_2m_pages < VMW_BALLOON_NUM_PAGE_SIZES;
			is_2m_pages++) {
		INIT_LIST_HEAD(&balloon.page_sizes[is_2m_pages].pages);
		INIT_LIST_HEAD(&balloon.page_sizes[is_2m_pages].refused_pages);
	}

	INIT_DELAYED_WORK(&balloon.dwork, vmballoon_work);

	error = vmballoon_debugfs_init(&balloon);
	if (error)
		return error;

	balloon.vmci_doorbell = VMCI_INVALID_HANDLE;
	balloon.batch_page = NULL;
	balloon.page = NULL;
	balloon.reset_required = true;

	queue_delayed_work(system_freezable_wq, &balloon.dwork, 0);

	return 0;
}

/*
 * Using late_initcall() instead of module_init() allows the balloon to use the
 * VMCI doorbell even when the balloon is built into the kernel. Otherwise the
 * VMCI is probed only after the balloon is initialized. If the balloon is used
 * as a module, late_initcall() is equivalent to module_init().
 */
late_initcall(vmballoon_init);

static void __exit vmballoon_exit(void)
{
	vmballoon_vmci_cleanup(&balloon);
	cancel_delayed_work_sync(&balloon.dwork);

	vmballoon_debugfs_exit(&balloon);

	/*
	 * Deallocate all reserved memory, and reset the connection with the
	 * monitor. Reset the connection before deallocating memory to avoid
	 * the potential for additional spurious resets from the guest touching
	 * deallocated pages.
	 */
	vmballoon_send_start(&balloon, 0);
	vmballoon_pop(&balloon);
}
module_exit(vmballoon_exit);
1237