/* xref: /openbmc/qemu/hw/hyperv/hv-balloon.c (revision 6233759a) */
/*
 * QEMU Hyper-V Dynamic Memory Protocol driver
 *
 * Copyright (C) 2020-2023 Oracle and/or its affiliates.
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */

#include "hv-balloon-internal.h"

#include "exec/address-spaces.h"
#include "exec/cpu-common.h"
#include "exec/ramblock.h"
#include "hw/boards.h"
#include "hw/hyperv/dynmem-proto.h"
#include "hw/hyperv/hv-balloon.h"
#include "hw/hyperv/vmbus.h"
#include "hw/mem/memory-device.h"
#include "hw/mem/pc-dimm.h"
#include "hw/qdev-core.h"
#include "hw/qdev-properties.h"
#include "monitor/qdev.h"
#include "qapi/error.h"
#include "qapi/qapi-commands-machine.h"
#include "qapi/qapi-events-machine.h"
#include "qapi/qapi-types-machine.h"
#include "qapi/qmp/qdict.h"
#include "qapi/visitor.h"
#include "qemu/error-report.h"
#include "qemu/module.h"
#include "qemu/units.h"
#include "qemu/timer.h"
#include "sysemu/balloon.h"
#include "sysemu/hostmem.h"
#include "sysemu/reset.h"
#include "hv-balloon-our_range_memslots.h"
#include "hv-balloon-page_range_tree.h"
#include "trace.h"

#define HV_BALLOON_ADDR_PROP "addr"
#define HV_BALLOON_MEMDEV_PROP "memdev"
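/* VMBus class ID that the guest's Dynamic Memory protocol driver matches on */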
#define HV_BALLOON_GUID "525074DC-8985-46e2-8057-A307DC18A502"

/*
 * Some Windows versions (at least Server 2019) will crash with various
 * error codes when receiving DM protocol requests (at least
 * DM_MEM_HOT_ADD_REQUEST) immediately after boot.
 *
 * It looks like Hyper-V from Server 2016 uses a 50-second after-boot
 * delay, probably to work around this issue, so we'll use this value, too.
 */
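/* value in milliseconds: the post-init timer runs on QEMU_CLOCK_VIRTUAL in ms */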
#define HV_BALLOON_POST_INIT_WAIT (50 * 1000)

#define HV_BALLOON_HA_CHUNK_SIZE (2 * GiB)
#define HV_BALLOON_HA_CHUNK_PAGES (HV_BALLOON_HA_CHUNK_SIZE / HV_BALLOON_PAGE_SIZE)

#define HV_BALLOON_HA_MEMSLOT_SIZE_ALIGN (128 * MiB)

/*
 * The maximum number of pages that Windows returns in one hot-remove
 * response.
 *
 * If the number requested is too high, Windows will no longer honor
 * these requests.
 */
#define HV_BALLOON_HR_CHUNK_PAGES 585728
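/*
 * For reference, assuming the usual 4 KiB HV_BALLOON_PAGE_SIZE:
 * 585728 pages * 4 KiB/page = 2288 MiB, slightly above the 2 GiB (2048 MiB)
 * HV_BALLOON_HA_CHUNK_SIZE used when hot-adding memory.
 */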

typedef struct HvBalloonClass {
    VMBusDeviceClass parent_class;
} HvBalloonClass;

typedef enum State {
    /* not a real state */
    S_NO_CHANGE = 0,

    S_WAIT_RESET,
    S_POST_RESET_CLOSED,

    /* init flow */
    S_VERSION,
    S_CAPS,
    S_POST_INIT_WAIT,

    S_IDLE,

    /* balloon op flow */
    S_BALLOON_POSTING,
    S_BALLOON_RB_WAIT,
    S_BALLOON_REPLY_WAIT,

    /* unballoon + hot add ops flow */
    S_UNBALLOON_POSTING,
    S_UNBALLOON_RB_WAIT,
    S_UNBALLOON_REPLY_WAIT,
    S_HOT_ADD_SETUP,
    S_HOT_ADD_RB_WAIT,
    S_HOT_ADD_POSTING,
    S_HOT_ADD_REPLY_WAIT,
} State;

typedef struct StateDesc {
    State state;
    const char *desc;
} StateDesc;

typedef struct HvBalloon {
    VMBusDevice parent;
    State state;

    union dm_version version;
    union dm_caps caps;

    QEMUTimer post_init_timer;

    unsigned int trans_id;

    struct {
        bool enabled;
        bool received;
        uint64_t committed;
        uint64_t available;
    } status_report;

    /* Guest target size */
    uint64_t target;
    bool target_changed;

    /* Current (un)balloon / hot-add operation parameters */
    union {
        uint64_t balloon_diff;

        struct {
            uint64_t unballoon_diff;
            uint64_t hot_add_diff;
        };

        struct {
            PageRange hot_add_range;
            uint64_t ha_current_count;
        };
    };

    OurRangeMemslots *our_range;

    /* Count of memslots covering our memory */
    unsigned int memslot_count;

    /* Nominal size of each memslot (the last one might be smaller) */
    uint64_t memslot_size;

    /* Non-ours removed memory */
    PageRangeTree removed_guest, removed_both;

    /* Grand totals of removed memory (both ours and non-ours) */
    uint64_t removed_guest_ctr, removed_both_ctr;

    /* MEMORY_DEVICE props */
    uint64_t addr;
    HostMemoryBackend *hostmem;
    MemoryRegion *mr;
} HvBalloon;

OBJECT_DEFINE_TYPE_WITH_INTERFACES(HvBalloon, hv_balloon, HV_BALLOON, VMBUS_DEVICE, \
                                   { TYPE_MEMORY_DEVICE }, { })

#define HV_BALLOON_SET_STATE(hvb, news)             \
    do {                                            \
        assert(news != S_NO_CHANGE);                \
        hv_balloon_state_set(hvb, news, # news);    \
    } while (0)

#define HV_BALLOON_STATE_DESC_SET(stdesc, news)         \
    _hv_balloon_state_desc_set(stdesc, news, # news)

#define HV_BALLOON_STATE_DESC_INIT \
    {                              \
        .state = S_NO_CHANGE,      \
    }

typedef struct HvBalloonReq {
    VMBusChanReq vmreq;
} HvBalloonReq;

/* the total of "our" memory includes parts currently removed from the guest */
static uint64_t hv_balloon_total_our_ram(HvBalloon *balloon)
{
    if (!balloon->our_range) {
        return 0;
    }

    return balloon->our_range->range.added;
}

/* TODO: unify the code below with virtio-balloon and cache the value */
static int build_dimm_list(Object *obj, void *opaque)
{
    GSList **list = opaque;

    if (object_dynamic_cast(obj, TYPE_PC_DIMM)) {
        DeviceState *dev = DEVICE(obj);
        if (dev->realized) { /* only realized DIMMs matter */
            *list = g_slist_prepend(*list, dev);
        }
    }

    object_child_foreach(obj, build_dimm_list, opaque);
    return 0;
}

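/* Base machine RAM size plus the sizes of all realized PC-DIMM devices */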
static ram_addr_t get_current_ram_size(void)
{
    GSList *list = NULL, *item;
    ram_addr_t size = current_machine->ram_size;

    build_dimm_list(qdev_get_machine(), &list);
    for (item = list; item; item = g_slist_next(item)) {
        Object *obj = OBJECT(item->data);
        if (!strcmp(object_get_typename(obj), TYPE_PC_DIMM)) {
            size += object_property_get_int(obj, PC_DIMM_SIZE_PROP,
                                            &error_abort);
        }
    }
    g_slist_free(list);

    return size;
}

/* total RAM includes memory currently removed from the guest */
static uint64_t hv_balloon_total_ram(HvBalloon *balloon)
{
    ram_addr_t ram_size = get_current_ram_size();
    uint64_t ram_size_pages = ram_size >> HV_BALLOON_PFN_SHIFT;
    uint64_t our_ram_size_pages = hv_balloon_total_our_ram(balloon);

    assert(ram_size_pages > 0);

    return SUM_SATURATE_U64(ram_size_pages, our_ram_size_pages);
}

/*
 * Calculating the total RAM size is a slow operation,
 * so avoid it as much as possible.
 */
static uint64_t hv_balloon_total_removed_rs(HvBalloon *balloon,
                                            uint64_t ram_size_pages)
{
    uint64_t total_removed;

    total_removed = SUM_SATURATE_U64(balloon->removed_guest_ctr,
                                     balloon->removed_both_ctr);

    /* possible if guest returns pages outside actual RAM */
    if (total_removed > ram_size_pages) {
        total_removed = ram_size_pages;
    }

    return total_removed;
}

/* Returns whether the state has actually changed */
static bool hv_balloon_state_set(HvBalloon *balloon,
                                 State newst, const char *newststr)
{
    if (newst == S_NO_CHANGE || balloon->state == newst) {
        return false;
    }

    balloon->state = newst;
    trace_hv_balloon_state_change(newststr);
    return true;
}

static void _hv_balloon_state_desc_set(StateDesc *stdesc,
                                       State newst, const char *newststr)
{
    /* state setting is only permitted on a freshly initialized desc */
    assert(stdesc->state == S_NO_CHANGE);

    assert(newst != S_NO_CHANGE);

    stdesc->state = newst;
    stdesc->desc = newststr;
}

static VMBusChannel *hv_balloon_get_channel_maybe(HvBalloon *balloon)
{
    return vmbus_device_channel(&balloon->parent, 0);
}

static VMBusChannel *hv_balloon_get_channel(HvBalloon *balloon)
{
    VMBusChannel *chan;

    chan = hv_balloon_get_channel_maybe(balloon);
    assert(chan != NULL);
    return chan;
}

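/*
 * Reserve ring-buffer space for the message, then send it to the guest as
 * an in-band VMBus packet; returns a negative value if the reservation
 * fails, otherwise the result of the send itself.
 */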
static ssize_t hv_balloon_send_packet(VMBusChannel *chan,
                                      struct dm_message *msg)
{
    int ret;

    ret = vmbus_channel_reserve(chan, 0, msg->hdr.size);
    if (ret < 0) {
        return ret;
    }

    return vmbus_channel_send(chan, VMBUS_PACKET_DATA_INBAND,
                              NULL, 0, msg, msg->hdr.size, false,
                              msg->hdr.trans_id);
}

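/*
 * Pick the page-range tree to unballoon from, preferring memory removed
 * from the boot RAM over memory removed from "our" (hot-added) range and,
 * within each, pages removed from the guest's view only over pages removed
 * from both the guest's and QEMU's view.
 */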
static bool hv_balloon_unballoon_get_source(HvBalloon *balloon,
                                            PageRangeTree *dtree,
                                            uint64_t **dctr,
                                            bool *is_our_range)
{
    OurRange *our_range = OUR_RANGE(balloon->our_range);

    /* Try the boot memory first */
    if (g_tree_nnodes(balloon->removed_guest.t) > 0) {
        *dtree = balloon->removed_guest;
        *dctr = &balloon->removed_guest_ctr;
        *is_our_range = false;
    } else if (g_tree_nnodes(balloon->removed_both.t) > 0) {
        *dtree = balloon->removed_both;
        *dctr = &balloon->removed_both_ctr;
        *is_our_range = false;
    } else if (!our_range) {
        return false;
    } else if (!our_range_is_removed_tree_empty(our_range, false)) {
        *dtree = our_range_get_removed_tree(our_range, false);
        *dctr = &balloon->removed_guest_ctr;
        *is_our_range = true;
    } else if (!our_range_is_removed_tree_empty(our_range, true)) {
        *dtree = our_range_get_removed_tree(our_range, true);
        *dctr = &balloon->removed_both_ctr;
        *is_our_range = true;
    } else {
        return false;
    }

    return true;
}

static void hv_balloon_unballoon_rb_wait(HvBalloon *balloon, StateDesc *stdesc)
{
    VMBusChannel *chan = hv_balloon_get_channel(balloon);
    struct dm_unballoon_request *ur;
    size_t ur_size = sizeof(*ur) + sizeof(ur->range_array[0]);

    assert(balloon->state == S_UNBALLOON_RB_WAIT);

    if (vmbus_channel_reserve(chan, 0, ur_size) < 0) {
        return;
    }

    HV_BALLOON_STATE_DESC_SET(stdesc, S_UNBALLOON_POSTING);
}

static void hv_balloon_unballoon_posting(HvBalloon *balloon, StateDesc *stdesc)
{
    VMBusChannel *chan = hv_balloon_get_channel(balloon);
    PageRangeTree dtree;
    uint64_t *dctr;
    bool our_range;
    struct dm_unballoon_request *ur;
    size_t ur_size = sizeof(*ur) + sizeof(ur->range_array[0]);
    PageRange range;
    bool bret;
    ssize_t ret;

    assert(balloon->state == S_UNBALLOON_POSTING);
    assert(balloon->unballoon_diff > 0);

    if (!hv_balloon_unballoon_get_source(balloon, &dtree, &dctr, &our_range)) {
        error_report("trying to unballoon but nothing seems to be ballooned");
        /*
         * there is little we can do as we might have already
         * sent the guest a partial request we can't cancel
         */
        return;
    }

    assert(balloon->our_range || !our_range);
    assert(dtree.t);
    assert(dctr);

    ur = alloca(ur_size);
    memset(ur, 0, ur_size);
    ur->hdr.type = DM_UNBALLOON_REQUEST;
    ur->hdr.size = ur_size;
    ur->hdr.trans_id = balloon->trans_id;

    bret = hvb_page_range_tree_pop(dtree, &range, MIN(balloon->unballoon_diff,
                                                      HV_BALLOON_HA_CHUNK_PAGES));
    assert(bret);
    /* TODO: madvise? */

    *dctr -= range.count;
    balloon->unballoon_diff -= range.count;

    ur->range_count = 1;
    ur->range_array[0].finfo.start_page = range.start;
    ur->range_array[0].finfo.page_cnt = range.count;
    ur->more_pages = balloon->unballoon_diff > 0;

    trace_hv_balloon_outgoing_unballoon(ur->hdr.trans_id,
                                        range.count, range.start,
                                        balloon->unballoon_diff);

    if (ur->more_pages) {
        HV_BALLOON_STATE_DESC_SET(stdesc, S_UNBALLOON_RB_WAIT);
    } else {
        HV_BALLOON_STATE_DESC_SET(stdesc, S_UNBALLOON_REPLY_WAIT);
    }

    ret = vmbus_channel_send(chan, VMBUS_PACKET_DATA_INBAND,
                             NULL, 0, ur, ur_size, false,
                             ur->hdr.trans_id);
    if (ret <= 0) {
        error_report("error %zd when posting unballoon msg, expect problems",
                     ret);
    }
}

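/*
 * Lazily create the memslot-backed "our range" covering the memory backend;
 * fails if there is no backend or if the guest has already returned pages
 * that would lie inside the new range.
 */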
static bool hv_balloon_our_range_ensure(HvBalloon *balloon)
{
    uint64_t align;
    MemoryRegion *hostmem_mr;
    g_autoptr(OurRangeMemslots) our_range_memslots = NULL;
    OurRange *our_range;

    if (balloon->our_range) {
        return true;
    }

    if (!balloon->hostmem) {
        return false;
    }

    align = (1 << balloon->caps.cap_bits.hot_add_alignment) * MiB;
    assert(QEMU_IS_ALIGNED(balloon->addr, align));

    hostmem_mr = host_memory_backend_get_memory(balloon->hostmem);

    our_range_memslots = hvb_our_range_memslots_new(balloon->addr,
                                                    balloon->mr, hostmem_mr,
                                                    OBJECT(balloon),
                                                    balloon->memslot_count,
                                                    balloon->memslot_size);
    our_range = OUR_RANGE(our_range_memslots);

    if (hvb_page_range_tree_intree_any(balloon->removed_guest,
                                       our_range->range.start,
                                       our_range->range.count) ||
        hvb_page_range_tree_intree_any(balloon->removed_both,
                                       our_range->range.start,
                                       our_range->range.count)) {
        error_report("some parts of the memory backend were already returned by the guest. this should not happen, please reboot the guest and try again");
        return false;
    }

    trace_hv_balloon_our_range_add(our_range->range.count,
                                   our_range->range.start);

    balloon->our_range = g_steal_pointer(&our_range_memslots);
    return true;
}

static void hv_balloon_hot_add_setup(HvBalloon *balloon, StateDesc *stdesc)
{
    /* need to make a copy since it is in a union with hot_add_range */
    uint64_t hot_add_diff = balloon->hot_add_diff;
    PageRange *hot_add_range = &balloon->hot_add_range;
    uint64_t align, our_range_remaining;
    OurRange *our_range;

    assert(balloon->state == S_HOT_ADD_SETUP);
    assert(hot_add_diff > 0);

    if (!hv_balloon_our_range_ensure(balloon)) {
        goto ret_idle;
    }

    our_range = OUR_RANGE(balloon->our_range);

    align = (1 << balloon->caps.cap_bits.hot_add_alignment) *
        (MiB / HV_BALLOON_PAGE_SIZE);

    /* Absolute GPA in pages */
    hot_add_range->start = our_range_get_remaining_start(our_range);
    assert(QEMU_IS_ALIGNED(hot_add_range->start, align));

    our_range_remaining = our_range_get_remaining_size(our_range);
    hot_add_range->count = MIN(our_range_remaining, hot_add_diff);
    hot_add_range->count = QEMU_ALIGN_DOWN(hot_add_range->count, align);
    if (hot_add_range->count == 0) {
        goto ret_idle;
    }

    hvb_our_range_memslots_ensure_mapped_additional(balloon->our_range,
                                                    hot_add_range->count);

    HV_BALLOON_STATE_DESC_SET(stdesc, S_HOT_ADD_RB_WAIT);
    return;

ret_idle:
    HV_BALLOON_STATE_DESC_SET(stdesc, S_IDLE);
}

static void hv_balloon_hot_add_rb_wait(HvBalloon *balloon, StateDesc *stdesc)
{
    VMBusChannel *chan = hv_balloon_get_channel(balloon);
    struct dm_hot_add *ha;
    size_t ha_size = sizeof(*ha) + sizeof(ha->range);

    assert(balloon->state == S_HOT_ADD_RB_WAIT);

    if (vmbus_channel_reserve(chan, 0, ha_size) < 0) {
        return;
    }

    HV_BALLOON_STATE_DESC_SET(stdesc, S_HOT_ADD_POSTING);
}

static void hv_balloon_hot_add_posting(HvBalloon *balloon, StateDesc *stdesc)
{
    PageRange *hot_add_range = &balloon->hot_add_range;
    uint64_t *current_count = &balloon->ha_current_count;
    VMBusChannel *chan = hv_balloon_get_channel(balloon);
    struct dm_hot_add *ha;
    size_t ha_size = sizeof(*ha) + sizeof(ha->range);
    union dm_mem_page_range *ha_region;
    uint64_t align, chunk_max_size;
    ssize_t ret;

    assert(balloon->state == S_HOT_ADD_POSTING);
    assert(hot_add_range->count > 0);

    align = (1 << balloon->caps.cap_bits.hot_add_alignment) *
        (MiB / HV_BALLOON_PAGE_SIZE);
    if (align >= HV_BALLOON_HA_CHUNK_PAGES) {
        /*
         * If the required alignment is higher than the chunk size, we let
         * it override that size.
         */
        chunk_max_size = align;
    } else {
        chunk_max_size = QEMU_ALIGN_DOWN(HV_BALLOON_HA_CHUNK_PAGES, align);
    }

    /*
     * hot_add_range->count starts aligned in hv_balloon_hot_add_setup(),
     * then it is either reduced by subtracting aligned current_count or
     * further hot-adds are prevented by marking the whole remaining our range
     * as unusable in hv_balloon_handle_hot_add_response().
     */
    *current_count = MIN(hot_add_range->count, chunk_max_size);

    ha = alloca(ha_size);
    ha_region = &(&ha->range)[1];
    memset(ha, 0, ha_size);
    ha->hdr.type = DM_MEM_HOT_ADD_REQUEST;
    ha->hdr.size = ha_size;
    ha->hdr.trans_id = balloon->trans_id;

    ha->range.finfo.start_page = hot_add_range->start;
    ha->range.finfo.page_cnt = *current_count;
    ha_region->finfo.start_page = hot_add_range->start;
    ha_region->finfo.page_cnt = ha->range.finfo.page_cnt;

    trace_hv_balloon_outgoing_hot_add(ha->hdr.trans_id,
                                      *current_count, hot_add_range->start);

    ret = vmbus_channel_send(chan, VMBUS_PACKET_DATA_INBAND,
                             NULL, 0, ha, ha_size, false,
                             ha->hdr.trans_id);
    if (ret <= 0) {
        error_report("error %zd when posting hot add msg, expect problems",
                     ret);
    }

    HV_BALLOON_STATE_DESC_SET(stdesc, S_HOT_ADD_REPLY_WAIT);
}

static void hv_balloon_balloon_rb_wait(HvBalloon *balloon, StateDesc *stdesc)
{
    VMBusChannel *chan = hv_balloon_get_channel(balloon);
    size_t bl_size = sizeof(struct dm_balloon);

    assert(balloon->state == S_BALLOON_RB_WAIT);

    if (vmbus_channel_reserve(chan, 0, bl_size) < 0) {
        return;
    }

    HV_BALLOON_STATE_DESC_SET(stdesc, S_BALLOON_POSTING);
}

static void hv_balloon_balloon_posting(HvBalloon *balloon, StateDesc *stdesc)
{
    VMBusChannel *chan = hv_balloon_get_channel(balloon);
    struct dm_balloon bl;
    size_t bl_size = sizeof(bl);
    ssize_t ret;

    assert(balloon->state == S_BALLOON_POSTING);
    assert(balloon->balloon_diff > 0);

    memset(&bl, 0, sizeof(bl));
    bl.hdr.type = DM_BALLOON_REQUEST;
    bl.hdr.size = bl_size;
    bl.hdr.trans_id = balloon->trans_id;
    bl.num_pages = MIN(balloon->balloon_diff, HV_BALLOON_HR_CHUNK_PAGES);

    trace_hv_balloon_outgoing_balloon(bl.hdr.trans_id, bl.num_pages,
                                      balloon->balloon_diff);

    ret = vmbus_channel_send(chan, VMBUS_PACKET_DATA_INBAND,
                             NULL, 0, &bl, bl_size, false,
                             bl.hdr.trans_id);
    if (ret <= 0) {
        error_report("error %zd when posting balloon msg, expect problems",
                     ret);
    }

    HV_BALLOON_STATE_DESC_SET(stdesc, S_BALLOON_REPLY_WAIT);
}

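/*
 * Work out the deltas for a new target: first plan to unballoon previously
 * removed pages, then hot-add the remainder (if the guest supports hot add);
 * if the target is below the current effective RAM size, plan a balloon
 * operation instead.
 */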
static void hv_balloon_idle_state_process_target(HvBalloon *balloon,
                                                 StateDesc *stdesc)
{
    bool can_balloon = balloon->caps.cap_bits.balloon;
    uint64_t ram_size_pages, total_removed;

    ram_size_pages = hv_balloon_total_ram(balloon);
    total_removed = hv_balloon_total_removed_rs(balloon, ram_size_pages);

    /*
     * We need to cache the values computed from the balloon target when
     * starting the adjustment procedure, in case someone changes the target
     * while the procedure is in progress.
     */
    if (balloon->target > ram_size_pages - total_removed) {
        bool can_hot_add = balloon->caps.cap_bits.hot_add;
        uint64_t target_diff = balloon->target -
            (ram_size_pages - total_removed);

        balloon->unballoon_diff = MIN(target_diff, total_removed);

        if (can_hot_add) {
            balloon->hot_add_diff = target_diff - balloon->unballoon_diff;
        } else {
            balloon->hot_add_diff = 0;
        }

        if (balloon->unballoon_diff > 0) {
            assert(can_balloon);
            HV_BALLOON_STATE_DESC_SET(stdesc, S_UNBALLOON_RB_WAIT);
        } else if (balloon->hot_add_diff > 0) {
            HV_BALLOON_STATE_DESC_SET(stdesc, S_HOT_ADD_SETUP);
        }
    } else if (can_balloon &&
               balloon->target < ram_size_pages - total_removed) {
        balloon->balloon_diff = ram_size_pages - total_removed -
            balloon->target;
        HV_BALLOON_STATE_DESC_SET(stdesc, S_BALLOON_RB_WAIT);
    }
}

static void hv_balloon_idle_state(HvBalloon *balloon,
                                  StateDesc *stdesc)
{
    assert(balloon->state == S_IDLE);

    if (balloon->target_changed) {
        balloon->target_changed = false;
        hv_balloon_idle_state_process_target(balloon, stdesc);
        return;
    }
}

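/*
 * Only the states that need active driving have a handler here; the reset,
 * init-flow and *_REPLY_WAIT states are advanced by the message handlers or
 * by the post-init timer instead.
 */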
static const struct {
    void (*handler)(HvBalloon *balloon, StateDesc *stdesc);
} state_handlers[] = {
    [S_IDLE].handler = hv_balloon_idle_state,
    [S_BALLOON_POSTING].handler = hv_balloon_balloon_posting,
    [S_BALLOON_RB_WAIT].handler = hv_balloon_balloon_rb_wait,
    [S_UNBALLOON_POSTING].handler = hv_balloon_unballoon_posting,
    [S_UNBALLOON_RB_WAIT].handler = hv_balloon_unballoon_rb_wait,
    [S_HOT_ADD_SETUP].handler = hv_balloon_hot_add_setup,
    [S_HOT_ADD_RB_WAIT].handler = hv_balloon_hot_add_rb_wait,
    [S_HOT_ADD_POSTING].handler = hv_balloon_hot_add_posting,
};

static void hv_balloon_handle_state(HvBalloon *balloon, StateDesc *stdesc)
{
    if (balloon->state >= ARRAY_SIZE(state_handlers) ||
        !state_handlers[balloon->state].handler) {
        return;
    }

    state_handlers[balloon->state].handler(balloon, stdesc);
}

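/*
 * Insert the range into the given tree, crediting the counters only with
 * the pages that were not already present there.
 */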
static void hv_balloon_remove_response_insert_range(PageRangeTree tree,
                                                    const PageRange *range,
                                                    uint64_t *ctr1,
                                                    uint64_t *ctr2,
                                                    uint64_t *ctr3)
{
    uint64_t dupcount, effcount;

    if (range->count == 0) {
        return;
    }

    dupcount = 0;
    hvb_page_range_tree_insert(tree, range->start, range->count, &dupcount);

    assert(dupcount <= range->count);
    effcount = range->count - dupcount;

    *ctr1 += effcount;
    *ctr2 += effcount;
    if (ctr3) {
        *ctr3 += effcount;
    }
}

static void hv_balloon_remove_response_handle_range(HvBalloon *balloon,
                                                    PageRange *range,
                                                    bool both,
                                                    uint64_t *removedctr)
{
    OurRange *our_range = OUR_RANGE(balloon->our_range);
    PageRangeTree globaltree =
        both ? balloon->removed_both : balloon->removed_guest;
    uint64_t *globalctr =
        both ? &balloon->removed_both_ctr : &balloon->removed_guest_ctr;
    PageRange rangeeff;

    if (range->count == 0) {
        return;
    }

    trace_hv_balloon_remove_response(range->count, range->start, both);

    if (our_range) {
        /* Includes the not-yet-hot-added and unusable parts. */
        rangeeff = our_range->range;
    } else {
        rangeeff.start = rangeeff.count = 0;
    }

    if (page_range_intersection_size(range, rangeeff.start, rangeeff.count) > 0) {
        PageRangeTree ourtree = our_range_get_removed_tree(our_range, both);
        PageRange rangehole, rangecommon;
        uint64_t ourremoved = 0;

        /* process the hole before our range, if it exists */
        page_range_part_before(range, rangeeff.start, &rangehole);
        hv_balloon_remove_response_insert_range(globaltree, &rangehole,
                                                globalctr, removedctr, NULL);
        if (rangehole.count > 0) {
            trace_hv_balloon_remove_response_hole(rangehole.count,
                                                  rangehole.start,
                                                  range->count, range->start,
                                                  rangeeff.start, both);
        }

        /* process our part */
        page_range_intersect(range, rangeeff.start, rangeeff.count,
                             &rangecommon);
        hv_balloon_remove_response_insert_range(ourtree, &rangecommon,
                                                globalctr, removedctr,
                                                &ourremoved);
        if (rangecommon.count > 0) {
            trace_hv_balloon_remove_response_common(rangecommon.count,
                                                    rangecommon.start,
                                                    range->count, range->start,
                                                    rangeeff.count,
                                                    rangeeff.start, ourremoved,
                                                    both);
        }

        /* calculate what's left after our range */
        rangecommon = *range;
        page_range_part_after(&rangecommon, rangeeff.start, rangeeff.count,
                              range);
    }

    /* process the remainder of the range that lies after our range */
    if (range->count > 0) {
        hv_balloon_remove_response_insert_range(globaltree, range,
                                                globalctr, removedctr, NULL);
        trace_hv_balloon_remove_response_remainder(range->count, range->start,
                                                   both);
        range->count = 0;
    }
}

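/*
 * Accumulate contiguous removed pages in *range, flushing it into the
 * trees whenever the newly reported pages can't be joined to it.
 */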
static void hv_balloon_remove_response_handle_pages(HvBalloon *balloon,
                                                    PageRange *range,
                                                    uint64_t start,
                                                    uint64_t count,
                                                    bool both,
                                                    uint64_t *removedctr)
{
    assert(count > 0);

    /*
     * If there is an existing range that the new range can't be joined to,
     * dump it into the tree(s).
     */
    if (range->count > 0 && !page_range_joinable(range, start, count)) {
        hv_balloon_remove_response_handle_range(balloon, range, both,
                                                removedctr);
    }

    if (range->count == 0) {
        range->start = start;
        range->count = count;
    } else if (page_range_joinable_left(range, start, count)) {
        range->start = start;
        range->count += count;
    } else { /* page_range_joinable_right() */
        range->count += count;
    }
}

static gboolean hv_balloon_handle_remove_host_addr_node(gpointer key,
                                                        gpointer value,
                                                        gpointer data)
{
    PageRange *range = value;
    uint64_t pageoff;

    for (pageoff = 0; pageoff < range->count; ) {
        uint64_t addr_64 = (range->start + pageoff) * HV_BALLOON_PAGE_SIZE;
        void *addr;
        RAMBlock *rb;
        ram_addr_t rb_offset;
        size_t rb_page_size;
        size_t discard_size;

        assert(addr_64 <= UINTPTR_MAX);
        addr = (void *)((uintptr_t)addr_64);
        rb = qemu_ram_block_from_host(addr, false, &rb_offset);
        rb_page_size = qemu_ram_pagesize(rb);

        if (rb_page_size != HV_BALLOON_PAGE_SIZE) {
            /* TODO: these should end up in "removed_guest" */
            warn_report("guest reported removed page backed by unsupported page size %zu",
                        rb_page_size);
            pageoff++;
            continue;
        }

        discard_size = MIN(range->count - pageoff,
                           (rb->max_length - rb_offset) /
                           HV_BALLOON_PAGE_SIZE);
        discard_size = MAX(discard_size, 1);

        if (ram_block_discard_range(rb, rb_offset, discard_size *
                                    HV_BALLOON_PAGE_SIZE) != 0) {
            warn_report("guest reported removed page failed discard");
        }

        pageoff += discard_size;
    }

    return false;
}

static void hv_balloon_handle_remove_host_addr_tree(PageRangeTree tree)
{
    g_tree_foreach(tree.t, hv_balloon_handle_remove_host_addr_node, NULL);
}

static int hv_balloon_handle_remove_section(PageRangeTree tree,
                                            const MemoryRegionSection *section,
                                            uint64_t count)
{
    void *addr = memory_region_get_ram_ptr(section->mr) +
        section->offset_within_region;
    uint64_t addr_page;

    assert(count > 0);

    if ((uintptr_t)addr % HV_BALLOON_PAGE_SIZE) {
        warn_report("guest reported removed pages at an unaligned host addr %p",
                    addr);
        return -EINVAL;
    }

    addr_page = (uintptr_t)addr / HV_BALLOON_PAGE_SIZE;
    hvb_page_range_tree_insert(tree, addr_page, count, NULL);

    return 0;
}

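/*
 * Process the page ranges the guest reported as removed: classify each page
 * as removed from both QEMU's and the guest's view (ordinary RAM) or from
 * the guest's view only, discard the backing host pages and account the
 * total against the pending balloon_diff.
 */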
static void hv_balloon_handle_remove_ranges(HvBalloon *balloon,
                                            union dm_mem_page_range ranges[],
                                            uint32_t count)
{
    uint64_t removedcnt;
    PageRangeTree removed_host_addr;
    PageRange range_guest, range_both;

    hvb_page_range_tree_init(&removed_host_addr);
    range_guest.count = range_both.count = removedcnt = 0;
    for (unsigned int ctr = 0; ctr < count; ctr++) {
        union dm_mem_page_range *mr = &ranges[ctr];
        hwaddr pa;
        MemoryRegionSection section;

        for (unsigned int offset = 0; offset < mr->finfo.page_cnt; ) {
            int ret;
            uint64_t pageno = mr->finfo.start_page + offset;
            uint64_t pagecnt = 1;

            pa = (hwaddr)pageno << HV_BALLOON_PFN_SHIFT;
            section = memory_region_find(get_system_memory(), pa,
                                         (mr->finfo.page_cnt - offset) *
                                         HV_BALLOON_PAGE_SIZE);
            if (!section.mr) {
                warn_report("guest reported removed page %"PRIu64" not found in RAM",
                            pageno);
                ret = -EINVAL;
                goto finish_page;
            }

            pagecnt = int128_get64(section.size) / HV_BALLOON_PAGE_SIZE;
            if (pagecnt <= 0) {
                warn_report("guest reported removed page %"PRIu64" in a section smaller than page size",
                            pageno);
                pagecnt = 1; /* skip the whole page */
                ret = -EINVAL;
                goto finish_page;
            }

            if (!memory_region_is_ram(section.mr) ||
                memory_region_is_rom(section.mr) ||
                memory_region_is_romd(section.mr)) {
                warn_report("guest reported removed page %"PRIu64" in a section that is not an ordinary RAM",
                            pageno);
                ret = -EINVAL;
                goto finish_page;
            }

            ret = hv_balloon_handle_remove_section(removed_host_addr, &section,
                                                   pagecnt);

        finish_page:
            if (ret == 0) {
                hv_balloon_remove_response_handle_pages(balloon,
                                                        &range_both,
                                                        pageno, pagecnt,
                                                        true, &removedcnt);
            } else {
                hv_balloon_remove_response_handle_pages(balloon,
                                                        &range_guest,
                                                        pageno, pagecnt,
                                                        false, &removedcnt);
            }

            if (section.mr) {
                memory_region_unref(section.mr);
            }

            offset += pagecnt;
        }
    }

    hv_balloon_remove_response_handle_range(balloon, &range_both, true,
                                            &removedcnt);
    hv_balloon_remove_response_handle_range(balloon, &range_guest, false,
                                            &removedcnt);

    hv_balloon_handle_remove_host_addr_tree(removed_host_addr);
    hvb_page_range_tree_destroy(&removed_host_addr);

    if (removedcnt > balloon->balloon_diff) {
        warn_report("guest reported more pages removed than currently pending (%"PRIu64" vs %"PRIu64")",
                    removedcnt, balloon->balloon_diff);
        balloon->balloon_diff = 0;
    } else {
        balloon->balloon_diff -= removedcnt;
    }
}

static bool hv_balloon_handle_msg_size(HvBalloonReq *req, size_t minsize,
                                       const char *msgname)
{
    VMBusChanReq *vmreq = &req->vmreq;
    uint32_t msglen = vmreq->msglen;

    if (msglen >= minsize) {
        return true;
    }

    warn_report("%s message too short (%u vs %zu), ignoring", msgname,
                (unsigned int)msglen, minsize);
    return false;
}

static void hv_balloon_handle_version_request(HvBalloon *balloon,
                                              HvBalloonReq *req,
                                              StateDesc *stdesc)
{
    VMBusChanReq *vmreq = &req->vmreq;
    struct dm_version_request *msgVr = vmreq->msg;
    struct dm_version_response respVr;

    if (balloon->state != S_VERSION) {
        warn_report("unexpected DM_VERSION_REQUEST in %d state",
                    balloon->state);
        return;
    }

    if (!hv_balloon_handle_msg_size(req, sizeof(*msgVr),
                                    "DM_VERSION_REQUEST")) {
        return;
    }

    trace_hv_balloon_incoming_version(msgVr->version.major_version,
                                      msgVr->version.minor_version);

    memset(&respVr, 0, sizeof(respVr));
    respVr.hdr.type = DM_VERSION_RESPONSE;
    respVr.hdr.size = sizeof(respVr);
    respVr.hdr.trans_id = msgVr->hdr.trans_id;
    respVr.is_accepted = msgVr->version.version >= DYNMEM_PROTOCOL_VERSION_1 &&
        msgVr->version.version <= DYNMEM_PROTOCOL_VERSION_3;

    hv_balloon_send_packet(vmreq->chan, (struct dm_message *)&respVr);

    if (respVr.is_accepted) {
        HV_BALLOON_STATE_DESC_SET(stdesc, S_CAPS);
    }
}

static void hv_balloon_handle_caps_report(HvBalloon *balloon,
                                          HvBalloonReq *req,
                                          StateDesc *stdesc)
{
    VMBusChanReq *vmreq = &req->vmreq;
    struct dm_capabilities *msgCap = vmreq->msg;
    struct dm_capabilities_resp_msg respCap;

    if (balloon->state != S_CAPS) {
        warn_report("unexpected DM_CAPABILITIES_REPORT in %d state",
                    balloon->state);
        return;
    }

    if (!hv_balloon_handle_msg_size(req, sizeof(*msgCap),
                                    "DM_CAPABILITIES_REPORT")) {
        return;
    }

    trace_hv_balloon_incoming_caps(msgCap->caps.caps);
    balloon->caps = msgCap->caps;

    memset(&respCap, 0, sizeof(respCap));
    respCap.hdr.type = DM_CAPABILITIES_RESPONSE;
    respCap.hdr.size = sizeof(respCap);
    respCap.hdr.trans_id = msgCap->hdr.trans_id;
    respCap.is_accepted = 1;
    respCap.hot_remove = 1;
    respCap.suppress_pressure_reports = !balloon->status_report.enabled;
    hv_balloon_send_packet(vmreq->chan, (struct dm_message *)&respCap);

    timer_mod(&balloon->post_init_timer,
              qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
              HV_BALLOON_POST_INIT_WAIT);

    HV_BALLOON_STATE_DESC_SET(stdesc, S_POST_INIT_WAIT);
}

static void hv_balloon_handle_status_report(HvBalloon *balloon,
                                            HvBalloonReq *req)
{
    VMBusChanReq *vmreq = &req->vmreq;
    struct dm_status *msgStatus = vmreq->msg;

    if (!hv_balloon_handle_msg_size(req, sizeof(*msgStatus),
                                    "DM_STATUS_REPORT")) {
        return;
    }

    if (!balloon->status_report.enabled) {
        return;
    }

    balloon->status_report.committed = msgStatus->num_committed;
    balloon->status_report.committed *= HV_BALLOON_PAGE_SIZE;
    balloon->status_report.available = msgStatus->num_avail;
    balloon->status_report.available *= HV_BALLOON_PAGE_SIZE;
    balloon->status_report.received = true;

    qapi_event_send_hv_balloon_status_report(balloon->status_report.committed,
                                             balloon->status_report.available);
}

HvBalloonInfo *qmp_query_hv_balloon_status_report(Error **errp)
{
    HvBalloon *balloon;
    HvBalloonInfo *info;

    balloon = HV_BALLOON(object_resolve_path_type("", TYPE_HV_BALLOON, NULL));
    if (!balloon) {
        error_setg(errp, "no %s device present", TYPE_HV_BALLOON);
        return NULL;
    }

    if (!balloon->status_report.enabled) {
        error_setg(errp, "guest memory status reporting not enabled");
        return NULL;
    }

    if (!balloon->status_report.received) {
        error_setg(errp, "no guest memory status report received yet");
        return NULL;
    }

    info = g_malloc0(sizeof(*info));
    info->committed = balloon->status_report.committed;
    info->available = balloon->status_report.available;
    return info;
}
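
/*
 * An example QMP exchange for the command above (illustrative values only):
 * -> { "execute": "query-hv-balloon-status-report" }
 * <- { "return": { "committed": 816640000, "available": 3333054464 } }
 */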

static void hv_balloon_handle_unballoon_response(HvBalloon *balloon,
                                                 HvBalloonReq *req,
                                                 StateDesc *stdesc)
{
    VMBusChanReq *vmreq = &req->vmreq;
    struct dm_unballoon_response *msgUrR = vmreq->msg;

    if (balloon->state != S_UNBALLOON_REPLY_WAIT) {
        warn_report("unexpected DM_UNBALLOON_RESPONSE in %d state",
                    balloon->state);
        return;
    }

    if (!hv_balloon_handle_msg_size(req, sizeof(*msgUrR),
                                    "DM_UNBALLOON_RESPONSE")) {
        return;
    }

    trace_hv_balloon_incoming_unballoon(msgUrR->hdr.trans_id);

    balloon->trans_id++;

    if (balloon->hot_add_diff > 0) {
        bool can_hot_add = balloon->caps.cap_bits.hot_add;

        assert(can_hot_add);
        HV_BALLOON_STATE_DESC_SET(stdesc, S_HOT_ADD_SETUP);
    } else {
        HV_BALLOON_STATE_DESC_SET(stdesc, S_IDLE);
    }
}

static void hv_balloon_handle_hot_add_response(HvBalloon *balloon,
                                               HvBalloonReq *req,
                                               StateDesc *stdesc)
{
    PageRange *hot_add_range = &balloon->hot_add_range;
    VMBusChanReq *vmreq = &req->vmreq;
    struct dm_hot_add_response *msgHaR = vmreq->msg;
    OurRange *our_range;

    if (balloon->state != S_HOT_ADD_REPLY_WAIT) {
        warn_report("unexpected DM_HOT_ADD_RESPONSE in %d state",
                    balloon->state);
        return;
    }

    assert(balloon->our_range);
    our_range = OUR_RANGE(balloon->our_range);

    if (!hv_balloon_handle_msg_size(req, sizeof(*msgHaR),
                                    "DM_HOT_ADD_RESPONSE")) {
        return;
    }

    trace_hv_balloon_incoming_hot_add(msgHaR->hdr.trans_id, msgHaR->result,
                                      msgHaR->page_count);

    balloon->trans_id++;

    if (msgHaR->result) {
        if (msgHaR->page_count > balloon->ha_current_count) {
            warn_report("DM_HOT_ADD_RESPONSE page count higher than requested (%"PRIu32" vs %"PRIu64")",
                        msgHaR->page_count, balloon->ha_current_count);
            msgHaR->page_count = balloon->ha_current_count;
        }

        hvb_our_range_mark_added(our_range, msgHaR->page_count);
        hot_add_range->start += msgHaR->page_count;
        hot_add_range->count -= msgHaR->page_count;
    }

    if (!msgHaR->result || msgHaR->page_count < balloon->ha_current_count) {
        /*
         * the current planned range was only partially hot-added, take note
         * how much of it remains and don't attempt any further hot adds
         */
        our_range_mark_remaining_unusable(our_range);

        goto ret_idle;
    }

    /* any pages remaining to hot-add in our range? */
    if (hot_add_range->count > 0) {
        HV_BALLOON_STATE_DESC_SET(stdesc, S_HOT_ADD_RB_WAIT);
        return;
    }

ret_idle:
    HV_BALLOON_STATE_DESC_SET(stdesc, S_IDLE);
}

static void hv_balloon_handle_balloon_response(HvBalloon *balloon,
                                               HvBalloonReq *req,
                                               StateDesc *stdesc)
{
    VMBusChanReq *vmreq = &req->vmreq;
    struct dm_balloon_response *msgBR = vmreq->msg;

    if (balloon->state != S_BALLOON_REPLY_WAIT) {
        warn_report("unexpected DM_BALLOON_RESPONSE in %d state",
                    balloon->state);
        return;
    }

    if (!hv_balloon_handle_msg_size(req, sizeof(*msgBR),
                                    "DM_BALLOON_RESPONSE")) {
        return;
    }

    trace_hv_balloon_incoming_balloon(msgBR->hdr.trans_id, msgBR->range_count,
                                      msgBR->more_pages);

    if (vmreq->msglen < sizeof(*msgBR) +
        (uint64_t)sizeof(msgBR->range_array[0]) * msgBR->range_count) {
        warn_report("DM_BALLOON_RESPONSE too short for the range count");
        return;
    }

    if (msgBR->range_count == 0) {
        /* The guest is already at its minimum size */
        balloon->balloon_diff = 0;
        goto ret_end_trans;
    } else {
        hv_balloon_handle_remove_ranges(balloon,
                                        msgBR->range_array,
                                        msgBR->range_count);
    }

    /* More responses expected? */
    if (msgBR->more_pages) {
        return;
    }

ret_end_trans:
    balloon->trans_id++;

    if (balloon->balloon_diff > 0) {
        HV_BALLOON_STATE_DESC_SET(stdesc, S_BALLOON_RB_WAIT);
    } else {
        HV_BALLOON_STATE_DESC_SET(stdesc, S_IDLE);
    }
}

static void hv_balloon_handle_packet(HvBalloon *balloon, HvBalloonReq *req,
                                     StateDesc *stdesc)
{
    VMBusChanReq *vmreq = &req->vmreq;
    struct dm_message *msg = vmreq->msg;

    if (vmreq->msglen < sizeof(msg->hdr)) {
        return;
    }

    switch (msg->hdr.type) {
    case DM_VERSION_REQUEST:
        hv_balloon_handle_version_request(balloon, req, stdesc);
        break;

    case DM_CAPABILITIES_REPORT:
        hv_balloon_handle_caps_report(balloon, req, stdesc);
        break;

    case DM_STATUS_REPORT:
        hv_balloon_handle_status_report(balloon, req);
        break;

    case DM_MEM_HOT_ADD_RESPONSE:
        hv_balloon_handle_hot_add_response(balloon, req, stdesc);
        break;

    case DM_UNBALLOON_RESPONSE:
        hv_balloon_handle_unballoon_response(balloon, req, stdesc);
        break;

    case DM_BALLOON_RESPONSE:
        hv_balloon_handle_balloon_response(balloon, req, stdesc);
        break;

    default:
        warn_report("unknown DM message %u", msg->hdr.type);
        break;
    }
}

static bool hv_balloon_recv_channel(HvBalloon *balloon, StateDesc *stdesc)
{
    VMBusChannel *chan;
    HvBalloonReq *req;

    if (balloon->state == S_WAIT_RESET ||
        balloon->state == S_POST_RESET_CLOSED) {
        return false;
    }

    chan = hv_balloon_get_channel(balloon);
    if (vmbus_channel_recv_start(chan)) {
        return false;
    }

    while ((req = vmbus_channel_recv_peek(chan, sizeof(*req)))) {
        hv_balloon_handle_packet(balloon, req, stdesc);
        vmbus_free_req(req);
        vmbus_channel_recv_pop(chan);

        if (stdesc->state != S_NO_CHANGE) {
            break;
        }
    }

    return vmbus_channel_recv_done(chan) > 0;
}

/* old state handler -> new state transition (potential) */
static bool hv_balloon_event_loop_state(HvBalloon *balloon)
{
    StateDesc state_new = HV_BALLOON_STATE_DESC_INIT;

    hv_balloon_handle_state(balloon, &state_new);
    return hv_balloon_state_set(balloon, state_new.state, state_new.desc);
}

/* VMBus message -> new state transition (potential) */
static bool hv_balloon_event_loop_recv(HvBalloon *balloon)
{
    StateDesc state_new = HV_BALLOON_STATE_DESC_INIT;
    bool any_recv, state_changed;

    any_recv = hv_balloon_recv_channel(balloon, &state_new);
    state_changed = hv_balloon_state_set(balloon,
                                         state_new.state, state_new.desc);

    return state_changed || any_recv;
}

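/*
 * Keep alternating between driving the state machine and draining the
 * channel until neither produces any further change.
 */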
static void hv_balloon_event_loop(HvBalloon *balloon)
{
    bool state_repeat, recv_repeat;

    do {
        state_repeat = hv_balloon_event_loop_state(balloon);
        recv_repeat = hv_balloon_event_loop_recv(balloon);
    } while (state_repeat || recv_repeat);
}

static void hv_balloon_vmdev_chan_notify(VMBusChannel *chan)
{
    HvBalloon *balloon = HV_BALLOON(vmbus_channel_device(chan));

    hv_balloon_event_loop(balloon);
}

static void hv_balloon_stat(void *opaque, BalloonInfo *info)
{
    HvBalloon *balloon = opaque;
    info->actual = (hv_balloon_total_ram(balloon) - balloon->removed_both_ctr)
        << HV_BALLOON_PFN_SHIFT;
}

static void hv_balloon_to_target(void *opaque, ram_addr_t target)
{
    HvBalloon *balloon = opaque;
    uint64_t target_pages = target >> HV_BALLOON_PFN_SHIFT;

    if (!target_pages) {
        return;
    }

    /*
     * always set target_changed, even with an unchanged target, as the user
     * might be asking us to try reaching it again
     */
    balloon->target = target_pages;
    balloon->target_changed = true;

    hv_balloon_event_loop(balloon);
}

static int hv_balloon_vmdev_open_channel(VMBusChannel *chan)
{
    HvBalloon *balloon = HV_BALLOON(vmbus_channel_device(chan));

    if (balloon->state != S_POST_RESET_CLOSED) {
        warn_report("guest trying to open a DM channel in invalid %d state",
                    balloon->state);
        return -EINVAL;
    }

    HV_BALLOON_SET_STATE(balloon, S_VERSION);
    hv_balloon_event_loop(balloon);

    return 0;
}

static void hv_balloon_vmdev_close_channel(VMBusChannel *chan)
{
    HvBalloon *balloon = HV_BALLOON(vmbus_channel_device(chan));

    timer_del(&balloon->post_init_timer);

    /* Don't report stale data */
    balloon->status_report.received = false;

    HV_BALLOON_SET_STATE(balloon, S_WAIT_RESET);
    hv_balloon_event_loop(balloon);
}

static void hv_balloon_post_init_timer(void *opaque)
{
    HvBalloon *balloon = opaque;

    if (balloon->state != S_POST_INIT_WAIT) {
        return;
    }

    HV_BALLOON_SET_STATE(balloon, S_IDLE);
    hv_balloon_event_loop(balloon);
}

static void hv_balloon_system_reset_unrealize_common(HvBalloon *balloon)
{
    g_clear_pointer(&balloon->our_range, hvb_our_range_memslots_free);
}

static void hv_balloon_system_reset(void *opaque)
{
    HvBalloon *balloon = HV_BALLOON(opaque);

    hv_balloon_system_reset_unrealize_common(balloon);
}

static void hv_balloon_ensure_mr(HvBalloon *balloon)
{
    MemoryRegion *hostmem_mr;

    assert(balloon->hostmem);

    if (balloon->mr) {
        return;
    }

    hostmem_mr = host_memory_backend_get_memory(balloon->hostmem);

    balloon->mr = g_new0(MemoryRegion, 1);
    memory_region_init(balloon->mr, OBJECT(balloon), TYPE_HV_BALLOON,
                       memory_region_size(hostmem_mr));

    /*
     * The VM can indicate an alignment up to 32 GiB. Memory device core can
     * usually only handle/guarantee 1 GiB alignment. The user will have to
     * specify a larger maxmem eventually.
     *
     * The memory device core will warn the user in case maxmem might have to be
     * increased and will fail plugging the device if there is not sufficient
     * space after alignment.
     *
     * TODO: we could do the alignment ourselves in a slightly bigger region.
     * But this feels better, although the warning might be annoying. Maybe
     * we can optimize that in the future (e.g., with such a device on the
     * cmdline place/size the device memory region differently).
1493      */
1494     balloon->mr->align = MAX(32 * GiB, memory_region_get_alignment(hostmem_mr));
1495 }

static void hv_balloon_free_mr(HvBalloon *balloon)
{
    if (!balloon->mr) {
        return;
    }

    object_unparent(OBJECT(balloon->mr));
    g_clear_pointer(&balloon->mr, g_free);
}

static void hv_balloon_vmdev_realize(VMBusDevice *vdev, Error **errp)
{
    ERRP_GUARD();
    HvBalloon *balloon = HV_BALLOON(vdev);
    int ret;

    balloon->state = S_WAIT_RESET;

    ret = qemu_add_balloon_handler(hv_balloon_to_target, hv_balloon_stat,
                                   balloon);
    if (ret < 0) {
        /* This also protects against having multiple hv-balloon instances */
        error_setg(errp, "Only one balloon device is supported");
        return;
    }

    if (balloon->hostmem) {
        if (host_memory_backend_is_mapped(balloon->hostmem)) {
            Object *obj = OBJECT(balloon->hostmem);

            error_setg(errp, "'%s' property specifies a busy memdev: %s",
                       HV_BALLOON_MEMDEV_PROP,
                       object_get_canonical_path_component(obj));
            goto out_balloon_handler;
        }

        hv_balloon_ensure_mr(balloon);

        /* This is rather unlikely to happen, but let's still check for it. */
        if (!QEMU_IS_ALIGNED(memory_region_size(balloon->mr),
                             HV_BALLOON_PAGE_SIZE)) {
            error_setg(errp, "'%s' property memdev size has to be a multiple of 0x%" PRIx64,
                       HV_BALLOON_MEMDEV_PROP, (uint64_t)HV_BALLOON_PAGE_SIZE);
            goto out_balloon_handler;
        }

        host_memory_backend_set_mapped(balloon->hostmem, true);
        vmstate_register_ram(host_memory_backend_get_memory(balloon->hostmem),
                             DEVICE(balloon));
    } else if (balloon->addr) {
        error_setg(errp, "'%s' property must not be set without a memdev",
                   HV_BALLOON_MEMDEV_PROP);
        goto out_balloon_handler;
    }

    timer_init_ms(&balloon->post_init_timer, QEMU_CLOCK_VIRTUAL,
                  hv_balloon_post_init_timer, balloon);

    qemu_register_reset(hv_balloon_system_reset, balloon);

    return;

out_balloon_handler:
    qemu_remove_balloon_handler(balloon);
}

/*
 * VMBus device reset has to be implemented in case the guest decides to
 * disconnect and reconnect to the VMBus without rebooting the whole system.
 *
 * However, the hot-added memory can't be removed here as Windows keeps on using
 * it until the system is restarted, even after disconnecting from the VMBus.
 */
static void hv_balloon_vmdev_reset(VMBusDevice *vdev)
{
    HvBalloon *balloon = HV_BALLOON(vdev);

    if (balloon->state == S_POST_RESET_CLOSED) {
        return;
    }

    if (balloon->our_range) {
        hvb_our_range_clear_removed_trees(OUR_RANGE(balloon->our_range));
    }

    hvb_page_range_tree_destroy(&balloon->removed_guest);
    hvb_page_range_tree_destroy(&balloon->removed_both);
    hvb_page_range_tree_init(&balloon->removed_guest);
    hvb_page_range_tree_init(&balloon->removed_both);

    balloon->trans_id = 0;
    balloon->removed_guest_ctr = 0;
    balloon->removed_both_ctr = 0;

    HV_BALLOON_SET_STATE(balloon, S_POST_RESET_CLOSED);
    hv_balloon_event_loop(balloon);
}

/*
 * Clean up things that were (possibly) allocated pre-realization, for example
 * from memory_device_pre_plug(), so we don't leak them if the device doesn't
 * actually get realized in the end.
 */
static void hv_balloon_unrealize_finalize_common(HvBalloon *balloon)
{
    hv_balloon_free_mr(balloon);
    balloon->addr = 0;

    balloon->memslot_count = 0;
}

static void hv_balloon_vmdev_unrealize(VMBusDevice *vdev)
{
    HvBalloon *balloon = HV_BALLOON(vdev);

    qemu_unregister_reset(hv_balloon_system_reset, balloon);

    hv_balloon_system_reset_unrealize_common(balloon);

    qemu_remove_balloon_handler(balloon);

    if (balloon->hostmem) {
        vmstate_unregister_ram(host_memory_backend_get_memory(balloon->hostmem),
                               DEVICE(balloon));
        host_memory_backend_set_mapped(balloon->hostmem, false);
    }

    hvb_page_range_tree_destroy(&balloon->removed_guest);
    hvb_page_range_tree_destroy(&balloon->removed_both);

    hv_balloon_unrealize_finalize_common(balloon);
}

static uint64_t hv_balloon_md_get_addr(const MemoryDeviceState *md)
{
    return object_property_get_uint(OBJECT(md), HV_BALLOON_ADDR_PROP,
                                    &error_abort);
}

static void hv_balloon_md_set_addr(MemoryDeviceState *md, uint64_t addr,
                                   Error **errp)
{
    object_property_set_uint(OBJECT(md), HV_BALLOON_ADDR_PROP, addr, errp);
}

static MemoryRegion *hv_balloon_md_get_memory_region(MemoryDeviceState *md,
                                                     Error **errp)
{
    HvBalloon *balloon = HV_BALLOON(md);

    if (!balloon->hostmem) {
        return NULL;
    }

    hv_balloon_ensure_mr(balloon);

    return balloon->mr;
}

static void hv_balloon_md_fill_device_info(const MemoryDeviceState *md,
                                           MemoryDeviceInfo *info)
{
    HvBalloonDeviceInfo *hi = g_new0(HvBalloonDeviceInfo, 1);
    const HvBalloon *balloon = HV_BALLOON(md);
    DeviceState *dev = DEVICE(md);

    if (dev->id) {
        hi->id = g_strdup(dev->id);
    }

    if (balloon->hostmem) {
        hi->memdev = object_get_canonical_path(OBJECT(balloon->hostmem));
        hi->memaddr = balloon->addr;
        hi->has_memaddr = true;
        hi->max_size = memory_region_size(balloon->mr);
        /* TODO: expose current provided size or something else? */
    } else {
        hi->max_size = 0;
    }

    info->u.hv_balloon.data = hi;
    info->type = MEMORY_DEVICE_INFO_KIND_HV_BALLOON;
}
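
/*
 * For illustration only (the values below are made up, the field names come
 * from the HvBalloonDeviceInfo QAPI type): a realized device with a memdev
 * shows up in query-memory-devices / "info memory-devices" roughly as
 *
 *   { "type": "hv-balloon",
 *     "data": { "id": "hvb1", "memdev": "/objects/mem1",
 *               "memaddr": 34359738368, "max-size": 17179869184 } }
 */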

static void hv_balloon_decide_memslots(MemoryDeviceState *md,
                                       unsigned int limit)
{
    HvBalloon *balloon = HV_BALLOON(md);
    MemoryRegion *hostmem_mr;
    uint64_t region_size, memslot_size, memslots;

    /* We're called exactly once, before realizing the device. */
    assert(!balloon->memslot_count);

    /* We should not be called if we don't have a memory backend */
    assert(balloon->hostmem);

    hostmem_mr = host_memory_backend_get_memory(balloon->hostmem);
    region_size = memory_region_size(hostmem_mr);

    assert(region_size > 0);
    memslot_size = QEMU_ALIGN_UP(region_size / limit,
                                 HV_BALLOON_HA_MEMSLOT_SIZE_ALIGN);
    memslots = QEMU_ALIGN_UP(region_size, memslot_size) / memslot_size;

    if (memslots > 1) {
        balloon->memslot_size = memslot_size;
    } else {
        balloon->memslot_size = region_size;
    }

    assert(memslots <= UINT_MAX);
    balloon->memslot_count = memslots;
}
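
/*
 * Worked example of the calculation above (illustrative numbers): for a
 * 5 GiB backend region and a limit of 8 memslots,
 *
 *   memslot_size = QEMU_ALIGN_UP(5 GiB / 8, 128 MiB) = 640 MiB
 *   memslots     = QEMU_ALIGN_UP(5 GiB, 640 MiB) / 640 MiB = 8
 *
 * so the region gets covered by eight 640 MiB memslots.
 */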

static unsigned int hv_balloon_get_memslots(MemoryDeviceState *md)
{
    const HvBalloon *balloon = HV_BALLOON(md);

    /* We're called after setting the suggested limit. */
    assert(balloon->memslot_count > 0);

    return balloon->memslot_count;
}

static void hv_balloon_init(Object *obj)
{
}

static void hv_balloon_finalize(Object *obj)
{
    HvBalloon *balloon = HV_BALLOON(obj);

    hv_balloon_unrealize_finalize_common(balloon);
}

static Property hv_balloon_properties[] = {
    DEFINE_PROP_BOOL("status-report", HvBalloon,
                     status_report.enabled, false),

    /* MEMORY_DEVICE props */
    DEFINE_PROP_LINK(HV_BALLOON_MEMDEV_PROP, HvBalloon, hostmem,
                     TYPE_MEMORY_BACKEND, HostMemoryBackend *),
    DEFINE_PROP_UINT64(HV_BALLOON_ADDR_PROP, HvBalloon, addr, 0),

    DEFINE_PROP_END_OF_LIST(),
};
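
/*
 * A minimal sketch of how these properties are typically wired up on the
 * command line (the id/memdev names and all sizes are made-up examples;
 * a vmbus-bridge device is needed for any VMBus device):
 *
 *   qemu-system-x86_64 ... -m 4G,maxmem=68G \
 *     -device vmbus-bridge \
 *     -object memory-backend-ram,id=mem1,size=16G \
 *     -device hv-balloon,id=hvb1,memdev=mem1
 *
 * "memdev" is optional: without it the device can still balloon guest
 * memory, but cannot hot-add any.
 */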

static void hv_balloon_class_init(ObjectClass *klass, void *data)
{
    DeviceClass *dc = DEVICE_CLASS(klass);
    VMBusDeviceClass *vdc = VMBUS_DEVICE_CLASS(klass);
    MemoryDeviceClass *mdc = MEMORY_DEVICE_CLASS(klass);

    device_class_set_props(dc, hv_balloon_properties);
    qemu_uuid_parse(HV_BALLOON_GUID, &vdc->classid);
    set_bit(DEVICE_CATEGORY_MISC, dc->categories);

    vdc->vmdev_realize = hv_balloon_vmdev_realize;
    vdc->vmdev_unrealize = hv_balloon_vmdev_unrealize;
    vdc->vmdev_reset = hv_balloon_vmdev_reset;
    vdc->open_channel = hv_balloon_vmdev_open_channel;
    vdc->close_channel = hv_balloon_vmdev_close_channel;
    vdc->chan_notify_cb = hv_balloon_vmdev_chan_notify;

    mdc->get_addr = hv_balloon_md_get_addr;
    mdc->set_addr = hv_balloon_md_set_addr;
    mdc->get_plugged_size = memory_device_get_region_size;
    mdc->get_memory_region = hv_balloon_md_get_memory_region;
    mdc->decide_memslots = hv_balloon_decide_memslots;
    mdc->get_memslots = hv_balloon_get_memslots;
    mdc->fill_device_info = hv_balloon_md_fill_device_info;
}
1770