xref: /openbmc/qemu/hw/hyperv/hv-balloon.c (revision 16dff2f9)
1 /*
2  * QEMU Hyper-V Dynamic Memory Protocol driver
3  *
4  * Copyright (C) 2020-2023 Oracle and/or its affiliates.
5  *
6  * This work is licensed under the terms of the GNU GPL, version 2 or later.
7  * See the COPYING file in the top-level directory.
8  */
9 
10 #include "hv-balloon-internal.h"
11 
12 #include "exec/address-spaces.h"
13 #include "exec/cpu-common.h"
14 #include "exec/ramblock.h"
15 #include "hw/boards.h"
16 #include "hw/hyperv/dynmem-proto.h"
17 #include "hw/hyperv/hv-balloon.h"
18 #include "hw/hyperv/vmbus.h"
19 #include "hw/mem/memory-device.h"
20 #include "hw/mem/pc-dimm.h"
21 #include "hw/qdev-core.h"
22 #include "hw/qdev-properties.h"
23 #include "monitor/qdev.h"
24 #include "qapi/error.h"
25 #include "qapi/qapi-commands-machine.h"
26 #include "qapi/qapi-events-machine.h"
27 #include "qapi/qapi-types-machine.h"
28 #include "qapi/qmp/qdict.h"
29 #include "qapi/visitor.h"
30 #include "qemu/error-report.h"
31 #include "qemu/module.h"
32 #include "qemu/units.h"
33 #include "qemu/timer.h"
34 #include "sysemu/balloon.h"
35 #include "sysemu/hostmem.h"
36 #include "sysemu/reset.h"
37 #include "hv-balloon-our_range_memslots.h"
38 #include "hv-balloon-page_range_tree.h"
39 #include "trace.h"
40 
41 #define HV_BALLOON_ADDR_PROP "addr"
42 #define HV_BALLOON_MEMDEV_PROP "memdev"
43 #define HV_BALLOON_GUID "525074DC-8985-46e2-8057-A307DC18A502"
44 
45 /*
46  * Some Windows versions (at least Server 2019) will crash with various
47  * error codes when receiving DM protocol requests (at least
48  * DM_MEM_HOT_ADD_REQUEST) immediately after boot.
49  *
50  * It looks like Hyper-V from Server 2016 uses a 50-second after-boot
 * delay, probably to work around this issue, so we'll use this value, too.
52  */
53 #define HV_BALLOON_POST_INIT_WAIT (50 * 1000)
54 
55 #define HV_BALLOON_HA_CHUNK_SIZE (2 * GiB)
56 #define HV_BALLOON_HA_CHUNK_PAGES (HV_BALLOON_HA_CHUNK_SIZE / HV_BALLOON_PAGE_SIZE)
57 
58 #define HV_BALLOON_HA_MEMSLOT_SIZE_ALIGN (128 * MiB)
59 
/*
 * The maximum number of pages that Windows returns in one hot-remove
 * response.
 *
 * If the number requested is too high Windows will no longer honor
 * these requests.
 */
#define HV_BALLOON_HR_CHUNK_PAGES 585728
68 
typedef struct HvBalloonClass {
    VMBusDeviceClass parent_class;
} HvBalloonClass;
72 
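/*
 * Overall state flow, as implemented by the handlers below:
 * S_WAIT_RESET -> S_POST_RESET_CLOSED -> (the guest opens the channel) ->
 * S_VERSION -> S_CAPS -> S_POST_INIT_WAIT -> S_IDLE, from which the
 * balloon, unballoon and hot-add operation flows are entered and
 * eventually return to S_IDLE.
 */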
73 typedef enum State {
74     /* not a real state */
75     S_NO_CHANGE = 0,
76 
77     S_WAIT_RESET,
78     S_POST_RESET_CLOSED,
79 
80     /* init flow */
81     S_VERSION,
82     S_CAPS,
83     S_POST_INIT_WAIT,
84 
85     S_IDLE,
86 
87     /* balloon op flow */
88     S_BALLOON_POSTING,
89     S_BALLOON_RB_WAIT,
90     S_BALLOON_REPLY_WAIT,
91 
92     /* unballoon + hot add ops flow */
93     S_UNBALLOON_POSTING,
94     S_UNBALLOON_RB_WAIT,
95     S_UNBALLOON_REPLY_WAIT,
96     S_HOT_ADD_SETUP,
97     S_HOT_ADD_RB_WAIT,
98     S_HOT_ADD_POSTING,
99     S_HOT_ADD_REPLY_WAIT,
100 } State;
101 
102 typedef struct StateDesc {
103     State state;
104     const char *desc;
105 } StateDesc;
106 
107 typedef struct HvBalloon {
108     VMBusDevice parent;
109     State state;
110 
111     union dm_version version;
112     union dm_caps caps;
113 
114     QEMUTimer post_init_timer;
115 
116     unsigned int trans_id;
117 
118     struct {
119         bool enabled;
120         bool received;
121         uint64_t committed;
122         uint64_t available;
123     } status_report;
124 
125     /* Guest target size */
126     uint64_t target;
127     bool target_changed;
128 
129     /* Current (un)balloon / hot-add operation parameters */
130     union {
131         uint64_t balloon_diff;
132 
133         struct {
134             uint64_t unballoon_diff;
135             uint64_t hot_add_diff;
136         };
137 
138         struct {
139             PageRange hot_add_range;
140             uint64_t ha_current_count;
141         };
142     };
143 
144     OurRangeMemslots *our_range;
145 
146     /* Count of memslots covering our memory */
147     unsigned int memslot_count;
148 
149     /* Nominal size of each memslot (the last one might be smaller) */
150     uint64_t memslot_size;
151 
152     /* Non-ours removed memory */
153     PageRangeTree removed_guest, removed_both;
154 
155     /* Grand totals of removed memory (both ours and non-ours) */
156     uint64_t removed_guest_ctr, removed_both_ctr;
157 
158     /* MEMORY_DEVICE props */
159     uint64_t addr;
160     HostMemoryBackend *hostmem;
161     MemoryRegion *mr;
162 } HvBalloon;
163 
OBJECT_DEFINE_TYPE_WITH_INTERFACES(HvBalloon, hv_balloon, HV_BALLOON, VMBUS_DEVICE,
                                   { TYPE_MEMORY_DEVICE }, { })
166 
167 #define HV_BALLOON_SET_STATE(hvb, news)             \
168     do {                                            \
169         assert(news != S_NO_CHANGE);                \
170         hv_balloon_state_set(hvb, news, # news);    \
171     } while (0)
172 
173 #define HV_BALLOON_STATE_DESC_SET(stdesc, news)         \
174     _hv_balloon_state_desc_set(stdesc, news, # news)
175 
176 #define HV_BALLOON_STATE_DESC_INIT \
177     {                              \
178         .state = S_NO_CHANGE,      \
179     }
180 
181 typedef struct HvBalloonReq {
182     VMBusChanReq vmreq;
183 } HvBalloonReq;
184 
/* the total of "our" memory includes parts currently removed from the guest */
186 static uint64_t hv_balloon_total_our_ram(HvBalloon *balloon)
187 {
188     if (!balloon->our_range) {
189         return 0;
190     }
191 
192     return balloon->our_range->range.added;
193 }
194 
195 /* TODO: unify the code below with virtio-balloon and cache the value */
196 static int build_dimm_list(Object *obj, void *opaque)
197 {
198     GSList **list = opaque;
199 
200     if (object_dynamic_cast(obj, TYPE_PC_DIMM)) {
201         DeviceState *dev = DEVICE(obj);
202         if (dev->realized) { /* only realized DIMMs matter */
203             *list = g_slist_prepend(*list, dev);
204         }
205     }
206 
207     object_child_foreach(obj, build_dimm_list, opaque);
208     return 0;
209 }
210 
211 static ram_addr_t get_current_ram_size(void)
212 {
213     GSList *list = NULL, *item;
214     ram_addr_t size = current_machine->ram_size;
215 
216     build_dimm_list(qdev_get_machine(), &list);
217     for (item = list; item; item = g_slist_next(item)) {
218         Object *obj = OBJECT(item->data);
        if (!strcmp(object_get_typename(obj), TYPE_PC_DIMM)) {
            size += object_property_get_int(obj, PC_DIMM_SIZE_PROP,
                                            &error_abort);
        }
222     }
223     g_slist_free(list);
224 
225     return size;
226 }
227 
228 /* total RAM includes memory currently removed from the guest */
229 static uint64_t hv_balloon_total_ram(HvBalloon *balloon)
230 {
231     ram_addr_t ram_size = get_current_ram_size();
232     uint64_t ram_size_pages = ram_size >> HV_BALLOON_PFN_SHIFT;
233     uint64_t our_ram_size_pages = hv_balloon_total_our_ram(balloon);
234 
235     assert(ram_size_pages > 0);
236 
237     return SUM_SATURATE_U64(ram_size_pages, our_ram_size_pages);
238 }
239 
240 /*
241  * calculating the total RAM size is a slow operation,
242  * avoid it as much as possible
243  */
244 static uint64_t hv_balloon_total_removed_rs(HvBalloon *balloon,
245                                             uint64_t ram_size_pages)
246 {
247     uint64_t total_removed;
248 
249     total_removed = SUM_SATURATE_U64(balloon->removed_guest_ctr,
250                                      balloon->removed_both_ctr);
251 
252     /* possible if guest returns pages outside actual RAM */
253     if (total_removed > ram_size_pages) {
254         total_removed = ram_size_pages;
255     }
256 
257     return total_removed;
258 }
259 
260 /* Returns whether the state has actually changed */
261 static bool hv_balloon_state_set(HvBalloon *balloon,
262                                  State newst, const char *newststr)
263 {
264     if (newst == S_NO_CHANGE || balloon->state == newst) {
265         return false;
266     }
267 
268     balloon->state = newst;
269     trace_hv_balloon_state_change(newststr);
270     return true;
271 }
272 
273 static void _hv_balloon_state_desc_set(StateDesc *stdesc,
274                                        State newst, const char *newststr)
275 {
276     /* state setting is only permitted on a freshly init desc */
277     assert(stdesc->state == S_NO_CHANGE);
278 
279     assert(newst != S_NO_CHANGE);
280 
281     stdesc->state = newst;
282     stdesc->desc = newststr;
283 }
284 
285 static VMBusChannel *hv_balloon_get_channel_maybe(HvBalloon *balloon)
286 {
287     return vmbus_device_channel(&balloon->parent, 0);
288 }
289 
290 static VMBusChannel *hv_balloon_get_channel(HvBalloon *balloon)
291 {
292     VMBusChannel *chan;
293 
294     chan = hv_balloon_get_channel_maybe(balloon);
295     assert(chan != NULL);
296     return chan;
297 }
298 
299 static ssize_t hv_balloon_send_packet(VMBusChannel *chan,
300                                       struct dm_message *msg)
301 {
302     int ret;
303 
304     ret = vmbus_channel_reserve(chan, 0, msg->hdr.size);
305     if (ret < 0) {
306         return ret;
307     }
308 
309     return vmbus_channel_send(chan, VMBUS_PACKET_DATA_INBAND,
310                               NULL, 0, msg, msg->hdr.size, false,
311                               msg->hdr.trans_id);
312 }
313 
314 static bool hv_balloon_unballoon_get_source(HvBalloon *balloon,
315                                             PageRangeTree *dtree,
316                                             uint64_t **dctr,
317                                             bool *is_our_range)
318 {
319     OurRange *our_range = OUR_RANGE(balloon->our_range);
320 
321     /* Try the boot memory first */
322     if (g_tree_nnodes(balloon->removed_guest.t) > 0) {
323         *dtree = balloon->removed_guest;
324         *dctr = &balloon->removed_guest_ctr;
325         *is_our_range = false;
326     } else if (g_tree_nnodes(balloon->removed_both.t) > 0) {
327         *dtree = balloon->removed_both;
328         *dctr = &balloon->removed_both_ctr;
329         *is_our_range = false;
330     } else if (!our_range) {
331         return false;
332     } else if (!our_range_is_removed_tree_empty(our_range, false)) {
333         *dtree = our_range_get_removed_tree(our_range, false);
334         *dctr = &balloon->removed_guest_ctr;
335         *is_our_range = true;
336     } else if (!our_range_is_removed_tree_empty(our_range, true)) {
337         *dtree = our_range_get_removed_tree(our_range, true);
338         *dctr = &balloon->removed_both_ctr;
339         *is_our_range = true;
340     } else {
341         return false;
342     }
343 
344     return true;
345 }
346 
347 static void hv_balloon_unballoon_rb_wait(HvBalloon *balloon, StateDesc *stdesc)
348 {
349     VMBusChannel *chan = hv_balloon_get_channel(balloon);
350     struct dm_unballoon_request *ur;
351     size_t ur_size = sizeof(*ur) + sizeof(ur->range_array[0]);
352 
353     assert(balloon->state == S_UNBALLOON_RB_WAIT);
354 
355     if (vmbus_channel_reserve(chan, 0, ur_size) < 0) {
356         return;
357     }
358 
359     HV_BALLOON_STATE_DESC_SET(stdesc, S_UNBALLOON_POSTING);
360 }
361 
362 static void hv_balloon_unballoon_posting(HvBalloon *balloon, StateDesc *stdesc)
363 {
364     VMBusChannel *chan = hv_balloon_get_channel(balloon);
365     PageRangeTree dtree;
366     uint64_t *dctr;
367     bool our_range;
368     struct dm_unballoon_request *ur;
369     size_t ur_size = sizeof(*ur) + sizeof(ur->range_array[0]);
370     PageRange range;
371     bool bret;
372     ssize_t ret;
373 
374     assert(balloon->state == S_UNBALLOON_POSTING);
375     assert(balloon->unballoon_diff > 0);
376 
377     if (!hv_balloon_unballoon_get_source(balloon, &dtree, &dctr, &our_range)) {
378         error_report("trying to unballoon but nothing seems to be ballooned");
379         /*
380          * there is little we can do as we might have already
381          * sent the guest a partial request we can't cancel
382          */
383         return;
384     }
385 
386     assert(balloon->our_range || !our_range);
387     assert(dtree.t);
388     assert(dctr);
389 
390     ur = alloca(ur_size);
391     memset(ur, 0, ur_size);
392     ur->hdr.type = DM_UNBALLOON_REQUEST;
393     ur->hdr.size = ur_size;
394     ur->hdr.trans_id = balloon->trans_id;
395 
396     bret = hvb_page_range_tree_pop(dtree, &range, MIN(balloon->unballoon_diff,
397                                                       HV_BALLOON_HA_CHUNK_PAGES));
398     assert(bret);
399     /* TODO: madvise? */
400 
401     *dctr -= range.count;
402     balloon->unballoon_diff -= range.count;
403 
404     ur->range_count = 1;
405     ur->range_array[0].finfo.start_page = range.start;
406     ur->range_array[0].finfo.page_cnt = range.count;
407     ur->more_pages = balloon->unballoon_diff > 0;
408 
409     trace_hv_balloon_outgoing_unballoon(ur->hdr.trans_id,
410                                         range.count, range.start,
411                                         balloon->unballoon_diff);
412 
413     if (ur->more_pages) {
414         HV_BALLOON_STATE_DESC_SET(stdesc, S_UNBALLOON_RB_WAIT);
415     } else {
416         HV_BALLOON_STATE_DESC_SET(stdesc, S_UNBALLOON_REPLY_WAIT);
417     }
418 
419     ret = vmbus_channel_send(chan, VMBUS_PACKET_DATA_INBAND,
420                              NULL, 0, ur, ur_size, false,
421                              ur->hdr.trans_id);
422     if (ret <= 0) {
423         error_report("error %zd when posting unballoon msg, expect problems",
424                      ret);
425     }
426 }
427 
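/*
 * Instantiates the memslots covering the memory backend ("our range") on
 * the first hot-add operation, honoring the guest-reported hot-add
 * alignment. Fails if the guest has already returned pages lying inside
 * that range, which should not normally happen.
 */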
428 static bool hv_balloon_our_range_ensure(HvBalloon *balloon)
429 {
430     uint64_t align;
431     MemoryRegion *hostmem_mr;
432     g_autoptr(OurRangeMemslots) our_range_memslots = NULL;
433     OurRange *our_range;
434 
435     if (balloon->our_range) {
436         return true;
437     }
438 
439     if (!balloon->hostmem) {
440         return false;
441     }
442 
443     align = (1 << balloon->caps.cap_bits.hot_add_alignment) * MiB;
444     assert(QEMU_IS_ALIGNED(balloon->addr, align));
445 
446     hostmem_mr = host_memory_backend_get_memory(balloon->hostmem);
447 
448     our_range_memslots = hvb_our_range_memslots_new(balloon->addr,
449                                                     balloon->mr, hostmem_mr,
450                                                     OBJECT(balloon),
451                                                     balloon->memslot_count,
452                                                     balloon->memslot_size);
453     our_range = OUR_RANGE(our_range_memslots);
454 
455     if (hvb_page_range_tree_intree_any(balloon->removed_guest,
456                                        our_range->range.start,
457                                        our_range->range.count) ||
458         hvb_page_range_tree_intree_any(balloon->removed_both,
459                                        our_range->range.start,
460                                        our_range->range.count)) {
        error_report("some parts of the memory backend were already returned by the guest, this should not happen; please reboot the guest and try again");
462         return false;
463     }
464 
465     trace_hv_balloon_our_range_add(our_range->range.count,
466                                    our_range->range.start);
467 
468     balloon->our_range = g_steal_pointer(&our_range_memslots);
469     return true;
470 }
471 
472 static void hv_balloon_hot_add_setup(HvBalloon *balloon, StateDesc *stdesc)
473 {
474     /* need to make copy since it is in union with hot_add_range */
475     uint64_t hot_add_diff = balloon->hot_add_diff;
476     PageRange *hot_add_range = &balloon->hot_add_range;
477     uint64_t align, our_range_remaining;
478     OurRange *our_range;
479 
480     assert(balloon->state == S_HOT_ADD_SETUP);
481     assert(hot_add_diff > 0);
482 
483     if (!hv_balloon_our_range_ensure(balloon)) {
484         goto ret_idle;
485     }
486 
487     our_range = OUR_RANGE(balloon->our_range);
488 
489     align = (1 << balloon->caps.cap_bits.hot_add_alignment) *
490         (MiB / HV_BALLOON_PAGE_SIZE);
491 
492     /* Absolute GPA in pages */
493     hot_add_range->start = our_range_get_remaining_start(our_range);
494     assert(QEMU_IS_ALIGNED(hot_add_range->start, align));
495 
496     our_range_remaining = our_range_get_remaining_size(our_range);
497     hot_add_range->count = MIN(our_range_remaining, hot_add_diff);
498     hot_add_range->count = QEMU_ALIGN_DOWN(hot_add_range->count, align);
499     if (hot_add_range->count == 0) {
500         goto ret_idle;
501     }
502 
503     hvb_our_range_memslots_ensure_mapped_additional(balloon->our_range,
504                                                     hot_add_range->count);
505 
506     HV_BALLOON_STATE_DESC_SET(stdesc, S_HOT_ADD_RB_WAIT);
507     return;
508 
509 ret_idle:
510     HV_BALLOON_STATE_DESC_SET(stdesc, S_IDLE);
511 }
512 
513 static void hv_balloon_hot_add_rb_wait(HvBalloon *balloon, StateDesc *stdesc)
514 {
515     VMBusChannel *chan = hv_balloon_get_channel(balloon);
516     struct dm_hot_add *ha;
517     size_t ha_size = sizeof(*ha) + sizeof(ha->range);
518 
519     assert(balloon->state == S_HOT_ADD_RB_WAIT);
520 
521     if (vmbus_channel_reserve(chan, 0, ha_size) < 0) {
522         return;
523     }
524 
525     HV_BALLOON_STATE_DESC_SET(stdesc, S_HOT_ADD_POSTING);
526 }
527 
528 static void hv_balloon_hot_add_posting(HvBalloon *balloon, StateDesc *stdesc)
529 {
530     PageRange *hot_add_range = &balloon->hot_add_range;
531     uint64_t *current_count = &balloon->ha_current_count;
532     VMBusChannel *chan = hv_balloon_get_channel(balloon);
533     struct dm_hot_add *ha;
534     size_t ha_size = sizeof(*ha) + sizeof(ha->range);
535     union dm_mem_page_range *ha_region;
536     uint64_t align, chunk_max_size;
537     ssize_t ret;
538 
539     assert(balloon->state == S_HOT_ADD_POSTING);
540     assert(hot_add_range->count > 0);
541 
542     align = (1 << balloon->caps.cap_bits.hot_add_alignment) *
543         (MiB / HV_BALLOON_PAGE_SIZE);
544     if (align >= HV_BALLOON_HA_CHUNK_PAGES) {
545         /*
546          * If the required alignment is higher than the chunk size we let it
547          * override that size.
548          */
549         chunk_max_size = align;
550     } else {
551         chunk_max_size = QEMU_ALIGN_DOWN(HV_BALLOON_HA_CHUNK_PAGES, align);
552     }
553 
554     /*
555      * hot_add_range->count starts aligned in hv_balloon_hot_add_setup(),
556      * then it is either reduced by subtracting aligned current_count or
557      * further hot-adds are prevented by marking the whole remaining our range
558      * as unusable in hv_balloon_handle_hot_add_response().
559      */
560     *current_count = MIN(hot_add_range->count, chunk_max_size);
561 
562     ha = alloca(ha_size);
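    /*
     * ha_region points at the second dm_mem_page_range slot in the buffer,
     * directly following ha->range (ha_size reserves space for exactly one
     * such extra entry).
     */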
563     ha_region = &(&ha->range)[1];
564     memset(ha, 0, ha_size);
565     ha->hdr.type = DM_MEM_HOT_ADD_REQUEST;
566     ha->hdr.size = ha_size;
567     ha->hdr.trans_id = balloon->trans_id;
568 
569     ha->range.finfo.start_page = hot_add_range->start;
570     ha->range.finfo.page_cnt = *current_count;
571     ha_region->finfo.start_page = hot_add_range->start;
572     ha_region->finfo.page_cnt = ha->range.finfo.page_cnt;
573 
574     trace_hv_balloon_outgoing_hot_add(ha->hdr.trans_id,
575                                       *current_count, hot_add_range->start);
576 
577     ret = vmbus_channel_send(chan, VMBUS_PACKET_DATA_INBAND,
578                              NULL, 0, ha, ha_size, false,
579                              ha->hdr.trans_id);
580     if (ret <= 0) {
581         error_report("error %zd when posting hot add msg, expect problems",
582                      ret);
583     }
584 
585     HV_BALLOON_STATE_DESC_SET(stdesc, S_HOT_ADD_REPLY_WAIT);
586 }
587 
588 static void hv_balloon_balloon_rb_wait(HvBalloon *balloon, StateDesc *stdesc)
589 {
590     VMBusChannel *chan = hv_balloon_get_channel(balloon);
591     size_t bl_size = sizeof(struct dm_balloon);
592 
593     assert(balloon->state == S_BALLOON_RB_WAIT);
594 
595     if (vmbus_channel_reserve(chan, 0, bl_size) < 0) {
596         return;
597     }
598 
599     HV_BALLOON_STATE_DESC_SET(stdesc, S_BALLOON_POSTING);
600 }
601 
602 static void hv_balloon_balloon_posting(HvBalloon *balloon, StateDesc *stdesc)
603 {
604     VMBusChannel *chan = hv_balloon_get_channel(balloon);
605     struct dm_balloon bl;
606     size_t bl_size = sizeof(bl);
607     ssize_t ret;
608 
609     assert(balloon->state == S_BALLOON_POSTING);
610     assert(balloon->balloon_diff > 0);
611 
612     memset(&bl, 0, sizeof(bl));
613     bl.hdr.type = DM_BALLOON_REQUEST;
614     bl.hdr.size = bl_size;
615     bl.hdr.trans_id = balloon->trans_id;
616     bl.num_pages = MIN(balloon->balloon_diff, HV_BALLOON_HR_CHUNK_PAGES);
617 
618     trace_hv_balloon_outgoing_balloon(bl.hdr.trans_id, bl.num_pages,
619                                       balloon->balloon_diff);
620 
621     ret = vmbus_channel_send(chan, VMBUS_PACKET_DATA_INBAND,
622                              NULL, 0, &bl, bl_size, false,
623                              bl.hdr.trans_id);
624     if (ret <= 0) {
625         error_report("error %zd when posting balloon msg, expect problems",
626                      ret);
627     }
628 
629     HV_BALLOON_STATE_DESC_SET(stdesc, S_BALLOON_REPLY_WAIT);
630 }
631 
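/*
 * Splits the difference between the current and the target guest size into
 * an unballoon plus hot-add part (when the guest needs to grow) or a
 * balloon part (when it needs to shrink), then enters the matching
 * operation flow. Previously removed pages are returned to the guest
 * first, hot-adding covers only the remainder.
 */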
632 static void hv_balloon_idle_state_process_target(HvBalloon *balloon,
633                                                  StateDesc *stdesc)
634 {
635     bool can_balloon = balloon->caps.cap_bits.balloon;
636     uint64_t ram_size_pages, total_removed;
637 
638     ram_size_pages = hv_balloon_total_ram(balloon);
639     total_removed = hv_balloon_total_removed_rs(balloon, ram_size_pages);
640 
641     /*
642      * we need to cache the values computed from the balloon target value when
643      * starting the adjustment procedure in case someone changes the target when
644      * the procedure is in progress
645      */
646     if (balloon->target > ram_size_pages - total_removed) {
647         bool can_hot_add = balloon->caps.cap_bits.hot_add;
648         uint64_t target_diff = balloon->target -
649             (ram_size_pages - total_removed);
650 
651         balloon->unballoon_diff = MIN(target_diff, total_removed);
652 
653         if (can_hot_add) {
654             balloon->hot_add_diff = target_diff - balloon->unballoon_diff;
655         } else {
656             balloon->hot_add_diff = 0;
657         }
658 
659         if (balloon->unballoon_diff > 0) {
660             assert(can_balloon);
661             HV_BALLOON_STATE_DESC_SET(stdesc, S_UNBALLOON_RB_WAIT);
662         } else if (balloon->hot_add_diff > 0) {
663             HV_BALLOON_STATE_DESC_SET(stdesc, S_HOT_ADD_SETUP);
664         }
665     } else if (can_balloon &&
666                balloon->target < ram_size_pages - total_removed) {
667         balloon->balloon_diff = ram_size_pages - total_removed -
668             balloon->target;
669         HV_BALLOON_STATE_DESC_SET(stdesc, S_BALLOON_RB_WAIT);
670     }
671 }
672 
673 static void hv_balloon_idle_state(HvBalloon *balloon,
674                                   StateDesc *stdesc)
675 {
676     assert(balloon->state == S_IDLE);
677 
678     if (balloon->target_changed) {
679         balloon->target_changed = false;
680         hv_balloon_idle_state_process_target(balloon, stdesc);
681         return;
682     }
683 }
684 
685 static const struct {
686     void (*handler)(HvBalloon *balloon, StateDesc *stdesc);
687 } state_handlers[] = {
688     [S_IDLE].handler = hv_balloon_idle_state,
689     [S_BALLOON_POSTING].handler = hv_balloon_balloon_posting,
690     [S_BALLOON_RB_WAIT].handler = hv_balloon_balloon_rb_wait,
691     [S_UNBALLOON_POSTING].handler = hv_balloon_unballoon_posting,
692     [S_UNBALLOON_RB_WAIT].handler = hv_balloon_unballoon_rb_wait,
693     [S_HOT_ADD_SETUP].handler = hv_balloon_hot_add_setup,
694     [S_HOT_ADD_RB_WAIT].handler = hv_balloon_hot_add_rb_wait,
695     [S_HOT_ADD_POSTING].handler = hv_balloon_hot_add_posting,
696 };
697 
698 static void hv_balloon_handle_state(HvBalloon *balloon, StateDesc *stdesc)
699 {
700     if (balloon->state >= ARRAY_SIZE(state_handlers) ||
701         !state_handlers[balloon->state].handler) {
702         return;
703     }
704 
705     state_handlers[balloon->state].handler(balloon, stdesc);
706 }
707 
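/*
 * Inserts the given page range into the tree, accounting only the pages
 * not already present there (the "effective" count) in each of the
 * provided counters (ctr3 is optional and may be NULL).
 */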
708 static void hv_balloon_remove_response_insert_range(PageRangeTree tree,
709                                                     const PageRange *range,
710                                                     uint64_t *ctr1,
711                                                     uint64_t *ctr2,
712                                                     uint64_t *ctr3)
713 {
714     uint64_t dupcount, effcount;
715 
716     if (range->count == 0) {
717         return;
718     }
719 
720     dupcount = 0;
721     hvb_page_range_tree_insert(tree, range->start, range->count, &dupcount);
722 
723     assert(dupcount <= range->count);
724     effcount = range->count - dupcount;
725 
726     *ctr1 += effcount;
727     *ctr2 += effcount;
728     if (ctr3) {
729         *ctr3 += effcount;
730     }
731 }
732 
733 static void hv_balloon_remove_response_handle_range(HvBalloon *balloon,
734                                                     PageRange *range,
735                                                     bool both,
736                                                     uint64_t *removedctr)
737 {
738     OurRange *our_range = OUR_RANGE(balloon->our_range);
739     PageRangeTree globaltree =
740         both ? balloon->removed_both : balloon->removed_guest;
741     uint64_t *globalctr =
742         both ? &balloon->removed_both_ctr : &balloon->removed_guest_ctr;
743     PageRange rangeeff;
744 
745     if (range->count == 0) {
746         return;
747     }
748 
749     trace_hv_balloon_remove_response(range->count, range->start, both);
750 
751     if (our_range) {
752         /* Includes the not-yet-hot-added and unusable parts. */
753         rangeeff = our_range->range;
754     } else {
755         rangeeff.start = rangeeff.count = 0;
756     }
757 
758     if (page_range_intersection_size(range, rangeeff.start, rangeeff.count) > 0) {
759         PageRangeTree ourtree = our_range_get_removed_tree(our_range, both);
760         PageRange rangehole, rangecommon;
761         uint64_t ourremoved = 0;
762 
763         /* process the hole before our range, if it exists */
764         page_range_part_before(range, rangeeff.start, &rangehole);
765         hv_balloon_remove_response_insert_range(globaltree, &rangehole,
766                                                 globalctr, removedctr, NULL);
767         if (rangehole.count > 0) {
768             trace_hv_balloon_remove_response_hole(rangehole.count,
769                                                   rangehole.start,
770                                                   range->count, range->start,
771                                                   rangeeff.start, both);
772         }
773 
774         /* process our part */
775         page_range_intersect(range, rangeeff.start, rangeeff.count,
776                              &rangecommon);
777         hv_balloon_remove_response_insert_range(ourtree, &rangecommon,
778                                                 globalctr, removedctr,
779                                                 &ourremoved);
780         if (rangecommon.count > 0) {
781             trace_hv_balloon_remove_response_common(rangecommon.count,
782                                                     rangecommon.start,
783                                                     range->count, range->start,
784                                                     rangeeff.count,
785                                                     rangeeff.start, ourremoved,
786                                                     both);
787         }
788 
789         /* calculate what's left after our range */
790         rangecommon = *range;
791         page_range_part_after(&rangecommon, rangeeff.start, rangeeff.count,
792                               range);
793     }
794 
795     /* process the remainder of the range that lies after our range */
796     if (range->count > 0) {
797         hv_balloon_remove_response_insert_range(globaltree, range,
798                                                 globalctr, removedctr, NULL);
799         trace_hv_balloon_remove_response_remainder(range->count, range->start,
800                                                    both);
801         range->count = 0;
802     }
803 }
804 
805 static void hv_balloon_remove_response_handle_pages(HvBalloon *balloon,
806                                                     PageRange *range,
807                                                     uint64_t start,
808                                                     uint64_t count,
809                                                     bool both,
810                                                     uint64_t *removedctr)
811 {
812     assert(count > 0);
813 
814     /*
815      * if there is an existing range that the new range can't be joined to
816      * dump it into tree(s)
817      */
818     if (range->count > 0 && !page_range_joinable(range, start, count)) {
819         hv_balloon_remove_response_handle_range(balloon, range, both,
820                                                 removedctr);
821     }
822 
823     if (range->count == 0) {
824         range->start = start;
825         range->count = count;
826     } else if (page_range_joinable_left(range, start, count)) {
827         range->start = start;
828         range->count += count;
829     } else { /* page_range_joinable_right() */
830         range->count += count;
831     }
832 }
833 
834 static gboolean hv_balloon_handle_remove_host_addr_node(gpointer key,
835                                                         gpointer value,
836                                                         gpointer data)
837 {
838     PageRange *range = value;
839     uint64_t pageoff;
840 
841     for (pageoff = 0; pageoff < range->count; ) {
842         uint64_t addr_64 = (range->start + pageoff) * HV_BALLOON_PAGE_SIZE;
843         void *addr;
844         RAMBlock *rb;
845         ram_addr_t rb_offset;
846         size_t rb_page_size;
847         size_t discard_size;
848 
849         assert(addr_64 <= UINTPTR_MAX);
850         addr = (void *)((uintptr_t)addr_64);
851         rb = qemu_ram_block_from_host(addr, false, &rb_offset);
852         rb_page_size = qemu_ram_pagesize(rb);
853 
854         if (rb_page_size != HV_BALLOON_PAGE_SIZE) {
855             /* TODO: these should end in "removed_guest" */
856             warn_report("guest reported removed page backed by unsupported page size %zu",
857                         rb_page_size);
858             pageoff++;
859             continue;
860         }
861 
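        /*
         * Discard as many pages as possible in one go: bounded by both the
         * remainder of this range and the size of the containing RAM block,
         * but always at least one page to guarantee forward progress.
         */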
862         discard_size = MIN(range->count - pageoff,
863                            (rb->max_length - rb_offset) /
864                            HV_BALLOON_PAGE_SIZE);
865         discard_size = MAX(discard_size, 1);
866 
867         if (ram_block_discard_range(rb, rb_offset, discard_size *
868                                     HV_BALLOON_PAGE_SIZE) != 0) {
            warn_report("guest reported removed pages could not be discarded");
870         }
871 
872         pageoff += discard_size;
873     }
874 
875     return false;
876 }
877 
878 static void hv_balloon_handle_remove_host_addr_tree(PageRangeTree tree)
879 {
880     g_tree_foreach(tree.t, hv_balloon_handle_remove_host_addr_node, NULL);
881 }
882 
883 static int hv_balloon_handle_remove_section(PageRangeTree tree,
884                                             const MemoryRegionSection *section,
885                                             uint64_t count)
886 {
887     void *addr = memory_region_get_ram_ptr(section->mr) +
888         section->offset_within_region;
889     uint64_t addr_page;
890 
891     assert(count > 0);
892 
893     if ((uintptr_t)addr % HV_BALLOON_PAGE_SIZE) {
894         warn_report("guest reported removed pages at an unaligned host addr %p",
895                     addr);
896         return -EINVAL;
897     }
898 
899     addr_page = (uintptr_t)addr / HV_BALLOON_PAGE_SIZE;
900     hvb_page_range_tree_insert(tree, addr_page, count, NULL);
901 
902     return 0;
903 }
904 
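/*
 * Processes the page ranges the guest reports as ballooned out: each range
 * is validated against guest RAM, accounted in the matching removed-pages
 * tracking structures, and finally the backing host pages are discarded
 * where possible.
 */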
905 static void hv_balloon_handle_remove_ranges(HvBalloon *balloon,
906                                             union dm_mem_page_range ranges[],
907                                             uint32_t count)
908 {
909     uint64_t removedcnt;
910     PageRangeTree removed_host_addr;
911     PageRange range_guest, range_both;
912 
913     hvb_page_range_tree_init(&removed_host_addr);
914     range_guest.count = range_both.count = removedcnt = 0;
915     for (unsigned int ctr = 0; ctr < count; ctr++) {
916         union dm_mem_page_range *mr = &ranges[ctr];
917         hwaddr pa;
918         MemoryRegionSection section;
919 
920         for (unsigned int offset = 0; offset < mr->finfo.page_cnt; ) {
921             int ret;
922             uint64_t pageno = mr->finfo.start_page + offset;
923             uint64_t pagecnt = 1;
924 
925             pa = (hwaddr)pageno << HV_BALLOON_PFN_SHIFT;
926             section = memory_region_find(get_system_memory(), pa,
927                                          (mr->finfo.page_cnt - offset) *
928                                          HV_BALLOON_PAGE_SIZE);
929             if (!section.mr) {
930                 warn_report("guest reported removed page %"PRIu64" not found in RAM",
931                             pageno);
932                 ret = -EINVAL;
933                 goto finish_page;
934             }
935 
936             pagecnt = int128_get64(section.size) / HV_BALLOON_PAGE_SIZE;
            if (pagecnt == 0) {
938                 warn_report("guest reported removed page %"PRIu64" in a section smaller than page size",
939                             pageno);
940                 pagecnt = 1; /* skip the whole page */
941                 ret = -EINVAL;
942                 goto finish_page;
943             }
944 
945             if (!memory_region_is_ram(section.mr) ||
946                 memory_region_is_rom(section.mr) ||
947                 memory_region_is_romd(section.mr)) {
                warn_report("guest reported removed page %"PRIu64" in a section that is not ordinary RAM",
949                             pageno);
950                 ret = -EINVAL;
951                 goto finish_page;
952             }
953 
954             ret = hv_balloon_handle_remove_section(removed_host_addr, &section,
955                                                    pagecnt);
956 
957         finish_page:
958             if (ret == 0) {
959                 hv_balloon_remove_response_handle_pages(balloon,
960                                                         &range_both,
961                                                         pageno, pagecnt,
962                                                         true, &removedcnt);
963             } else {
964                 hv_balloon_remove_response_handle_pages(balloon,
965                                                         &range_guest,
966                                                         pageno, pagecnt,
967                                                         false, &removedcnt);
968             }
969 
970             if (section.mr) {
971                 memory_region_unref(section.mr);
972             }
973 
974             offset += pagecnt;
975         }
976     }
977 
978     hv_balloon_remove_response_handle_range(balloon, &range_both, true,
979                                             &removedcnt);
980     hv_balloon_remove_response_handle_range(balloon, &range_guest, false,
981                                             &removedcnt);
982 
983     hv_balloon_handle_remove_host_addr_tree(removed_host_addr);
984     hvb_page_range_tree_destroy(&removed_host_addr);
985 
986     if (removedcnt > balloon->balloon_diff) {
987         warn_report("guest reported more pages removed than currently pending (%"PRIu64" vs %"PRIu64")",
988                     removedcnt, balloon->balloon_diff);
989         balloon->balloon_diff = 0;
990     } else {
991         balloon->balloon_diff -= removedcnt;
992     }
993 }
994 
995 static bool hv_balloon_handle_msg_size(HvBalloonReq *req, size_t minsize,
996                                        const char *msgname)
997 {
998     VMBusChanReq *vmreq = &req->vmreq;
999     uint32_t msglen = vmreq->msglen;
1000 
1001     if (msglen >= minsize) {
1002         return true;
1003     }
1004 
1005     warn_report("%s message too short (%u vs %zu), ignoring", msgname,
1006                 (unsigned int)msglen, minsize);
1007     return false;
1008 }
1009 
1010 static void hv_balloon_handle_version_request(HvBalloon *balloon,
1011                                               HvBalloonReq *req,
1012                                               StateDesc *stdesc)
1013 {
1014     VMBusChanReq *vmreq = &req->vmreq;
1015     struct dm_version_request *msgVr = vmreq->msg;
1016     struct dm_version_response respVr;
1017 
1018     if (balloon->state != S_VERSION) {
1019         warn_report("unexpected DM_VERSION_REQUEST in %d state",
1020                     balloon->state);
1021         return;
1022     }
1023 
1024     if (!hv_balloon_handle_msg_size(req, sizeof(*msgVr),
1025                                     "DM_VERSION_REQUEST")) {
1026         return;
1027     }
1028 
1029     trace_hv_balloon_incoming_version(msgVr->version.major_version,
1030                                       msgVr->version.minor_version);
1031 
1032     memset(&respVr, 0, sizeof(respVr));
1033     respVr.hdr.type = DM_VERSION_RESPONSE;
1034     respVr.hdr.size = sizeof(respVr);
1035     respVr.hdr.trans_id = msgVr->hdr.trans_id;
1036     respVr.is_accepted = msgVr->version.version >= DYNMEM_PROTOCOL_VERSION_1 &&
1037         msgVr->version.version <= DYNMEM_PROTOCOL_VERSION_3;
1038 
1039     hv_balloon_send_packet(vmreq->chan, (struct dm_message *)&respVr);
1040 
1041     if (respVr.is_accepted) {
1042         HV_BALLOON_STATE_DESC_SET(stdesc, S_CAPS);
1043     }
1044 }
1045 
1046 static void hv_balloon_handle_caps_report(HvBalloon *balloon,
1047                                           HvBalloonReq *req,
1048                                           StateDesc *stdesc)
1049 {
1050     VMBusChanReq *vmreq = &req->vmreq;
1051     struct dm_capabilities *msgCap = vmreq->msg;
1052     struct dm_capabilities_resp_msg respCap;
1053 
1054     if (balloon->state != S_CAPS) {
1055         warn_report("unexpected DM_CAPABILITIES_REPORT in %d state",
1056                     balloon->state);
1057         return;
1058     }
1059 
1060     if (!hv_balloon_handle_msg_size(req, sizeof(*msgCap),
1061                                     "DM_CAPABILITIES_REPORT")) {
1062         return;
1063     }
1064 
1065     trace_hv_balloon_incoming_caps(msgCap->caps.caps);
1066     balloon->caps = msgCap->caps;
1067 
1068     memset(&respCap, 0, sizeof(respCap));
1069     respCap.hdr.type = DM_CAPABILITIES_RESPONSE;
1070     respCap.hdr.size = sizeof(respCap);
1071     respCap.hdr.trans_id = msgCap->hdr.trans_id;
1072     respCap.is_accepted = 1;
1073     respCap.hot_remove = 1;
1074     respCap.suppress_pressure_reports = !balloon->status_report.enabled;
1075     hv_balloon_send_packet(vmreq->chan, (struct dm_message *)&respCap);
1076 
1077     timer_mod(&balloon->post_init_timer,
1078               qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
1079               HV_BALLOON_POST_INIT_WAIT);
1080 
1081     HV_BALLOON_STATE_DESC_SET(stdesc, S_POST_INIT_WAIT);
1082 }
1083 
1084 static void hv_balloon_handle_status_report(HvBalloon *balloon,
1085                                             HvBalloonReq *req)
1086 {
1087     VMBusChanReq *vmreq = &req->vmreq;
1088     struct dm_status *msgStatus = vmreq->msg;
1089 
1090     if (!hv_balloon_handle_msg_size(req, sizeof(*msgStatus),
1091                                     "DM_STATUS_REPORT")) {
1092         return;
1093     }
1094 
1095     if (!balloon->status_report.enabled) {
1096         return;
1097     }
1098 
1099     balloon->status_report.committed = msgStatus->num_committed;
1100     balloon->status_report.committed *= HV_BALLOON_PAGE_SIZE;
1101     balloon->status_report.available = msgStatus->num_avail;
1102     balloon->status_report.available *= HV_BALLOON_PAGE_SIZE;
1103     balloon->status_report.received = true;
1104 
1105     /* report event */
1106 }
1107 
1108 static void hv_balloon_handle_unballoon_response(HvBalloon *balloon,
1109                                                  HvBalloonReq *req,
1110                                                  StateDesc *stdesc)
1111 {
1112     VMBusChanReq *vmreq = &req->vmreq;
1113     struct dm_unballoon_response *msgUrR = vmreq->msg;
1114 
1115     if (balloon->state != S_UNBALLOON_REPLY_WAIT) {
1116         warn_report("unexpected DM_UNBALLOON_RESPONSE in %d state",
1117                     balloon->state);
1118         return;
1119     }
1120 
    if (!hv_balloon_handle_msg_size(req, sizeof(*msgUrR),
                                    "DM_UNBALLOON_RESPONSE")) {
        return;
    }
1124 
1125     trace_hv_balloon_incoming_unballoon(msgUrR->hdr.trans_id);
1126 
1127     balloon->trans_id++;
1128 
1129     if (balloon->hot_add_diff > 0) {
1130         bool can_hot_add = balloon->caps.cap_bits.hot_add;
1131 
1132         assert(can_hot_add);
1133         HV_BALLOON_STATE_DESC_SET(stdesc, S_HOT_ADD_SETUP);
1134     } else {
1135         HV_BALLOON_STATE_DESC_SET(stdesc, S_IDLE);
1136     }
1137 }
1138 
1139 static void hv_balloon_handle_hot_add_response(HvBalloon *balloon,
1140                                                HvBalloonReq *req,
1141                                                StateDesc *stdesc)
1142 {
1143     PageRange *hot_add_range = &balloon->hot_add_range;
1144     VMBusChanReq *vmreq = &req->vmreq;
1145     struct dm_hot_add_response *msgHaR = vmreq->msg;
1146     OurRange *our_range;
1147 
1148     if (balloon->state != S_HOT_ADD_REPLY_WAIT) {
1149         warn_report("unexpected DM_HOT_ADD_RESPONSE in %d state",
1150                     balloon->state);
1151         return;
1152     }
1153 
1154     assert(balloon->our_range);
1155     our_range = OUR_RANGE(balloon->our_range);
1156 
    if (!hv_balloon_handle_msg_size(req, sizeof(*msgHaR),
                                    "DM_HOT_ADD_RESPONSE")) {
        return;
    }
1160 
1161     trace_hv_balloon_incoming_hot_add(msgHaR->hdr.trans_id, msgHaR->result,
1162                                       msgHaR->page_count);
1163 
1164     balloon->trans_id++;
1165 
1166     if (msgHaR->result) {
1167         if (msgHaR->page_count > balloon->ha_current_count) {
1168             warn_report("DM_HOT_ADD_RESPONSE page count higher than requested (%"PRIu32" vs %"PRIu64")",
1169                         msgHaR->page_count, balloon->ha_current_count);
1170             msgHaR->page_count = balloon->ha_current_count;
1171         }
1172 
1173         hvb_our_range_mark_added(our_range, msgHaR->page_count);
1174         hot_add_range->start += msgHaR->page_count;
1175         hot_add_range->count -= msgHaR->page_count;
1176     }
1177 
1178     if (!msgHaR->result || msgHaR->page_count < balloon->ha_current_count) {
1179         /*
1180          * the current planned range was only partially hot-added, take note
1181          * how much of it remains and don't attempt any further hot adds
1182          */
1183         our_range_mark_remaining_unusable(our_range);
1184 
1185         goto ret_idle;
1186     }
1187 
1188     /* any pages remaining to hot-add in our range? */
1189     if (hot_add_range->count > 0) {
1190         HV_BALLOON_STATE_DESC_SET(stdesc, S_HOT_ADD_RB_WAIT);
1191         return;
1192     }
1193 
1194 ret_idle:
1195     HV_BALLOON_STATE_DESC_SET(stdesc, S_IDLE);
1196 }
1197 
1198 static void hv_balloon_handle_balloon_response(HvBalloon *balloon,
1199                                                HvBalloonReq *req,
1200                                                StateDesc *stdesc)
1201 {
1202     VMBusChanReq *vmreq = &req->vmreq;
1203     struct dm_balloon_response *msgBR = vmreq->msg;
1204 
1205     if (balloon->state != S_BALLOON_REPLY_WAIT) {
1206         warn_report("unexpected DM_BALLOON_RESPONSE in %d state",
1207                     balloon->state);
1208         return;
1209     }
1210 
    if (!hv_balloon_handle_msg_size(req, sizeof(*msgBR),
                                    "DM_BALLOON_RESPONSE")) {
        return;
    }
1214 
1215     trace_hv_balloon_incoming_balloon(msgBR->hdr.trans_id, msgBR->range_count,
1216                                       msgBR->more_pages);
1217 
1218     if (vmreq->msglen < sizeof(*msgBR) +
1219         (uint64_t)sizeof(msgBR->range_array[0]) * msgBR->range_count) {
1220         warn_report("DM_BALLOON_RESPONSE too short for the range count");
1221         return;
1222     }
1223 
1224     if (msgBR->range_count == 0) {
1225         /* The guest is already at its minimum size */
1226         balloon->balloon_diff = 0;
1227         goto ret_end_trans;
1228     } else {
1229         hv_balloon_handle_remove_ranges(balloon,
1230                                         msgBR->range_array,
1231                                         msgBR->range_count);
1232     }
1233 
1234     /* More responses expected? */
1235     if (msgBR->more_pages) {
1236         return;
1237     }
1238 
1239 ret_end_trans:
1240     balloon->trans_id++;
1241 
1242     if (balloon->balloon_diff > 0) {
1243         HV_BALLOON_STATE_DESC_SET(stdesc, S_BALLOON_RB_WAIT);
1244     } else {
1245         HV_BALLOON_STATE_DESC_SET(stdesc, S_IDLE);
1246     }
1247 }
1248 
1249 static void hv_balloon_handle_packet(HvBalloon *balloon, HvBalloonReq *req,
1250                                      StateDesc *stdesc)
1251 {
1252     VMBusChanReq *vmreq = &req->vmreq;
1253     struct dm_message *msg = vmreq->msg;
1254 
1255     if (vmreq->msglen < sizeof(msg->hdr)) {
1256         return;
1257     }
1258 
1259     switch (msg->hdr.type) {
1260     case DM_VERSION_REQUEST:
1261         hv_balloon_handle_version_request(balloon, req, stdesc);
1262         break;
1263 
1264     case DM_CAPABILITIES_REPORT:
1265         hv_balloon_handle_caps_report(balloon, req, stdesc);
1266         break;
1267 
1268     case DM_STATUS_REPORT:
1269         hv_balloon_handle_status_report(balloon, req);
1270         break;
1271 
1272     case DM_MEM_HOT_ADD_RESPONSE:
1273         hv_balloon_handle_hot_add_response(balloon, req, stdesc);
1274         break;
1275 
1276     case DM_UNBALLOON_RESPONSE:
1277         hv_balloon_handle_unballoon_response(balloon, req, stdesc);
1278         break;
1279 
1280     case DM_BALLOON_RESPONSE:
1281         hv_balloon_handle_balloon_response(balloon, req, stdesc);
1282         break;
1283 
1284     default:
1285         warn_report("unknown DM message %u", msg->hdr.type);
1286         break;
1287     }
1288 }
1289 
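/*
 * Drains pending VMBus packets, stopping early once a handler requests a
 * state change; returns whether anything was read off the ring buffer.
 */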
1290 static bool hv_balloon_recv_channel(HvBalloon *balloon, StateDesc *stdesc)
1291 {
1292     VMBusChannel *chan;
1293     HvBalloonReq *req;
1294 
1295     if (balloon->state == S_WAIT_RESET ||
1296         balloon->state == S_POST_RESET_CLOSED) {
1297         return false;
1298     }
1299 
1300     chan = hv_balloon_get_channel(balloon);
1301     if (vmbus_channel_recv_start(chan)) {
1302         return false;
1303     }
1304 
1305     while ((req = vmbus_channel_recv_peek(chan, sizeof(*req)))) {
1306         hv_balloon_handle_packet(balloon, req, stdesc);
1307         vmbus_free_req(req);
1308         vmbus_channel_recv_pop(chan);
1309 
1310         if (stdesc->state != S_NO_CHANGE) {
1311             break;
1312         }
1313     }
1314 
1315     return vmbus_channel_recv_done(chan) > 0;
1316 }
1317 
1318 /* old state handler -> new state transition (potential) */
1319 static bool hv_balloon_event_loop_state(HvBalloon *balloon)
1320 {
1321     StateDesc state_new = HV_BALLOON_STATE_DESC_INIT;
1322 
1323     hv_balloon_handle_state(balloon, &state_new);
1324     return hv_balloon_state_set(balloon, state_new.state, state_new.desc);
1325 }
1326 
1327 /* VMBus message -> new state transition (potential) */
1328 static bool hv_balloon_event_loop_recv(HvBalloon *balloon)
1329 {
1330     StateDesc state_new = HV_BALLOON_STATE_DESC_INIT;
1331     bool any_recv, state_changed;
1332 
1333     any_recv = hv_balloon_recv_channel(balloon, &state_new);
1334     state_changed = hv_balloon_state_set(balloon,
1335                                          state_new.state, state_new.desc);
1336 
1337     return state_changed || any_recv;
1338 }
1339 
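/*
 * Alternates between state processing and packet reception until neither
 * causes a further state transition and no more packets arrive, so the
 * device always settles in a stable state.
 */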
1340 static void hv_balloon_event_loop(HvBalloon *balloon)
1341 {
1342     bool state_repeat, recv_repeat;
1343 
1344     do {
1345         state_repeat = hv_balloon_event_loop_state(balloon);
1346         recv_repeat = hv_balloon_event_loop_recv(balloon);
1347     } while (state_repeat || recv_repeat);
1348 }
1349 
1350 static void hv_balloon_vmdev_chan_notify(VMBusChannel *chan)
1351 {
1352     HvBalloon *balloon = HV_BALLOON(vmbus_channel_device(chan));
1353 
1354     hv_balloon_event_loop(balloon);
1355 }
1356 
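/*
 * Only pages removed in both the guest and host views (removed_both_ctr)
 * shrink the size reported as "actual" here, presumably because pages
 * tracked only in "removed_guest" still consume host memory.
 */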
1357 static void hv_balloon_stat(void *opaque, BalloonInfo *info)
1358 {
1359     HvBalloon *balloon = opaque;
1360     info->actual = (hv_balloon_total_ram(balloon) - balloon->removed_both_ctr)
1361         << HV_BALLOON_PFN_SHIFT;
1362 }
1363 
1364 static void hv_balloon_to_target(void *opaque, ram_addr_t target)
1365 {
1366     HvBalloon *balloon = opaque;
1367     uint64_t target_pages = target >> HV_BALLOON_PFN_SHIFT;
1368 
1369     if (!target_pages) {
1370         return;
1371     }
1372 
1373     /*
1374      * always set target_changed, even with unchanged target, as the user
1375      * might be asking us to try again reaching it
1376      */
1377     balloon->target = target_pages;
1378     balloon->target_changed = true;
1379 
1380     hv_balloon_event_loop(balloon);
1381 }
1382 
1383 static int hv_balloon_vmdev_open_channel(VMBusChannel *chan)
1384 {
1385     HvBalloon *balloon = HV_BALLOON(vmbus_channel_device(chan));
1386 
1387     if (balloon->state != S_POST_RESET_CLOSED) {
1388         warn_report("guest trying to open a DM channel in invalid %d state",
1389                     balloon->state);
1390         return -EINVAL;
1391     }
1392 
1393     HV_BALLOON_SET_STATE(balloon, S_VERSION);
1394     hv_balloon_event_loop(balloon);
1395 
1396     return 0;
1397 }
1398 
1399 static void hv_balloon_vmdev_close_channel(VMBusChannel *chan)
1400 {
1401     HvBalloon *balloon = HV_BALLOON(vmbus_channel_device(chan));
1402 
1403     timer_del(&balloon->post_init_timer);
1404 
1405     /* Don't report stale data */
1406     balloon->status_report.received = false;
1407 
1408     HV_BALLOON_SET_STATE(balloon, S_WAIT_RESET);
1409     hv_balloon_event_loop(balloon);
1410 }
1411 
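/*
 * Fires HV_BALLOON_POST_INIT_WAIT ms after the capabilities negotiation,
 * moving the device to S_IDLE so that DM requests are not sent to the
 * guest too soon after boot (see the comment at the
 * HV_BALLOON_POST_INIT_WAIT definition above).
 */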
1412 static void hv_balloon_post_init_timer(void *opaque)
1413 {
1414     HvBalloon *balloon = opaque;
1415 
1416     if (balloon->state != S_POST_INIT_WAIT) {
1417         return;
1418     }
1419 
1420     HV_BALLOON_SET_STATE(balloon, S_IDLE);
1421     hv_balloon_event_loop(balloon);
1422 }
1423 
1424 static void hv_balloon_system_reset_unrealize_common(HvBalloon *balloon)
1425 {
1426     g_clear_pointer(&balloon->our_range, hvb_our_range_memslots_free);
1427 }
1428 
1429 static void hv_balloon_system_reset(void *opaque)
1430 {
1431     HvBalloon *balloon = HV_BALLOON(opaque);
1432 
1433     hv_balloon_system_reset_unrealize_common(balloon);
1434 }
1435 
1436 static void hv_balloon_ensure_mr(HvBalloon *balloon)
1437 {
1438     MemoryRegion *hostmem_mr;
1439 
1440     assert(balloon->hostmem);
1441 
1442     if (balloon->mr) {
1443         return;
1444     }
1445 
1446     hostmem_mr = host_memory_backend_get_memory(balloon->hostmem);
1447 
1448     balloon->mr = g_new0(MemoryRegion, 1);
1449     memory_region_init(balloon->mr, OBJECT(balloon), TYPE_HV_BALLOON,
1450                        memory_region_size(hostmem_mr));
1451 
1452     /*
1453      * The VM can indicate an alignment up to 32 GiB. Memory device core can
1454      * usually only handle/guarantee 1 GiB alignment. The user will have to
1455      * specify a larger maxmem eventually.
1456      *
1457      * The memory device core will warn the user in case maxmem might have to be
1458      * increased and will fail plugging the device if there is not sufficient
1459      * space after alignment.
1460      *
     * TODO: we could do the alignment ourselves in a slightly bigger region.
     * But this feels better, although the warning might be annoying. Maybe
     * we can optimize that in the future (e.g., with such a device on the
     * cmdline, place/size the device memory region differently).
1465      */
1466     balloon->mr->align = MAX(32 * GiB, memory_region_get_alignment(hostmem_mr));
1467 }
1468 
1469 static void hv_balloon_free_mr(HvBalloon *balloon)
1470 {
1471     if (!balloon->mr) {
1472         return;
1473     }
1474 
1475     object_unparent(OBJECT(balloon->mr));
1476     g_clear_pointer(&balloon->mr, g_free);
1477 }
1478 
1479 static void hv_balloon_vmdev_realize(VMBusDevice *vdev, Error **errp)
1480 {
1481     ERRP_GUARD();
1482     HvBalloon *balloon = HV_BALLOON(vdev);
1483     int ret;
1484 
1485     balloon->state = S_WAIT_RESET;
1486 
1487     ret = qemu_add_balloon_handler(hv_balloon_to_target, hv_balloon_stat,
1488                                    balloon);
1489     if (ret < 0) {
1490         /* This also protects against having multiple hv-balloon instances */
1491         error_setg(errp, "Only one balloon device is supported");
1492         return;
1493     }
1494 
1495     if (balloon->hostmem) {
1496         if (host_memory_backend_is_mapped(balloon->hostmem)) {
1497             Object *obj = OBJECT(balloon->hostmem);
1498 
1499             error_setg(errp, "'%s' property specifies a busy memdev: %s",
1500                        HV_BALLOON_MEMDEV_PROP,
1501                        object_get_canonical_path_component(obj));
1502             goto out_balloon_handler;
1503         }
1504 
1505         hv_balloon_ensure_mr(balloon);
1506 
1507         /* This is rather unlikely to happen, but let's still check for it. */
1508         if (!QEMU_IS_ALIGNED(memory_region_size(balloon->mr),
1509                              HV_BALLOON_PAGE_SIZE)) {
1510             error_setg(errp, "'%s' property memdev size has to be a multiple of 0x%" PRIx64,
1511                        HV_BALLOON_MEMDEV_PROP, (uint64_t)HV_BALLOON_PAGE_SIZE);
1512             goto out_balloon_handler;
1513         }
1514 
1515         host_memory_backend_set_mapped(balloon->hostmem, true);
1516         vmstate_register_ram(host_memory_backend_get_memory(balloon->hostmem),
1517                              DEVICE(balloon));
1518     } else if (balloon->addr) {
1519         error_setg(errp, "'%s' property must not be set without a memdev",
1520                    HV_BALLOON_MEMDEV_PROP);
1521         goto out_balloon_handler;
1522     }
1523 
1524     timer_init_ms(&balloon->post_init_timer, QEMU_CLOCK_VIRTUAL,
1525                   hv_balloon_post_init_timer, balloon);
1526 
1527     qemu_register_reset(hv_balloon_system_reset, balloon);
1528 
1529     return;
1530 
1531 out_balloon_handler:
1532     qemu_remove_balloon_handler(balloon);
1533 }
1534 
1535 /*
1536  * VMBus device reset has to be implemented in case the guest decides to
1537  * disconnect and reconnect to the VMBus without rebooting the whole system.
1538  *
1539  * However, the hot-added memory can't be removed here as Windows keeps on using
1540  * it until the system is restarted, even after disconnecting from the VMBus.
1541  */
1542 static void hv_balloon_vmdev_reset(VMBusDevice *vdev)
1543 {
1544     HvBalloon *balloon = HV_BALLOON(vdev);
1545 
1546     if (balloon->state == S_POST_RESET_CLOSED) {
1547         return;
1548     }
1549 
1550     if (balloon->our_range) {
1551         hvb_our_range_clear_removed_trees(OUR_RANGE(balloon->our_range));
1552     }
1553 
1554     hvb_page_range_tree_destroy(&balloon->removed_guest);
1555     hvb_page_range_tree_destroy(&balloon->removed_both);
1556     hvb_page_range_tree_init(&balloon->removed_guest);
1557     hvb_page_range_tree_init(&balloon->removed_both);
1558 
1559     balloon->trans_id = 0;
1560     balloon->removed_guest_ctr = 0;
1561     balloon->removed_both_ctr = 0;
1562 
1563     HV_BALLOON_SET_STATE(balloon, S_POST_RESET_CLOSED);
1564     hv_balloon_event_loop(balloon);
1565 }
1566 
1567 /*
1568  * Clean up things that were (possibly) allocated pre-realization, for example
1569  * from memory_device_pre_plug(), so we don't leak them if the device don't
1570  * actually get realized in the end.
1571  */
1572 static void hv_balloon_unrealize_finalize_common(HvBalloon *balloon)
1573 {
1574     hv_balloon_free_mr(balloon);
1575     balloon->addr = 0;
1576 
1577     balloon->memslot_count = 0;
1578 }
1579 
1580 static void hv_balloon_vmdev_unrealize(VMBusDevice *vdev)
1581 {
1582     HvBalloon *balloon = HV_BALLOON(vdev);
1583 
1584     qemu_unregister_reset(hv_balloon_system_reset, balloon);
1585 
1586     hv_balloon_system_reset_unrealize_common(balloon);
1587 
1588     qemu_remove_balloon_handler(balloon);
1589 
1590     if (balloon->hostmem) {
1591         vmstate_unregister_ram(host_memory_backend_get_memory(balloon->hostmem),
1592                                DEVICE(balloon));
1593         host_memory_backend_set_mapped(balloon->hostmem, false);
1594     }
1595 
1596     hvb_page_range_tree_destroy(&balloon->removed_guest);
1597     hvb_page_range_tree_destroy(&balloon->removed_both);
1598 
1599     hv_balloon_unrealize_finalize_common(balloon);
1600 }
1601 
1602 static uint64_t hv_balloon_md_get_addr(const MemoryDeviceState *md)
1603 {
1604     return object_property_get_uint(OBJECT(md), HV_BALLOON_ADDR_PROP,
1605                                     &error_abort);
1606 }
1607 
1608 static void hv_balloon_md_set_addr(MemoryDeviceState *md, uint64_t addr,
1609                                    Error **errp)
1610 {
1611     object_property_set_uint(OBJECT(md), HV_BALLOON_ADDR_PROP, addr, errp);
1612 }
1613 
1614 static MemoryRegion *hv_balloon_md_get_memory_region(MemoryDeviceState *md,
1615                                                      Error **errp)
1616 {
1617     HvBalloon *balloon = HV_BALLOON(md);
1618 
1619     if (!balloon->hostmem) {
1620         return NULL;
1621     }
1622 
1623     hv_balloon_ensure_mr(balloon);
1624 
1625     return balloon->mr;
1626 }
1627 
1628 static void hv_balloon_md_fill_device_info(const MemoryDeviceState *md,
1629                                            MemoryDeviceInfo *info)
1630 {
1631     HvBalloonDeviceInfo *hi = g_new0(HvBalloonDeviceInfo, 1);
1632     const HvBalloon *balloon = HV_BALLOON(md);
1633     DeviceState *dev = DEVICE(md);
1634 
1635     if (dev->id) {
1636         hi->id = g_strdup(dev->id);
1637     }
1638 
1639     if (balloon->hostmem) {
1640         hi->memdev = object_get_canonical_path(OBJECT(balloon->hostmem));
1641         hi->memaddr = balloon->addr;
1642         hi->has_memaddr = true;
1643         hi->max_size = memory_region_size(balloon->mr);
1644         /* TODO: expose current provided size or something else? */
1645     } else {
1646         hi->max_size = 0;
1647     }
1648 
1649     info->u.hv_balloon.data = hi;
1650     info->type = MEMORY_DEVICE_INFO_KIND_HV_BALLOON;
1651 }
1652 
1653 static void hv_balloon_decide_memslots(MemoryDeviceState *md,
1654                                        unsigned int limit)
1655 {
1656     HvBalloon *balloon = HV_BALLOON(md);
1657     MemoryRegion *hostmem_mr;
1658     uint64_t region_size, memslot_size, memslots;
1659 
1660     /* We're called exactly once, before realizing the device. */
1661     assert(!balloon->memslot_count);
1662 
1663     /* We should not be called if we don't have a memory backend */
1664     assert(balloon->hostmem);
1665 
1666     hostmem_mr = host_memory_backend_get_memory(balloon->hostmem);
1667     region_size = memory_region_size(hostmem_mr);
1668 
1669     assert(region_size > 0);
1670     memslot_size = QEMU_ALIGN_UP(region_size / limit,
1671                                  HV_BALLOON_HA_MEMSLOT_SIZE_ALIGN);
1672     memslots = QEMU_ALIGN_UP(region_size, memslot_size) / memslot_size;
1673 
1674     if (memslots > 1) {
1675         balloon->memslot_size = memslot_size;
1676     } else {
1677         balloon->memslot_size = region_size;
1678     }
1679 
1680     assert(memslots <= UINT_MAX);
1681     balloon->memslot_count = memslots;
1682 }
1683 
1684 static unsigned int hv_balloon_get_memslots(MemoryDeviceState *md)
1685 {
1686     const HvBalloon *balloon = HV_BALLOON(md);
1687 
1688     /* We're called after setting the suggested limit. */
1689     assert(balloon->memslot_count > 0);
1690 
1691     return balloon->memslot_count;
1692 }
1693 
1694 static void hv_balloon_init(Object *obj)
1695 {
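    /*
     * Nothing to do here, but OBJECT_DEFINE_TYPE_WITH_INTERFACES() expects
     * this instance init function to be present.
     */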
1696 }
1697 
1698 static void hv_balloon_finalize(Object *obj)
1699 {
1700     HvBalloon *balloon = HV_BALLOON(obj);
1701 
1702     hv_balloon_unrealize_finalize_common(balloon);
1703 }
1704 
1705 static Property hv_balloon_properties[] = {
1706     DEFINE_PROP_BOOL("status-report", HvBalloon,
1707                      status_report.enabled, false),
1708 
1709     /* MEMORY_DEVICE props */
1710     DEFINE_PROP_LINK(HV_BALLOON_MEMDEV_PROP, HvBalloon, hostmem,
1711                      TYPE_MEMORY_BACKEND, HostMemoryBackend *),
1712     DEFINE_PROP_UINT64(HV_BALLOON_ADDR_PROP, HvBalloon, addr, 0),
1713 
1714     DEFINE_PROP_END_OF_LIST(),
1715 };
1716 
1717 static void hv_balloon_class_init(ObjectClass *klass, void *data)
1718 {
1719     DeviceClass *dc = DEVICE_CLASS(klass);
1720     VMBusDeviceClass *vdc = VMBUS_DEVICE_CLASS(klass);
1721     MemoryDeviceClass *mdc = MEMORY_DEVICE_CLASS(klass);
1722 
1723     device_class_set_props(dc, hv_balloon_properties);
1724     qemu_uuid_parse(HV_BALLOON_GUID, &vdc->classid);
1725     set_bit(DEVICE_CATEGORY_MISC, dc->categories);
1726 
1727     vdc->vmdev_realize = hv_balloon_vmdev_realize;
1728     vdc->vmdev_unrealize = hv_balloon_vmdev_unrealize;
1729     vdc->vmdev_reset = hv_balloon_vmdev_reset;
1730     vdc->open_channel = hv_balloon_vmdev_open_channel;
1731     vdc->close_channel = hv_balloon_vmdev_close_channel;
1732     vdc->chan_notify_cb = hv_balloon_vmdev_chan_notify;
1733 
1734     mdc->get_addr = hv_balloon_md_get_addr;
1735     mdc->set_addr = hv_balloon_md_set_addr;
1736     mdc->get_plugged_size = memory_device_get_region_size;
1737     mdc->get_memory_region = hv_balloon_md_get_memory_region;
1738     mdc->decide_memslots = hv_balloon_decide_memslots;
1739     mdc->get_memslots = hv_balloon_get_memslots;
1740     mdc->fill_device_info = hv_balloon_md_fill_device_info;
1741 }
1742