xref: /openbmc/qemu/hw/virtio/virtio-mem.c (revision 623d7e3551a6fc5693c06ea938c60fe281b52e27)
1 /*
2  * Virtio MEM device
3  *
4  * Copyright (C) 2020 Red Hat, Inc.
5  *
6  * Authors:
7  *  David Hildenbrand <david@redhat.com>
8  *
9  * This work is licensed under the terms of the GNU GPL, version 2.
10  * See the COPYING file in the top-level directory.
11  */
12 
13 #include "qemu/osdep.h"
14 #include "qemu/iov.h"
15 #include "qemu/cutils.h"
16 #include "qemu/error-report.h"
17 #include "qemu/units.h"
18 #include "sysemu/numa.h"
19 #include "sysemu/sysemu.h"
20 #include "sysemu/reset.h"
21 #include "hw/virtio/virtio.h"
22 #include "hw/virtio/virtio-bus.h"
23 #include "hw/virtio/virtio-mem.h"
24 #include "qapi/error.h"
25 #include "qapi/visitor.h"
26 #include "exec/ram_addr.h"
27 #include "migration/misc.h"
28 #include "hw/boards.h"
29 #include "hw/qdev-properties.h"
30 #include CONFIG_DEVICES
31 #include "trace.h"
32 
33 static const VMStateDescription vmstate_virtio_mem_device_early;
34 
35 /*
36  * We only had legacy x86 guests that did not support
37  * VIRTIO_MEM_F_UNPLUGGED_INACCESSIBLE. Other targets don't have legacy guests.
38  */
39 #if defined(TARGET_X86_64) || defined(TARGET_I386)
40 #define VIRTIO_MEM_HAS_LEGACY_GUESTS
41 #endif
42 
43 /*
44  * Let's not allow blocks smaller than 1 MiB, for example, to keep the tracking
45  * bitmap small.
46  */
47 #define VIRTIO_MEM_MIN_BLOCK_SIZE ((uint32_t)(1 * MiB))
48 
49 static uint32_t virtio_mem_default_thp_size(void)
50 {
51     uint32_t default_thp_size = VIRTIO_MEM_MIN_BLOCK_SIZE;
52 
53 #if defined(__x86_64__) || defined(__arm__) || defined(__powerpc64__)
54     default_thp_size = 2 * MiB;
55 #elif defined(__aarch64__)
56     if (qemu_real_host_page_size() == 4 * KiB) {
57         default_thp_size = 2 * MiB;
58     } else if (qemu_real_host_page_size() == 16 * KiB) {
59         default_thp_size = 32 * MiB;
60     } else if (qemu_real_host_page_size() == 64 * KiB) {
61         default_thp_size = 512 * MiB;
62     }
63 #endif
64 
65     return default_thp_size;
66 }
67 
68 /*
69  * We want to have a reasonable default block size such that
70  * 1. We avoid splitting THPs when unplugging memory, which degrades
71  *    performance.
72  * 2. We avoid placing THPs for plugged blocks that also cover unplugged
73  *    blocks.
74  *
75  * The actual THP size might differ between Linux kernels, so we try to probe
76  * it. In the future (if we ever run into issues regarding 2.), we might want
77  * to disable THP in case we fail to properly probe the THP size, or if the
78  * block size is configured smaller than the THP size.
79  */
80 static uint32_t thp_size;
81 
82 #define HPAGE_PMD_SIZE_PATH "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size"
83 static uint32_t virtio_mem_thp_size(void)
84 {
85     gchar *content = NULL;
86     const char *endptr;
87     uint64_t tmp;
88 
89     if (thp_size) {
90         return thp_size;
91     }
92 
93     /*
94      * Try to probe the actual THP size, fallback to (sane but eventually
95      * incorrect) default sizes.
96      */
97     if (g_file_get_contents(HPAGE_PMD_SIZE_PATH, &content, NULL, NULL) &&
98         !qemu_strtou64(content, &endptr, 0, &tmp) &&
99         (!endptr || *endptr == '\n')) {
100         /* Sanity-check the value and fallback to something reasonable. */
101         if (!tmp || !is_power_of_2(tmp)) {
102             warn_report("Read unsupported THP size: %" PRIx64, tmp);
103         } else {
104             thp_size = tmp;
105         }
106     }
107 
108     if (!thp_size) {
109         thp_size = virtio_mem_default_thp_size();
110         warn_report("Could not detect THP size, falling back to %" PRIx64
111                     "  MiB.", thp_size / MiB);
112     }
113 
114     g_free(content);
115     return thp_size;
116 }
117 
118 static uint64_t virtio_mem_default_block_size(RAMBlock *rb)
119 {
120     const uint64_t page_size = qemu_ram_pagesize(rb);
121 
122     /* We can have hugetlbfs with a page size smaller than the THP size. */
123     if (page_size == qemu_real_host_page_size()) {
124         return MAX(page_size, virtio_mem_thp_size());
125     }
126     return MAX(page_size, VIRTIO_MEM_MIN_BLOCK_SIZE);
127 }
128 
#if defined(VIRTIO_MEM_HAS_LEGACY_GUESTS)
/*
 * Tell whether reading unplugged memory of @rb is guaranteed to hit the
 * shared zeropage. That is only the case for private (not shared) RAM that
 * is not backed by a file descriptor and uses the real host page size; with
 * anything else, a read *can* populate a fresh page and consume memory.
 */
static bool virtio_mem_has_shared_zeropage(RAMBlock *rb)
{
    if (qemu_ram_is_shared(rb)) {
        return false;
    }
    if (qemu_ram_get_fd(rb) >= 0) {
        return false;
    }
    return qemu_ram_pagesize(rb) == qemu_real_host_page_size();
}
#endif /* VIRTIO_MEM_HAS_LEGACY_GUESTS */
141 
142 /*
143  * Size the usable region bigger than the requested size if possible. Esp.
144  * Linux guests will only add (aligned) memory blocks in case they fully
145  * fit into the usable region, but plug+online only a subset of the pages.
146  * The memory block size corresponds mostly to the section size.
147  *
148  * This allows e.g., to add 20MB with a section size of 128MB on x86_64, and
149  * a section size of 512MB on arm64 (as long as the start address is properly
150  * aligned, similar to ordinary DIMMs).
151  *
152  * We can change this at any time and maybe even make it configurable if
153  * necessary (as the section size can change). But it's more likely that the
154  * section size will rather get smaller and not bigger over time.
155  */
156 #if defined(TARGET_X86_64) || defined(TARGET_I386)
157 #define VIRTIO_MEM_USABLE_EXTENT (2 * (128 * MiB))
158 #elif defined(TARGET_ARM)
159 #define VIRTIO_MEM_USABLE_EXTENT (2 * (512 * MiB))
160 #else
161 #error VIRTIO_MEM_USABLE_EXTENT not defined
162 #endif
163 
/*
 * Tell whether plug/unplug requests have to be refused right now because of
 * migration.
 *
 * Postcopy cannot cope with concurrent discards, and plugging blocks during
 * postcopy could migrate pages on demand with stale content.
 *
 * During precopy, unplugged blocks must stay out of the migration stream;
 * blocks plugged meanwhile could expose differing page content between
 * source and destination (visible to a guest that does not initialize pages
 * after plugging) until we run on the destination, because those blocks were
 * not migrated while unplugged.
 */
static bool virtio_mem_is_busy(void)
{
    if (migration_in_incoming_postcopy()) {
        return true;
    }
    return !migration_is_idle();
}
178 
179 typedef int (*virtio_mem_range_cb)(const VirtIOMEM *vmem, void *arg,
180                                    uint64_t offset, uint64_t size);
181 
182 static int virtio_mem_for_each_unplugged_range(const VirtIOMEM *vmem, void *arg,
183                                                virtio_mem_range_cb cb)
184 {
185     unsigned long first_zero_bit, last_zero_bit;
186     uint64_t offset, size;
187     int ret = 0;
188 
189     first_zero_bit = find_first_zero_bit(vmem->bitmap, vmem->bitmap_size);
190     while (first_zero_bit < vmem->bitmap_size) {
191         offset = first_zero_bit * vmem->block_size;
192         last_zero_bit = find_next_bit(vmem->bitmap, vmem->bitmap_size,
193                                       first_zero_bit + 1) - 1;
194         size = (last_zero_bit - first_zero_bit + 1) * vmem->block_size;
195 
196         ret = cb(vmem, arg, offset, size);
197         if (ret) {
198             break;
199         }
200         first_zero_bit = find_next_zero_bit(vmem->bitmap, vmem->bitmap_size,
201                                             last_zero_bit + 2);
202     }
203     return ret;
204 }
205 
206 static int virtio_mem_for_each_plugged_range(const VirtIOMEM *vmem, void *arg,
207                                              virtio_mem_range_cb cb)
208 {
209     unsigned long first_bit, last_bit;
210     uint64_t offset, size;
211     int ret = 0;
212 
213     first_bit = find_first_bit(vmem->bitmap, vmem->bitmap_size);
214     while (first_bit < vmem->bitmap_size) {
215         offset = first_bit * vmem->block_size;
216         last_bit = find_next_zero_bit(vmem->bitmap, vmem->bitmap_size,
217                                       first_bit + 1) - 1;
218         size = (last_bit - first_bit + 1) * vmem->block_size;
219 
220         ret = cb(vmem, arg, offset, size);
221         if (ret) {
222             break;
223         }
224         first_bit = find_next_bit(vmem->bitmap, vmem->bitmap_size,
225                                   last_bit + 2);
226     }
227     return ret;
228 }
229 
230 /*
231  * Adjust the memory section to cover the intersection with the given range.
232  *
233  * Returns false if the intersection is empty, otherwise returns true.
234  */
235 static bool virtio_mem_intersect_memory_section(MemoryRegionSection *s,
236                                                 uint64_t offset, uint64_t size)
237 {
238     uint64_t start = MAX(s->offset_within_region, offset);
239     uint64_t end = MIN(s->offset_within_region + int128_get64(s->size),
240                        offset + size);
241 
242     if (end <= start) {
243         return false;
244     }
245 
246     s->offset_within_address_space += start - s->offset_within_region;
247     s->offset_within_region = start;
248     s->size = int128_make64(end - start);
249     return true;
250 }
251 
252 typedef int (*virtio_mem_section_cb)(MemoryRegionSection *s, void *arg);
253 
254 static int virtio_mem_for_each_plugged_section(const VirtIOMEM *vmem,
255                                                MemoryRegionSection *s,
256                                                void *arg,
257                                                virtio_mem_section_cb cb)
258 {
259     unsigned long first_bit, last_bit;
260     uint64_t offset, size;
261     int ret = 0;
262 
263     first_bit = s->offset_within_region / vmem->block_size;
264     first_bit = find_next_bit(vmem->bitmap, vmem->bitmap_size, first_bit);
265     while (first_bit < vmem->bitmap_size) {
266         MemoryRegionSection tmp = *s;
267 
268         offset = first_bit * vmem->block_size;
269         last_bit = find_next_zero_bit(vmem->bitmap, vmem->bitmap_size,
270                                       first_bit + 1) - 1;
271         size = (last_bit - first_bit + 1) * vmem->block_size;
272 
273         if (!virtio_mem_intersect_memory_section(&tmp, offset, size)) {
274             break;
275         }
276         ret = cb(&tmp, arg);
277         if (ret) {
278             break;
279         }
280         first_bit = find_next_bit(vmem->bitmap, vmem->bitmap_size,
281                                   last_bit + 2);
282     }
283     return ret;
284 }
285 
286 static int virtio_mem_for_each_unplugged_section(const VirtIOMEM *vmem,
287                                                  MemoryRegionSection *s,
288                                                  void *arg,
289                                                  virtio_mem_section_cb cb)
290 {
291     unsigned long first_bit, last_bit;
292     uint64_t offset, size;
293     int ret = 0;
294 
295     first_bit = s->offset_within_region / vmem->block_size;
296     first_bit = find_next_zero_bit(vmem->bitmap, vmem->bitmap_size, first_bit);
297     while (first_bit < vmem->bitmap_size) {
298         MemoryRegionSection tmp = *s;
299 
300         offset = first_bit * vmem->block_size;
301         last_bit = find_next_bit(vmem->bitmap, vmem->bitmap_size,
302                                  first_bit + 1) - 1;
303         size = (last_bit - first_bit + 1) * vmem->block_size;
304 
305         if (!virtio_mem_intersect_memory_section(&tmp, offset, size)) {
306             break;
307         }
308         ret = cb(&tmp, arg);
309         if (ret) {
310             break;
311         }
312         first_bit = find_next_zero_bit(vmem->bitmap, vmem->bitmap_size,
313                                        last_bit + 2);
314     }
315     return ret;
316 }
317 
318 static int virtio_mem_notify_populate_cb(MemoryRegionSection *s, void *arg)
319 {
320     RamDiscardListener *rdl = arg;
321 
322     return rdl->notify_populate(rdl, s);
323 }
324 
325 static int virtio_mem_notify_discard_cb(MemoryRegionSection *s, void *arg)
326 {
327     RamDiscardListener *rdl = arg;
328 
329     rdl->notify_discard(rdl, s);
330     return 0;
331 }
332 
333 static void virtio_mem_notify_unplug(VirtIOMEM *vmem, uint64_t offset,
334                                      uint64_t size)
335 {
336     RamDiscardListener *rdl;
337 
338     QLIST_FOREACH(rdl, &vmem->rdl_list, next) {
339         MemoryRegionSection tmp = *rdl->section;
340 
341         if (!virtio_mem_intersect_memory_section(&tmp, offset, size)) {
342             continue;
343         }
344         rdl->notify_discard(rdl, &tmp);
345     }
346 }
347 
348 static int virtio_mem_notify_plug(VirtIOMEM *vmem, uint64_t offset,
349                                   uint64_t size)
350 {
351     RamDiscardListener *rdl, *rdl2;
352     int ret = 0;
353 
354     QLIST_FOREACH(rdl, &vmem->rdl_list, next) {
355         MemoryRegionSection tmp = *rdl->section;
356 
357         if (!virtio_mem_intersect_memory_section(&tmp, offset, size)) {
358             continue;
359         }
360         ret = rdl->notify_populate(rdl, &tmp);
361         if (ret) {
362             break;
363         }
364     }
365 
366     if (ret) {
367         /* Notify all already-notified listeners. */
368         QLIST_FOREACH(rdl2, &vmem->rdl_list, next) {
369             MemoryRegionSection tmp = *rdl2->section;
370 
371             if (rdl2 == rdl) {
372                 break;
373             }
374             if (!virtio_mem_intersect_memory_section(&tmp, offset, size)) {
375                 continue;
376             }
377             rdl2->notify_discard(rdl2, &tmp);
378         }
379     }
380     return ret;
381 }
382 
383 static void virtio_mem_notify_unplug_all(VirtIOMEM *vmem)
384 {
385     RamDiscardListener *rdl;
386 
387     if (!vmem->size) {
388         return;
389     }
390 
391     QLIST_FOREACH(rdl, &vmem->rdl_list, next) {
392         if (rdl->double_discard_supported) {
393             rdl->notify_discard(rdl, rdl->section);
394         } else {
395             virtio_mem_for_each_plugged_section(vmem, rdl->section, rdl,
396                                                 virtio_mem_notify_discard_cb);
397         }
398     }
399 }
400 
401 static bool virtio_mem_is_range_plugged(const VirtIOMEM *vmem,
402                                         uint64_t start_gpa, uint64_t size)
403 {
404     const unsigned long first_bit = (start_gpa - vmem->addr) / vmem->block_size;
405     const unsigned long last_bit = first_bit + (size / vmem->block_size) - 1;
406     unsigned long found_bit;
407 
408     /* We fake a shorter bitmap to avoid searching too far. */
409     found_bit = find_next_zero_bit(vmem->bitmap, last_bit + 1, first_bit);
410     return found_bit > last_bit;
411 }
412 
413 static bool virtio_mem_is_range_unplugged(const VirtIOMEM *vmem,
414                                           uint64_t start_gpa, uint64_t size)
415 {
416     const unsigned long first_bit = (start_gpa - vmem->addr) / vmem->block_size;
417     const unsigned long last_bit = first_bit + (size / vmem->block_size) - 1;
418     unsigned long found_bit;
419 
420     /* We fake a shorter bitmap to avoid searching too far. */
421     found_bit = find_next_bit(vmem->bitmap, last_bit + 1, first_bit);
422     return found_bit > last_bit;
423 }
424 
425 static void virtio_mem_set_range_plugged(VirtIOMEM *vmem, uint64_t start_gpa,
426                                          uint64_t size)
427 {
428     const unsigned long bit = (start_gpa - vmem->addr) / vmem->block_size;
429     const unsigned long nbits = size / vmem->block_size;
430 
431     bitmap_set(vmem->bitmap, bit, nbits);
432 }
433 
434 static void virtio_mem_set_range_unplugged(VirtIOMEM *vmem, uint64_t start_gpa,
435                                            uint64_t size)
436 {
437     const unsigned long bit = (start_gpa - vmem->addr) / vmem->block_size;
438     const unsigned long nbits = size / vmem->block_size;
439 
440     bitmap_clear(vmem->bitmap, bit, nbits);
441 }
442 
443 static void virtio_mem_send_response(VirtIOMEM *vmem, VirtQueueElement *elem,
444                                      struct virtio_mem_resp *resp)
445 {
446     VirtIODevice *vdev = VIRTIO_DEVICE(vmem);
447     VirtQueue *vq = vmem->vq;
448 
449     trace_virtio_mem_send_response(le16_to_cpu(resp->type));
450     iov_from_buf(elem->in_sg, elem->in_num, 0, resp, sizeof(*resp));
451 
452     virtqueue_push(vq, elem, sizeof(*resp));
453     virtio_notify(vdev, vq);
454 }
455 
456 static void virtio_mem_send_response_simple(VirtIOMEM *vmem,
457                                             VirtQueueElement *elem,
458                                             uint16_t type)
459 {
460     struct virtio_mem_resp resp = {
461         .type = cpu_to_le16(type),
462     };
463 
464     virtio_mem_send_response(vmem, elem, &resp);
465 }
466 
467 static bool virtio_mem_valid_range(const VirtIOMEM *vmem, uint64_t gpa,
468                                    uint64_t size)
469 {
470     if (!QEMU_IS_ALIGNED(gpa, vmem->block_size)) {
471         return false;
472     }
473     if (gpa + size < gpa || !size) {
474         return false;
475     }
476     if (gpa < vmem->addr || gpa >= vmem->addr + vmem->usable_region_size) {
477         return false;
478     }
479     if (gpa + size > vmem->addr + vmem->usable_region_size) {
480         return false;
481     }
482     return true;
483 }
484 
485 static int virtio_mem_set_block_state(VirtIOMEM *vmem, uint64_t start_gpa,
486                                       uint64_t size, bool plug)
487 {
488     const uint64_t offset = start_gpa - vmem->addr;
489     RAMBlock *rb = vmem->memdev->mr.ram_block;
490     int ret = 0;
491 
492     if (virtio_mem_is_busy()) {
493         return -EBUSY;
494     }
495 
496     if (!plug) {
497         if (ram_block_discard_range(rb, offset, size)) {
498             return -EBUSY;
499         }
500         virtio_mem_notify_unplug(vmem, offset, size);
501         virtio_mem_set_range_unplugged(vmem, start_gpa, size);
502         return 0;
503     }
504 
505     if (vmem->prealloc) {
506         void *area = memory_region_get_ram_ptr(&vmem->memdev->mr) + offset;
507         int fd = memory_region_get_fd(&vmem->memdev->mr);
508         Error *local_err = NULL;
509 
510         qemu_prealloc_mem(fd, area, size, 1, NULL, &local_err);
511         if (local_err) {
512             static bool warned;
513 
514             /*
515              * Warn only once, we don't want to fill the log with these
516              * warnings.
517              */
518             if (!warned) {
519                 warn_report_err(local_err);
520                 warned = true;
521             } else {
522                 error_free(local_err);
523             }
524             ret = -EBUSY;
525         }
526     }
527 
528     if (!ret) {
529         ret = virtio_mem_notify_plug(vmem, offset, size);
530     }
531     if (ret) {
532         /* Could be preallocation or a notifier populated memory. */
533         ram_block_discard_range(vmem->memdev->mr.ram_block, offset, size);
534         return -EBUSY;
535     }
536 
537     virtio_mem_set_range_plugged(vmem, start_gpa, size);
538     return 0;
539 }
540 
541 static int virtio_mem_state_change_request(VirtIOMEM *vmem, uint64_t gpa,
542                                            uint16_t nb_blocks, bool plug)
543 {
544     const uint64_t size = nb_blocks * vmem->block_size;
545     int ret;
546 
547     if (!virtio_mem_valid_range(vmem, gpa, size)) {
548         return VIRTIO_MEM_RESP_ERROR;
549     }
550 
551     if (plug && (vmem->size + size > vmem->requested_size)) {
552         return VIRTIO_MEM_RESP_NACK;
553     }
554 
555     /* test if really all blocks are in the opposite state */
556     if ((plug && !virtio_mem_is_range_unplugged(vmem, gpa, size)) ||
557         (!plug && !virtio_mem_is_range_plugged(vmem, gpa, size))) {
558         return VIRTIO_MEM_RESP_ERROR;
559     }
560 
561     ret = virtio_mem_set_block_state(vmem, gpa, size, plug);
562     if (ret) {
563         return VIRTIO_MEM_RESP_BUSY;
564     }
565     if (plug) {
566         vmem->size += size;
567     } else {
568         vmem->size -= size;
569     }
570     notifier_list_notify(&vmem->size_change_notifiers, &vmem->size);
571     return VIRTIO_MEM_RESP_ACK;
572 }
573 
574 static void virtio_mem_plug_request(VirtIOMEM *vmem, VirtQueueElement *elem,
575                                     struct virtio_mem_req *req)
576 {
577     const uint64_t gpa = le64_to_cpu(req->u.plug.addr);
578     const uint16_t nb_blocks = le16_to_cpu(req->u.plug.nb_blocks);
579     uint16_t type;
580 
581     trace_virtio_mem_plug_request(gpa, nb_blocks);
582     type = virtio_mem_state_change_request(vmem, gpa, nb_blocks, true);
583     virtio_mem_send_response_simple(vmem, elem, type);
584 }
585 
586 static void virtio_mem_unplug_request(VirtIOMEM *vmem, VirtQueueElement *elem,
587                                       struct virtio_mem_req *req)
588 {
589     const uint64_t gpa = le64_to_cpu(req->u.unplug.addr);
590     const uint16_t nb_blocks = le16_to_cpu(req->u.unplug.nb_blocks);
591     uint16_t type;
592 
593     trace_virtio_mem_unplug_request(gpa, nb_blocks);
594     type = virtio_mem_state_change_request(vmem, gpa, nb_blocks, false);
595     virtio_mem_send_response_simple(vmem, elem, type);
596 }
597 
598 static void virtio_mem_resize_usable_region(VirtIOMEM *vmem,
599                                             uint64_t requested_size,
600                                             bool can_shrink)
601 {
602     uint64_t newsize = MIN(memory_region_size(&vmem->memdev->mr),
603                            requested_size + VIRTIO_MEM_USABLE_EXTENT);
604 
605     /* The usable region size always has to be multiples of the block size. */
606     newsize = QEMU_ALIGN_UP(newsize, vmem->block_size);
607 
608     if (!requested_size) {
609         newsize = 0;
610     }
611 
612     if (newsize < vmem->usable_region_size && !can_shrink) {
613         return;
614     }
615 
616     trace_virtio_mem_resized_usable_region(vmem->usable_region_size, newsize);
617     vmem->usable_region_size = newsize;
618 }
619 
620 static int virtio_mem_unplug_all(VirtIOMEM *vmem)
621 {
622     RAMBlock *rb = vmem->memdev->mr.ram_block;
623 
624     if (virtio_mem_is_busy()) {
625         return -EBUSY;
626     }
627 
628     if (ram_block_discard_range(rb, 0, qemu_ram_get_used_length(rb))) {
629         return -EBUSY;
630     }
631     virtio_mem_notify_unplug_all(vmem);
632 
633     bitmap_clear(vmem->bitmap, 0, vmem->bitmap_size);
634     if (vmem->size) {
635         vmem->size = 0;
636         notifier_list_notify(&vmem->size_change_notifiers, &vmem->size);
637     }
638     trace_virtio_mem_unplugged_all();
639     virtio_mem_resize_usable_region(vmem, vmem->requested_size, true);
640     return 0;
641 }
642 
643 static void virtio_mem_unplug_all_request(VirtIOMEM *vmem,
644                                           VirtQueueElement *elem)
645 {
646     trace_virtio_mem_unplug_all_request();
647     if (virtio_mem_unplug_all(vmem)) {
648         virtio_mem_send_response_simple(vmem, elem, VIRTIO_MEM_RESP_BUSY);
649     } else {
650         virtio_mem_send_response_simple(vmem, elem, VIRTIO_MEM_RESP_ACK);
651     }
652 }
653 
654 static void virtio_mem_state_request(VirtIOMEM *vmem, VirtQueueElement *elem,
655                                      struct virtio_mem_req *req)
656 {
657     const uint16_t nb_blocks = le16_to_cpu(req->u.state.nb_blocks);
658     const uint64_t gpa = le64_to_cpu(req->u.state.addr);
659     const uint64_t size = nb_blocks * vmem->block_size;
660     struct virtio_mem_resp resp = {
661         .type = cpu_to_le16(VIRTIO_MEM_RESP_ACK),
662     };
663 
664     trace_virtio_mem_state_request(gpa, nb_blocks);
665     if (!virtio_mem_valid_range(vmem, gpa, size)) {
666         virtio_mem_send_response_simple(vmem, elem, VIRTIO_MEM_RESP_ERROR);
667         return;
668     }
669 
670     if (virtio_mem_is_range_plugged(vmem, gpa, size)) {
671         resp.u.state.state = cpu_to_le16(VIRTIO_MEM_STATE_PLUGGED);
672     } else if (virtio_mem_is_range_unplugged(vmem, gpa, size)) {
673         resp.u.state.state = cpu_to_le16(VIRTIO_MEM_STATE_UNPLUGGED);
674     } else {
675         resp.u.state.state = cpu_to_le16(VIRTIO_MEM_STATE_MIXED);
676     }
677     trace_virtio_mem_state_response(le16_to_cpu(resp.u.state.state));
678     virtio_mem_send_response(vmem, elem, &resp);
679 }
680 
681 static void virtio_mem_handle_request(VirtIODevice *vdev, VirtQueue *vq)
682 {
683     const int len = sizeof(struct virtio_mem_req);
684     VirtIOMEM *vmem = VIRTIO_MEM(vdev);
685     VirtQueueElement *elem;
686     struct virtio_mem_req req;
687     uint16_t type;
688 
689     while (true) {
690         elem = virtqueue_pop(vq, sizeof(VirtQueueElement));
691         if (!elem) {
692             return;
693         }
694 
695         if (iov_to_buf(elem->out_sg, elem->out_num, 0, &req, len) < len) {
696             virtio_error(vdev, "virtio-mem protocol violation: invalid request"
697                          " size: %d", len);
698             virtqueue_detach_element(vq, elem, 0);
699             g_free(elem);
700             return;
701         }
702 
703         if (iov_size(elem->in_sg, elem->in_num) <
704             sizeof(struct virtio_mem_resp)) {
705             virtio_error(vdev, "virtio-mem protocol violation: not enough space"
706                          " for response: %zu",
707                          iov_size(elem->in_sg, elem->in_num));
708             virtqueue_detach_element(vq, elem, 0);
709             g_free(elem);
710             return;
711         }
712 
713         type = le16_to_cpu(req.type);
714         switch (type) {
715         case VIRTIO_MEM_REQ_PLUG:
716             virtio_mem_plug_request(vmem, elem, &req);
717             break;
718         case VIRTIO_MEM_REQ_UNPLUG:
719             virtio_mem_unplug_request(vmem, elem, &req);
720             break;
721         case VIRTIO_MEM_REQ_UNPLUG_ALL:
722             virtio_mem_unplug_all_request(vmem, elem);
723             break;
724         case VIRTIO_MEM_REQ_STATE:
725             virtio_mem_state_request(vmem, elem, &req);
726             break;
727         default:
728             virtio_error(vdev, "virtio-mem protocol violation: unknown request"
729                          " type: %d", type);
730             virtqueue_detach_element(vq, elem, 0);
731             g_free(elem);
732             return;
733         }
734 
735         g_free(elem);
736     }
737 }
738 
739 static void virtio_mem_get_config(VirtIODevice *vdev, uint8_t *config_data)
740 {
741     VirtIOMEM *vmem = VIRTIO_MEM(vdev);
742     struct virtio_mem_config *config = (void *) config_data;
743 
744     config->block_size = cpu_to_le64(vmem->block_size);
745     config->node_id = cpu_to_le16(vmem->node);
746     config->requested_size = cpu_to_le64(vmem->requested_size);
747     config->plugged_size = cpu_to_le64(vmem->size);
748     config->addr = cpu_to_le64(vmem->addr);
749     config->region_size = cpu_to_le64(memory_region_size(&vmem->memdev->mr));
750     config->usable_region_size = cpu_to_le64(vmem->usable_region_size);
751 }
752 
753 static uint64_t virtio_mem_get_features(VirtIODevice *vdev, uint64_t features,
754                                         Error **errp)
755 {
756     MachineState *ms = MACHINE(qdev_get_machine());
757     VirtIOMEM *vmem = VIRTIO_MEM(vdev);
758 
759     if (ms->numa_state) {
760 #if defined(CONFIG_ACPI)
761         virtio_add_feature(&features, VIRTIO_MEM_F_ACPI_PXM);
762 #endif
763     }
764     assert(vmem->unplugged_inaccessible != ON_OFF_AUTO_AUTO);
765     if (vmem->unplugged_inaccessible == ON_OFF_AUTO_ON) {
766         virtio_add_feature(&features, VIRTIO_MEM_F_UNPLUGGED_INACCESSIBLE);
767     }
768     return features;
769 }
770 
771 static int virtio_mem_validate_features(VirtIODevice *vdev)
772 {
773     if (virtio_host_has_feature(vdev, VIRTIO_MEM_F_UNPLUGGED_INACCESSIBLE) &&
774         !virtio_vdev_has_feature(vdev, VIRTIO_MEM_F_UNPLUGGED_INACCESSIBLE)) {
775         return -EFAULT;
776     }
777     return 0;
778 }
779 
/*
 * System reset handler; @opaque is the VirtIOMEM device.
 *
 * On a usual reset we unplug all memory and shrink the usable region size.
 * That is not possible in every scenario, so a failure is deliberately
 * ignored here; the guest then has to clean up on its own
 * (VIRTIO_MEM_REQ_UNPLUG_ALL).
 */
static void virtio_mem_system_reset(void *opaque)
{
    virtio_mem_unplug_all(VIRTIO_MEM(opaque));
}
791 
792 static void virtio_mem_device_realize(DeviceState *dev, Error **errp)
793 {
794     MachineState *ms = MACHINE(qdev_get_machine());
795     int nb_numa_nodes = ms->numa_state ? ms->numa_state->num_nodes : 0;
796     VirtIODevice *vdev = VIRTIO_DEVICE(dev);
797     VirtIOMEM *vmem = VIRTIO_MEM(dev);
798     uint64_t page_size;
799     RAMBlock *rb;
800     int ret;
801 
802     if (!vmem->memdev) {
803         error_setg(errp, "'%s' property is not set", VIRTIO_MEM_MEMDEV_PROP);
804         return;
805     } else if (host_memory_backend_is_mapped(vmem->memdev)) {
806         error_setg(errp, "'%s' property specifies a busy memdev: %s",
807                    VIRTIO_MEM_MEMDEV_PROP,
808                    object_get_canonical_path_component(OBJECT(vmem->memdev)));
809         return;
810     } else if (!memory_region_is_ram(&vmem->memdev->mr) ||
811         memory_region_is_rom(&vmem->memdev->mr) ||
812         !vmem->memdev->mr.ram_block) {
813         error_setg(errp, "'%s' property specifies an unsupported memdev",
814                    VIRTIO_MEM_MEMDEV_PROP);
815         return;
816     } else if (vmem->memdev->prealloc) {
817         error_setg(errp, "'%s' property specifies a memdev with preallocation"
818                    " enabled: %s. Instead, specify 'prealloc=on' for the"
819                    " virtio-mem device. ", VIRTIO_MEM_MEMDEV_PROP,
820                    object_get_canonical_path_component(OBJECT(vmem->memdev)));
821         return;
822     }
823 
824     if ((nb_numa_nodes && vmem->node >= nb_numa_nodes) ||
825         (!nb_numa_nodes && vmem->node)) {
826         error_setg(errp, "'%s' property has value '%" PRIu32 "', which exceeds"
827                    "the number of numa nodes: %d", VIRTIO_MEM_NODE_PROP,
828                    vmem->node, nb_numa_nodes ? nb_numa_nodes : 1);
829         return;
830     }
831 
832     if (enable_mlock) {
833         error_setg(errp, "Incompatible with mlock");
834         return;
835     }
836 
837     rb = vmem->memdev->mr.ram_block;
838     page_size = qemu_ram_pagesize(rb);
839 
840 #if defined(VIRTIO_MEM_HAS_LEGACY_GUESTS)
841     switch (vmem->unplugged_inaccessible) {
842     case ON_OFF_AUTO_AUTO:
843         if (virtio_mem_has_shared_zeropage(rb)) {
844             vmem->unplugged_inaccessible = ON_OFF_AUTO_OFF;
845         } else {
846             vmem->unplugged_inaccessible = ON_OFF_AUTO_ON;
847         }
848         break;
849     case ON_OFF_AUTO_OFF:
850         if (!virtio_mem_has_shared_zeropage(rb)) {
851             warn_report("'%s' property set to 'off' with a memdev that does"
852                         " not support the shared zeropage.",
853                         VIRTIO_MEM_UNPLUGGED_INACCESSIBLE_PROP);
854         }
855         break;
856     default:
857         break;
858     }
859 #else /* VIRTIO_MEM_HAS_LEGACY_GUESTS */
860     vmem->unplugged_inaccessible = ON_OFF_AUTO_ON;
861 #endif /* VIRTIO_MEM_HAS_LEGACY_GUESTS */
862 
863     /*
864      * If the block size wasn't configured by the user, use a sane default. This
865      * allows using hugetlbfs backends of any page size without manual
866      * intervention.
867      */
868     if (!vmem->block_size) {
869         vmem->block_size = virtio_mem_default_block_size(rb);
870     }
871 
872     if (vmem->block_size < page_size) {
873         error_setg(errp, "'%s' property has to be at least the page size (0x%"
874                    PRIx64 ")", VIRTIO_MEM_BLOCK_SIZE_PROP, page_size);
875         return;
876     } else if (vmem->block_size < virtio_mem_default_block_size(rb)) {
877         warn_report("'%s' property is smaller than the default block size (%"
878                     PRIx64 " MiB)", VIRTIO_MEM_BLOCK_SIZE_PROP,
879                     virtio_mem_default_block_size(rb) / MiB);
880     }
881     if (!QEMU_IS_ALIGNED(vmem->requested_size, vmem->block_size)) {
882         error_setg(errp, "'%s' property has to be multiples of '%s' (0x%" PRIx64
883                    ")", VIRTIO_MEM_REQUESTED_SIZE_PROP,
884                    VIRTIO_MEM_BLOCK_SIZE_PROP, vmem->block_size);
885         return;
886     } else if (!QEMU_IS_ALIGNED(vmem->addr, vmem->block_size)) {
887         error_setg(errp, "'%s' property has to be multiples of '%s' (0x%" PRIx64
888                    ")", VIRTIO_MEM_ADDR_PROP, VIRTIO_MEM_BLOCK_SIZE_PROP,
889                    vmem->block_size);
890         return;
891     } else if (!QEMU_IS_ALIGNED(memory_region_size(&vmem->memdev->mr),
892                                 vmem->block_size)) {
893         error_setg(errp, "'%s' property memdev size has to be multiples of"
894                    "'%s' (0x%" PRIx64 ")", VIRTIO_MEM_MEMDEV_PROP,
895                    VIRTIO_MEM_BLOCK_SIZE_PROP, vmem->block_size);
896         return;
897     }
898 
899     if (ram_block_coordinated_discard_require(true)) {
900         error_setg(errp, "Discarding RAM is disabled");
901         return;
902     }
903 
904     ret = ram_block_discard_range(rb, 0, qemu_ram_get_used_length(rb));
905     if (ret) {
906         error_setg_errno(errp, -ret, "Unexpected error discarding RAM");
907         ram_block_coordinated_discard_require(false);
908         return;
909     }
910 
911     virtio_mem_resize_usable_region(vmem, vmem->requested_size, true);
912 
913     vmem->bitmap_size = memory_region_size(&vmem->memdev->mr) /
914                         vmem->block_size;
915     vmem->bitmap = bitmap_new(vmem->bitmap_size);
916 
917     virtio_init(vdev, VIRTIO_ID_MEM, sizeof(struct virtio_mem_config));
918     vmem->vq = virtio_add_queue(vdev, 128, virtio_mem_handle_request);
919 
920     host_memory_backend_set_mapped(vmem->memdev, true);
921     vmstate_register_ram(&vmem->memdev->mr, DEVICE(vmem));
922     if (vmem->early_migration) {
923         vmstate_register(VMSTATE_IF(vmem), VMSTATE_INSTANCE_ID_ANY,
924                          &vmstate_virtio_mem_device_early, vmem);
925     }
926     qemu_register_reset(virtio_mem_system_reset, vmem);
927 
928     /*
929      * Set ourselves as RamDiscardManager before the plug handler maps the
930      * memory region and exposes it via an address space.
931      */
932     memory_region_set_ram_discard_manager(&vmem->memdev->mr,
933                                           RAM_DISCARD_MANAGER(vmem));
934 }
935 
936 static void virtio_mem_device_unrealize(DeviceState *dev)
937 {
938     VirtIODevice *vdev = VIRTIO_DEVICE(dev);
939     VirtIOMEM *vmem = VIRTIO_MEM(dev);
940 
941     /*
942      * The unplug handler unmapped the memory region, it cannot be
943      * found via an address space anymore. Unset ourselves.
944      */
945     memory_region_set_ram_discard_manager(&vmem->memdev->mr, NULL);
946     qemu_unregister_reset(virtio_mem_system_reset, vmem);
947     if (vmem->early_migration) {
948         vmstate_unregister(VMSTATE_IF(vmem), &vmstate_virtio_mem_device_early,
949                            vmem);
950     }
951     vmstate_unregister_ram(&vmem->memdev->mr, DEVICE(vmem));
952     host_memory_backend_set_mapped(vmem->memdev, false);
953     virtio_del_queue(vdev, 0);
954     virtio_cleanup(vdev);
955     g_free(vmem->bitmap);
956     ram_block_coordinated_discard_require(false);
957 }
958 
959 static int virtio_mem_discard_range_cb(const VirtIOMEM *vmem, void *arg,
960                                        uint64_t offset, uint64_t size)
961 {
962     RAMBlock *rb = vmem->memdev->mr.ram_block;
963 
964     return ram_block_discard_range(rb, offset, size) ? -EINVAL : 0;
965 }
966 
967 static int virtio_mem_restore_unplugged(VirtIOMEM *vmem)
968 {
969     /* Make sure all memory is really discarded after migration. */
970     return virtio_mem_for_each_unplugged_range(vmem, NULL,
971                                                virtio_mem_discard_range_cb);
972 }
973 
974 static int virtio_mem_post_load(void *opaque, int version_id)
975 {
976     VirtIOMEM *vmem = VIRTIO_MEM(opaque);
977     RamDiscardListener *rdl;
978     int ret;
979 
980     if (vmem->prealloc && !vmem->early_migration) {
981         warn_report("Proper preallocation with migration requires a newer QEMU machine");
982     }
983 
984     /*
985      * We started out with all memory discarded and our memory region is mapped
986      * into an address space. Replay, now that we updated the bitmap.
987      */
988     QLIST_FOREACH(rdl, &vmem->rdl_list, next) {
989         ret = virtio_mem_for_each_plugged_section(vmem, rdl->section, rdl,
990                                                  virtio_mem_notify_populate_cb);
991         if (ret) {
992             return ret;
993         }
994     }
995 
996     if (migration_in_incoming_postcopy()) {
997         return 0;
998     }
999 
1000     return virtio_mem_restore_unplugged(vmem);
1001 }
1002 
1003 static int virtio_mem_prealloc_range_cb(const VirtIOMEM *vmem, void *arg,
1004                                         uint64_t offset, uint64_t size)
1005 {
1006     void *area = memory_region_get_ram_ptr(&vmem->memdev->mr) + offset;
1007     int fd = memory_region_get_fd(&vmem->memdev->mr);
1008     Error *local_err = NULL;
1009 
1010     qemu_prealloc_mem(fd, area, size, 1, NULL, &local_err);
1011     if (local_err) {
1012         error_report_err(local_err);
1013         return -ENOMEM;
1014     }
1015     return 0;
1016 }
1017 
1018 static int virtio_mem_post_load_early(void *opaque, int version_id)
1019 {
1020     VirtIOMEM *vmem = VIRTIO_MEM(opaque);
1021     RAMBlock *rb = vmem->memdev->mr.ram_block;
1022     int ret;
1023 
1024     if (!vmem->prealloc) {
1025         return 0;
1026     }
1027 
1028     /*
1029      * We restored the bitmap and verified that the basic properties
1030      * match on source and destination, so we can go ahead and preallocate
1031      * memory for all plugged memory blocks, before actual RAM migration starts
1032      * touching this memory.
1033      */
1034     ret = virtio_mem_for_each_plugged_range(vmem, NULL,
1035                                             virtio_mem_prealloc_range_cb);
1036     if (ret) {
1037         return ret;
1038     }
1039 
1040     /*
1041      * This is tricky: postcopy wants to start with a clean slate. On
1042      * POSTCOPY_INCOMING_ADVISE, postcopy code discards all (ordinarily
1043      * preallocated) RAM such that postcopy will work as expected later.
1044      *
1045      * However, we run after POSTCOPY_INCOMING_ADVISE -- but before actual
1046      * RAM migration. So let's discard all memory again. This looks like an
1047      * expensive NOP, but actually serves a purpose: we made sure that we
1048      * were able to allocate all required backend memory once. We cannot
1049      * guarantee that the backend memory we will free will remain free
1050      * until we need it during postcopy, but at least we can catch the
1051      * obvious setup issues this way.
1052      */
1053     if (migration_incoming_postcopy_advised()) {
1054         if (ram_block_discard_range(rb, 0, qemu_ram_get_used_length(rb))) {
1055             return -EBUSY;
1056         }
1057     }
1058     return 0;
1059 }
1060 
1061 typedef struct VirtIOMEMMigSanityChecks {
1062     VirtIOMEM *parent;
1063     uint64_t addr;
1064     uint64_t region_size;
1065     uint64_t block_size;
1066     uint32_t node;
1067 } VirtIOMEMMigSanityChecks;
1068 
1069 static int virtio_mem_mig_sanity_checks_pre_save(void *opaque)
1070 {
1071     VirtIOMEMMigSanityChecks *tmp = opaque;
1072     VirtIOMEM *vmem = tmp->parent;
1073 
1074     tmp->addr = vmem->addr;
1075     tmp->region_size = memory_region_size(&vmem->memdev->mr);
1076     tmp->block_size = vmem->block_size;
1077     tmp->node = vmem->node;
1078     return 0;
1079 }
1080 
1081 static int virtio_mem_mig_sanity_checks_post_load(void *opaque, int version_id)
1082 {
1083     VirtIOMEMMigSanityChecks *tmp = opaque;
1084     VirtIOMEM *vmem = tmp->parent;
1085     const uint64_t new_region_size = memory_region_size(&vmem->memdev->mr);
1086 
1087     if (tmp->addr != vmem->addr) {
1088         error_report("Property '%s' changed from 0x%" PRIx64 " to 0x%" PRIx64,
1089                      VIRTIO_MEM_ADDR_PROP, tmp->addr, vmem->addr);
1090         return -EINVAL;
1091     }
1092     /*
1093      * Note: Preparation for resizeable memory regions. The maximum size
1094      * of the memory region must not change during migration.
1095      */
1096     if (tmp->region_size != new_region_size) {
1097         error_report("Property '%s' size changed from 0x%" PRIx64 " to 0x%"
1098                      PRIx64, VIRTIO_MEM_MEMDEV_PROP, tmp->region_size,
1099                      new_region_size);
1100         return -EINVAL;
1101     }
1102     if (tmp->block_size != vmem->block_size) {
1103         error_report("Property '%s' changed from 0x%" PRIx64 " to 0x%" PRIx64,
1104                      VIRTIO_MEM_BLOCK_SIZE_PROP, tmp->block_size,
1105                      vmem->block_size);
1106         return -EINVAL;
1107     }
1108     if (tmp->node != vmem->node) {
1109         error_report("Property '%s' changed from %" PRIu32 " to %" PRIu32,
1110                      VIRTIO_MEM_NODE_PROP, tmp->node, vmem->node);
1111         return -EINVAL;
1112     }
1113     return 0;
1114 }
1115 
1116 static const VMStateDescription vmstate_virtio_mem_sanity_checks = {
1117     .name = "virtio-mem-device/sanity-checks",
1118     .pre_save = virtio_mem_mig_sanity_checks_pre_save,
1119     .post_load = virtio_mem_mig_sanity_checks_post_load,
1120     .fields = (VMStateField[]) {
1121         VMSTATE_UINT64(addr, VirtIOMEMMigSanityChecks),
1122         VMSTATE_UINT64(region_size, VirtIOMEMMigSanityChecks),
1123         VMSTATE_UINT64(block_size, VirtIOMEMMigSanityChecks),
1124         VMSTATE_UINT32(node, VirtIOMEMMigSanityChecks),
1125         VMSTATE_END_OF_LIST(),
1126     },
1127 };
1128 
1129 static bool virtio_mem_vmstate_field_exists(void *opaque, int version_id)
1130 {
1131     const VirtIOMEM *vmem = VIRTIO_MEM(opaque);
1132 
1133     /* With early migration, these fields were already migrated. */
1134     return !vmem->early_migration;
1135 }
1136 
1137 static const VMStateDescription vmstate_virtio_mem_device = {
1138     .name = "virtio-mem-device",
1139     .minimum_version_id = 1,
1140     .version_id = 1,
1141     .priority = MIG_PRI_VIRTIO_MEM,
1142     .post_load = virtio_mem_post_load,
1143     .fields = (VMStateField[]) {
1144         VMSTATE_WITH_TMP_TEST(VirtIOMEM, virtio_mem_vmstate_field_exists,
1145                               VirtIOMEMMigSanityChecks,
1146                               vmstate_virtio_mem_sanity_checks),
1147         VMSTATE_UINT64(usable_region_size, VirtIOMEM),
1148         VMSTATE_UINT64_TEST(size, VirtIOMEM, virtio_mem_vmstate_field_exists),
1149         VMSTATE_UINT64(requested_size, VirtIOMEM),
1150         VMSTATE_BITMAP_TEST(bitmap, VirtIOMEM, virtio_mem_vmstate_field_exists,
1151                             0, bitmap_size),
1152         VMSTATE_END_OF_LIST()
1153     },
1154 };
1155 
1156 /*
1157  * Transfer properties that are immutable while migration is active early,
1158  * such that we have have this information around before migrating any RAM
1159  * content.
1160  *
1161  * Note that virtio_mem_is_busy() makes sure these properties can no longer
1162  * change on the migration source until migration completed.
1163  *
1164  * With QEMU compat machines, we transmit these properties later, via
1165  * vmstate_virtio_mem_device instead -- see virtio_mem_vmstate_field_exists().
1166  */
1167 static const VMStateDescription vmstate_virtio_mem_device_early = {
1168     .name = "virtio-mem-device-early",
1169     .minimum_version_id = 1,
1170     .version_id = 1,
1171     .early_setup = true,
1172     .post_load = virtio_mem_post_load_early,
1173     .fields = (VMStateField[]) {
1174         VMSTATE_WITH_TMP(VirtIOMEM, VirtIOMEMMigSanityChecks,
1175                          vmstate_virtio_mem_sanity_checks),
1176         VMSTATE_UINT64(size, VirtIOMEM),
1177         VMSTATE_BITMAP(bitmap, VirtIOMEM, 0, bitmap_size),
1178         VMSTATE_END_OF_LIST()
1179     },
1180 };
1181 
1182 static const VMStateDescription vmstate_virtio_mem = {
1183     .name = "virtio-mem",
1184     .minimum_version_id = 1,
1185     .version_id = 1,
1186     .fields = (VMStateField[]) {
1187         VMSTATE_VIRTIO_DEVICE,
1188         VMSTATE_END_OF_LIST()
1189     },
1190 };
1191 
1192 static void virtio_mem_fill_device_info(const VirtIOMEM *vmem,
1193                                         VirtioMEMDeviceInfo *vi)
1194 {
1195     vi->memaddr = vmem->addr;
1196     vi->node = vmem->node;
1197     vi->requested_size = vmem->requested_size;
1198     vi->size = vmem->size;
1199     vi->max_size = memory_region_size(&vmem->memdev->mr);
1200     vi->block_size = vmem->block_size;
1201     vi->memdev = object_get_canonical_path(OBJECT(vmem->memdev));
1202 }
1203 
1204 static MemoryRegion *virtio_mem_get_memory_region(VirtIOMEM *vmem, Error **errp)
1205 {
1206     if (!vmem->memdev) {
1207         error_setg(errp, "'%s' property must be set", VIRTIO_MEM_MEMDEV_PROP);
1208         return NULL;
1209     }
1210 
1211     return &vmem->memdev->mr;
1212 }
1213 
1214 static void virtio_mem_add_size_change_notifier(VirtIOMEM *vmem,
1215                                                 Notifier *notifier)
1216 {
1217     notifier_list_add(&vmem->size_change_notifiers, notifier);
1218 }
1219 
1220 static void virtio_mem_remove_size_change_notifier(VirtIOMEM *vmem,
1221                                                    Notifier *notifier)
1222 {
1223     notifier_remove(notifier);
1224 }
1225 
1226 static void virtio_mem_get_size(Object *obj, Visitor *v, const char *name,
1227                                 void *opaque, Error **errp)
1228 {
1229     const VirtIOMEM *vmem = VIRTIO_MEM(obj);
1230     uint64_t value = vmem->size;
1231 
1232     visit_type_size(v, name, &value, errp);
1233 }
1234 
1235 static void virtio_mem_get_requested_size(Object *obj, Visitor *v,
1236                                           const char *name, void *opaque,
1237                                           Error **errp)
1238 {
1239     const VirtIOMEM *vmem = VIRTIO_MEM(obj);
1240     uint64_t value = vmem->requested_size;
1241 
1242     visit_type_size(v, name, &value, errp);
1243 }
1244 
1245 static void virtio_mem_set_requested_size(Object *obj, Visitor *v,
1246                                           const char *name, void *opaque,
1247                                           Error **errp)
1248 {
1249     VirtIOMEM *vmem = VIRTIO_MEM(obj);
1250     uint64_t value;
1251 
1252     if (!visit_type_size(v, name, &value, errp)) {
1253         return;
1254     }
1255 
1256     /*
1257      * The block size and memory backend are not fixed until the device was
1258      * realized. realize() will verify these properties then.
1259      */
1260     if (DEVICE(obj)->realized) {
1261         if (!QEMU_IS_ALIGNED(value, vmem->block_size)) {
1262             error_setg(errp, "'%s' has to be multiples of '%s' (0x%" PRIx64
1263                        ")", name, VIRTIO_MEM_BLOCK_SIZE_PROP,
1264                        vmem->block_size);
1265             return;
1266         } else if (value > memory_region_size(&vmem->memdev->mr)) {
1267             error_setg(errp, "'%s' cannot exceed the memory backend size"
1268                        "(0x%" PRIx64 ")", name,
1269                        memory_region_size(&vmem->memdev->mr));
1270             return;
1271         }
1272 
1273         if (value != vmem->requested_size) {
1274             virtio_mem_resize_usable_region(vmem, value, false);
1275             vmem->requested_size = value;
1276         }
1277         /*
1278          * Trigger a config update so the guest gets notified. We trigger
1279          * even if the size didn't change (especially helpful for debugging).
1280          */
1281         virtio_notify_config(VIRTIO_DEVICE(vmem));
1282     } else {
1283         vmem->requested_size = value;
1284     }
1285 }
1286 
1287 static void virtio_mem_get_block_size(Object *obj, Visitor *v, const char *name,
1288                                       void *opaque, Error **errp)
1289 {
1290     const VirtIOMEM *vmem = VIRTIO_MEM(obj);
1291     uint64_t value = vmem->block_size;
1292 
1293     /*
1294      * If not configured by the user (and we're not realized yet), use the
1295      * default block size we would use with the current memory backend.
1296      */
1297     if (!value) {
1298         if (vmem->memdev && memory_region_is_ram(&vmem->memdev->mr)) {
1299             value = virtio_mem_default_block_size(vmem->memdev->mr.ram_block);
1300         } else {
1301             value = virtio_mem_thp_size();
1302         }
1303     }
1304 
1305     visit_type_size(v, name, &value, errp);
1306 }
1307 
1308 static void virtio_mem_set_block_size(Object *obj, Visitor *v, const char *name,
1309                                       void *opaque, Error **errp)
1310 {
1311     VirtIOMEM *vmem = VIRTIO_MEM(obj);
1312     uint64_t value;
1313 
1314     if (DEVICE(obj)->realized) {
1315         error_setg(errp, "'%s' cannot be changed", name);
1316         return;
1317     }
1318 
1319     if (!visit_type_size(v, name, &value, errp)) {
1320         return;
1321     }
1322 
1323     if (value < VIRTIO_MEM_MIN_BLOCK_SIZE) {
1324         error_setg(errp, "'%s' property has to be at least 0x%" PRIx32, name,
1325                    VIRTIO_MEM_MIN_BLOCK_SIZE);
1326         return;
1327     } else if (!is_power_of_2(value)) {
1328         error_setg(errp, "'%s' property has to be a power of two", name);
1329         return;
1330     }
1331     vmem->block_size = value;
1332 }
1333 
1334 static void virtio_mem_instance_init(Object *obj)
1335 {
1336     VirtIOMEM *vmem = VIRTIO_MEM(obj);
1337 
1338     notifier_list_init(&vmem->size_change_notifiers);
1339     QLIST_INIT(&vmem->rdl_list);
1340 
1341     object_property_add(obj, VIRTIO_MEM_SIZE_PROP, "size", virtio_mem_get_size,
1342                         NULL, NULL, NULL);
1343     object_property_add(obj, VIRTIO_MEM_REQUESTED_SIZE_PROP, "size",
1344                         virtio_mem_get_requested_size,
1345                         virtio_mem_set_requested_size, NULL, NULL);
1346     object_property_add(obj, VIRTIO_MEM_BLOCK_SIZE_PROP, "size",
1347                         virtio_mem_get_block_size, virtio_mem_set_block_size,
1348                         NULL, NULL);
1349 }
1350 
1351 static Property virtio_mem_properties[] = {
1352     DEFINE_PROP_UINT64(VIRTIO_MEM_ADDR_PROP, VirtIOMEM, addr, 0),
1353     DEFINE_PROP_UINT32(VIRTIO_MEM_NODE_PROP, VirtIOMEM, node, 0),
1354     DEFINE_PROP_BOOL(VIRTIO_MEM_PREALLOC_PROP, VirtIOMEM, prealloc, false),
1355     DEFINE_PROP_LINK(VIRTIO_MEM_MEMDEV_PROP, VirtIOMEM, memdev,
1356                      TYPE_MEMORY_BACKEND, HostMemoryBackend *),
1357 #if defined(VIRTIO_MEM_HAS_LEGACY_GUESTS)
1358     DEFINE_PROP_ON_OFF_AUTO(VIRTIO_MEM_UNPLUGGED_INACCESSIBLE_PROP, VirtIOMEM,
1359                             unplugged_inaccessible, ON_OFF_AUTO_ON),
1360 #endif
1361     DEFINE_PROP_BOOL(VIRTIO_MEM_EARLY_MIGRATION_PROP, VirtIOMEM,
1362                      early_migration, true),
1363     DEFINE_PROP_END_OF_LIST(),
1364 };
1365 
1366 static uint64_t virtio_mem_rdm_get_min_granularity(const RamDiscardManager *rdm,
1367                                                    const MemoryRegion *mr)
1368 {
1369     const VirtIOMEM *vmem = VIRTIO_MEM(rdm);
1370 
1371     g_assert(mr == &vmem->memdev->mr);
1372     return vmem->block_size;
1373 }
1374 
1375 static bool virtio_mem_rdm_is_populated(const RamDiscardManager *rdm,
1376                                         const MemoryRegionSection *s)
1377 {
1378     const VirtIOMEM *vmem = VIRTIO_MEM(rdm);
1379     uint64_t start_gpa = vmem->addr + s->offset_within_region;
1380     uint64_t end_gpa = start_gpa + int128_get64(s->size);
1381 
1382     g_assert(s->mr == &vmem->memdev->mr);
1383 
1384     start_gpa = QEMU_ALIGN_DOWN(start_gpa, vmem->block_size);
1385     end_gpa = QEMU_ALIGN_UP(end_gpa, vmem->block_size);
1386 
1387     if (!virtio_mem_valid_range(vmem, start_gpa, end_gpa - start_gpa)) {
1388         return false;
1389     }
1390 
1391     return virtio_mem_is_range_plugged(vmem, start_gpa, end_gpa - start_gpa);
1392 }
1393 
/* Bundles a replay callback with its argument for the section iterators. */
struct VirtIOMEMReplayData {
    void *fn;       /* ReplayRamPopulate or ReplayRamDiscard, cast on use */
    void *opaque;   /* passed through to fn unchanged */
};
1398 
1399 static int virtio_mem_rdm_replay_populated_cb(MemoryRegionSection *s, void *arg)
1400 {
1401     struct VirtIOMEMReplayData *data = arg;
1402 
1403     return ((ReplayRamPopulate)data->fn)(s, data->opaque);
1404 }
1405 
1406 static int virtio_mem_rdm_replay_populated(const RamDiscardManager *rdm,
1407                                            MemoryRegionSection *s,
1408                                            ReplayRamPopulate replay_fn,
1409                                            void *opaque)
1410 {
1411     const VirtIOMEM *vmem = VIRTIO_MEM(rdm);
1412     struct VirtIOMEMReplayData data = {
1413         .fn = replay_fn,
1414         .opaque = opaque,
1415     };
1416 
1417     g_assert(s->mr == &vmem->memdev->mr);
1418     return virtio_mem_for_each_plugged_section(vmem, s, &data,
1419                                             virtio_mem_rdm_replay_populated_cb);
1420 }
1421 
1422 static int virtio_mem_rdm_replay_discarded_cb(MemoryRegionSection *s,
1423                                               void *arg)
1424 {
1425     struct VirtIOMEMReplayData *data = arg;
1426 
1427     ((ReplayRamDiscard)data->fn)(s, data->opaque);
1428     return 0;
1429 }
1430 
1431 static void virtio_mem_rdm_replay_discarded(const RamDiscardManager *rdm,
1432                                             MemoryRegionSection *s,
1433                                             ReplayRamDiscard replay_fn,
1434                                             void *opaque)
1435 {
1436     const VirtIOMEM *vmem = VIRTIO_MEM(rdm);
1437     struct VirtIOMEMReplayData data = {
1438         .fn = replay_fn,
1439         .opaque = opaque,
1440     };
1441 
1442     g_assert(s->mr == &vmem->memdev->mr);
1443     virtio_mem_for_each_unplugged_section(vmem, s, &data,
1444                                           virtio_mem_rdm_replay_discarded_cb);
1445 }
1446 
1447 static void virtio_mem_rdm_register_listener(RamDiscardManager *rdm,
1448                                              RamDiscardListener *rdl,
1449                                              MemoryRegionSection *s)
1450 {
1451     VirtIOMEM *vmem = VIRTIO_MEM(rdm);
1452     int ret;
1453 
1454     g_assert(s->mr == &vmem->memdev->mr);
1455     rdl->section = memory_region_section_new_copy(s);
1456 
1457     QLIST_INSERT_HEAD(&vmem->rdl_list, rdl, next);
1458     ret = virtio_mem_for_each_plugged_section(vmem, rdl->section, rdl,
1459                                               virtio_mem_notify_populate_cb);
1460     if (ret) {
1461         error_report("%s: Replaying plugged ranges failed: %s", __func__,
1462                      strerror(-ret));
1463     }
1464 }
1465 
1466 static void virtio_mem_rdm_unregister_listener(RamDiscardManager *rdm,
1467                                                RamDiscardListener *rdl)
1468 {
1469     VirtIOMEM *vmem = VIRTIO_MEM(rdm);
1470 
1471     g_assert(rdl->section->mr == &vmem->memdev->mr);
1472     if (vmem->size) {
1473         if (rdl->double_discard_supported) {
1474             rdl->notify_discard(rdl, rdl->section);
1475         } else {
1476             virtio_mem_for_each_plugged_section(vmem, rdl->section, rdl,
1477                                                 virtio_mem_notify_discard_cb);
1478         }
1479     }
1480 
1481     memory_region_section_free_copy(rdl->section);
1482     rdl->section = NULL;
1483     QLIST_REMOVE(rdl, next);
1484 }
1485 
1486 static void virtio_mem_class_init(ObjectClass *klass, void *data)
1487 {
1488     DeviceClass *dc = DEVICE_CLASS(klass);
1489     VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass);
1490     VirtIOMEMClass *vmc = VIRTIO_MEM_CLASS(klass);
1491     RamDiscardManagerClass *rdmc = RAM_DISCARD_MANAGER_CLASS(klass);
1492 
1493     device_class_set_props(dc, virtio_mem_properties);
1494     dc->vmsd = &vmstate_virtio_mem;
1495 
1496     set_bit(DEVICE_CATEGORY_MISC, dc->categories);
1497     vdc->realize = virtio_mem_device_realize;
1498     vdc->unrealize = virtio_mem_device_unrealize;
1499     vdc->get_config = virtio_mem_get_config;
1500     vdc->get_features = virtio_mem_get_features;
1501     vdc->validate_features = virtio_mem_validate_features;
1502     vdc->vmsd = &vmstate_virtio_mem_device;
1503 
1504     vmc->fill_device_info = virtio_mem_fill_device_info;
1505     vmc->get_memory_region = virtio_mem_get_memory_region;
1506     vmc->add_size_change_notifier = virtio_mem_add_size_change_notifier;
1507     vmc->remove_size_change_notifier = virtio_mem_remove_size_change_notifier;
1508 
1509     rdmc->get_min_granularity = virtio_mem_rdm_get_min_granularity;
1510     rdmc->is_populated = virtio_mem_rdm_is_populated;
1511     rdmc->replay_populated = virtio_mem_rdm_replay_populated;
1512     rdmc->replay_discarded = virtio_mem_rdm_replay_discarded;
1513     rdmc->register_listener = virtio_mem_rdm_register_listener;
1514     rdmc->unregister_listener = virtio_mem_rdm_unregister_listener;
1515 }
1516 
1517 static const TypeInfo virtio_mem_info = {
1518     .name = TYPE_VIRTIO_MEM,
1519     .parent = TYPE_VIRTIO_DEVICE,
1520     .instance_size = sizeof(VirtIOMEM),
1521     .instance_init = virtio_mem_instance_init,
1522     .class_init = virtio_mem_class_init,
1523     .class_size = sizeof(VirtIOMEMClass),
1524     .interfaces = (InterfaceInfo[]) {
1525         { TYPE_RAM_DISCARD_MANAGER },
1526         { }
1527     },
1528 };
1529 
1530 static void virtio_register_types(void)
1531 {
1532     type_register_static(&virtio_mem_info);
1533 }
1534 
1535 type_init(virtio_register_types)
1536