xref: /openbmc/qemu/hw/virtio/virtio-mem.c (revision 34a8892d)
1 /*
2  * Virtio MEM device
3  *
4  * Copyright (C) 2020 Red Hat, Inc.
5  *
6  * Authors:
7  *  David Hildenbrand <david@redhat.com>
8  *
9  * This work is licensed under the terms of the GNU GPL, version 2.
10  * See the COPYING file in the top-level directory.
11  */
12 
13 #include "qemu/osdep.h"
14 #include "qemu/iov.h"
15 #include "qemu/cutils.h"
16 #include "qemu/error-report.h"
17 #include "qemu/units.h"
18 #include "sysemu/numa.h"
19 #include "sysemu/sysemu.h"
20 #include "sysemu/reset.h"
21 #include "sysemu/runstate.h"
22 #include "hw/virtio/virtio.h"
23 #include "hw/virtio/virtio-bus.h"
24 #include "hw/virtio/virtio-mem.h"
25 #include "qapi/error.h"
26 #include "qapi/visitor.h"
27 #include "exec/ram_addr.h"
28 #include "migration/misc.h"
29 #include "hw/boards.h"
30 #include "hw/qdev-properties.h"
31 #include CONFIG_DEVICES
32 #include "trace.h"
33 
34 static const VMStateDescription vmstate_virtio_mem_device_early;
35 
36 /*
37  * Only x86 ever had legacy guests that did not support
38  * VIRTIO_MEM_F_UNPLUGGED_INACCESSIBLE. Other targets don't have legacy guests.
39  */
40 #if defined(TARGET_X86_64) || defined(TARGET_I386)
41 #define VIRTIO_MEM_HAS_LEGACY_GUESTS
42 #endif
43 
44 /*
45  * Let's not allow blocks smaller than 1 MiB, so that, for example, the
46  * tracking bitmap stays small.
47  */
48 #define VIRTIO_MEM_MIN_BLOCK_SIZE ((uint32_t)(1 * MiB))
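/*
 * For example (illustrative sizing): with the 1 MiB minimum block size, a
 * 1 TiB memory backend needs 1 Ti / 1 Mi = 1048576 bitmap bits, i.e., only
 * 128 KiB of bitmap memory.
 */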
49 
50 static uint32_t virtio_mem_default_thp_size(void)
51 {
52     uint32_t default_thp_size = VIRTIO_MEM_MIN_BLOCK_SIZE;
53 
54 #if defined(__x86_64__) || defined(__arm__) || defined(__powerpc64__)
55     default_thp_size = 2 * MiB;
56 #elif defined(__aarch64__)
57     if (qemu_real_host_page_size() == 4 * KiB) {
58         default_thp_size = 2 * MiB;
59     } else if (qemu_real_host_page_size() == 16 * KiB) {
60         default_thp_size = 32 * MiB;
61     } else if (qemu_real_host_page_size() == 64 * KiB) {
62         default_thp_size = 512 * MiB;
63     }
64 #endif
65 
66     return default_thp_size;
67 }
68 
69 /*
70  * The minimum memslot size depends on this setting ("sane default"), the
71  * device block size, and the memory backend page size. The last (or single)
72  * memslot might be smaller than this constant.
73  */
74 #define VIRTIO_MEM_MIN_MEMSLOT_SIZE (1 * GiB)
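/*
 * For example (illustrative, assuming 1 GiB memslots get selected): a
 * 2.5 GiB memory backend would be covered by two 1 GiB memslots plus a
 * final 512 MiB memslot; a backend smaller than 1 GiB gets a single memslot.
 */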
75 
76 /*
77  * We want to have a reasonable default block size such that
78  * 1. We avoid splitting THPs when unplugging memory, which degrades
79  *    performance.
80  * 2. We avoid placing THPs for plugged blocks that also cover unplugged
81  *    blocks.
82  *
83  * The actual THP size might differ between Linux kernels, so we try to probe
84  * it. In the future (if we ever run into issues regarding 2.), we might want
85  * to disable THP in case we fail to properly probe the THP size, or if the
86  * block size is configured smaller than the THP size.
87  */
88 static uint32_t thp_size;
89 
90 #define HPAGE_PMD_SIZE_PATH "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size"
91 #define HPAGE_PATH "/sys/kernel/mm/transparent_hugepage/"
92 static uint32_t virtio_mem_thp_size(void)
93 {
94     gchar *content = NULL;
95     const char *endptr;
96     uint64_t tmp;
97 
98     if (thp_size) {
99         return thp_size;
100     }
101 
102     /* No THP -> no restrictions. */
103     if (!g_file_test(HPAGE_PATH, G_FILE_TEST_EXISTS)) {
104         thp_size = VIRTIO_MEM_MIN_BLOCK_SIZE;
105         return thp_size;
106     }
107 
108     /*
109      * Try to probe the actual THP size, falling back to (sane but possibly
110      * incorrect) default sizes.
111      */
112     if (g_file_get_contents(HPAGE_PMD_SIZE_PATH, &content, NULL, NULL) &&
113         !qemu_strtou64(content, &endptr, 0, &tmp) &&
114         (!endptr || *endptr == '\n')) {
115         /* Sanity-check the value and fall back to something reasonable. */
116         if (!tmp || !is_power_of_2(tmp)) {
117             warn_report("Read unsupported THP size: %" PRIx64, tmp);
118         } else {
119             thp_size = tmp;
120         }
121     }
122 
123     if (!thp_size) {
124         thp_size = virtio_mem_default_thp_size();
125         warn_report("Could not detect THP size, falling back to %" PRIx64
126                     " MiB.", thp_size / MiB);
127     }
128 
129     g_free(content);
130     return thp_size;
131 }
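/*
 * For example (illustrative): on a typical x86-64 host, HPAGE_PMD_SIZE_PATH
 * contains "2097152\n". qemu_strtou64() parses it as 2097152 (2 MiB) with
 * *endptr == '\n', the power-of-two check passes, and the result is cached
 * in thp_size so that later calls return immediately.
 */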
132 
133 static uint64_t virtio_mem_default_block_size(RAMBlock *rb)
134 {
135     const uint64_t page_size = qemu_ram_pagesize(rb);
136 
137     /* We can have hugetlbfs with a page size smaller than the THP size. */
138     if (page_size == qemu_real_host_page_size()) {
139         return MAX(page_size, virtio_mem_thp_size());
140     }
141     return MAX(page_size, VIRTIO_MEM_MIN_BLOCK_SIZE);
142 }
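/*
 * For example (illustrative): an anonymous-memory backend with 4 KiB pages
 * on x86-64 yields MAX(4 KiB, 2 MiB THP size) = 2 MiB, while a hugetlbfs
 * backend with 1 GiB pages doesn't match the host page size and yields
 * MAX(1 GiB, 1 MiB) = 1 GiB.
 */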
143 
144 #if defined(VIRTIO_MEM_HAS_LEGACY_GUESTS)
145 static bool virtio_mem_has_shared_zeropage(RAMBlock *rb)
146 {
147     /*
148      * We only have a guaranteed shared zeropage on ordinary MAP_PRIVATE
149      * anonymous RAM. In any other case, reading unplugged memory *can*
150      * populate a fresh page, consuming actual memory.
151      */
152     return !qemu_ram_is_shared(rb) && qemu_ram_get_fd(rb) < 0 &&
153            qemu_ram_pagesize(rb) == qemu_real_host_page_size();
154 }
155 #endif /* VIRTIO_MEM_HAS_LEGACY_GUESTS */
156 
157 /*
158  * Size the usable region bigger than the requested size if possible.
159  * Especially Linux guests will only add (aligned) memory blocks if they
160  * fully fit into the usable region, but plug+online only a subset of the
161  * pages. The memory block size mostly corresponds to the section size.
162  *
163  * This allows, e.g., adding 20MB with a section size of 128MB on x86_64,
164  * and with a section size of 512MB on arm64 (as long as the start address
165  * is properly aligned, similar to ordinary DIMMs).
166  *
167  * We can change this at any time, and maybe even make it configurable if
168  * necessary (as the section size can change). But it's more likely that
169  * the section size will get smaller rather than bigger over time.
170  */
171 #if defined(TARGET_X86_64) || defined(TARGET_I386)
172 #define VIRTIO_MEM_USABLE_EXTENT (2 * (128 * MiB))
173 #elif defined(TARGET_ARM)
174 #define VIRTIO_MEM_USABLE_EXTENT (2 * (512 * MiB))
175 #else
176 #error VIRTIO_MEM_USABLE_EXTENT not defined
177 #endif
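/*
 * For example (illustrative): on x86-64 with requested-size=20MiB, the
 * usable region gets sized to 20 MiB + 2 * 128 MiB = 276 MiB (capped at the
 * memdev size), leaving room for the guest to fit whole, aligned 128 MiB
 * memory blocks.
 */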
178 
179 static bool virtio_mem_is_busy(void)
180 {
181     /*
182      * Postcopy cannot handle concurrent discards, and we don't want to
183      * migrate pages on-demand with stale content when plugging new blocks.
184      *
185      * For precopy, we don't want unplugged blocks in our migration stream.
186      * Furthermore, when plugging new blocks, the page content might differ
187      * between source and destination (observable by the guest when not
188      * initializing pages after plugging them) until we're running on the
189      * destination, as we didn't migrate these blocks when they were unplugged.
190      */
191     return migration_in_incoming_postcopy() || migration_is_running();
192 }
193 
194 typedef int (*virtio_mem_range_cb)(VirtIOMEM *vmem, void *arg,
195                                    uint64_t offset, uint64_t size);
196 
197 static int virtio_mem_for_each_unplugged_range(VirtIOMEM *vmem, void *arg,
198                                                virtio_mem_range_cb cb)
199 {
200     unsigned long first_zero_bit, last_zero_bit;
201     uint64_t offset, size;
202     int ret = 0;
203 
204     first_zero_bit = find_first_zero_bit(vmem->bitmap, vmem->bitmap_size);
205     while (first_zero_bit < vmem->bitmap_size) {
206         offset = first_zero_bit * vmem->block_size;
207         last_zero_bit = find_next_bit(vmem->bitmap, vmem->bitmap_size,
208                                       first_zero_bit + 1) - 1;
209         size = (last_zero_bit - first_zero_bit + 1) * vmem->block_size;
210 
211         ret = cb(vmem, arg, offset, size);
212         if (ret) {
213             break;
214         }
215         first_zero_bit = find_next_zero_bit(vmem->bitmap, vmem->bitmap_size,
216                                             last_zero_bit + 2);
217     }
218     return ret;
219 }
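/*
 * For example (illustrative run detection): with block_size = 2 MiB and
 * bitmap 1,1,0,0,0,1 (a set bit means plugged), find_first_zero_bit()
 * returns 2 and find_next_bit(..., 3) returns 5, so last_zero_bit = 4. The
 * callback is invoked once with offset = 2 * 2 MiB = 4 MiB and
 * size = 3 * 2 MiB = 6 MiB, covering the whole unplugged run.
 */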
220 
221 static int virtio_mem_for_each_plugged_range(VirtIOMEM *vmem, void *arg,
222                                              virtio_mem_range_cb cb)
223 {
224     unsigned long first_bit, last_bit;
225     uint64_t offset, size;
226     int ret = 0;
227 
228     first_bit = find_first_bit(vmem->bitmap, vmem->bitmap_size);
229     while (first_bit < vmem->bitmap_size) {
230         offset = first_bit * vmem->block_size;
231         last_bit = find_next_zero_bit(vmem->bitmap, vmem->bitmap_size,
232                                       first_bit + 1) - 1;
233         size = (last_bit - first_bit + 1) * vmem->block_size;
234 
235         ret = cb(vmem, arg, offset, size);
236         if (ret) {
237             break;
238         }
239         first_bit = find_next_bit(vmem->bitmap, vmem->bitmap_size,
240                                   last_bit + 2);
241     }
242     return ret;
243 }
244 
245 /*
246  * Adjust the memory section to cover the intersection with the given range.
247  *
248  * Returns false if the intersection is empty, otherwise returns true.
249  */
250 static bool virtio_mem_intersect_memory_section(MemoryRegionSection *s,
251                                                 uint64_t offset, uint64_t size)
252 {
253     uint64_t start = MAX(s->offset_within_region, offset);
254     uint64_t end = MIN(s->offset_within_region + int128_get64(s->size),
255                        offset + size);
256 
257     if (end <= start) {
258         return false;
259     }
260 
261     s->offset_within_address_space += start - s->offset_within_region;
262     s->offset_within_region = start;
263     s->size = int128_make64(end - start);
264     return true;
265 }
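/*
 * For example (illustrative values): for a section covering region offsets
 * [0x100000, 0x500000) and a range with offset = 0x300000 and
 * size = 0x400000:
 *   start = MAX(0x100000, 0x300000) = 0x300000
 *   end   = MIN(0x500000, 0x700000) = 0x500000
 * The section is narrowed to [0x300000, 0x500000), i.e., size 0x200000, and
 * offset_within_address_space advances by the same 0x200000.
 */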
266 
267 typedef int (*virtio_mem_section_cb)(MemoryRegionSection *s, void *arg);
268 
269 static int virtio_mem_for_each_plugged_section(const VirtIOMEM *vmem,
270                                                MemoryRegionSection *s,
271                                                void *arg,
272                                                virtio_mem_section_cb cb)
273 {
274     unsigned long first_bit, last_bit;
275     uint64_t offset, size;
276     int ret = 0;
277 
278     first_bit = s->offset_within_region / vmem->block_size;
279     first_bit = find_next_bit(vmem->bitmap, vmem->bitmap_size, first_bit);
280     while (first_bit < vmem->bitmap_size) {
281         MemoryRegionSection tmp = *s;
282 
283         offset = first_bit * vmem->block_size;
284         last_bit = find_next_zero_bit(vmem->bitmap, vmem->bitmap_size,
285                                       first_bit + 1) - 1;
286         size = (last_bit - first_bit + 1) * vmem->block_size;
287 
288         if (!virtio_mem_intersect_memory_section(&tmp, offset, size)) {
289             break;
290         }
291         ret = cb(&tmp, arg);
292         if (ret) {
293             break;
294         }
295         first_bit = find_next_bit(vmem->bitmap, vmem->bitmap_size,
296                                   last_bit + 2);
297     }
298     return ret;
299 }
300 
301 static int virtio_mem_for_each_unplugged_section(const VirtIOMEM *vmem,
302                                                  MemoryRegionSection *s,
303                                                  void *arg,
304                                                  virtio_mem_section_cb cb)
305 {
306     unsigned long first_bit, last_bit;
307     uint64_t offset, size;
308     int ret = 0;
309 
310     first_bit = s->offset_within_region / vmem->block_size;
311     first_bit = find_next_zero_bit(vmem->bitmap, vmem->bitmap_size, first_bit);
312     while (first_bit < vmem->bitmap_size) {
313         MemoryRegionSection tmp = *s;
314 
315         offset = first_bit * vmem->block_size;
316         last_bit = find_next_bit(vmem->bitmap, vmem->bitmap_size,
317                                  first_bit + 1) - 1;
318         size = (last_bit - first_bit + 1) * vmem->block_size;
319 
320         if (!virtio_mem_intersect_memory_section(&tmp, offset, size)) {
321             break;
322         }
323         ret = cb(&tmp, arg);
324         if (ret) {
325             break;
326         }
327         first_bit = find_next_zero_bit(vmem->bitmap, vmem->bitmap_size,
328                                        last_bit + 2);
329     }
330     return ret;
331 }
332 
333 static int virtio_mem_notify_populate_cb(MemoryRegionSection *s, void *arg)
334 {
335     RamDiscardListener *rdl = arg;
336 
337     return rdl->notify_populate(rdl, s);
338 }
339 
340 static int virtio_mem_notify_discard_cb(MemoryRegionSection *s, void *arg)
341 {
342     RamDiscardListener *rdl = arg;
343 
344     rdl->notify_discard(rdl, s);
345     return 0;
346 }
347 
348 static void virtio_mem_notify_unplug(VirtIOMEM *vmem, uint64_t offset,
349                                      uint64_t size)
350 {
351     RamDiscardListener *rdl;
352 
353     QLIST_FOREACH(rdl, &vmem->rdl_list, next) {
354         MemoryRegionSection tmp = *rdl->section;
355 
356         if (!virtio_mem_intersect_memory_section(&tmp, offset, size)) {
357             continue;
358         }
359         rdl->notify_discard(rdl, &tmp);
360     }
361 }
362 
363 static int virtio_mem_notify_plug(VirtIOMEM *vmem, uint64_t offset,
364                                   uint64_t size)
365 {
366     RamDiscardListener *rdl, *rdl2;
367     int ret = 0;
368 
369     QLIST_FOREACH(rdl, &vmem->rdl_list, next) {
370         MemoryRegionSection tmp = *rdl->section;
371 
372         if (!virtio_mem_intersect_memory_section(&tmp, offset, size)) {
373             continue;
374         }
375         ret = rdl->notify_populate(rdl, &tmp);
376         if (ret) {
377             break;
378         }
379     }
380 
381     if (ret) {
382         /* Roll back: notify discard for all already-notified listeners. */
383         QLIST_FOREACH(rdl2, &vmem->rdl_list, next) {
384             MemoryRegionSection tmp = *rdl2->section;
385 
386             if (rdl2 == rdl) {
387                 break;
388             }
389             if (!virtio_mem_intersect_memory_section(&tmp, offset, size)) {
390                 continue;
391             }
392             rdl2->notify_discard(rdl2, &tmp);
393         }
394     }
395     return ret;
396 }
397 
398 static void virtio_mem_notify_unplug_all(VirtIOMEM *vmem)
399 {
400     RamDiscardListener *rdl;
401 
402     if (!vmem->size) {
403         return;
404     }
405 
406     QLIST_FOREACH(rdl, &vmem->rdl_list, next) {
407         if (rdl->double_discard_supported) {
408             rdl->notify_discard(rdl, rdl->section);
409         } else {
410             virtio_mem_for_each_plugged_section(vmem, rdl->section, rdl,
411                                                 virtio_mem_notify_discard_cb);
412         }
413     }
414 }
415 
416 static bool virtio_mem_is_range_plugged(const VirtIOMEM *vmem,
417                                         uint64_t start_gpa, uint64_t size)
418 {
419     const unsigned long first_bit = (start_gpa - vmem->addr) / vmem->block_size;
420     const unsigned long last_bit = first_bit + (size / vmem->block_size) - 1;
421     unsigned long found_bit;
422 
423     /* We fake a shorter bitmap to avoid searching too far. */
424     found_bit = find_next_zero_bit(vmem->bitmap, last_bit + 1, first_bit);
425     return found_bit > last_bit;
426 }
427 
428 static bool virtio_mem_is_range_unplugged(const VirtIOMEM *vmem,
429                                           uint64_t start_gpa, uint64_t size)
430 {
431     const unsigned long first_bit = (start_gpa - vmem->addr) / vmem->block_size;
432     const unsigned long last_bit = first_bit + (size / vmem->block_size) - 1;
433     unsigned long found_bit;
434 
435     /* We fake a shorter bitmap to avoid searching too far. */
436     found_bit = find_next_bit(vmem->bitmap, last_bit + 1, first_bit);
437     return found_bit > last_bit;
438 }
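/*
 * For example (illustrative): to check whether blocks 4..7 are all plugged,
 * find_next_zero_bit() gets passed last_bit + 1 = 8 as the bitmap size. If
 * it returns 8, no zero bit exists within [4, 7] and the whole range is
 * plugged; any smaller return value identifies an unplugged block.
 */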
439 
440 static void virtio_mem_set_range_plugged(VirtIOMEM *vmem, uint64_t start_gpa,
441                                          uint64_t size)
442 {
443     const unsigned long bit = (start_gpa - vmem->addr) / vmem->block_size;
444     const unsigned long nbits = size / vmem->block_size;
445 
446     bitmap_set(vmem->bitmap, bit, nbits);
447 }
448 
449 static void virtio_mem_set_range_unplugged(VirtIOMEM *vmem, uint64_t start_gpa,
450                                            uint64_t size)
451 {
452     const unsigned long bit = (start_gpa - vmem->addr) / vmem->block_size;
453     const unsigned long nbits = size / vmem->block_size;
454 
455     bitmap_clear(vmem->bitmap, bit, nbits);
456 }
457 
458 static void virtio_mem_send_response(VirtIOMEM *vmem, VirtQueueElement *elem,
459                                      struct virtio_mem_resp *resp)
460 {
461     VirtIODevice *vdev = VIRTIO_DEVICE(vmem);
462     VirtQueue *vq = vmem->vq;
463 
464     trace_virtio_mem_send_response(le16_to_cpu(resp->type));
465     iov_from_buf(elem->in_sg, elem->in_num, 0, resp, sizeof(*resp));
466 
467     virtqueue_push(vq, elem, sizeof(*resp));
468     virtio_notify(vdev, vq);
469 }
470 
471 static void virtio_mem_send_response_simple(VirtIOMEM *vmem,
472                                             VirtQueueElement *elem,
473                                             uint16_t type)
474 {
475     struct virtio_mem_resp resp = {
476         .type = cpu_to_le16(type),
477     };
478 
479     virtio_mem_send_response(vmem, elem, &resp);
480 }
481 
482 static bool virtio_mem_valid_range(const VirtIOMEM *vmem, uint64_t gpa,
483                                    uint64_t size)
484 {
485     if (!QEMU_IS_ALIGNED(gpa, vmem->block_size)) {
486         return false;
487     }
488     if (gpa + size < gpa || !size) {
489         return false;
490     }
491     if (gpa < vmem->addr || gpa >= vmem->addr + vmem->usable_region_size) {
492         return false;
493     }
494     if (gpa + size > vmem->addr + vmem->usable_region_size) {
495         return false;
496     }
497     return true;
498 }
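/*
 * Examples of rejected requests (illustrative): a gpa that isn't aligned to
 * the block size; size == 0; a gpa + size that wraps around UINT64_MAX
 * (caught by the "gpa + size < gpa" overflow check); or a range that ends
 * past addr + usable_region_size.
 */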
499 
500 static void virtio_mem_activate_memslot(VirtIOMEM *vmem, unsigned int idx)
501 {
502     const uint64_t memslot_offset = idx * vmem->memslot_size;
503 
504     assert(vmem->memslots);
505 
506     /*
507      * Instead of enabling/disabling memslots, we add/remove them. This should
508      * make address space updates faster, because we don't have to loop over
509      * many disabled subregions.
510      */
511     if (memory_region_is_mapped(&vmem->memslots[idx])) {
512         return;
513     }
514     memory_region_add_subregion(vmem->mr, memslot_offset, &vmem->memslots[idx]);
515 }
516 
517 static void virtio_mem_deactivate_memslot(VirtIOMEM *vmem, unsigned int idx)
518 {
519     assert(vmem->memslots);
520 
521     if (!memory_region_is_mapped(&vmem->memslots[idx])) {
522         return;
523     }
524     memory_region_del_subregion(vmem->mr, &vmem->memslots[idx]);
525 }
526 
527 static void virtio_mem_activate_memslots_to_plug(VirtIOMEM *vmem,
528                                                  uint64_t offset, uint64_t size)
529 {
530     const unsigned int start_idx = offset / vmem->memslot_size;
531     const unsigned int end_idx = (offset + size + vmem->memslot_size - 1) /
532                                  vmem->memslot_size;
533     unsigned int idx;
534 
535     assert(vmem->dynamic_memslots);
536 
537     /* Activate all involved memslots in a single transaction. */
538     memory_region_transaction_begin();
539     for (idx = start_idx; idx < end_idx; idx++) {
540         virtio_mem_activate_memslot(vmem, idx);
541     }
542     memory_region_transaction_commit();
543 }
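/*
 * For example (illustrative index arithmetic): with memslot_size = 1 GiB,
 * offset = 3 GiB and size = 2 GiB yield start_idx = 3 and
 * end_idx = (5 GiB + 1 GiB - 1) / 1 GiB = 5, so memslots 3 and 4 get
 * activated.
 */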
544 
545 static void virtio_mem_deactivate_unplugged_memslots(VirtIOMEM *vmem,
546                                                      uint64_t offset,
547                                                      uint64_t size)
548 {
549     const uint64_t region_size = memory_region_size(&vmem->memdev->mr);
550     const unsigned int start_idx = offset / vmem->memslot_size;
551     const unsigned int end_idx = (offset + size + vmem->memslot_size - 1) /
552                                  vmem->memslot_size;
553     unsigned int idx;
554 
555     assert(vmem->dynamic_memslots);
556 
557     /* Deactivate all now fully unplugged memslots in a single transaction. */
558     memory_region_transaction_begin();
559     for (idx = start_idx; idx < end_idx; idx++) {
560         const uint64_t memslot_offset = idx * vmem->memslot_size;
561         uint64_t memslot_size = vmem->memslot_size;
562 
563         /* The size of the last memslot might be smaller. */
564         if (idx == vmem->nb_memslots - 1) {
565             memslot_size = region_size - memslot_offset;
566         }
567 
568         /*
569          * Partially covered memslots might still have some blocks plugged and
570          * have to remain active if that's the case.
571          */
572         if (offset > memslot_offset ||
573             offset + size < memslot_offset + memslot_size) {
574             const uint64_t gpa = vmem->addr + memslot_offset;
575 
576             if (!virtio_mem_is_range_unplugged(vmem, gpa, memslot_size)) {
577                 continue;
578             }
579         }
580 
581         virtio_mem_deactivate_memslot(vmem, idx);
582     }
583     memory_region_transaction_commit();
584 }
585 
586 static int virtio_mem_set_block_state(VirtIOMEM *vmem, uint64_t start_gpa,
587                                       uint64_t size, bool plug)
588 {
589     const uint64_t offset = start_gpa - vmem->addr;
590     RAMBlock *rb = vmem->memdev->mr.ram_block;
591     int ret = 0;
592 
593     if (virtio_mem_is_busy()) {
594         return -EBUSY;
595     }
596 
597     if (!plug) {
598         if (ram_block_discard_range(rb, offset, size)) {
599             return -EBUSY;
600         }
601         virtio_mem_notify_unplug(vmem, offset, size);
602         virtio_mem_set_range_unplugged(vmem, start_gpa, size);
603         /* Deactivate completely unplugged memslots after updating the state. */
604         if (vmem->dynamic_memslots) {
605             virtio_mem_deactivate_unplugged_memslots(vmem, offset, size);
606         }
607         return 0;
608     }
609 
610     if (vmem->prealloc) {
611         void *area = memory_region_get_ram_ptr(&vmem->memdev->mr) + offset;
612         int fd = memory_region_get_fd(&vmem->memdev->mr);
613         Error *local_err = NULL;
614 
615         if (!qemu_prealloc_mem(fd, area, size, 1, NULL, false, &local_err)) {
616             static bool warned;
617 
618             /*
619              * Warn only once, we don't want to fill the log with these
620              * warnings.
621              */
622             if (!warned) {
623                 warn_report_err(local_err);
624                 warned = true;
625             } else {
626                 error_free(local_err);
627             }
628             ret = -EBUSY;
629         }
630     }
631 
632     if (!ret) {
633         /*
634          * Activate before notifying and rollback in case of any errors.
635          *
636          * When activating a not-yet-active memslot, memory notifiers will get
637          * notified about the added memory region and can register with the
638          * RamDiscardManager; this will traverse all plugged blocks and skip the
639          * blocks we are plugging here. The following notification will inform
640          * registered listeners about the blocks we're plugging.
641          */
642         if (vmem->dynamic_memslots) {
643             virtio_mem_activate_memslots_to_plug(vmem, offset, size);
644         }
645         ret = virtio_mem_notify_plug(vmem, offset, size);
646         if (ret && vmem->dynamic_memslots) {
647             virtio_mem_deactivate_unplugged_memslots(vmem, offset, size);
648         }
649     }
650     if (ret) {
651         /* Preallocation or a notifier might have populated memory; discard it. */
652         ram_block_discard_range(vmem->memdev->mr.ram_block, offset, size);
653         return -EBUSY;
654     }
655 
656     virtio_mem_set_range_plugged(vmem, start_gpa, size);
657     return 0;
658 }
659 
660 static int virtio_mem_state_change_request(VirtIOMEM *vmem, uint64_t gpa,
661                                            uint16_t nb_blocks, bool plug)
662 {
663     const uint64_t size = nb_blocks * vmem->block_size;
664     int ret;
665 
666     if (!virtio_mem_valid_range(vmem, gpa, size)) {
667         return VIRTIO_MEM_RESP_ERROR;
668     }
669 
670     if (plug && (vmem->size + size > vmem->requested_size)) {
671         return VIRTIO_MEM_RESP_NACK;
672     }
673 
674     /* Test whether all blocks are really in the opposite state. */
675     if ((plug && !virtio_mem_is_range_unplugged(vmem, gpa, size)) ||
676         (!plug && !virtio_mem_is_range_plugged(vmem, gpa, size))) {
677         return VIRTIO_MEM_RESP_ERROR;
678     }
679 
680     ret = virtio_mem_set_block_state(vmem, gpa, size, plug);
681     if (ret) {
682         return VIRTIO_MEM_RESP_BUSY;
683     }
684     if (plug) {
685         vmem->size += size;
686     } else {
687         vmem->size -= size;
688     }
689     notifier_list_notify(&vmem->size_change_notifiers, &vmem->size);
690     return VIRTIO_MEM_RESP_ACK;
691 }
692 
693 static void virtio_mem_plug_request(VirtIOMEM *vmem, VirtQueueElement *elem,
694                                     struct virtio_mem_req *req)
695 {
696     const uint64_t gpa = le64_to_cpu(req->u.plug.addr);
697     const uint16_t nb_blocks = le16_to_cpu(req->u.plug.nb_blocks);
698     uint16_t type;
699 
700     trace_virtio_mem_plug_request(gpa, nb_blocks);
701     type = virtio_mem_state_change_request(vmem, gpa, nb_blocks, true);
702     virtio_mem_send_response_simple(vmem, elem, type);
703 }
704 
705 static void virtio_mem_unplug_request(VirtIOMEM *vmem, VirtQueueElement *elem,
706                                       struct virtio_mem_req *req)
707 {
708     const uint64_t gpa = le64_to_cpu(req->u.unplug.addr);
709     const uint16_t nb_blocks = le16_to_cpu(req->u.unplug.nb_blocks);
710     uint16_t type;
711 
712     trace_virtio_mem_unplug_request(gpa, nb_blocks);
713     type = virtio_mem_state_change_request(vmem, gpa, nb_blocks, false);
714     virtio_mem_send_response_simple(vmem, elem, type);
715 }
716 
717 static void virtio_mem_resize_usable_region(VirtIOMEM *vmem,
718                                             uint64_t requested_size,
719                                             bool can_shrink)
720 {
721     uint64_t newsize = MIN(memory_region_size(&vmem->memdev->mr),
722                            requested_size + VIRTIO_MEM_USABLE_EXTENT);
723 
724     /* The usable region size always has to be a multiple of the block size. */
725     newsize = QEMU_ALIGN_UP(newsize, vmem->block_size);
726 
727     if (!requested_size) {
728         newsize = 0;
729     }
730 
731     if (newsize < vmem->usable_region_size && !can_shrink) {
732         return;
733     }
734 
735     trace_virtio_mem_resized_usable_region(vmem->usable_region_size, newsize);
736     vmem->usable_region_size = newsize;
737 }
738 
739 static int virtio_mem_unplug_all(VirtIOMEM *vmem)
740 {
741     const uint64_t region_size = memory_region_size(&vmem->memdev->mr);
742     RAMBlock *rb = vmem->memdev->mr.ram_block;
743 
744     if (vmem->size) {
745         if (virtio_mem_is_busy()) {
746             return -EBUSY;
747         }
748         if (ram_block_discard_range(rb, 0, qemu_ram_get_used_length(rb))) {
749             return -EBUSY;
750         }
751         virtio_mem_notify_unplug_all(vmem);
752 
753         bitmap_clear(vmem->bitmap, 0, vmem->bitmap_size);
754         vmem->size = 0;
755         notifier_list_notify(&vmem->size_change_notifiers, &vmem->size);
756 
757         /* Deactivate all memslots after updating the state. */
758         if (vmem->dynamic_memslots) {
759             virtio_mem_deactivate_unplugged_memslots(vmem, 0, region_size);
760         }
761     }
762 
763     trace_virtio_mem_unplugged_all();
764     virtio_mem_resize_usable_region(vmem, vmem->requested_size, true);
765     return 0;
766 }
767 
768 static void virtio_mem_unplug_all_request(VirtIOMEM *vmem,
769                                           VirtQueueElement *elem)
770 {
771     trace_virtio_mem_unplug_all_request();
772     if (virtio_mem_unplug_all(vmem)) {
773         virtio_mem_send_response_simple(vmem, elem, VIRTIO_MEM_RESP_BUSY);
774     } else {
775         virtio_mem_send_response_simple(vmem, elem, VIRTIO_MEM_RESP_ACK);
776     }
777 }
778 
779 static void virtio_mem_state_request(VirtIOMEM *vmem, VirtQueueElement *elem,
780                                      struct virtio_mem_req *req)
781 {
782     const uint16_t nb_blocks = le16_to_cpu(req->u.state.nb_blocks);
783     const uint64_t gpa = le64_to_cpu(req->u.state.addr);
784     const uint64_t size = nb_blocks * vmem->block_size;
785     struct virtio_mem_resp resp = {
786         .type = cpu_to_le16(VIRTIO_MEM_RESP_ACK),
787     };
788 
789     trace_virtio_mem_state_request(gpa, nb_blocks);
790     if (!virtio_mem_valid_range(vmem, gpa, size)) {
791         virtio_mem_send_response_simple(vmem, elem, VIRTIO_MEM_RESP_ERROR);
792         return;
793     }
794 
795     if (virtio_mem_is_range_plugged(vmem, gpa, size)) {
796         resp.u.state.state = cpu_to_le16(VIRTIO_MEM_STATE_PLUGGED);
797     } else if (virtio_mem_is_range_unplugged(vmem, gpa, size)) {
798         resp.u.state.state = cpu_to_le16(VIRTIO_MEM_STATE_UNPLUGGED);
799     } else {
800         resp.u.state.state = cpu_to_le16(VIRTIO_MEM_STATE_MIXED);
801     }
802     trace_virtio_mem_state_response(le16_to_cpu(resp.u.state.state));
803     virtio_mem_send_response(vmem, elem, &resp);
804 }
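/*
 * For example (illustrative): for a queried range of four blocks, all bits
 * set yields VIRTIO_MEM_STATE_PLUGGED, all bits clear yields
 * VIRTIO_MEM_STATE_UNPLUGGED, and any mix (say 1,1,0,1) yields
 * VIRTIO_MEM_STATE_MIXED.
 */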
805 
806 static void virtio_mem_handle_request(VirtIODevice *vdev, VirtQueue *vq)
807 {
808     const int len = sizeof(struct virtio_mem_req);
809     VirtIOMEM *vmem = VIRTIO_MEM(vdev);
810     VirtQueueElement *elem;
811     struct virtio_mem_req req;
812     uint16_t type;
813 
814     while (true) {
815         elem = virtqueue_pop(vq, sizeof(VirtQueueElement));
816         if (!elem) {
817             return;
818         }
819 
820         if (iov_to_buf(elem->out_sg, elem->out_num, 0, &req, len) < len) {
821             virtio_error(vdev, "virtio-mem protocol violation: invalid request"
822                          " size: %d", len);
823             virtqueue_detach_element(vq, elem, 0);
824             g_free(elem);
825             return;
826         }
827 
828         if (iov_size(elem->in_sg, elem->in_num) <
829             sizeof(struct virtio_mem_resp)) {
830             virtio_error(vdev, "virtio-mem protocol violation: not enough space"
831                          " for response: %zu",
832                          iov_size(elem->in_sg, elem->in_num));
833             virtqueue_detach_element(vq, elem, 0);
834             g_free(elem);
835             return;
836         }
837 
838         type = le16_to_cpu(req.type);
839         switch (type) {
840         case VIRTIO_MEM_REQ_PLUG:
841             virtio_mem_plug_request(vmem, elem, &req);
842             break;
843         case VIRTIO_MEM_REQ_UNPLUG:
844             virtio_mem_unplug_request(vmem, elem, &req);
845             break;
846         case VIRTIO_MEM_REQ_UNPLUG_ALL:
847             virtio_mem_unplug_all_request(vmem, elem);
848             break;
849         case VIRTIO_MEM_REQ_STATE:
850             virtio_mem_state_request(vmem, elem, &req);
851             break;
852         default:
853             virtio_error(vdev, "virtio-mem protocol violation: unknown request"
854                          " type: %d", type);
855             virtqueue_detach_element(vq, elem, 0);
856             g_free(elem);
857             return;
858         }
859 
860         g_free(elem);
861     }
862 }
863 
864 static void virtio_mem_get_config(VirtIODevice *vdev, uint8_t *config_data)
865 {
866     VirtIOMEM *vmem = VIRTIO_MEM(vdev);
867     struct virtio_mem_config *config = (void *) config_data;
868 
869     config->block_size = cpu_to_le64(vmem->block_size);
870     config->node_id = cpu_to_le16(vmem->node);
871     config->requested_size = cpu_to_le64(vmem->requested_size);
872     config->plugged_size = cpu_to_le64(vmem->size);
873     config->addr = cpu_to_le64(vmem->addr);
874     config->region_size = cpu_to_le64(memory_region_size(&vmem->memdev->mr));
875     config->usable_region_size = cpu_to_le64(vmem->usable_region_size);
876 }
877 
878 static uint64_t virtio_mem_get_features(VirtIODevice *vdev, uint64_t features,
879                                         Error **errp)
880 {
881     MachineState *ms = MACHINE(qdev_get_machine());
882     VirtIOMEM *vmem = VIRTIO_MEM(vdev);
883 
884     if (ms->numa_state) {
885 #if defined(CONFIG_ACPI)
886         virtio_add_feature(&features, VIRTIO_MEM_F_ACPI_PXM);
887 #endif
888     }
889     assert(vmem->unplugged_inaccessible != ON_OFF_AUTO_AUTO);
890     if (vmem->unplugged_inaccessible == ON_OFF_AUTO_ON) {
891         virtio_add_feature(&features, VIRTIO_MEM_F_UNPLUGGED_INACCESSIBLE);
892     }
893     if (qemu_wakeup_suspend_enabled()) {
894         virtio_add_feature(&features, VIRTIO_MEM_F_PERSISTENT_SUSPEND);
895     }
896     return features;
897 }
898 
899 static int virtio_mem_validate_features(VirtIODevice *vdev)
900 {
901     if (virtio_host_has_feature(vdev, VIRTIO_MEM_F_UNPLUGGED_INACCESSIBLE) &&
902         !virtio_vdev_has_feature(vdev, VIRTIO_MEM_F_UNPLUGGED_INACCESSIBLE)) {
903         return -EFAULT;
904     }
905     return 0;
906 }
907 
908 static void virtio_mem_prepare_mr(VirtIOMEM *vmem)
909 {
910     const uint64_t region_size = memory_region_size(&vmem->memdev->mr);
911 
912     assert(!vmem->mr && vmem->dynamic_memslots);
913     vmem->mr = g_new0(MemoryRegion, 1);
914     memory_region_init(vmem->mr, OBJECT(vmem), "virtio-mem",
915                        region_size);
916     vmem->mr->align = memory_region_get_alignment(&vmem->memdev->mr);
917 }
918 
919 static void virtio_mem_prepare_memslots(VirtIOMEM *vmem)
920 {
921     const uint64_t region_size = memory_region_size(&vmem->memdev->mr);
922     unsigned int idx;
923 
924     g_assert(!vmem->memslots && vmem->nb_memslots && vmem->dynamic_memslots);
925     vmem->memslots = g_new0(MemoryRegion, vmem->nb_memslots);
926 
927     /* Initialize our memslots, but don't map them yet. */
928     for (idx = 0; idx < vmem->nb_memslots; idx++) {
929         const uint64_t memslot_offset = idx * vmem->memslot_size;
930         uint64_t memslot_size = vmem->memslot_size;
931         char name[20];
932 
933         /* The size of the last memslot might be smaller. */
934         if (idx == vmem->nb_memslots - 1) {
935             memslot_size = region_size - memslot_offset;
936         }
937 
938         snprintf(name, sizeof(name), "memslot-%u", idx);
939         memory_region_init_alias(&vmem->memslots[idx], OBJECT(vmem), name,
940                                  &vmem->memdev->mr, memslot_offset,
941                                  memslot_size);
942         /*
943          * We want to be able to atomically and efficiently activate/deactivate
944          * individual memslots without affecting adjacent memslots in memory
945          * notifiers.
946          */
947         memory_region_set_unmergeable(&vmem->memslots[idx], true);
948     }
949 }
950 
951 static void virtio_mem_device_realize(DeviceState *dev, Error **errp)
952 {
953     MachineState *ms = MACHINE(qdev_get_machine());
954     int nb_numa_nodes = ms->numa_state ? ms->numa_state->num_nodes : 0;
955     VirtIODevice *vdev = VIRTIO_DEVICE(dev);
956     VirtIOMEM *vmem = VIRTIO_MEM(dev);
957     uint64_t page_size;
958     RAMBlock *rb;
959     int ret;
960 
961     if (!vmem->memdev) {
962         error_setg(errp, "'%s' property is not set", VIRTIO_MEM_MEMDEV_PROP);
963         return;
964     } else if (host_memory_backend_is_mapped(vmem->memdev)) {
965         error_setg(errp, "'%s' property specifies a busy memdev: %s",
966                    VIRTIO_MEM_MEMDEV_PROP,
967                    object_get_canonical_path_component(OBJECT(vmem->memdev)));
968         return;
969     } else if (!memory_region_is_ram(&vmem->memdev->mr) ||
970         memory_region_is_rom(&vmem->memdev->mr) ||
971         !vmem->memdev->mr.ram_block) {
972         error_setg(errp, "'%s' property specifies an unsupported memdev",
973                    VIRTIO_MEM_MEMDEV_PROP);
974         return;
975     } else if (vmem->memdev->prealloc) {
976         error_setg(errp, "'%s' property specifies a memdev with preallocation"
977                    " enabled: %s. Instead, specify 'prealloc=on' for the"
978                    " virtio-mem device.", VIRTIO_MEM_MEMDEV_PROP,
979                    object_get_canonical_path_component(OBJECT(vmem->memdev)));
980         return;
981     }
982 
983     if ((nb_numa_nodes && vmem->node >= nb_numa_nodes) ||
984         (!nb_numa_nodes && vmem->node)) {
985         error_setg(errp, "'%s' property has value '%" PRIu32 "', which exceeds"
986                    " the number of NUMA nodes: %d", VIRTIO_MEM_NODE_PROP,
987                    vmem->node, nb_numa_nodes ? nb_numa_nodes : 1);
988         return;
989     }
990 
991     if (enable_mlock) {
992         error_setg(errp, "Incompatible with mlock");
993         return;
994     }
995 
996     rb = vmem->memdev->mr.ram_block;
997     page_size = qemu_ram_pagesize(rb);
998 
999 #if defined(VIRTIO_MEM_HAS_LEGACY_GUESTS)
1000     switch (vmem->unplugged_inaccessible) {
1001     case ON_OFF_AUTO_AUTO:
1002         if (virtio_mem_has_shared_zeropage(rb)) {
1003             vmem->unplugged_inaccessible = ON_OFF_AUTO_OFF;
1004         } else {
1005             vmem->unplugged_inaccessible = ON_OFF_AUTO_ON;
1006         }
1007         break;
1008     case ON_OFF_AUTO_OFF:
1009         if (!virtio_mem_has_shared_zeropage(rb)) {
1010             warn_report("'%s' property set to 'off' with a memdev that does"
1011                         " not support the shared zeropage.",
1012                         VIRTIO_MEM_UNPLUGGED_INACCESSIBLE_PROP);
1013         }
1014         break;
1015     default:
1016         break;
1017     }
1018 #else /* VIRTIO_MEM_HAS_LEGACY_GUESTS */
1019     vmem->unplugged_inaccessible = ON_OFF_AUTO_ON;
1020 #endif /* VIRTIO_MEM_HAS_LEGACY_GUESTS */
1021 
1022     if (vmem->dynamic_memslots &&
1023         vmem->unplugged_inaccessible != ON_OFF_AUTO_ON) {
1024         error_setg(errp, "'%s' property set to 'on' requires '%s' to be 'on'",
1025                    VIRTIO_MEM_DYNAMIC_MEMSLOTS_PROP,
1026                    VIRTIO_MEM_UNPLUGGED_INACCESSIBLE_PROP);
1027         return;
1028     }
1029 
1030     /*
1031      * If the block size wasn't configured by the user, use a sane default. This
1032      * allows using hugetlbfs backends of any page size without manual
1033      * intervention.
1034      */
1035     if (!vmem->block_size) {
1036         vmem->block_size = virtio_mem_default_block_size(rb);
1037     }
1038 
1039     if (vmem->block_size < page_size) {
1040         error_setg(errp, "'%s' property has to be at least the page size (0x%"
1041                    PRIx64 ")", VIRTIO_MEM_BLOCK_SIZE_PROP, page_size);
1042         return;
1043     } else if (vmem->block_size < virtio_mem_default_block_size(rb)) {
1044         warn_report("'%s' property is smaller than the default block size (%"
1045                     PRIx64 " MiB)", VIRTIO_MEM_BLOCK_SIZE_PROP,
1046                     virtio_mem_default_block_size(rb) / MiB);
1047     }
1048     if (!QEMU_IS_ALIGNED(vmem->requested_size, vmem->block_size)) {
1049         error_setg(errp, "'%s' property has to be a multiple of '%s' (0x%" PRIx64
1050                    ")", VIRTIO_MEM_REQUESTED_SIZE_PROP,
1051                    VIRTIO_MEM_BLOCK_SIZE_PROP, vmem->block_size);
1052         return;
1053     } else if (!QEMU_IS_ALIGNED(vmem->addr, vmem->block_size)) {
1054         error_setg(errp, "'%s' property has to be a multiple of '%s' (0x%" PRIx64
1055                    ")", VIRTIO_MEM_ADDR_PROP, VIRTIO_MEM_BLOCK_SIZE_PROP,
1056                    vmem->block_size);
1057         return;
1058     } else if (!QEMU_IS_ALIGNED(memory_region_size(&vmem->memdev->mr),
1059                                 vmem->block_size)) {
1060         error_setg(errp, "'%s' property memdev size has to be a multiple of"
1061                    " '%s' (0x%" PRIx64 ")", VIRTIO_MEM_MEMDEV_PROP,
1062                    VIRTIO_MEM_BLOCK_SIZE_PROP, vmem->block_size);
1063         return;
1064     }
1065 
1066     if (ram_block_coordinated_discard_require(true)) {
1067         error_setg(errp, "Discarding RAM is disabled");
1068         return;
1069     }
1070 
1071     /*
1072      * We don't know at this point whether shared RAM is migrated using
1073      * QEMU or migrated using the file content. "x-ignore-shared" will be
1074      * configured after realizing the device. So in case we have an
1075      * incoming migration, simply always skip the discard step.
1076      *
1077      * Otherwise, make sure that we start with a clean slate: either the
1078      * memory backend might get reused or the shared file might still have
1079      * memory allocated.
1080      */
1081     if (!runstate_check(RUN_STATE_INMIGRATE)) {
1082         ret = ram_block_discard_range(rb, 0, qemu_ram_get_used_length(rb));
1083         if (ret) {
1084             error_setg_errno(errp, -ret, "Unexpected error discarding RAM");
1085             ram_block_coordinated_discard_require(false);
1086             return;
1087         }
1088     }
1089 
1090     virtio_mem_resize_usable_region(vmem, vmem->requested_size, true);
1091 
1092     vmem->bitmap_size = memory_region_size(&vmem->memdev->mr) /
1093                         vmem->block_size;
1094     vmem->bitmap = bitmap_new(vmem->bitmap_size);
1095 
1096     virtio_init(vdev, VIRTIO_ID_MEM, sizeof(struct virtio_mem_config));
1097     vmem->vq = virtio_add_queue(vdev, 128, virtio_mem_handle_request);
1098 
1099     /*
1100      * With "dynamic-memslots=off" (old behavior) we always map the whole
1101      * RAM memory region directly.
1102      */
1103     if (vmem->dynamic_memslots) {
1104         if (!vmem->mr) {
1105             virtio_mem_prepare_mr(vmem);
1106         }
1107         if (vmem->nb_memslots <= 1) {
1108             vmem->nb_memslots = 1;
1109             vmem->memslot_size = memory_region_size(&vmem->memdev->mr);
1110         }
1111         if (!vmem->memslots) {
1112             virtio_mem_prepare_memslots(vmem);
1113         }
1114     } else {
1115         assert(!vmem->mr && !vmem->nb_memslots && !vmem->memslots);
1116     }
1117 
1118     host_memory_backend_set_mapped(vmem->memdev, true);
1119     vmstate_register_ram(&vmem->memdev->mr, DEVICE(vmem));
1120     if (vmem->early_migration) {
1121         vmstate_register_any(VMSTATE_IF(vmem),
1122                              &vmstate_virtio_mem_device_early, vmem);
1123     }
1124     qemu_register_resettable(OBJECT(vmem));
1125 
1126     /*
1127      * Set ourselves as RamDiscardManager before the plug handler maps the
1128      * memory region and exposes it via an address space.
1129      */
1130     memory_region_set_ram_discard_manager(&vmem->memdev->mr,
1131                                           RAM_DISCARD_MANAGER(vmem));
1132 }
1133 
1134 static void virtio_mem_device_unrealize(DeviceState *dev)
1135 {
1136     VirtIODevice *vdev = VIRTIO_DEVICE(dev);
1137     VirtIOMEM *vmem = VIRTIO_MEM(dev);
1138 
1139     /*
1140      * The unplug handler unmapped the memory region; it cannot be
1141      * found via an address space anymore. Unset ourselves.
1142      */
1143     memory_region_set_ram_discard_manager(&vmem->memdev->mr, NULL);
1144     qemu_unregister_resettable(OBJECT(vmem));
1145     if (vmem->early_migration) {
1146         vmstate_unregister(VMSTATE_IF(vmem), &vmstate_virtio_mem_device_early,
1147                            vmem);
1148     }
1149     vmstate_unregister_ram(&vmem->memdev->mr, DEVICE(vmem));
1150     host_memory_backend_set_mapped(vmem->memdev, false);
1151     virtio_del_queue(vdev, 0);
1152     virtio_cleanup(vdev);
1153     g_free(vmem->bitmap);
1154     ram_block_coordinated_discard_require(false);
1155 }
1156 
1157 static int virtio_mem_discard_range_cb(VirtIOMEM *vmem, void *arg,
1158                                        uint64_t offset, uint64_t size)
1159 {
1160     RAMBlock *rb = vmem->memdev->mr.ram_block;
1161 
1162     return ram_block_discard_range(rb, offset, size) ? -EINVAL : 0;
1163 }
1164 
1165 static int virtio_mem_restore_unplugged(VirtIOMEM *vmem)
1166 {
1167     /* Make sure all memory is really discarded after migration. */
1168     return virtio_mem_for_each_unplugged_range(vmem, NULL,
1169                                                virtio_mem_discard_range_cb);
1170 }
1171 
1172 static int virtio_mem_activate_memslot_range_cb(VirtIOMEM *vmem, void *arg,
1173                                                 uint64_t offset, uint64_t size)
1174 {
1175     virtio_mem_activate_memslots_to_plug(vmem, offset, size);
1176     return 0;
1177 }
1178 
1179 static int virtio_mem_post_load_bitmap(VirtIOMEM *vmem)
1180 {
1181     RamDiscardListener *rdl;
1182     int ret;
1183 
1184     /*
1185      * We restored the bitmap and updated the requested size; activate all
1186      * memslots (so listeners register) before notifying about plugged blocks.
1187      */
1188     if (vmem->dynamic_memslots) {
1189         /*
1190          * We don't expect any active memslots at this point to deactivate: no
1191          * memory was plugged on the migration destination.
1192          */
1193         virtio_mem_for_each_plugged_range(vmem, NULL,
1194                                           virtio_mem_activate_memslot_range_cb);
1195     }
1196 
1197     /*
1198      * We started out with all memory discarded and our memory region is mapped
1199      * into an address space. Replay, now that we updated the bitmap.
1200      */
1201     QLIST_FOREACH(rdl, &vmem->rdl_list, next) {
1202         ret = virtio_mem_for_each_plugged_section(vmem, rdl->section, rdl,
1203                                                  virtio_mem_notify_populate_cb);
1204         if (ret) {
1205             return ret;
1206         }
1207     }
1208     return 0;
1209 }
1210 
1211 static int virtio_mem_post_load(void *opaque, int version_id)
1212 {
1213     VirtIOMEM *vmem = VIRTIO_MEM(opaque);
1214     int ret;
1215 
1216     if (!vmem->early_migration) {
1217         ret = virtio_mem_post_load_bitmap(vmem);
1218         if (ret) {
1219             return ret;
1220         }
1221     }
1222 
1223     /*
1224      * If shared RAM is migrated using the file content and not using QEMU,
1225      * don't mess with preallocation and postcopy.
1226      */
1227     if (migrate_ram_is_ignored(vmem->memdev->mr.ram_block)) {
1228         return 0;
1229     }
1230 
1231     if (vmem->prealloc && !vmem->early_migration) {
1232         warn_report("Proper preallocation with migration requires a newer QEMU machine");
1233     }
1234 
1235     if (migration_in_incoming_postcopy()) {
1236         return 0;
1237     }
1238 
1239     return virtio_mem_restore_unplugged(vmem);
1240 }
1241 
1242 static int virtio_mem_prealloc_range_cb(VirtIOMEM *vmem, void *arg,
1243                                         uint64_t offset, uint64_t size)
1244 {
1245     void *area = memory_region_get_ram_ptr(&vmem->memdev->mr) + offset;
1246     int fd = memory_region_get_fd(&vmem->memdev->mr);
1247     Error *local_err = NULL;
1248 
1249     if (!qemu_prealloc_mem(fd, area, size, 1, NULL, false, &local_err)) {
1250         error_report_err(local_err);
1251         return -ENOMEM;
1252     }
1253     return 0;
1254 }
1255 
1256 static int virtio_mem_post_load_early(void *opaque, int version_id)
1257 {
1258     VirtIOMEM *vmem = VIRTIO_MEM(opaque);
1259     RAMBlock *rb = vmem->memdev->mr.ram_block;
1260     int ret;
1261 
1262     if (!vmem->prealloc) {
1263         goto post_load_bitmap;
1264     }
1265 
1266     /*
1267      * If shared RAM is migrated using the file content and not using QEMU,
1268      * don't mess with preallocation and postcopy.
1269      */
1270     if (migrate_ram_is_ignored(rb)) {
1271         goto post_load_bitmap;
1272     }
1273 
1274     /*
1275      * We restored the bitmap and verified that the basic properties
1276      * match on source and destination, so we can go ahead and preallocate
1277      * memory for all plugged memory blocks, before actual RAM migration starts
1278      * touching this memory.
1279      */
1280     ret = virtio_mem_for_each_plugged_range(vmem, NULL,
1281                                             virtio_mem_prealloc_range_cb);
1282     if (ret) {
1283         return ret;
1284     }
1285 
1286     /*
1287      * This is tricky: postcopy wants to start with a clean slate. On
1288      * POSTCOPY_INCOMING_ADVISE, postcopy code discards all (ordinarily
1289      * preallocated) RAM such that postcopy will work as expected later.
1290      *
1291      * However, we run after POSTCOPY_INCOMING_ADVISE -- but before actual
1292      * RAM migration. So let's discard all memory again. This looks like an
1293      * expensive NOP, but actually serves a purpose: we made sure that we
1294      * were able to allocate all required backend memory once. We cannot
1295      * guarantee that the backend memory we will free will remain free
1296      * until we need it during postcopy, but at least we can catch the
1297      * obvious setup issues this way.
1298      */
1299     if (migration_incoming_postcopy_advised()) {
1300         if (ram_block_discard_range(rb, 0, qemu_ram_get_used_length(rb))) {
1301             return -EBUSY;
1302         }
1303     }
1304 
1305 post_load_bitmap:
1306     /* Finally, update any other state to be consistent with the new bitmap. */
1307     return virtio_mem_post_load_bitmap(vmem);
1308 }
1309 
1310 typedef struct VirtIOMEMMigSanityChecks {
1311     VirtIOMEM *parent;
1312     uint64_t addr;
1313     uint64_t region_size;
1314     uint64_t block_size;
1315     uint32_t node;
1316 } VirtIOMEMMigSanityChecks;
1317 
1318 static int virtio_mem_mig_sanity_checks_pre_save(void *opaque)
1319 {
1320     VirtIOMEMMigSanityChecks *tmp = opaque;
1321     VirtIOMEM *vmem = tmp->parent;
1322 
1323     tmp->addr = vmem->addr;
1324     tmp->region_size = memory_region_size(&vmem->memdev->mr);
1325     tmp->block_size = vmem->block_size;
1326     tmp->node = vmem->node;
1327     return 0;
1328 }
1329 
1330 static int virtio_mem_mig_sanity_checks_post_load(void *opaque, int version_id)
1331 {
1332     VirtIOMEMMigSanityChecks *tmp = opaque;
1333     VirtIOMEM *vmem = tmp->parent;
1334     const uint64_t new_region_size = memory_region_size(&vmem->memdev->mr);
1335 
1336     if (tmp->addr != vmem->addr) {
1337         error_report("Property '%s' changed from 0x%" PRIx64 " to 0x%" PRIx64,
1338                      VIRTIO_MEM_ADDR_PROP, tmp->addr, vmem->addr);
1339         return -EINVAL;
1340     }
1341     /*
1342      * Note: Preparation for resizable memory regions. The maximum size
1343      * of the memory region must not change during migration.
1344      */
1345     if (tmp->region_size != new_region_size) {
1346         error_report("Property '%s' size changed from 0x%" PRIx64 " to 0x%"
1347                      PRIx64, VIRTIO_MEM_MEMDEV_PROP, tmp->region_size,
1348                      new_region_size);
1349         return -EINVAL;
1350     }
1351     if (tmp->block_size != vmem->block_size) {
1352         error_report("Property '%s' changed from 0x%" PRIx64 " to 0x%" PRIx64,
1353                      VIRTIO_MEM_BLOCK_SIZE_PROP, tmp->block_size,
1354                      vmem->block_size);
1355         return -EINVAL;
1356     }
1357     if (tmp->node != vmem->node) {
1358         error_report("Property '%s' changed from %" PRIu32 " to %" PRIu32,
1359                      VIRTIO_MEM_NODE_PROP, tmp->node, vmem->node);
1360         return -EINVAL;
1361     }
1362     return 0;
1363 }
1364 
1365 static const VMStateDescription vmstate_virtio_mem_sanity_checks = {
1366     .name = "virtio-mem-device/sanity-checks",
1367     .pre_save = virtio_mem_mig_sanity_checks_pre_save,
1368     .post_load = virtio_mem_mig_sanity_checks_post_load,
1369     .fields = (const VMStateField[]) {
1370         VMSTATE_UINT64(addr, VirtIOMEMMigSanityChecks),
1371         VMSTATE_UINT64(region_size, VirtIOMEMMigSanityChecks),
1372         VMSTATE_UINT64(block_size, VirtIOMEMMigSanityChecks),
1373         VMSTATE_UINT32(node, VirtIOMEMMigSanityChecks),
1374         VMSTATE_END_OF_LIST(),
1375     },
1376 };
1377 
1378 static bool virtio_mem_vmstate_field_exists(void *opaque, int version_id)
1379 {
1380     const VirtIOMEM *vmem = VIRTIO_MEM(opaque);
1381 
1382     /* With early migration, these fields were already migrated. */
1383     return !vmem->early_migration;
1384 }
1385 
1386 static const VMStateDescription vmstate_virtio_mem_device = {
1387     .name = "virtio-mem-device",
1388     .minimum_version_id = 1,
1389     .version_id = 1,
1390     .priority = MIG_PRI_VIRTIO_MEM,
1391     .post_load = virtio_mem_post_load,
1392     .fields = (const VMStateField[]) {
1393         VMSTATE_WITH_TMP_TEST(VirtIOMEM, virtio_mem_vmstate_field_exists,
1394                               VirtIOMEMMigSanityChecks,
1395                               vmstate_virtio_mem_sanity_checks),
1396         VMSTATE_UINT64(usable_region_size, VirtIOMEM),
1397         VMSTATE_UINT64_TEST(size, VirtIOMEM, virtio_mem_vmstate_field_exists),
1398         VMSTATE_UINT64(requested_size, VirtIOMEM),
1399         VMSTATE_BITMAP_TEST(bitmap, VirtIOMEM, virtio_mem_vmstate_field_exists,
1400                             0, bitmap_size),
1401         VMSTATE_END_OF_LIST()
1402     },
1403 };
1404 
1405 /*
1406  * Transfer properties that are immutable while migration is active early,
1407  * such that we have this information around before migrating any RAM
1408  * content.
1409  *
1410  * Note that virtio_mem_is_busy() makes sure these properties can no longer
1411  * change on the migration source until migration has completed.
1412  *
1413  * With QEMU compat machines, we transmit these properties later, via
1414  * vmstate_virtio_mem_device instead -- see virtio_mem_vmstate_field_exists().
1415  */
1416 static const VMStateDescription vmstate_virtio_mem_device_early = {
1417     .name = "virtio-mem-device-early",
1418     .minimum_version_id = 1,
1419     .version_id = 1,
1420     .early_setup = true,
1421     .post_load = virtio_mem_post_load_early,
1422     .fields = (const VMStateField[]) {
1423         VMSTATE_WITH_TMP(VirtIOMEM, VirtIOMEMMigSanityChecks,
1424                          vmstate_virtio_mem_sanity_checks),
1425         VMSTATE_UINT64(size, VirtIOMEM),
1426         VMSTATE_BITMAP(bitmap, VirtIOMEM, 0, bitmap_size),
1427         VMSTATE_END_OF_LIST()
1428     },
1429 };
1430 
1431 static const VMStateDescription vmstate_virtio_mem = {
1432     .name = "virtio-mem",
1433     .minimum_version_id = 1,
1434     .version_id = 1,
1435     .fields = (const VMStateField[]) {
1436         VMSTATE_VIRTIO_DEVICE,
1437         VMSTATE_END_OF_LIST()
1438     },
1439 };
1440 
1441 static void virtio_mem_fill_device_info(const VirtIOMEM *vmem,
1442                                         VirtioMEMDeviceInfo *vi)
1443 {
1444     vi->memaddr = vmem->addr;
1445     vi->node = vmem->node;
1446     vi->requested_size = vmem->requested_size;
1447     vi->size = vmem->size;
1448     vi->max_size = memory_region_size(&vmem->memdev->mr);
1449     vi->block_size = vmem->block_size;
1450     vi->memdev = object_get_canonical_path(OBJECT(vmem->memdev));
1451 }
1452 
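/*
 * With "dynamic-memslots=on", we expose our own container memory region, into
 * which individual memslots get mapped on demand; otherwise, we expose the
 * memory backend's memory region directly.
 */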
1453 static MemoryRegion *virtio_mem_get_memory_region(VirtIOMEM *vmem, Error **errp)
1454 {
1455     if (!vmem->memdev) {
1456         error_setg(errp, "'%s' property must be set", VIRTIO_MEM_MEMDEV_PROP);
1457         return NULL;
1458     } else if (vmem->dynamic_memslots) {
1459         if (!vmem->mr) {
1460             virtio_mem_prepare_mr(vmem);
1461         }
1462         return vmem->mr;
1463     }
1464 
1465     return &vmem->memdev->mr;
1466 }
1467 
1468 static void virtio_mem_decide_memslots(VirtIOMEM *vmem, unsigned int limit)
1469 {
1470     uint64_t region_size, memslot_size, min_memslot_size;
1471     unsigned int memslots;
1472     RAMBlock *rb;
1473 
1474     if (!vmem->dynamic_memslots) {
1475         return;
1476     }
1477 
1478     /* We're called exactly once, before realizing the device. */
1479     assert(!vmem->nb_memslots);
1480 
1481     /* If realizing the device will fail, just assume a single memslot. */
1482     if (limit <= 1 || !vmem->memdev || !vmem->memdev->mr.ram_block) {
1483         vmem->nb_memslots = 1;
1484         return;
1485     }
1486 
1487     rb = vmem->memdev->mr.ram_block;
1488     region_size = memory_region_size(&vmem->memdev->mr);
1489 
1490     /*
1491      * Determine the default block size now, so we can derive the minimum memslot
1492      * size; the minimum memslot size should be at least the device block size.
1493      */
1494     if (!vmem->block_size) {
1495         vmem->block_size = virtio_mem_default_block_size(rb);
1496     }
1497     /* If realizing the device will fail, just assume a single memslot. */
1498     if (vmem->block_size < qemu_ram_pagesize(rb) ||
1499         !QEMU_IS_ALIGNED(region_size, vmem->block_size)) {
1500         vmem->nb_memslots = 1;
1501         return;
1502     }
1503 
1504     /*
1505      * All memslots except the last one have a reasonable minimum size, and
1506      * all memslot sizes are aligned to the device block size.
1507      */
1508     memslot_size = QEMU_ALIGN_UP(region_size / limit, vmem->block_size);
1509     min_memslot_size = MAX(vmem->block_size, VIRTIO_MEM_MIN_MEMSLOT_SIZE);
1510     memslot_size = MAX(memslot_size, min_memslot_size);
1511 
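    /*
     * Worked example: a 32 GiB region with limit=16 and a 2 MiB block size
     * yields memslot_size = MAX(2 GiB, 1 GiB) = 2 GiB and, below, 16 memslots.
     */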
1512     memslots = QEMU_ALIGN_UP(region_size, memslot_size) / memslot_size;
1513     if (memslots != 1) {
1514         vmem->memslot_size = memslot_size;
1515     }
1516     vmem->nb_memslots = memslots;
1517 }
1518 
1519 static unsigned int virtio_mem_get_memslots(VirtIOMEM *vmem)
1520 {
1521     if (!vmem->dynamic_memslots) {
1522         /* Exactly one static RAM memory region. */
1523         return 1;
1524     }
1525 
1526     /* We're only called after having been instructed to make a decision. */
1527     g_assert(vmem->nb_memslots);
1528     return vmem->nb_memslots;
1529 }
1530 
1531 static void virtio_mem_add_size_change_notifier(VirtIOMEM *vmem,
1532                                                 Notifier *notifier)
1533 {
1534     notifier_list_add(&vmem->size_change_notifiers, notifier);
1535 }
1536 
1537 static void virtio_mem_remove_size_change_notifier(VirtIOMEM *vmem,
1538                                                    Notifier *notifier)
1539 {
1540     notifier_remove(notifier);
1541 }
1542 
1543 static void virtio_mem_get_size(Object *obj, Visitor *v, const char *name,
1544                                 void *opaque, Error **errp)
1545 {
1546     const VirtIOMEM *vmem = VIRTIO_MEM(obj);
1547     uint64_t value = vmem->size;
1548 
1549     visit_type_size(v, name, &value, errp);
1550 }
1551 
1552 static void virtio_mem_get_requested_size(Object *obj, Visitor *v,
1553                                           const char *name, void *opaque,
1554                                           Error **errp)
1555 {
1556     const VirtIOMEM *vmem = VIRTIO_MEM(obj);
1557     uint64_t value = vmem->requested_size;
1558 
1559     visit_type_size(v, name, &value, errp);
1560 }
1561 
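/*
 * Setter for the writable "requested-size" property; at runtime, this is
 * typically driven via the QOM interface (e.g., "qom-set vmem0 requested-size
 * 2G" on the HMP monitor, assuming a device with id=vmem0).
 */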
1562 static void virtio_mem_set_requested_size(Object *obj, Visitor *v,
1563                                           const char *name, void *opaque,
1564                                           Error **errp)
1565 {
1566     VirtIOMEM *vmem = VIRTIO_MEM(obj);
1567     uint64_t value;
1568 
1569     if (!visit_type_size(v, name, &value, errp)) {
1570         return;
1571     }
1572 
1573     /*
1574      * The block size and memory backend are not fixed until the device is
1575      * realized. realize() will verify these properties then.
1576      */
1577     if (DEVICE(obj)->realized) {
1578         if (!QEMU_IS_ALIGNED(value, vmem->block_size)) {
1579             error_setg(errp, "'%s' has to be a multiple of '%s' (0x%" PRIx64
1580                        ")", name, VIRTIO_MEM_BLOCK_SIZE_PROP,
1581                        vmem->block_size);
1582             return;
1583         } else if (value > memory_region_size(&vmem->memdev->mr)) {
1584             error_setg(errp, "'%s' cannot exceed the memory backend size "
1585                        "(0x%" PRIx64 ")", name,
1586                        memory_region_size(&vmem->memdev->mr));
1587             return;
1588         }
1589 
1590         if (value != vmem->requested_size) {
1591             virtio_mem_resize_usable_region(vmem, value, false);
1592             vmem->requested_size = value;
1593         }
1594         /*
1595          * Trigger a config update so the guest gets notified. We trigger
1596          * even if the size didn't change (especially helpful for debugging).
1597          */
1598         virtio_notify_config(VIRTIO_DEVICE(vmem));
1599     } else {
1600         vmem->requested_size = value;
1601     }
1602 }
1603 
1604 static void virtio_mem_get_block_size(Object *obj, Visitor *v, const char *name,
1605                                       void *opaque, Error **errp)
1606 {
1607     const VirtIOMEM *vmem = VIRTIO_MEM(obj);
1608     uint64_t value = vmem->block_size;
1609 
1610     /*
1611      * If not configured by the user (and we're not realized yet), use the
1612      * default block size we would use with the current memory backend.
1613      */
1614     if (!value) {
1615         if (vmem->memdev && memory_region_is_ram(&vmem->memdev->mr)) {
1616             value = virtio_mem_default_block_size(vmem->memdev->mr.ram_block);
1617         } else {
1618             value = virtio_mem_thp_size();
1619         }
1620     }
1621 
1622     visit_type_size(v, name, &value, errp);
1623 }
1624 
1625 static void virtio_mem_set_block_size(Object *obj, Visitor *v, const char *name,
1626                                       void *opaque, Error **errp)
1627 {
1628     VirtIOMEM *vmem = VIRTIO_MEM(obj);
1629     uint64_t value;
1630 
1631     if (DEVICE(obj)->realized) {
1632         error_setg(errp, "'%s' cannot be changed", name);
1633         return;
1634     }
1635 
1636     if (!visit_type_size(v, name, &value, errp)) {
1637         return;
1638     }
1639 
1640     if (value < VIRTIO_MEM_MIN_BLOCK_SIZE) {
1641         error_setg(errp, "'%s' property has to be at least 0x%" PRIx32, name,
1642                    VIRTIO_MEM_MIN_BLOCK_SIZE);
1643         return;
1644     } else if (!is_power_of_2(value)) {
1645         error_setg(errp, "'%s' property has to be a power of two", name);
1646         return;
1647     }
1648     vmem->block_size = value;
1649 }
1650 
1651 static void virtio_mem_instance_init(Object *obj)
1652 {
1653     VirtIOMEM *vmem = VIRTIO_MEM(obj);
1654 
1655     notifier_list_init(&vmem->size_change_notifiers);
1656     QLIST_INIT(&vmem->rdl_list);
1657 
1658     object_property_add(obj, VIRTIO_MEM_SIZE_PROP, "size", virtio_mem_get_size,
1659                         NULL, NULL, NULL);
1660     object_property_add(obj, VIRTIO_MEM_REQUESTED_SIZE_PROP, "size",
1661                         virtio_mem_get_requested_size,
1662                         virtio_mem_set_requested_size, NULL, NULL);
1663     object_property_add(obj, VIRTIO_MEM_BLOCK_SIZE_PROP, "size",
1664                         virtio_mem_get_block_size, virtio_mem_set_block_size,
1665                         NULL, NULL);
1666 }
1667 
1668 static void virtio_mem_instance_finalize(Object *obj)
1669 {
1670     VirtIOMEM *vmem = VIRTIO_MEM(obj);
1671 
1672     /*
1673      * Note: the core already dropped the references on all memory regions
1674      * (the device is passed as the owner to memory_region_init_*()) and finalized
1675      * these objects. We can simply free the memory.
1676      */
1677     g_free(vmem->memslots);
1678     vmem->memslots = NULL;
1679     g_free(vmem->mr);
1680     vmem->mr = NULL;
1681 }
1682 
1683 static Property virtio_mem_properties[] = {
1684     DEFINE_PROP_UINT64(VIRTIO_MEM_ADDR_PROP, VirtIOMEM, addr, 0),
1685     DEFINE_PROP_UINT32(VIRTIO_MEM_NODE_PROP, VirtIOMEM, node, 0),
1686     DEFINE_PROP_BOOL(VIRTIO_MEM_PREALLOC_PROP, VirtIOMEM, prealloc, false),
1687     DEFINE_PROP_LINK(VIRTIO_MEM_MEMDEV_PROP, VirtIOMEM, memdev,
1688                      TYPE_MEMORY_BACKEND, HostMemoryBackend *),
1689 #if defined(VIRTIO_MEM_HAS_LEGACY_GUESTS)
1690     DEFINE_PROP_ON_OFF_AUTO(VIRTIO_MEM_UNPLUGGED_INACCESSIBLE_PROP, VirtIOMEM,
1691                             unplugged_inaccessible, ON_OFF_AUTO_ON),
1692 #endif
1693     DEFINE_PROP_BOOL(VIRTIO_MEM_EARLY_MIGRATION_PROP, VirtIOMEM,
1694                      early_migration, true),
1695     DEFINE_PROP_BOOL(VIRTIO_MEM_DYNAMIC_MEMSLOTS_PROP, VirtIOMEM,
1696                      dynamic_memslots, false),
1697     DEFINE_PROP_END_OF_LIST(),
1698 };
1699 
1700 static uint64_t virtio_mem_rdm_get_min_granularity(const RamDiscardManager *rdm,
1701                                                    const MemoryRegion *mr)
1702 {
1703     const VirtIOMEM *vmem = VIRTIO_MEM(rdm);
1704 
1705     g_assert(mr == &vmem->memdev->mr);
1706     return vmem->block_size;
1707 }
1708 
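/*
 * A section only counts as populated if every device block it touches (after
 * expanding the range to device-block granularity) is currently plugged.
 */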
1709 static bool virtio_mem_rdm_is_populated(const RamDiscardManager *rdm,
1710                                         const MemoryRegionSection *s)
1711 {
1712     const VirtIOMEM *vmem = VIRTIO_MEM(rdm);
1713     uint64_t start_gpa = vmem->addr + s->offset_within_region;
1714     uint64_t end_gpa = start_gpa + int128_get64(s->size);
1715 
1716     g_assert(s->mr == &vmem->memdev->mr);
1717 
1718     start_gpa = QEMU_ALIGN_DOWN(start_gpa, vmem->block_size);
1719     end_gpa = QEMU_ALIGN_UP(end_gpa, vmem->block_size);
1720 
1721     if (!virtio_mem_valid_range(vmem, start_gpa, end_gpa - start_gpa)) {
1722         return false;
1723     }
1724 
1725     return virtio_mem_is_range_plugged(vmem, start_gpa, end_gpa - start_gpa);
1726 }
1727 
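/*
 * Helper state to funnel the type-specific replay callbacks (ReplayRamPopulate
 * or ReplayRamDiscard) through the generic per-section iterators below.
 */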
1728 struct VirtIOMEMReplayData {
1729     void *fn;
1730     void *opaque;
1731 };
1732 
1733 static int virtio_mem_rdm_replay_populated_cb(MemoryRegionSection *s, void *arg)
1734 {
1735     struct VirtIOMEMReplayData *data = arg;
1736 
1737     return ((ReplayRamPopulate)data->fn)(s, data->opaque);
1738 }
1739 
1740 static int virtio_mem_rdm_replay_populated(const RamDiscardManager *rdm,
1741                                            MemoryRegionSection *s,
1742                                            ReplayRamPopulate replay_fn,
1743                                            void *opaque)
1744 {
1745     const VirtIOMEM *vmem = VIRTIO_MEM(rdm);
1746     struct VirtIOMEMReplayData data = {
1747         .fn = replay_fn,
1748         .opaque = opaque,
1749     };
1750 
1751     g_assert(s->mr == &vmem->memdev->mr);
1752     return virtio_mem_for_each_plugged_section(vmem, s, &data,
1753                                             virtio_mem_rdm_replay_populated_cb);
1754 }
1755 
1756 static int virtio_mem_rdm_replay_discarded_cb(MemoryRegionSection *s,
1757                                               void *arg)
1758 {
1759     struct VirtIOMEMReplayData *data = arg;
1760 
1761     ((ReplayRamDiscard)data->fn)(s, data->opaque);
1762     return 0;
1763 }
1764 
1765 static void virtio_mem_rdm_replay_discarded(const RamDiscardManager *rdm,
1766                                             MemoryRegionSection *s,
1767                                             ReplayRamDiscard replay_fn,
1768                                             void *opaque)
1769 {
1770     const VirtIOMEM *vmem = VIRTIO_MEM(rdm);
1771     struct VirtIOMEMReplayData data = {
1772         .fn = replay_fn,
1773         .opaque = opaque,
1774     };
1775 
1776     g_assert(s->mr == &vmem->memdev->mr);
1777     virtio_mem_for_each_unplugged_section(vmem, s, &data,
1778                                           virtio_mem_rdm_replay_discarded_cb);
1779 }
1780 
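/*
 * Registering a listener immediately replays all currently plugged sections,
 * so the listener's view starts out consistent with the device's bitmap.
 */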
1781 static void virtio_mem_rdm_register_listener(RamDiscardManager *rdm,
1782                                              RamDiscardListener *rdl,
1783                                              MemoryRegionSection *s)
1784 {
1785     VirtIOMEM *vmem = VIRTIO_MEM(rdm);
1786     int ret;
1787 
1788     g_assert(s->mr == &vmem->memdev->mr);
1789     rdl->section = memory_region_section_new_copy(s);
1790 
1791     QLIST_INSERT_HEAD(&vmem->rdl_list, rdl, next);
1792     ret = virtio_mem_for_each_plugged_section(vmem, rdl->section, rdl,
1793                                               virtio_mem_notify_populate_cb);
1794     if (ret) {
1795         error_report("%s: Replaying plugged ranges failed: %s", __func__,
1796                      strerror(-ret));
1797     }
1798 }
1799 
1800 static void virtio_mem_rdm_unregister_listener(RamDiscardManager *rdm,
1801                                                RamDiscardListener *rdl)
1802 {
1803     VirtIOMEM *vmem = VIRTIO_MEM(rdm);
1804 
1805     g_assert(rdl->section->mr == &vmem->memdev->mr);
1806     if (vmem->size) {
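        /*
         * Listeners that tolerate discard notifications for already-discarded
         * ranges get a single notification covering the whole section;
         * otherwise, only notify the actually plugged sections.
         */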
1807         if (rdl->double_discard_supported) {
1808             rdl->notify_discard(rdl, rdl->section);
1809         } else {
1810             virtio_mem_for_each_plugged_section(vmem, rdl->section, rdl,
1811                                                 virtio_mem_notify_discard_cb);
1812         }
1813     }
1814 
1815     memory_region_section_free_copy(rdl->section);
1816     rdl->section = NULL;
1817     QLIST_REMOVE(rdl, next);
1818 }
1819 
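/*
 * Check whether the virtio-mem device itself may be hot-unplugged: this is
 * only permitted once none of its memory is plugged or requested anymore.
 */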
1820 static void virtio_mem_unplug_request_check(VirtIOMEM *vmem, Error **errp)
1821 {
1822     if (vmem->unplugged_inaccessible == ON_OFF_AUTO_OFF) {
1823         /*
1824          * We could allow it with a usable region size of 0, but let's just
1825          * not care about that legacy setting.
1826          */
1827         error_setg(errp, "virtio-mem device cannot get unplugged while"
1828                    " '" VIRTIO_MEM_UNPLUGGED_INACCESSIBLE_PROP "' != 'on'");
1829         return;
1830     }
1831 
1832     if (vmem->size) {
1833         error_setg(errp, "virtio-mem device cannot get unplugged while some"
1834                    " of its memory is still plugged");
1835         return;
1836     }
1837     if (vmem->requested_size) {
1838         error_setg(errp, "virtio-mem device cannot get unplugged while"
1839                    " '" VIRTIO_MEM_REQUESTED_SIZE_PROP "' != '0'");
1840         return;
1841     }
1842 }
1843 
1844 static ResettableState *virtio_mem_get_reset_state(Object *obj)
1845 {
1846     VirtIOMEM *vmem = VIRTIO_MEM(obj);
1847     return &vmem->reset_state;
1848 }
1849 
1850 static void virtio_mem_system_reset_hold(Object *obj, ResetType type)
1851 {
1852     VirtIOMEM *vmem = VIRTIO_MEM(obj);
1853 
1854     /*
1855      * When waking up from standby/suspend-to-ram, do not unplug any memory.
1856      */
1857     if (type == RESET_TYPE_WAKEUP) {
1858         return;
1859     }
1860 
1861     /*
1862      * During usual resets, we will unplug all memory and shrink the usable
1863      * region size. This is, however, not possible in all scenarios. In that
1864      * case, the guest has to deal with it manually (VIRTIO_MEM_REQ_UNPLUG_ALL).
1865      */
1866     virtio_mem_unplug_all(vmem);
1867 }
1868 
1869 static void virtio_mem_class_init(ObjectClass *klass, void *data)
1870 {
1871     DeviceClass *dc = DEVICE_CLASS(klass);
1872     VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass);
1873     VirtIOMEMClass *vmc = VIRTIO_MEM_CLASS(klass);
1874     RamDiscardManagerClass *rdmc = RAM_DISCARD_MANAGER_CLASS(klass);
1875     ResettableClass *rc = RESETTABLE_CLASS(klass);
1876 
1877     device_class_set_props(dc, virtio_mem_properties);
1878     dc->vmsd = &vmstate_virtio_mem;
1879 
1880     set_bit(DEVICE_CATEGORY_MISC, dc->categories);
1881     vdc->realize = virtio_mem_device_realize;
1882     vdc->unrealize = virtio_mem_device_unrealize;
1883     vdc->get_config = virtio_mem_get_config;
1884     vdc->get_features = virtio_mem_get_features;
1885     vdc->validate_features = virtio_mem_validate_features;
1886     vdc->vmsd = &vmstate_virtio_mem_device;
1887 
1888     vmc->fill_device_info = virtio_mem_fill_device_info;
1889     vmc->get_memory_region = virtio_mem_get_memory_region;
1890     vmc->decide_memslots = virtio_mem_decide_memslots;
1891     vmc->get_memslots = virtio_mem_get_memslots;
1892     vmc->add_size_change_notifier = virtio_mem_add_size_change_notifier;
1893     vmc->remove_size_change_notifier = virtio_mem_remove_size_change_notifier;
1894     vmc->unplug_request_check = virtio_mem_unplug_request_check;
1895 
1896     rdmc->get_min_granularity = virtio_mem_rdm_get_min_granularity;
1897     rdmc->is_populated = virtio_mem_rdm_is_populated;
1898     rdmc->replay_populated = virtio_mem_rdm_replay_populated;
1899     rdmc->replay_discarded = virtio_mem_rdm_replay_discarded;
1900     rdmc->register_listener = virtio_mem_rdm_register_listener;
1901     rdmc->unregister_listener = virtio_mem_rdm_unregister_listener;
1902 
1903     rc->get_state = virtio_mem_get_reset_state;
1904     rc->phases.hold = virtio_mem_system_reset_hold;
1905 }
1906 
1907 static const TypeInfo virtio_mem_info = {
1908     .name = TYPE_VIRTIO_MEM,
1909     .parent = TYPE_VIRTIO_DEVICE,
1910     .instance_size = sizeof(VirtIOMEM),
1911     .instance_init = virtio_mem_instance_init,
1912     .instance_finalize = virtio_mem_instance_finalize,
1913     .class_init = virtio_mem_class_init,
1914     .class_size = sizeof(VirtIOMEMClass),
1915     .interfaces = (InterfaceInfo[]) {
1916         { TYPE_RAM_DISCARD_MANAGER },
1917         { }
1918     },
1919 };
1920 
1921 static void virtio_register_types(void)
1922 {
1923     type_register_static(&virtio_mem_info);
1924 }
1925 
1926 type_init(virtio_register_types)
1927