xref: /openbmc/qemu/hw/virtio/virtio.c (revision c2b38b27)
1 /*
2  * Virtio Support
3  *
4  * Copyright IBM, Corp. 2007
5  *
6  * Authors:
7  *  Anthony Liguori   <aliguori@us.ibm.com>
8  *
9  * This work is licensed under the terms of the GNU GPL, version 2.  See
10  * the COPYING file in the top-level directory.
11  *
12  */
13 
14 #include "qemu/osdep.h"
15 #include "qapi/error.h"
16 #include "qemu-common.h"
17 #include "cpu.h"
18 #include "trace.h"
19 #include "exec/address-spaces.h"
20 #include "qemu/error-report.h"
21 #include "hw/virtio/virtio.h"
22 #include "qemu/atomic.h"
23 #include "hw/virtio/virtio-bus.h"
24 #include "migration/migration.h"
25 #include "hw/virtio/virtio-access.h"
26 #include "sysemu/dma.h"
27 
28 /*
29  * The alignment to use between consumer and producer parts of vring.
30  * x86 pagesize again. This is the default, used by transports like PCI
31  * which don't provide a means for the guest to tell the host the alignment.
32  */
33 #define VIRTIO_PCI_VRING_ALIGN         4096
34 
35 typedef struct VRingDesc
36 {
37     uint64_t addr;
38     uint32_t len;
39     uint16_t flags;
40     uint16_t next;
41 } VRingDesc;
42 
43 typedef struct VRingAvail
44 {
45     uint16_t flags;
46     uint16_t idx;
47     uint16_t ring[0];
48 } VRingAvail;
49 
50 typedef struct VRingUsedElem
51 {
52     uint32_t id;
53     uint32_t len;
54 } VRingUsedElem;
55 
56 typedef struct VRingUsed
57 {
58     uint16_t flags;
59     uint16_t idx;
60     VRingUsedElem ring[0];
61 } VRingUsed;
62 
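/*
 * These structures mirror the guest-visible split virtqueue layout defined
 * by the virtio specification.  They are not dereferenced in guest memory
 * directly; instead they are read and written through the byte-swapping
 * vring_* helpers below, using the per-ring MemoryRegionCache entries.
 */
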
63 typedef struct VRingMemoryRegionCaches {
64     struct rcu_head rcu;
65     MemoryRegionCache desc;
66     MemoryRegionCache avail;
67     MemoryRegionCache used;
68 } VRingMemoryRegionCaches;
69 
70 typedef struct VRing
71 {
72     unsigned int num;
73     unsigned int num_default;
74     unsigned int align;
75     hwaddr desc;
76     hwaddr avail;
77     hwaddr used;
78     VRingMemoryRegionCaches *caches;
79 } VRing;
80 
81 struct VirtQueue
82 {
83     VRing vring;
84 
85     /* Next head to pop */
86     uint16_t last_avail_idx;
87 
88     /* Last avail_idx read from VQ. */
89     uint16_t shadow_avail_idx;
90 
91     uint16_t used_idx;
92 
93     /* Last used index value we have signalled on */
94     uint16_t signalled_used;
95 
96     /* Whether the signalled_used value above is valid */
97     bool signalled_used_valid;
98 
99     /* Notification enabled? */
100     bool notification;
101 
102     uint16_t queue_index;
103 
104     unsigned int inuse;
105 
106     uint16_t vector;
107     VirtIOHandleOutput handle_output;
108     VirtIOHandleAIOOutput handle_aio_output;
109     VirtIODevice *vdev;
110     EventNotifier guest_notifier;
111     EventNotifier host_notifier;
112     QLIST_ENTRY(VirtQueue) node;
113 };
114 
115 static void virtio_free_region_cache(VRingMemoryRegionCaches *caches)
116 {
117     if (!caches) {
118         return;
119     }
120 
121     address_space_cache_destroy(&caches->desc);
122     address_space_cache_destroy(&caches->avail);
123     address_space_cache_destroy(&caches->used);
124     g_free(caches);
125 }
126 
127 static void virtio_init_region_cache(VirtIODevice *vdev, int n)
128 {
129     VirtQueue *vq = &vdev->vq[n];
130     VRingMemoryRegionCaches *old = vq->vring.caches;
131     VRingMemoryRegionCaches *new;
132     hwaddr addr, size;
133     int event_size;
134 
135     event_size = virtio_vdev_has_feature(vq->vdev, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0;
136 
137     addr = vq->vring.desc;
138     if (!addr) {
139         return;
140     }
141     new = g_new0(VRingMemoryRegionCaches, 1);
142     size = virtio_queue_get_desc_size(vdev, n);
143     address_space_cache_init(&new->desc, vdev->dma_as,
144                              addr, size, false);
145 
146     size = virtio_queue_get_used_size(vdev, n) + event_size;
147     address_space_cache_init(&new->used, vdev->dma_as,
148                              vq->vring.used, size, true);
149 
150     size = virtio_queue_get_avail_size(vdev, n) + event_size;
151     address_space_cache_init(&new->avail, vdev->dma_as,
152                              vq->vring.avail, size, false);
153 
154     atomic_rcu_set(&vq->vring.caches, new);
155     if (old) {
156         call_rcu(old, virtio_free_region_cache, rcu);
157     }
158 }
159 
160 /* virt queue functions */
161 void virtio_queue_update_rings(VirtIODevice *vdev, int n)
162 {
163     VRing *vring = &vdev->vq[n].vring;
164 
165     if (!vring->desc) {
166         /* not yet setup -> nothing to do */
167         return;
168     }
169     vring->avail = vring->desc + vring->num * sizeof(VRingDesc);
170     vring->used = vring_align(vring->avail +
171                               offsetof(VRingAvail, ring[vring->num]),
172                               vring->align);
173     virtio_init_region_cache(vdev, n);
174 }
175 
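/*
 * Worked example of the layout computed above (a sketch): for a queue with
 * num = 256 and the default 4096 byte alignment, sizeof(VRingDesc) is 16,
 * so
 *
 *     avail = desc + 256 * 16                  = desc + 4096
 *     used  = align(avail + 4 + 256 * 2, 4096) = desc + 8192
 *
 * i.e. the descriptor table fills the first page, the avail ring starts on
 * the second page, and the used ring is rounded up to the third page.
 */
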
176 /* Called within rcu_read_lock().  */
177 static void vring_desc_read(VirtIODevice *vdev, VRingDesc *desc,
178                             MemoryRegionCache *cache, int i)
179 {
180     address_space_read_cached(cache, i * sizeof(VRingDesc),
181                               desc, sizeof(VRingDesc));
182     virtio_tswap64s(vdev, &desc->addr);
183     virtio_tswap32s(vdev, &desc->len);
184     virtio_tswap16s(vdev, &desc->flags);
185     virtio_tswap16s(vdev, &desc->next);
186 }
187 
188 /* Called within rcu_read_lock().  */
189 static inline uint16_t vring_avail_flags(VirtQueue *vq)
190 {
191     VRingMemoryRegionCaches *caches = atomic_rcu_read(&vq->vring.caches);
192     hwaddr pa = offsetof(VRingAvail, flags);
193     return virtio_lduw_phys_cached(vq->vdev, &caches->avail, pa);
194 }
195 
196 /* Called within rcu_read_lock().  */
197 static inline uint16_t vring_avail_idx(VirtQueue *vq)
198 {
199     VRingMemoryRegionCaches *caches = atomic_rcu_read(&vq->vring.caches);
200     hwaddr pa = offsetof(VRingAvail, idx);
201     vq->shadow_avail_idx = virtio_lduw_phys_cached(vq->vdev, &caches->avail, pa);
202     return vq->shadow_avail_idx;
203 }
204 
205 /* Called within rcu_read_lock().  */
206 static inline uint16_t vring_avail_ring(VirtQueue *vq, int i)
207 {
208     VRingMemoryRegionCaches *caches = atomic_rcu_read(&vq->vring.caches);
209     hwaddr pa = offsetof(VRingAvail, ring[i]);
210     return virtio_lduw_phys_cached(vq->vdev, &caches->avail, pa);
211 }
212 
213 /* Called within rcu_read_lock().  */
214 static inline uint16_t vring_get_used_event(VirtQueue *vq)
215 {
216     return vring_avail_ring(vq, vq->vring.num);
217 }
218 
219 /* Called within rcu_read_lock().  */
220 static inline void vring_used_write(VirtQueue *vq, VRingUsedElem *uelem,
221                                     int i)
222 {
223     VRingMemoryRegionCaches *caches = atomic_rcu_read(&vq->vring.caches);
224     hwaddr pa = offsetof(VRingUsed, ring[i]);
225     virtio_tswap32s(vq->vdev, &uelem->id);
226     virtio_tswap32s(vq->vdev, &uelem->len);
227     address_space_write_cached(&caches->used, pa, uelem, sizeof(VRingUsedElem));
228     address_space_cache_invalidate(&caches->used, pa, sizeof(VRingUsedElem));
229 }
230 
231 /* Called within rcu_read_lock().  */
232 static uint16_t vring_used_idx(VirtQueue *vq)
233 {
234     VRingMemoryRegionCaches *caches = atomic_rcu_read(&vq->vring.caches);
235     hwaddr pa = offsetof(VRingUsed, idx);
236     return virtio_lduw_phys_cached(vq->vdev, &caches->used, pa);
237 }
238 
239 /* Called within rcu_read_lock().  */
240 static inline void vring_used_idx_set(VirtQueue *vq, uint16_t val)
241 {
242     VRingMemoryRegionCaches *caches = atomic_rcu_read(&vq->vring.caches);
243     hwaddr pa = offsetof(VRingUsed, idx);
244     virtio_stw_phys_cached(vq->vdev, &caches->used, pa, val);
245     address_space_cache_invalidate(&caches->used, pa, sizeof(val));
246     vq->used_idx = val;
247 }
248 
249 /* Called within rcu_read_lock().  */
250 static inline void vring_used_flags_set_bit(VirtQueue *vq, int mask)
251 {
252     VRingMemoryRegionCaches *caches = atomic_rcu_read(&vq->vring.caches);
253     VirtIODevice *vdev = vq->vdev;
254     hwaddr pa = offsetof(VRingUsed, flags);
255     uint16_t flags = virtio_lduw_phys_cached(vq->vdev, &caches->used, pa);
256 
257     virtio_stw_phys_cached(vdev, &caches->used, pa, flags | mask);
258     address_space_cache_invalidate(&caches->used, pa, sizeof(flags));
259 }
260 
261 /* Called within rcu_read_lock().  */
262 static inline void vring_used_flags_unset_bit(VirtQueue *vq, int mask)
263 {
264     VRingMemoryRegionCaches *caches = atomic_rcu_read(&vq->vring.caches);
265     VirtIODevice *vdev = vq->vdev;
266     hwaddr pa = offsetof(VRingUsed, flags);
267     uint16_t flags = virtio_lduw_phys_cached(vq->vdev, &caches->used, pa);
268 
269     virtio_stw_phys_cached(vdev, &caches->used, pa, flags & ~mask);
270     address_space_cache_invalidate(&caches->used, pa, sizeof(flags));
271 }
272 
273 /* Called within rcu_read_lock().  */
274 static inline void vring_set_avail_event(VirtQueue *vq, uint16_t val)
275 {
276     VRingMemoryRegionCaches *caches;
277     hwaddr pa;
278     if (!vq->notification) {
279         return;
280     }
281 
282     caches = atomic_rcu_read(&vq->vring.caches);
283     pa = offsetof(VRingUsed, ring[vq->vring.num]);
284     virtio_stw_phys_cached(vq->vdev, &caches->used, pa, val);
285 }
286 
287 void virtio_queue_set_notification(VirtQueue *vq, int enable)
288 {
289     vq->notification = enable;
290 
291     rcu_read_lock();
292     if (virtio_vdev_has_feature(vq->vdev, VIRTIO_RING_F_EVENT_IDX)) {
293         vring_set_avail_event(vq, vring_avail_idx(vq));
294     } else if (enable) {
295         vring_used_flags_unset_bit(vq, VRING_USED_F_NO_NOTIFY);
296     } else {
297         vring_used_flags_set_bit(vq, VRING_USED_F_NO_NOTIFY);
298     }
299     if (enable) {
300         /* Expose avail event/used flags before caller checks the avail idx. */
301         smp_mb();
302     }
303     rcu_read_unlock();
304 }
305 
306 int virtio_queue_ready(VirtQueue *vq)
307 {
308     return vq->vring.avail != 0;
309 }
310 
311 /* Fetch avail_idx from VQ memory only when we really need to know if
312  * guest has added some buffers.
313  * Called within rcu_read_lock().  */
314 static int virtio_queue_empty_rcu(VirtQueue *vq)
315 {
316     if (vq->shadow_avail_idx != vq->last_avail_idx) {
317         return 0;
318     }
319 
320     return vring_avail_idx(vq) == vq->last_avail_idx;
321 }
322 
323 int virtio_queue_empty(VirtQueue *vq)
324 {
325     bool empty;
326 
327     if (vq->shadow_avail_idx != vq->last_avail_idx) {
328         return 0;
329     }
330 
331     rcu_read_lock();
332     empty = vring_avail_idx(vq) == vq->last_avail_idx;
333     rcu_read_unlock();
334     return empty;
335 }
336 
337 static void virtqueue_unmap_sg(VirtQueue *vq, const VirtQueueElement *elem,
338                                unsigned int len)
339 {
340     AddressSpace *dma_as = vq->vdev->dma_as;
341     unsigned int offset;
342     int i;
343 
344     offset = 0;
345     for (i = 0; i < elem->in_num; i++) {
346         size_t size = MIN(len - offset, elem->in_sg[i].iov_len);
347 
348         dma_memory_unmap(dma_as, elem->in_sg[i].iov_base,
349                          elem->in_sg[i].iov_len,
350                          DMA_DIRECTION_FROM_DEVICE, size);
351 
352         offset += size;
353     }
354 
355     for (i = 0; i < elem->out_num; i++)
356         dma_memory_unmap(dma_as, elem->out_sg[i].iov_base,
357                          elem->out_sg[i].iov_len,
358                          DMA_DIRECTION_TO_DEVICE,
359                          elem->out_sg[i].iov_len);
360 }
361 
362 /* virtqueue_detach_element:
363  * @vq: The #VirtQueue
364  * @elem: The #VirtQueueElement
365  * @len: number of bytes written
366  *
367  * Detach the element from the virtqueue.  This function is suitable for device
368  * reset or other situations where a #VirtQueueElement is simply freed and will
369  * not be pushed or discarded.
370  */
371 void virtqueue_detach_element(VirtQueue *vq, const VirtQueueElement *elem,
372                               unsigned int len)
373 {
374     vq->inuse--;
375     virtqueue_unmap_sg(vq, elem, len);
376 }
377 
378 /* virtqueue_unpop:
379  * @vq: The #VirtQueue
380  * @elem: The #VirtQueueElement
381  * @len: number of bytes written
382  *
383  * Pretend the most recent element wasn't popped from the virtqueue.  The next
384  * call to virtqueue_pop() will refetch the element.
385  */
386 void virtqueue_unpop(VirtQueue *vq, const VirtQueueElement *elem,
387                      unsigned int len)
388 {
389     vq->last_avail_idx--;
390     virtqueue_detach_element(vq, elem, len);
391 }
392 
393 /* virtqueue_rewind:
394  * @vq: The #VirtQueue
395  * @num: Number of elements to push back
396  *
397  * Pretend that elements weren't popped from the virtqueue.  The next
398  * virtqueue_pop() will refetch the oldest element.
399  *
400  * Use virtqueue_unpop() instead if you have a VirtQueueElement.
401  *
402  * Returns: true on success, false if @num is greater than the number of in use
403  * elements.
404  */
405 bool virtqueue_rewind(VirtQueue *vq, unsigned int num)
406 {
407     if (num > vq->inuse) {
408         return false;
409     }
410     vq->last_avail_idx -= num;
411     vq->inuse -= num;
412     return true;
413 }
414 
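/*
 * Example (sketch): a device that popped elements it can no longer process
 * and has already freed the VirtQueueElements can hand them back with
 * virtqueue_rewind(vq, n); the same heads will be seen again by the next
 * virtqueue_pop().  When the VirtQueueElement is still available,
 * virtqueue_unpop() above is the more precise interface.
 */
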
415 /* Called within rcu_read_lock().  */
416 void virtqueue_fill(VirtQueue *vq, const VirtQueueElement *elem,
417                     unsigned int len, unsigned int idx)
418 {
419     VRingUsedElem uelem;
420 
421     trace_virtqueue_fill(vq, elem, len, idx);
422 
423     virtqueue_unmap_sg(vq, elem, len);
424 
425     if (unlikely(vq->vdev->broken)) {
426         return;
427     }
428 
429     idx = (idx + vq->used_idx) % vq->vring.num;
430 
431     uelem.id = elem->index;
432     uelem.len = len;
433     vring_used_write(vq, &uelem, idx);
434 }
435 
436 /* Called within rcu_read_lock().  */
437 void virtqueue_flush(VirtQueue *vq, unsigned int count)
438 {
439     uint16_t old, new;
440 
441     if (unlikely(vq->vdev->broken)) {
442         vq->inuse -= count;
443         return;
444     }
445 
446     /* Make sure buffer is written before we update index. */
447     smp_wmb();
448     trace_virtqueue_flush(vq, count);
449     old = vq->used_idx;
450     new = old + count;
451     vring_used_idx_set(vq, new);
452     vq->inuse -= count;
453     if (unlikely((int16_t)(new - vq->signalled_used) < (uint16_t)(new - old)))
454         vq->signalled_used_valid = false;
455 }
456 
457 void virtqueue_push(VirtQueue *vq, const VirtQueueElement *elem,
458                     unsigned int len)
459 {
460     rcu_read_lock();
461     virtqueue_fill(vq, elem, len, 0);
462     virtqueue_flush(vq, 1);
463     rcu_read_unlock();
464 }
465 
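/*
 * Usage sketch (hypothetical device code): when completing several requests
 * at once, a device can batch the used ring update by calling
 * virtqueue_fill() once per element and virtqueue_flush() once, instead of
 * one virtqueue_push() per element:
 *
 *     rcu_read_lock();
 *     for (i = 0; i < n; i++) {
 *         virtqueue_fill(vq, elems[i], lens[i], i);
 *     }
 *     virtqueue_flush(vq, n);
 *     rcu_read_unlock();
 *     virtio_notify(vdev, vq);
 *
 * elems[], lens[] and n are made-up names for the completed elements, their
 * written byte counts, and their number.
 */
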
466 /* Called within rcu_read_lock().  */
467 static int virtqueue_num_heads(VirtQueue *vq, unsigned int idx)
468 {
469     uint16_t num_heads = vring_avail_idx(vq) - idx;
470 
471     /* Check the guest isn't doing very strange things with descriptor numbers. */
472     if (num_heads > vq->vring.num) {
473         virtio_error(vq->vdev, "Guest moved used index from %u to %u",
474                      idx, vq->shadow_avail_idx);
475         return -EINVAL;
476     }
477     /* On success, callers read a descriptor at vq->last_avail_idx.
478      * Make sure descriptor read does not bypass avail index read. */
479     if (num_heads) {
480         smp_rmb();
481     }
482 
483     return num_heads;
484 }
485 
486 /* Called within rcu_read_lock().  */
487 static bool virtqueue_get_head(VirtQueue *vq, unsigned int idx,
488                                unsigned int *head)
489 {
490     /* Grab the next descriptor number the guest is advertising, and
491      * increment the index we've seen. */
492     *head = vring_avail_ring(vq, idx % vq->vring.num);
493 
494     /* If the descriptor number is out of range, that's a fatal mistake. */
495     if (*head >= vq->vring.num) {
496         virtio_error(vq->vdev, "Guest says index %u is available", *head);
497         return false;
498     }
499 
500     return true;
501 }
502 
503 enum {
504     VIRTQUEUE_READ_DESC_ERROR = -1,
505     VIRTQUEUE_READ_DESC_DONE = 0,   /* end of chain */
506     VIRTQUEUE_READ_DESC_MORE = 1,   /* more buffers in chain */
507 };
508 
509 static int virtqueue_read_next_desc(VirtIODevice *vdev, VRingDesc *desc,
510                                     MemoryRegionCache *desc_cache, unsigned int max,
511                                     unsigned int *next)
512 {
513     /* If this descriptor says it doesn't chain, we're done. */
514     if (!(desc->flags & VRING_DESC_F_NEXT)) {
515         return VIRTQUEUE_READ_DESC_DONE;
516     }
517 
518     /* Check the chain isn't leading us off the end of the descriptor table. */
519     *next = desc->next;
520     /* Make sure compiler knows to grab that: we don't want it changing! */
521     smp_wmb();
522 
523     if (*next >= max) {
524         virtio_error(vdev, "Desc next is %u", *next);
525         return VIRTQUEUE_READ_DESC_ERROR;
526     }
527 
528     vring_desc_read(vdev, desc, desc_cache, *next);
529     return VIRTQUEUE_READ_DESC_MORE;
530 }
531 
532 void virtqueue_get_avail_bytes(VirtQueue *vq, unsigned int *in_bytes,
533                                unsigned int *out_bytes,
534                                unsigned max_in_bytes, unsigned max_out_bytes)
535 {
536     VirtIODevice *vdev = vq->vdev;
537     unsigned int max, idx;
538     unsigned int total_bufs, in_total, out_total;
539     VRingMemoryRegionCaches *caches;
540     MemoryRegionCache indirect_desc_cache = MEMORY_REGION_CACHE_INVALID;
541     int64_t len = 0;
542     int rc;
543 
544     rcu_read_lock();
545     idx = vq->last_avail_idx;
546     total_bufs = in_total = out_total = 0;
547 
548     max = vq->vring.num;
549     caches = atomic_rcu_read(&vq->vring.caches);
550     if (caches->desc.len < max * sizeof(VRingDesc)) {
551         virtio_error(vdev, "Cannot map descriptor ring");
552         goto err;
553     }
554 
555     while ((rc = virtqueue_num_heads(vq, idx)) > 0) {
556         MemoryRegionCache *desc_cache = &caches->desc;
557         unsigned int num_bufs;
558         VRingDesc desc;
559         unsigned int i;
560 
561         num_bufs = total_bufs;
562 
563         if (!virtqueue_get_head(vq, idx++, &i)) {
564             goto err;
565         }
566 
567         vring_desc_read(vdev, &desc, desc_cache, i);
568 
569         if (desc.flags & VRING_DESC_F_INDIRECT) {
570             if (desc.len % sizeof(VRingDesc)) {
571                 virtio_error(vdev, "Invalid size for indirect buffer table");
572                 goto err;
573             }
574 
575             /* If we've got too many, that implies a descriptor loop. */
576             if (num_bufs >= max) {
577                 virtio_error(vdev, "Looped descriptor");
578                 goto err;
579             }
580 
581             /* loop over the indirect descriptor table */
582             len = address_space_cache_init(&indirect_desc_cache,
583                                            vdev->dma_as,
584                                            desc.addr, desc.len, false);
585             desc_cache = &indirect_desc_cache;
586             if (len < desc.len) {
587                 virtio_error(vdev, "Cannot map indirect buffer");
588                 goto err;
589             }
590 
591             max = desc.len / sizeof(VRingDesc);
592             num_bufs = i = 0;
593             vring_desc_read(vdev, &desc, desc_cache, i);
594         }
595 
596         do {
597             /* If we've got too many, that implies a descriptor loop. */
598             if (++num_bufs > max) {
599                 virtio_error(vdev, "Looped descriptor");
600                 goto err;
601             }
602 
603             if (desc.flags & VRING_DESC_F_WRITE) {
604                 in_total += desc.len;
605             } else {
606                 out_total += desc.len;
607             }
608             if (in_total >= max_in_bytes && out_total >= max_out_bytes) {
609                 goto done;
610             }
611 
612             rc = virtqueue_read_next_desc(vdev, &desc, desc_cache, max, &i);
613         } while (rc == VIRTQUEUE_READ_DESC_MORE);
614 
615         if (rc == VIRTQUEUE_READ_DESC_ERROR) {
616             goto err;
617         }
618 
619         if (desc_cache == &indirect_desc_cache) {
620             address_space_cache_destroy(&indirect_desc_cache);
621             total_bufs++;
622         } else {
623             total_bufs = num_bufs;
624         }
625     }
626 
627     if (rc < 0) {
628         goto err;
629     }
630 
631 done:
632     address_space_cache_destroy(&indirect_desc_cache);
633     if (in_bytes) {
634         *in_bytes = in_total;
635     }
636     if (out_bytes) {
637         *out_bytes = out_total;
638     }
639     rcu_read_unlock();
640     return;
641 
642 err:
643     in_total = out_total = 0;
644     goto done;
645 }
646 
647 int virtqueue_avail_bytes(VirtQueue *vq, unsigned int in_bytes,
648                           unsigned int out_bytes)
649 {
650     unsigned int in_total, out_total;
651 
652     virtqueue_get_avail_bytes(vq, &in_total, &out_total, in_bytes, out_bytes);
653     return in_bytes <= in_total && out_bytes <= out_total;
654 }
655 
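/*
 * Example (sketch): a device can check whether the guest has queued enough
 * buffer space before popping anything; a receive path might do
 *
 *     if (!virtqueue_avail_bytes(vq, needed_len, 0)) {
 *         return;
 *     }
 *
 * and bail out until more device-writable buffers arrive.  needed_len is a
 * hypothetical number of bytes the device wants to write into guest memory.
 */
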
656 static bool virtqueue_map_desc(VirtIODevice *vdev, unsigned int *p_num_sg,
657                                hwaddr *addr, struct iovec *iov,
658                                unsigned int max_num_sg, bool is_write,
659                                hwaddr pa, size_t sz)
660 {
661     bool ok = false;
662     unsigned num_sg = *p_num_sg;
663     assert(num_sg <= max_num_sg);
664 
665     if (!sz) {
666         virtio_error(vdev, "virtio: zero sized buffers are not allowed");
667         goto out;
668     }
669 
670     while (sz) {
671         hwaddr len = sz;
672 
673         if (num_sg == max_num_sg) {
674             virtio_error(vdev, "virtio: too many write descriptors in "
675                                "indirect table");
676             goto out;
677         }
678 
679         iov[num_sg].iov_base = dma_memory_map(vdev->dma_as, pa, &len,
680                                               is_write ?
681                                               DMA_DIRECTION_FROM_DEVICE :
682                                               DMA_DIRECTION_TO_DEVICE);
683         if (!iov[num_sg].iov_base) {
684             virtio_error(vdev, "virtio: bogus descriptor or out of resources");
685             goto out;
686         }
687 
688         iov[num_sg].iov_len = len;
689         addr[num_sg] = pa;
690 
691         sz -= len;
692         pa += len;
693         num_sg++;
694     }
695     ok = true;
696 
697 out:
698     *p_num_sg = num_sg;
699     return ok;
700 }
701 
702 /* Only used by error code paths before we have a VirtQueueElement (therefore
703  * virtqueue_unmap_sg() can't be used).  Assumes buffers weren't written to
704  * yet.
705  */
706 static void virtqueue_undo_map_desc(unsigned int out_num, unsigned int in_num,
707                                     struct iovec *iov)
708 {
709     unsigned int i;
710 
711     for (i = 0; i < out_num + in_num; i++) {
712         int is_write = i >= out_num;
713 
714         cpu_physical_memory_unmap(iov->iov_base, iov->iov_len, is_write, 0);
715         iov++;
716     }
717 }
718 
719 static void virtqueue_map_iovec(VirtIODevice *vdev, struct iovec *sg,
720                                 hwaddr *addr, unsigned int *num_sg,
721                                 int is_write)
722 {
723     unsigned int i;
724     hwaddr len;
725 
726     for (i = 0; i < *num_sg; i++) {
727         len = sg[i].iov_len;
728         sg[i].iov_base = dma_memory_map(vdev->dma_as,
729                                         addr[i], &len, is_write ?
730                                         DMA_DIRECTION_FROM_DEVICE :
731                                         DMA_DIRECTION_TO_DEVICE);
732         if (!sg[i].iov_base) {
733             error_report("virtio: error trying to map MMIO memory");
734             exit(1);
735         }
736         if (len != sg[i].iov_len) {
737             error_report("virtio: unexpected memory split");
738             exit(1);
739         }
740     }
741 }
742 
743 void virtqueue_map(VirtIODevice *vdev, VirtQueueElement *elem)
744 {
745     virtqueue_map_iovec(vdev, elem->in_sg, elem->in_addr, &elem->in_num, 1);
746     virtqueue_map_iovec(vdev, elem->out_sg, elem->out_addr, &elem->out_num, 0);
747 }
748 
749 static void *virtqueue_alloc_element(size_t sz, unsigned out_num, unsigned in_num)
750 {
751     VirtQueueElement *elem;
752     size_t in_addr_ofs = QEMU_ALIGN_UP(sz, __alignof__(elem->in_addr[0]));
753     size_t out_addr_ofs = in_addr_ofs + in_num * sizeof(elem->in_addr[0]);
754     size_t out_addr_end = out_addr_ofs + out_num * sizeof(elem->out_addr[0]);
755     size_t in_sg_ofs = QEMU_ALIGN_UP(out_addr_end, __alignof__(elem->in_sg[0]));
756     size_t out_sg_ofs = in_sg_ofs + in_num * sizeof(elem->in_sg[0]);
757     size_t out_sg_end = out_sg_ofs + out_num * sizeof(elem->out_sg[0]);
758 
759     assert(sz >= sizeof(VirtQueueElement));
760     elem = g_malloc(out_sg_end);
761     elem->out_num = out_num;
762     elem->in_num = in_num;
763     elem->in_addr = (void *)elem + in_addr_ofs;
764     elem->out_addr = (void *)elem + out_addr_ofs;
765     elem->in_sg = (void *)elem + in_sg_ofs;
766     elem->out_sg = (void *)elem + out_sg_ofs;
767     return elem;
768 }
769 
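/*
 * The returned element lives in a single allocation: the caller-specified
 * header of size sz (at least a VirtQueueElement) is followed by the
 * in_addr[] and out_addr[] arrays and then the in_sg[] and out_sg[] iovec
 * arrays, with the alignment padding computed above.  This is why the whole
 * element can later be released with a single g_free().
 */
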
770 void *virtqueue_pop(VirtQueue *vq, size_t sz)
771 {
772     unsigned int i, head, max;
773     VRingMemoryRegionCaches *caches;
774     MemoryRegionCache indirect_desc_cache = MEMORY_REGION_CACHE_INVALID;
775     MemoryRegionCache *desc_cache;
776     int64_t len;
777     VirtIODevice *vdev = vq->vdev;
778     VirtQueueElement *elem = NULL;
779     unsigned out_num, in_num;
780     hwaddr addr[VIRTQUEUE_MAX_SIZE];
781     struct iovec iov[VIRTQUEUE_MAX_SIZE];
782     VRingDesc desc;
783     int rc;
784 
785     if (unlikely(vdev->broken)) {
786         return NULL;
787     }
788     rcu_read_lock();
789     if (virtio_queue_empty_rcu(vq)) {
790         goto done;
791     }
792     /* Needed after virtio_queue_empty(), see comment in
793      * virtqueue_num_heads(). */
794     smp_rmb();
795 
796     /* When we start there are no input or output buffers yet. */
797     out_num = in_num = 0;
798 
799     max = vq->vring.num;
800 
801     if (vq->inuse >= vq->vring.num) {
802         virtio_error(vdev, "Virtqueue size exceeded");
803         goto done;
804     }
805 
806     if (!virtqueue_get_head(vq, vq->last_avail_idx++, &head)) {
807         goto done;
808     }
809 
810     if (virtio_vdev_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX)) {
811         vring_set_avail_event(vq, vq->last_avail_idx);
812     }
813 
814     i = head;
815 
816     caches = atomic_rcu_read(&vq->vring.caches);
817     if (caches->desc.len < max * sizeof(VRingDesc)) {
818         virtio_error(vdev, "Cannot map descriptor ring");
819         goto done;
820     }
821 
822     desc_cache = &caches->desc;
823     vring_desc_read(vdev, &desc, desc_cache, i);
824     if (desc.flags & VRING_DESC_F_INDIRECT) {
825         if (desc.len % sizeof(VRingDesc)) {
826             virtio_error(vdev, "Invalid size for indirect buffer table");
827             goto done;
828         }
829 
830         /* loop over the indirect descriptor table */
831         len = address_space_cache_init(&indirect_desc_cache, vdev->dma_as,
832                                        desc.addr, desc.len, false);
833         desc_cache = &indirect_desc_cache;
834         if (len < desc.len) {
835             virtio_error(vdev, "Cannot map indirect buffer");
836             goto done;
837         }
838 
839         max = desc.len / sizeof(VRingDesc);
840         i = 0;
841         vring_desc_read(vdev, &desc, desc_cache, i);
842     }
843 
844     /* Collect all the descriptors */
845     do {
846         bool map_ok;
847 
848         if (desc.flags & VRING_DESC_F_WRITE) {
849             map_ok = virtqueue_map_desc(vdev, &in_num, addr + out_num,
850                                         iov + out_num,
851                                         VIRTQUEUE_MAX_SIZE - out_num, true,
852                                         desc.addr, desc.len);
853         } else {
854             if (in_num) {
855                 virtio_error(vdev, "Incorrect order for descriptors");
856                 goto err_undo_map;
857             }
858             map_ok = virtqueue_map_desc(vdev, &out_num, addr, iov,
859                                         VIRTQUEUE_MAX_SIZE, false,
860                                         desc.addr, desc.len);
861         }
862         if (!map_ok) {
863             goto err_undo_map;
864         }
865 
866         /* If we've got too many, that implies a descriptor loop. */
867         if ((in_num + out_num) > max) {
868             virtio_error(vdev, "Looped descriptor");
869             goto err_undo_map;
870         }
871 
872         rc = virtqueue_read_next_desc(vdev, &desc, desc_cache, max, &i);
873     } while (rc == VIRTQUEUE_READ_DESC_MORE);
874 
875     if (rc == VIRTQUEUE_READ_DESC_ERROR) {
876         goto err_undo_map;
877     }
878 
879     /* Now copy what we have collected and mapped */
880     elem = virtqueue_alloc_element(sz, out_num, in_num);
881     elem->index = head;
882     for (i = 0; i < out_num; i++) {
883         elem->out_addr[i] = addr[i];
884         elem->out_sg[i] = iov[i];
885     }
886     for (i = 0; i < in_num; i++) {
887         elem->in_addr[i] = addr[out_num + i];
888         elem->in_sg[i] = iov[out_num + i];
889     }
890 
891     vq->inuse++;
892 
893     trace_virtqueue_pop(vq, elem, elem->in_num, elem->out_num);
894 done:
895     address_space_cache_destroy(&indirect_desc_cache);
896     rcu_read_unlock();
897 
898     return elem;
899 
900 err_undo_map:
901     virtqueue_undo_map_desc(out_num, in_num, iov);
902     goto done;
903 }
904 
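/*
 * Typical usage sketch (hypothetical device handler, not part of this
 * file): the VirtIOHandleOutput callback registered via virtio_add_queue()
 * usually drains the queue with virtqueue_pop() and completes each element
 * with virtqueue_push():
 *
 *     static void my_handle_output(VirtIODevice *vdev, VirtQueue *vq)
 *     {
 *         VirtQueueElement *elem;
 *
 *         while ((elem = virtqueue_pop(vq, sizeof(VirtQueueElement)))) {
 *             size_t done = my_process(elem->out_sg, elem->out_num,
 *                                      elem->in_sg, elem->in_num);
 *             virtqueue_push(vq, elem, done);
 *             g_free(elem);
 *         }
 *         virtio_notify(vdev, vq);
 *     }
 *
 * my_handle_output() and my_process() are made-up names; real devices pass
 * a larger sz to embed their own request state in the element.
 */
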
905 /* virtqueue_drop_all:
906  * @vq: The #VirtQueue
907  * Drops all queued buffers and reports them to the guest as
908  * completed. Useful when buffers cannot be processed but must be
909  * returned to the guest.
910  */
911 unsigned int virtqueue_drop_all(VirtQueue *vq)
912 {
913     unsigned int dropped = 0;
914     VirtQueueElement elem = {};
915     VirtIODevice *vdev = vq->vdev;
916     bool fEventIdx = virtio_vdev_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX);
917 
918     if (unlikely(vdev->broken)) {
919         return 0;
920     }
921 
922     while (!virtio_queue_empty(vq) && vq->inuse < vq->vring.num) {
923         /* This works similarly to virtqueue_pop() but does not map buffers
924          * and does not allocate any memory. */
925         smp_rmb();
926         if (!virtqueue_get_head(vq, vq->last_avail_idx, &elem.index)) {
927             break;
928         }
929         vq->inuse++;
930         vq->last_avail_idx++;
931         if (fEventIdx) {
932             vring_set_avail_event(vq, vq->last_avail_idx);
933         }
934         /* immediately push the element, nothing to unmap
935          * as both in_num and out_num are set to 0 */
936         virtqueue_push(vq, &elem, 0);
937         dropped++;
938     }
939 
940     return dropped;
941 }
942 
943 /* Reading and writing a structure directly to QEMUFile is *awful*, but
944  * it is what QEMU has always done by mistake.  We can change it sooner
945  * or later by bumping the version number of the affected vm states.
946  * In the meantime, since the in-memory layout of VirtQueueElement
947  * has changed, we need to marshal to and from the layout that was
948  * used before the change.
949  */
950 typedef struct VirtQueueElementOld {
951     unsigned int index;
952     unsigned int out_num;
953     unsigned int in_num;
954     hwaddr in_addr[VIRTQUEUE_MAX_SIZE];
955     hwaddr out_addr[VIRTQUEUE_MAX_SIZE];
956     struct iovec in_sg[VIRTQUEUE_MAX_SIZE];
957     struct iovec out_sg[VIRTQUEUE_MAX_SIZE];
958 } VirtQueueElementOld;
959 
960 void *qemu_get_virtqueue_element(VirtIODevice *vdev, QEMUFile *f, size_t sz)
961 {
962     VirtQueueElement *elem;
963     VirtQueueElementOld data;
964     int i;
965 
966     qemu_get_buffer(f, (uint8_t *)&data, sizeof(VirtQueueElementOld));
967 
968     /* TODO: teach all callers that this can fail, and return failure instead
969      * of asserting here.
970      * When we do, we might be able to re-enable NDEBUG below.
971      */
972 #ifdef NDEBUG
973 #error building with NDEBUG is not supported
974 #endif
975     assert(ARRAY_SIZE(data.in_addr) >= data.in_num);
976     assert(ARRAY_SIZE(data.out_addr) >= data.out_num);
977 
978     elem = virtqueue_alloc_element(sz, data.out_num, data.in_num);
979     elem->index = data.index;
980 
981     for (i = 0; i < elem->in_num; i++) {
982         elem->in_addr[i] = data.in_addr[i];
983     }
984 
985     for (i = 0; i < elem->out_num; i++) {
986         elem->out_addr[i] = data.out_addr[i];
987     }
988 
989     for (i = 0; i < elem->in_num; i++) {
990         /* Base is overwritten by virtqueue_map.  */
991         elem->in_sg[i].iov_base = 0;
992         elem->in_sg[i].iov_len = data.in_sg[i].iov_len;
993     }
994 
995     for (i = 0; i < elem->out_num; i++) {
996         /* Base is overwritten by virtqueue_map.  */
997         elem->out_sg[i].iov_base = 0;
998         elem->out_sg[i].iov_len = data.out_sg[i].iov_len;
999     }
1000 
1001     virtqueue_map(vdev, elem);
1002     return elem;
1003 }
1004 
1005 void qemu_put_virtqueue_element(QEMUFile *f, VirtQueueElement *elem)
1006 {
1007     VirtQueueElementOld data;
1008     int i;
1009 
1010     memset(&data, 0, sizeof(data));
1011     data.index = elem->index;
1012     data.in_num = elem->in_num;
1013     data.out_num = elem->out_num;
1014 
1015     for (i = 0; i < elem->in_num; i++) {
1016         data.in_addr[i] = elem->in_addr[i];
1017     }
1018 
1019     for (i = 0; i < elem->out_num; i++) {
1020         data.out_addr[i] = elem->out_addr[i];
1021     }
1022 
1023     for (i = 0; i < elem->in_num; i++) {
1024         /* Base is overwritten by virtqueue_map when loading.  Do not
1025          * save it, as it would leak the QEMU address space layout.  */
1026         data.in_sg[i].iov_len = elem->in_sg[i].iov_len;
1027     }
1028 
1029     for (i = 0; i < elem->out_num; i++) {
1030         /* Do not save iov_base as above.  */
1031         data.out_sg[i].iov_len = elem->out_sg[i].iov_len;
1032     }
1033     qemu_put_buffer(f, (uint8_t *)&data, sizeof(VirtQueueElementOld));
1034 }
1035 
1036 /* virtio device */
1037 static void virtio_notify_vector(VirtIODevice *vdev, uint16_t vector)
1038 {
1039     BusState *qbus = qdev_get_parent_bus(DEVICE(vdev));
1040     VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
1041 
1042     if (unlikely(vdev->broken)) {
1043         return;
1044     }
1045 
1046     if (k->notify) {
1047         k->notify(qbus->parent, vector);
1048     }
1049 }
1050 
1051 void virtio_update_irq(VirtIODevice *vdev)
1052 {
1053     virtio_notify_vector(vdev, VIRTIO_NO_VECTOR);
1054 }
1055 
1056 static int virtio_validate_features(VirtIODevice *vdev)
1057 {
1058     VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev);
1059 
1060     if (virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM) &&
1061         !virtio_vdev_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM)) {
1062         return -EFAULT;
1063     }
1064 
1065     if (k->validate_features) {
1066         return k->validate_features(vdev);
1067     } else {
1068         return 0;
1069     }
1070 }
1071 
1072 int virtio_set_status(VirtIODevice *vdev, uint8_t val)
1073 {
1074     VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev);
1075     trace_virtio_set_status(vdev, val);
1076 
1077     if (virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) {
1078         if (!(vdev->status & VIRTIO_CONFIG_S_FEATURES_OK) &&
1079             val & VIRTIO_CONFIG_S_FEATURES_OK) {
1080             int ret = virtio_validate_features(vdev);
1081 
1082             if (ret) {
1083                 return ret;
1084             }
1085         }
1086     }
1087     if (k->set_status) {
1088         k->set_status(vdev, val);
1089     }
1090     vdev->status = val;
1091     return 0;
1092 }
1093 
1094 bool target_words_bigendian(void);
1095 static enum virtio_device_endian virtio_default_endian(void)
1096 {
1097     if (target_words_bigendian()) {
1098         return VIRTIO_DEVICE_ENDIAN_BIG;
1099     } else {
1100         return VIRTIO_DEVICE_ENDIAN_LITTLE;
1101     }
1102 }
1103 
1104 static enum virtio_device_endian virtio_current_cpu_endian(void)
1105 {
1106     CPUClass *cc = CPU_GET_CLASS(current_cpu);
1107 
1108     if (cc->virtio_is_big_endian(current_cpu)) {
1109         return VIRTIO_DEVICE_ENDIAN_BIG;
1110     } else {
1111         return VIRTIO_DEVICE_ENDIAN_LITTLE;
1112     }
1113 }
1114 
1115 void virtio_reset(void *opaque)
1116 {
1117     VirtIODevice *vdev = opaque;
1118     VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev);
1119     int i;
1120 
1121     virtio_set_status(vdev, 0);
1122     if (current_cpu) {
1123         /* Guest initiated reset */
1124         vdev->device_endian = virtio_current_cpu_endian();
1125     } else {
1126         /* System reset */
1127         vdev->device_endian = virtio_default_endian();
1128     }
1129 
1130     if (k->reset) {
1131         k->reset(vdev);
1132     }
1133 
1134     vdev->broken = false;
1135     vdev->guest_features = 0;
1136     vdev->queue_sel = 0;
1137     vdev->status = 0;
1138     atomic_set(&vdev->isr, 0);
1139     vdev->config_vector = VIRTIO_NO_VECTOR;
1140     virtio_notify_vector(vdev, vdev->config_vector);
1141 
1142     for(i = 0; i < VIRTIO_QUEUE_MAX; i++) {
1143         vdev->vq[i].vring.desc = 0;
1144         vdev->vq[i].vring.avail = 0;
1145         vdev->vq[i].vring.used = 0;
1146         vdev->vq[i].last_avail_idx = 0;
1147         vdev->vq[i].shadow_avail_idx = 0;
1148         vdev->vq[i].used_idx = 0;
1149         virtio_queue_set_vector(vdev, i, VIRTIO_NO_VECTOR);
1150         vdev->vq[i].signalled_used = 0;
1151         vdev->vq[i].signalled_used_valid = false;
1152         vdev->vq[i].notification = true;
1153         vdev->vq[i].vring.num = vdev->vq[i].vring.num_default;
1154         vdev->vq[i].inuse = 0;
1155     }
1156 }
1157 
1158 uint32_t virtio_config_readb(VirtIODevice *vdev, uint32_t addr)
1159 {
1160     VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev);
1161     uint8_t val;
1162 
1163     if (addr + sizeof(val) > vdev->config_len) {
1164         return (uint32_t)-1;
1165     }
1166 
1167     k->get_config(vdev, vdev->config);
1168 
1169     val = ldub_p(vdev->config + addr);
1170     return val;
1171 }
1172 
1173 uint32_t virtio_config_readw(VirtIODevice *vdev, uint32_t addr)
1174 {
1175     VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev);
1176     uint16_t val;
1177 
1178     if (addr + sizeof(val) > vdev->config_len) {
1179         return (uint32_t)-1;
1180     }
1181 
1182     k->get_config(vdev, vdev->config);
1183 
1184     val = lduw_p(vdev->config + addr);
1185     return val;
1186 }
1187 
1188 uint32_t virtio_config_readl(VirtIODevice *vdev, uint32_t addr)
1189 {
1190     VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev);
1191     uint32_t val;
1192 
1193     if (addr + sizeof(val) > vdev->config_len) {
1194         return (uint32_t)-1;
1195     }
1196 
1197     k->get_config(vdev, vdev->config);
1198 
1199     val = ldl_p(vdev->config + addr);
1200     return val;
1201 }
1202 
1203 void virtio_config_writeb(VirtIODevice *vdev, uint32_t addr, uint32_t data)
1204 {
1205     VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev);
1206     uint8_t val = data;
1207 
1208     if (addr + sizeof(val) > vdev->config_len) {
1209         return;
1210     }
1211 
1212     stb_p(vdev->config + addr, val);
1213 
1214     if (k->set_config) {
1215         k->set_config(vdev, vdev->config);
1216     }
1217 }
1218 
1219 void virtio_config_writew(VirtIODevice *vdev, uint32_t addr, uint32_t data)
1220 {
1221     VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev);
1222     uint16_t val = data;
1223 
1224     if (addr + sizeof(val) > vdev->config_len) {
1225         return;
1226     }
1227 
1228     stw_p(vdev->config + addr, val);
1229 
1230     if (k->set_config) {
1231         k->set_config(vdev, vdev->config);
1232     }
1233 }
1234 
1235 void virtio_config_writel(VirtIODevice *vdev, uint32_t addr, uint32_t data)
1236 {
1237     VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev);
1238     uint32_t val = data;
1239 
1240     if (addr + sizeof(val) > vdev->config_len) {
1241         return;
1242     }
1243 
1244     stl_p(vdev->config + addr, val);
1245 
1246     if (k->set_config) {
1247         k->set_config(vdev, vdev->config);
1248     }
1249 }
1250 
1251 uint32_t virtio_config_modern_readb(VirtIODevice *vdev, uint32_t addr)
1252 {
1253     VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev);
1254     uint8_t val;
1255 
1256     if (addr + sizeof(val) > vdev->config_len) {
1257         return (uint32_t)-1;
1258     }
1259 
1260     k->get_config(vdev, vdev->config);
1261 
1262     val = ldub_p(vdev->config + addr);
1263     return val;
1264 }
1265 
1266 uint32_t virtio_config_modern_readw(VirtIODevice *vdev, uint32_t addr)
1267 {
1268     VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev);
1269     uint16_t val;
1270 
1271     if (addr + sizeof(val) > vdev->config_len) {
1272         return (uint32_t)-1;
1273     }
1274 
1275     k->get_config(vdev, vdev->config);
1276 
1277     val = lduw_le_p(vdev->config + addr);
1278     return val;
1279 }
1280 
1281 uint32_t virtio_config_modern_readl(VirtIODevice *vdev, uint32_t addr)
1282 {
1283     VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev);
1284     uint32_t val;
1285 
1286     if (addr + sizeof(val) > vdev->config_len) {
1287         return (uint32_t)-1;
1288     }
1289 
1290     k->get_config(vdev, vdev->config);
1291 
1292     val = ldl_le_p(vdev->config + addr);
1293     return val;
1294 }
1295 
1296 void virtio_config_modern_writeb(VirtIODevice *vdev,
1297                                  uint32_t addr, uint32_t data)
1298 {
1299     VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev);
1300     uint8_t val = data;
1301 
1302     if (addr + sizeof(val) > vdev->config_len) {
1303         return;
1304     }
1305 
1306     stb_p(vdev->config + addr, val);
1307 
1308     if (k->set_config) {
1309         k->set_config(vdev, vdev->config);
1310     }
1311 }
1312 
1313 void virtio_config_modern_writew(VirtIODevice *vdev,
1314                                  uint32_t addr, uint32_t data)
1315 {
1316     VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev);
1317     uint16_t val = data;
1318 
1319     if (addr + sizeof(val) > vdev->config_len) {
1320         return;
1321     }
1322 
1323     stw_le_p(vdev->config + addr, val);
1324 
1325     if (k->set_config) {
1326         k->set_config(vdev, vdev->config);
1327     }
1328 }
1329 
1330 void virtio_config_modern_writel(VirtIODevice *vdev,
1331                                  uint32_t addr, uint32_t data)
1332 {
1333     VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev);
1334     uint32_t val = data;
1335 
1336     if (addr + sizeof(val) > vdev->config_len) {
1337         return;
1338     }
1339 
1340     stl_le_p(vdev->config + addr, val);
1341 
1342     if (k->set_config) {
1343         k->set_config(vdev, vdev->config);
1344     }
1345 }
1346 
1347 void virtio_queue_set_addr(VirtIODevice *vdev, int n, hwaddr addr)
1348 {
1349     vdev->vq[n].vring.desc = addr;
1350     virtio_queue_update_rings(vdev, n);
1351 }
1352 
1353 hwaddr virtio_queue_get_addr(VirtIODevice *vdev, int n)
1354 {
1355     return vdev->vq[n].vring.desc;
1356 }
1357 
1358 void virtio_queue_set_rings(VirtIODevice *vdev, int n, hwaddr desc,
1359                             hwaddr avail, hwaddr used)
1360 {
1361     vdev->vq[n].vring.desc = desc;
1362     vdev->vq[n].vring.avail = avail;
1363     vdev->vq[n].vring.used = used;
1364     virtio_init_region_cache(vdev, n);
1365 }
1366 
1367 void virtio_queue_set_num(VirtIODevice *vdev, int n, int num)
1368 {
1369     /* Don't allow guest to flip queue between existent and
1370      * nonexistent states, or to set it to an invalid size.
1371      */
1372     if (!!num != !!vdev->vq[n].vring.num ||
1373         num > VIRTQUEUE_MAX_SIZE ||
1374         num < 0) {
1375         return;
1376     }
1377     vdev->vq[n].vring.num = num;
1378 }
1379 
1380 VirtQueue *virtio_vector_first_queue(VirtIODevice *vdev, uint16_t vector)
1381 {
1382     return QLIST_FIRST(&vdev->vector_queues[vector]);
1383 }
1384 
1385 VirtQueue *virtio_vector_next_queue(VirtQueue *vq)
1386 {
1387     return QLIST_NEXT(vq, node);
1388 }
1389 
1390 int virtio_queue_get_num(VirtIODevice *vdev, int n)
1391 {
1392     return vdev->vq[n].vring.num;
1393 }
1394 
1395 int virtio_queue_get_max_num(VirtIODevice *vdev, int n)
1396 {
1397     return vdev->vq[n].vring.num_default;
1398 }
1399 
1400 int virtio_get_num_queues(VirtIODevice *vdev)
1401 {
1402     int i;
1403 
1404     for (i = 0; i < VIRTIO_QUEUE_MAX; i++) {
1405         if (!virtio_queue_get_num(vdev, i)) {
1406             break;
1407         }
1408     }
1409 
1410     return i;
1411 }
1412 
1413 void virtio_queue_set_align(VirtIODevice *vdev, int n, int align)
1414 {
1415     BusState *qbus = qdev_get_parent_bus(DEVICE(vdev));
1416     VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
1417 
1418     /* virtio-1 compliant devices cannot change the alignment */
1419     if (virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) {
1420         error_report("tried to modify queue alignment for virtio-1 device");
1421         return;
1422     }
1423     /* Check that the transport told us it was going to do this
1424      * (so a buggy transport will immediately assert rather than
1425      * silently failing to migrate this state)
1426      */
1427     assert(k->has_variable_vring_alignment);
1428 
1429     vdev->vq[n].vring.align = align;
1430     virtio_queue_update_rings(vdev, n);
1431 }
1432 
1433 static bool virtio_queue_notify_aio_vq(VirtQueue *vq)
1434 {
1435     if (vq->vring.desc && vq->handle_aio_output) {
1436         VirtIODevice *vdev = vq->vdev;
1437 
1438         trace_virtio_queue_notify(vdev, vq - vdev->vq, vq);
1439         return vq->handle_aio_output(vdev, vq);
1440     }
1441 
1442     return false;
1443 }
1444 
1445 static void virtio_queue_notify_vq(VirtQueue *vq)
1446 {
1447     if (vq->vring.desc && vq->handle_output) {
1448         VirtIODevice *vdev = vq->vdev;
1449 
1450         if (unlikely(vdev->broken)) {
1451             return;
1452         }
1453 
1454         trace_virtio_queue_notify(vdev, vq - vdev->vq, vq);
1455         vq->handle_output(vdev, vq);
1456     }
1457 }
1458 
1459 void virtio_queue_notify(VirtIODevice *vdev, int n)
1460 {
1461     virtio_queue_notify_vq(&vdev->vq[n]);
1462 }
1463 
1464 uint16_t virtio_queue_vector(VirtIODevice *vdev, int n)
1465 {
1466     return n < VIRTIO_QUEUE_MAX ? vdev->vq[n].vector :
1467         VIRTIO_NO_VECTOR;
1468 }
1469 
1470 void virtio_queue_set_vector(VirtIODevice *vdev, int n, uint16_t vector)
1471 {
1472     VirtQueue *vq = &vdev->vq[n];
1473 
1474     if (n < VIRTIO_QUEUE_MAX) {
1475         if (vdev->vector_queues &&
1476             vdev->vq[n].vector != VIRTIO_NO_VECTOR) {
1477             QLIST_REMOVE(vq, node);
1478         }
1479         vdev->vq[n].vector = vector;
1480         if (vdev->vector_queues &&
1481             vector != VIRTIO_NO_VECTOR) {
1482             QLIST_INSERT_HEAD(&vdev->vector_queues[vector], vq, node);
1483         }
1484     }
1485 }
1486 
1487 VirtQueue *virtio_add_queue(VirtIODevice *vdev, int queue_size,
1488                             VirtIOHandleOutput handle_output)
1489 {
1490     int i;
1491 
1492     for (i = 0; i < VIRTIO_QUEUE_MAX; i++) {
1493         if (vdev->vq[i].vring.num == 0)
1494             break;
1495     }
1496 
1497     if (i == VIRTIO_QUEUE_MAX || queue_size > VIRTQUEUE_MAX_SIZE)
1498         abort();
1499 
1500     vdev->vq[i].vring.num = queue_size;
1501     vdev->vq[i].vring.num_default = queue_size;
1502     vdev->vq[i].vring.align = VIRTIO_PCI_VRING_ALIGN;
1503     vdev->vq[i].handle_output = handle_output;
1504     vdev->vq[i].handle_aio_output = NULL;
1505 
1506     return &vdev->vq[i];
1507 }
1508 
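/*
 * Example (sketch, hypothetical device): queues are normally created from a
 * device's realize function after virtio_init(), e.g.
 *
 *     virtio_init(vdev, "my-device", MY_DEVICE_ID, sizeof(struct my_config));
 *     s->vq = virtio_add_queue(vdev, 128, my_handle_output);
 *
 * MY_DEVICE_ID, my_config, s->vq and my_handle_output are made-up names;
 * 128 is an arbitrary default ring size and must not exceed
 * VIRTQUEUE_MAX_SIZE.
 */
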
1509 void virtio_del_queue(VirtIODevice *vdev, int n)
1510 {
1511     if (n < 0 || n >= VIRTIO_QUEUE_MAX) {
1512         abort();
1513     }
1514 
1515     vdev->vq[n].vring.num = 0;
1516     vdev->vq[n].vring.num_default = 0;
1517 }
1518 
1519 static void virtio_set_isr(VirtIODevice *vdev, int value)
1520 {
1521     uint8_t old = atomic_read(&vdev->isr);
1522 
1523     /* Do not write ISR if it does not change, so that its cacheline remains
1524      * shared in the common case where the guest does not read it.
1525      */
1526     if ((old & value) != value) {
1527         atomic_or(&vdev->isr, value);
1528     }
1529 }
1530 
1531 /* Called within rcu_read_lock().  */
1532 static bool virtio_should_notify(VirtIODevice *vdev, VirtQueue *vq)
1533 {
1534     uint16_t old, new;
1535     bool v;
1536     /* We need to expose used array entries before checking used event. */
1537     smp_mb();
1538     /* Always notify when the queue is empty, if the feature was negotiated. */
1539     if (virtio_vdev_has_feature(vdev, VIRTIO_F_NOTIFY_ON_EMPTY) &&
1540         !vq->inuse && virtio_queue_empty(vq)) {
1541         return true;
1542     }
1543 
1544     if (!virtio_vdev_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX)) {
1545         return !(vring_avail_flags(vq) & VRING_AVAIL_F_NO_INTERRUPT);
1546     }
1547 
1548     v = vq->signalled_used_valid;
1549     vq->signalled_used_valid = true;
1550     old = vq->signalled_used;
1551     new = vq->signalled_used = vq->used_idx;
1552     return !v || vring_need_event(vring_get_used_event(vq), new, old);
1553 }
1554 
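/*
 * Worked example of the EVENT_IDX path above (a sketch): suppose the guest
 * last asked to be signalled once the used index passes used_event == 5,
 * the device previously signalled at old == 4 and has now advanced
 * used_idx to new == 7.  vring_need_event(5, 7, 4) evaluates
 * (uint16_t)(7 - 5 - 1) < (uint16_t)(7 - 4), i.e. 1 < 3, so a notification
 * is sent; had used_idx only reached 5, the comparison would be 65535 < 1
 * and the notification would be suppressed.
 */
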
1555 void virtio_notify_irqfd(VirtIODevice *vdev, VirtQueue *vq)
1556 {
1557     bool should_notify;
1558     rcu_read_lock();
1559     should_notify = virtio_should_notify(vdev, vq);
1560     rcu_read_unlock();
1561 
1562     if (!should_notify) {
1563         return;
1564     }
1565 
1566     trace_virtio_notify_irqfd(vdev, vq);
1567 
1568     /*
1569      * virtio spec 1.0 says ISR bit 0 should be ignored with MSI, but
1570      * windows drivers included in virtio-win 1.8.0 (circa 2015) are
1571      * incorrectly polling this bit during crashdump and hibernation
1572      * in MSI mode, causing a hang if this bit is never updated.
1573      * Recent releases of Windows do not really shut down, but rather
1574      * log out and hibernate to make the next startup faster.  Hence,
1575      * this also manifested as a more serious hang during shutdown.
1576      *
1577      * The next driver release from 2016 fixed this problem, so working
1578      * around it is not a must, but it's easy to do, so let's do it here.
1579      *
1580      * Note: it's safe to update ISR from any thread as it was switched
1581      * to an atomic operation.
1582      */
1583     virtio_set_isr(vq->vdev, 0x1);
1584     event_notifier_set(&vq->guest_notifier);
1585 }
1586 
1587 static void virtio_irq(VirtQueue *vq)
1588 {
1589     virtio_set_isr(vq->vdev, 0x1);
1590     virtio_notify_vector(vq->vdev, vq->vector);
1591 }
1592 
1593 void virtio_notify(VirtIODevice *vdev, VirtQueue *vq)
1594 {
1595     bool should_notify;
1596     rcu_read_lock();
1597     should_notify = virtio_should_notify(vdev, vq);
1598     rcu_read_unlock();
1599 
1600     if (!should_notify) {
1601         return;
1602     }
1603 
1604     trace_virtio_notify(vdev, vq);
1605     virtio_irq(vq);
1606 }
1607 
1608 void virtio_notify_config(VirtIODevice *vdev)
1609 {
1610     if (!(vdev->status & VIRTIO_CONFIG_S_DRIVER_OK))
1611         return;
1612 
1613     virtio_set_isr(vdev, 0x3);
1614     vdev->generation++;
1615     virtio_notify_vector(vdev, vdev->config_vector);
1616 }
1617 
1618 static bool virtio_device_endian_needed(void *opaque)
1619 {
1620     VirtIODevice *vdev = opaque;
1621 
1622     assert(vdev->device_endian != VIRTIO_DEVICE_ENDIAN_UNKNOWN);
1623     if (!virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) {
1624         return vdev->device_endian != virtio_default_endian();
1625     }
1626     /* Devices conforming to VIRTIO 1.0 or later are always LE. */
1627     return vdev->device_endian != VIRTIO_DEVICE_ENDIAN_LITTLE;
1628 }
1629 
1630 static bool virtio_64bit_features_needed(void *opaque)
1631 {
1632     VirtIODevice *vdev = opaque;
1633 
1634     return (vdev->host_features >> 32) != 0;
1635 }
1636 
1637 static bool virtio_virtqueue_needed(void *opaque)
1638 {
1639     VirtIODevice *vdev = opaque;
1640 
1641     return virtio_host_has_feature(vdev, VIRTIO_F_VERSION_1);
1642 }
1643 
1644 static bool virtio_ringsize_needed(void *opaque)
1645 {
1646     VirtIODevice *vdev = opaque;
1647     int i;
1648 
1649     for (i = 0; i < VIRTIO_QUEUE_MAX; i++) {
1650         if (vdev->vq[i].vring.num != vdev->vq[i].vring.num_default) {
1651             return true;
1652         }
1653     }
1654     return false;
1655 }
1656 
1657 static bool virtio_extra_state_needed(void *opaque)
1658 {
1659     VirtIODevice *vdev = opaque;
1660     BusState *qbus = qdev_get_parent_bus(DEVICE(vdev));
1661     VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
1662 
1663     return k->has_extra_state &&
1664         k->has_extra_state(qbus->parent);
1665 }
1666 
1667 static bool virtio_broken_needed(void *opaque)
1668 {
1669     VirtIODevice *vdev = opaque;
1670 
1671     return vdev->broken;
1672 }
1673 
1674 static const VMStateDescription vmstate_virtqueue = {
1675     .name = "virtqueue_state",
1676     .version_id = 1,
1677     .minimum_version_id = 1,
1678     .fields = (VMStateField[]) {
1679         VMSTATE_UINT64(vring.avail, struct VirtQueue),
1680         VMSTATE_UINT64(vring.used, struct VirtQueue),
1681         VMSTATE_END_OF_LIST()
1682     }
1683 };
1684 
1685 static const VMStateDescription vmstate_virtio_virtqueues = {
1686     .name = "virtio/virtqueues",
1687     .version_id = 1,
1688     .minimum_version_id = 1,
1689     .needed = &virtio_virtqueue_needed,
1690     .fields = (VMStateField[]) {
1691         VMSTATE_STRUCT_VARRAY_POINTER_KNOWN(vq, struct VirtIODevice,
1692                       VIRTIO_QUEUE_MAX, 0, vmstate_virtqueue, VirtQueue),
1693         VMSTATE_END_OF_LIST()
1694     }
1695 };
1696 
1697 static const VMStateDescription vmstate_ringsize = {
1698     .name = "ringsize_state",
1699     .version_id = 1,
1700     .minimum_version_id = 1,
1701     .fields = (VMStateField[]) {
1702         VMSTATE_UINT32(vring.num_default, struct VirtQueue),
1703         VMSTATE_END_OF_LIST()
1704     }
1705 };
1706 
1707 static const VMStateDescription vmstate_virtio_ringsize = {
1708     .name = "virtio/ringsize",
1709     .version_id = 1,
1710     .minimum_version_id = 1,
1711     .needed = &virtio_ringsize_needed,
1712     .fields = (VMStateField[]) {
1713         VMSTATE_STRUCT_VARRAY_POINTER_KNOWN(vq, struct VirtIODevice,
1714                       VIRTIO_QUEUE_MAX, 0, vmstate_ringsize, VirtQueue),
1715         VMSTATE_END_OF_LIST()
1716     }
1717 };
1718 
1719 static int get_extra_state(QEMUFile *f, void *pv, size_t size,
1720                            VMStateField *field)
1721 {
1722     VirtIODevice *vdev = pv;
1723     BusState *qbus = qdev_get_parent_bus(DEVICE(vdev));
1724     VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
1725 
1726     if (!k->load_extra_state) {
1727         return -1;
1728     } else {
1729         return k->load_extra_state(qbus->parent, f);
1730     }
1731 }
1732 
1733 static int put_extra_state(QEMUFile *f, void *pv, size_t size,
1734                            VMStateField *field, QJSON *vmdesc)
1735 {
1736     VirtIODevice *vdev = pv;
1737     BusState *qbus = qdev_get_parent_bus(DEVICE(vdev));
1738     VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
1739 
1740     k->save_extra_state(qbus->parent, f);
1741     return 0;
1742 }
1743 
1744 static const VMStateInfo vmstate_info_extra_state = {
1745     .name = "virtqueue_extra_state",
1746     .get = get_extra_state,
1747     .put = put_extra_state,
1748 };
1749 
1750 static const VMStateDescription vmstate_virtio_extra_state = {
1751     .name = "virtio/extra_state",
1752     .version_id = 1,
1753     .minimum_version_id = 1,
1754     .needed = &virtio_extra_state_needed,
1755     .fields = (VMStateField[]) {
1756         {
1757             .name         = "extra_state",
1758             .version_id   = 0,
1759             .field_exists = NULL,
1760             .size         = 0,
1761             .info         = &vmstate_info_extra_state,
1762             .flags        = VMS_SINGLE,
1763             .offset       = 0,
1764         },
1765         VMSTATE_END_OF_LIST()
1766     }
1767 };
1768 
1769 static const VMStateDescription vmstate_virtio_device_endian = {
1770     .name = "virtio/device_endian",
1771     .version_id = 1,
1772     .minimum_version_id = 1,
1773     .needed = &virtio_device_endian_needed,
1774     .fields = (VMStateField[]) {
1775         VMSTATE_UINT8(device_endian, VirtIODevice),
1776         VMSTATE_END_OF_LIST()
1777     }
1778 };
1779 
1780 static const VMStateDescription vmstate_virtio_64bit_features = {
1781     .name = "virtio/64bit_features",
1782     .version_id = 1,
1783     .minimum_version_id = 1,
1784     .needed = &virtio_64bit_features_needed,
1785     .fields = (VMStateField[]) {
1786         VMSTATE_UINT64(guest_features, VirtIODevice),
1787         VMSTATE_END_OF_LIST()
1788     }
1789 };
1790 
1791 static const VMStateDescription vmstate_virtio_broken = {
1792     .name = "virtio/broken",
1793     .version_id = 1,
1794     .minimum_version_id = 1,
1795     .needed = &virtio_broken_needed,
1796     .fields = (VMStateField[]) {
1797         VMSTATE_BOOL(broken, VirtIODevice),
1798         VMSTATE_END_OF_LIST()
1799     }
1800 };
1801 
1802 static const VMStateDescription vmstate_virtio = {
1803     .name = "virtio",
1804     .version_id = 1,
1805     .minimum_version_id = 1,
1806     .minimum_version_id_old = 1,
1807     .fields = (VMStateField[]) {
1808         VMSTATE_END_OF_LIST()
1809     },
1810     .subsections = (const VMStateDescription*[]) {
1811         &vmstate_virtio_device_endian,
1812         &vmstate_virtio_64bit_features,
1813         &vmstate_virtio_virtqueues,
1814         &vmstate_virtio_ringsize,
1815         &vmstate_virtio_broken,
1816         &vmstate_virtio_extra_state,
1817         NULL
1818     }
1819 };
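
/*
 * All optional state above travels in subsections gated by a .needed
 * callback, so an older QEMU that does not know a given subsection never
 * sees it on the wire.  A minimal sketch of how one more optional field
 * could be wired up (the "example" field and callback are hypothetical,
 * not part of this file):
 *
 *     static bool virtio_example_needed(void *opaque)
 *     {
 *         VirtIODevice *vdev = opaque;
 *         return vdev->example != 0;
 *     }
 *
 *     static const VMStateDescription vmstate_virtio_example = {
 *         .name = "virtio/example",
 *         .version_id = 1,
 *         .minimum_version_id = 1,
 *         .needed = &virtio_example_needed,
 *         .fields = (VMStateField[]) {
 *             VMSTATE_UINT32(example, VirtIODevice),
 *             VMSTATE_END_OF_LIST()
 *         }
 *     };
 */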
1820 
1821 void virtio_save(VirtIODevice *vdev, QEMUFile *f)
1822 {
1823     BusState *qbus = qdev_get_parent_bus(DEVICE(vdev));
1824     VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
1825     VirtioDeviceClass *vdc = VIRTIO_DEVICE_GET_CLASS(vdev);
1826     uint32_t guest_features_lo = (vdev->guest_features & 0xffffffff);
1827     int i;
1828 
1829     if (k->save_config) {
1830         k->save_config(qbus->parent, f);
1831     }
1832 
1833     qemu_put_8s(f, &vdev->status);
1834     qemu_put_8s(f, &vdev->isr);
1835     qemu_put_be16s(f, &vdev->queue_sel);
1836     qemu_put_be32s(f, &guest_features_lo);
1837     qemu_put_be32(f, vdev->config_len);
1838     qemu_put_buffer(f, vdev->config, vdev->config_len);
1839 
1840     for (i = 0; i < VIRTIO_QUEUE_MAX; i++) {
1841         if (vdev->vq[i].vring.num == 0)
1842             break;
1843     }
1844 
1845     qemu_put_be32(f, i);
1846 
1847     for (i = 0; i < VIRTIO_QUEUE_MAX; i++) {
1848         if (vdev->vq[i].vring.num == 0)
1849             break;
1850 
1851         qemu_put_be32(f, vdev->vq[i].vring.num);
1852         if (k->has_variable_vring_alignment) {
1853             qemu_put_be32(f, vdev->vq[i].vring.align);
1854         }
1855         /* XXX virtio-1 devices */
1856         qemu_put_be64(f, vdev->vq[i].vring.desc);
1857         qemu_put_be16s(f, &vdev->vq[i].last_avail_idx);
1858         if (k->save_queue) {
1859             k->save_queue(qbus->parent, i, f);
1860         }
1861     }
1862 
1863     if (vdc->save != NULL) {
1864         vdc->save(vdev, f);
1865     }
1866 
1867     if (vdc->vmsd) {
1868         vmstate_save_state(f, vdc->vmsd, vdev, NULL);
1869     }
1870 
1871     /* Subsections */
1872     vmstate_save_state(f, &vmstate_virtio, vdev, NULL);
1873 }
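
/*
 * Informal sketch of the stream produced above (transport data written by
 * save_config/save_queue, the device's own save/vmsd state and the "virtio"
 * subsections appear at the points where those calls are made):
 *
 *     [transport config]
 *     u8   status, u8 isr, be16 queue_sel
 *     be32 low 32 bits of guest_features
 *     be32 config_len, then config_len bytes of config space
 *     be32 number of in-use virtqueues, then per queue:
 *         be32 vring.num, [be32 vring.align], be64 vring.desc,
 *         be16 last_avail_idx, [transport queue state]
 *     [device-specific state, then the subsections]
 */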
1874 
1875 /* A wrapper for use as a VMState .put function */
1876 static int virtio_device_put(QEMUFile *f, void *opaque, size_t size,
1877                               VMStateField *field, QJSON *vmdesc)
1878 {
1879     virtio_save(VIRTIO_DEVICE(opaque), f);
1880 
1881     return 0;
1882 }
1883 
1884 /* A wrapper for use as a VMState .get function */
1885 static int virtio_device_get(QEMUFile *f, void *opaque, size_t size,
1886                              VMStateField *field)
1887 {
1888     VirtIODevice *vdev = VIRTIO_DEVICE(opaque);
1889     DeviceClass *dc = DEVICE_CLASS(VIRTIO_DEVICE_GET_CLASS(vdev));
1890 
1891     return virtio_load(vdev, f, dc->vmsd->version_id);
1892 }
1893 
1894 const VMStateInfo virtio_vmstate_info = {
1895     .name = "virtio",
1896     .get = virtio_device_get,
1897     .put = virtio_device_put,
1898 };
1899 
1900 static int virtio_set_features_nocheck(VirtIODevice *vdev, uint64_t val)
1901 {
1902     VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev);
1903     bool bad = (val & ~(vdev->host_features)) != 0;
1904 
1905     val &= vdev->host_features;
1906     if (k->set_features) {
1907         k->set_features(vdev, val);
1908     }
1909     vdev->guest_features = val;
1910     return bad ? -1 : 0;
1911 }
1912 
1913 int virtio_set_features(VirtIODevice *vdev, uint64_t val)
1914 {
1915     /*
1916      * The driver must not attempt to set features after feature negotiation
1917      * has finished.
1918      */
1919     if (vdev->status & VIRTIO_CONFIG_S_FEATURES_OK) {
1920         return -EINVAL;
1921     }
1922     return virtio_set_features_nocheck(vdev, val);
1923 }
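
/*
 * Once the driver has set FEATURES_OK the accepted bits are frozen; device
 * code then reads the result of negotiation back instead of renegotiating,
 * e.g. (illustrative only):
 *
 *     if (virtio_vdev_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX)) {
 *         ... use the avail_event/used_event ring fields ...
 *     }
 */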
1924 
1925 int virtio_load(VirtIODevice *vdev, QEMUFile *f, int version_id)
1926 {
1927     int i, ret;
1928     int32_t config_len;
1929     uint32_t num;
1930     uint32_t features;
1931     BusState *qbus = qdev_get_parent_bus(DEVICE(vdev));
1932     VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
1933     VirtioDeviceClass *vdc = VIRTIO_DEVICE_GET_CLASS(vdev);
1934 
1935     /*
1936      * We poison the endianness to ensure it does not get used before
1937      * subsections have been loaded.
1938      */
1939     vdev->device_endian = VIRTIO_DEVICE_ENDIAN_UNKNOWN;
1940 
1941     if (k->load_config) {
1942         ret = k->load_config(qbus->parent, f);
1943         if (ret)
1944             return ret;
1945     }
1946 
1947     qemu_get_8s(f, &vdev->status);
1948     qemu_get_8s(f, &vdev->isr);
1949     qemu_get_be16s(f, &vdev->queue_sel);
1950     if (vdev->queue_sel >= VIRTIO_QUEUE_MAX) {
1951         return -1;
1952     }
1953     qemu_get_be32s(f, &features);
1954 
1955     /*
1956      * Temporarily set guest_features low bits - needed by
1957      * virtio net load code testing for VIRTIO_NET_F_CTRL_GUEST_OFFLOADS
1958      * virtio net load code testing for VIRTIO_NET_F_CTRL_GUEST_OFFLOADS,
1959      * VIRTIO_NET_F_GUEST_ANNOUNCE and VIRTIO_NET_F_CTRL_VQ.
1960      *
1961      * Note: devices should always test host features in the future - don't
1962      * create new dependencies like this.
1963     vdev->guest_features = features;
1964 
1965     config_len = qemu_get_be32(f);
1966 
1967     /*
1968      * There are cases where the incoming config can be bigger or smaller
1969      * than what we have; so load what we have space for, and skip
1970      * any excess that's in the stream.
1971      */
1972     qemu_get_buffer(f, vdev->config, MIN(config_len, vdev->config_len));
1973 
1974     while (config_len > vdev->config_len) {
1975         qemu_get_byte(f);
1976         config_len--;
1977     }
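
    /*
     * Example: an incoming 12-byte config loaded into a device with an
     * 8-byte config space fills vdev->config with the first 8 bytes and
     * discards the remaining 4; a 6-byte incoming config leaves the last
     * 2 bytes of vdev->config unchanged.
     */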
1978 
1979     num = qemu_get_be32(f);
1980 
1981     if (num > VIRTIO_QUEUE_MAX) {
1982         error_report("Invalid number of virtqueues: 0x%x", num);
1983         return -1;
1984     }
1985 
1986     for (i = 0; i < num; i++) {
1987         vdev->vq[i].vring.num = qemu_get_be32(f);
1988         if (k->has_variable_vring_alignment) {
1989             vdev->vq[i].vring.align = qemu_get_be32(f);
1990         }
1991         vdev->vq[i].vring.desc = qemu_get_be64(f);
1992         qemu_get_be16s(f, &vdev->vq[i].last_avail_idx);
1993         vdev->vq[i].signalled_used_valid = false;
1994         vdev->vq[i].notification = true;
1995 
1996         if (vdev->vq[i].vring.desc) {
1997             /* XXX virtio-1 devices */
1998             virtio_queue_update_rings(vdev, i);
1999         } else if (vdev->vq[i].last_avail_idx) {
2000             error_report("VQ %d address 0x0 "
2001                          "inconsistent with Host index 0x%x",
2002                          i, vdev->vq[i].last_avail_idx);
2003             return -1;
2004         }
2005         if (k->load_queue) {
2006             ret = k->load_queue(qbus->parent, i, f);
2007             if (ret)
2008                 return ret;
2009         }
2010     }
2011 
2012     virtio_notify_vector(vdev, VIRTIO_NO_VECTOR);
2013 
2014     if (vdc->load != NULL) {
2015         ret = vdc->load(vdev, f, version_id);
2016         if (ret) {
2017             return ret;
2018         }
2019     }
2020 
2021     if (vdc->vmsd) {
2022         ret = vmstate_load_state(f, vdc->vmsd, vdev, version_id);
2023         if (ret) {
2024             return ret;
2025         }
2026     }
2027 
2028     /* Subsections */
2029     ret = vmstate_load_state(f, &vmstate_virtio, vdev, 1);
2030     if (ret) {
2031         return ret;
2032     }
2033 
2034     if (vdev->device_endian == VIRTIO_DEVICE_ENDIAN_UNKNOWN) {
2035         vdev->device_endian = virtio_default_endian();
2036     }
2037 
2038     if (virtio_64bit_features_needed(vdev)) {
2039         /*
2040          * Subsection load filled vdev->guest_features.  Run them
2041          * through virtio_set_features_nocheck to sanity-check them
2042          * against host_features.
2043          */
2044         uint64_t features64 = vdev->guest_features;
2045         if (virtio_set_features_nocheck(vdev, features64) < 0) {
2046             error_report("Features 0x%" PRIx64 " unsupported. "
2047                          "Allowed features: 0x%" PRIx64,
2048                          features64, vdev->host_features);
2049             return -1;
2050         }
2051     } else {
2052         if (virtio_set_features_nocheck(vdev, features) < 0) {
2053             error_report("Features 0x%x unsupported. "
2054                          "Allowed features: 0x%" PRIx64,
2055                          features, vdev->host_features);
2056             return -1;
2057         }
2058     }
2059 
2060     rcu_read_lock();
2061     for (i = 0; i < num; i++) {
2062         if (vdev->vq[i].vring.desc) {
2063             uint16_t nheads;
2064             nheads = vring_avail_idx(&vdev->vq[i]) - vdev->vq[i].last_avail_idx;
2065             /* Check it isn't doing strange things with descriptor numbers. */
2066             if (nheads > vdev->vq[i].vring.num) {
2067                 error_report("VQ %d size 0x%x Guest index 0x%x "
2068                              "inconsistent with Host index 0x%x: delta 0x%x",
2069                              i, vdev->vq[i].vring.num,
2070                              vring_avail_idx(&vdev->vq[i]),
2071                              vdev->vq[i].last_avail_idx, nheads);
2072                 return -1;
2073             }
2074             vdev->vq[i].used_idx = vring_used_idx(&vdev->vq[i]);
2075             vdev->vq[i].shadow_avail_idx = vring_avail_idx(&vdev->vq[i]);
2076 
2077             /*
2078              * Some devices migrate VirtQueueElements that have been popped
2079              * from the avail ring but not yet returned to the used ring.
2080              * Since max ring size < UINT16_MAX it's safe to use modulo
2081              * UINT16_MAX + 1 subtraction.
2082              */
2083             vdev->vq[i].inuse = (uint16_t)(vdev->vq[i].last_avail_idx -
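            /*
             * Worked example: last_avail_idx = 0x0002 and used_idx = 0xfffe
             * (the avail index has wrapped past 0xffff) give
             * (uint16_t)(0x0002 - 0xfffe) = 4 requests still in flight.
             */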
2084                                 vdev->vq[i].used_idx);
2085             if (vdev->vq[i].inuse > vdev->vq[i].vring.num) {
2086                 error_report("VQ %d size 0x%x < last_avail_idx 0x%x - "
2087                              "used_idx 0x%x",
2088                              i, vdev->vq[i].vring.num,
2089                              vdev->vq[i].last_avail_idx,
2090                              vdev->vq[i].used_idx);
2091                 return -1;
2092             }
2093         }
2094     }
2095     rcu_read_unlock();
2096 
2097     return 0;
2098 }
2099 
2100 void virtio_cleanup(VirtIODevice *vdev)
2101 {
2102     qemu_del_vm_change_state_handler(vdev->vmstate);
2103 }
2104 
2105 static void virtio_vmstate_change(void *opaque, int running, RunState state)
2106 {
2107     VirtIODevice *vdev = opaque;
2108     BusState *qbus = qdev_get_parent_bus(DEVICE(vdev));
2109     VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
2110     bool backend_run = running && (vdev->status & VIRTIO_CONFIG_S_DRIVER_OK);
2111     vdev->vm_running = running;
2112 
2113     if (backend_run) {
2114         virtio_set_status(vdev, vdev->status);
2115     }
2116 
2117     if (k->vmstate_change) {
2118         k->vmstate_change(qbus->parent, backend_run);
2119     }
2120 
2121     if (!backend_run) {
2122         virtio_set_status(vdev, vdev->status);
2123     }
2124 }
2125 
2126 void virtio_instance_init_common(Object *proxy_obj, void *data,
2127                                  size_t vdev_size, const char *vdev_name)
2128 {
2129     DeviceState *vdev = data;
2130 
2131     object_initialize(vdev, vdev_size, vdev_name);
2132     object_property_add_child(proxy_obj, "virtio-backend", OBJECT(vdev), NULL);
2133     object_unref(OBJECT(vdev));
2134     qdev_alias_all_properties(vdev, proxy_obj);
2135 }
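
/*
 * Transport proxies typically call this from their instance_init so the
 * backing virtio device becomes a child of the proxy object.  A sketch,
 * with VirtIOBlkPCI/TYPE_VIRTIO_BLK shown purely as an illustration:
 *
 *     static void virtio_blk_pci_instance_init(Object *obj)
 *     {
 *         VirtIOBlkPCI *dev = VIRTIO_BLK_PCI(obj);
 *
 *         virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
 *                                     TYPE_VIRTIO_BLK);
 *     }
 */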
2136 
2137 void virtio_init(VirtIODevice *vdev, const char *name,
2138                  uint16_t device_id, size_t config_size)
2139 {
2140     BusState *qbus = qdev_get_parent_bus(DEVICE(vdev));
2141     VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
2142     int i;
2143     int nvectors = k->query_nvectors ? k->query_nvectors(qbus->parent) : 0;
2144 
2145     if (nvectors) {
2146         vdev->vector_queues =
2147             g_malloc0(sizeof(*vdev->vector_queues) * nvectors);
2148     }
2149 
2150     vdev->device_id = device_id;
2151     vdev->status = 0;
2152     atomic_set(&vdev->isr, 0);
2153     vdev->queue_sel = 0;
2154     vdev->config_vector = VIRTIO_NO_VECTOR;
2155     vdev->vq = g_malloc0(sizeof(VirtQueue) * VIRTIO_QUEUE_MAX);
2156     vdev->vm_running = runstate_is_running();
2157     vdev->broken = false;
2158     for (i = 0; i < VIRTIO_QUEUE_MAX; i++) {
2159         vdev->vq[i].vector = VIRTIO_NO_VECTOR;
2160         vdev->vq[i].vdev = vdev;
2161         vdev->vq[i].queue_index = i;
2162     }
2163 
2164     vdev->name = name;
2165     vdev->config_len = config_size;
2166     if (vdev->config_len) {
2167         vdev->config = g_malloc0(config_size);
2168     } else {
2169         vdev->config = NULL;
2170     }
2171     vdev->vmstate = qemu_add_vm_change_state_handler(virtio_vmstate_change,
2172                                                      vdev);
2173     vdev->device_endian = virtio_default_endian();
2174     vdev->use_guest_notifier_mask = true;
2175 }
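
/*
 * A device's realize typically pairs virtio_init() with virtio_add_queue(),
 * roughly along these lines (device name, queue size and handler are
 * illustrative):
 *
 *     virtio_init(vdev, "virtio-blk", VIRTIO_ID_BLOCK,
 *                 sizeof(struct virtio_blk_config));
 *     s->vq = virtio_add_queue(vdev, 128, virtio_blk_handle_output);
 */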
2176 
2177 hwaddr virtio_queue_get_desc_addr(VirtIODevice *vdev, int n)
2178 {
2179     return vdev->vq[n].vring.desc;
2180 }
2181 
2182 hwaddr virtio_queue_get_avail_addr(VirtIODevice *vdev, int n)
2183 {
2184     return vdev->vq[n].vring.avail;
2185 }
2186 
2187 hwaddr virtio_queue_get_used_addr(VirtIODevice *vdev, int n)
2188 {
2189     return vdev->vq[n].vring.used;
2190 }
2191 
2192 hwaddr virtio_queue_get_desc_size(VirtIODevice *vdev, int n)
2193 {
2194     return sizeof(VRingDesc) * vdev->vq[n].vring.num;
2195 }
2196 
2197 hwaddr virtio_queue_get_avail_size(VirtIODevice *vdev, int n)
2198 {
2199     return offsetof(VRingAvail, ring) +
2200         sizeof(uint16_t) * vdev->vq[n].vring.num;
2201 }
2202 
2203 hwaddr virtio_queue_get_used_size(VirtIODevice *vdev, int n)
2204 {
2205     return offsetof(VRingUsed, ring) +
2206         sizeof(VRingUsedElem) * vdev->vq[n].vring.num;
2207 }
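
/*
 * For a ring of 256 entries the sizes computed above are:
 * desc 256 * 16 = 4096 bytes, avail 4 + 256 * 2 = 516 bytes and
 * used 4 + 256 * 8 = 2052 bytes (the optional event index word used with
 * VIRTIO_RING_F_EVENT_IDX is not included here).
 */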
2208 
2209 uint16_t virtio_queue_get_last_avail_idx(VirtIODevice *vdev, int n)
2210 {
2211     return vdev->vq[n].last_avail_idx;
2212 }
2213 
2214 void virtio_queue_set_last_avail_idx(VirtIODevice *vdev, int n, uint16_t idx)
2215 {
2216     vdev->vq[n].last_avail_idx = idx;
2217     vdev->vq[n].shadow_avail_idx = idx;
2218 }
2219 
2220 void virtio_queue_update_used_idx(VirtIODevice *vdev, int n)
2221 {
2222     rcu_read_lock();
2223     if (vdev->vq[n].vring.desc) {
2224         vdev->vq[n].used_idx = vring_used_idx(&vdev->vq[n]);
2225     }
2226     rcu_read_unlock();
2227 }
2228 
2229 void virtio_queue_invalidate_signalled_used(VirtIODevice *vdev, int n)
2230 {
2231     vdev->vq[n].signalled_used_valid = false;
2232 }
2233 
2234 VirtQueue *virtio_get_queue(VirtIODevice *vdev, int n)
2235 {
2236     return vdev->vq + n;
2237 }
2238 
2239 uint16_t virtio_get_queue_index(VirtQueue *vq)
2240 {
2241     return vq->queue_index;
2242 }
2243 
2244 static void virtio_queue_guest_notifier_read(EventNotifier *n)
2245 {
2246     VirtQueue *vq = container_of(n, VirtQueue, guest_notifier);
2247     if (event_notifier_test_and_clear(n)) {
2248         virtio_irq(vq);
2249     }
2250 }
2251 
2252 void virtio_queue_set_guest_notifier_fd_handler(VirtQueue *vq, bool assign,
2253                                                 bool with_irqfd)
2254 {
2255     if (assign && !with_irqfd) {
2256         event_notifier_set_handler(&vq->guest_notifier,
2257                                    virtio_queue_guest_notifier_read);
2258     } else {
2259         event_notifier_set_handler(&vq->guest_notifier, NULL);
2260     }
2261     if (!assign) {
2262         /* Test and clear notifier before closing it,
2263          * in case poll callback didn't have time to run. */
2264         virtio_queue_guest_notifier_read(&vq->guest_notifier);
2265     }
2266 }
2267 
2268 EventNotifier *virtio_queue_get_guest_notifier(VirtQueue *vq)
2269 {
2270     return &vq->guest_notifier;
2271 }
2272 
2273 static void virtio_queue_host_notifier_aio_read(EventNotifier *n)
2274 {
2275     VirtQueue *vq = container_of(n, VirtQueue, host_notifier);
2276     if (event_notifier_test_and_clear(n)) {
2277         virtio_queue_notify_aio_vq(vq);
2278     }
2279 }
2280 
2281 static void virtio_queue_host_notifier_aio_poll_begin(EventNotifier *n)
2282 {
2283     VirtQueue *vq = container_of(n, VirtQueue, host_notifier);
2284 
2285     virtio_queue_set_notification(vq, 0);
2286 }
2287 
2288 static bool virtio_queue_host_notifier_aio_poll(void *opaque)
2289 {
2290     EventNotifier *n = opaque;
2291     VirtQueue *vq = container_of(n, VirtQueue, host_notifier);
2292     bool progress;
2293 
2294     if (virtio_queue_empty(vq)) {
2295         return false;
2296     }
2297 
2298     progress = virtio_queue_notify_aio_vq(vq);
2299 
2300     /* In case the handler function re-enabled notifications */
2301     virtio_queue_set_notification(vq, 0);
2302     return progress;
2303 }
2304 
2305 static void virtio_queue_host_notifier_aio_poll_end(EventNotifier *n)
2306 {
2307     VirtQueue *vq = container_of(n, VirtQueue, host_notifier);
2308 
2309     /* Caller polls once more after this to catch requests that race with us */
2310     virtio_queue_set_notification(vq, 1);
2311 }
2312 
2313 void virtio_queue_aio_set_host_notifier_handler(VirtQueue *vq, AioContext *ctx,
2314                                                 VirtIOHandleAIOOutput handle_output)
2315 {
2316     if (handle_output) {
2317         vq->handle_aio_output = handle_output;
2318         aio_set_event_notifier(ctx, &vq->host_notifier, true,
2319                                virtio_queue_host_notifier_aio_read,
2320                                virtio_queue_host_notifier_aio_poll);
2321         aio_set_event_notifier_poll(ctx, &vq->host_notifier,
2322                                     virtio_queue_host_notifier_aio_poll_begin,
2323                                     virtio_queue_host_notifier_aio_poll_end);
2324     } else {
2325         aio_set_event_notifier(ctx, &vq->host_notifier, true, NULL, NULL);
2326         /* Test and clear notifier after disabling event,
2327          * in case poll callback didn't have time to run. */
2328         virtio_queue_host_notifier_aio_read(&vq->host_notifier);
2329         vq->handle_aio_output = NULL;
2330     }
2331 }
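
/*
 * Dataplane code uses this to move a queue's ioeventfd handling into an
 * IOThread: pass that thread's AioContext and a handler to attach, or a
 * NULL handler to detach again, e.g. (context and handler names are
 * illustrative):
 *
 *     virtio_queue_aio_set_host_notifier_handler(vq, iothread_ctx, handler);
 *     ...
 *     virtio_queue_aio_set_host_notifier_handler(vq, iothread_ctx, NULL);
 */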
2332 
2333 void virtio_queue_host_notifier_read(EventNotifier *n)
2334 {
2335     VirtQueue *vq = container_of(n, VirtQueue, host_notifier);
2336     if (event_notifier_test_and_clear(n)) {
2337         virtio_queue_notify_vq(vq);
2338     }
2339 }
2340 
2341 EventNotifier *virtio_queue_get_host_notifier(VirtQueue *vq)
2342 {
2343     return &vq->host_notifier;
2344 }
2345 
2346 void virtio_device_set_child_bus_name(VirtIODevice *vdev, char *bus_name)
2347 {
2348     g_free(vdev->bus_name);
2349     vdev->bus_name = g_strdup(bus_name);
2350 }
2351 
2352 void GCC_FMT_ATTR(2, 3) virtio_error(VirtIODevice *vdev, const char *fmt, ...)
2353 {
2354     va_list ap;
2355 
2356     va_start(ap, fmt);
2357     error_vreport(fmt, ap);
2358     va_end(ap);
2359 
2360     vdev->broken = true;
2361 
2362     if (virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) {
2363         virtio_set_status(vdev, vdev->status | VIRTIO_CONFIG_S_NEEDS_RESET);
2364         virtio_notify_config(vdev);
2365     }
2366 }
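
/*
 * Device code reports fatal guest-triggered inconsistencies through this
 * helper instead of aborting, e.g. (values shown are illustrative):
 *
 *     virtio_error(vdev, "Desc next is %u, exceeds ring size %u",
 *                  next, vq->vring.num);
 *
 * The device stays broken until the next reset; for VIRTIO 1.0 devices the
 * NEEDS_RESET status bit is also set above.
 */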
2367 
2368 static void virtio_memory_listener_commit(MemoryListener *listener)
2369 {
2370     VirtIODevice *vdev = container_of(listener, VirtIODevice, listener);
2371     int i;
2372 
2373     for (i = 0; i < VIRTIO_QUEUE_MAX; i++) {
2374         if (vdev->vq[i].vring.num == 0) {
2375             break;
2376         }
2377         virtio_init_region_cache(vdev, i);
2378     }
2379 }
2380 
2381 static void virtio_device_realize(DeviceState *dev, Error **errp)
2382 {
2383     VirtIODevice *vdev = VIRTIO_DEVICE(dev);
2384     VirtioDeviceClass *vdc = VIRTIO_DEVICE_GET_CLASS(dev);
2385     Error *err = NULL;
2386 
2387     /* Devices should either use vmsd or the load/save methods */
2388     assert(!vdc->vmsd || !vdc->load);
2389 
2390     if (vdc->realize != NULL) {
2391         vdc->realize(dev, &err);
2392         if (err != NULL) {
2393             error_propagate(errp, err);
2394             return;
2395         }
2396     }
2397 
2398     virtio_bus_device_plugged(vdev, &err);
2399     if (err != NULL) {
2400         error_propagate(errp, err);
2401         return;
2402     }
2403 
2404     vdev->listener.commit = virtio_memory_listener_commit;
2405     memory_listener_register(&vdev->listener, vdev->dma_as);
2406 }
2407 
2408 static void virtio_device_unrealize(DeviceState *dev, Error **errp)
2409 {
2410     VirtIODevice *vdev = VIRTIO_DEVICE(dev);
2411     VirtioDeviceClass *vdc = VIRTIO_DEVICE_GET_CLASS(dev);
2412     Error *err = NULL;
2413 
2414     virtio_bus_device_unplugged(vdev);
2415 
2416     if (vdc->unrealize != NULL) {
2417         vdc->unrealize(dev, &err);
2418         if (err != NULL) {
2419             error_propagate(errp, err);
2420             return;
2421         }
2422     }
2423 
2424     g_free(vdev->bus_name);
2425     vdev->bus_name = NULL;
2426 }
2427 
2428 static void virtio_device_free_virtqueues(VirtIODevice *vdev)
2429 {
2430     int i;
2431     if (!vdev->vq) {
2432         return;
2433     }
2434 
2435     for (i = 0; i < VIRTIO_QUEUE_MAX; i++) {
2436         VRingMemoryRegionCaches *caches;
2437         if (vdev->vq[i].vring.num == 0) {
2438             break;
2439         }
2440         caches = atomic_read(&vdev->vq[i].vring.caches);
2441         atomic_set(&vdev->vq[i].vring.caches, NULL);
2442         virtio_free_region_cache(caches);
2443     }
2444     g_free(vdev->vq);
2445 }
2446 
2447 static void virtio_device_instance_finalize(Object *obj)
2448 {
2449     VirtIODevice *vdev = VIRTIO_DEVICE(obj);
2450 
2451     memory_listener_unregister(&vdev->listener);
2452     virtio_device_free_virtqueues(vdev);
2453 
2454     g_free(vdev->config);
2455     g_free(vdev->vector_queues);
2456 }
2457 
2458 static Property virtio_properties[] = {
2459     DEFINE_VIRTIO_COMMON_FEATURES(VirtIODevice, host_features),
2460     DEFINE_PROP_END_OF_LIST(),
2461 };
2462 
2463 static int virtio_device_start_ioeventfd_impl(VirtIODevice *vdev)
2464 {
2465     VirtioBusState *qbus = VIRTIO_BUS(qdev_get_parent_bus(DEVICE(vdev)));
2466     int n, r, err;
2467 
2468     for (n = 0; n < VIRTIO_QUEUE_MAX; n++) {
2469         VirtQueue *vq = &vdev->vq[n];
2470         if (!virtio_queue_get_num(vdev, n)) {
2471             continue;
2472         }
2473         r = virtio_bus_set_host_notifier(qbus, n, true);
2474         if (r < 0) {
2475             err = r;
2476             goto assign_error;
2477         }
2478         event_notifier_set_handler(&vq->host_notifier,
2479                                    virtio_queue_host_notifier_read);
2480     }
2481 
2482     for (n = 0; n < VIRTIO_QUEUE_MAX; n++) {
2483         /* Kick right away to begin processing requests already in vring */
2484         VirtQueue *vq = &vdev->vq[n];
2485         if (!vq->vring.num) {
2486             continue;
2487         }
2488         event_notifier_set(&vq->host_notifier);
2489     }
2490     return 0;
2491 
2492 assign_error:
2493     while (--n >= 0) {
2494         VirtQueue *vq = &vdev->vq[n];
2495         if (!virtio_queue_get_num(vdev, n)) {
2496             continue;
2497         }
2498 
2499         event_notifier_set_handler(&vq->host_notifier, NULL);
2500         r = virtio_bus_set_host_notifier(qbus, n, false);
2501         assert(r >= 0);
2502     }
2503     return err;
2504 }
2505 
2506 int virtio_device_start_ioeventfd(VirtIODevice *vdev)
2507 {
2508     BusState *qbus = qdev_get_parent_bus(DEVICE(vdev));
2509     VirtioBusState *vbus = VIRTIO_BUS(qbus);
2510 
2511     return virtio_bus_start_ioeventfd(vbus);
2512 }
2513 
2514 static void virtio_device_stop_ioeventfd_impl(VirtIODevice *vdev)
2515 {
2516     VirtioBusState *qbus = VIRTIO_BUS(qdev_get_parent_bus(DEVICE(vdev)));
2517     int n, r;
2518 
2519     for (n = 0; n < VIRTIO_QUEUE_MAX; n++) {
2520         VirtQueue *vq = &vdev->vq[n];
2521 
2522         if (!virtio_queue_get_num(vdev, n)) {
2523             continue;
2524         }
2525         event_notifier_set_handler(&vq->host_notifier, NULL);
2526         r = virtio_bus_set_host_notifier(qbus, n, false);
2527         assert(r >= 0);
2528     }
2529 }
2530 
2531 void virtio_device_stop_ioeventfd(VirtIODevice *vdev)
2532 {
2533     BusState *qbus = qdev_get_parent_bus(DEVICE(vdev));
2534     VirtioBusState *vbus = VIRTIO_BUS(qbus);
2535 
2536     virtio_bus_stop_ioeventfd(vbus);
2537 }
2538 
2539 int virtio_device_grab_ioeventfd(VirtIODevice *vdev)
2540 {
2541     BusState *qbus = qdev_get_parent_bus(DEVICE(vdev));
2542     VirtioBusState *vbus = VIRTIO_BUS(qbus);
2543 
2544     return virtio_bus_grab_ioeventfd(vbus);
2545 }
2546 
2547 void virtio_device_release_ioeventfd(VirtIODevice *vdev)
2548 {
2549     BusState *qbus = qdev_get_parent_bus(DEVICE(vdev));
2550     VirtioBusState *vbus = VIRTIO_BUS(qbus);
2551 
2552     virtio_bus_release_ioeventfd(vbus);
2553 }
2554 
2555 static void virtio_device_class_init(ObjectClass *klass, void *data)
2556 {
2557     /* Set the default value here. */
2558     VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass);
2559     DeviceClass *dc = DEVICE_CLASS(klass);
2560 
2561     dc->realize = virtio_device_realize;
2562     dc->unrealize = virtio_device_unrealize;
2563     dc->bus_type = TYPE_VIRTIO_BUS;
2564     dc->props = virtio_properties;
2565     vdc->start_ioeventfd = virtio_device_start_ioeventfd_impl;
2566     vdc->stop_ioeventfd = virtio_device_stop_ioeventfd_impl;
2567 
2568     vdc->legacy_features |= VIRTIO_LEGACY_FEATURES;
2569 }
2570 
2571 bool virtio_device_ioeventfd_enabled(VirtIODevice *vdev)
2572 {
2573     BusState *qbus = qdev_get_parent_bus(DEVICE(vdev));
2574     VirtioBusState *vbus = VIRTIO_BUS(qbus);
2575 
2576     return virtio_bus_ioeventfd_enabled(vbus);
2577 }
2578 
2579 static const TypeInfo virtio_device_info = {
2580     .name = TYPE_VIRTIO_DEVICE,
2581     .parent = TYPE_DEVICE,
2582     .instance_size = sizeof(VirtIODevice),
2583     .class_init = virtio_device_class_init,
2584     .instance_finalize = virtio_device_instance_finalize,
2585     .abstract = true,
2586     .class_size = sizeof(VirtioDeviceClass),
2587 };
2588 
2589 static void virtio_register_types(void)
2590 {
2591     type_register_static(&virtio_device_info);
2592 }
2593 
2594 type_init(virtio_register_types)
2595