xref: /openbmc/qemu/hw/net/virtio-net.c (revision 646b5378)
1 /*
2  * Virtio Network Device
3  *
4  * Copyright IBM, Corp. 2007
5  *
6  * Authors:
7  *  Anthony Liguori   <aliguori@us.ibm.com>
8  *
9  * This work is licensed under the terms of the GNU GPL, version 2.  See
10  * the COPYING file in the top-level directory.
11  *
12  */
13 
14 #include "qemu/osdep.h"
15 #include "qemu/atomic.h"
16 #include "qemu/iov.h"
17 #include "qemu/log.h"
18 #include "qemu/main-loop.h"
19 #include "qemu/module.h"
20 #include "hw/virtio/virtio.h"
21 #include "net/net.h"
22 #include "net/checksum.h"
23 #include "net/tap.h"
24 #include "qemu/error-report.h"
25 #include "qemu/timer.h"
26 #include "qemu/option.h"
27 #include "qemu/option_int.h"
28 #include "qemu/config-file.h"
29 #include "qapi/qmp/qdict.h"
30 #include "hw/virtio/virtio-net.h"
31 #include "net/vhost_net.h"
32 #include "net/announce.h"
33 #include "hw/virtio/virtio-bus.h"
34 #include "qapi/error.h"
35 #include "qapi/qapi-events-net.h"
36 #include "hw/qdev-properties.h"
37 #include "qapi/qapi-types-migration.h"
38 #include "qapi/qapi-events-migration.h"
39 #include "hw/virtio/virtio-access.h"
40 #include "migration/misc.h"
41 #include "standard-headers/linux/ethtool.h"
42 #include "sysemu/sysemu.h"
43 #include "sysemu/replay.h"
44 #include "trace.h"
45 #include "monitor/qdev.h"
46 #include "monitor/monitor.h"
47 #include "hw/pci/pci_device.h"
48 #include "net_rx_pkt.h"
49 #include "hw/virtio/vhost.h"
50 #include "sysemu/qtest.h"
51 
/* NOTE(review): presumably the migration stream version; its user is not in view */
#define VIRTIO_NET_VM_VERSION    11

/* previously fixed value */
#define VIRTIO_NET_RX_QUEUE_DEFAULT_SIZE 256
#define VIRTIO_NET_TX_QUEUE_DEFAULT_SIZE 256

/* for now, only allow larger queue_pairs; with virtio-1, guest can downsize */
#define VIRTIO_NET_RX_QUEUE_MIN_SIZE VIRTIO_NET_RX_QUEUE_DEFAULT_SIZE
#define VIRTIO_NET_TX_QUEUE_MIN_SIZE VIRTIO_NET_TX_QUEUE_DEFAULT_SIZE

#define VIRTIO_NET_IP4_ADDR_SIZE   8        /* ipv4 saddr + daddr */

/*
 * NOTE(review): these look like masks applied to the TCP flags/data-offset
 * word (low 6 flag bits, top 4 header-length bits) - confirm against the
 * RSC code that uses them.
 */
#define VIRTIO_NET_TCP_FLAG         0x3F
#define VIRTIO_NET_TCP_HDR_LENGTH   0xF000

/* IPv4 max payload, 16 bits in the header */
#define VIRTIO_NET_MAX_IP4_PAYLOAD (65535 - sizeof(struct ip_header))
#define VIRTIO_NET_MAX_TCP_PAYLOAD 65535

/* header length value in ip header without option */
#define VIRTIO_NET_IP4_HEADER_LENGTH 5

#define VIRTIO_NET_IP6_ADDR_SIZE   32      /* ipv6 saddr + daddr */
#define VIRTIO_NET_MAX_IP6_PAYLOAD VIRTIO_NET_MAX_TCP_PAYLOAD

/*
 * Interval of the timer that purges coalesced packets.  This value affects
 * performance a lot and should be tuned carefully: '300000' (300us) is the
 * recommended value to pass the WHQL test, while '50000' can gain 2x netperf
 * throughput with tso/gso/gro 'off'.
 */
#define VIRTIO_NET_RSC_DEFAULT_INTERVAL 300000

/* Hash types reported to the guest in the supported_hash_types config field */
#define VIRTIO_NET_RSS_SUPPORTED_HASHES (VIRTIO_NET_RSS_HASH_TYPE_IPv4 | \
                                         VIRTIO_NET_RSS_HASH_TYPE_TCPv4 | \
                                         VIRTIO_NET_RSS_HASH_TYPE_UDPv4 | \
                                         VIRTIO_NET_RSS_HASH_TYPE_IPv6 | \
                                         VIRTIO_NET_RSS_HASH_TYPE_TCPv6 | \
                                         VIRTIO_NET_RSS_HASH_TYPE_UDPv6 | \
                                         VIRTIO_NET_RSS_HASH_TYPE_IP_EX | \
                                         VIRTIO_NET_RSS_HASH_TYPE_TCP_EX | \
                                         VIRTIO_NET_RSS_HASH_TYPE_UDP_EX)
92 
/*
 * For each listed feature (or group of features), the offset just past the
 * last struct virtio_net_config field that feature needs.  The table is
 * terminated by an all-zero entry and is referenced by cfg_size_params.
 */
static const VirtIOFeature feature_sizes[] = {
    {.flags = 1ULL << VIRTIO_NET_F_MAC,
     .end = endof(struct virtio_net_config, mac)},
    {.flags = 1ULL << VIRTIO_NET_F_STATUS,
     .end = endof(struct virtio_net_config, status)},
    {.flags = 1ULL << VIRTIO_NET_F_MQ,
     .end = endof(struct virtio_net_config, max_virtqueue_pairs)},
    {.flags = 1ULL << VIRTIO_NET_F_MTU,
     .end = endof(struct virtio_net_config, mtu)},
    {.flags = 1ULL << VIRTIO_NET_F_SPEED_DUPLEX,
     .end = endof(struct virtio_net_config, duplex)},
    {.flags = (1ULL << VIRTIO_NET_F_RSS) | (1ULL << VIRTIO_NET_F_HASH_REPORT),
     .end = endof(struct virtio_net_config, supported_hash_types)},
    {}
};
108 
/*
 * Bounds for the config space size: never smaller than the part ending at
 * the mac field, never larger than the whole struct virtio_net_config, with
 * feature_sizes describing the per-feature extents in between.
 */
static const VirtIOConfigSizeParams cfg_size_params = {
    .min_size = endof(struct virtio_net_config, mac),
    .max_size = sizeof(struct virtio_net_config),
    .feature_sizes = feature_sizes
};
114 
115 static VirtIONetQueue *virtio_net_get_subqueue(NetClientState *nc)
116 {
117     VirtIONet *n = qemu_get_nic_opaque(nc);
118 
119     return &n->vqs[nc->queue_index];
120 }
121 
/* Translate a virtqueue index into the index of the queue pair it belongs to. */
static int vq2q(int queue_index)
{
    /* every queue pair occupies two consecutive virtqueue indexes */
    const int vqs_per_pair = 2;

    return queue_index / vqs_per_pair;
}
126 
127 static void flush_or_purge_queued_packets(NetClientState *nc)
128 {
129     if (!nc->peer) {
130         return;
131     }
132 
133     qemu_flush_or_purge_queued_packets(nc->peer, true);
134     assert(!virtio_net_get_subqueue(nc)->async_tx.elem);
135 }
136 
137 /* TODO
138  * - we could suppress RX interrupt if we were so inclined.
139  */
140 
141 static void virtio_net_get_config(VirtIODevice *vdev, uint8_t *config)
142 {
143     VirtIONet *n = VIRTIO_NET(vdev);
144     struct virtio_net_config netcfg;
145     NetClientState *nc = qemu_get_queue(n->nic);
146     static const MACAddr zero = { .a = { 0, 0, 0, 0, 0, 0 } };
147 
148     int ret = 0;
149     memset(&netcfg, 0 , sizeof(struct virtio_net_config));
150     virtio_stw_p(vdev, &netcfg.status, n->status);
151     virtio_stw_p(vdev, &netcfg.max_virtqueue_pairs, n->max_queue_pairs);
152     virtio_stw_p(vdev, &netcfg.mtu, n->net_conf.mtu);
153     memcpy(netcfg.mac, n->mac, ETH_ALEN);
154     virtio_stl_p(vdev, &netcfg.speed, n->net_conf.speed);
155     netcfg.duplex = n->net_conf.duplex;
156     netcfg.rss_max_key_size = VIRTIO_NET_RSS_MAX_KEY_SIZE;
157     virtio_stw_p(vdev, &netcfg.rss_max_indirection_table_length,
158                  virtio_host_has_feature(vdev, VIRTIO_NET_F_RSS) ?
159                  VIRTIO_NET_RSS_MAX_TABLE_LEN : 1);
160     virtio_stl_p(vdev, &netcfg.supported_hash_types,
161                  VIRTIO_NET_RSS_SUPPORTED_HASHES);
162     memcpy(config, &netcfg, n->config_size);
163 
164     /*
165      * Is this VDPA? No peer means not VDPA: there's no way to
166      * disconnect/reconnect a VDPA peer.
167      */
168     if (nc->peer && nc->peer->info->type == NET_CLIENT_DRIVER_VHOST_VDPA) {
169         ret = vhost_net_get_config(get_vhost_net(nc->peer), (uint8_t *)&netcfg,
170                                    n->config_size);
171         if (ret == -1) {
172             return;
173         }
174 
175         /*
176          * Some NIC/kernel combinations present 0 as the mac address.  As that
177          * is not a legal address, try to proceed with the address from the
178          * QEMU command line in the hope that the address has been configured
179          * correctly elsewhere - just not reported by the device.
180          */
181         if (memcmp(&netcfg.mac, &zero, sizeof(zero)) == 0) {
182             info_report("Zero hardware mac address detected. Ignoring.");
183             memcpy(netcfg.mac, n->mac, ETH_ALEN);
184         }
185 
186         netcfg.status |= virtio_tswap16(vdev,
187                                         n->status & VIRTIO_NET_S_ANNOUNCE);
188         memcpy(config, &netcfg, n->config_size);
189     }
190 }
191 
192 static void virtio_net_set_config(VirtIODevice *vdev, const uint8_t *config)
193 {
194     VirtIONet *n = VIRTIO_NET(vdev);
195     struct virtio_net_config netcfg = {};
196     NetClientState *nc = qemu_get_queue(n->nic);
197 
198     memcpy(&netcfg, config, n->config_size);
199 
200     if (!virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR) &&
201         !virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1) &&
202         memcmp(netcfg.mac, n->mac, ETH_ALEN)) {
203         memcpy(n->mac, netcfg.mac, ETH_ALEN);
204         qemu_format_nic_info_str(qemu_get_queue(n->nic), n->mac);
205     }
206 
207     /*
208      * Is this VDPA? No peer means not VDPA: there's no way to
209      * disconnect/reconnect a VDPA peer.
210      */
211     if (nc->peer && nc->peer->info->type == NET_CLIENT_DRIVER_VHOST_VDPA) {
212         vhost_net_set_config(get_vhost_net(nc->peer),
213                              (uint8_t *)&netcfg, 0, n->config_size,
214                              VHOST_SET_CONFIG_TYPE_FRONTEND);
215       }
216 }
217 
218 static bool virtio_net_started(VirtIONet *n, uint8_t status)
219 {
220     VirtIODevice *vdev = VIRTIO_DEVICE(n);
221     return (status & VIRTIO_CONFIG_S_DRIVER_OK) &&
222         (n->status & VIRTIO_NET_S_LINK_UP) && vdev->vm_running;
223 }
224 
225 static void virtio_net_announce_notify(VirtIONet *net)
226 {
227     VirtIODevice *vdev = VIRTIO_DEVICE(net);
228     trace_virtio_net_announce_notify();
229 
230     net->status |= VIRTIO_NET_S_ANNOUNCE;
231     virtio_notify_config(vdev);
232 }
233 
234 static void virtio_net_announce_timer(void *opaque)
235 {
236     VirtIONet *n = opaque;
237     trace_virtio_net_announce_timer(n->announce_timer.round);
238 
239     n->announce_timer.round--;
240     virtio_net_announce_notify(n);
241 }
242 
243 static void virtio_net_announce(NetClientState *nc)
244 {
245     VirtIONet *n = qemu_get_nic_opaque(nc);
246     VirtIODevice *vdev = VIRTIO_DEVICE(n);
247 
248     /*
249      * Make sure the virtio migration announcement timer isn't running
250      * If it is, let it trigger announcement so that we do not cause
251      * confusion.
252      */
253     if (n->announce_timer.round) {
254         return;
255     }
256 
257     if (virtio_vdev_has_feature(vdev, VIRTIO_NET_F_GUEST_ANNOUNCE) &&
258         virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ)) {
259             virtio_net_announce_notify(n);
260     }
261 }
262 
/*
 * Start or stop the vhost backend so that it follows the device state.
 *
 * vhost should be running exactly when the device is started for @status
 * and the peer link is not down.  Nothing happens when the peer has no vhost
 * backend or when vhost is already in the wanted state.  If starting fails
 * (vnet header swapping needed, MTU rejected, or vhost_net_start() error),
 * vhost_started stays 0 and an error is reported.
 */
static void virtio_net_vhost_status(VirtIONet *n, uint8_t status)
{
    VirtIODevice *vdev = VIRTIO_DEVICE(n);
    NetClientState *nc = qemu_get_queue(n->nic);
    int queue_pairs = n->multiqueue ? n->max_queue_pairs : 1;
    /* number of net clients beyond the data queue pairs, if a cvq exists */
    int cvq = virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ) ?
              n->max_ncs - n->max_queue_pairs : 0;

    if (!get_vhost_net(nc->peer)) {
        return;
    }

    /* wanted state already equals the current state: nothing to do */
    if ((virtio_net_started(n, status) && !nc->peer->link_down) ==
        !!n->vhost_started) {
        return;
    }
    if (!n->vhost_started) {
        int r, i;

        if (n->needs_vnet_hdr_swap) {
            error_report("backend does not support %s vnet headers; "
                         "falling back on userspace virtio",
                         virtio_is_big_endian(vdev) ? "BE" : "LE");
            return;
        }

        /* Any packets outstanding? Purge them to avoid touching rings
         * when vhost is running.
         */
        for (i = 0;  i < queue_pairs; i++) {
            NetClientState *qnc = qemu_get_subqueue(n->nic, i);

            /* Purge both directions: TX and RX. */
            qemu_net_queue_purge(qnc->peer->incoming_queue, qnc);
            qemu_net_queue_purge(qnc->incoming_queue, qnc->peer);
        }

        if (virtio_has_feature(vdev->guest_features, VIRTIO_NET_F_MTU)) {
            r = vhost_net_set_mtu(get_vhost_net(nc->peer), n->net_conf.mtu);
            if (r < 0) {
                error_report("%uBytes MTU not supported by the backend",
                             n->net_conf.mtu);

                return;
            }
        }

        /* set before starting and rolled back below if the start fails */
        n->vhost_started = 1;
        r = vhost_net_start(vdev, n->nic->ncs, queue_pairs, cvq);
        if (r < 0) {
            error_report("unable to start vhost net: %d: "
                         "falling back on userspace virtio", -r);
            n->vhost_started = 0;
        }
    } else {
        vhost_net_stop(vdev, n->nic->ncs, queue_pairs, cvq);
        n->vhost_started = 0;
    }
}
322 
323 static int virtio_net_set_vnet_endian_one(VirtIODevice *vdev,
324                                           NetClientState *peer,
325                                           bool enable)
326 {
327     if (virtio_is_big_endian(vdev)) {
328         return qemu_set_vnet_be(peer, enable);
329     } else {
330         return qemu_set_vnet_le(peer, enable);
331     }
332 }
333 
334 static bool virtio_net_set_vnet_endian(VirtIODevice *vdev, NetClientState *ncs,
335                                        int queue_pairs, bool enable)
336 {
337     int i;
338 
339     for (i = 0; i < queue_pairs; i++) {
340         if (virtio_net_set_vnet_endian_one(vdev, ncs[i].peer, enable) < 0 &&
341             enable) {
342             while (--i >= 0) {
343                 virtio_net_set_vnet_endian_one(vdev, ncs[i].peer, false);
344             }
345 
346             return true;
347         }
348     }
349 
350     return false;
351 }
352 
353 static void virtio_net_vnet_endian_status(VirtIONet *n, uint8_t status)
354 {
355     VirtIODevice *vdev = VIRTIO_DEVICE(n);
356     int queue_pairs = n->multiqueue ? n->max_queue_pairs : 1;
357 
358     if (virtio_net_started(n, status)) {
359         /* Before using the device, we tell the network backend about the
360          * endianness to use when parsing vnet headers. If the backend
361          * can't do it, we fallback onto fixing the headers in the core
362          * virtio-net code.
363          */
364         n->needs_vnet_hdr_swap = n->has_vnet_hdr &&
365                                  virtio_net_set_vnet_endian(vdev, n->nic->ncs,
366                                                             queue_pairs, true);
367     } else if (virtio_net_started(n, vdev->status)) {
368         /* After using the device, we need to reset the network backend to
369          * the default (guest native endianness), otherwise the guest may
370          * lose network connectivity if it is rebooted into a different
371          * endianness.
372          */
373         virtio_net_set_vnet_endian(vdev, n->nic->ncs, queue_pairs, false);
374     }
375 }
376 
377 static void virtio_net_drop_tx_queue_data(VirtIODevice *vdev, VirtQueue *vq)
378 {
379     unsigned int dropped = virtqueue_drop_all(vq);
380     if (dropped) {
381         virtio_notify(vdev, vq);
382     }
383 }
384 
/*
 * React to a change of the virtio device status.
 *
 * The vnet header endianness and the vhost backend are updated first.  Then,
 * for every queue pair, queued packets are flushed if the queue is started
 * and not handled by vhost, and a pending TX (tx_waiting) is either
 * rescheduled or cancelled depending on whether the queue is started.
 */
static void virtio_net_set_status(struct VirtIODevice *vdev, uint8_t status)
{
    VirtIONet *n = VIRTIO_NET(vdev);
    VirtIONetQueue *q;
    int i;
    uint8_t queue_status;

    virtio_net_vnet_endian_status(n, status);
    virtio_net_vhost_status(n, status);

    for (i = 0; i < n->max_queue_pairs; i++) {
        NetClientState *ncs = qemu_get_subqueue(n->nic, i);
        bool queue_started;
        q = &n->vqs[i];

        /* queue pairs that are not currently in use are treated as stopped */
        if ((!n->multiqueue && i != 0) || i >= n->curr_queue_pairs) {
            queue_status = 0;
        } else {
            queue_status = status;
        }
        queue_started =
            virtio_net_started(n, queue_status) && !n->vhost_started;

        if (queue_started) {
            qemu_flush_queued_packets(ncs);
        }

        /* the rest only matters when a transmit is pending on this queue */
        if (!q->tx_waiting) {
            continue;
        }

        if (queue_started) {
            /* re-arm whichever TX mechanism (timer or bh) this queue uses */
            if (q->tx_timer) {
                timer_mod(q->tx_timer,
                               qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + n->tx_timeout);
            } else {
                replay_bh_schedule_event(q->tx_bh);
            }
        } else {
            if (q->tx_timer) {
                timer_del(q->tx_timer);
            } else {
                qemu_bh_cancel(q->tx_bh);
            }
            /* driver is ready and the VM runs, but the link is down */
            if ((n->status & VIRTIO_NET_S_LINK_UP) == 0 &&
                (queue_status & VIRTIO_CONFIG_S_DRIVER_OK) &&
                vdev->vm_running) {
                /* if tx is waiting we are likely have some packets in tx queue
                 * and disabled notification */
                q->tx_waiting = 0;
                virtio_queue_set_notification(q->tx_vq, 1);
                virtio_net_drop_tx_queue_data(vdev, q->tx_vq);
            }
        }
    }
}
441 
442 static void virtio_net_set_link_status(NetClientState *nc)
443 {
444     VirtIONet *n = qemu_get_nic_opaque(nc);
445     VirtIODevice *vdev = VIRTIO_DEVICE(n);
446     uint16_t old_status = n->status;
447 
448     if (nc->link_down)
449         n->status &= ~VIRTIO_NET_S_LINK_UP;
450     else
451         n->status |= VIRTIO_NET_S_LINK_UP;
452 
453     if (n->status != old_status)
454         virtio_notify_config(vdev);
455 
456     virtio_net_set_status(vdev, vdev->status);
457 }
458 
459 static void rxfilter_notify(NetClientState *nc)
460 {
461     VirtIONet *n = qemu_get_nic_opaque(nc);
462 
463     if (nc->rxfilter_notify_enabled) {
464         char *path = object_get_canonical_path(OBJECT(n->qdev));
465         qapi_event_send_nic_rx_filter_changed(n->netclient_name, path);
466         g_free(path);
467 
468         /* disable event notification to avoid events flooding */
469         nc->rxfilter_notify_enabled = 0;
470     }
471 }
472 
473 static intList *get_vlan_table(VirtIONet *n)
474 {
475     intList *list;
476     int i, j;
477 
478     list = NULL;
479     for (i = 0; i < MAX_VLAN >> 5; i++) {
480         for (j = 0; n->vlans[i] && j <= 0x1f; j++) {
481             if (n->vlans[i] & (1U << j)) {
482                 QAPI_LIST_PREPEND(list, (i << 5) + j);
483             }
484         }
485     }
486 
487     return list;
488 }
489 
490 static RxFilterInfo *virtio_net_query_rxfilter(NetClientState *nc)
491 {
492     VirtIONet *n = qemu_get_nic_opaque(nc);
493     VirtIODevice *vdev = VIRTIO_DEVICE(n);
494     RxFilterInfo *info;
495     strList *str_list;
496     int i;
497 
498     info = g_malloc0(sizeof(*info));
499     info->name = g_strdup(nc->name);
500     info->promiscuous = n->promisc;
501 
502     if (n->nouni) {
503         info->unicast = RX_STATE_NONE;
504     } else if (n->alluni) {
505         info->unicast = RX_STATE_ALL;
506     } else {
507         info->unicast = RX_STATE_NORMAL;
508     }
509 
510     if (n->nomulti) {
511         info->multicast = RX_STATE_NONE;
512     } else if (n->allmulti) {
513         info->multicast = RX_STATE_ALL;
514     } else {
515         info->multicast = RX_STATE_NORMAL;
516     }
517 
518     info->broadcast_allowed = n->nobcast;
519     info->multicast_overflow = n->mac_table.multi_overflow;
520     info->unicast_overflow = n->mac_table.uni_overflow;
521 
522     info->main_mac = qemu_mac_strdup_printf(n->mac);
523 
524     str_list = NULL;
525     for (i = 0; i < n->mac_table.first_multi; i++) {
526         QAPI_LIST_PREPEND(str_list,
527                       qemu_mac_strdup_printf(n->mac_table.macs + i * ETH_ALEN));
528     }
529     info->unicast_table = str_list;
530 
531     str_list = NULL;
532     for (i = n->mac_table.first_multi; i < n->mac_table.in_use; i++) {
533         QAPI_LIST_PREPEND(str_list,
534                       qemu_mac_strdup_printf(n->mac_table.macs + i * ETH_ALEN));
535     }
536     info->multicast_table = str_list;
537     info->vlan_table = get_vlan_table(n);
538 
539     if (!virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_VLAN)) {
540         info->vlan = RX_STATE_ALL;
541     } else if (!info->vlan_table) {
542         info->vlan = RX_STATE_NONE;
543     } else {
544         info->vlan = RX_STATE_NORMAL;
545     }
546 
547     /* enable event notification after query */
548     nc->rxfilter_notify_enabled = 1;
549 
550     return info;
551 }
552 
553 static void virtio_net_queue_reset(VirtIODevice *vdev, uint32_t queue_index)
554 {
555     VirtIONet *n = VIRTIO_NET(vdev);
556     NetClientState *nc;
557 
558     /* validate queue_index and skip for cvq */
559     if (queue_index >= n->max_queue_pairs * 2) {
560         return;
561     }
562 
563     nc = qemu_get_subqueue(n->nic, vq2q(queue_index));
564 
565     if (!nc->peer) {
566         return;
567     }
568 
569     if (get_vhost_net(nc->peer) &&
570         nc->peer->info->type == NET_CLIENT_DRIVER_TAP) {
571         vhost_net_virtqueue_reset(vdev, nc, queue_index);
572     }
573 
574     flush_or_purge_queued_packets(nc);
575 }
576 
577 static void virtio_net_queue_enable(VirtIODevice *vdev, uint32_t queue_index)
578 {
579     VirtIONet *n = VIRTIO_NET(vdev);
580     NetClientState *nc;
581     int r;
582 
583     /* validate queue_index and skip for cvq */
584     if (queue_index >= n->max_queue_pairs * 2) {
585         return;
586     }
587 
588     nc = qemu_get_subqueue(n->nic, vq2q(queue_index));
589 
590     if (!nc->peer || !vdev->vhost_started) {
591         return;
592     }
593 
594     if (get_vhost_net(nc->peer) &&
595         nc->peer->info->type == NET_CLIENT_DRIVER_TAP) {
596         r = vhost_net_virtqueue_restart(vdev, nc, queue_index);
597         if (r < 0) {
598             error_report("unable to restart vhost net virtqueue: %d, "
599                             "when resetting the queue", queue_index);
600         }
601     }
602 }
603 
604 static void peer_test_vnet_hdr(VirtIONet *n)
605 {
606     NetClientState *nc = qemu_get_queue(n->nic);
607     if (!nc->peer) {
608         return;
609     }
610 
611     n->has_vnet_hdr = qemu_has_vnet_hdr(nc->peer);
612 }
613 
614 static int peer_has_vnet_hdr(VirtIONet *n)
615 {
616     return n->has_vnet_hdr;
617 }
618 
619 static int peer_has_ufo(VirtIONet *n)
620 {
621     if (!peer_has_vnet_hdr(n))
622         return 0;
623 
624     n->has_ufo = qemu_has_ufo(qemu_get_queue(n->nic)->peer);
625 
626     return n->has_ufo;
627 }
628 
629 static int peer_has_uso(VirtIONet *n)
630 {
631     if (!peer_has_vnet_hdr(n)) {
632         return 0;
633     }
634 
635     return qemu_has_uso(qemu_get_queue(n->nic)->peer);
636 }
637 
638 static void virtio_net_set_mrg_rx_bufs(VirtIONet *n, int mergeable_rx_bufs,
639                                        int version_1, int hash_report)
640 {
641     int i;
642     NetClientState *nc;
643 
644     n->mergeable_rx_bufs = mergeable_rx_bufs;
645 
646     if (version_1) {
647         n->guest_hdr_len = hash_report ?
648             sizeof(struct virtio_net_hdr_v1_hash) :
649             sizeof(struct virtio_net_hdr_mrg_rxbuf);
650         n->rss_data.populate_hash = !!hash_report;
651     } else {
652         n->guest_hdr_len = n->mergeable_rx_bufs ?
653             sizeof(struct virtio_net_hdr_mrg_rxbuf) :
654             sizeof(struct virtio_net_hdr);
655         n->rss_data.populate_hash = false;
656     }
657 
658     for (i = 0; i < n->max_queue_pairs; i++) {
659         nc = qemu_get_subqueue(n->nic, i);
660 
661         if (peer_has_vnet_hdr(n) &&
662             qemu_has_vnet_hdr_len(nc->peer, n->guest_hdr_len)) {
663             qemu_set_vnet_hdr_len(nc->peer, n->guest_hdr_len);
664             n->host_hdr_len = n->guest_hdr_len;
665         }
666     }
667 }
668 
669 static int virtio_net_max_tx_queue_size(VirtIONet *n)
670 {
671     NetClientState *peer = n->nic_conf.peers.ncs[0];
672 
673     /*
674      * Backends other than vhost-user or vhost-vdpa don't support max queue
675      * size.
676      */
677     if (!peer) {
678         return VIRTIO_NET_TX_QUEUE_DEFAULT_SIZE;
679     }
680 
681     switch(peer->info->type) {
682     case NET_CLIENT_DRIVER_VHOST_USER:
683     case NET_CLIENT_DRIVER_VHOST_VDPA:
684         return VIRTQUEUE_MAX_SIZE;
685     default:
686         return VIRTIO_NET_TX_QUEUE_DEFAULT_SIZE;
687     };
688 }
689 
690 static int peer_attach(VirtIONet *n, int index)
691 {
692     NetClientState *nc = qemu_get_subqueue(n->nic, index);
693 
694     if (!nc->peer) {
695         return 0;
696     }
697 
698     if (nc->peer->info->type == NET_CLIENT_DRIVER_VHOST_USER) {
699         vhost_set_vring_enable(nc->peer, 1);
700     }
701 
702     if (nc->peer->info->type != NET_CLIENT_DRIVER_TAP) {
703         return 0;
704     }
705 
706     if (n->max_queue_pairs == 1) {
707         return 0;
708     }
709 
710     return tap_enable(nc->peer);
711 }
712 
713 static int peer_detach(VirtIONet *n, int index)
714 {
715     NetClientState *nc = qemu_get_subqueue(n->nic, index);
716 
717     if (!nc->peer) {
718         return 0;
719     }
720 
721     if (nc->peer->info->type == NET_CLIENT_DRIVER_VHOST_USER) {
722         vhost_set_vring_enable(nc->peer, 0);
723     }
724 
725     if (nc->peer->info->type !=  NET_CLIENT_DRIVER_TAP) {
726         return 0;
727     }
728 
729     return tap_disable(nc->peer);
730 }
731 
732 static void virtio_net_set_queue_pairs(VirtIONet *n)
733 {
734     int i;
735     int r;
736 
737     if (n->nic->peer_deleted) {
738         return;
739     }
740 
741     for (i = 0; i < n->max_queue_pairs; i++) {
742         if (i < n->curr_queue_pairs) {
743             r = peer_attach(n, i);
744             assert(!r);
745         } else {
746             r = peer_detach(n, i);
747             assert(!r);
748         }
749     }
750 }
751 
/*
 * Forward declaration: virtio_net_set_features() calls this, and the
 * definition is not in this part of the file.
 */
static void virtio_net_set_multiqueue(VirtIONet *n, int multiqueue);
753 
/*
 * Compute the feature bits the device offers.
 *
 * Starts from @features plus the configured host features and the MAC
 * feature, then removes what the peer cannot support (offloads, UFO, USO,
 * hash report).  Without a vhost backend that set is returned as is.  With
 * one, the set is additionally filtered through the backend and stored in
 * vdev->backend_features; MTU may be re-added when the backend is bypassed
 * for it, and GUEST_ANNOUNCE is dropped when the backend lacks CTRL_VQ.
 *
 * @errp is not used by this function.
 */
static uint64_t virtio_net_get_features(VirtIODevice *vdev, uint64_t features,
                                        Error **errp)
{
    VirtIONet *n = VIRTIO_NET(vdev);
    NetClientState *nc = qemu_get_queue(n->nic);

    /* Firstly sync all virtio-net possible supported features */
    features |= n->host_features;

    virtio_add_feature(&features, VIRTIO_NET_F_MAC);

    /* none of the offload features can be offered without vnet headers */
    if (!peer_has_vnet_hdr(n)) {
        virtio_clear_feature(&features, VIRTIO_NET_F_CSUM);
        virtio_clear_feature(&features, VIRTIO_NET_F_HOST_TSO4);
        virtio_clear_feature(&features, VIRTIO_NET_F_HOST_TSO6);
        virtio_clear_feature(&features, VIRTIO_NET_F_HOST_ECN);

        virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_CSUM);
        virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_TSO4);
        virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_TSO6);
        virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_ECN);

        virtio_clear_feature(&features, VIRTIO_NET_F_HOST_USO);
        virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO4);
        virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO6);

        virtio_clear_feature(&features, VIRTIO_NET_F_HASH_REPORT);
    }

    if (!peer_has_vnet_hdr(n) || !peer_has_ufo(n)) {
        virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_UFO);
        virtio_clear_feature(&features, VIRTIO_NET_F_HOST_UFO);
    }

    if (!peer_has_uso(n)) {
        virtio_clear_feature(&features, VIRTIO_NET_F_HOST_USO);
        virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO4);
        virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO6);
    }

    /* no vhost backend: nothing more to filter */
    if (!get_vhost_net(nc->peer)) {
        return features;
    }

    /* with vhost, RSS is only offered when the eBPF RSS program is loaded */
    if (!ebpf_rss_is_loaded(&n->ebpf_rss)) {
        virtio_clear_feature(&features, VIRTIO_NET_F_RSS);
    }
    features = vhost_net_get_features(get_vhost_net(nc->peer), features);
    vdev->backend_features = features;

    /* MTU can still be offered when it is handled without the backend */
    if (n->mtu_bypass_backend &&
            (n->host_features & 1ULL << VIRTIO_NET_F_MTU)) {
        features |= (1ULL << VIRTIO_NET_F_MTU);
    }

    /*
     * Since GUEST_ANNOUNCE is emulated the feature bit could be set without
     * enabled. This happens in the vDPA case.
     *
     * Make sure the feature set is not incoherent, as the driver could refuse
     * to start.
     *
     * TODO: QEMU is able to emulate a CVQ just for guest_announce purposes,
     * helping guest to notify the new location with vDPA devices that does not
     * support it.
     */
    if (!virtio_has_feature(vdev->backend_features, VIRTIO_NET_F_CTRL_VQ)) {
        virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_ANNOUNCE);
    }

    return features;
}
826 
827 static uint64_t virtio_net_bad_features(VirtIODevice *vdev)
828 {
829     uint64_t features = 0;
830 
831     /* Linux kernel 2.6.25.  It understood MAC (as everyone must),
832      * but also these: */
833     virtio_add_feature(&features, VIRTIO_NET_F_MAC);
834     virtio_add_feature(&features, VIRTIO_NET_F_CSUM);
835     virtio_add_feature(&features, VIRTIO_NET_F_HOST_TSO4);
836     virtio_add_feature(&features, VIRTIO_NET_F_HOST_TSO6);
837     virtio_add_feature(&features, VIRTIO_NET_F_HOST_ECN);
838 
839     return features;
840 }
841 
842 static void virtio_net_apply_guest_offloads(VirtIONet *n)
843 {
844     qemu_set_offload(qemu_get_queue(n->nic)->peer,
845             !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_CSUM)),
846             !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_TSO4)),
847             !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_TSO6)),
848             !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_ECN)),
849             !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_UFO)),
850             !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_USO4)),
851             !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_USO6)));
852 }
853 
854 static uint64_t virtio_net_guest_offloads_by_features(uint64_t features)
855 {
856     static const uint64_t guest_offloads_mask =
857         (1ULL << VIRTIO_NET_F_GUEST_CSUM) |
858         (1ULL << VIRTIO_NET_F_GUEST_TSO4) |
859         (1ULL << VIRTIO_NET_F_GUEST_TSO6) |
860         (1ULL << VIRTIO_NET_F_GUEST_ECN)  |
861         (1ULL << VIRTIO_NET_F_GUEST_UFO)  |
862         (1ULL << VIRTIO_NET_F_GUEST_USO4) |
863         (1ULL << VIRTIO_NET_F_GUEST_USO6);
864 
865     return guest_offloads_mask & features;
866 }
867 
868 uint64_t virtio_net_supported_guest_offloads(const VirtIONet *n)
869 {
870     VirtIODevice *vdev = VIRTIO_DEVICE(n);
871     return virtio_net_guest_offloads_by_features(vdev->guest_features);
872 }
873 
/* Search state handed to failover_set_primary() during the bus walk. */
typedef struct {
    VirtIONet *n;       /* virtio-net device whose primary is looked for */
    DeviceState *dev;   /* set to the matching device once it is found */
} FailoverDevice;
878 
879 /**
880  * Set the failover primary device
881  *
882  * @opaque: FailoverId to setup
883  * @opts: opts for device we are handling
884  * @errp: returns an error if this function fails
885  */
886 static int failover_set_primary(DeviceState *dev, void *opaque)
887 {
888     FailoverDevice *fdev = opaque;
889     PCIDevice *pci_dev = (PCIDevice *)
890         object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE);
891 
892     if (!pci_dev) {
893         return 0;
894     }
895 
896     if (!g_strcmp0(pci_dev->failover_pair_id, fdev->n->netclient_name)) {
897         fdev->dev = dev;
898         return 1;
899     }
900 
901     return 0;
902 }
903 
904 /**
905  * Find the primary device for this failover virtio-net
906  *
907  * @n: VirtIONet device
908  * @errp: returns an error if this function fails
909  */
910 static DeviceState *failover_find_primary_device(VirtIONet *n)
911 {
912     FailoverDevice fdev = {
913         .n = n,
914     };
915 
916     qbus_walk_children(sysbus_get_default(), failover_set_primary, NULL,
917                        NULL, NULL, &fdev);
918     return fdev.dev;
919 }
920 
921 static void failover_add_primary(VirtIONet *n, Error **errp)
922 {
923     Error *err = NULL;
924     DeviceState *dev = failover_find_primary_device(n);
925 
926     if (dev) {
927         return;
928     }
929 
930     if (!n->primary_opts) {
931         error_setg(errp, "Primary device not found");
932         error_append_hint(errp, "Virtio-net failover will not work. Make "
933                           "sure primary device has parameter"
934                           " failover_pair_id=%s\n", n->netclient_name);
935         return;
936     }
937 
938     dev = qdev_device_add_from_qdict(n->primary_opts,
939                                      n->primary_opts_from_json,
940                                      &err);
941     if (err) {
942         qobject_unref(n->primary_opts);
943         n->primary_opts = NULL;
944     } else {
945         object_unref(OBJECT(dev));
946     }
947     error_propagate(errp, err);
948 }
949 
950 static void virtio_net_set_features(VirtIODevice *vdev, uint64_t features)
951 {
952     VirtIONet *n = VIRTIO_NET(vdev);
953     Error *err = NULL;
954     int i;
955 
956     if (n->mtu_bypass_backend &&
957             !virtio_has_feature(vdev->backend_features, VIRTIO_NET_F_MTU)) {
958         features &= ~(1ULL << VIRTIO_NET_F_MTU);
959     }
960 
961     virtio_net_set_multiqueue(n,
962                               virtio_has_feature(features, VIRTIO_NET_F_RSS) ||
963                               virtio_has_feature(features, VIRTIO_NET_F_MQ));
964 
965     virtio_net_set_mrg_rx_bufs(n,
966                                virtio_has_feature(features,
967                                                   VIRTIO_NET_F_MRG_RXBUF),
968                                virtio_has_feature(features,
969                                                   VIRTIO_F_VERSION_1),
970                                virtio_has_feature(features,
971                                                   VIRTIO_NET_F_HASH_REPORT));
972 
973     n->rsc4_enabled = virtio_has_feature(features, VIRTIO_NET_F_RSC_EXT) &&
974         virtio_has_feature(features, VIRTIO_NET_F_GUEST_TSO4);
975     n->rsc6_enabled = virtio_has_feature(features, VIRTIO_NET_F_RSC_EXT) &&
976         virtio_has_feature(features, VIRTIO_NET_F_GUEST_TSO6);
977     n->rss_data.redirect = virtio_has_feature(features, VIRTIO_NET_F_RSS);
978 
979     if (n->has_vnet_hdr) {
980         n->curr_guest_offloads =
981             virtio_net_guest_offloads_by_features(features);
982         virtio_net_apply_guest_offloads(n);
983     }
984 
985     for (i = 0;  i < n->max_queue_pairs; i++) {
986         NetClientState *nc = qemu_get_subqueue(n->nic, i);
987 
988         if (!get_vhost_net(nc->peer)) {
989             continue;
990         }
991         vhost_net_ack_features(get_vhost_net(nc->peer), features);
992 
993         /*
994          * keep acked_features in NetVhostUserState up-to-date so it
995          * can't miss any features configured by guest virtio driver.
996          */
997         vhost_net_save_acked_features(nc->peer);
998     }
999 
1000     if (!virtio_has_feature(features, VIRTIO_NET_F_CTRL_VLAN)) {
1001         memset(n->vlans, 0xff, MAX_VLAN >> 3);
1002     }
1003 
1004     if (virtio_has_feature(features, VIRTIO_NET_F_STANDBY)) {
1005         qapi_event_send_failover_negotiated(n->netclient_name);
1006         qatomic_set(&n->failover_primary_hidden, false);
1007         failover_add_primary(n, &err);
1008         if (err) {
1009             if (!qtest_enabled()) {
1010                 warn_report_err(err);
1011             } else {
1012                 error_free(err);
1013             }
1014         }
1015     }
1016 }
1017 
1018 static int virtio_net_handle_rx_mode(VirtIONet *n, uint8_t cmd,
1019                                      struct iovec *iov, unsigned int iov_cnt)
1020 {
1021     uint8_t on;
1022     size_t s;
1023     NetClientState *nc = qemu_get_queue(n->nic);
1024 
1025     s = iov_to_buf(iov, iov_cnt, 0, &on, sizeof(on));
1026     if (s != sizeof(on)) {
1027         return VIRTIO_NET_ERR;
1028     }
1029 
1030     if (cmd == VIRTIO_NET_CTRL_RX_PROMISC) {
1031         n->promisc = on;
1032     } else if (cmd == VIRTIO_NET_CTRL_RX_ALLMULTI) {
1033         n->allmulti = on;
1034     } else if (cmd == VIRTIO_NET_CTRL_RX_ALLUNI) {
1035         n->alluni = on;
1036     } else if (cmd == VIRTIO_NET_CTRL_RX_NOMULTI) {
1037         n->nomulti = on;
1038     } else if (cmd == VIRTIO_NET_CTRL_RX_NOUNI) {
1039         n->nouni = on;
1040     } else if (cmd == VIRTIO_NET_CTRL_RX_NOBCAST) {
1041         n->nobcast = on;
1042     } else {
1043         return VIRTIO_NET_ERR;
1044     }
1045 
1046     rxfilter_notify(nc);
1047 
1048     return VIRTIO_NET_OK;
1049 }
1050 
1051 static int virtio_net_handle_offloads(VirtIONet *n, uint8_t cmd,
1052                                      struct iovec *iov, unsigned int iov_cnt)
1053 {
1054     VirtIODevice *vdev = VIRTIO_DEVICE(n);
1055     uint64_t offloads;
1056     size_t s;
1057 
1058     if (!virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS)) {
1059         return VIRTIO_NET_ERR;
1060     }
1061 
1062     s = iov_to_buf(iov, iov_cnt, 0, &offloads, sizeof(offloads));
1063     if (s != sizeof(offloads)) {
1064         return VIRTIO_NET_ERR;
1065     }
1066 
1067     if (cmd == VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET) {
1068         uint64_t supported_offloads;
1069 
1070         offloads = virtio_ldq_p(vdev, &offloads);
1071 
1072         if (!n->has_vnet_hdr) {
1073             return VIRTIO_NET_ERR;
1074         }
1075 
1076         n->rsc4_enabled = virtio_has_feature(offloads, VIRTIO_NET_F_RSC_EXT) &&
1077             virtio_has_feature(offloads, VIRTIO_NET_F_GUEST_TSO4);
1078         n->rsc6_enabled = virtio_has_feature(offloads, VIRTIO_NET_F_RSC_EXT) &&
1079             virtio_has_feature(offloads, VIRTIO_NET_F_GUEST_TSO6);
1080         virtio_clear_feature(&offloads, VIRTIO_NET_F_RSC_EXT);
1081 
1082         supported_offloads = virtio_net_supported_guest_offloads(n);
1083         if (offloads & ~supported_offloads) {
1084             return VIRTIO_NET_ERR;
1085         }
1086 
1087         n->curr_guest_offloads = offloads;
1088         virtio_net_apply_guest_offloads(n);
1089 
1090         return VIRTIO_NET_OK;
1091     } else {
1092         return VIRTIO_NET_ERR;
1093     }
1094 }
1095 
1096 static int virtio_net_handle_mac(VirtIONet *n, uint8_t cmd,
1097                                  struct iovec *iov, unsigned int iov_cnt)
1098 {
1099     VirtIODevice *vdev = VIRTIO_DEVICE(n);
1100     struct virtio_net_ctrl_mac mac_data;
1101     size_t s;
1102     NetClientState *nc = qemu_get_queue(n->nic);
1103 
1104     if (cmd == VIRTIO_NET_CTRL_MAC_ADDR_SET) {
1105         if (iov_size(iov, iov_cnt) != sizeof(n->mac)) {
1106             return VIRTIO_NET_ERR;
1107         }
1108         s = iov_to_buf(iov, iov_cnt, 0, &n->mac, sizeof(n->mac));
1109         assert(s == sizeof(n->mac));
1110         qemu_format_nic_info_str(qemu_get_queue(n->nic), n->mac);
1111         rxfilter_notify(nc);
1112 
1113         return VIRTIO_NET_OK;
1114     }
1115 
1116     if (cmd != VIRTIO_NET_CTRL_MAC_TABLE_SET) {
1117         return VIRTIO_NET_ERR;
1118     }
1119 
1120     int in_use = 0;
1121     int first_multi = 0;
1122     uint8_t uni_overflow = 0;
1123     uint8_t multi_overflow = 0;
1124     uint8_t *macs = g_malloc0(MAC_TABLE_ENTRIES * ETH_ALEN);
1125 
1126     s = iov_to_buf(iov, iov_cnt, 0, &mac_data.entries,
1127                    sizeof(mac_data.entries));
1128     mac_data.entries = virtio_ldl_p(vdev, &mac_data.entries);
1129     if (s != sizeof(mac_data.entries)) {
1130         goto error;
1131     }
1132     iov_discard_front(&iov, &iov_cnt, s);
1133 
1134     if (mac_data.entries * ETH_ALEN > iov_size(iov, iov_cnt)) {
1135         goto error;
1136     }
1137 
1138     if (mac_data.entries <= MAC_TABLE_ENTRIES) {
1139         s = iov_to_buf(iov, iov_cnt, 0, macs,
1140                        mac_data.entries * ETH_ALEN);
1141         if (s != mac_data.entries * ETH_ALEN) {
1142             goto error;
1143         }
1144         in_use += mac_data.entries;
1145     } else {
1146         uni_overflow = 1;
1147     }
1148 
1149     iov_discard_front(&iov, &iov_cnt, mac_data.entries * ETH_ALEN);
1150 
1151     first_multi = in_use;
1152 
1153     s = iov_to_buf(iov, iov_cnt, 0, &mac_data.entries,
1154                    sizeof(mac_data.entries));
1155     mac_data.entries = virtio_ldl_p(vdev, &mac_data.entries);
1156     if (s != sizeof(mac_data.entries)) {
1157         goto error;
1158     }
1159 
1160     iov_discard_front(&iov, &iov_cnt, s);
1161 
1162     if (mac_data.entries * ETH_ALEN != iov_size(iov, iov_cnt)) {
1163         goto error;
1164     }
1165 
1166     if (mac_data.entries <= MAC_TABLE_ENTRIES - in_use) {
1167         s = iov_to_buf(iov, iov_cnt, 0, &macs[in_use * ETH_ALEN],
1168                        mac_data.entries * ETH_ALEN);
1169         if (s != mac_data.entries * ETH_ALEN) {
1170             goto error;
1171         }
1172         in_use += mac_data.entries;
1173     } else {
1174         multi_overflow = 1;
1175     }
1176 
1177     n->mac_table.in_use = in_use;
1178     n->mac_table.first_multi = first_multi;
1179     n->mac_table.uni_overflow = uni_overflow;
1180     n->mac_table.multi_overflow = multi_overflow;
1181     memcpy(n->mac_table.macs, macs, MAC_TABLE_ENTRIES * ETH_ALEN);
1182     g_free(macs);
1183     rxfilter_notify(nc);
1184 
1185     return VIRTIO_NET_OK;
1186 
1187 error:
1188     g_free(macs);
1189     return VIRTIO_NET_ERR;
1190 }
1191 
1192 static int virtio_net_handle_vlan_table(VirtIONet *n, uint8_t cmd,
1193                                         struct iovec *iov, unsigned int iov_cnt)
1194 {
1195     VirtIODevice *vdev = VIRTIO_DEVICE(n);
1196     uint16_t vid;
1197     size_t s;
1198     NetClientState *nc = qemu_get_queue(n->nic);
1199 
1200     s = iov_to_buf(iov, iov_cnt, 0, &vid, sizeof(vid));
1201     vid = virtio_lduw_p(vdev, &vid);
1202     if (s != sizeof(vid)) {
1203         return VIRTIO_NET_ERR;
1204     }
1205 
1206     if (vid >= MAX_VLAN)
1207         return VIRTIO_NET_ERR;
1208 
1209     if (cmd == VIRTIO_NET_CTRL_VLAN_ADD)
1210         n->vlans[vid >> 5] |= (1U << (vid & 0x1f));
1211     else if (cmd == VIRTIO_NET_CTRL_VLAN_DEL)
1212         n->vlans[vid >> 5] &= ~(1U << (vid & 0x1f));
1213     else
1214         return VIRTIO_NET_ERR;
1215 
1216     rxfilter_notify(nc);
1217 
1218     return VIRTIO_NET_OK;
1219 }
1220 
1221 static int virtio_net_handle_announce(VirtIONet *n, uint8_t cmd,
1222                                       struct iovec *iov, unsigned int iov_cnt)
1223 {
1224     trace_virtio_net_handle_announce(n->announce_timer.round);
1225     if (cmd == VIRTIO_NET_CTRL_ANNOUNCE_ACK &&
1226         n->status & VIRTIO_NET_S_ANNOUNCE) {
1227         n->status &= ~VIRTIO_NET_S_ANNOUNCE;
1228         if (n->announce_timer.round) {
1229             qemu_announce_timer_step(&n->announce_timer);
1230         }
1231         return VIRTIO_NET_OK;
1232     } else {
1233         return VIRTIO_NET_ERR;
1234     }
1235 }
1236 
1237 static bool virtio_net_attach_ebpf_to_backend(NICState *nic, int prog_fd)
1238 {
1239     NetClientState *nc = qemu_get_peer(qemu_get_queue(nic), 0);
1240     if (nc == NULL || nc->info->set_steering_ebpf == NULL) {
1241         return false;
1242     }
1243 
1244     trace_virtio_net_rss_attach_ebpf(nic, prog_fd);
1245     return nc->info->set_steering_ebpf(nc, prog_fd);
1246 }
1247 
1248 static void rss_data_to_rss_config(struct VirtioNetRssData *data,
1249                                    struct EBPFRSSConfig *config)
1250 {
1251     config->redirect = data->redirect;
1252     config->populate_hash = data->populate_hash;
1253     config->hash_types = data->hash_types;
1254     config->indirections_len = data->indirections_len;
1255     config->default_queue = data->default_queue;
1256 }
1257 
1258 static bool virtio_net_attach_ebpf_rss(VirtIONet *n)
1259 {
1260     struct EBPFRSSConfig config = {};
1261 
1262     if (!ebpf_rss_is_loaded(&n->ebpf_rss)) {
1263         return false;
1264     }
1265 
1266     rss_data_to_rss_config(&n->rss_data, &config);
1267 
1268     if (!ebpf_rss_set_all(&n->ebpf_rss, &config,
1269                           n->rss_data.indirections_table, n->rss_data.key,
1270                           NULL)) {
1271         return false;
1272     }
1273 
1274     if (!virtio_net_attach_ebpf_to_backend(n->nic, n->ebpf_rss.program_fd)) {
1275         return false;
1276     }
1277 
1278     return true;
1279 }
1280 
1281 static void virtio_net_detach_ebpf_rss(VirtIONet *n)
1282 {
1283     virtio_net_attach_ebpf_to_backend(n->nic, -1);
1284 }
1285 
1286 static void virtio_net_commit_rss_config(VirtIONet *n)
1287 {
1288     if (n->rss_data.enabled) {
1289         n->rss_data.enabled_software_rss = n->rss_data.populate_hash;
1290         if (n->rss_data.populate_hash) {
1291             virtio_net_detach_ebpf_rss(n);
1292         } else if (!virtio_net_attach_ebpf_rss(n)) {
1293             if (get_vhost_net(qemu_get_queue(n->nic)->peer)) {
1294                 warn_report("Can't load eBPF RSS for vhost");
1295             } else {
1296                 warn_report("Can't load eBPF RSS - fallback to software RSS");
1297                 n->rss_data.enabled_software_rss = true;
1298             }
1299         }
1300 
1301         trace_virtio_net_rss_enable(n,
1302                                     n->rss_data.hash_types,
1303                                     n->rss_data.indirections_len,
1304                                     sizeof(n->rss_data.key));
1305     } else {
1306         virtio_net_detach_ebpf_rss(n);
1307         trace_virtio_net_rss_disable(n);
1308     }
1309 }
1310 
1311 static void virtio_net_disable_rss(VirtIONet *n)
1312 {
1313     if (!n->rss_data.enabled) {
1314         return;
1315     }
1316 
1317     n->rss_data.enabled = false;
1318     virtio_net_commit_rss_config(n);
1319 }
1320 
1321 static bool virtio_net_load_ebpf_fds(VirtIONet *n, Error **errp)
1322 {
1323     int fds[EBPF_RSS_MAX_FDS] = { [0 ... EBPF_RSS_MAX_FDS - 1] = -1};
1324     int ret = true;
1325     int i = 0;
1326 
1327     if (n->nr_ebpf_rss_fds != EBPF_RSS_MAX_FDS) {
1328         error_setg(errp, "Expected %d file descriptors but got %d",
1329                    EBPF_RSS_MAX_FDS, n->nr_ebpf_rss_fds);
1330         return false;
1331     }
1332 
1333     for (i = 0; i < n->nr_ebpf_rss_fds; i++) {
1334         fds[i] = monitor_fd_param(monitor_cur(), n->ebpf_rss_fds[i], errp);
1335         if (fds[i] < 0) {
1336             ret = false;
1337             goto exit;
1338         }
1339     }
1340 
1341     ret = ebpf_rss_load_fds(&n->ebpf_rss, fds[0], fds[1], fds[2], fds[3], errp);
1342 
1343 exit:
1344     if (!ret) {
1345         for (i = 0; i < n->nr_ebpf_rss_fds && fds[i] != -1; i++) {
1346             close(fds[i]);
1347         }
1348     }
1349 
1350     return ret;
1351 }
1352 
1353 static bool virtio_net_load_ebpf(VirtIONet *n, Error **errp)
1354 {
1355     bool ret = false;
1356 
1357     if (virtio_net_attach_ebpf_to_backend(n->nic, -1)) {
1358         trace_virtio_net_rss_load(n, n->nr_ebpf_rss_fds, n->ebpf_rss_fds);
1359         if (n->ebpf_rss_fds) {
1360             ret = virtio_net_load_ebpf_fds(n, errp);
1361         } else {
1362             ret = ebpf_rss_load(&n->ebpf_rss, errp);
1363         }
1364     }
1365 
1366     return ret;
1367 }
1368 
1369 static void virtio_net_unload_ebpf(VirtIONet *n)
1370 {
1371     virtio_net_attach_ebpf_to_backend(n->nic, -1);
1372     ebpf_rss_unload(&n->ebpf_rss);
1373 }
1374 
1375 static uint16_t virtio_net_handle_rss(VirtIONet *n,
1376                                       struct iovec *iov,
1377                                       unsigned int iov_cnt,
1378                                       bool do_rss)
1379 {
1380     VirtIODevice *vdev = VIRTIO_DEVICE(n);
1381     struct virtio_net_rss_config cfg;
1382     size_t s, offset = 0, size_get;
1383     uint16_t queue_pairs, i;
1384     struct {
1385         uint16_t us;
1386         uint8_t b;
1387     } QEMU_PACKED temp;
1388     const char *err_msg = "";
1389     uint32_t err_value = 0;
1390 
1391     if (do_rss && !virtio_vdev_has_feature(vdev, VIRTIO_NET_F_RSS)) {
1392         err_msg = "RSS is not negotiated";
1393         goto error;
1394     }
1395     if (!do_rss && !virtio_vdev_has_feature(vdev, VIRTIO_NET_F_HASH_REPORT)) {
1396         err_msg = "Hash report is not negotiated";
1397         goto error;
1398     }
1399     size_get = offsetof(struct virtio_net_rss_config, indirection_table);
1400     s = iov_to_buf(iov, iov_cnt, offset, &cfg, size_get);
1401     if (s != size_get) {
1402         err_msg = "Short command buffer";
1403         err_value = (uint32_t)s;
1404         goto error;
1405     }
1406     n->rss_data.hash_types = virtio_ldl_p(vdev, &cfg.hash_types);
1407     n->rss_data.indirections_len =
1408         virtio_lduw_p(vdev, &cfg.indirection_table_mask);
1409     if (!do_rss) {
1410         n->rss_data.indirections_len = 0;
1411     }
1412     if (n->rss_data.indirections_len >= VIRTIO_NET_RSS_MAX_TABLE_LEN) {
1413         err_msg = "Too large indirection table";
1414         err_value = n->rss_data.indirections_len;
1415         goto error;
1416     }
1417     n->rss_data.indirections_len++;
1418     if (!is_power_of_2(n->rss_data.indirections_len)) {
1419         err_msg = "Invalid size of indirection table";
1420         err_value = n->rss_data.indirections_len;
1421         goto error;
1422     }
1423     n->rss_data.default_queue = do_rss ?
1424         virtio_lduw_p(vdev, &cfg.unclassified_queue) : 0;
1425     if (n->rss_data.default_queue >= n->max_queue_pairs) {
1426         err_msg = "Invalid default queue";
1427         err_value = n->rss_data.default_queue;
1428         goto error;
1429     }
1430     offset += size_get;
1431     size_get = sizeof(uint16_t) * n->rss_data.indirections_len;
1432     g_free(n->rss_data.indirections_table);
1433     n->rss_data.indirections_table = g_malloc(size_get);
1434     if (!n->rss_data.indirections_table) {
1435         err_msg = "Can't allocate indirections table";
1436         err_value = n->rss_data.indirections_len;
1437         goto error;
1438     }
1439     s = iov_to_buf(iov, iov_cnt, offset,
1440                    n->rss_data.indirections_table, size_get);
1441     if (s != size_get) {
1442         err_msg = "Short indirection table buffer";
1443         err_value = (uint32_t)s;
1444         goto error;
1445     }
1446     for (i = 0; i < n->rss_data.indirections_len; ++i) {
1447         uint16_t val = n->rss_data.indirections_table[i];
1448         n->rss_data.indirections_table[i] = virtio_lduw_p(vdev, &val);
1449     }
1450     offset += size_get;
1451     size_get = sizeof(temp);
1452     s = iov_to_buf(iov, iov_cnt, offset, &temp, size_get);
1453     if (s != size_get) {
1454         err_msg = "Can't get queue_pairs";
1455         err_value = (uint32_t)s;
1456         goto error;
1457     }
1458     queue_pairs = do_rss ? virtio_lduw_p(vdev, &temp.us) : n->curr_queue_pairs;
1459     if (queue_pairs == 0 || queue_pairs > n->max_queue_pairs) {
1460         err_msg = "Invalid number of queue_pairs";
1461         err_value = queue_pairs;
1462         goto error;
1463     }
1464     if (temp.b > VIRTIO_NET_RSS_MAX_KEY_SIZE) {
1465         err_msg = "Invalid key size";
1466         err_value = temp.b;
1467         goto error;
1468     }
1469     if (!temp.b && n->rss_data.hash_types) {
1470         err_msg = "No key provided";
1471         err_value = 0;
1472         goto error;
1473     }
1474     if (!temp.b && !n->rss_data.hash_types) {
1475         virtio_net_disable_rss(n);
1476         return queue_pairs;
1477     }
1478     offset += size_get;
1479     size_get = temp.b;
1480     s = iov_to_buf(iov, iov_cnt, offset, n->rss_data.key, size_get);
1481     if (s != size_get) {
1482         err_msg = "Can get key buffer";
1483         err_value = (uint32_t)s;
1484         goto error;
1485     }
1486     n->rss_data.enabled = true;
1487     virtio_net_commit_rss_config(n);
1488     return queue_pairs;
1489 error:
1490     trace_virtio_net_rss_error(n, err_msg, err_value);
1491     virtio_net_disable_rss(n);
1492     return 0;
1493 }
1494 
1495 static int virtio_net_handle_mq(VirtIONet *n, uint8_t cmd,
1496                                 struct iovec *iov, unsigned int iov_cnt)
1497 {
1498     VirtIODevice *vdev = VIRTIO_DEVICE(n);
1499     uint16_t queue_pairs;
1500     NetClientState *nc = qemu_get_queue(n->nic);
1501 
1502     virtio_net_disable_rss(n);
1503     if (cmd == VIRTIO_NET_CTRL_MQ_HASH_CONFIG) {
1504         queue_pairs = virtio_net_handle_rss(n, iov, iov_cnt, false);
1505         return queue_pairs ? VIRTIO_NET_OK : VIRTIO_NET_ERR;
1506     }
1507     if (cmd == VIRTIO_NET_CTRL_MQ_RSS_CONFIG) {
1508         queue_pairs = virtio_net_handle_rss(n, iov, iov_cnt, true);
1509     } else if (cmd == VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET) {
1510         struct virtio_net_ctrl_mq mq;
1511         size_t s;
1512         if (!virtio_vdev_has_feature(vdev, VIRTIO_NET_F_MQ)) {
1513             return VIRTIO_NET_ERR;
1514         }
1515         s = iov_to_buf(iov, iov_cnt, 0, &mq, sizeof(mq));
1516         if (s != sizeof(mq)) {
1517             return VIRTIO_NET_ERR;
1518         }
1519         queue_pairs = virtio_lduw_p(vdev, &mq.virtqueue_pairs);
1520 
1521     } else {
1522         return VIRTIO_NET_ERR;
1523     }
1524 
1525     if (queue_pairs < VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN ||
1526         queue_pairs > VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX ||
1527         queue_pairs > n->max_queue_pairs ||
1528         !n->multiqueue) {
1529         return VIRTIO_NET_ERR;
1530     }
1531 
1532     n->curr_queue_pairs = queue_pairs;
1533     if (nc->peer && nc->peer->info->type == NET_CLIENT_DRIVER_VHOST_VDPA) {
1534         /*
1535          * Avoid updating the backend for a vdpa device: We're only interested
1536          * in updating the device model queues.
1537          */
1538         return VIRTIO_NET_OK;
1539     }
1540     /* stop the backend before changing the number of queue_pairs to avoid handling a
1541      * disabled queue */
1542     virtio_net_set_status(vdev, vdev->status);
1543     virtio_net_set_queue_pairs(n);
1544 
1545     return VIRTIO_NET_OK;
1546 }
1547 
1548 size_t virtio_net_handle_ctrl_iov(VirtIODevice *vdev,
1549                                   const struct iovec *in_sg, unsigned in_num,
1550                                   const struct iovec *out_sg,
1551                                   unsigned out_num)
1552 {
1553     VirtIONet *n = VIRTIO_NET(vdev);
1554     struct virtio_net_ctrl_hdr ctrl;
1555     virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
1556     size_t s;
1557     struct iovec *iov, *iov2;
1558 
1559     if (iov_size(in_sg, in_num) < sizeof(status) ||
1560         iov_size(out_sg, out_num) < sizeof(ctrl)) {
1561         virtio_error(vdev, "virtio-net ctrl missing headers");
1562         return 0;
1563     }
1564 
1565     iov2 = iov = g_memdup2(out_sg, sizeof(struct iovec) * out_num);
1566     s = iov_to_buf(iov, out_num, 0, &ctrl, sizeof(ctrl));
1567     iov_discard_front(&iov, &out_num, sizeof(ctrl));
1568     if (s != sizeof(ctrl)) {
1569         status = VIRTIO_NET_ERR;
1570     } else if (ctrl.class == VIRTIO_NET_CTRL_RX) {
1571         status = virtio_net_handle_rx_mode(n, ctrl.cmd, iov, out_num);
1572     } else if (ctrl.class == VIRTIO_NET_CTRL_MAC) {
1573         status = virtio_net_handle_mac(n, ctrl.cmd, iov, out_num);
1574     } else if (ctrl.class == VIRTIO_NET_CTRL_VLAN) {
1575         status = virtio_net_handle_vlan_table(n, ctrl.cmd, iov, out_num);
1576     } else if (ctrl.class == VIRTIO_NET_CTRL_ANNOUNCE) {
1577         status = virtio_net_handle_announce(n, ctrl.cmd, iov, out_num);
1578     } else if (ctrl.class == VIRTIO_NET_CTRL_MQ) {
1579         status = virtio_net_handle_mq(n, ctrl.cmd, iov, out_num);
1580     } else if (ctrl.class == VIRTIO_NET_CTRL_GUEST_OFFLOADS) {
1581         status = virtio_net_handle_offloads(n, ctrl.cmd, iov, out_num);
1582     }
1583 
1584     s = iov_from_buf(in_sg, in_num, 0, &status, sizeof(status));
1585     assert(s == sizeof(status));
1586 
1587     g_free(iov2);
1588     return sizeof(status);
1589 }
1590 
1591 static void virtio_net_handle_ctrl(VirtIODevice *vdev, VirtQueue *vq)
1592 {
1593     VirtQueueElement *elem;
1594 
1595     for (;;) {
1596         size_t written;
1597         elem = virtqueue_pop(vq, sizeof(VirtQueueElement));
1598         if (!elem) {
1599             break;
1600         }
1601 
1602         written = virtio_net_handle_ctrl_iov(vdev, elem->in_sg, elem->in_num,
1603                                              elem->out_sg, elem->out_num);
1604         if (written > 0) {
1605             virtqueue_push(vq, elem, written);
1606             virtio_notify(vdev, vq);
1607             g_free(elem);
1608         } else {
1609             virtqueue_detach_element(vq, elem, 0);
1610             g_free(elem);
1611             break;
1612         }
1613     }
1614 }
1615 
1616 /* RX */
1617 
1618 static void virtio_net_handle_rx(VirtIODevice *vdev, VirtQueue *vq)
1619 {
1620     VirtIONet *n = VIRTIO_NET(vdev);
1621     int queue_index = vq2q(virtio_get_queue_index(vq));
1622 
1623     qemu_flush_queued_packets(qemu_get_subqueue(n->nic, queue_index));
1624 }
1625 
1626 static bool virtio_net_can_receive(NetClientState *nc)
1627 {
1628     VirtIONet *n = qemu_get_nic_opaque(nc);
1629     VirtIODevice *vdev = VIRTIO_DEVICE(n);
1630     VirtIONetQueue *q = virtio_net_get_subqueue(nc);
1631 
1632     if (!vdev->vm_running) {
1633         return false;
1634     }
1635 
1636     if (nc->queue_index >= n->curr_queue_pairs) {
1637         return false;
1638     }
1639 
1640     if (!virtio_queue_ready(q->rx_vq) ||
1641         !(vdev->status & VIRTIO_CONFIG_S_DRIVER_OK)) {
1642         return false;
1643     }
1644 
1645     return true;
1646 }
1647 
1648 static int virtio_net_has_buffers(VirtIONetQueue *q, int bufsize)
1649 {
1650     int opaque;
1651     unsigned int in_bytes;
1652     VirtIONet *n = q->n;
1653 
1654     while (virtio_queue_empty(q->rx_vq) || n->mergeable_rx_bufs) {
1655         opaque = virtqueue_get_avail_bytes(q->rx_vq, &in_bytes, NULL,
1656                                            bufsize, 0);
1657         /* Buffer is enough, disable notifiaction */
1658         if (bufsize <= in_bytes) {
1659             break;
1660         }
1661 
1662         if (virtio_queue_enable_notification_and_check(q->rx_vq, opaque)) {
1663             /* Guest has added some buffers, try again */
1664             continue;
1665         } else {
1666             return 0;
1667         }
1668     }
1669 
1670     virtio_queue_set_notification(q->rx_vq, 0);
1671 
1672     return 1;
1673 }
1674 
1675 static void virtio_net_hdr_swap(VirtIODevice *vdev, struct virtio_net_hdr *hdr)
1676 {
1677     virtio_tswap16s(vdev, &hdr->hdr_len);
1678     virtio_tswap16s(vdev, &hdr->gso_size);
1679     virtio_tswap16s(vdev, &hdr->csum_start);
1680     virtio_tswap16s(vdev, &hdr->csum_offset);
1681 }
1682 
1683 /* dhclient uses AF_PACKET but doesn't pass auxdata to the kernel so
1684  * it never finds out that the packets don't have valid checksums.  This
1685  * causes dhclient to get upset.  Fedora's carried a patch for ages to
1686  * fix this with Xen but it hasn't appeared in an upstream release of
1687  * dhclient yet.
1688  *
1689  * To avoid breaking existing guests, we catch udp packets and add
1690  * checksums.  This is terrible but it's better than hacking the guest
1691  * kernels.
1692  *
1693  * N.B. if we introduce a zero-copy API, this operation is no longer free so
1694  * we should provide a mechanism to disable it to avoid polluting the host
1695  * cache.
1696  */
1697 static void work_around_broken_dhclient(struct virtio_net_hdr *hdr,
1698                                         uint8_t *buf, size_t size)
1699 {
1700     if ((hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) && /* missing csum */
1701         (size > 27 && size < 1500) && /* normal sized MTU */
1702         (buf[12] == 0x08 && buf[13] == 0x00) && /* ethertype == IPv4 */
1703         (buf[23] == 17) && /* ip.protocol == UDP */
1704         (buf[34] == 0 && buf[35] == 67)) { /* udp.srcport == bootps */
1705         net_checksum_calculate(buf, size, CSUM_UDP);
1706         hdr->flags &= ~VIRTIO_NET_HDR_F_NEEDS_CSUM;
1707     }
1708 }
1709 
1710 static void receive_header(VirtIONet *n, const struct iovec *iov, int iov_cnt,
1711                            const void *buf, size_t size)
1712 {
1713     if (n->has_vnet_hdr) {
1714         /* FIXME this cast is evil */
1715         void *wbuf = (void *)buf;
1716         work_around_broken_dhclient(wbuf, wbuf + n->host_hdr_len,
1717                                     size - n->host_hdr_len);
1718 
1719         if (n->needs_vnet_hdr_swap) {
1720             virtio_net_hdr_swap(VIRTIO_DEVICE(n), wbuf);
1721         }
1722         iov_from_buf(iov, iov_cnt, 0, buf, sizeof(struct virtio_net_hdr));
1723     } else {
1724         struct virtio_net_hdr hdr = {
1725             .flags = 0,
1726             .gso_type = VIRTIO_NET_HDR_GSO_NONE
1727         };
1728         iov_from_buf(iov, iov_cnt, 0, &hdr, sizeof hdr);
1729     }
1730 }
1731 
1732 static int receive_filter(VirtIONet *n, const uint8_t *buf, int size)
1733 {
1734     static const uint8_t bcast[] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
1735     static const uint8_t vlan[] = {0x81, 0x00};
1736     uint8_t *ptr = (uint8_t *)buf;
1737     int i;
1738 
1739     if (n->promisc)
1740         return 1;
1741 
1742     ptr += n->host_hdr_len;
1743 
1744     if (!memcmp(&ptr[12], vlan, sizeof(vlan))) {
1745         int vid = lduw_be_p(ptr + 14) & 0xfff;
1746         if (!(n->vlans[vid >> 5] & (1U << (vid & 0x1f))))
1747             return 0;
1748     }
1749 
1750     if (ptr[0] & 1) { // multicast
1751         if (!memcmp(ptr, bcast, sizeof(bcast))) {
1752             return !n->nobcast;
1753         } else if (n->nomulti) {
1754             return 0;
1755         } else if (n->allmulti || n->mac_table.multi_overflow) {
1756             return 1;
1757         }
1758 
1759         for (i = n->mac_table.first_multi; i < n->mac_table.in_use; i++) {
1760             if (!memcmp(ptr, &n->mac_table.macs[i * ETH_ALEN], ETH_ALEN)) {
1761                 return 1;
1762             }
1763         }
1764     } else { // unicast
1765         if (n->nouni) {
1766             return 0;
1767         } else if (n->alluni || n->mac_table.uni_overflow) {
1768             return 1;
1769         } else if (!memcmp(ptr, n->mac, ETH_ALEN)) {
1770             return 1;
1771         }
1772 
1773         for (i = 0; i < n->mac_table.first_multi; i++) {
1774             if (!memcmp(ptr, &n->mac_table.macs[i * ETH_ALEN], ETH_ALEN)) {
1775                 return 1;
1776             }
1777         }
1778     }
1779 
1780     return 0;
1781 }
1782 
1783 static uint8_t virtio_net_get_hash_type(bool hasip4,
1784                                         bool hasip6,
1785                                         EthL4HdrProto l4hdr_proto,
1786                                         uint32_t types)
1787 {
1788     if (hasip4) {
1789         switch (l4hdr_proto) {
1790         case ETH_L4_HDR_PROTO_TCP:
1791             if (types & VIRTIO_NET_RSS_HASH_TYPE_TCPv4) {
1792                 return NetPktRssIpV4Tcp;
1793             }
1794             break;
1795 
1796         case ETH_L4_HDR_PROTO_UDP:
1797             if (types & VIRTIO_NET_RSS_HASH_TYPE_UDPv4) {
1798                 return NetPktRssIpV4Udp;
1799             }
1800             break;
1801 
1802         default:
1803             break;
1804         }
1805 
1806         if (types & VIRTIO_NET_RSS_HASH_TYPE_IPv4) {
1807             return NetPktRssIpV4;
1808         }
1809     } else if (hasip6) {
1810         switch (l4hdr_proto) {
1811         case ETH_L4_HDR_PROTO_TCP:
1812             if (types & VIRTIO_NET_RSS_HASH_TYPE_TCP_EX) {
1813                 return NetPktRssIpV6TcpEx;
1814             }
1815             if (types & VIRTIO_NET_RSS_HASH_TYPE_TCPv6) {
1816                 return NetPktRssIpV6Tcp;
1817             }
1818             break;
1819 
1820         case ETH_L4_HDR_PROTO_UDP:
1821             if (types & VIRTIO_NET_RSS_HASH_TYPE_UDP_EX) {
1822                 return NetPktRssIpV6UdpEx;
1823             }
1824             if (types & VIRTIO_NET_RSS_HASH_TYPE_UDPv6) {
1825                 return NetPktRssIpV6Udp;
1826             }
1827             break;
1828 
1829         default:
1830             break;
1831         }
1832 
1833         if (types & VIRTIO_NET_RSS_HASH_TYPE_IP_EX) {
1834             return NetPktRssIpV6Ex;
1835         }
1836         if (types & VIRTIO_NET_RSS_HASH_TYPE_IPv6) {
1837             return NetPktRssIpV6;
1838         }
1839     }
1840     return 0xff;
1841 }
1842 
1843 static int virtio_net_process_rss(NetClientState *nc, const uint8_t *buf,
1844                                   size_t size,
1845                                   struct virtio_net_hdr_v1_hash *hdr)
1846 {
1847     VirtIONet *n = qemu_get_nic_opaque(nc);
1848     unsigned int index = nc->queue_index, new_index = index;
1849     struct NetRxPkt *pkt = n->rx_pkt;
1850     uint8_t net_hash_type;
1851     uint32_t hash;
1852     bool hasip4, hasip6;
1853     EthL4HdrProto l4hdr_proto;
1854     static const uint8_t reports[NetPktRssIpV6UdpEx + 1] = {
1855         VIRTIO_NET_HASH_REPORT_IPv4,
1856         VIRTIO_NET_HASH_REPORT_TCPv4,
1857         VIRTIO_NET_HASH_REPORT_TCPv6,
1858         VIRTIO_NET_HASH_REPORT_IPv6,
1859         VIRTIO_NET_HASH_REPORT_IPv6_EX,
1860         VIRTIO_NET_HASH_REPORT_TCPv6_EX,
1861         VIRTIO_NET_HASH_REPORT_UDPv4,
1862         VIRTIO_NET_HASH_REPORT_UDPv6,
1863         VIRTIO_NET_HASH_REPORT_UDPv6_EX
1864     };
1865     struct iovec iov = {
1866         .iov_base = (void *)buf,
1867         .iov_len = size
1868     };
1869 
1870     net_rx_pkt_set_protocols(pkt, &iov, 1, n->host_hdr_len);
1871     net_rx_pkt_get_protocols(pkt, &hasip4, &hasip6, &l4hdr_proto);
1872     net_hash_type = virtio_net_get_hash_type(hasip4, hasip6, l4hdr_proto,
1873                                              n->rss_data.hash_types);
1874     if (net_hash_type > NetPktRssIpV6UdpEx) {
1875         if (n->rss_data.populate_hash) {
1876             hdr->hash_value = VIRTIO_NET_HASH_REPORT_NONE;
1877             hdr->hash_report = 0;
1878         }
1879         return n->rss_data.redirect ? n->rss_data.default_queue : -1;
1880     }
1881 
1882     hash = net_rx_pkt_calc_rss_hash(pkt, net_hash_type, n->rss_data.key);
1883 
1884     if (n->rss_data.populate_hash) {
1885         hdr->hash_value = hash;
1886         hdr->hash_report = reports[net_hash_type];
1887     }
1888 
1889     if (n->rss_data.redirect) {
1890         new_index = hash & (n->rss_data.indirections_len - 1);
1891         new_index = n->rss_data.indirections_table[new_index];
1892     }
1893 
1894     return (index == new_index) ? -1 : new_index;
1895 }
1896 
1897 static ssize_t virtio_net_receive_rcu(NetClientState *nc, const uint8_t *buf,
1898                                       size_t size, bool no_rss)
1899 {
1900     VirtIONet *n = qemu_get_nic_opaque(nc);
1901     VirtIONetQueue *q = virtio_net_get_subqueue(nc);
1902     VirtIODevice *vdev = VIRTIO_DEVICE(n);
1903     VirtQueueElement *elems[VIRTQUEUE_MAX_SIZE];
1904     size_t lens[VIRTQUEUE_MAX_SIZE];
1905     struct iovec mhdr_sg[VIRTQUEUE_MAX_SIZE];
1906     struct virtio_net_hdr_v1_hash extra_hdr;
1907     unsigned mhdr_cnt = 0;
1908     size_t offset, i, guest_offset, j;
1909     ssize_t err;
1910 
1911     if (!virtio_net_can_receive(nc)) {
1912         return -1;
1913     }
1914 
1915     if (!no_rss && n->rss_data.enabled && n->rss_data.enabled_software_rss) {
1916         int index = virtio_net_process_rss(nc, buf, size, &extra_hdr);
1917         if (index >= 0) {
1918             NetClientState *nc2 =
1919                 qemu_get_subqueue(n->nic, index % n->curr_queue_pairs);
1920             return virtio_net_receive_rcu(nc2, buf, size, true);
1921         }
1922     }
1923 
1924     /* hdr_len refers to the header we supply to the guest */
1925     if (!virtio_net_has_buffers(q, size + n->guest_hdr_len - n->host_hdr_len)) {
1926         return 0;
1927     }
1928 
1929     if (!receive_filter(n, buf, size))
1930         return size;
1931 
1932     offset = i = 0;
1933 
1934     while (offset < size) {
1935         VirtQueueElement *elem;
1936         int len, total;
1937         const struct iovec *sg;
1938 
1939         total = 0;
1940 
1941         if (i == VIRTQUEUE_MAX_SIZE) {
1942             virtio_error(vdev, "virtio-net unexpected long buffer chain");
1943             err = size;
1944             goto err;
1945         }
1946 
1947         elem = virtqueue_pop(q->rx_vq, sizeof(VirtQueueElement));
1948         if (!elem) {
1949             if (i) {
1950                 virtio_error(vdev, "virtio-net unexpected empty queue: "
1951                              "i %zd mergeable %d offset %zd, size %zd, "
1952                              "guest hdr len %zd, host hdr len %zd "
1953                              "guest features 0x%" PRIx64,
1954                              i, n->mergeable_rx_bufs, offset, size,
1955                              n->guest_hdr_len, n->host_hdr_len,
1956                              vdev->guest_features);
1957             }
1958             err = -1;
1959             goto err;
1960         }
1961 
1962         if (elem->in_num < 1) {
1963             virtio_error(vdev,
1964                          "virtio-net receive queue contains no in buffers");
1965             virtqueue_detach_element(q->rx_vq, elem, 0);
1966             g_free(elem);
1967             err = -1;
1968             goto err;
1969         }
1970 
1971         sg = elem->in_sg;
1972         if (i == 0) {
1973             assert(offset == 0);
1974             if (n->mergeable_rx_bufs) {
1975                 mhdr_cnt = iov_copy(mhdr_sg, ARRAY_SIZE(mhdr_sg),
1976                                     sg, elem->in_num,
1977                                     offsetof(typeof(extra_hdr), hdr.num_buffers),
1978                                     sizeof(extra_hdr.hdr.num_buffers));
1979             }
1980 
1981             receive_header(n, sg, elem->in_num, buf, size);
1982             if (n->rss_data.populate_hash) {
1983                 offset = offsetof(typeof(extra_hdr), hash_value);
1984                 iov_from_buf(sg, elem->in_num, offset,
1985                              (char *)&extra_hdr + offset,
1986                              sizeof(extra_hdr.hash_value) +
1987                              sizeof(extra_hdr.hash_report));
1988             }
1989             offset = n->host_hdr_len;
1990             total += n->guest_hdr_len;
1991             guest_offset = n->guest_hdr_len;
1992         } else {
1993             guest_offset = 0;
1994         }
1995 
1996         /* copy in packet.  ugh */
1997         len = iov_from_buf(sg, elem->in_num, guest_offset,
1998                            buf + offset, size - offset);
1999         total += len;
2000         offset += len;
2001         /* If buffers can't be merged, at this point we
2002          * must have consumed the complete packet.
2003          * Otherwise, drop it. */
2004         if (!n->mergeable_rx_bufs && offset < size) {
2005             virtqueue_unpop(q->rx_vq, elem, total);
2006             g_free(elem);
2007             err = size;
2008             goto err;
2009         }
2010 
2011         elems[i] = elem;
2012         lens[i] = total;
2013         i++;
2014     }
2015 
2016     if (mhdr_cnt) {
2017         virtio_stw_p(vdev, &extra_hdr.hdr.num_buffers, i);
2018         iov_from_buf(mhdr_sg, mhdr_cnt,
2019                      0,
2020                      &extra_hdr.hdr.num_buffers,
2021                      sizeof extra_hdr.hdr.num_buffers);
2022     }
2023 
2024     for (j = 0; j < i; j++) {
2025         /* signal other side */
2026         virtqueue_fill(q->rx_vq, elems[j], lens[j], j);
2027         g_free(elems[j]);
2028     }
2029 
2030     virtqueue_flush(q->rx_vq, i);
2031     virtio_notify(vdev, q->rx_vq);
2032 
2033     return size;
2034 
2035 err:
2036     for (j = 0; j < i; j++) {
2037         virtqueue_detach_element(q->rx_vq, elems[j], lens[j]);
2038         g_free(elems[j]);
2039     }
2040 
2041     return err;
2042 }
2043 
2044 static ssize_t virtio_net_do_receive(NetClientState *nc, const uint8_t *buf,
2045                                   size_t size)
2046 {
2047     RCU_READ_LOCK_GUARD();
2048 
2049     return virtio_net_receive_rcu(nc, buf, size, false);
2050 }
2051 
2052 static void virtio_net_rsc_extract_unit4(VirtioNetRscChain *chain,
2053                                          const uint8_t *buf,
2054                                          VirtioNetRscUnit *unit)
2055 {
2056     uint16_t ip_hdrlen;
2057     struct ip_header *ip;
2058 
2059     ip = (struct ip_header *)(buf + chain->n->guest_hdr_len
2060                               + sizeof(struct eth_header));
2061     unit->ip = (void *)ip;
2062     ip_hdrlen = (ip->ip_ver_len & 0xF) << 2;
2063     unit->ip_plen = &ip->ip_len;
2064     unit->tcp = (struct tcp_header *)(((uint8_t *)unit->ip) + ip_hdrlen);
2065     unit->tcp_hdrlen = (htons(unit->tcp->th_offset_flags) & 0xF000) >> 10;
2066     unit->payload = htons(*unit->ip_plen) - ip_hdrlen - unit->tcp_hdrlen;
2067 }
2068 
2069 static void virtio_net_rsc_extract_unit6(VirtioNetRscChain *chain,
2070                                          const uint8_t *buf,
2071                                          VirtioNetRscUnit *unit)
2072 {
2073     struct ip6_header *ip6;
2074 
2075     ip6 = (struct ip6_header *)(buf + chain->n->guest_hdr_len
2076                                  + sizeof(struct eth_header));
2077     unit->ip = ip6;
2078     unit->ip_plen = &(ip6->ip6_ctlun.ip6_un1.ip6_un1_plen);
2079     unit->tcp = (struct tcp_header *)(((uint8_t *)unit->ip)
2080                                         + sizeof(struct ip6_header));
2081     unit->tcp_hdrlen = (htons(unit->tcp->th_offset_flags) & 0xF000) >> 10;
2082 
2083     /* There is a difference between payload length in ipv4 and v6,
2084        ip header is excluded in ipv6 */
2085     unit->payload = htons(*unit->ip_plen) - unit->tcp_hdrlen;
2086 }
2087 
2088 static size_t virtio_net_rsc_drain_seg(VirtioNetRscChain *chain,
2089                                        VirtioNetRscSeg *seg)
2090 {
2091     int ret;
2092     struct virtio_net_hdr_v1 *h;
2093 
2094     h = (struct virtio_net_hdr_v1 *)seg->buf;
2095     h->flags = 0;
2096     h->gso_type = VIRTIO_NET_HDR_GSO_NONE;
2097 
2098     if (seg->is_coalesced) {
2099         h->rsc.segments = seg->packets;
2100         h->rsc.dup_acks = seg->dup_ack;
2101         h->flags = VIRTIO_NET_HDR_F_RSC_INFO;
2102         if (chain->proto == ETH_P_IP) {
2103             h->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
2104         } else {
2105             h->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
2106         }
2107     }
2108 
2109     ret = virtio_net_do_receive(seg->nc, seg->buf, seg->size);
2110     QTAILQ_REMOVE(&chain->buffers, seg, next);
2111     g_free(seg->buf);
2112     g_free(seg);
2113 
2114     return ret;
2115 }
2116 
2117 static void virtio_net_rsc_purge(void *opq)
2118 {
2119     VirtioNetRscSeg *seg, *rn;
2120     VirtioNetRscChain *chain = (VirtioNetRscChain *)opq;
2121 
2122     QTAILQ_FOREACH_SAFE(seg, &chain->buffers, next, rn) {
2123         if (virtio_net_rsc_drain_seg(chain, seg) == 0) {
2124             chain->stat.purge_failed++;
2125             continue;
2126         }
2127     }
2128 
2129     chain->stat.timer++;
2130     if (!QTAILQ_EMPTY(&chain->buffers)) {
2131         timer_mod(chain->drain_timer,
2132               qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + chain->n->rsc_timeout);
2133     }
2134 }
2135 
2136 static void virtio_net_rsc_cleanup(VirtIONet *n)
2137 {
2138     VirtioNetRscChain *chain, *rn_chain;
2139     VirtioNetRscSeg *seg, *rn_seg;
2140 
2141     QTAILQ_FOREACH_SAFE(chain, &n->rsc_chains, next, rn_chain) {
2142         QTAILQ_FOREACH_SAFE(seg, &chain->buffers, next, rn_seg) {
2143             QTAILQ_REMOVE(&chain->buffers, seg, next);
2144             g_free(seg->buf);
2145             g_free(seg);
2146         }
2147 
2148         timer_free(chain->drain_timer);
2149         QTAILQ_REMOVE(&n->rsc_chains, chain, next);
2150         g_free(chain);
2151     }
2152 }
2153 
2154 static void virtio_net_rsc_cache_buf(VirtioNetRscChain *chain,
2155                                      NetClientState *nc,
2156                                      const uint8_t *buf, size_t size)
2157 {
2158     uint16_t hdr_len;
2159     VirtioNetRscSeg *seg;
2160 
2161     hdr_len = chain->n->guest_hdr_len;
2162     seg = g_new(VirtioNetRscSeg, 1);
2163     seg->buf = g_malloc(hdr_len + sizeof(struct eth_header)
2164         + sizeof(struct ip6_header) + VIRTIO_NET_MAX_TCP_PAYLOAD);
2165     memcpy(seg->buf, buf, size);
2166     seg->size = size;
2167     seg->packets = 1;
2168     seg->dup_ack = 0;
2169     seg->is_coalesced = 0;
2170     seg->nc = nc;
2171 
2172     QTAILQ_INSERT_TAIL(&chain->buffers, seg, next);
2173     chain->stat.cache++;
2174 
2175     switch (chain->proto) {
2176     case ETH_P_IP:
2177         virtio_net_rsc_extract_unit4(chain, seg->buf, &seg->unit);
2178         break;
2179     case ETH_P_IPV6:
2180         virtio_net_rsc_extract_unit6(chain, seg->buf, &seg->unit);
2181         break;
2182     default:
2183         g_assert_not_reached();
2184     }
2185 }
2186 
2187 static int32_t virtio_net_rsc_handle_ack(VirtioNetRscChain *chain,
2188                                          VirtioNetRscSeg *seg,
2189                                          const uint8_t *buf,
2190                                          struct tcp_header *n_tcp,
2191                                          struct tcp_header *o_tcp)
2192 {
2193     uint32_t nack, oack;
2194     uint16_t nwin, owin;
2195 
2196     nack = htonl(n_tcp->th_ack);
2197     nwin = htons(n_tcp->th_win);
2198     oack = htonl(o_tcp->th_ack);
2199     owin = htons(o_tcp->th_win);
2200 
2201     if ((nack - oack) >= VIRTIO_NET_MAX_TCP_PAYLOAD) {
2202         chain->stat.ack_out_of_win++;
2203         return RSC_FINAL;
2204     } else if (nack == oack) {
2205         /* duplicated ack or window probe */
2206         if (nwin == owin) {
2207             /* duplicated ack, add dup ack count due to whql test up to 1 */
2208             chain->stat.dup_ack++;
2209             return RSC_FINAL;
2210         } else {
2211             /* Coalesce window update */
2212             o_tcp->th_win = n_tcp->th_win;
2213             chain->stat.win_update++;
2214             return RSC_COALESCE;
2215         }
2216     } else {
2217         /* pure ack, go to 'C', finalize*/
2218         chain->stat.pure_ack++;
2219         return RSC_FINAL;
2220     }
2221 }
2222 
2223 static int32_t virtio_net_rsc_coalesce_data(VirtioNetRscChain *chain,
2224                                             VirtioNetRscSeg *seg,
2225                                             const uint8_t *buf,
2226                                             VirtioNetRscUnit *n_unit)
2227 {
2228     void *data;
2229     uint16_t o_ip_len;
2230     uint32_t nseq, oseq;
2231     VirtioNetRscUnit *o_unit;
2232 
2233     o_unit = &seg->unit;
2234     o_ip_len = htons(*o_unit->ip_plen);
2235     nseq = htonl(n_unit->tcp->th_seq);
2236     oseq = htonl(o_unit->tcp->th_seq);
2237 
2238     /* out of order or retransmitted. */
2239     if ((nseq - oseq) > VIRTIO_NET_MAX_TCP_PAYLOAD) {
2240         chain->stat.data_out_of_win++;
2241         return RSC_FINAL;
2242     }
2243 
2244     data = ((uint8_t *)n_unit->tcp) + n_unit->tcp_hdrlen;
2245     if (nseq == oseq) {
2246         if ((o_unit->payload == 0) && n_unit->payload) {
2247             /* From no payload to payload, normal case, not a dup ack or etc */
2248             chain->stat.data_after_pure_ack++;
2249             goto coalesce;
2250         } else {
2251             return virtio_net_rsc_handle_ack(chain, seg, buf,
2252                                              n_unit->tcp, o_unit->tcp);
2253         }
2254     } else if ((nseq - oseq) != o_unit->payload) {
2255         /* Not a consistent packet, out of order */
2256         chain->stat.data_out_of_order++;
2257         return RSC_FINAL;
2258     } else {
2259 coalesce:
2260         if ((o_ip_len + n_unit->payload) > chain->max_payload) {
2261             chain->stat.over_size++;
2262             return RSC_FINAL;
2263         }
2264 
2265         /* Here comes the right data, the payload length in v4/v6 is different,
2266            so use the field value to update and record the new data len */
2267         o_unit->payload += n_unit->payload; /* update new data len */
2268 
2269         /* update field in ip header */
2270         *o_unit->ip_plen = htons(o_ip_len + n_unit->payload);
2271 
2272         /* Bring 'PUSH' big, the whql test guide says 'PUSH' can be coalesced
2273            for windows guest, while this may change the behavior for linux
2274            guest (only if it uses RSC feature). */
2275         o_unit->tcp->th_offset_flags = n_unit->tcp->th_offset_flags;
2276 
2277         o_unit->tcp->th_ack = n_unit->tcp->th_ack;
2278         o_unit->tcp->th_win = n_unit->tcp->th_win;
2279 
2280         memmove(seg->buf + seg->size, data, n_unit->payload);
2281         seg->size += n_unit->payload;
2282         seg->packets++;
2283         chain->stat.coalesced++;
2284         return RSC_COALESCE;
2285     }
2286 }
2287 
2288 static int32_t virtio_net_rsc_coalesce4(VirtioNetRscChain *chain,
2289                                         VirtioNetRscSeg *seg,
2290                                         const uint8_t *buf, size_t size,
2291                                         VirtioNetRscUnit *unit)
2292 {
2293     struct ip_header *ip1, *ip2;
2294 
2295     ip1 = (struct ip_header *)(unit->ip);
2296     ip2 = (struct ip_header *)(seg->unit.ip);
2297     if ((ip1->ip_src ^ ip2->ip_src) || (ip1->ip_dst ^ ip2->ip_dst)
2298         || (unit->tcp->th_sport ^ seg->unit.tcp->th_sport)
2299         || (unit->tcp->th_dport ^ seg->unit.tcp->th_dport)) {
2300         chain->stat.no_match++;
2301         return RSC_NO_MATCH;
2302     }
2303 
2304     return virtio_net_rsc_coalesce_data(chain, seg, buf, unit);
2305 }
2306 
2307 static int32_t virtio_net_rsc_coalesce6(VirtioNetRscChain *chain,
2308                                         VirtioNetRscSeg *seg,
2309                                         const uint8_t *buf, size_t size,
2310                                         VirtioNetRscUnit *unit)
2311 {
2312     struct ip6_header *ip1, *ip2;
2313 
2314     ip1 = (struct ip6_header *)(unit->ip);
2315     ip2 = (struct ip6_header *)(seg->unit.ip);
2316     if (memcmp(&ip1->ip6_src, &ip2->ip6_src, sizeof(struct in6_address))
2317         || memcmp(&ip1->ip6_dst, &ip2->ip6_dst, sizeof(struct in6_address))
2318         || (unit->tcp->th_sport ^ seg->unit.tcp->th_sport)
2319         || (unit->tcp->th_dport ^ seg->unit.tcp->th_dport)) {
2320             chain->stat.no_match++;
2321             return RSC_NO_MATCH;
2322     }
2323 
2324     return virtio_net_rsc_coalesce_data(chain, seg, buf, unit);
2325 }
2326 
2327 /* Packets with 'SYN' should bypass, other flag should be sent after drain
2328  * to prevent out of order */
2329 static int virtio_net_rsc_tcp_ctrl_check(VirtioNetRscChain *chain,
2330                                          struct tcp_header *tcp)
2331 {
2332     uint16_t tcp_hdr;
2333     uint16_t tcp_flag;
2334 
2335     tcp_flag = htons(tcp->th_offset_flags);
2336     tcp_hdr = (tcp_flag & VIRTIO_NET_TCP_HDR_LENGTH) >> 10;
2337     tcp_flag &= VIRTIO_NET_TCP_FLAG;
2338     if (tcp_flag & TH_SYN) {
2339         chain->stat.tcp_syn++;
2340         return RSC_BYPASS;
2341     }
2342 
2343     if (tcp_flag & (TH_FIN | TH_URG | TH_RST | TH_ECE | TH_CWR)) {
2344         chain->stat.tcp_ctrl_drain++;
2345         return RSC_FINAL;
2346     }
2347 
2348     if (tcp_hdr > sizeof(struct tcp_header)) {
2349         chain->stat.tcp_all_opt++;
2350         return RSC_FINAL;
2351     }
2352 
2353     return RSC_CANDIDATE;
2354 }
2355 
2356 static size_t virtio_net_rsc_do_coalesce(VirtioNetRscChain *chain,
2357                                          NetClientState *nc,
2358                                          const uint8_t *buf, size_t size,
2359                                          VirtioNetRscUnit *unit)
2360 {
2361     int ret;
2362     VirtioNetRscSeg *seg, *nseg;
2363 
2364     if (QTAILQ_EMPTY(&chain->buffers)) {
2365         chain->stat.empty_cache++;
2366         virtio_net_rsc_cache_buf(chain, nc, buf, size);
2367         timer_mod(chain->drain_timer,
2368               qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + chain->n->rsc_timeout);
2369         return size;
2370     }
2371 
2372     QTAILQ_FOREACH_SAFE(seg, &chain->buffers, next, nseg) {
2373         if (chain->proto == ETH_P_IP) {
2374             ret = virtio_net_rsc_coalesce4(chain, seg, buf, size, unit);
2375         } else {
2376             ret = virtio_net_rsc_coalesce6(chain, seg, buf, size, unit);
2377         }
2378 
2379         if (ret == RSC_FINAL) {
2380             if (virtio_net_rsc_drain_seg(chain, seg) == 0) {
2381                 /* Send failed */
2382                 chain->stat.final_failed++;
2383                 return 0;
2384             }
2385 
2386             /* Send current packet */
2387             return virtio_net_do_receive(nc, buf, size);
2388         } else if (ret == RSC_NO_MATCH) {
2389             continue;
2390         } else {
2391             /* Coalesced, mark coalesced flag to tell calc cksum for ipv4 */
2392             seg->is_coalesced = 1;
2393             return size;
2394         }
2395     }
2396 
2397     chain->stat.no_match_cache++;
2398     virtio_net_rsc_cache_buf(chain, nc, buf, size);
2399     return size;
2400 }
2401 
2402 /* Drain a connection data, this is to avoid out of order segments */
2403 static size_t virtio_net_rsc_drain_flow(VirtioNetRscChain *chain,
2404                                         NetClientState *nc,
2405                                         const uint8_t *buf, size_t size,
2406                                         uint16_t ip_start, uint16_t ip_size,
2407                                         uint16_t tcp_port)
2408 {
2409     VirtioNetRscSeg *seg, *nseg;
2410     uint32_t ppair1, ppair2;
2411 
2412     ppair1 = *(uint32_t *)(buf + tcp_port);
2413     QTAILQ_FOREACH_SAFE(seg, &chain->buffers, next, nseg) {
2414         ppair2 = *(uint32_t *)(seg->buf + tcp_port);
2415         if (memcmp(buf + ip_start, seg->buf + ip_start, ip_size)
2416             || (ppair1 != ppair2)) {
2417             continue;
2418         }
2419         if (virtio_net_rsc_drain_seg(chain, seg) == 0) {
2420             chain->stat.drain_failed++;
2421         }
2422 
2423         break;
2424     }
2425 
2426     return virtio_net_do_receive(nc, buf, size);
2427 }
2428 
2429 static int32_t virtio_net_rsc_sanity_check4(VirtioNetRscChain *chain,
2430                                             struct ip_header *ip,
2431                                             const uint8_t *buf, size_t size)
2432 {
2433     uint16_t ip_len;
2434 
2435     /* Not an ipv4 packet */
2436     if (((ip->ip_ver_len & 0xF0) >> 4) != IP_HEADER_VERSION_4) {
2437         chain->stat.ip_option++;
2438         return RSC_BYPASS;
2439     }
2440 
2441     /* Don't handle packets with ip option */
2442     if ((ip->ip_ver_len & 0xF) != VIRTIO_NET_IP4_HEADER_LENGTH) {
2443         chain->stat.ip_option++;
2444         return RSC_BYPASS;
2445     }
2446 
2447     if (ip->ip_p != IPPROTO_TCP) {
2448         chain->stat.bypass_not_tcp++;
2449         return RSC_BYPASS;
2450     }
2451 
2452     /* Don't handle packets with ip fragment */
2453     if (!(htons(ip->ip_off) & IP_DF)) {
2454         chain->stat.ip_frag++;
2455         return RSC_BYPASS;
2456     }
2457 
2458     /* Don't handle packets with ecn flag */
2459     if (IPTOS_ECN(ip->ip_tos)) {
2460         chain->stat.ip_ecn++;
2461         return RSC_BYPASS;
2462     }
2463 
2464     ip_len = htons(ip->ip_len);
2465     if (ip_len < (sizeof(struct ip_header) + sizeof(struct tcp_header))
2466         || ip_len > (size - chain->n->guest_hdr_len -
2467                      sizeof(struct eth_header))) {
2468         chain->stat.ip_hacked++;
2469         return RSC_BYPASS;
2470     }
2471 
2472     return RSC_CANDIDATE;
2473 }
2474 
2475 static size_t virtio_net_rsc_receive4(VirtioNetRscChain *chain,
2476                                       NetClientState *nc,
2477                                       const uint8_t *buf, size_t size)
2478 {
2479     int32_t ret;
2480     uint16_t hdr_len;
2481     VirtioNetRscUnit unit;
2482 
2483     hdr_len = ((VirtIONet *)(chain->n))->guest_hdr_len;
2484 
2485     if (size < (hdr_len + sizeof(struct eth_header) + sizeof(struct ip_header)
2486         + sizeof(struct tcp_header))) {
2487         chain->stat.bypass_not_tcp++;
2488         return virtio_net_do_receive(nc, buf, size);
2489     }
2490 
2491     virtio_net_rsc_extract_unit4(chain, buf, &unit);
2492     if (virtio_net_rsc_sanity_check4(chain, unit.ip, buf, size)
2493         != RSC_CANDIDATE) {
2494         return virtio_net_do_receive(nc, buf, size);
2495     }
2496 
2497     ret = virtio_net_rsc_tcp_ctrl_check(chain, unit.tcp);
2498     if (ret == RSC_BYPASS) {
2499         return virtio_net_do_receive(nc, buf, size);
2500     } else if (ret == RSC_FINAL) {
2501         return virtio_net_rsc_drain_flow(chain, nc, buf, size,
2502                 ((hdr_len + sizeof(struct eth_header)) + 12),
2503                 VIRTIO_NET_IP4_ADDR_SIZE,
2504                 hdr_len + sizeof(struct eth_header) + sizeof(struct ip_header));
2505     }
2506 
2507     return virtio_net_rsc_do_coalesce(chain, nc, buf, size, &unit);
2508 }
2509 
2510 static int32_t virtio_net_rsc_sanity_check6(VirtioNetRscChain *chain,
2511                                             struct ip6_header *ip6,
2512                                             const uint8_t *buf, size_t size)
2513 {
2514     uint16_t ip_len;
2515 
2516     if (((ip6->ip6_ctlun.ip6_un1.ip6_un1_flow & 0xF0) >> 4)
2517         != IP_HEADER_VERSION_6) {
2518         return RSC_BYPASS;
2519     }
2520 
2521     /* Both option and protocol is checked in this */
2522     if (ip6->ip6_ctlun.ip6_un1.ip6_un1_nxt != IPPROTO_TCP) {
2523         chain->stat.bypass_not_tcp++;
2524         return RSC_BYPASS;
2525     }
2526 
2527     ip_len = htons(ip6->ip6_ctlun.ip6_un1.ip6_un1_plen);
2528     if (ip_len < sizeof(struct tcp_header) ||
2529         ip_len > (size - chain->n->guest_hdr_len - sizeof(struct eth_header)
2530                   - sizeof(struct ip6_header))) {
2531         chain->stat.ip_hacked++;
2532         return RSC_BYPASS;
2533     }
2534 
2535     /* Don't handle packets with ecn flag */
2536     if (IP6_ECN(ip6->ip6_ctlun.ip6_un3.ip6_un3_ecn)) {
2537         chain->stat.ip_ecn++;
2538         return RSC_BYPASS;
2539     }
2540 
2541     return RSC_CANDIDATE;
2542 }
2543 
2544 static size_t virtio_net_rsc_receive6(void *opq, NetClientState *nc,
2545                                       const uint8_t *buf, size_t size)
2546 {
2547     int32_t ret;
2548     uint16_t hdr_len;
2549     VirtioNetRscChain *chain;
2550     VirtioNetRscUnit unit;
2551 
2552     chain = opq;
2553     hdr_len = ((VirtIONet *)(chain->n))->guest_hdr_len;
2554 
2555     if (size < (hdr_len + sizeof(struct eth_header) + sizeof(struct ip6_header)
2556         + sizeof(tcp_header))) {
2557         return virtio_net_do_receive(nc, buf, size);
2558     }
2559 
2560     virtio_net_rsc_extract_unit6(chain, buf, &unit);
2561     if (RSC_CANDIDATE != virtio_net_rsc_sanity_check6(chain,
2562                                                  unit.ip, buf, size)) {
2563         return virtio_net_do_receive(nc, buf, size);
2564     }
2565 
2566     ret = virtio_net_rsc_tcp_ctrl_check(chain, unit.tcp);
2567     if (ret == RSC_BYPASS) {
2568         return virtio_net_do_receive(nc, buf, size);
2569     } else if (ret == RSC_FINAL) {
2570         return virtio_net_rsc_drain_flow(chain, nc, buf, size,
2571                 ((hdr_len + sizeof(struct eth_header)) + 8),
2572                 VIRTIO_NET_IP6_ADDR_SIZE,
2573                 hdr_len + sizeof(struct eth_header)
2574                 + sizeof(struct ip6_header));
2575     }
2576 
2577     return virtio_net_rsc_do_coalesce(chain, nc, buf, size, &unit);
2578 }
2579 
2580 static VirtioNetRscChain *virtio_net_rsc_lookup_chain(VirtIONet *n,
2581                                                       NetClientState *nc,
2582                                                       uint16_t proto)
2583 {
2584     VirtioNetRscChain *chain;
2585 
2586     if ((proto != (uint16_t)ETH_P_IP) && (proto != (uint16_t)ETH_P_IPV6)) {
2587         return NULL;
2588     }
2589 
2590     QTAILQ_FOREACH(chain, &n->rsc_chains, next) {
2591         if (chain->proto == proto) {
2592             return chain;
2593         }
2594     }
2595 
2596     chain = g_malloc(sizeof(*chain));
2597     chain->n = n;
2598     chain->proto = proto;
2599     if (proto == (uint16_t)ETH_P_IP) {
2600         chain->max_payload = VIRTIO_NET_MAX_IP4_PAYLOAD;
2601         chain->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
2602     } else {
2603         chain->max_payload = VIRTIO_NET_MAX_IP6_PAYLOAD;
2604         chain->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
2605     }
2606     chain->drain_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
2607                                       virtio_net_rsc_purge, chain);
2608     memset(&chain->stat, 0, sizeof(chain->stat));
2609 
2610     QTAILQ_INIT(&chain->buffers);
2611     QTAILQ_INSERT_TAIL(&n->rsc_chains, chain, next);
2612 
2613     return chain;
2614 }
2615 
2616 static ssize_t virtio_net_rsc_receive(NetClientState *nc,
2617                                       const uint8_t *buf,
2618                                       size_t size)
2619 {
2620     uint16_t proto;
2621     VirtioNetRscChain *chain;
2622     struct eth_header *eth;
2623     VirtIONet *n;
2624 
2625     n = qemu_get_nic_opaque(nc);
2626     if (size < (n->host_hdr_len + sizeof(struct eth_header))) {
2627         return virtio_net_do_receive(nc, buf, size);
2628     }
2629 
2630     eth = (struct eth_header *)(buf + n->guest_hdr_len);
2631     proto = htons(eth->h_proto);
2632 
2633     chain = virtio_net_rsc_lookup_chain(n, nc, proto);
2634     if (chain) {
2635         chain->stat.received++;
2636         if (proto == (uint16_t)ETH_P_IP && n->rsc4_enabled) {
2637             return virtio_net_rsc_receive4(chain, nc, buf, size);
2638         } else if (proto == (uint16_t)ETH_P_IPV6 && n->rsc6_enabled) {
2639             return virtio_net_rsc_receive6(chain, nc, buf, size);
2640         }
2641     }
2642     return virtio_net_do_receive(nc, buf, size);
2643 }
2644 
2645 static ssize_t virtio_net_receive(NetClientState *nc, const uint8_t *buf,
2646                                   size_t size)
2647 {
2648     VirtIONet *n = qemu_get_nic_opaque(nc);
2649     if ((n->rsc4_enabled || n->rsc6_enabled)) {
2650         return virtio_net_rsc_receive(nc, buf, size);
2651     } else {
2652         return virtio_net_do_receive(nc, buf, size);
2653     }
2654 }
2655 
2656 static int32_t virtio_net_flush_tx(VirtIONetQueue *q);
2657 
2658 static void virtio_net_tx_complete(NetClientState *nc, ssize_t len)
2659 {
2660     VirtIONet *n = qemu_get_nic_opaque(nc);
2661     VirtIONetQueue *q = virtio_net_get_subqueue(nc);
2662     VirtIODevice *vdev = VIRTIO_DEVICE(n);
2663     int ret;
2664 
2665     virtqueue_push(q->tx_vq, q->async_tx.elem, 0);
2666     virtio_notify(vdev, q->tx_vq);
2667 
2668     g_free(q->async_tx.elem);
2669     q->async_tx.elem = NULL;
2670 
2671     virtio_queue_set_notification(q->tx_vq, 1);
2672     ret = virtio_net_flush_tx(q);
2673     if (ret >= n->tx_burst) {
2674         /*
2675          * the flush has been stopped by tx_burst
2676          * we will not receive notification for the
2677          * remainining part, so re-schedule
2678          */
2679         virtio_queue_set_notification(q->tx_vq, 0);
2680         if (q->tx_bh) {
2681             replay_bh_schedule_event(q->tx_bh);
2682         } else {
2683             timer_mod(q->tx_timer,
2684                       qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + n->tx_timeout);
2685         }
2686         q->tx_waiting = 1;
2687     }
2688 }
2689 
2690 /* TX */
2691 static int32_t virtio_net_flush_tx(VirtIONetQueue *q)
2692 {
2693     VirtIONet *n = q->n;
2694     VirtIODevice *vdev = VIRTIO_DEVICE(n);
2695     VirtQueueElement *elem;
2696     int32_t num_packets = 0;
2697     int queue_index = vq2q(virtio_get_queue_index(q->tx_vq));
2698     if (!(vdev->status & VIRTIO_CONFIG_S_DRIVER_OK)) {
2699         return num_packets;
2700     }
2701 
2702     if (q->async_tx.elem) {
2703         virtio_queue_set_notification(q->tx_vq, 0);
2704         return num_packets;
2705     }
2706 
2707     for (;;) {
2708         ssize_t ret;
2709         unsigned int out_num;
2710         struct iovec sg[VIRTQUEUE_MAX_SIZE], sg2[VIRTQUEUE_MAX_SIZE + 1], *out_sg;
2711         struct virtio_net_hdr vhdr;
2712 
2713         elem = virtqueue_pop(q->tx_vq, sizeof(VirtQueueElement));
2714         if (!elem) {
2715             break;
2716         }
2717 
2718         out_num = elem->out_num;
2719         out_sg = elem->out_sg;
2720         if (out_num < 1) {
2721             virtio_error(vdev, "virtio-net header not in first element");
2722             goto detach;
2723         }
2724 
2725         if (n->needs_vnet_hdr_swap) {
2726             if (iov_to_buf(out_sg, out_num, 0, &vhdr, sizeof(vhdr)) <
2727                 sizeof(vhdr)) {
2728                 virtio_error(vdev, "virtio-net header incorrect");
2729                 goto detach;
2730             }
2731             virtio_net_hdr_swap(vdev, &vhdr);
2732             sg2[0].iov_base = &vhdr;
2733             sg2[0].iov_len = sizeof(vhdr);
2734             out_num = iov_copy(&sg2[1], ARRAY_SIZE(sg2) - 1, out_sg, out_num,
2735                                sizeof(vhdr), -1);
2736             if (out_num == VIRTQUEUE_MAX_SIZE) {
2737                 goto drop;
2738             }
2739             out_num += 1;
2740             out_sg = sg2;
2741         }
2742         /*
2743          * If host wants to see the guest header as is, we can
2744          * pass it on unchanged. Otherwise, copy just the parts
2745          * that host is interested in.
2746          */
2747         assert(n->host_hdr_len <= n->guest_hdr_len);
2748         if (n->host_hdr_len != n->guest_hdr_len) {
2749             if (iov_size(out_sg, out_num) < n->guest_hdr_len) {
2750                 virtio_error(vdev, "virtio-net header is invalid");
2751                 goto detach;
2752             }
2753             unsigned sg_num = iov_copy(sg, ARRAY_SIZE(sg),
2754                                        out_sg, out_num,
2755                                        0, n->host_hdr_len);
2756             sg_num += iov_copy(sg + sg_num, ARRAY_SIZE(sg) - sg_num,
2757                              out_sg, out_num,
2758                              n->guest_hdr_len, -1);
2759             out_num = sg_num;
2760             out_sg = sg;
2761 
2762             if (out_num < 1) {
2763                 virtio_error(vdev, "virtio-net nothing to send");
2764                 goto detach;
2765             }
2766         }
2767 
2768         ret = qemu_sendv_packet_async(qemu_get_subqueue(n->nic, queue_index),
2769                                       out_sg, out_num, virtio_net_tx_complete);
2770         if (ret == 0) {
2771             virtio_queue_set_notification(q->tx_vq, 0);
2772             q->async_tx.elem = elem;
2773             return -EBUSY;
2774         }
2775 
2776 drop:
2777         virtqueue_push(q->tx_vq, elem, 0);
2778         virtio_notify(vdev, q->tx_vq);
2779         g_free(elem);
2780 
2781         if (++num_packets >= n->tx_burst) {
2782             break;
2783         }
2784     }
2785     return num_packets;
2786 
2787 detach:
2788     virtqueue_detach_element(q->tx_vq, elem, 0);
2789     g_free(elem);
2790     return -EINVAL;
2791 }
2792 
2793 static void virtio_net_tx_timer(void *opaque);
2794 
2795 static void virtio_net_handle_tx_timer(VirtIODevice *vdev, VirtQueue *vq)
2796 {
2797     VirtIONet *n = VIRTIO_NET(vdev);
2798     VirtIONetQueue *q = &n->vqs[vq2q(virtio_get_queue_index(vq))];
2799 
2800     if (unlikely((n->status & VIRTIO_NET_S_LINK_UP) == 0)) {
2801         virtio_net_drop_tx_queue_data(vdev, vq);
2802         return;
2803     }
2804 
2805     /* This happens when device was stopped but VCPU wasn't. */
2806     if (!vdev->vm_running) {
2807         q->tx_waiting = 1;
2808         return;
2809     }
2810 
2811     if (q->tx_waiting) {
2812         /* We already have queued packets, immediately flush */
2813         timer_del(q->tx_timer);
2814         virtio_net_tx_timer(q);
2815     } else {
2816         /* re-arm timer to flush it (and more) on next tick */
2817         timer_mod(q->tx_timer,
2818                   qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + n->tx_timeout);
2819         q->tx_waiting = 1;
2820         virtio_queue_set_notification(vq, 0);
2821     }
2822 }
2823 
2824 static void virtio_net_handle_tx_bh(VirtIODevice *vdev, VirtQueue *vq)
2825 {
2826     VirtIONet *n = VIRTIO_NET(vdev);
2827     VirtIONetQueue *q = &n->vqs[vq2q(virtio_get_queue_index(vq))];
2828 
2829     if (unlikely(n->vhost_started)) {
2830         return;
2831     }
2832 
2833     if (unlikely((n->status & VIRTIO_NET_S_LINK_UP) == 0)) {
2834         virtio_net_drop_tx_queue_data(vdev, vq);
2835         return;
2836     }
2837 
2838     if (unlikely(q->tx_waiting)) {
2839         return;
2840     }
2841     q->tx_waiting = 1;
2842     /* This happens when device was stopped but VCPU wasn't. */
2843     if (!vdev->vm_running) {
2844         return;
2845     }
2846     virtio_queue_set_notification(vq, 0);
2847     replay_bh_schedule_event(q->tx_bh);
2848 }
2849 
2850 static void virtio_net_tx_timer(void *opaque)
2851 {
2852     VirtIONetQueue *q = opaque;
2853     VirtIONet *n = q->n;
2854     VirtIODevice *vdev = VIRTIO_DEVICE(n);
2855     int ret;
2856 
2857     /* This happens when device was stopped but BH wasn't. */
2858     if (!vdev->vm_running) {
2859         /* Make sure tx waiting is set, so we'll run when restarted. */
2860         assert(q->tx_waiting);
2861         return;
2862     }
2863 
2864     q->tx_waiting = 0;
2865 
2866     /* Just in case the driver is not ready on more */
2867     if (!(vdev->status & VIRTIO_CONFIG_S_DRIVER_OK)) {
2868         return;
2869     }
2870 
2871     ret = virtio_net_flush_tx(q);
2872     if (ret == -EBUSY || ret == -EINVAL) {
2873         return;
2874     }
2875     /*
2876      * If we flush a full burst of packets, assume there are
2877      * more coming and immediately rearm
2878      */
2879     if (ret >= n->tx_burst) {
2880         q->tx_waiting = 1;
2881         timer_mod(q->tx_timer,
2882                   qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + n->tx_timeout);
2883         return;
2884     }
2885     /*
2886      * If less than a full burst, re-enable notification and flush
2887      * anything that may have come in while we weren't looking.  If
2888      * we find something, assume the guest is still active and rearm
2889      */
2890     virtio_queue_set_notification(q->tx_vq, 1);
2891     ret = virtio_net_flush_tx(q);
2892     if (ret > 0) {
2893         virtio_queue_set_notification(q->tx_vq, 0);
2894         q->tx_waiting = 1;
2895         timer_mod(q->tx_timer,
2896                   qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + n->tx_timeout);
2897     }
2898 }
2899 
2900 static void virtio_net_tx_bh(void *opaque)
2901 {
2902     VirtIONetQueue *q = opaque;
2903     VirtIONet *n = q->n;
2904     VirtIODevice *vdev = VIRTIO_DEVICE(n);
2905     int32_t ret;
2906 
2907     /* This happens when device was stopped but BH wasn't. */
2908     if (!vdev->vm_running) {
2909         /* Make sure tx waiting is set, so we'll run when restarted. */
2910         assert(q->tx_waiting);
2911         return;
2912     }
2913 
2914     q->tx_waiting = 0;
2915 
2916     /* Just in case the driver is not ready on more */
2917     if (unlikely(!(vdev->status & VIRTIO_CONFIG_S_DRIVER_OK))) {
2918         return;
2919     }
2920 
2921     ret = virtio_net_flush_tx(q);
2922     if (ret == -EBUSY || ret == -EINVAL) {
2923         return; /* Notification re-enable handled by tx_complete or device
2924                  * broken */
2925     }
2926 
2927     /* If we flush a full burst of packets, assume there are
2928      * more coming and immediately reschedule */
2929     if (ret >= n->tx_burst) {
2930         replay_bh_schedule_event(q->tx_bh);
2931         q->tx_waiting = 1;
2932         return;
2933     }
2934 
2935     /* If less than a full burst, re-enable notification and flush
2936      * anything that may have come in while we weren't looking.  If
2937      * we find something, assume the guest is still active and reschedule */
2938     virtio_queue_set_notification(q->tx_vq, 1);
2939     ret = virtio_net_flush_tx(q);
2940     if (ret == -EINVAL) {
2941         return;
2942     } else if (ret > 0) {
2943         virtio_queue_set_notification(q->tx_vq, 0);
2944         replay_bh_schedule_event(q->tx_bh);
2945         q->tx_waiting = 1;
2946     }
2947 }
2948 
2949 static void virtio_net_add_queue(VirtIONet *n, int index)
2950 {
2951     VirtIODevice *vdev = VIRTIO_DEVICE(n);
2952 
2953     n->vqs[index].rx_vq = virtio_add_queue(vdev, n->net_conf.rx_queue_size,
2954                                            virtio_net_handle_rx);
2955 
2956     if (n->net_conf.tx && !strcmp(n->net_conf.tx, "timer")) {
2957         n->vqs[index].tx_vq =
2958             virtio_add_queue(vdev, n->net_conf.tx_queue_size,
2959                              virtio_net_handle_tx_timer);
2960         n->vqs[index].tx_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
2961                                               virtio_net_tx_timer,
2962                                               &n->vqs[index]);
2963     } else {
2964         n->vqs[index].tx_vq =
2965             virtio_add_queue(vdev, n->net_conf.tx_queue_size,
2966                              virtio_net_handle_tx_bh);
2967         n->vqs[index].tx_bh = qemu_bh_new_guarded(virtio_net_tx_bh, &n->vqs[index],
2968                                                   &DEVICE(vdev)->mem_reentrancy_guard);
2969     }
2970 
2971     n->vqs[index].tx_waiting = 0;
2972     n->vqs[index].n = n;
2973 }
2974 
2975 static void virtio_net_del_queue(VirtIONet *n, int index)
2976 {
2977     VirtIODevice *vdev = VIRTIO_DEVICE(n);
2978     VirtIONetQueue *q = &n->vqs[index];
2979     NetClientState *nc = qemu_get_subqueue(n->nic, index);
2980 
2981     qemu_purge_queued_packets(nc);
2982 
2983     virtio_del_queue(vdev, index * 2);
2984     if (q->tx_timer) {
2985         timer_free(q->tx_timer);
2986         q->tx_timer = NULL;
2987     } else {
2988         qemu_bh_delete(q->tx_bh);
2989         q->tx_bh = NULL;
2990     }
2991     q->tx_waiting = 0;
2992     virtio_del_queue(vdev, index * 2 + 1);
2993 }
2994 
2995 static void virtio_net_change_num_queue_pairs(VirtIONet *n, int new_max_queue_pairs)
2996 {
2997     VirtIODevice *vdev = VIRTIO_DEVICE(n);
2998     int old_num_queues = virtio_get_num_queues(vdev);
2999     int new_num_queues = new_max_queue_pairs * 2 + 1;
3000     int i;
3001 
3002     assert(old_num_queues >= 3);
3003     assert(old_num_queues % 2 == 1);
3004 
3005     if (old_num_queues == new_num_queues) {
3006         return;
3007     }
3008 
3009     /*
3010      * We always need to remove and add ctrl vq if
3011      * old_num_queues != new_num_queues. Remove ctrl_vq first,
3012      * and then we only enter one of the following two loops.
3013      */
3014     virtio_del_queue(vdev, old_num_queues - 1);
3015 
3016     for (i = new_num_queues - 1; i < old_num_queues - 1; i += 2) {
3017         /* new_num_queues < old_num_queues */
3018         virtio_net_del_queue(n, i / 2);
3019     }
3020 
3021     for (i = old_num_queues - 1; i < new_num_queues - 1; i += 2) {
3022         /* new_num_queues > old_num_queues */
3023         virtio_net_add_queue(n, i / 2);
3024     }
3025 
3026     /* add ctrl_vq last */
3027     n->ctrl_vq = virtio_add_queue(vdev, 64, virtio_net_handle_ctrl);
3028 }
3029 
3030 static void virtio_net_set_multiqueue(VirtIONet *n, int multiqueue)
3031 {
3032     int max = multiqueue ? n->max_queue_pairs : 1;
3033 
3034     n->multiqueue = multiqueue;
3035     virtio_net_change_num_queue_pairs(n, max);
3036 
3037     virtio_net_set_queue_pairs(n);
3038 }
3039 
3040 static int virtio_net_post_load_device(void *opaque, int version_id)
3041 {
3042     VirtIONet *n = opaque;
3043     VirtIODevice *vdev = VIRTIO_DEVICE(n);
3044     int i, link_down;
3045 
3046     trace_virtio_net_post_load_device();
3047     virtio_net_set_mrg_rx_bufs(n, n->mergeable_rx_bufs,
3048                                virtio_vdev_has_feature(vdev,
3049                                                        VIRTIO_F_VERSION_1),
3050                                virtio_vdev_has_feature(vdev,
3051                                                        VIRTIO_NET_F_HASH_REPORT));
3052 
3053     /* MAC_TABLE_ENTRIES may be different from the saved image */
3054     if (n->mac_table.in_use > MAC_TABLE_ENTRIES) {
3055         n->mac_table.in_use = 0;
3056     }
3057 
3058     if (!virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS)) {
3059         n->curr_guest_offloads = virtio_net_supported_guest_offloads(n);
3060     }
3061 
3062     /*
3063      * curr_guest_offloads will be later overwritten by the
3064      * virtio_set_features_nocheck call done from the virtio_load.
3065      * Here we make sure it is preserved and restored accordingly
3066      * in the virtio_net_post_load_virtio callback.
3067      */
3068     n->saved_guest_offloads = n->curr_guest_offloads;
3069 
3070     virtio_net_set_queue_pairs(n);
3071 
3072     /* Find the first multicast entry in the saved MAC filter */
3073     for (i = 0; i < n->mac_table.in_use; i++) {
3074         if (n->mac_table.macs[i * ETH_ALEN] & 1) {
3075             break;
3076         }
3077     }
3078     n->mac_table.first_multi = i;
3079 
3080     /* nc.link_down can't be migrated, so infer link_down according
3081      * to link status bit in n->status */
3082     link_down = (n->status & VIRTIO_NET_S_LINK_UP) == 0;
3083     for (i = 0; i < n->max_queue_pairs; i++) {
3084         qemu_get_subqueue(n->nic, i)->link_down = link_down;
3085     }
3086 
3087     if (virtio_vdev_has_feature(vdev, VIRTIO_NET_F_GUEST_ANNOUNCE) &&
3088         virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ)) {
3089         qemu_announce_timer_reset(&n->announce_timer, migrate_announce_params(),
3090                                   QEMU_CLOCK_VIRTUAL,
3091                                   virtio_net_announce_timer, n);
3092         if (n->announce_timer.round) {
3093             timer_mod(n->announce_timer.tm,
3094                       qemu_clock_get_ms(n->announce_timer.type));
3095         } else {
3096             qemu_announce_timer_del(&n->announce_timer, false);
3097         }
3098     }
3099 
3100     virtio_net_commit_rss_config(n);
3101     return 0;
3102 }
3103 
3104 static int virtio_net_post_load_virtio(VirtIODevice *vdev)
3105 {
3106     VirtIONet *n = VIRTIO_NET(vdev);
3107     /*
3108      * The actual needed state is now in saved_guest_offloads,
3109      * see virtio_net_post_load_device for detail.
3110      * Restore it back and apply the desired offloads.
3111      */
3112     n->curr_guest_offloads = n->saved_guest_offloads;
3113     if (peer_has_vnet_hdr(n)) {
3114         virtio_net_apply_guest_offloads(n);
3115     }
3116 
3117     return 0;
3118 }
3119 
/*
 * Migration description of a single VirtIONetQueue: only its tx_waiting
 * field is transferred, as a 32-bit value.  Used for the first queue by
 * vmstate_virtio_net_device and for the remaining ones by
 * vmstate_virtio_net_tx_waiting.
 */
static const VMStateDescription vmstate_virtio_net_queue_tx_waiting = {
    .name = "virtio-net-queue-tx_waiting",
    .fields = (const VMStateField[]) {
        VMSTATE_UINT32(tx_waiting, VirtIONetQueue),
        VMSTATE_END_OF_LIST()
   },
};
3128 
3129 static bool max_queue_pairs_gt_1(void *opaque, int version_id)
3130 {
3131     return VIRTIO_NET(opaque)->max_queue_pairs > 1;
3132 }
3133 
3134 static bool has_ctrl_guest_offloads(void *opaque, int version_id)
3135 {
3136     return virtio_vdev_has_feature(VIRTIO_DEVICE(opaque),
3137                                    VIRTIO_NET_F_CTRL_GUEST_OFFLOADS);
3138 }
3139 
3140 static bool mac_table_fits(void *opaque, int version_id)
3141 {
3142     return VIRTIO_NET(opaque)->mac_table.in_use <= MAC_TABLE_ENTRIES;
3143 }
3144 
3145 static bool mac_table_doesnt_fit(void *opaque, int version_id)
3146 {
3147     return !mac_table_fits(opaque, version_id);
3148 }
3149 
/* This temporary type is shared by all the WITH_TMP methods
 * although only some fields are used by each.
 */
struct VirtIONetMigTmp {
    /* The device being migrated; read by every pre_save/post_load hook. */
    VirtIONet      *parent;
    /* parent->vqs + 1, i.e. the second queue onwards (tx_waiting hooks). */
    VirtIONetQueue *vqs_1;
    /* parent->curr_queue_pairs - 1, or 0 when there are no queue pairs. */
    uint16_t        curr_queue_pairs_1;
    /* Copy of parent->has_ufo on save; checked against the peer on load. */
    uint8_t         has_ufo;
    /* Copy of parent->has_vnet_hdr on save; checked against the peer on load. */
    uint32_t        has_vnet_hdr;
};
3160 
3161 /* The 2nd and subsequent tx_waiting flags are loaded later than
3162  * the 1st entry in the queue_pairs and only if there's more than one
3163  * entry.  We use the tmp mechanism to calculate a temporary
3164  * pointer and count and also validate the count.
3165  */
3166 
3167 static int virtio_net_tx_waiting_pre_save(void *opaque)
3168 {
3169     struct VirtIONetMigTmp *tmp = opaque;
3170 
3171     tmp->vqs_1 = tmp->parent->vqs + 1;
3172     tmp->curr_queue_pairs_1 = tmp->parent->curr_queue_pairs - 1;
3173     if (tmp->parent->curr_queue_pairs == 0) {
3174         tmp->curr_queue_pairs_1 = 0;
3175     }
3176 
3177     return 0;
3178 }
3179 
3180 static int virtio_net_tx_waiting_pre_load(void *opaque)
3181 {
3182     struct VirtIONetMigTmp *tmp = opaque;
3183 
3184     /* Reuse the pointer setup from save */
3185     virtio_net_tx_waiting_pre_save(opaque);
3186 
3187     if (tmp->parent->curr_queue_pairs > tmp->parent->max_queue_pairs) {
3188         error_report("virtio-net: curr_queue_pairs %x > max_queue_pairs %x",
3189             tmp->parent->curr_queue_pairs, tmp->parent->max_queue_pairs);
3190 
3191         return -EINVAL;
3192     }
3193 
3194     return 0; /* all good */
3195 }
3196 
/*
 * WITH_TMP description for the tx_waiting flags of the queues starting
 * at vqs[1]: curr_queue_pairs_1 entries, each described by
 * vmstate_virtio_net_queue_tx_waiting.  The pointer and count are filled
 * in by the pre_save/pre_load hooks.
 */
static const VMStateDescription vmstate_virtio_net_tx_waiting = {
    .name      = "virtio-net-tx_waiting",
    .pre_load  = virtio_net_tx_waiting_pre_load,
    .pre_save  = virtio_net_tx_waiting_pre_save,
    .fields    = (const VMStateField[]) {
        VMSTATE_STRUCT_VARRAY_POINTER_UINT16(vqs_1, struct VirtIONetMigTmp,
                                     curr_queue_pairs_1,
                                     vmstate_virtio_net_queue_tx_waiting,
                                     struct VirtIONetQueue),
        VMSTATE_END_OF_LIST()
    },
};
3209 
3210 /* the 'has_ufo' flag is just tested; if the incoming stream has the
3211  * flag set we need to check that we have it
3212  */
3213 static int virtio_net_ufo_post_load(void *opaque, int version_id)
3214 {
3215     struct VirtIONetMigTmp *tmp = opaque;
3216 
3217     if (tmp->has_ufo && !peer_has_ufo(tmp->parent)) {
3218         error_report("virtio-net: saved image requires TUN_F_UFO support");
3219         return -EINVAL;
3220     }
3221 
3222     return 0;
3223 }
3224 
3225 static int virtio_net_ufo_pre_save(void *opaque)
3226 {
3227     struct VirtIONetMigTmp *tmp = opaque;
3228 
3229     tmp->has_ufo = tmp->parent->has_ufo;
3230 
3231     return 0;
3232 }
3233 
/*
 * WITH_TMP description carrying the single has_ufo byte; the value is
 * produced by virtio_net_ufo_pre_save() and validated by
 * virtio_net_ufo_post_load().
 */
static const VMStateDescription vmstate_virtio_net_has_ufo = {
    .name      = "virtio-net-ufo",
    .post_load = virtio_net_ufo_post_load,
    .pre_save  = virtio_net_ufo_pre_save,
    .fields    = (const VMStateField[]) {
        VMSTATE_UINT8(has_ufo, struct VirtIONetMigTmp),
        VMSTATE_END_OF_LIST()
    },
};
3243 
3244 /* the 'has_vnet_hdr' flag is just tested; if the incoming stream has the
3245  * flag set we need to check that we have it
3246  */
3247 static int virtio_net_vnet_post_load(void *opaque, int version_id)
3248 {
3249     struct VirtIONetMigTmp *tmp = opaque;
3250 
3251     if (tmp->has_vnet_hdr && !peer_has_vnet_hdr(tmp->parent)) {
3252         error_report("virtio-net: saved image requires vnet_hdr=on");
3253         return -EINVAL;
3254     }
3255 
3256     return 0;
3257 }
3258 
3259 static int virtio_net_vnet_pre_save(void *opaque)
3260 {
3261     struct VirtIONetMigTmp *tmp = opaque;
3262 
3263     tmp->has_vnet_hdr = tmp->parent->has_vnet_hdr;
3264 
3265     return 0;
3266 }
3267 
/*
 * WITH_TMP description carrying the 32-bit has_vnet_hdr value; it is
 * produced by virtio_net_vnet_pre_save() and validated by
 * virtio_net_vnet_post_load().
 */
static const VMStateDescription vmstate_virtio_net_has_vnet = {
    .name      = "virtio-net-vnet",
    .post_load = virtio_net_vnet_post_load,
    .pre_save  = virtio_net_vnet_pre_save,
    .fields    = (const VMStateField[]) {
        VMSTATE_UINT32(has_vnet_hdr, struct VirtIONetMigTmp),
        VMSTATE_END_OF_LIST()
    },
};
3277 
3278 static bool virtio_net_rss_needed(void *opaque)
3279 {
3280     return VIRTIO_NET(opaque)->rss_data.enabled;
3281 }
3282 
/*
 * Optional subsection of vmstate_virtio_net_device holding rss_data.
 * It is only present when virtio_net_rss_needed() returns true.  The
 * indirection table is a variable-length array of uint16_t whose length
 * is rss_data.indirections_len.
 */
static const VMStateDescription vmstate_virtio_net_rss = {
    .name      = "virtio-net-device/rss",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = virtio_net_rss_needed,
    .fields = (const VMStateField[]) {
        VMSTATE_BOOL(rss_data.enabled, VirtIONet),
        VMSTATE_BOOL(rss_data.redirect, VirtIONet),
        VMSTATE_BOOL(rss_data.populate_hash, VirtIONet),
        VMSTATE_UINT32(rss_data.hash_types, VirtIONet),
        VMSTATE_UINT16(rss_data.indirections_len, VirtIONet),
        VMSTATE_UINT16(rss_data.default_queue, VirtIONet),
        VMSTATE_UINT8_ARRAY(rss_data.key, VirtIONet,
                            VIRTIO_NET_RSS_MAX_KEY_SIZE),
        VMSTATE_VARRAY_UINT16_ALLOC(rss_data.indirections_table, VirtIONet,
                                    rss_data.indirections_len, 0,
                                    vmstate_info_uint16, uint16_t),
        VMSTATE_END_OF_LIST()
    },
};
3303 
/*
 * Migration description of the virtio-net specific device state.
 * The order of the entries in .fields defines the stream layout, so it
 * must not be changed.  virtio_net_post_load_device() runs after the
 * fields have been loaded; the RSS state travels in an optional
 * subsection.
 */
static const VMStateDescription vmstate_virtio_net_device = {
    .name = "virtio-net-device",
    .version_id = VIRTIO_NET_VM_VERSION,
    .minimum_version_id = VIRTIO_NET_VM_VERSION,
    .post_load = virtio_net_post_load_device,
    .fields = (const VMStateField[]) {
        VMSTATE_UINT8_ARRAY(mac, VirtIONet, ETH_ALEN),
        /* tx_waiting of the first queue only; the rest follow further down */
        VMSTATE_STRUCT_POINTER(vqs, VirtIONet,
                               vmstate_virtio_net_queue_tx_waiting,
                               VirtIONetQueue),
        VMSTATE_UINT32(mergeable_rx_bufs, VirtIONet),
        VMSTATE_UINT16(status, VirtIONet),
        VMSTATE_UINT8(promisc, VirtIONet),
        VMSTATE_UINT8(allmulti, VirtIONet),
        VMSTATE_UINT32(mac_table.in_use, VirtIONet),

        /* Guarded pair: If it fits we load it, else we throw it away
         * - can happen if source has a larger MAC table.; post-load
         *  sets flags in this case.
         */
        VMSTATE_VBUFFER_MULTIPLY(mac_table.macs, VirtIONet,
                                0, mac_table_fits, mac_table.in_use,
                                 ETH_ALEN),
        VMSTATE_UNUSED_VARRAY_UINT32(VirtIONet, mac_table_doesnt_fit, 0,
                                     mac_table.in_use, ETH_ALEN),

        /* Note: This is an array of uint32's that's always been saved as a
         * buffer; hold onto your endiannesses; it's actually used as a bitmap
         * but based on the uint.
         */
        VMSTATE_BUFFER_POINTER_UNSAFE(vlans, VirtIONet, 0, MAX_VLAN >> 3),
        VMSTATE_WITH_TMP(VirtIONet, struct VirtIONetMigTmp,
                         vmstate_virtio_net_has_vnet),
        VMSTATE_UINT8(mac_table.multi_overflow, VirtIONet),
        VMSTATE_UINT8(mac_table.uni_overflow, VirtIONet),
        VMSTATE_UINT8(alluni, VirtIONet),
        VMSTATE_UINT8(nomulti, VirtIONet),
        VMSTATE_UINT8(nouni, VirtIONet),
        VMSTATE_UINT8(nobcast, VirtIONet),
        VMSTATE_WITH_TMP(VirtIONet, struct VirtIONetMigTmp,
                         vmstate_virtio_net_has_ufo),
        /* The next two fields are only in the stream with more than one pair */
        VMSTATE_SINGLE_TEST(max_queue_pairs, VirtIONet, max_queue_pairs_gt_1, 0,
                            vmstate_info_uint16_equal, uint16_t),
        VMSTATE_UINT16_TEST(curr_queue_pairs, VirtIONet, max_queue_pairs_gt_1),
        /* tx_waiting of the second and later queues */
        VMSTATE_WITH_TMP(VirtIONet, struct VirtIONetMigTmp,
                         vmstate_virtio_net_tx_waiting),
        VMSTATE_UINT64_TEST(curr_guest_offloads, VirtIONet,
                            has_ctrl_guest_offloads),
        VMSTATE_END_OF_LIST()
    },
    .subsections = (const VMStateDescription * const []) {
        &vmstate_virtio_net_rss,
        NULL
    }
};
3359 
/*
 * Net client callbacks for the virtio-net NIC: incoming packets are
 * delivered through virtio_net_receive(), gated by
 * virtio_net_can_receive().
 */
static NetClientInfo net_virtio_info = {
    .type = NET_CLIENT_DRIVER_NIC,
    .size = sizeof(NICState),
    .can_receive = virtio_net_can_receive,
    .receive = virtio_net_receive,
    .link_status_changed = virtio_net_set_link_status,
    .query_rx_filter = virtio_net_query_rxfilter,
    .announce = virtio_net_announce,
};
3369 
3370 static bool virtio_net_guest_notifier_pending(VirtIODevice *vdev, int idx)
3371 {
3372     VirtIONet *n = VIRTIO_NET(vdev);
3373     NetClientState *nc;
3374     assert(n->vhost_started);
3375     if (!n->multiqueue && idx == 2) {
3376         /* Must guard against invalid features and bogus queue index
3377          * from being set by malicious guest, or penetrated through
3378          * buggy migration stream.
3379          */
3380         if (!virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ)) {
3381             qemu_log_mask(LOG_GUEST_ERROR,
3382                           "%s: bogus vq index ignored\n", __func__);
3383             return false;
3384         }
3385         nc = qemu_get_subqueue(n->nic, n->max_queue_pairs);
3386     } else {
3387         nc = qemu_get_subqueue(n->nic, vq2q(idx));
3388     }
3389     /*
3390      * Add the check for configure interrupt, Use VIRTIO_CONFIG_IRQ_IDX -1
3391      * as the macro of configure interrupt's IDX, If this driver does not
3392      * support, the function will return false
3393      */
3394 
3395     if (idx == VIRTIO_CONFIG_IRQ_IDX) {
3396         return vhost_net_config_pending(get_vhost_net(nc->peer));
3397     }
3398     return vhost_net_virtqueue_pending(get_vhost_net(nc->peer), idx);
3399 }
3400 
3401 static void virtio_net_guest_notifier_mask(VirtIODevice *vdev, int idx,
3402                                            bool mask)
3403 {
3404     VirtIONet *n = VIRTIO_NET(vdev);
3405     NetClientState *nc;
3406     assert(n->vhost_started);
3407     if (!n->multiqueue && idx == 2) {
3408         /* Must guard against invalid features and bogus queue index
3409          * from being set by malicious guest, or penetrated through
3410          * buggy migration stream.
3411          */
3412         if (!virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ)) {
3413             qemu_log_mask(LOG_GUEST_ERROR,
3414                           "%s: bogus vq index ignored\n", __func__);
3415             return;
3416         }
3417         nc = qemu_get_subqueue(n->nic, n->max_queue_pairs);
3418     } else {
3419         nc = qemu_get_subqueue(n->nic, vq2q(idx));
3420     }
3421     /*
3422      *Add the check for configure interrupt, Use VIRTIO_CONFIG_IRQ_IDX -1
3423      * as the macro of configure interrupt's IDX, If this driver does not
3424      * support, the function will return
3425      */
3426 
3427     if (idx == VIRTIO_CONFIG_IRQ_IDX) {
3428         vhost_net_config_mask(get_vhost_net(nc->peer), vdev, mask);
3429         return;
3430     }
3431     vhost_net_virtqueue_mask(get_vhost_net(nc->peer), vdev, idx, mask);
3432 }
3433 
3434 static void virtio_net_set_config_size(VirtIONet *n, uint64_t host_features)
3435 {
3436     virtio_add_feature(&host_features, VIRTIO_NET_F_MAC);
3437 
3438     n->config_size = virtio_get_config_size(&cfg_size_params, host_features);
3439 }
3440 
3441 void virtio_net_set_netclient_name(VirtIONet *n, const char *name,
3442                                    const char *type)
3443 {
3444     /*
3445      * The name can be NULL, the netclient name will be type.x.
3446      */
3447     assert(type != NULL);
3448 
3449     g_free(n->netclient_name);
3450     g_free(n->netclient_type);
3451     n->netclient_name = g_strdup(name);
3452     n->netclient_type = g_strdup(type);
3453 }
3454 
3455 static bool failover_unplug_primary(VirtIONet *n, DeviceState *dev)
3456 {
3457     HotplugHandler *hotplug_ctrl;
3458     PCIDevice *pci_dev;
3459     Error *err = NULL;
3460 
3461     hotplug_ctrl = qdev_get_hotplug_handler(dev);
3462     if (hotplug_ctrl) {
3463         pci_dev = PCI_DEVICE(dev);
3464         pci_dev->partially_hotplugged = true;
3465         hotplug_handler_unplug_request(hotplug_ctrl, dev, &err);
3466         if (err) {
3467             error_report_err(err);
3468             return false;
3469         }
3470     } else {
3471         return false;
3472     }
3473     return true;
3474 }
3475 
3476 static bool failover_replug_primary(VirtIONet *n, DeviceState *dev,
3477                                     Error **errp)
3478 {
3479     Error *err = NULL;
3480     HotplugHandler *hotplug_ctrl;
3481     PCIDevice *pdev = PCI_DEVICE(dev);
3482     BusState *primary_bus;
3483 
3484     if (!pdev->partially_hotplugged) {
3485         return true;
3486     }
3487     primary_bus = dev->parent_bus;
3488     if (!primary_bus) {
3489         error_setg(errp, "virtio_net: couldn't find primary bus");
3490         return false;
3491     }
3492     qdev_set_parent_bus(dev, primary_bus, &error_abort);
3493     qatomic_set(&n->failover_primary_hidden, false);
3494     hotplug_ctrl = qdev_get_hotplug_handler(dev);
3495     if (hotplug_ctrl) {
3496         hotplug_handler_pre_plug(hotplug_ctrl, dev, &err);
3497         if (err) {
3498             goto out;
3499         }
3500         hotplug_handler_plug(hotplug_ctrl, dev, &err);
3501     }
3502     pdev->partially_hotplugged = false;
3503 
3504 out:
3505     error_propagate(errp, err);
3506     return !err;
3507 }
3508 
3509 static void virtio_net_handle_migration_primary(VirtIONet *n, MigrationEvent *e)
3510 {
3511     bool should_be_hidden;
3512     Error *err = NULL;
3513     DeviceState *dev = failover_find_primary_device(n);
3514 
3515     if (!dev) {
3516         return;
3517     }
3518 
3519     should_be_hidden = qatomic_read(&n->failover_primary_hidden);
3520 
3521     if (e->type == MIG_EVENT_PRECOPY_SETUP && !should_be_hidden) {
3522         if (failover_unplug_primary(n, dev)) {
3523             vmstate_unregister(VMSTATE_IF(dev), qdev_get_vmsd(dev), dev);
3524             qapi_event_send_unplug_primary(dev->id);
3525             qatomic_set(&n->failover_primary_hidden, true);
3526         } else {
3527             warn_report("couldn't unplug primary device");
3528         }
3529     } else if (e->type == MIG_EVENT_PRECOPY_FAILED) {
3530         /* We already unplugged the device let's plug it back */
3531         if (!failover_replug_primary(n, dev, &err)) {
3532             if (err) {
3533                 error_report_err(err);
3534             }
3535         }
3536     }
3537 }
3538 
3539 static int virtio_net_migration_state_notifier(NotifierWithReturn *notifier,
3540                                                MigrationEvent *e, Error **errp)
3541 {
3542     VirtIONet *n = container_of(notifier, VirtIONet, migration_state);
3543     virtio_net_handle_migration_primary(n, e);
3544     return 0;
3545 }
3546 
3547 static bool failover_hide_primary_device(DeviceListener *listener,
3548                                          const QDict *device_opts,
3549                                          bool from_json,
3550                                          Error **errp)
3551 {
3552     VirtIONet *n = container_of(listener, VirtIONet, primary_listener);
3553     const char *standby_id;
3554 
3555     if (!device_opts) {
3556         return false;
3557     }
3558 
3559     if (!qdict_haskey(device_opts, "failover_pair_id")) {
3560         return false;
3561     }
3562 
3563     if (!qdict_haskey(device_opts, "id")) {
3564         error_setg(errp, "Device with failover_pair_id needs to have id");
3565         return false;
3566     }
3567 
3568     standby_id = qdict_get_str(device_opts, "failover_pair_id");
3569     if (g_strcmp0(standby_id, n->netclient_name) != 0) {
3570         return false;
3571     }
3572 
3573     /*
3574      * The hide helper can be called several times for a given device.
3575      * Check there is only one primary for a virtio-net device but
3576      * don't duplicate the qdict several times if it's called for the same
3577      * device.
3578      */
3579     if (n->primary_opts) {
3580         const char *old, *new;
3581         /* devices with failover_pair_id always have an id */
3582         old = qdict_get_str(n->primary_opts, "id");
3583         new = qdict_get_str(device_opts, "id");
3584         if (strcmp(old, new) != 0) {
3585             error_setg(errp, "Cannot attach more than one primary device to "
3586                        "'%s': '%s' and '%s'", n->netclient_name, old, new);
3587             return false;
3588         }
3589     } else {
3590         n->primary_opts = qdict_clone_shallow(device_opts);
3591         n->primary_opts_from_json = from_json;
3592     }
3593 
3594     /* failover_primary_hidden is set during feature negotiation */
3595     return qatomic_read(&n->failover_primary_hidden);
3596 }
3597 
3598 static void virtio_net_device_realize(DeviceState *dev, Error **errp)
3599 {
3600     VirtIODevice *vdev = VIRTIO_DEVICE(dev);
3601     VirtIONet *n = VIRTIO_NET(dev);
3602     NetClientState *nc;
3603     int i;
3604 
3605     if (n->net_conf.mtu) {
3606         n->host_features |= (1ULL << VIRTIO_NET_F_MTU);
3607     }
3608 
3609     if (n->net_conf.duplex_str) {
3610         if (strncmp(n->net_conf.duplex_str, "half", 5) == 0) {
3611             n->net_conf.duplex = DUPLEX_HALF;
3612         } else if (strncmp(n->net_conf.duplex_str, "full", 5) == 0) {
3613             n->net_conf.duplex = DUPLEX_FULL;
3614         } else {
3615             error_setg(errp, "'duplex' must be 'half' or 'full'");
3616             return;
3617         }
3618         n->host_features |= (1ULL << VIRTIO_NET_F_SPEED_DUPLEX);
3619     } else {
3620         n->net_conf.duplex = DUPLEX_UNKNOWN;
3621     }
3622 
3623     if (n->net_conf.speed < SPEED_UNKNOWN) {
3624         error_setg(errp, "'speed' must be between 0 and INT_MAX");
3625         return;
3626     }
3627     if (n->net_conf.speed >= 0) {
3628         n->host_features |= (1ULL << VIRTIO_NET_F_SPEED_DUPLEX);
3629     }
3630 
3631     if (n->failover) {
3632         n->primary_listener.hide_device = failover_hide_primary_device;
3633         qatomic_set(&n->failover_primary_hidden, true);
3634         device_listener_register(&n->primary_listener);
3635         migration_add_notifier(&n->migration_state,
3636                                virtio_net_migration_state_notifier);
3637         n->host_features |= (1ULL << VIRTIO_NET_F_STANDBY);
3638     }
3639 
3640     virtio_net_set_config_size(n, n->host_features);
3641     virtio_init(vdev, VIRTIO_ID_NET, n->config_size);
3642 
3643     /*
3644      * We set a lower limit on RX queue size to what it always was.
3645      * Guests that want a smaller ring can always resize it without
3646      * help from us (using virtio 1 and up).
3647      */
3648     if (n->net_conf.rx_queue_size < VIRTIO_NET_RX_QUEUE_MIN_SIZE ||
3649         n->net_conf.rx_queue_size > VIRTQUEUE_MAX_SIZE ||
3650         !is_power_of_2(n->net_conf.rx_queue_size)) {
3651         error_setg(errp, "Invalid rx_queue_size (= %" PRIu16 "), "
3652                    "must be a power of 2 between %d and %d.",
3653                    n->net_conf.rx_queue_size, VIRTIO_NET_RX_QUEUE_MIN_SIZE,
3654                    VIRTQUEUE_MAX_SIZE);
3655         virtio_cleanup(vdev);
3656         return;
3657     }
3658 
3659     if (n->net_conf.tx_queue_size < VIRTIO_NET_TX_QUEUE_MIN_SIZE ||
3660         n->net_conf.tx_queue_size > virtio_net_max_tx_queue_size(n) ||
3661         !is_power_of_2(n->net_conf.tx_queue_size)) {
3662         error_setg(errp, "Invalid tx_queue_size (= %" PRIu16 "), "
3663                    "must be a power of 2 between %d and %d",
3664                    n->net_conf.tx_queue_size, VIRTIO_NET_TX_QUEUE_MIN_SIZE,
3665                    virtio_net_max_tx_queue_size(n));
3666         virtio_cleanup(vdev);
3667         return;
3668     }
3669 
3670     n->max_ncs = MAX(n->nic_conf.peers.queues, 1);
3671 
3672     /*
3673      * Figure out the datapath queue pairs since the backend could
3674      * provide control queue via peers as well.
3675      */
3676     if (n->nic_conf.peers.queues) {
3677         for (i = 0; i < n->max_ncs; i++) {
3678             if (n->nic_conf.peers.ncs[i]->is_datapath) {
3679                 ++n->max_queue_pairs;
3680             }
3681         }
3682     }
3683     n->max_queue_pairs = MAX(n->max_queue_pairs, 1);
3684 
3685     if (n->max_queue_pairs * 2 + 1 > VIRTIO_QUEUE_MAX) {
3686         error_setg(errp, "Invalid number of queue pairs (= %" PRIu32 "), "
3687                    "must be a positive integer less than %d.",
3688                    n->max_queue_pairs, (VIRTIO_QUEUE_MAX - 1) / 2);
3689         virtio_cleanup(vdev);
3690         return;
3691     }
3692     n->vqs = g_new0(VirtIONetQueue, n->max_queue_pairs);
3693     n->curr_queue_pairs = 1;
3694     n->tx_timeout = n->net_conf.txtimer;
3695 
3696     if (n->net_conf.tx && strcmp(n->net_conf.tx, "timer")
3697                        && strcmp(n->net_conf.tx, "bh")) {
3698         warn_report("virtio-net: "
3699                     "Unknown option tx=%s, valid options: \"timer\" \"bh\"",
3700                     n->net_conf.tx);
3701         error_printf("Defaulting to \"bh\"");
3702     }
3703 
3704     n->net_conf.tx_queue_size = MIN(virtio_net_max_tx_queue_size(n),
3705                                     n->net_conf.tx_queue_size);
3706 
3707     virtio_net_add_queue(n, 0);
3708 
3709     n->ctrl_vq = virtio_add_queue(vdev, 64, virtio_net_handle_ctrl);
3710     qemu_macaddr_default_if_unset(&n->nic_conf.macaddr);
3711     memcpy(&n->mac[0], &n->nic_conf.macaddr, sizeof(n->mac));
3712     n->status = VIRTIO_NET_S_LINK_UP;
3713     qemu_announce_timer_reset(&n->announce_timer, migrate_announce_params(),
3714                               QEMU_CLOCK_VIRTUAL,
3715                               virtio_net_announce_timer, n);
3716     n->announce_timer.round = 0;
3717 
3718     if (n->netclient_type) {
3719         /*
3720          * Happen when virtio_net_set_netclient_name has been called.
3721          */
3722         n->nic = qemu_new_nic(&net_virtio_info, &n->nic_conf,
3723                               n->netclient_type, n->netclient_name,
3724                               &dev->mem_reentrancy_guard, n);
3725     } else {
3726         n->nic = qemu_new_nic(&net_virtio_info, &n->nic_conf,
3727                               object_get_typename(OBJECT(dev)), dev->id,
3728                               &dev->mem_reentrancy_guard, n);
3729     }
3730 
3731     for (i = 0; i < n->max_queue_pairs; i++) {
3732         n->nic->ncs[i].do_not_pad = true;
3733     }
3734 
3735     peer_test_vnet_hdr(n);
3736     if (peer_has_vnet_hdr(n)) {
3737         n->host_hdr_len = sizeof(struct virtio_net_hdr);
3738     } else {
3739         n->host_hdr_len = 0;
3740     }
3741 
3742     qemu_format_nic_info_str(qemu_get_queue(n->nic), n->nic_conf.macaddr.a);
3743 
3744     n->vqs[0].tx_waiting = 0;
3745     n->tx_burst = n->net_conf.txburst;
3746     virtio_net_set_mrg_rx_bufs(n, 0, 0, 0);
3747     n->promisc = 1; /* for compatibility */
3748 
3749     n->mac_table.macs = g_malloc0(MAC_TABLE_ENTRIES * ETH_ALEN);
3750 
3751     n->vlans = g_malloc0(MAX_VLAN >> 3);
3752 
3753     nc = qemu_get_queue(n->nic);
3754     nc->rxfilter_notify_enabled = 1;
3755 
3756    if (nc->peer && nc->peer->info->type == NET_CLIENT_DRIVER_VHOST_VDPA) {
3757         struct virtio_net_config netcfg = {};
3758         memcpy(&netcfg.mac, &n->nic_conf.macaddr, ETH_ALEN);
3759         vhost_net_set_config(get_vhost_net(nc->peer),
3760             (uint8_t *)&netcfg, 0, ETH_ALEN, VHOST_SET_CONFIG_TYPE_FRONTEND);
3761     }
3762     QTAILQ_INIT(&n->rsc_chains);
3763     n->qdev = dev;
3764 
3765     net_rx_pkt_init(&n->rx_pkt);
3766 
3767     if (virtio_has_feature(n->host_features, VIRTIO_NET_F_RSS)) {
3768         Error *err = NULL;
3769         if (!virtio_net_load_ebpf(n, &err)) {
3770             /*
3771              * If user explicitly gave QEMU RSS FDs to use, then
3772              * failing to use them must be considered a fatal
3773              * error. If no RSS FDs were provided, QEMU is trying
3774              * eBPF on a "best effort" basis only, so report a
3775              * warning and allow fallback to software RSS.
3776              */
3777             if (n->ebpf_rss_fds) {
3778                 error_propagate(errp, err);
3779             } else {
3780                 warn_report("unable to load eBPF RSS: %s",
3781                             error_get_pretty(err));
3782                 error_free(err);
3783             }
3784         }
3785     }
3786 }
3787 
3788 static void virtio_net_device_unrealize(DeviceState *dev)
3789 {
3790     VirtIODevice *vdev = VIRTIO_DEVICE(dev);
3791     VirtIONet *n = VIRTIO_NET(dev);
3792     int i, max_queue_pairs;
3793 
3794     if (virtio_has_feature(n->host_features, VIRTIO_NET_F_RSS)) {
3795         virtio_net_unload_ebpf(n);
3796     }
3797 
3798     /* This will stop vhost backend if appropriate. */
3799     virtio_net_set_status(vdev, 0);
3800 
3801     g_free(n->netclient_name);
3802     n->netclient_name = NULL;
3803     g_free(n->netclient_type);
3804     n->netclient_type = NULL;
3805 
3806     g_free(n->mac_table.macs);
3807     g_free(n->vlans);
3808 
3809     if (n->failover) {
3810         qobject_unref(n->primary_opts);
3811         device_listener_unregister(&n->primary_listener);
3812         migration_remove_notifier(&n->migration_state);
3813     } else {
3814         assert(n->primary_opts == NULL);
3815     }
3816 
3817     max_queue_pairs = n->multiqueue ? n->max_queue_pairs : 1;
3818     for (i = 0; i < max_queue_pairs; i++) {
3819         virtio_net_del_queue(n, i);
3820     }
3821     /* delete also control vq */
3822     virtio_del_queue(vdev, max_queue_pairs * 2);
3823     qemu_announce_timer_del(&n->announce_timer, false);
3824     g_free(n->vqs);
3825     qemu_del_nic(n->nic);
3826     virtio_net_rsc_cleanup(n);
3827     g_free(n->rss_data.indirections_table);
3828     net_rx_pkt_uninit(n->rx_pkt);
3829     virtio_cleanup(vdev);
3830 }
3831 
3832 static void virtio_net_reset(VirtIODevice *vdev)
3833 {
3834     VirtIONet *n = VIRTIO_NET(vdev);
3835     int i;
3836 
3837     /* Reset back to compatibility mode */
3838     n->promisc = 1;
3839     n->allmulti = 0;
3840     n->alluni = 0;
3841     n->nomulti = 0;
3842     n->nouni = 0;
3843     n->nobcast = 0;
3844     /* multiqueue is disabled by default */
3845     n->curr_queue_pairs = 1;
3846     timer_del(n->announce_timer.tm);
3847     n->announce_timer.round = 0;
3848     n->status &= ~VIRTIO_NET_S_ANNOUNCE;
3849 
3850     /* Flush any MAC and VLAN filter table state */
3851     n->mac_table.in_use = 0;
3852     n->mac_table.first_multi = 0;
3853     n->mac_table.multi_overflow = 0;
3854     n->mac_table.uni_overflow = 0;
3855     memset(n->mac_table.macs, 0, MAC_TABLE_ENTRIES * ETH_ALEN);
3856     memcpy(&n->mac[0], &n->nic->conf->macaddr, sizeof(n->mac));
3857     qemu_format_nic_info_str(qemu_get_queue(n->nic), n->mac);
3858     memset(n->vlans, 0, MAX_VLAN >> 3);
3859 
3860     /* Flush any async TX */
3861     for (i = 0;  i < n->max_queue_pairs; i++) {
3862         flush_or_purge_queued_packets(qemu_get_subqueue(n->nic, i));
3863     }
3864 
3865     virtio_net_disable_rss(n);
3866 }
3867 
3868 static void virtio_net_instance_init(Object *obj)
3869 {
3870     VirtIONet *n = VIRTIO_NET(obj);
3871 
3872     /*
3873      * The default config_size is sizeof(struct virtio_net_config).
3874      * Can be overridden with virtio_net_set_config_size.
3875      */
3876     n->config_size = sizeof(struct virtio_net_config);
3877     device_add_bootindex_property(obj, &n->nic_conf.bootindex,
3878                                   "bootindex", "/ethernet-phy@0",
3879                                   DEVICE(n));
3880 
3881     ebpf_rss_init(&n->ebpf_rss);
3882 }
3883 
3884 static int virtio_net_pre_save(void *opaque)
3885 {
3886     VirtIONet *n = opaque;
3887 
3888     /* At this point, backend must be stopped, otherwise
3889      * it might keep writing to memory. */
3890     assert(!n->vhost_started);
3891 
3892     return 0;
3893 }
3894 
3895 static bool primary_unplug_pending(void *opaque)
3896 {
3897     DeviceState *dev = opaque;
3898     DeviceState *primary;
3899     VirtIODevice *vdev = VIRTIO_DEVICE(dev);
3900     VirtIONet *n = VIRTIO_NET(vdev);
3901 
3902     if (!virtio_vdev_has_feature(vdev, VIRTIO_NET_F_STANDBY)) {
3903         return false;
3904     }
3905     primary = failover_find_primary_device(n);
3906     return primary ? primary->pending_deleted_event : false;
3907 }
3908 
3909 static bool dev_unplug_pending(void *opaque)
3910 {
3911     DeviceState *dev = opaque;
3912     VirtioDeviceClass *vdc = VIRTIO_DEVICE_GET_CLASS(dev);
3913 
3914     return vdc->primary_unplug_pending(dev);
3915 }
3916 
3917 static struct vhost_dev *virtio_net_get_vhost(VirtIODevice *vdev)
3918 {
3919     VirtIONet *n = VIRTIO_NET(vdev);
3920     NetClientState *nc;
3921     struct vhost_net *net;
3922 
3923     if (!n->nic) {
3924         return NULL;
3925     }
3926 
3927     nc = qemu_get_queue(n->nic);
3928     if (!nc) {
3929         return NULL;
3930     }
3931 
3932     net = get_vhost_net(nc->peer);
3933     if (!net) {
3934         return NULL;
3935     }
3936 
3937     return &net->dev;
3938 }
3939 
3940 static const VMStateDescription vmstate_virtio_net = {
3941     .name = "virtio-net",
3942     .minimum_version_id = VIRTIO_NET_VM_VERSION,
3943     .version_id = VIRTIO_NET_VM_VERSION,
3944     .fields = (const VMStateField[]) {
3945         VMSTATE_VIRTIO_DEVICE,
3946         VMSTATE_END_OF_LIST()
3947     },
3948     .pre_save = virtio_net_pre_save,
3949     .dev_unplug_pending = dev_unplug_pending,
3950 };
3951 
/*
 * User-configurable properties of the virtio-net device.
 *
 * Most entries are DEFINE_PROP_BIT64 properties that map a property name to
 * a single VIRTIO_NET_F_* bit in VirtIONet.host_features, with the default
 * given as the last argument.  The remaining entries configure NIC, queue,
 * TX, MTU, link speed/duplex and failover settings.
 */
static Property virtio_net_properties[] = {
    /* Feature bits that default to on */
    DEFINE_PROP_BIT64("csum", VirtIONet, host_features,
                    VIRTIO_NET_F_CSUM, true),
    DEFINE_PROP_BIT64("guest_csum", VirtIONet, host_features,
                    VIRTIO_NET_F_GUEST_CSUM, true),
    DEFINE_PROP_BIT64("gso", VirtIONet, host_features, VIRTIO_NET_F_GSO, true),
    DEFINE_PROP_BIT64("guest_tso4", VirtIONet, host_features,
                    VIRTIO_NET_F_GUEST_TSO4, true),
    DEFINE_PROP_BIT64("guest_tso6", VirtIONet, host_features,
                    VIRTIO_NET_F_GUEST_TSO6, true),
    DEFINE_PROP_BIT64("guest_ecn", VirtIONet, host_features,
                    VIRTIO_NET_F_GUEST_ECN, true),
    DEFINE_PROP_BIT64("guest_ufo", VirtIONet, host_features,
                    VIRTIO_NET_F_GUEST_UFO, true),
    DEFINE_PROP_BIT64("guest_announce", VirtIONet, host_features,
                    VIRTIO_NET_F_GUEST_ANNOUNCE, true),
    DEFINE_PROP_BIT64("host_tso4", VirtIONet, host_features,
                    VIRTIO_NET_F_HOST_TSO4, true),
    DEFINE_PROP_BIT64("host_tso6", VirtIONet, host_features,
                    VIRTIO_NET_F_HOST_TSO6, true),
    DEFINE_PROP_BIT64("host_ecn", VirtIONet, host_features,
                    VIRTIO_NET_F_HOST_ECN, true),
    DEFINE_PROP_BIT64("host_ufo", VirtIONet, host_features,
                    VIRTIO_NET_F_HOST_UFO, true),
    DEFINE_PROP_BIT64("mrg_rxbuf", VirtIONet, host_features,
                    VIRTIO_NET_F_MRG_RXBUF, true),
    DEFINE_PROP_BIT64("status", VirtIONet, host_features,
                    VIRTIO_NET_F_STATUS, true),
    DEFINE_PROP_BIT64("ctrl_vq", VirtIONet, host_features,
                    VIRTIO_NET_F_CTRL_VQ, true),
    DEFINE_PROP_BIT64("ctrl_rx", VirtIONet, host_features,
                    VIRTIO_NET_F_CTRL_RX, true),
    DEFINE_PROP_BIT64("ctrl_vlan", VirtIONet, host_features,
                    VIRTIO_NET_F_CTRL_VLAN, true),
    DEFINE_PROP_BIT64("ctrl_rx_extra", VirtIONet, host_features,
                    VIRTIO_NET_F_CTRL_RX_EXTRA, true),
    DEFINE_PROP_BIT64("ctrl_mac_addr", VirtIONet, host_features,
                    VIRTIO_NET_F_CTRL_MAC_ADDR, true),
    DEFINE_PROP_BIT64("ctrl_guest_offloads", VirtIONet, host_features,
                    VIRTIO_NET_F_CTRL_GUEST_OFFLOADS, true),
    /* Multiqueue, RSS, hash reporting and RSC: off unless requested */
    DEFINE_PROP_BIT64("mq", VirtIONet, host_features, VIRTIO_NET_F_MQ, false),
    DEFINE_PROP_BIT64("rss", VirtIONet, host_features,
                    VIRTIO_NET_F_RSS, false),
    DEFINE_PROP_BIT64("hash", VirtIONet, host_features,
                    VIRTIO_NET_F_HASH_REPORT, false),
    DEFINE_PROP_ARRAY("ebpf-rss-fds", VirtIONet, nr_ebpf_rss_fds,
                      ebpf_rss_fds, qdev_prop_string, char*),
    DEFINE_PROP_BIT64("guest_rsc_ext", VirtIONet, host_features,
                    VIRTIO_NET_F_RSC_EXT, false),
    DEFINE_PROP_UINT32("rsc_interval", VirtIONet, rsc_timeout,
                       VIRTIO_NET_RSC_DEFAULT_INTERVAL),
    /* Generic NIC properties, stored in nic_conf */
    DEFINE_NIC_PROPERTIES(VirtIONet, nic_conf),
    /* TX scheduling knobs and queue sizes */
    DEFINE_PROP_UINT32("x-txtimer", VirtIONet, net_conf.txtimer,
                       TX_TIMER_INTERVAL),
    DEFINE_PROP_INT32("x-txburst", VirtIONet, net_conf.txburst, TX_BURST),
    DEFINE_PROP_STRING("tx", VirtIONet, net_conf.tx),
    DEFINE_PROP_UINT16("rx_queue_size", VirtIONet, net_conf.rx_queue_size,
                       VIRTIO_NET_RX_QUEUE_DEFAULT_SIZE),
    DEFINE_PROP_UINT16("tx_queue_size", VirtIONet, net_conf.tx_queue_size,
                       VIRTIO_NET_TX_QUEUE_DEFAULT_SIZE),
    /* MTU, link speed/duplex and failover */
    DEFINE_PROP_UINT16("host_mtu", VirtIONet, net_conf.mtu, 0),
    DEFINE_PROP_BOOL("x-mtu-bypass-backend", VirtIONet, mtu_bypass_backend,
                     true),
    DEFINE_PROP_INT32("speed", VirtIONet, net_conf.speed, SPEED_UNKNOWN),
    DEFINE_PROP_STRING("duplex", VirtIONet, net_conf.duplex_str),
    DEFINE_PROP_BOOL("failover", VirtIONet, failover, false),
    /* USO feature bits, default on */
    DEFINE_PROP_BIT64("guest_uso4", VirtIONet, host_features,
                      VIRTIO_NET_F_GUEST_USO4, true),
    DEFINE_PROP_BIT64("guest_uso6", VirtIONet, host_features,
                      VIRTIO_NET_F_GUEST_USO6, true),
    DEFINE_PROP_BIT64("host_uso", VirtIONet, host_features,
                      VIRTIO_NET_F_HOST_USO, true),
    DEFINE_PROP_END_OF_LIST(),
};
4026 
4027 static void virtio_net_class_init(ObjectClass *klass, void *data)
4028 {
4029     DeviceClass *dc = DEVICE_CLASS(klass);
4030     VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass);
4031 
4032     device_class_set_props(dc, virtio_net_properties);
4033     dc->vmsd = &vmstate_virtio_net;
4034     set_bit(DEVICE_CATEGORY_NETWORK, dc->categories);
4035     vdc->realize = virtio_net_device_realize;
4036     vdc->unrealize = virtio_net_device_unrealize;
4037     vdc->get_config = virtio_net_get_config;
4038     vdc->set_config = virtio_net_set_config;
4039     vdc->get_features = virtio_net_get_features;
4040     vdc->set_features = virtio_net_set_features;
4041     vdc->bad_features = virtio_net_bad_features;
4042     vdc->reset = virtio_net_reset;
4043     vdc->queue_reset = virtio_net_queue_reset;
4044     vdc->queue_enable = virtio_net_queue_enable;
4045     vdc->set_status = virtio_net_set_status;
4046     vdc->guest_notifier_mask = virtio_net_guest_notifier_mask;
4047     vdc->guest_notifier_pending = virtio_net_guest_notifier_pending;
4048     vdc->legacy_features |= (0x1 << VIRTIO_NET_F_GSO);
4049     vdc->post_load = virtio_net_post_load_virtio;
4050     vdc->vmsd = &vmstate_virtio_net_device;
4051     vdc->primary_unplug_pending = primary_unplug_pending;
4052     vdc->get_vhost = virtio_net_get_vhost;
4053     vdc->toggle_device_iotlb = vhost_toggle_device_iotlb;
4054 }
4055 
4056 static const TypeInfo virtio_net_info = {
4057     .name = TYPE_VIRTIO_NET,
4058     .parent = TYPE_VIRTIO_DEVICE,
4059     .instance_size = sizeof(VirtIONet),
4060     .instance_init = virtio_net_instance_init,
4061     .class_init = virtio_net_class_init,
4062 };
4063 
/* Register the virtio-net type described by virtio_net_info. */
static void virtio_register_types(void)
{
    type_register_static(&virtio_net_info);
}

/* Hook the registration function in through the type_init() macro. */
type_init(virtio_register_types)
4070