xref: /openbmc/qemu/hw/net/virtio-net.c (revision b5900dff14e5a8334766de6b37629c8020c6bbb0)
1 /*
2  * Virtio Network Device
3  *
4  * Copyright IBM, Corp. 2007
5  *
6  * Authors:
7  *  Anthony Liguori   <aliguori@us.ibm.com>
8  *
9  * This work is licensed under the terms of the GNU GPL, version 2.  See
10  * the COPYING file in the top-level directory.
11  *
12  */
13 
14 #include "qemu/osdep.h"
15 #include "qemu/atomic.h"
16 #include "qemu/iov.h"
17 #include "qemu/log.h"
18 #include "qemu/main-loop.h"
19 #include "qemu/module.h"
20 #include "hw/virtio/virtio.h"
21 #include "net/net.h"
22 #include "net/checksum.h"
23 #include "net/tap.h"
24 #include "qemu/error-report.h"
25 #include "qemu/timer.h"
26 #include "qemu/option.h"
27 #include "qemu/option_int.h"
28 #include "qemu/config-file.h"
29 #include "qapi/qmp/qdict.h"
30 #include "hw/virtio/virtio-net.h"
31 #include "net/vhost_net.h"
32 #include "net/announce.h"
33 #include "hw/virtio/virtio-bus.h"
34 #include "qapi/error.h"
35 #include "qapi/qapi-events-net.h"
36 #include "hw/qdev-properties.h"
37 #include "qapi/qapi-types-migration.h"
38 #include "qapi/qapi-events-migration.h"
39 #include "hw/virtio/virtio-access.h"
40 #include "migration/misc.h"
41 #include "standard-headers/linux/ethtool.h"
42 #include "sysemu/sysemu.h"
43 #include "sysemu/replay.h"
44 #include "trace.h"
45 #include "monitor/qdev.h"
46 #include "monitor/monitor.h"
47 #include "hw/pci/pci_device.h"
48 #include "net_rx_pkt.h"
49 #include "hw/virtio/vhost.h"
50 #include "sysemu/qtest.h"
51 
52 #define VIRTIO_NET_VM_VERSION    11
53 
54 /* previously fixed value */
55 #define VIRTIO_NET_RX_QUEUE_DEFAULT_SIZE 256
56 #define VIRTIO_NET_TX_QUEUE_DEFAULT_SIZE 256
57 
58 /* for now, only allow larger queue_pairs; with virtio-1, guest can downsize */
59 #define VIRTIO_NET_RX_QUEUE_MIN_SIZE VIRTIO_NET_RX_QUEUE_DEFAULT_SIZE
60 #define VIRTIO_NET_TX_QUEUE_MIN_SIZE VIRTIO_NET_TX_QUEUE_DEFAULT_SIZE
61 
62 #define VIRTIO_NET_IP4_ADDR_SIZE   8        /* ipv4 saddr + daddr */
63 
64 #define VIRTIO_NET_TCP_FLAG         0x3F
65 #define VIRTIO_NET_TCP_HDR_LENGTH   0xF000
66 
67 /* IPv4 max payload, 16 bits in the header */
68 #define VIRTIO_NET_MAX_IP4_PAYLOAD (65535 - sizeof(struct ip_header))
69 #define VIRTIO_NET_MAX_TCP_PAYLOAD 65535
70 
71 /* header length value in the IPv4 header when no options are present */
72 #define VIRTIO_NET_IP4_HEADER_LENGTH 5
73 
74 #define VIRTIO_NET_IP6_ADDR_SIZE   32      /* ipv6 saddr + daddr */
75 #define VIRTIO_NET_MAX_IP6_PAYLOAD VIRTIO_NET_MAX_TCP_PAYLOAD
76 
77 /* Purge coalesced packets timer interval.  This value affects performance
78    significantly and should be tuned carefully: '300000' (300us) is the
79    recommended value to pass the WHQL test, while '50000' can gain 2x netperf
80    throughput with tso/gso/gro 'off'. */
81 #define VIRTIO_NET_RSC_DEFAULT_INTERVAL 300000
82 
83 #define VIRTIO_NET_RSS_SUPPORTED_HASHES (VIRTIO_NET_RSS_HASH_TYPE_IPv4 | \
84                                          VIRTIO_NET_RSS_HASH_TYPE_TCPv4 | \
85                                          VIRTIO_NET_RSS_HASH_TYPE_UDPv4 | \
86                                          VIRTIO_NET_RSS_HASH_TYPE_IPv6 | \
87                                          VIRTIO_NET_RSS_HASH_TYPE_TCPv6 | \
88                                          VIRTIO_NET_RSS_HASH_TYPE_UDPv6 | \
89                                          VIRTIO_NET_RSS_HASH_TYPE_IP_EX | \
90                                          VIRTIO_NET_RSS_HASH_TYPE_TCP_EX | \
91                                          VIRTIO_NET_RSS_HASH_TYPE_UDP_EX)
92 
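/*
 * The device config space only needs to cover the fields whose feature
 * bits were negotiated.  Each entry below records the config end offset
 * implied by a feature flag; virtio_get_config_size() uses this table
 * (via cfg_size_params below) to size the config space accordingly.
 */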
93 static const VirtIOFeature feature_sizes[] = {
94     {.flags = 1ULL << VIRTIO_NET_F_MAC,
95      .end = endof(struct virtio_net_config, mac)},
96     {.flags = 1ULL << VIRTIO_NET_F_STATUS,
97      .end = endof(struct virtio_net_config, status)},
98     {.flags = 1ULL << VIRTIO_NET_F_MQ,
99      .end = endof(struct virtio_net_config, max_virtqueue_pairs)},
100     {.flags = 1ULL << VIRTIO_NET_F_MTU,
101      .end = endof(struct virtio_net_config, mtu)},
102     {.flags = 1ULL << VIRTIO_NET_F_SPEED_DUPLEX,
103      .end = endof(struct virtio_net_config, duplex)},
104     {.flags = (1ULL << VIRTIO_NET_F_RSS) | (1ULL << VIRTIO_NET_F_HASH_REPORT),
105      .end = endof(struct virtio_net_config, supported_hash_types)},
106     {}
107 };
108 
109 static const VirtIOConfigSizeParams cfg_size_params = {
110     .min_size = endof(struct virtio_net_config, mac),
111     .max_size = sizeof(struct virtio_net_config),
112     .feature_sizes = feature_sizes
113 };
114 
115 static VirtIONetQueue *virtio_net_get_subqueue(NetClientState *nc)
116 {
117     VirtIONet *n = qemu_get_nic_opaque(nc);
118 
119     return &n->vqs[nc->queue_index];
120 }
121 
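/*
 * Virtqueues come in RX/TX pairs: pair N uses vq index 2N for RX and
 * 2N + 1 for TX, with the control vq (if any) last.  Halving a data vq
 * index therefore yields its queue-pair index.
 */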
122 static int vq2q(int queue_index)
123 {
124     return queue_index / 2;
125 }
126 
127 static void flush_or_purge_queued_packets(NetClientState *nc)
128 {
129     if (!nc->peer) {
130         return;
131     }
132 
133     qemu_flush_or_purge_queued_packets(nc->peer, true);
134     assert(!virtio_net_get_subqueue(nc)->async_tx.elem);
135 }
136 
137 /* TODO
138  * - we could suppress RX interrupt if we were so inclined.
139  */
140 
141 static void virtio_net_get_config(VirtIODevice *vdev, uint8_t *config)
142 {
143     VirtIONet *n = VIRTIO_NET(vdev);
144     struct virtio_net_config netcfg;
145     NetClientState *nc = qemu_get_queue(n->nic);
146     static const MACAddr zero = { .a = { 0, 0, 0, 0, 0, 0 } };
147 
148     int ret = 0;
149     memset(&netcfg, 0, sizeof(struct virtio_net_config));
150     virtio_stw_p(vdev, &netcfg.status, n->status);
151     virtio_stw_p(vdev, &netcfg.max_virtqueue_pairs, n->max_queue_pairs);
152     virtio_stw_p(vdev, &netcfg.mtu, n->net_conf.mtu);
153     memcpy(netcfg.mac, n->mac, ETH_ALEN);
154     virtio_stl_p(vdev, &netcfg.speed, n->net_conf.speed);
155     netcfg.duplex = n->net_conf.duplex;
156     netcfg.rss_max_key_size = VIRTIO_NET_RSS_MAX_KEY_SIZE;
157     virtio_stw_p(vdev, &netcfg.rss_max_indirection_table_length,
158                  virtio_host_has_feature(vdev, VIRTIO_NET_F_RSS) ?
159                  VIRTIO_NET_RSS_MAX_TABLE_LEN : 1);
160     virtio_stl_p(vdev, &netcfg.supported_hash_types,
161                  VIRTIO_NET_RSS_SUPPORTED_HASHES);
162     memcpy(config, &netcfg, n->config_size);
163 
164     /*
165      * Is this VDPA? No peer means not VDPA: there's no way to
166      * disconnect/reconnect a VDPA peer.
167      */
168     if (nc->peer && nc->peer->info->type == NET_CLIENT_DRIVER_VHOST_VDPA) {
169         ret = vhost_net_get_config(get_vhost_net(nc->peer), (uint8_t *)&netcfg,
170                                    n->config_size);
171         if (ret == -1) {
172             return;
173         }
174 
175         /*
176          * Some NIC/kernel combinations present 0 as the mac address.  As that
177          * is not a legal address, try to proceed with the address from the
178          * QEMU command line in the hope that the address has been configured
179          * correctly elsewhere - just not reported by the device.
180          */
181         if (memcmp(&netcfg.mac, &zero, sizeof(zero)) == 0) {
182             info_report("Zero hardware mac address detected. Ignoring.");
183             memcpy(netcfg.mac, n->mac, ETH_ALEN);
184         }
185 
186         netcfg.status |= virtio_tswap16(vdev,
187                                         n->status & VIRTIO_NET_S_ANNOUNCE);
188         memcpy(config, &netcfg, n->config_size);
189     }
190 }
191 
192 static void virtio_net_set_config(VirtIODevice *vdev, const uint8_t *config)
193 {
194     VirtIONet *n = VIRTIO_NET(vdev);
195     struct virtio_net_config netcfg = {};
196     NetClientState *nc = qemu_get_queue(n->nic);
197 
198     memcpy(&netcfg, config, n->config_size);
199 
200     if (!virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR) &&
201         !virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1) &&
202         memcmp(netcfg.mac, n->mac, ETH_ALEN)) {
203         memcpy(n->mac, netcfg.mac, ETH_ALEN);
204         qemu_format_nic_info_str(qemu_get_queue(n->nic), n->mac);
205     }
206 
207     /*
208      * Is this VDPA? No peer means not VDPA: there's no way to
209      * disconnect/reconnect a VDPA peer.
210      */
211     if (nc->peer && nc->peer->info->type == NET_CLIENT_DRIVER_VHOST_VDPA) {
212         vhost_net_set_config(get_vhost_net(nc->peer),
213                              (uint8_t *)&netcfg, 0, n->config_size,
214                              VHOST_SET_CONFIG_TYPE_FRONTEND);
215     }
216 }
217 
218 static bool virtio_net_started(VirtIONet *n, uint8_t status)
219 {
220     VirtIODevice *vdev = VIRTIO_DEVICE(n);
221     return (status & VIRTIO_CONFIG_S_DRIVER_OK) &&
222         (n->status & VIRTIO_NET_S_LINK_UP) && vdev->vm_running;
223 }
224 
225 static void virtio_net_announce_notify(VirtIONet *net)
226 {
227     VirtIODevice *vdev = VIRTIO_DEVICE(net);
228     trace_virtio_net_announce_notify();
229 
230     net->status |= VIRTIO_NET_S_ANNOUNCE;
231     virtio_notify_config(vdev);
232 }
233 
234 static void virtio_net_announce_timer(void *opaque)
235 {
236     VirtIONet *n = opaque;
237     trace_virtio_net_announce_timer(n->announce_timer.round);
238 
239     n->announce_timer.round--;
240     virtio_net_announce_notify(n);
241 }
242 
243 static void virtio_net_announce(NetClientState *nc)
244 {
245     VirtIONet *n = qemu_get_nic_opaque(nc);
246     VirtIODevice *vdev = VIRTIO_DEVICE(n);
247 
248     /*
249      * Make sure the virtio migration announcement timer isn't running.
250      * If it is, let it trigger the announcement so that we do not cause
251      * confusion.
252      */
253     if (n->announce_timer.round) {
254         return;
255     }
256 
257     if (virtio_vdev_has_feature(vdev, VIRTIO_NET_F_GUEST_ANNOUNCE) &&
258         virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ)) {
259         virtio_net_announce_notify(n);
260     }
261 }
262 
263 static void virtio_net_vhost_status(VirtIONet *n, uint8_t status)
264 {
265     VirtIODevice *vdev = VIRTIO_DEVICE(n);
266     NetClientState *nc = qemu_get_queue(n->nic);
267     int queue_pairs = n->multiqueue ? n->max_queue_pairs : 1;
268     int cvq = virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ) ?
269               n->max_ncs - n->max_queue_pairs : 0;
270 
271     if (!get_vhost_net(nc->peer)) {
272         return;
273     }
274 
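    /*
     * Nothing to do when the desired vhost state (started with link up)
     * already matches the current one.
     */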
275     if ((virtio_net_started(n, status) && !nc->peer->link_down) ==
276         !!n->vhost_started) {
277         return;
278     }
279     if (!n->vhost_started) {
280         int r, i;
281 
282         if (n->needs_vnet_hdr_swap) {
283             error_report("backend does not support %s vnet headers; "
284                          "falling back on userspace virtio",
285                          virtio_is_big_endian(vdev) ? "BE" : "LE");
286             return;
287         }
288 
289         /* Any packets outstanding? Purge them to avoid touching rings
290          * when vhost is running.
291          */
292         for (i = 0; i < queue_pairs; i++) {
293             NetClientState *qnc = qemu_get_subqueue(n->nic, i);
294 
295             /* Purge both directions: TX and RX. */
296             qemu_net_queue_purge(qnc->peer->incoming_queue, qnc);
297             qemu_net_queue_purge(qnc->incoming_queue, qnc->peer);
298         }
299 
300         if (virtio_has_feature(vdev->guest_features, VIRTIO_NET_F_MTU)) {
301             r = vhost_net_set_mtu(get_vhost_net(nc->peer), n->net_conf.mtu);
302             if (r < 0) {
303                 error_report("%uBytes MTU not supported by the backend",
304                              n->net_conf.mtu);
305 
306                 return;
307             }
308         }
309 
310         n->vhost_started = 1;
311         r = vhost_net_start(vdev, n->nic->ncs, queue_pairs, cvq);
312         if (r < 0) {
313             error_report("unable to start vhost net: %d: "
314                          "falling back on userspace virtio", -r);
315             n->vhost_started = 0;
316         }
317     } else {
318         vhost_net_stop(vdev, n->nic->ncs, queue_pairs, cvq);
319         n->vhost_started = 0;
320     }
321 }
322 
323 static int virtio_net_set_vnet_endian_one(VirtIODevice *vdev,
324                                           NetClientState *peer,
325                                           bool enable)
326 {
327     if (virtio_is_big_endian(vdev)) {
328         return qemu_set_vnet_be(peer, enable);
329     } else {
330         return qemu_set_vnet_le(peer, enable);
331     }
332 }
333 
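/*
 * Propagate the vnet header endianness to every peer.  Returns true when
 * at least one backend refused the setting (after rolling the others
 * back), i.e. when QEMU itself must byte-swap the headers.
 */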
334 static bool virtio_net_set_vnet_endian(VirtIODevice *vdev, NetClientState *ncs,
335                                        int queue_pairs, bool enable)
336 {
337     int i;
338 
339     for (i = 0; i < queue_pairs; i++) {
340         if (virtio_net_set_vnet_endian_one(vdev, ncs[i].peer, enable) < 0 &&
341             enable) {
342             while (--i >= 0) {
343                 virtio_net_set_vnet_endian_one(vdev, ncs[i].peer, false);
344             }
345 
346             return true;
347         }
348     }
349 
350     return false;
351 }
352 
353 static void virtio_net_vnet_endian_status(VirtIONet *n, uint8_t status)
354 {
355     VirtIODevice *vdev = VIRTIO_DEVICE(n);
356     int queue_pairs = n->multiqueue ? n->max_queue_pairs : 1;
357 
358     if (virtio_net_started(n, status)) {
359         /* Before using the device, we tell the network backend about the
360          * endianness to use when parsing vnet headers. If the backend
361          * can't do it, we fall back to fixing the headers in the core
362          * virtio-net code.
363          */
364         n->needs_vnet_hdr_swap = n->has_vnet_hdr &&
365                                  virtio_net_set_vnet_endian(vdev, n->nic->ncs,
366                                                             queue_pairs, true);
367     } else if (virtio_net_started(n, vdev->status)) {
368         /* After using the device, we need to reset the network backend to
369          * the default (guest native endianness), otherwise the guest may
370          * lose network connectivity if it is rebooted into a different
371          * endianness.
372          */
373         virtio_net_set_vnet_endian(vdev, n->nic->ncs, queue_pairs, false);
374     }
375 }
376 
377 static void virtio_net_drop_tx_queue_data(VirtIODevice *vdev, VirtQueue *vq)
378 {
379     unsigned int dropped = virtqueue_drop_all(vq);
380     if (dropped) {
381         virtio_notify(vdev, vq);
382     }
383 }
384 
385 static void virtio_net_set_status(struct VirtIODevice *vdev, uint8_t status)
386 {
387     VirtIONet *n = VIRTIO_NET(vdev);
388     VirtIONetQueue *q;
389     int i;
390     uint8_t queue_status;
391 
392     virtio_net_vnet_endian_status(n, status);
393     virtio_net_vhost_status(n, status);
394 
395     for (i = 0; i < n->max_queue_pairs; i++) {
396         NetClientState *ncs = qemu_get_subqueue(n->nic, i);
397         bool queue_started;
398         q = &n->vqs[i];
399 
400         if ((!n->multiqueue && i != 0) || i >= n->curr_queue_pairs) {
401             queue_status = 0;
402         } else {
403             queue_status = status;
404         }
405         queue_started =
406             virtio_net_started(n, queue_status) && !n->vhost_started;
407 
408         if (queue_started) {
409             qemu_flush_queued_packets(ncs);
410         }
411 
412         if (!q->tx_waiting) {
413             continue;
414         }
415 
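        /*
         * TX completion runs either from a timer (tx=timer) or from a
         * bottom half (tx=bh), chosen when the device is realized.
         */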
416         if (queue_started) {
417             if (q->tx_timer) {
418                 timer_mod(q->tx_timer,
419                           qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + n->tx_timeout);
420             } else {
421                 replay_bh_schedule_event(q->tx_bh);
422             }
423         } else {
424             if (q->tx_timer) {
425                 timer_del(q->tx_timer);
426             } else {
427                 qemu_bh_cancel(q->tx_bh);
428             }
429             if ((n->status & VIRTIO_NET_S_LINK_UP) == 0 &&
430                 (queue_status & VIRTIO_CONFIG_S_DRIVER_OK) &&
431                 vdev->vm_running) {
432                 /* If tx is waiting, we likely have some packets in the tx
433                  * queue and notification disabled. */
434                 q->tx_waiting = 0;
435                 virtio_queue_set_notification(q->tx_vq, 1);
436                 virtio_net_drop_tx_queue_data(vdev, q->tx_vq);
437             }
438         }
439     }
440 }
441 
442 static void virtio_net_set_link_status(NetClientState *nc)
443 {
444     VirtIONet *n = qemu_get_nic_opaque(nc);
445     VirtIODevice *vdev = VIRTIO_DEVICE(n);
446     uint16_t old_status = n->status;
447 
448     if (nc->link_down)
449         n->status &= ~VIRTIO_NET_S_LINK_UP;
450     else
451         n->status |= VIRTIO_NET_S_LINK_UP;
452 
453     if (n->status != old_status)
454         virtio_notify_config(vdev);
455 
456     virtio_net_set_status(vdev, vdev->status);
457 }
458 
459 static void rxfilter_notify(NetClientState *nc)
460 {
461     VirtIONet *n = qemu_get_nic_opaque(nc);
462 
463     if (nc->rxfilter_notify_enabled) {
464         char *path = object_get_canonical_path(OBJECT(n->qdev));
465         qapi_event_send_nic_rx_filter_changed(n->netclient_name, path);
466         g_free(path);
467 
468         /* disable event notification to avoid events flooding */
469         nc->rxfilter_notify_enabled = 0;
470     }
471 }
472 
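/*
 * n->vlans is a MAX_VLAN-bit bitmap stored as 32-bit words: word i, bit j
 * corresponds to VLAN id (i << 5) + j.  Walk it and emit the set ids.
 */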
473 static intList *get_vlan_table(VirtIONet *n)
474 {
475     intList *list;
476     int i, j;
477 
478     list = NULL;
479     for (i = 0; i < MAX_VLAN >> 5; i++) {
480         for (j = 0; n->vlans[i] && j <= 0x1f; j++) {
481             if (n->vlans[i] & (1U << j)) {
482                 QAPI_LIST_PREPEND(list, (i << 5) + j);
483             }
484         }
485     }
486 
487     return list;
488 }
489 
490 static RxFilterInfo *virtio_net_query_rxfilter(NetClientState *nc)
491 {
492     VirtIONet *n = qemu_get_nic_opaque(nc);
493     VirtIODevice *vdev = VIRTIO_DEVICE(n);
494     RxFilterInfo *info;
495     strList *str_list;
496     int i;
497 
498     info = g_malloc0(sizeof(*info));
499     info->name = g_strdup(nc->name);
500     info->promiscuous = n->promisc;
501 
502     if (n->nouni) {
503         info->unicast = RX_STATE_NONE;
504     } else if (n->alluni) {
505         info->unicast = RX_STATE_ALL;
506     } else {
507         info->unicast = RX_STATE_NORMAL;
508     }
509 
510     if (n->nomulti) {
511         info->multicast = RX_STATE_NONE;
512     } else if (n->allmulti) {
513         info->multicast = RX_STATE_ALL;
514     } else {
515         info->multicast = RX_STATE_NORMAL;
516     }
517 
518     info->broadcast_allowed = n->nobcast;
519     info->multicast_overflow = n->mac_table.multi_overflow;
520     info->unicast_overflow = n->mac_table.uni_overflow;
521 
522     info->main_mac = qemu_mac_strdup_printf(n->mac);
523 
524     str_list = NULL;
525     for (i = 0; i < n->mac_table.first_multi; i++) {
526         QAPI_LIST_PREPEND(str_list,
527                       qemu_mac_strdup_printf(n->mac_table.macs + i * ETH_ALEN));
528     }
529     info->unicast_table = str_list;
530 
531     str_list = NULL;
532     for (i = n->mac_table.first_multi; i < n->mac_table.in_use; i++) {
533         QAPI_LIST_PREPEND(str_list,
534                       qemu_mac_strdup_printf(n->mac_table.macs + i * ETH_ALEN));
535     }
536     info->multicast_table = str_list;
537     info->vlan_table = get_vlan_table(n);
538 
539     if (!virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_VLAN)) {
540         info->vlan = RX_STATE_ALL;
541     } else if (!info->vlan_table) {
542         info->vlan = RX_STATE_NONE;
543     } else {
544         info->vlan = RX_STATE_NORMAL;
545     }
546 
547     /* enable event notification after query */
548     nc->rxfilter_notify_enabled = 1;
549 
550     return info;
551 }
552 
553 static void virtio_net_queue_reset(VirtIODevice *vdev, uint32_t queue_index)
554 {
555     VirtIONet *n = VIRTIO_NET(vdev);
556     NetClientState *nc;
557 
558     /* validate queue_index and skip for cvq */
559     if (queue_index >= n->max_queue_pairs * 2) {
560         return;
561     }
562 
563     nc = qemu_get_subqueue(n->nic, vq2q(queue_index));
564 
565     if (!nc->peer) {
566         return;
567     }
568 
569     if (get_vhost_net(nc->peer) &&
570         nc->peer->info->type == NET_CLIENT_DRIVER_TAP) {
571         vhost_net_virtqueue_reset(vdev, nc, queue_index);
572     }
573 
574     flush_or_purge_queued_packets(nc);
575 }
576 
577 static void virtio_net_queue_enable(VirtIODevice *vdev, uint32_t queue_index)
578 {
579     VirtIONet *n = VIRTIO_NET(vdev);
580     NetClientState *nc;
581     int r;
582 
583     /* validate queue_index and skip for cvq */
584     if (queue_index >= n->max_queue_pairs * 2) {
585         return;
586     }
587 
588     nc = qemu_get_subqueue(n->nic, vq2q(queue_index));
589 
590     if (!nc->peer || !vdev->vhost_started) {
591         return;
592     }
593 
594     if (get_vhost_net(nc->peer) &&
595         nc->peer->info->type == NET_CLIENT_DRIVER_TAP) {
596         r = vhost_net_virtqueue_restart(vdev, nc, queue_index);
597         if (r < 0) {
598             error_report("unable to restart vhost net virtqueue: %d, "
599                             "when resetting the queue", queue_index);
600         }
601     }
602 }
603 
604 static void peer_test_vnet_hdr(VirtIONet *n)
605 {
606     NetClientState *nc = qemu_get_queue(n->nic);
607     if (!nc->peer) {
608         return;
609     }
610 
611     n->has_vnet_hdr = qemu_has_vnet_hdr(nc->peer);
612 }
613 
614 static int peer_has_vnet_hdr(VirtIONet *n)
615 {
616     return n->has_vnet_hdr;
617 }
618 
619 static int peer_has_ufo(VirtIONet *n)
620 {
621     if (!peer_has_vnet_hdr(n))
622         return 0;
623 
624     n->has_ufo = qemu_has_ufo(qemu_get_queue(n->nic)->peer);
625 
626     return n->has_ufo;
627 }
628 
629 static int peer_has_uso(VirtIONet *n)
630 {
631     if (!peer_has_vnet_hdr(n)) {
632         return 0;
633     }
634 
635     return qemu_has_uso(qemu_get_queue(n->nic)->peer);
636 }
637 
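/*
 * The guest header grows with the negotiated features: the legacy
 * virtio_net_hdr is 10 bytes, virtio_net_hdr_mrg_rxbuf appends a 16-bit
 * num_buffers field, and virtio_net_hdr_v1_hash further adds the hash
 * value and report fields for VIRTIO_NET_F_HASH_REPORT.
 */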
638 static void virtio_net_set_mrg_rx_bufs(VirtIONet *n, int mergeable_rx_bufs,
639                                        int version_1, int hash_report)
640 {
641     int i;
642     NetClientState *nc;
643 
644     n->mergeable_rx_bufs = mergeable_rx_bufs;
645 
646     if (version_1) {
647         n->guest_hdr_len = hash_report ?
648             sizeof(struct virtio_net_hdr_v1_hash) :
649             sizeof(struct virtio_net_hdr_mrg_rxbuf);
650         n->rss_data.populate_hash = !!hash_report;
651     } else {
652         n->guest_hdr_len = n->mergeable_rx_bufs ?
653             sizeof(struct virtio_net_hdr_mrg_rxbuf) :
654             sizeof(struct virtio_net_hdr);
655         n->rss_data.populate_hash = false;
656     }
657 
658     for (i = 0; i < n->max_queue_pairs; i++) {
659         nc = qemu_get_subqueue(n->nic, i);
660 
661         if (peer_has_vnet_hdr(n) &&
662             qemu_has_vnet_hdr_len(nc->peer, n->guest_hdr_len)) {
663             qemu_set_vnet_hdr_len(nc->peer, n->guest_hdr_len);
664             n->host_hdr_len = n->guest_hdr_len;
665         }
666     }
667 }
668 
669 static int virtio_net_max_tx_queue_size(VirtIONet *n)
670 {
671     NetClientState *peer = n->nic_conf.peers.ncs[0];
672 
673     /*
674      * Only vhost-user and vhost-vdpa backends support a TX queue size
675      * larger than the default.
676      */
677     if (!peer) {
678         return VIRTIO_NET_TX_QUEUE_DEFAULT_SIZE;
679     }
680 
681     switch (peer->info->type) {
682     case NET_CLIENT_DRIVER_VHOST_USER:
683     case NET_CLIENT_DRIVER_VHOST_VDPA:
684         return VIRTQUEUE_MAX_SIZE;
685     default:
686         return VIRTIO_NET_TX_QUEUE_DEFAULT_SIZE;
687     }
688 }
689 
690 static int peer_attach(VirtIONet *n, int index)
691 {
692     NetClientState *nc = qemu_get_subqueue(n->nic, index);
693 
694     if (!nc->peer) {
695         return 0;
696     }
697 
698     if (nc->peer->info->type == NET_CLIENT_DRIVER_VHOST_USER) {
699         vhost_set_vring_enable(nc->peer, 1);
700     }
701 
702     if (nc->peer->info->type != NET_CLIENT_DRIVER_TAP) {
703         return 0;
704     }
705 
706     if (n->max_queue_pairs == 1) {
707         return 0;
708     }
709 
710     return tap_enable(nc->peer);
711 }
712 
713 static int peer_detach(VirtIONet *n, int index)
714 {
715     NetClientState *nc = qemu_get_subqueue(n->nic, index);
716 
717     if (!nc->peer) {
718         return 0;
719     }
720 
721     if (nc->peer->info->type == NET_CLIENT_DRIVER_VHOST_USER) {
722         vhost_set_vring_enable(nc->peer, 0);
723     }
724 
725     if (nc->peer->info->type != NET_CLIENT_DRIVER_TAP) {
726         return 0;
727     }
728 
729     return tap_disable(nc->peer);
730 }
731 
732 static void virtio_net_set_queue_pairs(VirtIONet *n)
733 {
734     int i;
735     int r;
736 
737     if (n->nic->peer_deleted) {
738         return;
739     }
740 
741     for (i = 0; i < n->max_queue_pairs; i++) {
742         if (i < n->curr_queue_pairs) {
743             r = peer_attach(n, i);
744             assert(!r);
745         } else {
746             r = peer_detach(n, i);
747             assert(!r);
748         }
749     }
750 }
751 
752 static void virtio_net_set_multiqueue(VirtIONet *n, int multiqueue);
753 
754 static uint64_t virtio_net_get_features(VirtIODevice *vdev, uint64_t features,
755                                         Error **errp)
756 {
757     VirtIONet *n = VIRTIO_NET(vdev);
758     NetClientState *nc = qemu_get_queue(n->nic);
759 
760     /* First, sync all the features that virtio-net can possibly support */
761     features |= n->host_features;
762 
763     virtio_add_feature(&features, VIRTIO_NET_F_MAC);
764 
765     if (!peer_has_vnet_hdr(n)) {
766         virtio_clear_feature(&features, VIRTIO_NET_F_CSUM);
767         virtio_clear_feature(&features, VIRTIO_NET_F_HOST_TSO4);
768         virtio_clear_feature(&features, VIRTIO_NET_F_HOST_TSO6);
769         virtio_clear_feature(&features, VIRTIO_NET_F_HOST_ECN);
770 
771         virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_CSUM);
772         virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_TSO4);
773         virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_TSO6);
774         virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_ECN);
775 
776         virtio_clear_feature(&features, VIRTIO_NET_F_HOST_USO);
777         virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO4);
778         virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO6);
779 
780         virtio_clear_feature(&features, VIRTIO_NET_F_HASH_REPORT);
781     }
782 
783     if (!peer_has_vnet_hdr(n) || !peer_has_ufo(n)) {
784         virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_UFO);
785         virtio_clear_feature(&features, VIRTIO_NET_F_HOST_UFO);
786     }
787 
788     if (!peer_has_uso(n)) {
789         virtio_clear_feature(&features, VIRTIO_NET_F_HOST_USO);
790         virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO4);
791         virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO6);
792     }
793 
794     if (!get_vhost_net(nc->peer)) {
795         return features;
796     }
797 
798     if (!ebpf_rss_is_loaded(&n->ebpf_rss)) {
799         virtio_clear_feature(&features, VIRTIO_NET_F_RSS);
800     }
801     features = vhost_net_get_features(get_vhost_net(nc->peer), features);
802     vdev->backend_features = features;
803 
804     if (n->mtu_bypass_backend &&
805             (n->host_features & 1ULL << VIRTIO_NET_F_MTU)) {
806         features |= (1ULL << VIRTIO_NET_F_MTU);
807     }
808 
809     /*
810      * Since GUEST_ANNOUNCE is emulated, the feature bit could be set without
811      * the backend supporting it. This happens in the vDPA case.
812      *
813      * Make sure the feature set is not incoherent, as the driver could refuse
814      * to start.
815      *
816      * TODO: QEMU is able to emulate a CVQ just for guest_announce purposes,
817      * helping the guest announce its new location with vDPA devices that do
818      * not support it.
819      */
820     if (!virtio_has_feature(vdev->backend_features, VIRTIO_NET_F_CTRL_VQ)) {
821         virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_ANNOUNCE);
822     }
823 
824     return features;
825 }
826 
827 static uint64_t virtio_net_bad_features(VirtIODevice *vdev)
828 {
829     uint64_t features = 0;
830 
831     /* Linux kernel 2.6.25.  It understood MAC (as everyone must),
832      * but also these: */
833     virtio_add_feature(&features, VIRTIO_NET_F_MAC);
834     virtio_add_feature(&features, VIRTIO_NET_F_CSUM);
835     virtio_add_feature(&features, VIRTIO_NET_F_HOST_TSO4);
836     virtio_add_feature(&features, VIRTIO_NET_F_HOST_TSO6);
837     virtio_add_feature(&features, VIRTIO_NET_F_HOST_ECN);
838 
839     return features;
840 }
841 
842 static void virtio_net_apply_guest_offloads(VirtIONet *n)
843 {
844     qemu_set_offload(qemu_get_queue(n->nic)->peer,
845             !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_CSUM)),
846             !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_TSO4)),
847             !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_TSO6)),
848             !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_ECN)),
849             !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_UFO)),
850             !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_USO4)),
851             !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_USO6)));
852 }
853 
854 static uint64_t virtio_net_guest_offloads_by_features(uint64_t features)
855 {
856     static const uint64_t guest_offloads_mask =
857         (1ULL << VIRTIO_NET_F_GUEST_CSUM) |
858         (1ULL << VIRTIO_NET_F_GUEST_TSO4) |
859         (1ULL << VIRTIO_NET_F_GUEST_TSO6) |
860         (1ULL << VIRTIO_NET_F_GUEST_ECN)  |
861         (1ULL << VIRTIO_NET_F_GUEST_UFO)  |
862         (1ULL << VIRTIO_NET_F_GUEST_USO4) |
863         (1ULL << VIRTIO_NET_F_GUEST_USO6);
864 
865     return guest_offloads_mask & features;
866 }
867 
868 uint64_t virtio_net_supported_guest_offloads(const VirtIONet *n)
869 {
870     VirtIODevice *vdev = VIRTIO_DEVICE(n);
871     return virtio_net_guest_offloads_by_features(vdev->guest_features);
872 }
873 
874 typedef struct {
875     VirtIONet *n;
876     DeviceState *dev;
877 } FailoverDevice;
878 
879 /**
880  * Set the failover primary device
881  *
882  * @dev: device being examined during the bus walk
883  * @opaque: FailoverDevice to fill in on a match
884  * Returns: 1 when the primary device is found, 0 to continue walking
885  */
886 static int failover_set_primary(DeviceState *dev, void *opaque)
887 {
888     FailoverDevice *fdev = opaque;
889     PCIDevice *pci_dev = (PCIDevice *)
890         object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE);
891 
892     if (!pci_dev) {
893         return 0;
894     }
895 
896     if (!g_strcmp0(pci_dev->failover_pair_id, fdev->n->netclient_name)) {
897         fdev->dev = dev;
898         return 1;
899     }
900 
901     return 0;
902 }
903 
904 /**
905  * Find the primary device for this failover virtio-net
906  *
907  * @n: VirtIONet device
909  */
910 static DeviceState *failover_find_primary_device(VirtIONet *n)
911 {
912     FailoverDevice fdev = {
913         .n = n,
914     };
915 
916     qbus_walk_children(sysbus_get_default(), failover_set_primary, NULL,
917                        NULL, NULL, &fdev);
918     return fdev.dev;
919 }
920 
921 static void failover_add_primary(VirtIONet *n, Error **errp)
922 {
923     Error *err = NULL;
924     DeviceState *dev = failover_find_primary_device(n);
925 
926     if (dev) {
927         return;
928     }
929 
930     if (!n->primary_opts) {
931         error_setg(errp, "Primary device not found");
932         error_append_hint(errp, "Virtio-net failover will not work. Make "
933                           "sure primary device has parameter"
934                           " failover_pair_id=%s\n", n->netclient_name);
935         return;
936     }
937 
938     dev = qdev_device_add_from_qdict(n->primary_opts,
939                                      n->primary_opts_from_json,
940                                      &err);
941     if (err) {
942         qobject_unref(n->primary_opts);
943         n->primary_opts = NULL;
944     } else {
945         object_unref(OBJECT(dev));
946     }
947     error_propagate(errp, err);
948 }
949 
950 static void virtio_net_set_features(VirtIODevice *vdev, uint64_t features)
951 {
952     VirtIONet *n = VIRTIO_NET(vdev);
953     Error *err = NULL;
954     int i;
955 
956     if (n->mtu_bypass_backend &&
957             !virtio_has_feature(vdev->backend_features, VIRTIO_NET_F_MTU)) {
958         features &= ~(1ULL << VIRTIO_NET_F_MTU);
959     }
960 
961     virtio_net_set_multiqueue(n,
962                               virtio_has_feature(features, VIRTIO_NET_F_RSS) ||
963                               virtio_has_feature(features, VIRTIO_NET_F_MQ));
964 
965     virtio_net_set_mrg_rx_bufs(n,
966                                virtio_has_feature(features,
967                                                   VIRTIO_NET_F_MRG_RXBUF),
968                                virtio_has_feature(features,
969                                                   VIRTIO_F_VERSION_1),
970                                virtio_has_feature(features,
971                                                   VIRTIO_NET_F_HASH_REPORT));
972 
973     n->rsc4_enabled = virtio_has_feature(features, VIRTIO_NET_F_RSC_EXT) &&
974         virtio_has_feature(features, VIRTIO_NET_F_GUEST_TSO4);
975     n->rsc6_enabled = virtio_has_feature(features, VIRTIO_NET_F_RSC_EXT) &&
976         virtio_has_feature(features, VIRTIO_NET_F_GUEST_TSO6);
977     n->rss_data.redirect = virtio_has_feature(features, VIRTIO_NET_F_RSS);
978 
979     if (n->has_vnet_hdr) {
980         n->curr_guest_offloads =
981             virtio_net_guest_offloads_by_features(features);
982         virtio_net_apply_guest_offloads(n);
983     }
984 
985     for (i = 0; i < n->max_queue_pairs; i++) {
986         NetClientState *nc = qemu_get_subqueue(n->nic, i);
987 
988         if (!get_vhost_net(nc->peer)) {
989             continue;
990         }
991         vhost_net_ack_features(get_vhost_net(nc->peer), features);
992 
993         /*
994          * Keep acked_features in NetVhostUserState up-to-date so it
995          * doesn't miss any features configured by the guest virtio driver.
996          */
997         vhost_net_save_acked_features(nc->peer);
998     }
999 
1000     if (!virtio_has_feature(features, VIRTIO_NET_F_CTRL_VLAN)) {
1001         memset(n->vlans, 0xff, MAX_VLAN >> 3);
1002     }
1003 
1004     if (virtio_has_feature(features, VIRTIO_NET_F_STANDBY)) {
1005         qapi_event_send_failover_negotiated(n->netclient_name);
1006         qatomic_set(&n->failover_primary_hidden, false);
1007         failover_add_primary(n, &err);
1008         if (err) {
1009             if (!qtest_enabled()) {
1010                 warn_report_err(err);
1011             } else {
1012                 error_free(err);
1013             }
1014         }
1015     }
1016 }
1017 
1018 static int virtio_net_handle_rx_mode(VirtIONet *n, uint8_t cmd,
1019                                      struct iovec *iov, unsigned int iov_cnt)
1020 {
1021     uint8_t on;
1022     size_t s;
1023     NetClientState *nc = qemu_get_queue(n->nic);
1024 
1025     s = iov_to_buf(iov, iov_cnt, 0, &on, sizeof(on));
1026     if (s != sizeof(on)) {
1027         return VIRTIO_NET_ERR;
1028     }
1029 
1030     if (cmd == VIRTIO_NET_CTRL_RX_PROMISC) {
1031         n->promisc = on;
1032     } else if (cmd == VIRTIO_NET_CTRL_RX_ALLMULTI) {
1033         n->allmulti = on;
1034     } else if (cmd == VIRTIO_NET_CTRL_RX_ALLUNI) {
1035         n->alluni = on;
1036     } else if (cmd == VIRTIO_NET_CTRL_RX_NOMULTI) {
1037         n->nomulti = on;
1038     } else if (cmd == VIRTIO_NET_CTRL_RX_NOUNI) {
1039         n->nouni = on;
1040     } else if (cmd == VIRTIO_NET_CTRL_RX_NOBCAST) {
1041         n->nobcast = on;
1042     } else {
1043         return VIRTIO_NET_ERR;
1044     }
1045 
1046     rxfilter_notify(nc);
1047 
1048     return VIRTIO_NET_OK;
1049 }
1050 
1051 static int virtio_net_handle_offloads(VirtIONet *n, uint8_t cmd,
1052                                      struct iovec *iov, unsigned int iov_cnt)
1053 {
1054     VirtIODevice *vdev = VIRTIO_DEVICE(n);
1055     uint64_t offloads;
1056     size_t s;
1057 
1058     if (!virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS)) {
1059         return VIRTIO_NET_ERR;
1060     }
1061 
1062     s = iov_to_buf(iov, iov_cnt, 0, &offloads, sizeof(offloads));
1063     if (s != sizeof(offloads)) {
1064         return VIRTIO_NET_ERR;
1065     }
1066 
1067     if (cmd == VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET) {
1068         uint64_t supported_offloads;
1069 
1070         offloads = virtio_ldq_p(vdev, &offloads);
1071 
1072         if (!n->has_vnet_hdr) {
1073             return VIRTIO_NET_ERR;
1074         }
1075 
1076         n->rsc4_enabled = virtio_has_feature(offloads, VIRTIO_NET_F_RSC_EXT) &&
1077             virtio_has_feature(offloads, VIRTIO_NET_F_GUEST_TSO4);
1078         n->rsc6_enabled = virtio_has_feature(offloads, VIRTIO_NET_F_RSC_EXT) &&
1079             virtio_has_feature(offloads, VIRTIO_NET_F_GUEST_TSO6);
1080         virtio_clear_feature(&offloads, VIRTIO_NET_F_RSC_EXT);
1081 
1082         supported_offloads = virtio_net_supported_guest_offloads(n);
1083         if (offloads & ~supported_offloads) {
1084             return VIRTIO_NET_ERR;
1085         }
1086 
1087         n->curr_guest_offloads = offloads;
1088         virtio_net_apply_guest_offloads(n);
1089 
1090         return VIRTIO_NET_OK;
1091     } else {
1092         return VIRTIO_NET_ERR;
1093     }
1094 }
1095 
1096 static int virtio_net_handle_mac(VirtIONet *n, uint8_t cmd,
1097                                  struct iovec *iov, unsigned int iov_cnt)
1098 {
1099     VirtIODevice *vdev = VIRTIO_DEVICE(n);
1100     struct virtio_net_ctrl_mac mac_data;
1101     size_t s;
1102     NetClientState *nc = qemu_get_queue(n->nic);
1103 
1104     if (cmd == VIRTIO_NET_CTRL_MAC_ADDR_SET) {
1105         if (iov_size(iov, iov_cnt) != sizeof(n->mac)) {
1106             return VIRTIO_NET_ERR;
1107         }
1108         s = iov_to_buf(iov, iov_cnt, 0, &n->mac, sizeof(n->mac));
1109         assert(s == sizeof(n->mac));
1110         qemu_format_nic_info_str(qemu_get_queue(n->nic), n->mac);
1111         rxfilter_notify(nc);
1112 
1113         return VIRTIO_NET_OK;
1114     }
1115 
1116     if (cmd != VIRTIO_NET_CTRL_MAC_TABLE_SET) {
1117         return VIRTIO_NET_ERR;
1118     }
1119 
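    /*
     * VIRTIO_NET_CTRL_MAC_TABLE_SET carries two virtio_net_ctrl_mac
     * structures back to back: first the unicast list, then the multicast
     * list, each a 32-bit entry count followed by that many 6-byte MACs.
     */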
1120     int in_use = 0;
1121     int first_multi = 0;
1122     uint8_t uni_overflow = 0;
1123     uint8_t multi_overflow = 0;
1124     uint8_t *macs = g_malloc0(MAC_TABLE_ENTRIES * ETH_ALEN);
1125 
1126     s = iov_to_buf(iov, iov_cnt, 0, &mac_data.entries,
1127                    sizeof(mac_data.entries));
1128     mac_data.entries = virtio_ldl_p(vdev, &mac_data.entries);
1129     if (s != sizeof(mac_data.entries)) {
1130         goto error;
1131     }
1132     iov_discard_front(&iov, &iov_cnt, s);
1133 
1134     if (mac_data.entries * ETH_ALEN > iov_size(iov, iov_cnt)) {
1135         goto error;
1136     }
1137 
1138     if (mac_data.entries <= MAC_TABLE_ENTRIES) {
1139         s = iov_to_buf(iov, iov_cnt, 0, macs,
1140                        mac_data.entries * ETH_ALEN);
1141         if (s != mac_data.entries * ETH_ALEN) {
1142             goto error;
1143         }
1144         in_use += mac_data.entries;
1145     } else {
1146         uni_overflow = 1;
1147     }
1148 
1149     iov_discard_front(&iov, &iov_cnt, mac_data.entries * ETH_ALEN);
1150 
1151     first_multi = in_use;
1152 
1153     s = iov_to_buf(iov, iov_cnt, 0, &mac_data.entries,
1154                    sizeof(mac_data.entries));
1155     mac_data.entries = virtio_ldl_p(vdev, &mac_data.entries);
1156     if (s != sizeof(mac_data.entries)) {
1157         goto error;
1158     }
1159 
1160     iov_discard_front(&iov, &iov_cnt, s);
1161 
1162     if (mac_data.entries * ETH_ALEN != iov_size(iov, iov_cnt)) {
1163         goto error;
1164     }
1165 
1166     if (mac_data.entries <= MAC_TABLE_ENTRIES - in_use) {
1167         s = iov_to_buf(iov, iov_cnt, 0, &macs[in_use * ETH_ALEN],
1168                        mac_data.entries * ETH_ALEN);
1169         if (s != mac_data.entries * ETH_ALEN) {
1170             goto error;
1171         }
1172         in_use += mac_data.entries;
1173     } else {
1174         multi_overflow = 1;
1175     }
1176 
1177     n->mac_table.in_use = in_use;
1178     n->mac_table.first_multi = first_multi;
1179     n->mac_table.uni_overflow = uni_overflow;
1180     n->mac_table.multi_overflow = multi_overflow;
1181     memcpy(n->mac_table.macs, macs, MAC_TABLE_ENTRIES * ETH_ALEN);
1182     g_free(macs);
1183     rxfilter_notify(nc);
1184 
1185     return VIRTIO_NET_OK;
1186 
1187 error:
1188     g_free(macs);
1189     return VIRTIO_NET_ERR;
1190 }
1191 
1192 static int virtio_net_handle_vlan_table(VirtIONet *n, uint8_t cmd,
1193                                         struct iovec *iov, unsigned int iov_cnt)
1194 {
1195     VirtIODevice *vdev = VIRTIO_DEVICE(n);
1196     uint16_t vid;
1197     size_t s;
1198     NetClientState *nc = qemu_get_queue(n->nic);
1199 
1200     s = iov_to_buf(iov, iov_cnt, 0, &vid, sizeof(vid));
1201     vid = virtio_lduw_p(vdev, &vid);
1202     if (s != sizeof(vid)) {
1203         return VIRTIO_NET_ERR;
1204     }
1205 
1206     if (vid >= MAX_VLAN)
1207         return VIRTIO_NET_ERR;
1208 
1209     if (cmd == VIRTIO_NET_CTRL_VLAN_ADD)
1210         n->vlans[vid >> 5] |= (1U << (vid & 0x1f));
1211     else if (cmd == VIRTIO_NET_CTRL_VLAN_DEL)
1212         n->vlans[vid >> 5] &= ~(1U << (vid & 0x1f));
1213     else
1214         return VIRTIO_NET_ERR;
1215 
1216     rxfilter_notify(nc);
1217 
1218     return VIRTIO_NET_OK;
1219 }
1220 
1221 static int virtio_net_handle_announce(VirtIONet *n, uint8_t cmd,
1222                                       struct iovec *iov, unsigned int iov_cnt)
1223 {
1224     trace_virtio_net_handle_announce(n->announce_timer.round);
1225     if (cmd == VIRTIO_NET_CTRL_ANNOUNCE_ACK &&
1226         n->status & VIRTIO_NET_S_ANNOUNCE) {
1227         n->status &= ~VIRTIO_NET_S_ANNOUNCE;
1228         if (n->announce_timer.round) {
1229             qemu_announce_timer_step(&n->announce_timer);
1230         }
1231         return VIRTIO_NET_OK;
1232     } else {
1233         return VIRTIO_NET_ERR;
1234     }
1235 }
1236 
1237 static bool virtio_net_attach_ebpf_to_backend(NICState *nic, int prog_fd)
1238 {
1239     NetClientState *nc = qemu_get_peer(qemu_get_queue(nic), 0);
1240     if (nc == NULL || nc->info->set_steering_ebpf == NULL) {
1241         return false;
1242     }
1243 
1244     return nc->info->set_steering_ebpf(nc, prog_fd);
1245 }
1246 
1247 static void rss_data_to_rss_config(struct VirtioNetRssData *data,
1248                                    struct EBPFRSSConfig *config)
1249 {
1250     config->redirect = data->redirect;
1251     config->populate_hash = data->populate_hash;
1252     config->hash_types = data->hash_types;
1253     config->indirections_len = data->indirections_len;
1254     config->default_queue = data->default_queue;
1255 }
1256 
1257 static bool virtio_net_attach_ebpf_rss(VirtIONet *n)
1258 {
1259     struct EBPFRSSConfig config = {};
1260 
1261     if (!ebpf_rss_is_loaded(&n->ebpf_rss)) {
1262         return false;
1263     }
1264 
1265     rss_data_to_rss_config(&n->rss_data, &config);
1266 
1267     if (!ebpf_rss_set_all(&n->ebpf_rss, &config,
1268                           n->rss_data.indirections_table, n->rss_data.key,
1269                           NULL)) {
1270         return false;
1271     }
1272 
1273     if (!virtio_net_attach_ebpf_to_backend(n->nic, n->ebpf_rss.program_fd)) {
1274         return false;
1275     }
1276 
1277     return true;
1278 }
1279 
1280 static void virtio_net_detach_ebpf_rss(VirtIONet *n)
1281 {
1282     virtio_net_attach_ebpf_to_backend(n->nic, -1);
1283 }
1284 
1285 static void virtio_net_commit_rss_config(VirtIONet *n)
1286 {
1287     if (n->rss_data.enabled) {
1288         n->rss_data.enabled_software_rss = n->rss_data.populate_hash;
1289         if (n->rss_data.populate_hash) {
1290             virtio_net_detach_ebpf_rss(n);
1291         } else if (!virtio_net_attach_ebpf_rss(n)) {
1292             if (get_vhost_net(qemu_get_queue(n->nic)->peer)) {
1293                 warn_report("Can't load eBPF RSS for vhost");
1294             } else {
1295                 warn_report("Can't load eBPF RSS - fallback to software RSS");
1296                 n->rss_data.enabled_software_rss = true;
1297             }
1298         }
1299 
1300         trace_virtio_net_rss_enable(n->rss_data.hash_types,
1301                                     n->rss_data.indirections_len,
1302                                     sizeof(n->rss_data.key));
1303     } else {
1304         virtio_net_detach_ebpf_rss(n);
1305         trace_virtio_net_rss_disable();
1306     }
1307 }
1308 
1309 static void virtio_net_disable_rss(VirtIONet *n)
1310 {
1311     if (!n->rss_data.enabled) {
1312         return;
1313     }
1314 
1315     n->rss_data.enabled = false;
1316     virtio_net_commit_rss_config(n);
1317 }
1318 
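/*
 * With the ebpf-rss-fds property the RSS program and its maps arrive as
 * pre-opened file descriptors passed by name through the monitor, rather
 * than being loaded by QEMU itself; see ebpf_rss_load_fds() for the
 * expected order.
 */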
1319 static bool virtio_net_load_ebpf_fds(VirtIONet *n, Error **errp)
1320 {
1321     int fds[EBPF_RSS_MAX_FDS] = { [0 ... EBPF_RSS_MAX_FDS - 1] = -1};
1322     int ret = true;
1323     int i = 0;
1324 
1325     if (n->nr_ebpf_rss_fds != EBPF_RSS_MAX_FDS) {
1326         error_setg(errp, "Expected %d file descriptors but got %d",
1327                    EBPF_RSS_MAX_FDS, n->nr_ebpf_rss_fds);
1328         return false;
1329     }
1330 
1331     for (i = 0; i < n->nr_ebpf_rss_fds; i++) {
1332         fds[i] = monitor_fd_param(monitor_cur(), n->ebpf_rss_fds[i], errp);
1333         if (fds[i] < 0) {
1334             ret = false;
1335             goto exit;
1336         }
1337     }
1338 
1339     ret = ebpf_rss_load_fds(&n->ebpf_rss, fds[0], fds[1], fds[2], fds[3], errp);
1340 
1341 exit:
1342     if (!ret) {
1343         for (i = 0; i < n->nr_ebpf_rss_fds && fds[i] != -1; i++) {
1344             close(fds[i]);
1345         }
1346     }
1347 
1348     return ret;
1349 }
1350 
1351 static bool virtio_net_load_ebpf(VirtIONet *n, Error **errp)
1352 {
1353     bool ret = false;
1354 
1355     if (virtio_net_attach_ebpf_to_backend(n->nic, -1)) {
1356         if (n->ebpf_rss_fds) {
1357             ret = virtio_net_load_ebpf_fds(n, errp);
1358         } else {
1359             ret = ebpf_rss_load(&n->ebpf_rss, errp);
1360         }
1361     }
1362 
1363     return ret;
1364 }
1365 
1366 static void virtio_net_unload_ebpf(VirtIONet *n)
1367 {
1368     virtio_net_attach_ebpf_to_backend(n->nic, -1);
1369     ebpf_rss_unload(&n->ebpf_rss);
1370 }
1371 
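/*
 * Parse a virtio_net_rss_config command.  The layout is a fixed header
 * (hash_types, indirection_table_mask, unclassified_queue), then the
 * variable-length indirection table, then max_tx_vq, the key length and
 * the hash key itself, which is why the buffer is consumed piecewise with
 * a running offset.  With do_rss == false only the hash-report fields are
 * meaningful.
 */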
1372 static uint16_t virtio_net_handle_rss(VirtIONet *n,
1373                                       struct iovec *iov,
1374                                       unsigned int iov_cnt,
1375                                       bool do_rss)
1376 {
1377     VirtIODevice *vdev = VIRTIO_DEVICE(n);
1378     struct virtio_net_rss_config cfg;
1379     size_t s, offset = 0, size_get;
1380     uint16_t queue_pairs, i;
1381     struct {
1382         uint16_t us;
1383         uint8_t b;
1384     } QEMU_PACKED temp;
1385     const char *err_msg = "";
1386     uint32_t err_value = 0;
1387 
1388     if (do_rss && !virtio_vdev_has_feature(vdev, VIRTIO_NET_F_RSS)) {
1389         err_msg = "RSS is not negotiated";
1390         goto error;
1391     }
1392     if (!do_rss && !virtio_vdev_has_feature(vdev, VIRTIO_NET_F_HASH_REPORT)) {
1393         err_msg = "Hash report is not negotiated";
1394         goto error;
1395     }
1396     size_get = offsetof(struct virtio_net_rss_config, indirection_table);
1397     s = iov_to_buf(iov, iov_cnt, offset, &cfg, size_get);
1398     if (s != size_get) {
1399         err_msg = "Short command buffer";
1400         err_value = (uint32_t)s;
1401         goto error;
1402     }
1403     n->rss_data.hash_types = virtio_ldl_p(vdev, &cfg.hash_types);
1404     n->rss_data.indirections_len =
1405         virtio_lduw_p(vdev, &cfg.indirection_table_mask);
1406     n->rss_data.indirections_len++;
1407     if (!do_rss) {
1408         n->rss_data.indirections_len = 1;
1409     }
1410     if (!is_power_of_2(n->rss_data.indirections_len)) {
1411         err_msg = "Invalid size of indirection table";
1412         err_value = n->rss_data.indirections_len;
1413         goto error;
1414     }
1415     if (n->rss_data.indirections_len > VIRTIO_NET_RSS_MAX_TABLE_LEN) {
1416         err_msg = "Too large indirection table";
1417         err_value = n->rss_data.indirections_len;
1418         goto error;
1419     }
1420     n->rss_data.default_queue = do_rss ?
1421         virtio_lduw_p(vdev, &cfg.unclassified_queue) : 0;
1422     if (n->rss_data.default_queue >= n->max_queue_pairs) {
1423         err_msg = "Invalid default queue";
1424         err_value = n->rss_data.default_queue;
1425         goto error;
1426     }
1427     offset += size_get;
1428     size_get = sizeof(uint16_t) * n->rss_data.indirections_len;
1429     g_free(n->rss_data.indirections_table);
1430     n->rss_data.indirections_table = g_malloc(size_get);
1431     if (!n->rss_data.indirections_table) {
1432         err_msg = "Can't allocate indirections table";
1433         err_value = n->rss_data.indirections_len;
1434         goto error;
1435     }
1436     s = iov_to_buf(iov, iov_cnt, offset,
1437                    n->rss_data.indirections_table, size_get);
1438     if (s != size_get) {
1439         err_msg = "Short indirection table buffer";
1440         err_value = (uint32_t)s;
1441         goto error;
1442     }
1443     for (i = 0; i < n->rss_data.indirections_len; ++i) {
1444         uint16_t val = n->rss_data.indirections_table[i];
1445         n->rss_data.indirections_table[i] = virtio_lduw_p(vdev, &val);
1446     }
1447     offset += size_get;
1448     size_get = sizeof(temp);
1449     s = iov_to_buf(iov, iov_cnt, offset, &temp, size_get);
1450     if (s != size_get) {
1451         err_msg = "Can't get queue_pairs";
1452         err_value = (uint32_t)s;
1453         goto error;
1454     }
1455     queue_pairs = do_rss ? virtio_lduw_p(vdev, &temp.us) : n->curr_queue_pairs;
1456     if (queue_pairs == 0 || queue_pairs > n->max_queue_pairs) {
1457         err_msg = "Invalid number of queue_pairs";
1458         err_value = queue_pairs;
1459         goto error;
1460     }
1461     if (temp.b > VIRTIO_NET_RSS_MAX_KEY_SIZE) {
1462         err_msg = "Invalid key size";
1463         err_value = temp.b;
1464         goto error;
1465     }
1466     if (!temp.b && n->rss_data.hash_types) {
1467         err_msg = "No key provided";
1468         err_value = 0;
1469         goto error;
1470     }
1471     if (!temp.b && !n->rss_data.hash_types) {
1472         virtio_net_disable_rss(n);
1473         return queue_pairs;
1474     }
1475     offset += size_get;
1476     size_get = temp.b;
1477     s = iov_to_buf(iov, iov_cnt, offset, n->rss_data.key, size_get);
1478     if (s != size_get) {
1479         err_msg = "Can get key buffer";
1480         err_value = (uint32_t)s;
1481         goto error;
1482     }
1483     n->rss_data.enabled = true;
1484     virtio_net_commit_rss_config(n);
1485     return queue_pairs;
1486 error:
1487     trace_virtio_net_rss_error(err_msg, err_value);
1488     virtio_net_disable_rss(n);
1489     return 0;
1490 }
1491 
1492 static int virtio_net_handle_mq(VirtIONet *n, uint8_t cmd,
1493                                 struct iovec *iov, unsigned int iov_cnt)
1494 {
1495     VirtIODevice *vdev = VIRTIO_DEVICE(n);
1496     uint16_t queue_pairs;
1497     NetClientState *nc = qemu_get_queue(n->nic);
1498 
1499     virtio_net_disable_rss(n);
1500     if (cmd == VIRTIO_NET_CTRL_MQ_HASH_CONFIG) {
1501         queue_pairs = virtio_net_handle_rss(n, iov, iov_cnt, false);
1502         return queue_pairs ? VIRTIO_NET_OK : VIRTIO_NET_ERR;
1503     }
1504     if (cmd == VIRTIO_NET_CTRL_MQ_RSS_CONFIG) {
1505         queue_pairs = virtio_net_handle_rss(n, iov, iov_cnt, true);
1506     } else if (cmd == VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET) {
1507         struct virtio_net_ctrl_mq mq;
1508         size_t s;
1509         if (!virtio_vdev_has_feature(vdev, VIRTIO_NET_F_MQ)) {
1510             return VIRTIO_NET_ERR;
1511         }
1512         s = iov_to_buf(iov, iov_cnt, 0, &mq, sizeof(mq));
1513         if (s != sizeof(mq)) {
1514             return VIRTIO_NET_ERR;
1515         }
1516         queue_pairs = virtio_lduw_p(vdev, &mq.virtqueue_pairs);
1517 
1518     } else {
1519         return VIRTIO_NET_ERR;
1520     }
1521 
1522     if (queue_pairs < VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN ||
1523         queue_pairs > VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX ||
1524         queue_pairs > n->max_queue_pairs ||
1525         !n->multiqueue) {
1526         return VIRTIO_NET_ERR;
1527     }
1528 
1529     n->curr_queue_pairs = queue_pairs;
1530     if (nc->peer && nc->peer->info->type == NET_CLIENT_DRIVER_VHOST_VDPA) {
1531         /*
1532          * Avoid updating the backend for a vdpa device: We're only interested
1533          * in updating the device model queues.
1534          */
1535         return VIRTIO_NET_OK;
1536     }
1537     /* Stop the backend before changing the number of queue_pairs to
1538      * avoid handling a disabled queue. */
1539     virtio_net_set_status(vdev, vdev->status);
1540     virtio_net_set_queue_pairs(n);
1541 
1542     return VIRTIO_NET_OK;
1543 }
1544 
1545 size_t virtio_net_handle_ctrl_iov(VirtIODevice *vdev,
1546                                   const struct iovec *in_sg, unsigned in_num,
1547                                   const struct iovec *out_sg,
1548                                   unsigned out_num)
1549 {
1550     VirtIONet *n = VIRTIO_NET(vdev);
1551     struct virtio_net_ctrl_hdr ctrl;
1552     virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
1553     size_t s;
1554     struct iovec *iov, *iov2;
1555 
1556     if (iov_size(in_sg, in_num) < sizeof(status) ||
1557         iov_size(out_sg, out_num) < sizeof(ctrl)) {
1558         virtio_error(vdev, "virtio-net ctrl missing headers");
1559         return 0;
1560     }
1561 
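    /* Work on a copy: iov_discard_front() modifies the iovec entries. */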
1562     iov2 = iov = g_memdup2(out_sg, sizeof(struct iovec) * out_num);
1563     s = iov_to_buf(iov, out_num, 0, &ctrl, sizeof(ctrl));
1564     iov_discard_front(&iov, &out_num, sizeof(ctrl));
1565     if (s != sizeof(ctrl)) {
1566         status = VIRTIO_NET_ERR;
1567     } else if (ctrl.class == VIRTIO_NET_CTRL_RX) {
1568         status = virtio_net_handle_rx_mode(n, ctrl.cmd, iov, out_num);
1569     } else if (ctrl.class == VIRTIO_NET_CTRL_MAC) {
1570         status = virtio_net_handle_mac(n, ctrl.cmd, iov, out_num);
1571     } else if (ctrl.class == VIRTIO_NET_CTRL_VLAN) {
1572         status = virtio_net_handle_vlan_table(n, ctrl.cmd, iov, out_num);
1573     } else if (ctrl.class == VIRTIO_NET_CTRL_ANNOUNCE) {
1574         status = virtio_net_handle_announce(n, ctrl.cmd, iov, out_num);
1575     } else if (ctrl.class == VIRTIO_NET_CTRL_MQ) {
1576         status = virtio_net_handle_mq(n, ctrl.cmd, iov, out_num);
1577     } else if (ctrl.class == VIRTIO_NET_CTRL_GUEST_OFFLOADS) {
1578         status = virtio_net_handle_offloads(n, ctrl.cmd, iov, out_num);
1579     }
1580 
1581     s = iov_from_buf(in_sg, in_num, 0, &status, sizeof(status));
1582     assert(s == sizeof(status));
1583 
1584     g_free(iov2);
1585     return sizeof(status);
1586 }
1587 
1588 static void virtio_net_handle_ctrl(VirtIODevice *vdev, VirtQueue *vq)
1589 {
1590     VirtQueueElement *elem;
1591 
1592     for (;;) {
1593         size_t written;
1594         elem = virtqueue_pop(vq, sizeof(VirtQueueElement));
1595         if (!elem) {
1596             break;
1597         }
1598 
1599         written = virtio_net_handle_ctrl_iov(vdev, elem->in_sg, elem->in_num,
1600                                              elem->out_sg, elem->out_num);
1601         if (written > 0) {
1602             virtqueue_push(vq, elem, written);
1603             virtio_notify(vdev, vq);
1604             g_free(elem);
1605         } else {
1606             virtqueue_detach_element(vq, elem, 0);
1607             g_free(elem);
1608             break;
1609         }
1610     }
1611 }
1612 
1613 /* RX */
1614 
1615 static void virtio_net_handle_rx(VirtIODevice *vdev, VirtQueue *vq)
1616 {
1617     VirtIONet *n = VIRTIO_NET(vdev);
1618     int queue_index = vq2q(virtio_get_queue_index(vq));
1619 
1620     qemu_flush_queued_packets(qemu_get_subqueue(n->nic, queue_index));
1621 }
1622 
1623 static bool virtio_net_can_receive(NetClientState *nc)
1624 {
1625     VirtIONet *n = qemu_get_nic_opaque(nc);
1626     VirtIODevice *vdev = VIRTIO_DEVICE(n);
1627     VirtIONetQueue *q = virtio_net_get_subqueue(nc);
1628 
1629     if (!vdev->vm_running) {
1630         return false;
1631     }
1632 
1633     if (nc->queue_index >= n->curr_queue_pairs) {
1634         return false;
1635     }
1636 
1637     if (!virtio_queue_ready(q->rx_vq) ||
1638         !(vdev->status & VIRTIO_CONFIG_S_DRIVER_OK)) {
1639         return false;
1640     }
1641 
1642     return true;
1643 }
1644 
1645 static int virtio_net_has_buffers(VirtIONetQueue *q, int bufsize)
1646 {
1647     int opaque;
1648     unsigned int in_bytes;
1649     VirtIONet *n = q->n;
1650 
1651     while (virtio_queue_empty(q->rx_vq) || n->mergeable_rx_bufs) {
1652         opaque = virtqueue_get_avail_bytes(q->rx_vq, &in_bytes, NULL,
1653                                            bufsize, 0);
1654         /* Enough buffer space is available, disable notification */
1655         if (bufsize <= in_bytes) {
1656             break;
1657         }
1658 
1659         if (virtio_queue_enable_notification_and_check(q->rx_vq, opaque)) {
1660             /* Guest has added some buffers, try again */
1661             continue;
1662         } else {
1663             return 0;
1664         }
1665     }
1666 
1667     virtio_queue_set_notification(q->rx_vq, 0);
1668 
1669     return 1;
1670 }
1671 
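     /*
      * The loop above is the usual lost-wakeup avoidance pattern for
      * virtqueues: when the available buffers look insufficient, re-enable
      * guest notifications and then re-check, because the guest may have
      * added buffers between the check and the enable.  A compiled-out
      * sketch of the generic shape of that pattern (has_work is a
      * placeholder predicate), for illustration only:
      */
     #if 0
     static bool example_wait_for_work(VirtQueue *vq,
                                       bool (*has_work)(VirtQueue *))
     {
         for (;;) {
             if (has_work(vq)) {
                 /* Enough work queued: run without further notifications. */
                 virtio_queue_set_notification(vq, 0);
                 return true;
             }
             /* Re-enable notifications, then re-check to close the race. */
             virtio_queue_set_notification(vq, 1);
             if (!has_work(vq)) {
                 return false; /* Truly idle: wait for the guest's kick. */
             }
         }
     }
     #endif
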
1672 static void virtio_net_hdr_swap(VirtIODevice *vdev, struct virtio_net_hdr *hdr)
1673 {
1674     virtio_tswap16s(vdev, &hdr->hdr_len);
1675     virtio_tswap16s(vdev, &hdr->gso_size);
1676     virtio_tswap16s(vdev, &hdr->csum_start);
1677     virtio_tswap16s(vdev, &hdr->csum_offset);
1678 }
1679 
1680 /* dhclient uses AF_PACKET but doesn't pass auxdata to the kernel so
1681  * it never finds out that the packets don't have valid checksums.  This
1682  * causes dhclient to get upset.  Fedora's carried a patch for ages to
1683  * fix this with Xen but it hasn't appeared in an upstream release of
1684  * dhclient yet.
1685  *
1686  * To avoid breaking existing guests, we catch udp packets and add
1687  * checksums.  This is terrible but it's better than hacking the guest
1688  * kernels.
1689  *
1690  * N.B. if we introduce a zero-copy API, this operation is no longer free so
1691  * we should provide a mechanism to disable it to avoid polluting the host
1692  * cache.
1693  */
1694 static void work_around_broken_dhclient(struct virtio_net_hdr *hdr,
1695                                         uint8_t *buf, size_t size)
1696 {
1697     if ((hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) && /* missing csum */
1698         (size > 27 && size < 1500) && /* normal sized MTU */
1699         (buf[12] == 0x08 && buf[13] == 0x00) && /* ethertype == IPv4 */
1700         (buf[23] == 17) && /* ip.protocol == UDP */
1701         (buf[34] == 0 && buf[35] == 67)) { /* udp.srcport == bootps */
1702         net_checksum_calculate(buf, size, CSUM_UDP);
1703         hdr->flags &= ~VIRTIO_NET_HDR_F_NEEDS_CSUM;
1704     }
1705 }
1706 
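     /*
      * The magic offsets tested above assume an untagged frame with a
      * 20-byte IPv4 header: the ethertype sits at bytes 12-13 of the
      * 14-byte Ethernet header, the IPv4 protocol field at 14 + 9 = 23,
      * and the UDP source port at 14 + 20 = 34.  Compiled-out restatement
      * of the predicate with named constants, illustrative only:
      */
     #if 0
     static bool example_is_bootps_reply(const uint8_t *buf, size_t size)
     {
         const size_t eth_hlen = 14;     /* dst(6) + src(6) + ethertype(2) */
         const size_t ip_hlen = 20;      /* IPv4 header without options    */

         return size > 27 && size < 1500 &&              /* plausible size  */
                buf[12] == 0x08 && buf[13] == 0x00 &&    /* ethertype IPv4  */
                buf[eth_hlen + 9] == 17 &&               /* protocol == UDP */
                buf[eth_hlen + ip_hlen] == 0 &&          /* sport high byte */
                buf[eth_hlen + ip_hlen + 1] == 67;       /* sport == bootps */
     }
     #endif
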
1707 static void receive_header(VirtIONet *n, const struct iovec *iov, int iov_cnt,
1708                            const void *buf, size_t size)
1709 {
1710     if (n->has_vnet_hdr) {
1711         /* FIXME this cast is evil */
1712         void *wbuf = (void *)buf;
1713         work_around_broken_dhclient(wbuf, wbuf + n->host_hdr_len,
1714                                     size - n->host_hdr_len);
1715 
1716         if (n->needs_vnet_hdr_swap) {
1717             virtio_net_hdr_swap(VIRTIO_DEVICE(n), wbuf);
1718         }
1719         iov_from_buf(iov, iov_cnt, 0, buf, sizeof(struct virtio_net_hdr));
1720     } else {
1721         struct virtio_net_hdr hdr = {
1722             .flags = 0,
1723             .gso_type = VIRTIO_NET_HDR_GSO_NONE
1724         };
1725         iov_from_buf(iov, iov_cnt, 0, &hdr, sizeof hdr);
1726     }
1727 }
1728 
1729 static int receive_filter(VirtIONet *n, const uint8_t *buf, int size)
1730 {
1731     static const uint8_t bcast[] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
1732     static const uint8_t vlan[] = {0x81, 0x00};
1733     uint8_t *ptr = (uint8_t *)buf;
1734     int i;
1735 
1736     if (n->promisc) {
1737         return 1;
1738     }

1739     ptr += n->host_hdr_len;
1740 
1741     if (!memcmp(&ptr[12], vlan, sizeof(vlan))) {
1742         int vid = lduw_be_p(ptr + 14) & 0xfff;
1743         if (!(n->vlans[vid >> 5] & (1U << (vid & 0x1f)))) {
1744             return 0;
1745         }
         }
1746 
1747     if (ptr[0] & 1) { /* multicast */
1748         if (!memcmp(ptr, bcast, sizeof(bcast))) {
1749             return !n->nobcast;
1750         } else if (n->nomulti) {
1751             return 0;
1752         } else if (n->allmulti || n->mac_table.multi_overflow) {
1753             return 1;
1754         }
1755 
1756         for (i = n->mac_table.first_multi; i < n->mac_table.in_use; i++) {
1757             if (!memcmp(ptr, &n->mac_table.macs[i * ETH_ALEN], ETH_ALEN)) {
1758                 return 1;
1759             }
1760         }
1761     } else { // unicast
1762     } else { /* unicast */
1763             return 0;
1764         } else if (n->alluni || n->mac_table.uni_overflow) {
1765             return 1;
1766         } else if (!memcmp(ptr, n->mac, ETH_ALEN)) {
1767             return 1;
1768         }
1769 
1770         for (i = 0; i < n->mac_table.first_multi; i++) {
1771             if (!memcmp(ptr, &n->mac_table.macs[i * ETH_ALEN], ETH_ALEN)) {
1772                 return 1;
1773             }
1774         }
1775     }
1776 
1777     return 0;
1778 }
1779 
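     /*
      * receive_filter() tests VLAN membership against a flat 4096-bit
      * bitmap: word (vid >> 5), bit (vid & 0x1f).  Compiled-out sketch of
      * the matching set/test helpers over such a bitmap (4096 being the
      * 12-bit VLAN ID space), illustrative only:
      */
     #if 0
     static void example_vlan_add(uint32_t *vlans, uint16_t vid)
     {
         vlans[vid >> 5] |= 1U << (vid & 0x1f);     /* vid in [0, 4095] */
     }

     static bool example_vlan_test(const uint32_t *vlans, uint16_t vid)
     {
         return vlans[vid >> 5] & (1U << (vid & 0x1f));
     }
     #endif
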
1780 static uint8_t virtio_net_get_hash_type(bool hasip4,
1781                                         bool hasip6,
1782                                         EthL4HdrProto l4hdr_proto,
1783                                         uint32_t types)
1784 {
1785     if (hasip4) {
1786         switch (l4hdr_proto) {
1787         case ETH_L4_HDR_PROTO_TCP:
1788             if (types & VIRTIO_NET_RSS_HASH_TYPE_TCPv4) {
1789                 return NetPktRssIpV4Tcp;
1790             }
1791             break;
1792 
1793         case ETH_L4_HDR_PROTO_UDP:
1794             if (types & VIRTIO_NET_RSS_HASH_TYPE_UDPv4) {
1795                 return NetPktRssIpV4Udp;
1796             }
1797             break;
1798 
1799         default:
1800             break;
1801         }
1802 
1803         if (types & VIRTIO_NET_RSS_HASH_TYPE_IPv4) {
1804             return NetPktRssIpV4;
1805         }
1806     } else if (hasip6) {
1807         switch (l4hdr_proto) {
1808         case ETH_L4_HDR_PROTO_TCP:
1809             if (types & VIRTIO_NET_RSS_HASH_TYPE_TCP_EX) {
1810                 return NetPktRssIpV6TcpEx;
1811             }
1812             if (types & VIRTIO_NET_RSS_HASH_TYPE_TCPv6) {
1813                 return NetPktRssIpV6Tcp;
1814             }
1815             break;
1816 
1817         case ETH_L4_HDR_PROTO_UDP:
1818             if (types & VIRTIO_NET_RSS_HASH_TYPE_UDP_EX) {
1819                 return NetPktRssIpV6UdpEx;
1820             }
1821             if (types & VIRTIO_NET_RSS_HASH_TYPE_UDPv6) {
1822                 return NetPktRssIpV6Udp;
1823             }
1824             break;
1825 
1826         default:
1827             break;
1828         }
1829 
1830         if (types & VIRTIO_NET_RSS_HASH_TYPE_IP_EX) {
1831             return NetPktRssIpV6Ex;
1832         }
1833         if (types & VIRTIO_NET_RSS_HASH_TYPE_IPv6) {
1834             return NetPktRssIpV6;
1835         }
1836     }
1837     return 0xff;
1838 }
1839 
1840 static int virtio_net_process_rss(NetClientState *nc, const uint8_t *buf,
1841                                   size_t size,
1842                                   struct virtio_net_hdr_v1_hash *hdr)
1843 {
1844     VirtIONet *n = qemu_get_nic_opaque(nc);
1845     unsigned int index = nc->queue_index, new_index = index;
1846     struct NetRxPkt *pkt = n->rx_pkt;
1847     uint8_t net_hash_type;
1848     uint32_t hash;
1849     bool hasip4, hasip6;
1850     EthL4HdrProto l4hdr_proto;
1851     static const uint8_t reports[NetPktRssIpV6UdpEx + 1] = {
1852         VIRTIO_NET_HASH_REPORT_IPv4,
1853         VIRTIO_NET_HASH_REPORT_TCPv4,
1854         VIRTIO_NET_HASH_REPORT_TCPv6,
1855         VIRTIO_NET_HASH_REPORT_IPv6,
1856         VIRTIO_NET_HASH_REPORT_IPv6_EX,
1857         VIRTIO_NET_HASH_REPORT_TCPv6_EX,
1858         VIRTIO_NET_HASH_REPORT_UDPv4,
1859         VIRTIO_NET_HASH_REPORT_UDPv6,
1860         VIRTIO_NET_HASH_REPORT_UDPv6_EX
1861     };
1862     struct iovec iov = {
1863         .iov_base = (void *)buf,
1864         .iov_len = size
1865     };
1866 
1867     net_rx_pkt_set_protocols(pkt, &iov, 1, n->host_hdr_len);
1868     net_rx_pkt_get_protocols(pkt, &hasip4, &hasip6, &l4hdr_proto);
1869     net_hash_type = virtio_net_get_hash_type(hasip4, hasip6, l4hdr_proto,
1870                                              n->rss_data.hash_types);
1871     if (net_hash_type > NetPktRssIpV6UdpEx) {
1872         if (n->rss_data.populate_hash) {
1873             hdr->hash_value = 0;
1874             hdr->hash_report = VIRTIO_NET_HASH_REPORT_NONE;
1875         }
1876         return n->rss_data.redirect ? n->rss_data.default_queue : -1;
1877     }
1878 
1879     hash = net_rx_pkt_calc_rss_hash(pkt, net_hash_type, n->rss_data.key);
1880 
1881     if (n->rss_data.populate_hash) {
1882         hdr->hash_value = hash;
1883         hdr->hash_report = reports[net_hash_type];
1884     }
1885 
1886     if (n->rss_data.redirect) {
1887         new_index = hash & (n->rss_data.indirections_len - 1);
1888         new_index = n->rss_data.indirections_table[new_index];
1889     }
1890 
1891     return (index == new_index) ? -1 : new_index;
1892 }
1893 
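     /*
      * The redirect step above is the virtio spec's RSS lookup,
      * queue = indirection_table[hash & indirection_table_mask]:
      * indirections_len is stored as mask + 1 when the driver installs the
      * table, so masking with (indirections_len - 1) recovers the
      * driver-supplied mask.  Compiled-out sketch, illustrative only:
      */
     #if 0
     static uint16_t example_rss_queue(uint32_t hash,
                                       const uint16_t *indirection_table,
                                       uint32_t indirection_table_mask)
     {
         return indirection_table[hash & indirection_table_mask];
     }
     #endif
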
1894 static ssize_t virtio_net_receive_rcu(NetClientState *nc, const uint8_t *buf,
1895                                       size_t size, bool no_rss)
1896 {
1897     VirtIONet *n = qemu_get_nic_opaque(nc);
1898     VirtIONetQueue *q = virtio_net_get_subqueue(nc);
1899     VirtIODevice *vdev = VIRTIO_DEVICE(n);
1900     VirtQueueElement *elems[VIRTQUEUE_MAX_SIZE];
1901     size_t lens[VIRTQUEUE_MAX_SIZE];
1902     struct iovec mhdr_sg[VIRTQUEUE_MAX_SIZE];
1903     struct virtio_net_hdr_v1_hash extra_hdr;
1904     unsigned mhdr_cnt = 0;
1905     size_t offset, i, guest_offset, j;
1906     ssize_t err;
1907 
1908     if (!virtio_net_can_receive(nc)) {
1909         return -1;
1910     }
1911 
1912     if (!no_rss && n->rss_data.enabled && n->rss_data.enabled_software_rss) {
1913         int index = virtio_net_process_rss(nc, buf, size, &extra_hdr);
1914         if (index >= 0) {
1915             NetClientState *nc2 =
1916                 qemu_get_subqueue(n->nic, index % n->curr_queue_pairs);
1917             return virtio_net_receive_rcu(nc2, buf, size, true);
1918         }
1919     }
1920 
1921     /* hdr_len refers to the header we supply to the guest */
1922     if (!virtio_net_has_buffers(q, size + n->guest_hdr_len - n->host_hdr_len)) {
1923         return 0;
1924     }
1925 
1926     if (!receive_filter(n, buf, size)) {
1927         return size;
1928     }

1929     offset = i = 0;
1930 
1931     while (offset < size) {
1932         VirtQueueElement *elem;
1933         int len, total;
1934         const struct iovec *sg;
1935 
1936         total = 0;
1937 
1938         if (i == VIRTQUEUE_MAX_SIZE) {
1939             virtio_error(vdev, "virtio-net unexpected long buffer chain");
1940             err = size;
1941             goto err;
1942         }
1943 
1944         elem = virtqueue_pop(q->rx_vq, sizeof(VirtQueueElement));
1945         if (!elem) {
1946             if (i) {
1947                 virtio_error(vdev, "virtio-net unexpected empty queue: "
1948                              "i %zd mergeable %d offset %zd, size %zd, "
1949                              "guest hdr len %zd, host hdr len %zd "
1950                              "guest features 0x%" PRIx64,
1951                              i, n->mergeable_rx_bufs, offset, size,
1952                              n->guest_hdr_len, n->host_hdr_len,
1953                              vdev->guest_features);
1954             }
1955             err = -1;
1956             goto err;
1957         }
1958 
1959         if (elem->in_num < 1) {
1960             virtio_error(vdev,
1961                          "virtio-net receive queue contains no in buffers");
1962             virtqueue_detach_element(q->rx_vq, elem, 0);
1963             g_free(elem);
1964             err = -1;
1965             goto err;
1966         }
1967 
1968         sg = elem->in_sg;
1969         if (i == 0) {
1970             assert(offset == 0);
1971             if (n->mergeable_rx_bufs) {
1972                 mhdr_cnt = iov_copy(mhdr_sg, ARRAY_SIZE(mhdr_sg),
1973                                     sg, elem->in_num,
1974                                     offsetof(typeof(extra_hdr), hdr.num_buffers),
1975                                     sizeof(extra_hdr.hdr.num_buffers));
1976             }
1977 
1978             receive_header(n, sg, elem->in_num, buf, size);
1979             if (n->rss_data.populate_hash) {
1980                 offset = offsetof(typeof(extra_hdr), hash_value);
1981                 iov_from_buf(sg, elem->in_num, offset,
1982                              (char *)&extra_hdr + offset,
1983                              sizeof(extra_hdr.hash_value) +
1984                              sizeof(extra_hdr.hash_report));
1985             }
1986             offset = n->host_hdr_len;
1987             total += n->guest_hdr_len;
1988             guest_offset = n->guest_hdr_len;
1989         } else {
1990             guest_offset = 0;
1991         }
1992 
1993         /* copy in packet.  ugh */
1994         len = iov_from_buf(sg, elem->in_num, guest_offset,
1995                            buf + offset, size - offset);
1996         total += len;
1997         offset += len;
1998         /* If buffers can't be merged, at this point we
1999          * must have consumed the complete packet.
2000          * Otherwise, drop it. */
2001         if (!n->mergeable_rx_bufs && offset < size) {
2002             virtqueue_unpop(q->rx_vq, elem, total);
2003             g_free(elem);
2004             err = size;
2005             goto err;
2006         }
2007 
2008         elems[i] = elem;
2009         lens[i] = total;
2010         i++;
2011     }
2012 
2013     if (mhdr_cnt) {
2014         virtio_stw_p(vdev, &extra_hdr.hdr.num_buffers, i);
2015         iov_from_buf(mhdr_sg, mhdr_cnt,
2016                      0,
2017                      &extra_hdr.hdr.num_buffers,
2018                      sizeof extra_hdr.hdr.num_buffers);
2019     }
2020 
2021     for (j = 0; j < i; j++) {
2022         /* signal other side */
2023         virtqueue_fill(q->rx_vq, elems[j], lens[j], j);
2024         g_free(elems[j]);
2025     }
2026 
2027     virtqueue_flush(q->rx_vq, i);
2028     virtio_notify(vdev, q->rx_vq);
2029 
2030     return size;
2031 
2032 err:
2033     for (j = 0; j < i; j++) {
2034         virtqueue_detach_element(q->rx_vq, elems[j], lens[j]);
2035         g_free(elems[j]);
2036     }
2037 
2038     return err;
2039 }
2040 
2041 static ssize_t virtio_net_do_receive(NetClientState *nc, const uint8_t *buf,
2042                                   size_t size)
2043 {
2044     RCU_READ_LOCK_GUARD();
2045 
2046     return virtio_net_receive_rcu(nc, buf, size, false);
2047 }
2048 
2049 static void virtio_net_rsc_extract_unit4(VirtioNetRscChain *chain,
2050                                          const uint8_t *buf,
2051                                          VirtioNetRscUnit *unit)
2052 {
2053     uint16_t ip_hdrlen;
2054     struct ip_header *ip;
2055 
2056     ip = (struct ip_header *)(buf + chain->n->guest_hdr_len
2057                               + sizeof(struct eth_header));
2058     unit->ip = (void *)ip;
2059     ip_hdrlen = (ip->ip_ver_len & 0xF) << 2;
2060     unit->ip_plen = &ip->ip_len;
2061     unit->tcp = (struct tcp_header *)(((uint8_t *)unit->ip) + ip_hdrlen);
2062     unit->tcp_hdrlen = (htons(unit->tcp->th_offset_flags) &
                             VIRTIO_NET_TCP_HDR_LENGTH) >> 10;
2063     unit->payload = htons(*unit->ip_plen) - ip_hdrlen - unit->tcp_hdrlen;
2064 }
2065 
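     /*
      * The '>> 10' above extracts the TCP data offset and converts it to
      * bytes in one step: the offset occupies the top four bits of
      * th_offset_flags and counts 32-bit words, so
      * ((flags & 0xF000) >> 12) * 4 == (flags & 0xF000) >> 10.
      * Compiled-out equivalence check, illustrative only:
      */
     #if 0
     static uint16_t example_tcp_hdrlen(uint16_t host_offset_flags)
     {
         uint16_t words = (host_offset_flags & VIRTIO_NET_TCP_HDR_LENGTH) >> 12;
         uint16_t bytes = (host_offset_flags & VIRTIO_NET_TCP_HDR_LENGTH) >> 10;

         assert(bytes == words * 4);
         return bytes;
     }
     #endif
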
2066 static void virtio_net_rsc_extract_unit6(VirtioNetRscChain *chain,
2067                                          const uint8_t *buf,
2068                                          VirtioNetRscUnit *unit)
2069 {
2070     struct ip6_header *ip6;
2071 
2072     ip6 = (struct ip6_header *)(buf + chain->n->guest_hdr_len
2073                                  + sizeof(struct eth_header));
2074     unit->ip = ip6;
2075     unit->ip_plen = &(ip6->ip6_ctlun.ip6_un1.ip6_un1_plen);
2076     unit->tcp = (struct tcp_header *)(((uint8_t *)unit->ip)
2077                                         + sizeof(struct ip6_header));
2078     unit->tcp_hdrlen = (htons(unit->tcp->th_offset_flags) &
                             VIRTIO_NET_TCP_HDR_LENGTH) >> 10;
2079 
2080     /* Unlike IPv4, the IPv6 payload length does not include the IP
2081        header itself */
2082     unit->payload = htons(*unit->ip_plen) - unit->tcp_hdrlen;
2083 }
2084 
2085 static size_t virtio_net_rsc_drain_seg(VirtioNetRscChain *chain,
2086                                        VirtioNetRscSeg *seg)
2087 {
2088     int ret;
2089     struct virtio_net_hdr_v1 *h;
2090 
2091     h = (struct virtio_net_hdr_v1 *)seg->buf;
2092     h->flags = 0;
2093     h->gso_type = VIRTIO_NET_HDR_GSO_NONE;
2094 
2095     if (seg->is_coalesced) {
2096         h->rsc.segments = seg->packets;
2097         h->rsc.dup_acks = seg->dup_ack;
2098         h->flags = VIRTIO_NET_HDR_F_RSC_INFO;
2099         if (chain->proto == ETH_P_IP) {
2100             h->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
2101         } else {
2102             h->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
2103         }
2104     }
2105 
2106     ret = virtio_net_do_receive(seg->nc, seg->buf, seg->size);
2107     QTAILQ_REMOVE(&chain->buffers, seg, next);
2108     g_free(seg->buf);
2109     g_free(seg);
2110 
2111     return ret;
2112 }
2113 
2114 static void virtio_net_rsc_purge(void *opq)
2115 {
2116     VirtioNetRscSeg *seg, *rn;
2117     VirtioNetRscChain *chain = (VirtioNetRscChain *)opq;
2118 
2119     QTAILQ_FOREACH_SAFE(seg, &chain->buffers, next, rn) {
2120         if (virtio_net_rsc_drain_seg(chain, seg) == 0) {
2121             chain->stat.purge_failed++;
2122             continue;
2123         }
2124     }
2125 
2126     chain->stat.timer++;
2127     if (!QTAILQ_EMPTY(&chain->buffers)) {
2128         timer_mod(chain->drain_timer,
2129               qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + chain->n->rsc_timeout);
2130     }
2131 }
2132 
2133 static void virtio_net_rsc_cleanup(VirtIONet *n)
2134 {
2135     VirtioNetRscChain *chain, *rn_chain;
2136     VirtioNetRscSeg *seg, *rn_seg;
2137 
2138     QTAILQ_FOREACH_SAFE(chain, &n->rsc_chains, next, rn_chain) {
2139         QTAILQ_FOREACH_SAFE(seg, &chain->buffers, next, rn_seg) {
2140             QTAILQ_REMOVE(&chain->buffers, seg, next);
2141             g_free(seg->buf);
2142             g_free(seg);
2143         }
2144 
2145         timer_free(chain->drain_timer);
2146         QTAILQ_REMOVE(&n->rsc_chains, chain, next);
2147         g_free(chain);
2148     }
2149 }
2150 
2151 static void virtio_net_rsc_cache_buf(VirtioNetRscChain *chain,
2152                                      NetClientState *nc,
2153                                      const uint8_t *buf, size_t size)
2154 {
2155     uint16_t hdr_len;
2156     VirtioNetRscSeg *seg;
2157 
2158     hdr_len = chain->n->guest_hdr_len;
2159     seg = g_new(VirtioNetRscSeg, 1);
2160     seg->buf = g_malloc(hdr_len + sizeof(struct eth_header)
2161         + sizeof(struct ip6_header) + VIRTIO_NET_MAX_TCP_PAYLOAD);
2162     memcpy(seg->buf, buf, size);
2163     seg->size = size;
2164     seg->packets = 1;
2165     seg->dup_ack = 0;
2166     seg->is_coalesced = 0;
2167     seg->nc = nc;
2168 
2169     QTAILQ_INSERT_TAIL(&chain->buffers, seg, next);
2170     chain->stat.cache++;
2171 
2172     switch (chain->proto) {
2173     case ETH_P_IP:
2174         virtio_net_rsc_extract_unit4(chain, seg->buf, &seg->unit);
2175         break;
2176     case ETH_P_IPV6:
2177         virtio_net_rsc_extract_unit6(chain, seg->buf, &seg->unit);
2178         break;
2179     default:
2180         g_assert_not_reached();
2181     }
2182 }
2183 
2184 static int32_t virtio_net_rsc_handle_ack(VirtioNetRscChain *chain,
2185                                          VirtioNetRscSeg *seg,
2186                                          const uint8_t *buf,
2187                                          struct tcp_header *n_tcp,
2188                                          struct tcp_header *o_tcp)
2189 {
2190     uint32_t nack, oack;
2191     uint16_t nwin, owin;
2192 
2193     nack = htonl(n_tcp->th_ack);
2194     nwin = htons(n_tcp->th_win);
2195     oack = htonl(o_tcp->th_ack);
2196     owin = htons(o_tcp->th_win);
2197 
2198     if ((nack - oack) >= VIRTIO_NET_MAX_TCP_PAYLOAD) {
2199         chain->stat.ack_out_of_win++;
2200         return RSC_FINAL;
2201     } else if (nack == oack) {
2202         /* duplicated ack or window probe */
2203         if (nwin == owin) {
2204             /* duplicated ack: count it, the WHQL test expects up to 1 dup ack */
2205             chain->stat.dup_ack++;
2206             return RSC_FINAL;
2207         } else {
2208             /* Coalesce window update */
2209             o_tcp->th_win = n_tcp->th_win;
2210             chain->stat.win_update++;
2211             return RSC_COALESCE;
2212         }
2213     } else {
2214         /* pure ack, finalize */
2215         chain->stat.pure_ack++;
2216         return RSC_FINAL;
2217     }
2218 }
2219 
2220 static int32_t virtio_net_rsc_coalesce_data(VirtioNetRscChain *chain,
2221                                             VirtioNetRscSeg *seg,
2222                                             const uint8_t *buf,
2223                                             VirtioNetRscUnit *n_unit)
2224 {
2225     void *data;
2226     uint16_t o_ip_len;
2227     uint32_t nseq, oseq;
2228     VirtioNetRscUnit *o_unit;
2229 
2230     o_unit = &seg->unit;
2231     o_ip_len = htons(*o_unit->ip_plen);
2232     nseq = htonl(n_unit->tcp->th_seq);
2233     oseq = htonl(o_unit->tcp->th_seq);
2234 
2235     /* out of order or retransmitted. */
2236     if ((nseq - oseq) > VIRTIO_NET_MAX_TCP_PAYLOAD) {
2237         chain->stat.data_out_of_win++;
2238         return RSC_FINAL;
2239     }
2240 
2241     data = ((uint8_t *)n_unit->tcp) + n_unit->tcp_hdrlen;
2242     if (nseq == oseq) {
2243         if ((o_unit->payload == 0) && n_unit->payload) {
2244             /* Going from no payload to payload: the normal case, not a dup ack */
2245             chain->stat.data_after_pure_ack++;
2246             goto coalesce;
2247         } else {
2248             return virtio_net_rsc_handle_ack(chain, seg, buf,
2249                                              n_unit->tcp, o_unit->tcp);
2250         }
2251     } else if ((nseq - oseq) != o_unit->payload) {
2252         /* Not a consistent packet, out of order */
2253         chain->stat.data_out_of_order++;
2254         return RSC_FINAL;
2255     } else {
2256 coalesce:
2257         if ((o_ip_len + n_unit->payload) > chain->max_payload) {
2258             chain->stat.over_size++;
2259             return RSC_FINAL;
2260         }
2261 
2262         /* The data is in order; the payload length field differs between
2263            v4 and v6, so use the unit's value to update the new data len */
2264         o_unit->payload += n_unit->payload; /* update new data len */
2265 
2266         /* update field in ip header */
2267         *o_unit->ip_plen = htons(o_ip_len + n_unit->payload);
2268 
2269         /* Take over the latest 'PUSH' flag: the WHQL test guide says 'PUSH'
2270            can be coalesced for Windows guests, though this may change the
2271            behavior for Linux guests (only if they use the RSC feature). */
2272         o_unit->tcp->th_offset_flags = n_unit->tcp->th_offset_flags;
2273 
2274         o_unit->tcp->th_ack = n_unit->tcp->th_ack;
2275         o_unit->tcp->th_win = n_unit->tcp->th_win;
2276 
2277         memmove(seg->buf + seg->size, data, n_unit->payload);
2278         seg->size += n_unit->payload;
2279         seg->packets++;
2280         chain->stat.coalesced++;
2281         return RSC_COALESCE;
2282     }
2283 }
2284 
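     /*
      * The sequence/ack window checks above rely on unsigned wraparound:
      * nseq - oseq is computed in uint32_t, so a value that is logically
      * behind produces a huge difference and fails the
      * '> VIRTIO_NET_MAX_TCP_PAYLOAD' test even across the 2^32 wrap.
      * Compiled-out sketch of the idiom, illustrative only:
      */
     #if 0
     static bool example_seq_in_window(uint32_t nseq, uint32_t oseq,
                                       uint32_t window)
     {
         /* True when nseq is at most 'window' bytes ahead of oseq, mod 2^32. */
         return (uint32_t)(nseq - oseq) <= window;
     }
     #endif
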
2285 static int32_t virtio_net_rsc_coalesce4(VirtioNetRscChain *chain,
2286                                         VirtioNetRscSeg *seg,
2287                                         const uint8_t *buf, size_t size,
2288                                         VirtioNetRscUnit *unit)
2289 {
2290     struct ip_header *ip1, *ip2;
2291 
2292     ip1 = (struct ip_header *)(unit->ip);
2293     ip2 = (struct ip_header *)(seg->unit.ip);
2294     if ((ip1->ip_src ^ ip2->ip_src) || (ip1->ip_dst ^ ip2->ip_dst)
2295         || (unit->tcp->th_sport ^ seg->unit.tcp->th_sport)
2296         || (unit->tcp->th_dport ^ seg->unit.tcp->th_dport)) {
2297         chain->stat.no_match++;
2298         return RSC_NO_MATCH;
2299     }
2300 
2301     return virtio_net_rsc_coalesce_data(chain, seg, buf, unit);
2302 }
2303 
2304 static int32_t virtio_net_rsc_coalesce6(VirtioNetRscChain *chain,
2305                                         VirtioNetRscSeg *seg,
2306                                         const uint8_t *buf, size_t size,
2307                                         VirtioNetRscUnit *unit)
2308 {
2309     struct ip6_header *ip1, *ip2;
2310 
2311     ip1 = (struct ip6_header *)(unit->ip);
2312     ip2 = (struct ip6_header *)(seg->unit.ip);
2313     if (memcmp(&ip1->ip6_src, &ip2->ip6_src, sizeof(struct in6_address))
2314         || memcmp(&ip1->ip6_dst, &ip2->ip6_dst, sizeof(struct in6_address))
2315         || (unit->tcp->th_sport ^ seg->unit.tcp->th_sport)
2316         || (unit->tcp->th_dport ^ seg->unit.tcp->th_dport)) {
2317             chain->stat.no_match++;
2318             return RSC_NO_MATCH;
2319     }
2320 
2321     return virtio_net_rsc_coalesce_data(chain, seg, buf, unit);
2322 }
2323 
2324 /* Packets with 'SYN' set should bypass coalescing; the other control flags
2325  * force a drain first, to prevent out-of-order delivery */
2326 static int virtio_net_rsc_tcp_ctrl_check(VirtioNetRscChain *chain,
2327                                          struct tcp_header *tcp)
2328 {
2329     uint16_t tcp_hdr;
2330     uint16_t tcp_flag;
2331 
2332     tcp_flag = htons(tcp->th_offset_flags);
2333     tcp_hdr = (tcp_flag & VIRTIO_NET_TCP_HDR_LENGTH) >> 10;
2334     tcp_flag &= VIRTIO_NET_TCP_FLAG;
2335     if (tcp_flag & TH_SYN) {
2336         chain->stat.tcp_syn++;
2337         return RSC_BYPASS;
2338     }
2339 
2340     if (tcp_flag & (TH_FIN | TH_URG | TH_RST | TH_ECE | TH_CWR)) {
2341         chain->stat.tcp_ctrl_drain++;
2342         return RSC_FINAL;
2343     }
2344 
2345     if (tcp_hdr > sizeof(struct tcp_header)) {
2346         chain->stat.tcp_all_opt++;
2347         return RSC_FINAL;
2348     }
2349 
2350     return RSC_CANDIDATE;
2351 }
2352 
2353 static size_t virtio_net_rsc_do_coalesce(VirtioNetRscChain *chain,
2354                                          NetClientState *nc,
2355                                          const uint8_t *buf, size_t size,
2356                                          VirtioNetRscUnit *unit)
2357 {
2358     int ret;
2359     VirtioNetRscSeg *seg, *nseg;
2360 
2361     if (QTAILQ_EMPTY(&chain->buffers)) {
2362         chain->stat.empty_cache++;
2363         virtio_net_rsc_cache_buf(chain, nc, buf, size);
2364         timer_mod(chain->drain_timer,
2365               qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + chain->n->rsc_timeout);
2366         return size;
2367     }
2368 
2369     QTAILQ_FOREACH_SAFE(seg, &chain->buffers, next, nseg) {
2370         if (chain->proto == ETH_P_IP) {
2371             ret = virtio_net_rsc_coalesce4(chain, seg, buf, size, unit);
2372         } else {
2373             ret = virtio_net_rsc_coalesce6(chain, seg, buf, size, unit);
2374         }
2375 
2376         if (ret == RSC_FINAL) {
2377             if (virtio_net_rsc_drain_seg(chain, seg) == 0) {
2378                 /* Send failed */
2379                 chain->stat.final_failed++;
2380                 return 0;
2381             }
2382 
2383             /* Send current packet */
2384             return virtio_net_do_receive(nc, buf, size);
2385         } else if (ret == RSC_NO_MATCH) {
2386             continue;
2387         } else {
2388             /* Coalesced; set the flag so the ipv4 checksum is recalculated */
2389             seg->is_coalesced = 1;
2390             return size;
2391         }
2392     }
2393 
2394     chain->stat.no_match_cache++;
2395     virtio_net_rsc_cache_buf(chain, nc, buf, size);
2396     return size;
2397 }
2398 
2399 /* Drain a connection's buffered data to avoid out-of-order segments */
2400 static size_t virtio_net_rsc_drain_flow(VirtioNetRscChain *chain,
2401                                         NetClientState *nc,
2402                                         const uint8_t *buf, size_t size,
2403                                         uint16_t ip_start, uint16_t ip_size,
2404                                         uint16_t tcp_port)
2405 {
2406     VirtioNetRscSeg *seg, *nseg;
2407     uint32_t ppair1, ppair2;
2408 
2409     ppair1 = *(uint32_t *)(buf + tcp_port);
2410     QTAILQ_FOREACH_SAFE(seg, &chain->buffers, next, nseg) {
2411         ppair2 = *(uint32_t *)(seg->buf + tcp_port);
2412         if (memcmp(buf + ip_start, seg->buf + ip_start, ip_size)
2413             || (ppair1 != ppair2)) {
2414             continue;
2415         }
2416         if (virtio_net_rsc_drain_seg(chain, seg) == 0) {
2417             chain->stat.drain_failed++;
2418         }
2419 
2420         break;
2421     }
2422 
2423     return virtio_net_do_receive(nc, buf, size);
2424 }
2425 
2426 static int32_t virtio_net_rsc_sanity_check4(VirtioNetRscChain *chain,
2427                                             struct ip_header *ip,
2428                                             const uint8_t *buf, size_t size)
2429 {
2430     uint16_t ip_len;
2431 
2432     /* Not an ipv4 packet */
2433     if (((ip->ip_ver_len & 0xF0) >> 4) != IP_HEADER_VERSION_4) {
2434         chain->stat.ip_option++;
2435         return RSC_BYPASS;
2436     }
2437 
2438     /* Don't handle packets with ip option */
2439     if ((ip->ip_ver_len & 0xF) != VIRTIO_NET_IP4_HEADER_LENGTH) {
2440         chain->stat.ip_option++;
2441         return RSC_BYPASS;
2442     }
2443 
2444     if (ip->ip_p != IPPROTO_TCP) {
2445         chain->stat.bypass_not_tcp++;
2446         return RSC_BYPASS;
2447     }
2448 
2449     /* Don't handle packets with ip fragment */
2450     if (!(htons(ip->ip_off) & IP_DF)) {
2451         chain->stat.ip_frag++;
2452         return RSC_BYPASS;
2453     }
2454 
2455     /* Don't handle packets with ecn flag */
2456     if (IPTOS_ECN(ip->ip_tos)) {
2457         chain->stat.ip_ecn++;
2458         return RSC_BYPASS;
2459     }
2460 
2461     ip_len = htons(ip->ip_len);
2462     if (ip_len < (sizeof(struct ip_header) + sizeof(struct tcp_header))
2463         || ip_len > (size - chain->n->guest_hdr_len -
2464                      sizeof(struct eth_header))) {
2465         chain->stat.ip_hacked++;
2466         return RSC_BYPASS;
2467     }
2468 
2469     return RSC_CANDIDATE;
2470 }
2471 
2472 static size_t virtio_net_rsc_receive4(VirtioNetRscChain *chain,
2473                                       NetClientState *nc,
2474                                       const uint8_t *buf, size_t size)
2475 {
2476     int32_t ret;
2477     uint16_t hdr_len;
2478     VirtioNetRscUnit unit;
2479 
2480     hdr_len = chain->n->guest_hdr_len;
2481 
2482     if (size < (hdr_len + sizeof(struct eth_header) + sizeof(struct ip_header)
2483         + sizeof(struct tcp_header))) {
2484         chain->stat.bypass_not_tcp++;
2485         return virtio_net_do_receive(nc, buf, size);
2486     }
2487 
2488     virtio_net_rsc_extract_unit4(chain, buf, &unit);
2489     if (virtio_net_rsc_sanity_check4(chain, unit.ip, buf, size)
2490         != RSC_CANDIDATE) {
2491         return virtio_net_do_receive(nc, buf, size);
2492     }
2493 
2494     ret = virtio_net_rsc_tcp_ctrl_check(chain, unit.tcp);
2495     if (ret == RSC_BYPASS) {
2496         return virtio_net_do_receive(nc, buf, size);
2497     } else if (ret == RSC_FINAL) {
2498         return virtio_net_rsc_drain_flow(chain, nc, buf, size,
2499                 ((hdr_len + sizeof(struct eth_header)) + 12),
2500                 VIRTIO_NET_IP4_ADDR_SIZE,
2501                 hdr_len + sizeof(struct eth_header) + sizeof(struct ip_header));
2502     }
2503 
2504     return virtio_net_rsc_do_coalesce(chain, nc, buf, size, &unit);
2505 }
2506 
2507 static int32_t virtio_net_rsc_sanity_check6(VirtioNetRscChain *chain,
2508                                             struct ip6_header *ip6,
2509                                             const uint8_t *buf, size_t size)
2510 {
2511     uint16_t ip_len;
2512 
2513     if (((ip6->ip6_ctlun.ip6_un1.ip6_un1_flow & 0xF0) >> 4)
2514         != IP_HEADER_VERSION_6) {
2515         return RSC_BYPASS;
2516     }
2517 
2518     /* Both extension headers and the protocol are checked by this test */
2519     if (ip6->ip6_ctlun.ip6_un1.ip6_un1_nxt != IPPROTO_TCP) {
2520         chain->stat.bypass_not_tcp++;
2521         return RSC_BYPASS;
2522     }
2523 
2524     ip_len = htons(ip6->ip6_ctlun.ip6_un1.ip6_un1_plen);
2525     if (ip_len < sizeof(struct tcp_header) ||
2526         ip_len > (size - chain->n->guest_hdr_len - sizeof(struct eth_header)
2527                   - sizeof(struct ip6_header))) {
2528         chain->stat.ip_hacked++;
2529         return RSC_BYPASS;
2530     }
2531 
2532     /* Don't handle packets with ecn flag */
2533     if (IP6_ECN(ip6->ip6_ctlun.ip6_un3.ip6_un3_ecn)) {
2534         chain->stat.ip_ecn++;
2535         return RSC_BYPASS;
2536     }
2537 
2538     return RSC_CANDIDATE;
2539 }
2540 
2541 static size_t virtio_net_rsc_receive6(void *opq, NetClientState *nc,
2542                                       const uint8_t *buf, size_t size)
2543 {
2544     int32_t ret;
2545     uint16_t hdr_len;
2546     VirtioNetRscChain *chain;
2547     VirtioNetRscUnit unit;
2548 
2549     chain = opq;
2550     hdr_len = chain->n->guest_hdr_len;
2551 
2552     if (size < (hdr_len + sizeof(struct eth_header) + sizeof(struct ip6_header)
2553         + sizeof(struct tcp_header))) {
2554         return virtio_net_do_receive(nc, buf, size);
2555     }
2556 
2557     virtio_net_rsc_extract_unit6(chain, buf, &unit);
2558     if (virtio_net_rsc_sanity_check6(chain, unit.ip, buf, size)
2559         != RSC_CANDIDATE) {
2560         return virtio_net_do_receive(nc, buf, size);
2561     }
2562 
2563     ret = virtio_net_rsc_tcp_ctrl_check(chain, unit.tcp);
2564     if (ret == RSC_BYPASS) {
2565         return virtio_net_do_receive(nc, buf, size);
2566     } else if (ret == RSC_FINAL) {
2567         return virtio_net_rsc_drain_flow(chain, nc, buf, size,
2568                 ((hdr_len + sizeof(struct eth_header)) + 8),
2569                 VIRTIO_NET_IP6_ADDR_SIZE,
2570                 hdr_len + sizeof(struct eth_header)
2571                 + sizeof(struct ip6_header));
2572     }
2573 
2574     return virtio_net_rsc_do_coalesce(chain, nc, buf, size, &unit);
2575 }
2576 
2577 static VirtioNetRscChain *virtio_net_rsc_lookup_chain(VirtIONet *n,
2578                                                       NetClientState *nc,
2579                                                       uint16_t proto)
2580 {
2581     VirtioNetRscChain *chain;
2582 
2583     if ((proto != (uint16_t)ETH_P_IP) && (proto != (uint16_t)ETH_P_IPV6)) {
2584         return NULL;
2585     }
2586 
2587     QTAILQ_FOREACH(chain, &n->rsc_chains, next) {
2588         if (chain->proto == proto) {
2589             return chain;
2590         }
2591     }
2592 
2593     chain = g_malloc(sizeof(*chain));
2594     chain->n = n;
2595     chain->proto = proto;
2596     if (proto == (uint16_t)ETH_P_IP) {
2597         chain->max_payload = VIRTIO_NET_MAX_IP4_PAYLOAD;
2598         chain->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
2599     } else {
2600         chain->max_payload = VIRTIO_NET_MAX_IP6_PAYLOAD;
2601         chain->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
2602     }
2603     chain->drain_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
2604                                       virtio_net_rsc_purge, chain);
2605     memset(&chain->stat, 0, sizeof(chain->stat));
2606 
2607     QTAILQ_INIT(&chain->buffers);
2608     QTAILQ_INSERT_TAIL(&n->rsc_chains, chain, next);
2609 
2610     return chain;
2611 }
2612 
2613 static ssize_t virtio_net_rsc_receive(NetClientState *nc,
2614                                       const uint8_t *buf,
2615                                       size_t size)
2616 {
2617     uint16_t proto;
2618     VirtioNetRscChain *chain;
2619     struct eth_header *eth;
2620     VirtIONet *n;
2621 
2622     n = qemu_get_nic_opaque(nc);
2623     if (size < (n->host_hdr_len + sizeof(struct eth_header))) {
2624         return virtio_net_do_receive(nc, buf, size);
2625     }
2626 
2627     eth = (struct eth_header *)(buf + n->guest_hdr_len);
2628     proto = htons(eth->h_proto);
2629 
2630     chain = virtio_net_rsc_lookup_chain(n, nc, proto);
2631     if (chain) {
2632         chain->stat.received++;
2633         if (proto == (uint16_t)ETH_P_IP && n->rsc4_enabled) {
2634             return virtio_net_rsc_receive4(chain, nc, buf, size);
2635         } else if (proto == (uint16_t)ETH_P_IPV6 && n->rsc6_enabled) {
2636             return virtio_net_rsc_receive6(chain, nc, buf, size);
2637         }
2638     }
2639     return virtio_net_do_receive(nc, buf, size);
2640 }
2641 
2642 static ssize_t virtio_net_receive(NetClientState *nc, const uint8_t *buf,
2643                                   size_t size)
2644 {
2645     VirtIONet *n = qemu_get_nic_opaque(nc);
2646     if (n->rsc4_enabled || n->rsc6_enabled) {
2647         return virtio_net_rsc_receive(nc, buf, size);
2648     } else {
2649         return virtio_net_do_receive(nc, buf, size);
2650     }
2651 }
2652 
2653 static int32_t virtio_net_flush_tx(VirtIONetQueue *q);
2654 
2655 static void virtio_net_tx_complete(NetClientState *nc, ssize_t len)
2656 {
2657     VirtIONet *n = qemu_get_nic_opaque(nc);
2658     VirtIONetQueue *q = virtio_net_get_subqueue(nc);
2659     VirtIODevice *vdev = VIRTIO_DEVICE(n);
2660     int ret;
2661 
2662     virtqueue_push(q->tx_vq, q->async_tx.elem, 0);
2663     virtio_notify(vdev, q->tx_vq);
2664 
2665     g_free(q->async_tx.elem);
2666     q->async_tx.elem = NULL;
2667 
2668     virtio_queue_set_notification(q->tx_vq, 1);
2669     ret = virtio_net_flush_tx(q);
2670     if (ret >= n->tx_burst) {
2671         /*
2672          * the flush has been stopped by tx_burst;
2673          * we will not receive a notification for the
2674          * remaining part, so re-schedule
2675          */
2676         virtio_queue_set_notification(q->tx_vq, 0);
2677         if (q->tx_bh) {
2678             replay_bh_schedule_event(q->tx_bh);
2679         } else {
2680             timer_mod(q->tx_timer,
2681                       qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + n->tx_timeout);
2682         }
2683         q->tx_waiting = 1;
2684     }
2685 }
2686 
2687 /* TX */
2688 static int32_t virtio_net_flush_tx(VirtIONetQueue *q)
2689 {
2690     VirtIONet *n = q->n;
2691     VirtIODevice *vdev = VIRTIO_DEVICE(n);
2692     VirtQueueElement *elem;
2693     int32_t num_packets = 0;
2694     int queue_index = vq2q(virtio_get_queue_index(q->tx_vq));
2695     if (!(vdev->status & VIRTIO_CONFIG_S_DRIVER_OK)) {
2696         return num_packets;
2697     }
2698 
2699     if (q->async_tx.elem) {
2700         virtio_queue_set_notification(q->tx_vq, 0);
2701         return num_packets;
2702     }
2703 
2704     for (;;) {
2705         ssize_t ret;
2706         unsigned int out_num;
2707         struct iovec sg[VIRTQUEUE_MAX_SIZE], sg2[VIRTQUEUE_MAX_SIZE + 1], *out_sg;
2708         struct virtio_net_hdr vhdr;
2709 
2710         elem = virtqueue_pop(q->tx_vq, sizeof(VirtQueueElement));
2711         if (!elem) {
2712             break;
2713         }
2714 
2715         out_num = elem->out_num;
2716         out_sg = elem->out_sg;
2717         if (out_num < 1) {
2718             virtio_error(vdev, "virtio-net header not in first element");
2719             goto detach;
2720         }
2721 
2722         if (n->needs_vnet_hdr_swap) {
2723             if (iov_to_buf(out_sg, out_num, 0, &vhdr, sizeof(vhdr)) <
2724                 sizeof(vhdr)) {
2725                 virtio_error(vdev, "virtio-net header incorrect");
2726                 goto detach;
2727             }
2728             virtio_net_hdr_swap(vdev, &vhdr);
2729             sg2[0].iov_base = &vhdr;
2730             sg2[0].iov_len = sizeof(vhdr);
2731             out_num = iov_copy(&sg2[1], ARRAY_SIZE(sg2) - 1, out_sg, out_num,
2732                                sizeof(vhdr), -1);
2733             if (out_num == VIRTQUEUE_MAX_SIZE) {
2734                 goto drop;
2735             }
2736             out_num += 1;
2737             out_sg = sg2;
2738         }
2739         /*
2740          * If host wants to see the guest header as is, we can
2741          * pass it on unchanged. Otherwise, copy just the parts
2742          * that host is interested in.
2743          */
2744         assert(n->host_hdr_len <= n->guest_hdr_len);
2745         if (n->host_hdr_len != n->guest_hdr_len) {
2746             if (iov_size(out_sg, out_num) < n->guest_hdr_len) {
2747                 virtio_error(vdev, "virtio-net header is invalid");
2748                 goto detach;
2749             }
2750             unsigned sg_num = iov_copy(sg, ARRAY_SIZE(sg),
2751                                        out_sg, out_num,
2752                                        0, n->host_hdr_len);
2753             sg_num += iov_copy(sg + sg_num, ARRAY_SIZE(sg) - sg_num,
2754                              out_sg, out_num,
2755                              n->guest_hdr_len, -1);
2756             out_num = sg_num;
2757             out_sg = sg;
2758 
2759             if (out_num < 1) {
2760                 virtio_error(vdev, "virtio-net nothing to send");
2761                 goto detach;
2762             }
2763         }
2764 
2765         ret = qemu_sendv_packet_async(qemu_get_subqueue(n->nic, queue_index),
2766                                       out_sg, out_num, virtio_net_tx_complete);
2767         if (ret == 0) {
2768             virtio_queue_set_notification(q->tx_vq, 0);
2769             q->async_tx.elem = elem;
2770             return -EBUSY;
2771         }
2772 
2773 drop:
2774         virtqueue_push(q->tx_vq, elem, 0);
2775         virtio_notify(vdev, q->tx_vq);
2776         g_free(elem);
2777 
2778         if (++num_packets >= n->tx_burst) {
2779             break;
2780         }
2781     }
2782     return num_packets;
2783 
2784 detach:
2785     virtqueue_detach_element(q->tx_vq, elem, 0);
2786     g_free(elem);
2787     return -EINVAL;
2788 }
2789 
2790 static void virtio_net_tx_timer(void *opaque);
2791 
2792 static void virtio_net_handle_tx_timer(VirtIODevice *vdev, VirtQueue *vq)
2793 {
2794     VirtIONet *n = VIRTIO_NET(vdev);
2795     VirtIONetQueue *q = &n->vqs[vq2q(virtio_get_queue_index(vq))];
2796 
2797     if (unlikely((n->status & VIRTIO_NET_S_LINK_UP) == 0)) {
2798         virtio_net_drop_tx_queue_data(vdev, vq);
2799         return;
2800     }
2801 
2802     /* This happens when device was stopped but VCPU wasn't. */
2803     if (!vdev->vm_running) {
2804         q->tx_waiting = 1;
2805         return;
2806     }
2807 
2808     if (q->tx_waiting) {
2809         /* We already have queued packets, immediately flush */
2810         timer_del(q->tx_timer);
2811         virtio_net_tx_timer(q);
2812     } else {
2813         /* re-arm timer to flush it (and more) on next tick */
2814         timer_mod(q->tx_timer,
2815                   qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + n->tx_timeout);
2816         q->tx_waiting = 1;
2817         virtio_queue_set_notification(vq, 0);
2818     }
2819 }
2820 
2821 static void virtio_net_handle_tx_bh(VirtIODevice *vdev, VirtQueue *vq)
2822 {
2823     VirtIONet *n = VIRTIO_NET(vdev);
2824     VirtIONetQueue *q = &n->vqs[vq2q(virtio_get_queue_index(vq))];
2825 
2826     if (unlikely(n->vhost_started)) {
2827         return;
2828     }
2829 
2830     if (unlikely((n->status & VIRTIO_NET_S_LINK_UP) == 0)) {
2831         virtio_net_drop_tx_queue_data(vdev, vq);
2832         return;
2833     }
2834 
2835     if (unlikely(q->tx_waiting)) {
2836         return;
2837     }
2838     q->tx_waiting = 1;
2839     /* This happens when device was stopped but VCPU wasn't. */
2840     if (!vdev->vm_running) {
2841         return;
2842     }
2843     virtio_queue_set_notification(vq, 0);
2844     replay_bh_schedule_event(q->tx_bh);
2845 }
2846 
2847 static void virtio_net_tx_timer(void *opaque)
2848 {
2849     VirtIONetQueue *q = opaque;
2850     VirtIONet *n = q->n;
2851     VirtIODevice *vdev = VIRTIO_DEVICE(n);
2852     int ret;
2853 
2854     /* This happens when device was stopped but BH wasn't. */
2855     if (!vdev->vm_running) {
2856         /* Make sure tx waiting is set, so we'll run when restarted. */
2857         assert(q->tx_waiting);
2858         return;
2859     }
2860 
2861     q->tx_waiting = 0;
2862 
2863     /* Just in case the driver is not ready anymore */
2864     if (!(vdev->status & VIRTIO_CONFIG_S_DRIVER_OK)) {
2865         return;
2866     }
2867 
2868     ret = virtio_net_flush_tx(q);
2869     if (ret == -EBUSY || ret == -EINVAL) {
2870         return;
2871     }
2872     /*
2873      * If we flush a full burst of packets, assume there are
2874      * more coming and immediately rearm
2875      */
2876     if (ret >= n->tx_burst) {
2877         q->tx_waiting = 1;
2878         timer_mod(q->tx_timer,
2879                   qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + n->tx_timeout);
2880         return;
2881     }
2882     /*
2883      * If less than a full burst, re-enable notification and flush
2884      * anything that may have come in while we weren't looking.  If
2885      * we find something, assume the guest is still active and rearm
2886      */
2887     virtio_queue_set_notification(q->tx_vq, 1);
2888     ret = virtio_net_flush_tx(q);
2889     if (ret > 0) {
2890         virtio_queue_set_notification(q->tx_vq, 0);
2891         q->tx_waiting = 1;
2892         timer_mod(q->tx_timer,
2893                   qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + n->tx_timeout);
2894     }
2895 }
2896 
2897 static void virtio_net_tx_bh(void *opaque)
2898 {
2899     VirtIONetQueue *q = opaque;
2900     VirtIONet *n = q->n;
2901     VirtIODevice *vdev = VIRTIO_DEVICE(n);
2902     int32_t ret;
2903 
2904     /* This happens when device was stopped but BH wasn't. */
2905     if (!vdev->vm_running) {
2906         /* Make sure tx waiting is set, so we'll run when restarted. */
2907         assert(q->tx_waiting);
2908         return;
2909     }
2910 
2911     q->tx_waiting = 0;
2912 
2913     /* Just in case the driver is not ready anymore */
2914     if (unlikely(!(vdev->status & VIRTIO_CONFIG_S_DRIVER_OK))) {
2915         return;
2916     }
2917 
2918     ret = virtio_net_flush_tx(q);
2919     if (ret == -EBUSY || ret == -EINVAL) {
2920         return; /* Notification re-enable handled by tx_complete or device
2921                  * broken */
2922     }
2923 
2924     /* If we flush a full burst of packets, assume there are
2925      * more coming and immediately reschedule */
2926     if (ret >= n->tx_burst) {
2927         replay_bh_schedule_event(q->tx_bh);
2928         q->tx_waiting = 1;
2929         return;
2930     }
2931 
2932     /* If less than a full burst, re-enable notification and flush
2933      * anything that may have come in while we weren't looking.  If
2934      * we find something, assume the guest is still active and reschedule */
2935     virtio_queue_set_notification(q->tx_vq, 1);
2936     ret = virtio_net_flush_tx(q);
2937     if (ret == -EINVAL) {
2938         return;
2939     } else if (ret > 0) {
2940         virtio_queue_set_notification(q->tx_vq, 0);
2941         replay_bh_schedule_event(q->tx_bh);
2942         q->tx_waiting = 1;
2943     }
2944 }
2945 
2946 static void virtio_net_add_queue(VirtIONet *n, int index)
2947 {
2948     VirtIODevice *vdev = VIRTIO_DEVICE(n);
2949 
2950     n->vqs[index].rx_vq = virtio_add_queue(vdev, n->net_conf.rx_queue_size,
2951                                            virtio_net_handle_rx);
2952 
2953     if (n->net_conf.tx && !strcmp(n->net_conf.tx, "timer")) {
2954         n->vqs[index].tx_vq =
2955             virtio_add_queue(vdev, n->net_conf.tx_queue_size,
2956                              virtio_net_handle_tx_timer);
2957         n->vqs[index].tx_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
2958                                               virtio_net_tx_timer,
2959                                               &n->vqs[index]);
2960     } else {
2961         n->vqs[index].tx_vq =
2962             virtio_add_queue(vdev, n->net_conf.tx_queue_size,
2963                              virtio_net_handle_tx_bh);
2964         n->vqs[index].tx_bh = qemu_bh_new_guarded(virtio_net_tx_bh, &n->vqs[index],
2965                                                   &DEVICE(vdev)->mem_reentrancy_guard);
2966     }
2967 
2968     n->vqs[index].tx_waiting = 0;
2969     n->vqs[index].n = n;
2970 }
2971 
2972 static void virtio_net_del_queue(VirtIONet *n, int index)
2973 {
2974     VirtIODevice *vdev = VIRTIO_DEVICE(n);
2975     VirtIONetQueue *q = &n->vqs[index];
2976     NetClientState *nc = qemu_get_subqueue(n->nic, index);
2977 
2978     qemu_purge_queued_packets(nc);
2979 
2980     virtio_del_queue(vdev, index * 2);
2981     if (q->tx_timer) {
2982         timer_free(q->tx_timer);
2983         q->tx_timer = NULL;
2984     } else {
2985         qemu_bh_delete(q->tx_bh);
2986         q->tx_bh = NULL;
2987     }
2988     q->tx_waiting = 0;
2989     virtio_del_queue(vdev, index * 2 + 1);
2990 }
2991 
2992 static void virtio_net_change_num_queue_pairs(VirtIONet *n, int new_max_queue_pairs)
2993 {
2994     VirtIODevice *vdev = VIRTIO_DEVICE(n);
2995     int old_num_queues = virtio_get_num_queues(vdev);
2996     int new_num_queues = new_max_queue_pairs * 2 + 1;
2997     int i;
2998 
2999     assert(old_num_queues >= 3);
3000     assert(old_num_queues % 2 == 1);
3001 
3002     if (old_num_queues == new_num_queues) {
3003         return;
3004     }
3005 
3006     /*
3007      * We always need to remove and add ctrl vq if
3008      * old_num_queues != new_num_queues. Remove ctrl_vq first,
3009      * and then we only enter one of the following two loops.
3010      */
3011     virtio_del_queue(vdev, old_num_queues - 1);
3012 
3013     for (i = new_num_queues - 1; i < old_num_queues - 1; i += 2) {
3014         /* new_num_queues < old_num_queues */
3015         virtio_net_del_queue(n, i / 2);
3016     }
3017 
3018     for (i = old_num_queues - 1; i < new_num_queues - 1; i += 2) {
3019         /* new_num_queues > old_num_queues */
3020         virtio_net_add_queue(n, i / 2);
3021     }
3022 
3023     /* add ctrl_vq last */
3024     n->ctrl_vq = virtio_add_queue(vdev, 64, virtio_net_handle_ctrl);
3025 }
3026 
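     /*
      * Virtqueue layout assumed by the resize logic above: queue 2*i is
      * the RX queue of pair i, queue 2*i + 1 is its TX queue, and the
      * control queue is always last, hence num_queues = 2 * pairs + 1 and
      * the del/add loops striding by two.  Compiled-out helpers spelling
      * out the index math, illustrative only:
      */
     #if 0
     static int example_pair_of_vq(int vq_index) { return vq_index / 2; }
     static int example_rx_vq_of(int pair)       { return pair * 2; }
     static int example_tx_vq_of(int pair)       { return pair * 2 + 1; }
     static int example_ctrl_vq(int max_pairs)   { return max_pairs * 2; }
     #endif
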
3027 static void virtio_net_set_multiqueue(VirtIONet *n, int multiqueue)
3028 {
3029     int max = multiqueue ? n->max_queue_pairs : 1;
3030 
3031     n->multiqueue = multiqueue;
3032     virtio_net_change_num_queue_pairs(n, max);
3033 
3034     virtio_net_set_queue_pairs(n);
3035 }
3036 
3037 static int virtio_net_post_load_device(void *opaque, int version_id)
3038 {
3039     VirtIONet *n = opaque;
3040     VirtIODevice *vdev = VIRTIO_DEVICE(n);
3041     int i, link_down;
3042 
3043     trace_virtio_net_post_load_device();
3044     virtio_net_set_mrg_rx_bufs(n, n->mergeable_rx_bufs,
3045                                virtio_vdev_has_feature(vdev,
3046                                                        VIRTIO_F_VERSION_1),
3047                                virtio_vdev_has_feature(vdev,
3048                                                        VIRTIO_NET_F_HASH_REPORT));
3049 
3050     /* MAC_TABLE_ENTRIES may be different from the saved image */
3051     if (n->mac_table.in_use > MAC_TABLE_ENTRIES) {
3052         n->mac_table.in_use = 0;
3053     }
3054 
3055     if (!virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS)) {
3056         n->curr_guest_offloads = virtio_net_supported_guest_offloads(n);
3057     }
3058 
3059     /*
3060      * curr_guest_offloads will be later overwritten by the
3061      * virtio_set_features_nocheck call done from the virtio_load.
3062      * Here we make sure it is preserved and restored accordingly
3063      * in the virtio_net_post_load_virtio callback.
3064      */
3065     n->saved_guest_offloads = n->curr_guest_offloads;
3066 
3067     virtio_net_set_queue_pairs(n);
3068 
3069     /* Find the first multicast entry in the saved MAC filter */
3070     for (i = 0; i < n->mac_table.in_use; i++) {
3071         if (n->mac_table.macs[i * ETH_ALEN] & 1) {
3072             break;
3073         }
3074     }
3075     n->mac_table.first_multi = i;
3076 
3077     /* nc.link_down can't be migrated, so infer link_down according
3078      * to link status bit in n->status */
3079     link_down = (n->status & VIRTIO_NET_S_LINK_UP) == 0;
3080     for (i = 0; i < n->max_queue_pairs; i++) {
3081         qemu_get_subqueue(n->nic, i)->link_down = link_down;
3082     }
3083 
3084     if (virtio_vdev_has_feature(vdev, VIRTIO_NET_F_GUEST_ANNOUNCE) &&
3085         virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ)) {
3086         qemu_announce_timer_reset(&n->announce_timer, migrate_announce_params(),
3087                                   QEMU_CLOCK_VIRTUAL,
3088                                   virtio_net_announce_timer, n);
3089         if (n->announce_timer.round) {
3090             timer_mod(n->announce_timer.tm,
3091                       qemu_clock_get_ms(n->announce_timer.type));
3092         } else {
3093             qemu_announce_timer_del(&n->announce_timer, false);
3094         }
3095     }
3096 
3097     virtio_net_commit_rss_config(n);
3098     return 0;
3099 }
3100 
3101 static int virtio_net_post_load_virtio(VirtIODevice *vdev)
3102 {
3103     VirtIONet *n = VIRTIO_NET(vdev);
3104     /*
3105      * The actual needed state is now in saved_guest_offloads,
3106      * see virtio_net_post_load_device for detail.
3107      * Restore it back and apply the desired offloads.
3108      */
3109     n->curr_guest_offloads = n->saved_guest_offloads;
3110     if (peer_has_vnet_hdr(n)) {
3111         virtio_net_apply_guest_offloads(n);
3112     }
3113 
3114     return 0;
3115 }
3116 
3117 /* tx_waiting field of a VirtIONetQueue */
3118 static const VMStateDescription vmstate_virtio_net_queue_tx_waiting = {
3119     .name = "virtio-net-queue-tx_waiting",
3120     .fields = (const VMStateField[]) {
3121         VMSTATE_UINT32(tx_waiting, VirtIONetQueue),
3122         VMSTATE_END_OF_LIST()
3123     },
3124 };
3125 
3126 static bool max_queue_pairs_gt_1(void *opaque, int version_id)
3127 {
3128     return VIRTIO_NET(opaque)->max_queue_pairs > 1;
3129 }
3130 
3131 static bool has_ctrl_guest_offloads(void *opaque, int version_id)
3132 {
3133     return virtio_vdev_has_feature(VIRTIO_DEVICE(opaque),
3134                                    VIRTIO_NET_F_CTRL_GUEST_OFFLOADS);
3135 }
3136 
3137 static bool mac_table_fits(void *opaque, int version_id)
3138 {
3139     return VIRTIO_NET(opaque)->mac_table.in_use <= MAC_TABLE_ENTRIES;
3140 }
3141 
3142 static bool mac_table_doesnt_fit(void *opaque, int version_id)
3143 {
3144     return !mac_table_fits(opaque, version_id);
3145 }
3146 
3147 /* This temporary type is shared by all the WITH_TMP methods
3148  * although only some fields are used by each.
3149  */
3150 struct VirtIONetMigTmp {
3151     VirtIONet      *parent;
3152     VirtIONetQueue *vqs_1;
3153     uint16_t        curr_queue_pairs_1;
3154     uint8_t         has_ufo;
3155     uint32_t        has_vnet_hdr;
3156 };
3157 
3158 /* The 2nd and subsequent tx_waiting flags are loaded later than
3159  * the 1st entry in the queue_pairs and only if there's more than one
3160  * entry.  We use the tmp mechanism to calculate a temporary
3161  * pointer and count and also validate the count.
3162  */
3163 
3164 static int virtio_net_tx_waiting_pre_save(void *opaque)
3165 {
3166     struct VirtIONetMigTmp *tmp = opaque;
3167 
3168     tmp->vqs_1 = tmp->parent->vqs + 1;
3169     tmp->curr_queue_pairs_1 = tmp->parent->curr_queue_pairs - 1;
3170     if (tmp->parent->curr_queue_pairs == 0) {
3171         tmp->curr_queue_pairs_1 = 0;
3172     }
3173 
3174     return 0;
3175 }
3176 
3177 static int virtio_net_tx_waiting_pre_load(void *opaque)
3178 {
3179     struct VirtIONetMigTmp *tmp = opaque;
3180 
3181     /* Reuse the pointer setup from save */
3182     virtio_net_tx_waiting_pre_save(opaque);
3183 
3184     if (tmp->parent->curr_queue_pairs > tmp->parent->max_queue_pairs) {
        error_report("virtio-net: curr_queue_pairs %x > max_queue_pairs %x",
                     tmp->parent->curr_queue_pairs,
                     tmp->parent->max_queue_pairs);
3187 
3188         return -EINVAL;
3189     }
3190 
3191     return 0; /* all good */
3192 }
3193 
3194 static const VMStateDescription vmstate_virtio_net_tx_waiting = {
3195     .name      = "virtio-net-tx_waiting",
3196     .pre_load  = virtio_net_tx_waiting_pre_load,
3197     .pre_save  = virtio_net_tx_waiting_pre_save,
3198     .fields    = (const VMStateField[]) {
3199         VMSTATE_STRUCT_VARRAY_POINTER_UINT16(vqs_1, struct VirtIONetMigTmp,
3200                                      curr_queue_pairs_1,
3201                                      vmstate_virtio_net_queue_tx_waiting,
3202                                      struct VirtIONetQueue),
3203         VMSTATE_END_OF_LIST()
3204     },
3205 };
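
/*
 * Worked example: with curr_queue_pairs == 4, pre_save sets vqs_1 to
 * &vqs[1] and curr_queue_pairs_1 to 3, so the tx_waiting flags of queue
 * pairs 1..3 travel through this vmsd while vqs[0] travels through the
 * VMSTATE_STRUCT_POINTER entry in the main device vmsd below.
 */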
3206 
/* The 'has_ufo' flag is just tested; if the incoming stream has the
 * flag set, we need to check that our peer supports it too.
 */
3210 static int virtio_net_ufo_post_load(void *opaque, int version_id)
3211 {
3212     struct VirtIONetMigTmp *tmp = opaque;
3213 
3214     if (tmp->has_ufo && !peer_has_ufo(tmp->parent)) {
3215         error_report("virtio-net: saved image requires TUN_F_UFO support");
3216         return -EINVAL;
3217     }
3218 
3219     return 0;
3220 }
3221 
3222 static int virtio_net_ufo_pre_save(void *opaque)
3223 {
3224     struct VirtIONetMigTmp *tmp = opaque;
3225 
3226     tmp->has_ufo = tmp->parent->has_ufo;
3227 
3228     return 0;
3229 }
3230 
3231 static const VMStateDescription vmstate_virtio_net_has_ufo = {
3232     .name      = "virtio-net-ufo",
3233     .post_load = virtio_net_ufo_post_load,
3234     .pre_save  = virtio_net_ufo_pre_save,
3235     .fields    = (const VMStateField[]) {
3236         VMSTATE_UINT8(has_ufo, struct VirtIONetMigTmp),
3237         VMSTATE_END_OF_LIST()
3238     },
3239 };
3240 
/* The 'has_vnet_hdr' flag is just tested; if the incoming stream has
 * the flag set, we need to check that our peer supports it too.
 */
3244 static int virtio_net_vnet_post_load(void *opaque, int version_id)
3245 {
3246     struct VirtIONetMigTmp *tmp = opaque;
3247 
3248     if (tmp->has_vnet_hdr && !peer_has_vnet_hdr(tmp->parent)) {
3249         error_report("virtio-net: saved image requires vnet_hdr=on");
3250         return -EINVAL;
3251     }
3252 
3253     return 0;
3254 }
3255 
3256 static int virtio_net_vnet_pre_save(void *opaque)
3257 {
3258     struct VirtIONetMigTmp *tmp = opaque;
3259 
3260     tmp->has_vnet_hdr = tmp->parent->has_vnet_hdr;
3261 
3262     return 0;
3263 }
3264 
3265 static const VMStateDescription vmstate_virtio_net_has_vnet = {
3266     .name      = "virtio-net-vnet",
3267     .post_load = virtio_net_vnet_post_load,
3268     .pre_save  = virtio_net_vnet_pre_save,
3269     .fields    = (const VMStateField[]) {
3270         VMSTATE_UINT32(has_vnet_hdr, struct VirtIONetMigTmp),
3271         VMSTATE_END_OF_LIST()
3272     },
3273 };
3274 
3275 static bool virtio_net_rss_needed(void *opaque)
3276 {
3277     return VIRTIO_NET(opaque)->rss_data.enabled;
3278 }
3279 
3280 static const VMStateDescription vmstate_virtio_net_rss = {
3281     .name      = "virtio-net-device/rss",
3282     .version_id = 1,
3283     .minimum_version_id = 1,
3284     .needed = virtio_net_rss_needed,
3285     .fields = (const VMStateField[]) {
3286         VMSTATE_BOOL(rss_data.enabled, VirtIONet),
3287         VMSTATE_BOOL(rss_data.redirect, VirtIONet),
3288         VMSTATE_BOOL(rss_data.populate_hash, VirtIONet),
3289         VMSTATE_UINT32(rss_data.hash_types, VirtIONet),
3290         VMSTATE_UINT16(rss_data.indirections_len, VirtIONet),
3291         VMSTATE_UINT16(rss_data.default_queue, VirtIONet),
3292         VMSTATE_UINT8_ARRAY(rss_data.key, VirtIONet,
3293                             VIRTIO_NET_RSS_MAX_KEY_SIZE),
3294         VMSTATE_VARRAY_UINT16_ALLOC(rss_data.indirections_table, VirtIONet,
3295                                     rss_data.indirections_len, 0,
3296                                     vmstate_info_uint16, uint16_t),
3297         VMSTATE_END_OF_LIST()
3298     },
3299 };
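
/*
 * Being a subsection, this state is only put on the wire when
 * virtio_net_rss_needed() returns true, so a destination without RSS
 * support can still accept a stream from a source that has RSS
 * disabled.
 */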
3300 
3301 static const VMStateDescription vmstate_virtio_net_device = {
3302     .name = "virtio-net-device",
3303     .version_id = VIRTIO_NET_VM_VERSION,
3304     .minimum_version_id = VIRTIO_NET_VM_VERSION,
3305     .post_load = virtio_net_post_load_device,
3306     .fields = (const VMStateField[]) {
3307         VMSTATE_UINT8_ARRAY(mac, VirtIONet, ETH_ALEN),
3308         VMSTATE_STRUCT_POINTER(vqs, VirtIONet,
3309                                vmstate_virtio_net_queue_tx_waiting,
3310                                VirtIONetQueue),
3311         VMSTATE_UINT32(mergeable_rx_bufs, VirtIONet),
3312         VMSTATE_UINT16(status, VirtIONet),
3313         VMSTATE_UINT8(promisc, VirtIONet),
3314         VMSTATE_UINT8(allmulti, VirtIONet),
3315         VMSTATE_UINT32(mac_table.in_use, VirtIONet),
3316 
        /* Guarded pair: if it fits we load it, else we throw it away
         * - this can happen if the source has a larger MAC table;
         * post-load sets flags in this case.
         */
        VMSTATE_VBUFFER_MULTIPLY(mac_table.macs, VirtIONet,
                                 0, mac_table_fits, mac_table.in_use,
                                 ETH_ALEN),
3324         VMSTATE_UNUSED_VARRAY_UINT32(VirtIONet, mac_table_doesnt_fit, 0,
3325                                      mac_table.in_use, ETH_ALEN),
3326 
        /* Note: this is an array of uint32_t that has always been saved
         * as a raw buffer, so mind the endianness; it is really used as
         * a bitmap, but one built out of the uint32_t words.
         */
3331         VMSTATE_BUFFER_POINTER_UNSAFE(vlans, VirtIONet, 0, MAX_VLAN >> 3),
3332         VMSTATE_WITH_TMP(VirtIONet, struct VirtIONetMigTmp,
3333                          vmstate_virtio_net_has_vnet),
3334         VMSTATE_UINT8(mac_table.multi_overflow, VirtIONet),
3335         VMSTATE_UINT8(mac_table.uni_overflow, VirtIONet),
3336         VMSTATE_UINT8(alluni, VirtIONet),
3337         VMSTATE_UINT8(nomulti, VirtIONet),
3338         VMSTATE_UINT8(nouni, VirtIONet),
3339         VMSTATE_UINT8(nobcast, VirtIONet),
3340         VMSTATE_WITH_TMP(VirtIONet, struct VirtIONetMigTmp,
3341                          vmstate_virtio_net_has_ufo),
3342         VMSTATE_SINGLE_TEST(max_queue_pairs, VirtIONet, max_queue_pairs_gt_1, 0,
3343                             vmstate_info_uint16_equal, uint16_t),
3344         VMSTATE_UINT16_TEST(curr_queue_pairs, VirtIONet, max_queue_pairs_gt_1),
3345         VMSTATE_WITH_TMP(VirtIONet, struct VirtIONetMigTmp,
3346                          vmstate_virtio_net_tx_waiting),
3347         VMSTATE_UINT64_TEST(curr_guest_offloads, VirtIONet,
3348                             has_ctrl_guest_offloads),
3349         VMSTATE_END_OF_LIST()
3350     },
3351     .subsections = (const VMStateDescription * const []) {
3352         &vmstate_virtio_net_rss,
3353         NULL
3354     }
3355 };
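
/*
 * Note that version_id and minimum_version_id are both pinned to
 * VIRTIO_NET_VM_VERSION, so there is no in-vmsd cross-version handling;
 * compatibility is managed through the conditional fields and the
 * subsections instead.
 */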
3356 
3357 static NetClientInfo net_virtio_info = {
3358     .type = NET_CLIENT_DRIVER_NIC,
3359     .size = sizeof(NICState),
3360     .can_receive = virtio_net_can_receive,
3361     .receive = virtio_net_receive,
3362     .link_status_changed = virtio_net_set_link_status,
3363     .query_rx_filter = virtio_net_query_rxfilter,
3364     .announce = virtio_net_announce,
3365 };
3366 
3367 static bool virtio_net_guest_notifier_pending(VirtIODevice *vdev, int idx)
3368 {
3369     VirtIONet *n = VIRTIO_NET(vdev);
3370     NetClientState *nc;
3371     assert(n->vhost_started);
3372     if (!n->multiqueue && idx == 2) {
        /* Must guard against a malicious guest setting invalid features
         * or a bogus queue index, and against either slipping in through
         * a buggy migration stream.
         */
3377         if (!virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ)) {
3378             qemu_log_mask(LOG_GUEST_ERROR,
3379                           "%s: bogus vq index ignored\n", __func__);
3380             return false;
3381         }
3382         nc = qemu_get_subqueue(n->nic, n->max_queue_pairs);
3383     } else {
3384         nc = qemu_get_subqueue(n->nic, vq2q(idx));
3385     }
    /*
     * Check for the config interrupt: VIRTIO_CONFIG_IRQ_IDX (-1) is the
     * index reserved for the config interrupt. If the backend does not
     * support it, vhost_net_config_pending() returns false.
     */
3391 
3392     if (idx == VIRTIO_CONFIG_IRQ_IDX) {
3393         return vhost_net_config_pending(get_vhost_net(nc->peer));
3394     }
3395     return vhost_net_virtqueue_pending(get_vhost_net(nc->peer), idx);
3396 }
3397 
3398 static void virtio_net_guest_notifier_mask(VirtIODevice *vdev, int idx,
3399                                            bool mask)
3400 {
3401     VirtIONet *n = VIRTIO_NET(vdev);
3402     NetClientState *nc;
3403     assert(n->vhost_started);
3404     if (!n->multiqueue && idx == 2) {
        /* Must guard against a malicious guest setting invalid features
         * or a bogus queue index, and against either slipping in through
         * a buggy migration stream.
         */
3409         if (!virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ)) {
3410             qemu_log_mask(LOG_GUEST_ERROR,
3411                           "%s: bogus vq index ignored\n", __func__);
3412             return;
3413         }
3414         nc = qemu_get_subqueue(n->nic, n->max_queue_pairs);
3415     } else {
3416         nc = qemu_get_subqueue(n->nic, vq2q(idx));
3417     }
    /*
     * Check for the config interrupt: VIRTIO_CONFIG_IRQ_IDX (-1) is the
     * index reserved for the config interrupt. If it is selected, mask or
     * unmask it via the vhost backend and return.
     */
3423 
3424     if (idx == VIRTIO_CONFIG_IRQ_IDX) {
3425         vhost_net_config_mask(get_vhost_net(nc->peer), vdev, mask);
3426         return;
3427     }
3428     vhost_net_virtqueue_mask(get_vhost_net(nc->peer), vdev, idx, mask);
3429 }
3430 
3431 static void virtio_net_set_config_size(VirtIONet *n, uint64_t host_features)
3432 {
3433     virtio_add_feature(&host_features, VIRTIO_NET_F_MAC);
3434 
3435     n->config_size = virtio_get_config_size(&cfg_size_params, host_features);
3436 }
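
/*
 * The size computed above depends on the feature bits: as an
 * illustration, negotiating VIRTIO_NET_F_MQ means the config space must
 * extend at least through the max_virtqueue_pairs field of struct
 * virtio_net_config.
 */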
3437 
3438 void virtio_net_set_netclient_name(VirtIONet *n, const char *name,
3439                                    const char *type)
3440 {
    /*
     * The name may be NULL; in that case the netclient name defaults
     * to type.x.
     */
3444     assert(type != NULL);
3445 
3446     g_free(n->netclient_name);
3447     g_free(n->netclient_type);
3448     n->netclient_name = g_strdup(name);
3449     n->netclient_type = g_strdup(type);
3450 }
3451 
3452 static bool failover_unplug_primary(VirtIONet *n, DeviceState *dev)
3453 {
3454     HotplugHandler *hotplug_ctrl;
3455     PCIDevice *pci_dev;
3456     Error *err = NULL;
3457 
3458     hotplug_ctrl = qdev_get_hotplug_handler(dev);
3459     if (hotplug_ctrl) {
3460         pci_dev = PCI_DEVICE(dev);
3461         pci_dev->partially_hotplugged = true;
3462         hotplug_handler_unplug_request(hotplug_ctrl, dev, &err);
3463         if (err) {
3464             error_report_err(err);
3465             return false;
3466         }
3467     } else {
3468         return false;
3469     }
3470     return true;
3471 }
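
/*
 * Setting partially_hotplugged asks the PCI unplug path to detach the
 * device from the guest without destroying the QEMU-side object, so
 * failover_replug_primary() below can reattach it if the migration
 * fails or is cancelled.
 */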
3472 
3473 static bool failover_replug_primary(VirtIONet *n, DeviceState *dev,
3474                                     Error **errp)
3475 {
3476     Error *err = NULL;
3477     HotplugHandler *hotplug_ctrl;
3478     PCIDevice *pdev = PCI_DEVICE(dev);
3479     BusState *primary_bus;
3480 
3481     if (!pdev->partially_hotplugged) {
3482         return true;
3483     }
3484     primary_bus = dev->parent_bus;
3485     if (!primary_bus) {
3486         error_setg(errp, "virtio_net: couldn't find primary bus");
3487         return false;
3488     }
3489     qdev_set_parent_bus(dev, primary_bus, &error_abort);
3490     qatomic_set(&n->failover_primary_hidden, false);
3491     hotplug_ctrl = qdev_get_hotplug_handler(dev);
3492     if (hotplug_ctrl) {
3493         hotplug_handler_pre_plug(hotplug_ctrl, dev, &err);
3494         if (err) {
3495             goto out;
3496         }
3497         hotplug_handler_plug(hotplug_ctrl, dev, &err);
3498     }
3499     pdev->partially_hotplugged = false;
3500 
3501 out:
3502     error_propagate(errp, err);
3503     return !err;
3504 }
3505 
3506 static void virtio_net_handle_migration_primary(VirtIONet *n, MigrationEvent *e)
3507 {
3508     bool should_be_hidden;
3509     Error *err = NULL;
3510     DeviceState *dev = failover_find_primary_device(n);
3511 
3512     if (!dev) {
3513         return;
3514     }
3515 
3516     should_be_hidden = qatomic_read(&n->failover_primary_hidden);
3517 
3518     if (e->type == MIG_EVENT_PRECOPY_SETUP && !should_be_hidden) {
3519         if (failover_unplug_primary(n, dev)) {
3520             vmstate_unregister(VMSTATE_IF(dev), qdev_get_vmsd(dev), dev);
3521             qapi_event_send_unplug_primary(dev->id);
3522             qatomic_set(&n->failover_primary_hidden, true);
3523         } else {
3524             warn_report("couldn't unplug primary device");
3525         }
3526     } else if (e->type == MIG_EVENT_PRECOPY_FAILED) {
        /* We already unplugged the device; let's plug it back. */
3528         if (!failover_replug_primary(n, dev, &err)) {
3529             if (err) {
3530                 error_report_err(err);
3531             }
3532         }
3533     }
3534 }
3535 
3536 static int virtio_net_migration_state_notifier(NotifierWithReturn *notifier,
3537                                                MigrationEvent *e, Error **errp)
3538 {
3539     VirtIONet *n = container_of(notifier, VirtIONet, migration_state);
3540     virtio_net_handle_migration_primary(n, e);
3541     return 0;
3542 }
3543 
3544 static bool failover_hide_primary_device(DeviceListener *listener,
3545                                          const QDict *device_opts,
3546                                          bool from_json,
3547                                          Error **errp)
3548 {
3549     VirtIONet *n = container_of(listener, VirtIONet, primary_listener);
3550     const char *standby_id;
3551 
3552     if (!device_opts) {
3553         return false;
3554     }
3555 
3556     if (!qdict_haskey(device_opts, "failover_pair_id")) {
3557         return false;
3558     }
3559 
3560     if (!qdict_haskey(device_opts, "id")) {
3561         error_setg(errp, "Device with failover_pair_id needs to have id");
3562         return false;
3563     }
3564 
3565     standby_id = qdict_get_str(device_opts, "failover_pair_id");
3566     if (g_strcmp0(standby_id, n->netclient_name) != 0) {
3567         return false;
3568     }
3569 
    /*
     * The hide helper can be called several times for a given device.
     * Check that there is only one primary per virtio-net device, but
     * don't clone the qdict again when it's called repeatedly for the
     * same device.
     */
3576     if (n->primary_opts) {
3577         const char *old, *new;
3578         /* devices with failover_pair_id always have an id */
3579         old = qdict_get_str(n->primary_opts, "id");
3580         new = qdict_get_str(device_opts, "id");
3581         if (strcmp(old, new) != 0) {
3582             error_setg(errp, "Cannot attach more than one primary device to "
3583                        "'%s': '%s' and '%s'", n->netclient_name, old, new);
3584             return false;
3585         }
3586     } else {
3587         n->primary_opts = qdict_clone_shallow(device_opts);
3588         n->primary_opts_from_json = from_json;
3589     }
3590 
3591     /* failover_primary_hidden is set during feature negotiation */
3592     return qatomic_read(&n->failover_primary_hidden);
3593 }
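
/*
 * For reference, a typical failover pairing looks like this on the
 * command line (illustrative values):
 *
 *   -device virtio-net-pci,netdev=hostnet0,id=net0,failover=on
 *   -device vfio-pci,host=5e:00.0,id=hostdev0,failover_pair_id=net0
 *
 * The primary stays hidden until the standby virtio-net device
 * negotiates VIRTIO_NET_F_STANDBY.
 */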
3594 
3595 static void virtio_net_device_realize(DeviceState *dev, Error **errp)
3596 {
3597     VirtIODevice *vdev = VIRTIO_DEVICE(dev);
3598     VirtIONet *n = VIRTIO_NET(dev);
3599     NetClientState *nc;
3600     int i;
3601 
3602     if (n->net_conf.mtu) {
3603         n->host_features |= (1ULL << VIRTIO_NET_F_MTU);
3604     }
3605 
3606     if (n->net_conf.duplex_str) {
3607         if (strncmp(n->net_conf.duplex_str, "half", 5) == 0) {
3608             n->net_conf.duplex = DUPLEX_HALF;
3609         } else if (strncmp(n->net_conf.duplex_str, "full", 5) == 0) {
3610             n->net_conf.duplex = DUPLEX_FULL;
3611         } else {
3612             error_setg(errp, "'duplex' must be 'half' or 'full'");
3613             return;
3614         }
3615         n->host_features |= (1ULL << VIRTIO_NET_F_SPEED_DUPLEX);
3616     } else {
3617         n->net_conf.duplex = DUPLEX_UNKNOWN;
3618     }
3619 
3620     if (n->net_conf.speed < SPEED_UNKNOWN) {
3621         error_setg(errp, "'speed' must be between 0 and INT_MAX");
3622         return;
3623     }
3624     if (n->net_conf.speed >= 0) {
3625         n->host_features |= (1ULL << VIRTIO_NET_F_SPEED_DUPLEX);
3626     }
3627 
3628     if (n->failover) {
3629         n->primary_listener.hide_device = failover_hide_primary_device;
3630         qatomic_set(&n->failover_primary_hidden, true);
3631         device_listener_register(&n->primary_listener);
3632         migration_add_notifier(&n->migration_state,
3633                                virtio_net_migration_state_notifier);
3634         n->host_features |= (1ULL << VIRTIO_NET_F_STANDBY);
3635     }
3636 
3637     virtio_net_set_config_size(n, n->host_features);
3638     virtio_init(vdev, VIRTIO_ID_NET, n->config_size);
3639 
    /*
     * We set the lower limit on the RX queue size to its historical
     * default. Guests that want a smaller ring can always resize it
     * themselves (using virtio 1 and up) without help from us.
     */
3645     if (n->net_conf.rx_queue_size < VIRTIO_NET_RX_QUEUE_MIN_SIZE ||
3646         n->net_conf.rx_queue_size > VIRTQUEUE_MAX_SIZE ||
3647         !is_power_of_2(n->net_conf.rx_queue_size)) {
3648         error_setg(errp, "Invalid rx_queue_size (= %" PRIu16 "), "
3649                    "must be a power of 2 between %d and %d.",
3650                    n->net_conf.rx_queue_size, VIRTIO_NET_RX_QUEUE_MIN_SIZE,
3651                    VIRTQUEUE_MAX_SIZE);
3652         virtio_cleanup(vdev);
3653         return;
3654     }
3655 
3656     if (n->net_conf.tx_queue_size < VIRTIO_NET_TX_QUEUE_MIN_SIZE ||
3657         n->net_conf.tx_queue_size > virtio_net_max_tx_queue_size(n) ||
3658         !is_power_of_2(n->net_conf.tx_queue_size)) {
3659         error_setg(errp, "Invalid tx_queue_size (= %" PRIu16 "), "
3660                    "must be a power of 2 between %d and %d",
3661                    n->net_conf.tx_queue_size, VIRTIO_NET_TX_QUEUE_MIN_SIZE,
3662                    virtio_net_max_tx_queue_size(n));
3663         virtio_cleanup(vdev);
3664         return;
3665     }
3666 
3667     n->max_ncs = MAX(n->nic_conf.peers.queues, 1);
3668 
    /*
     * Figure out the number of datapath queue pairs, since the backend
     * could provide a control queue via the peers as well.
     */
3673     if (n->nic_conf.peers.queues) {
3674         for (i = 0; i < n->max_ncs; i++) {
3675             if (n->nic_conf.peers.ncs[i]->is_datapath) {
3676                 ++n->max_queue_pairs;
3677             }
3678         }
3679     }
3680     n->max_queue_pairs = MAX(n->max_queue_pairs, 1);
3681 
3682     if (n->max_queue_pairs * 2 + 1 > VIRTIO_QUEUE_MAX) {
3683         error_setg(errp, "Invalid number of queue pairs (= %" PRIu32 "), "
3684                    "must be a positive integer less than %d.",
3685                    n->max_queue_pairs, (VIRTIO_QUEUE_MAX - 1) / 2);
3686         virtio_cleanup(vdev);
3687         return;
3688     }
3689     n->vqs = g_new0(VirtIONetQueue, n->max_queue_pairs);
3690     n->curr_queue_pairs = 1;
3691     n->tx_timeout = n->net_conf.txtimer;
3692 
3693     if (n->net_conf.tx && strcmp(n->net_conf.tx, "timer")
3694                        && strcmp(n->net_conf.tx, "bh")) {
3695         warn_report("virtio-net: "
3696                     "Unknown option tx=%s, valid options: \"timer\" \"bh\"",
3697                     n->net_conf.tx);
        error_printf("Defaulting to \"bh\"\n");
3699     }
3700 
3701     n->net_conf.tx_queue_size = MIN(virtio_net_max_tx_queue_size(n),
3702                                     n->net_conf.tx_queue_size);
3703 
3704     virtio_net_add_queue(n, 0);
3705 
3706     n->ctrl_vq = virtio_add_queue(vdev, 64, virtio_net_handle_ctrl);
3707     qemu_macaddr_default_if_unset(&n->nic_conf.macaddr);
3708     memcpy(&n->mac[0], &n->nic_conf.macaddr, sizeof(n->mac));
3709     n->status = VIRTIO_NET_S_LINK_UP;
3710     qemu_announce_timer_reset(&n->announce_timer, migrate_announce_params(),
3711                               QEMU_CLOCK_VIRTUAL,
3712                               virtio_net_announce_timer, n);
3713     n->announce_timer.round = 0;
3714 
3715     if (n->netclient_type) {
        /*
         * This happens when virtio_net_set_netclient_name() has been
         * called.
         */
3719         n->nic = qemu_new_nic(&net_virtio_info, &n->nic_conf,
3720                               n->netclient_type, n->netclient_name,
3721                               &dev->mem_reentrancy_guard, n);
3722     } else {
3723         n->nic = qemu_new_nic(&net_virtio_info, &n->nic_conf,
3724                               object_get_typename(OBJECT(dev)), dev->id,
3725                               &dev->mem_reentrancy_guard, n);
3726     }
3727 
3728     for (i = 0; i < n->max_queue_pairs; i++) {
3729         n->nic->ncs[i].do_not_pad = true;
3730     }
3731 
3732     peer_test_vnet_hdr(n);
3733     if (peer_has_vnet_hdr(n)) {
3734         n->host_hdr_len = sizeof(struct virtio_net_hdr);
3735     } else {
3736         n->host_hdr_len = 0;
3737     }
3738 
3739     qemu_format_nic_info_str(qemu_get_queue(n->nic), n->nic_conf.macaddr.a);
3740 
3741     n->vqs[0].tx_waiting = 0;
3742     n->tx_burst = n->net_conf.txburst;
3743     virtio_net_set_mrg_rx_bufs(n, 0, 0, 0);
3744     n->promisc = 1; /* for compatibility */
3745 
3746     n->mac_table.macs = g_malloc0(MAC_TABLE_ENTRIES * ETH_ALEN);
3747 
3748     n->vlans = g_malloc0(MAX_VLAN >> 3);
3749 
3750     nc = qemu_get_queue(n->nic);
3751     nc->rxfilter_notify_enabled = 1;
3752 
    if (nc->peer && nc->peer->info->type == NET_CLIENT_DRIVER_VHOST_VDPA) {
3754         struct virtio_net_config netcfg = {};
3755         memcpy(&netcfg.mac, &n->nic_conf.macaddr, ETH_ALEN);
3756         vhost_net_set_config(get_vhost_net(nc->peer),
3757             (uint8_t *)&netcfg, 0, ETH_ALEN, VHOST_SET_CONFIG_TYPE_FRONTEND);
3758     }
3759     QTAILQ_INIT(&n->rsc_chains);
3760     n->qdev = dev;
3761 
3762     net_rx_pkt_init(&n->rx_pkt);
3763 
3764     if (virtio_has_feature(n->host_features, VIRTIO_NET_F_RSS)) {
3765         Error *err = NULL;
3766         if (!virtio_net_load_ebpf(n, &err)) {
3767             /*
3768              * If user explicitly gave QEMU RSS FDs to use, then
3769              * failing to use them must be considered a fatal
3770              * error. If no RSS FDs were provided, QEMU is trying
3771              * eBPF on a "best effort" basis only, so report a
3772              * warning and allow fallback to software RSS.
3773              */
3774             if (n->ebpf_rss_fds) {
3775                 error_propagate(errp, err);
3776             } else {
3777                 warn_report("unable to load eBPF RSS: %s",
3778                             error_get_pretty(err));
3779                 error_free(err);
3780             }
3781         }
3782     }
3783 }
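
/*
 * Example exercising the sizing logic above (illustrative values):
 *
 *   -netdev tap,id=tap0,queues=4,vhost=on
 *   -device virtio-net-pci,netdev=tap0,mq=on,rx_queue_size=1024
 *
 * Four peer datapath queues yield max_queue_pairs == 4, i.e. eight
 * datapath virtqueues plus the control vq, nine in total.
 */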
3784 
3785 static void virtio_net_device_unrealize(DeviceState *dev)
3786 {
3787     VirtIODevice *vdev = VIRTIO_DEVICE(dev);
3788     VirtIONet *n = VIRTIO_NET(dev);
3789     int i, max_queue_pairs;
3790 
3791     if (virtio_has_feature(n->host_features, VIRTIO_NET_F_RSS)) {
3792         virtio_net_unload_ebpf(n);
3793     }
3794 
3795     /* This will stop vhost backend if appropriate. */
3796     virtio_net_set_status(vdev, 0);
3797 
3798     g_free(n->netclient_name);
3799     n->netclient_name = NULL;
3800     g_free(n->netclient_type);
3801     n->netclient_type = NULL;
3802 
3803     g_free(n->mac_table.macs);
3804     g_free(n->vlans);
3805 
3806     if (n->failover) {
3807         qobject_unref(n->primary_opts);
3808         device_listener_unregister(&n->primary_listener);
3809         migration_remove_notifier(&n->migration_state);
3810     } else {
3811         assert(n->primary_opts == NULL);
3812     }
3813 
3814     max_queue_pairs = n->multiqueue ? n->max_queue_pairs : 1;
3815     for (i = 0; i < max_queue_pairs; i++) {
3816         virtio_net_del_queue(n, i);
3817     }
    /* also delete the control vq */
3819     virtio_del_queue(vdev, max_queue_pairs * 2);
3820     qemu_announce_timer_del(&n->announce_timer, false);
3821     g_free(n->vqs);
3822     qemu_del_nic(n->nic);
3823     virtio_net_rsc_cleanup(n);
3824     g_free(n->rss_data.indirections_table);
3825     net_rx_pkt_uninit(n->rx_pkt);
3826     virtio_cleanup(vdev);
3827 }
3828 
3829 static void virtio_net_reset(VirtIODevice *vdev)
3830 {
3831     VirtIONet *n = VIRTIO_NET(vdev);
3832     int i;
3833 
3834     /* Reset back to compatibility mode */
3835     n->promisc = 1;
3836     n->allmulti = 0;
3837     n->alluni = 0;
3838     n->nomulti = 0;
3839     n->nouni = 0;
3840     n->nobcast = 0;
3841     /* multiqueue is disabled by default */
3842     n->curr_queue_pairs = 1;
3843     timer_del(n->announce_timer.tm);
3844     n->announce_timer.round = 0;
3845     n->status &= ~VIRTIO_NET_S_ANNOUNCE;
3846 
3847     /* Flush any MAC and VLAN filter table state */
3848     n->mac_table.in_use = 0;
3849     n->mac_table.first_multi = 0;
3850     n->mac_table.multi_overflow = 0;
3851     n->mac_table.uni_overflow = 0;
3852     memset(n->mac_table.macs, 0, MAC_TABLE_ENTRIES * ETH_ALEN);
3853     memcpy(&n->mac[0], &n->nic->conf->macaddr, sizeof(n->mac));
3854     qemu_format_nic_info_str(qemu_get_queue(n->nic), n->mac);
3855     memset(n->vlans, 0, MAX_VLAN >> 3);
3856 
3857     /* Flush any async TX */
    for (i = 0; i < n->max_queue_pairs; i++) {
3859         flush_or_purge_queued_packets(qemu_get_subqueue(n->nic, i));
3860     }
3861 
3862     virtio_net_disable_rss(n);
3863 }
3864 
3865 static void virtio_net_instance_init(Object *obj)
3866 {
3867     VirtIONet *n = VIRTIO_NET(obj);
3868 
3869     /*
3870      * The default config_size is sizeof(struct virtio_net_config).
3871      * Can be overridden with virtio_net_set_config_size.
3872      */
3873     n->config_size = sizeof(struct virtio_net_config);
3874     device_add_bootindex_property(obj, &n->nic_conf.bootindex,
3875                                   "bootindex", "/ethernet-phy@0",
3876                                   DEVICE(n));
3877 
3878     ebpf_rss_init(&n->ebpf_rss);
3879 }
3880 
3881 static int virtio_net_pre_save(void *opaque)
3882 {
3883     VirtIONet *n = opaque;
3884 
    /* At this point the backend must be stopped; otherwise it might
     * keep writing to memory. */
3887     assert(!n->vhost_started);
3888 
3889     return 0;
3890 }
3891 
3892 static bool primary_unplug_pending(void *opaque)
3893 {
3894     DeviceState *dev = opaque;
3895     DeviceState *primary;
3896     VirtIODevice *vdev = VIRTIO_DEVICE(dev);
3897     VirtIONet *n = VIRTIO_NET(vdev);
3898 
3899     if (!virtio_vdev_has_feature(vdev, VIRTIO_NET_F_STANDBY)) {
3900         return false;
3901     }
3902     primary = failover_find_primary_device(n);
3903     return primary ? primary->pending_deleted_event : false;
3904 }
3905 
3906 static bool dev_unplug_pending(void *opaque)
3907 {
3908     DeviceState *dev = opaque;
3909     VirtioDeviceClass *vdc = VIRTIO_DEVICE_GET_CLASS(dev);
3910 
3911     return vdc->primary_unplug_pending(dev);
3912 }
3913 
3914 static struct vhost_dev *virtio_net_get_vhost(VirtIODevice *vdev)
3915 {
3916     VirtIONet *n = VIRTIO_NET(vdev);
3917     NetClientState *nc;
3918     struct vhost_net *net;
3919 
3920     if (!n->nic) {
3921         return NULL;
3922     }
3923 
3924     nc = qemu_get_queue(n->nic);
3925     if (!nc) {
3926         return NULL;
3927     }
3928 
3929     net = get_vhost_net(nc->peer);
3930     if (!net) {
3931         return NULL;
3932     }
3933 
3934     return &net->dev;
3935 }
3936 
3937 static const VMStateDescription vmstate_virtio_net = {
3938     .name = "virtio-net",
3939     .minimum_version_id = VIRTIO_NET_VM_VERSION,
3940     .version_id = VIRTIO_NET_VM_VERSION,
3941     .fields = (const VMStateField[]) {
3942         VMSTATE_VIRTIO_DEVICE,
3943         VMSTATE_END_OF_LIST()
3944     },
3945     .pre_save = virtio_net_pre_save,
3946     .dev_unplug_pending = dev_unplug_pending,
3947 };
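
/*
 * Two vmsds cooperate here: dc->vmsd (this one) drives the common
 * virtio save/load machinery via VMSTATE_VIRTIO_DEVICE, which in turn
 * invokes vdc->vmsd (vmstate_virtio_net_device above) for the
 * net-specific state.
 */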
3948 
3949 static Property virtio_net_properties[] = {
3950     DEFINE_PROP_BIT64("csum", VirtIONet, host_features,
3951                     VIRTIO_NET_F_CSUM, true),
3952     DEFINE_PROP_BIT64("guest_csum", VirtIONet, host_features,
3953                     VIRTIO_NET_F_GUEST_CSUM, true),
3954     DEFINE_PROP_BIT64("gso", VirtIONet, host_features, VIRTIO_NET_F_GSO, true),
3955     DEFINE_PROP_BIT64("guest_tso4", VirtIONet, host_features,
3956                     VIRTIO_NET_F_GUEST_TSO4, true),
3957     DEFINE_PROP_BIT64("guest_tso6", VirtIONet, host_features,
3958                     VIRTIO_NET_F_GUEST_TSO6, true),
3959     DEFINE_PROP_BIT64("guest_ecn", VirtIONet, host_features,
3960                     VIRTIO_NET_F_GUEST_ECN, true),
3961     DEFINE_PROP_BIT64("guest_ufo", VirtIONet, host_features,
3962                     VIRTIO_NET_F_GUEST_UFO, true),
3963     DEFINE_PROP_BIT64("guest_announce", VirtIONet, host_features,
3964                     VIRTIO_NET_F_GUEST_ANNOUNCE, true),
3965     DEFINE_PROP_BIT64("host_tso4", VirtIONet, host_features,
3966                     VIRTIO_NET_F_HOST_TSO4, true),
3967     DEFINE_PROP_BIT64("host_tso6", VirtIONet, host_features,
3968                     VIRTIO_NET_F_HOST_TSO6, true),
3969     DEFINE_PROP_BIT64("host_ecn", VirtIONet, host_features,
3970                     VIRTIO_NET_F_HOST_ECN, true),
3971     DEFINE_PROP_BIT64("host_ufo", VirtIONet, host_features,
3972                     VIRTIO_NET_F_HOST_UFO, true),
3973     DEFINE_PROP_BIT64("mrg_rxbuf", VirtIONet, host_features,
3974                     VIRTIO_NET_F_MRG_RXBUF, true),
3975     DEFINE_PROP_BIT64("status", VirtIONet, host_features,
3976                     VIRTIO_NET_F_STATUS, true),
3977     DEFINE_PROP_BIT64("ctrl_vq", VirtIONet, host_features,
3978                     VIRTIO_NET_F_CTRL_VQ, true),
3979     DEFINE_PROP_BIT64("ctrl_rx", VirtIONet, host_features,
3980                     VIRTIO_NET_F_CTRL_RX, true),
3981     DEFINE_PROP_BIT64("ctrl_vlan", VirtIONet, host_features,
3982                     VIRTIO_NET_F_CTRL_VLAN, true),
3983     DEFINE_PROP_BIT64("ctrl_rx_extra", VirtIONet, host_features,
3984                     VIRTIO_NET_F_CTRL_RX_EXTRA, true),
3985     DEFINE_PROP_BIT64("ctrl_mac_addr", VirtIONet, host_features,
3986                     VIRTIO_NET_F_CTRL_MAC_ADDR, true),
3987     DEFINE_PROP_BIT64("ctrl_guest_offloads", VirtIONet, host_features,
3988                     VIRTIO_NET_F_CTRL_GUEST_OFFLOADS, true),
3989     DEFINE_PROP_BIT64("mq", VirtIONet, host_features, VIRTIO_NET_F_MQ, false),
3990     DEFINE_PROP_BIT64("rss", VirtIONet, host_features,
3991                     VIRTIO_NET_F_RSS, false),
3992     DEFINE_PROP_BIT64("hash", VirtIONet, host_features,
3993                     VIRTIO_NET_F_HASH_REPORT, false),
3994     DEFINE_PROP_ARRAY("ebpf-rss-fds", VirtIONet, nr_ebpf_rss_fds,
3995                       ebpf_rss_fds, qdev_prop_string, char*),
3996     DEFINE_PROP_BIT64("guest_rsc_ext", VirtIONet, host_features,
3997                     VIRTIO_NET_F_RSC_EXT, false),
3998     DEFINE_PROP_UINT32("rsc_interval", VirtIONet, rsc_timeout,
3999                        VIRTIO_NET_RSC_DEFAULT_INTERVAL),
4000     DEFINE_NIC_PROPERTIES(VirtIONet, nic_conf),
4001     DEFINE_PROP_UINT32("x-txtimer", VirtIONet, net_conf.txtimer,
4002                        TX_TIMER_INTERVAL),
4003     DEFINE_PROP_INT32("x-txburst", VirtIONet, net_conf.txburst, TX_BURST),
4004     DEFINE_PROP_STRING("tx", VirtIONet, net_conf.tx),
4005     DEFINE_PROP_UINT16("rx_queue_size", VirtIONet, net_conf.rx_queue_size,
4006                        VIRTIO_NET_RX_QUEUE_DEFAULT_SIZE),
4007     DEFINE_PROP_UINT16("tx_queue_size", VirtIONet, net_conf.tx_queue_size,
4008                        VIRTIO_NET_TX_QUEUE_DEFAULT_SIZE),
4009     DEFINE_PROP_UINT16("host_mtu", VirtIONet, net_conf.mtu, 0),
4010     DEFINE_PROP_BOOL("x-mtu-bypass-backend", VirtIONet, mtu_bypass_backend,
4011                      true),
4012     DEFINE_PROP_INT32("speed", VirtIONet, net_conf.speed, SPEED_UNKNOWN),
4013     DEFINE_PROP_STRING("duplex", VirtIONet, net_conf.duplex_str),
4014     DEFINE_PROP_BOOL("failover", VirtIONet, failover, false),
4015     DEFINE_PROP_BIT64("guest_uso4", VirtIONet, host_features,
4016                       VIRTIO_NET_F_GUEST_USO4, true),
4017     DEFINE_PROP_BIT64("guest_uso6", VirtIONet, host_features,
4018                       VIRTIO_NET_F_GUEST_USO6, true),
4019     DEFINE_PROP_BIT64("host_uso", VirtIONet, host_features,
4020                       VIRTIO_NET_F_HOST_USO, true),
4021     DEFINE_PROP_END_OF_LIST(),
4022 };
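
/*
 * Most of these properties map directly to virtio feature bits and can
 * be toggled per device, e.g. (illustrative):
 *
 *   -device virtio-net-pci,netdev=n0,mq=on,rss=on,hash=on
 *
 * enables multiqueue and RSS with hash reporting, subject to guest
 * feature negotiation.
 */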
4023 
4024 static void virtio_net_class_init(ObjectClass *klass, void *data)
4025 {
4026     DeviceClass *dc = DEVICE_CLASS(klass);
4027     VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass);
4028 
4029     device_class_set_props(dc, virtio_net_properties);
4030     dc->vmsd = &vmstate_virtio_net;
4031     set_bit(DEVICE_CATEGORY_NETWORK, dc->categories);
4032     vdc->realize = virtio_net_device_realize;
4033     vdc->unrealize = virtio_net_device_unrealize;
4034     vdc->get_config = virtio_net_get_config;
4035     vdc->set_config = virtio_net_set_config;
4036     vdc->get_features = virtio_net_get_features;
4037     vdc->set_features = virtio_net_set_features;
4038     vdc->bad_features = virtio_net_bad_features;
4039     vdc->reset = virtio_net_reset;
4040     vdc->queue_reset = virtio_net_queue_reset;
4041     vdc->queue_enable = virtio_net_queue_enable;
4042     vdc->set_status = virtio_net_set_status;
4043     vdc->guest_notifier_mask = virtio_net_guest_notifier_mask;
4044     vdc->guest_notifier_pending = virtio_net_guest_notifier_pending;
4045     vdc->legacy_features |= (0x1 << VIRTIO_NET_F_GSO);
4046     vdc->post_load = virtio_net_post_load_virtio;
4047     vdc->vmsd = &vmstate_virtio_net_device;
4048     vdc->primary_unplug_pending = primary_unplug_pending;
4049     vdc->get_vhost = virtio_net_get_vhost;
4050     vdc->toggle_device_iotlb = vhost_toggle_device_iotlb;
4051 }
4052 
4053 static const TypeInfo virtio_net_info = {
4054     .name = TYPE_VIRTIO_NET,
4055     .parent = TYPE_VIRTIO_DEVICE,
4056     .instance_size = sizeof(VirtIONet),
4057     .instance_init = virtio_net_instance_init,
4058     .class_init = virtio_net_class_init,
4059 };
4060 
4061 static void virtio_register_types(void)
4062 {
4063     type_register_static(&virtio_net_info);
4064 }
4065 
4066 type_init(virtio_register_types)
4067