xref: /openbmc/qemu/net/vhost-vdpa.c (revision 07eba949)
11e0a84eaSCindy Lu /*
21e0a84eaSCindy Lu  * vhost-vdpa.c
31e0a84eaSCindy Lu  *
41e0a84eaSCindy Lu  * Copyright(c) 2017-2018 Intel Corporation.
51e0a84eaSCindy Lu  * Copyright(c) 2020 Red Hat, Inc.
61e0a84eaSCindy Lu  *
71e0a84eaSCindy Lu  * This work is licensed under the terms of the GNU GPL, version 2 or later.
81e0a84eaSCindy Lu  * See the COPYING file in the top-level directory.
91e0a84eaSCindy Lu  *
101e0a84eaSCindy Lu  */
111e0a84eaSCindy Lu 
121e0a84eaSCindy Lu #include "qemu/osdep.h"
131e0a84eaSCindy Lu #include "clients.h"
14bd907ae4SEugenio Pérez #include "hw/virtio/virtio-net.h"
151e0a84eaSCindy Lu #include "net/vhost_net.h"
161e0a84eaSCindy Lu #include "net/vhost-vdpa.h"
171e0a84eaSCindy Lu #include "hw/virtio/vhost-vdpa.h"
181e0a84eaSCindy Lu #include "qemu/config-file.h"
191e0a84eaSCindy Lu #include "qemu/error-report.h"
20bd907ae4SEugenio Pérez #include "qemu/log.h"
21bd907ae4SEugenio Pérez #include "qemu/memalign.h"
221e0a84eaSCindy Lu #include "qemu/option.h"
231e0a84eaSCindy Lu #include "qapi/error.h"
2440237840SJason Wang #include <linux/vhost.h>
251e0a84eaSCindy Lu #include <sys/ioctl.h>
261e0a84eaSCindy Lu #include <err.h>
271e0a84eaSCindy Lu #include "standard-headers/linux/virtio_net.h"
281e0a84eaSCindy Lu #include "monitor/monitor.h"
2969498430SEugenio Pérez #include "migration/migration.h"
3069498430SEugenio Pérez #include "migration/misc.h"
311e0a84eaSCindy Lu #include "hw/virtio/vhost.h"
321e0a84eaSCindy Lu 
331e0a84eaSCindy Lu /* Todo:need to add the multiqueue support here */
341e0a84eaSCindy Lu typedef struct VhostVDPAState {
351e0a84eaSCindy Lu     NetClientState nc;
361e0a84eaSCindy Lu     struct vhost_vdpa vhost_vdpa;
3769498430SEugenio Pérez     Notifier migration_state;
381e0a84eaSCindy Lu     VHostNetState *vhost_net;
392df4dd31SEugenio Pérez 
402df4dd31SEugenio Pérez     /* Control commands shadow buffers */
4117fb889fSEugenio Pérez     void *cvq_cmd_out_buffer;
4217fb889fSEugenio Pérez     virtio_net_ctrl_ack *status;
4317fb889fSEugenio Pérez 
447f211a28SEugenio Pérez     /* The device always have SVQ enabled */
457f211a28SEugenio Pérez     bool always_svq;
46152128d6SEugenio Pérez 
47152128d6SEugenio Pérez     /* The device can isolate CVQ in its own ASID */
48152128d6SEugenio Pérez     bool cvq_isolated;
49152128d6SEugenio Pérez 
501e0a84eaSCindy Lu     bool started;
511e0a84eaSCindy Lu } VhostVDPAState;
521e0a84eaSCindy Lu 
532875a0caSHawkins Jiawei /*
542875a0caSHawkins Jiawei  * The array is sorted alphabetically in ascending order,
552875a0caSHawkins Jiawei  * with the exception of VHOST_INVALID_FEATURE_BIT,
562875a0caSHawkins Jiawei  * which should always be the last entry.
572875a0caSHawkins Jiawei  */
581e0a84eaSCindy Lu const int vdpa_feature_bits[] = {
591e0a84eaSCindy Lu     VIRTIO_F_ANY_LAYOUT,
602875a0caSHawkins Jiawei     VIRTIO_F_IOMMU_PLATFORM,
612875a0caSHawkins Jiawei     VIRTIO_F_NOTIFY_ON_EMPTY,
622875a0caSHawkins Jiawei     VIRTIO_F_RING_PACKED,
632875a0caSHawkins Jiawei     VIRTIO_F_RING_RESET,
641e0a84eaSCindy Lu     VIRTIO_F_VERSION_1,
651e0a84eaSCindy Lu     VIRTIO_NET_F_CSUM,
6651e84244SEugenio Pérez     VIRTIO_NET_F_CTRL_GUEST_OFFLOADS,
672875a0caSHawkins Jiawei     VIRTIO_NET_F_CTRL_MAC_ADDR,
6840237840SJason Wang     VIRTIO_NET_F_CTRL_RX,
6940237840SJason Wang     VIRTIO_NET_F_CTRL_RX_EXTRA,
7040237840SJason Wang     VIRTIO_NET_F_CTRL_VLAN,
7140237840SJason Wang     VIRTIO_NET_F_CTRL_VQ,
722875a0caSHawkins Jiawei     VIRTIO_NET_F_GSO,
732875a0caSHawkins Jiawei     VIRTIO_NET_F_GUEST_CSUM,
742875a0caSHawkins Jiawei     VIRTIO_NET_F_GUEST_ECN,
752875a0caSHawkins Jiawei     VIRTIO_NET_F_GUEST_TSO4,
762875a0caSHawkins Jiawei     VIRTIO_NET_F_GUEST_TSO6,
772875a0caSHawkins Jiawei     VIRTIO_NET_F_GUEST_UFO,
789da16849SAndrew Melnychenko     VIRTIO_NET_F_GUEST_USO4,
799da16849SAndrew Melnychenko     VIRTIO_NET_F_GUEST_USO6,
800145c393SAndrew Melnychenko     VIRTIO_NET_F_HASH_REPORT,
812875a0caSHawkins Jiawei     VIRTIO_NET_F_HOST_ECN,
822875a0caSHawkins Jiawei     VIRTIO_NET_F_HOST_TSO4,
832875a0caSHawkins Jiawei     VIRTIO_NET_F_HOST_TSO6,
842875a0caSHawkins Jiawei     VIRTIO_NET_F_HOST_UFO,
859da16849SAndrew Melnychenko     VIRTIO_NET_F_HOST_USO,
862875a0caSHawkins Jiawei     VIRTIO_NET_F_MQ,
872875a0caSHawkins Jiawei     VIRTIO_NET_F_MRG_RXBUF,
882875a0caSHawkins Jiawei     VIRTIO_NET_F_MTU,
892875a0caSHawkins Jiawei     VIRTIO_NET_F_RSS,
909aa47eddSSi-Wei Liu     VIRTIO_NET_F_STATUS,
912875a0caSHawkins Jiawei     VIRTIO_RING_F_EVENT_IDX,
922875a0caSHawkins Jiawei     VIRTIO_RING_F_INDIRECT_DESC,
932875a0caSHawkins Jiawei 
942875a0caSHawkins Jiawei     /* VHOST_INVALID_FEATURE_BIT should always be the last entry */
951e0a84eaSCindy Lu     VHOST_INVALID_FEATURE_BIT
961e0a84eaSCindy Lu };
971e0a84eaSCindy Lu 
981576dbb5SEugenio Pérez /** Supported device specific feature bits with SVQ */
991576dbb5SEugenio Pérez static const uint64_t vdpa_svq_device_features =
1001576dbb5SEugenio Pérez     BIT_ULL(VIRTIO_NET_F_CSUM) |
1011576dbb5SEugenio Pérez     BIT_ULL(VIRTIO_NET_F_GUEST_CSUM) |
1024b4a1378SHawkins Jiawei     BIT_ULL(VIRTIO_NET_F_CTRL_GUEST_OFFLOADS) |
1031576dbb5SEugenio Pérez     BIT_ULL(VIRTIO_NET_F_MTU) |
1041576dbb5SEugenio Pérez     BIT_ULL(VIRTIO_NET_F_MAC) |
1051576dbb5SEugenio Pérez     BIT_ULL(VIRTIO_NET_F_GUEST_TSO4) |
1061576dbb5SEugenio Pérez     BIT_ULL(VIRTIO_NET_F_GUEST_TSO6) |
1071576dbb5SEugenio Pérez     BIT_ULL(VIRTIO_NET_F_GUEST_ECN) |
1081576dbb5SEugenio Pérez     BIT_ULL(VIRTIO_NET_F_GUEST_UFO) |
1091576dbb5SEugenio Pérez     BIT_ULL(VIRTIO_NET_F_HOST_TSO4) |
1101576dbb5SEugenio Pérez     BIT_ULL(VIRTIO_NET_F_HOST_TSO6) |
1111576dbb5SEugenio Pérez     BIT_ULL(VIRTIO_NET_F_HOST_ECN) |
1121576dbb5SEugenio Pérez     BIT_ULL(VIRTIO_NET_F_HOST_UFO) |
1131576dbb5SEugenio Pérez     BIT_ULL(VIRTIO_NET_F_MRG_RXBUF) |
1141576dbb5SEugenio Pérez     BIT_ULL(VIRTIO_NET_F_STATUS) |
1151576dbb5SEugenio Pérez     BIT_ULL(VIRTIO_NET_F_CTRL_VQ) |
116ea6eec49SHawkins Jiawei     BIT_ULL(VIRTIO_NET_F_CTRL_RX) |
117e213c45aSHawkins Jiawei     BIT_ULL(VIRTIO_NET_F_CTRL_VLAN) |
118d669b7bbSHawkins Jiawei     BIT_ULL(VIRTIO_NET_F_CTRL_RX_EXTRA) |
11972b99a87SEugenio Pérez     BIT_ULL(VIRTIO_NET_F_MQ) |
1201576dbb5SEugenio Pérez     BIT_ULL(VIRTIO_F_ANY_LAYOUT) |
1211576dbb5SEugenio Pérez     BIT_ULL(VIRTIO_NET_F_CTRL_MAC_ADDR) |
122609ab4c3SEugenio Pérez     /* VHOST_F_LOG_ALL is exposed by SVQ */
123609ab4c3SEugenio Pérez     BIT_ULL(VHOST_F_LOG_ALL) |
124556b67d4SHawkins Jiawei     BIT_ULL(VIRTIO_NET_F_HASH_REPORT) |
125*07eba949SHawkins Jiawei     BIT_ULL(VIRTIO_NET_F_RSS) |
1261576dbb5SEugenio Pérez     BIT_ULL(VIRTIO_NET_F_RSC_EXT) |
1270d74e2b7SEugenio Pérez     BIT_ULL(VIRTIO_NET_F_STANDBY) |
1280d74e2b7SEugenio Pérez     BIT_ULL(VIRTIO_NET_F_SPEED_DUPLEX);
1291576dbb5SEugenio Pérez 
/* Address space id assigned to the CVQ group when CVQ is isolated */
#define VHOST_VDPA_NET_CVQ_ASID 1
131c1a10086SEugenio Pérez 
vhost_vdpa_get_vhost_net(NetClientState * nc)1321e0a84eaSCindy Lu VHostNetState *vhost_vdpa_get_vhost_net(NetClientState *nc)
1331e0a84eaSCindy Lu {
1341e0a84eaSCindy Lu     VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
1351e0a84eaSCindy Lu     assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
1361e0a84eaSCindy Lu     return s->vhost_net;
1371e0a84eaSCindy Lu }
1381e0a84eaSCindy Lu 
vhost_vdpa_net_cvq_cmd_len(void)139915bf6ccSEugenio Pérez static size_t vhost_vdpa_net_cvq_cmd_len(void)
140915bf6ccSEugenio Pérez {
141915bf6ccSEugenio Pérez     /*
142915bf6ccSEugenio Pérez      * MAC_TABLE_SET is the ctrl command that produces the longer out buffer.
143915bf6ccSEugenio Pérez      * In buffer is always 1 byte, so it should fit here
144915bf6ccSEugenio Pérez      */
145915bf6ccSEugenio Pérez     return sizeof(struct virtio_net_ctrl_hdr) +
146915bf6ccSEugenio Pérez            2 * sizeof(struct virtio_net_ctrl_mac) +
147915bf6ccSEugenio Pérez            MAC_TABLE_ENTRIES * ETH_ALEN;
148915bf6ccSEugenio Pérez }
149915bf6ccSEugenio Pérez 
/**
 * Control command buffer length, rounded up to a multiple of the real host
 * page size.
 */
static size_t vhost_vdpa_net_cvq_cmd_page_len(void)
{
    const size_t cmd_len = vhost_vdpa_net_cvq_cmd_len();
    const size_t page_size = qemu_real_host_page_size();

    return ROUND_UP(cmd_len, page_size);
}
154915bf6ccSEugenio Pérez 
vhost_vdpa_net_valid_svq_features(uint64_t features,Error ** errp)15536e46472SEugenio Pérez static bool vhost_vdpa_net_valid_svq_features(uint64_t features, Error **errp)
15636e46472SEugenio Pérez {
15736e46472SEugenio Pérez     uint64_t invalid_dev_features =
15836e46472SEugenio Pérez         features & ~vdpa_svq_device_features &
15936e46472SEugenio Pérez         /* Transport are all accepted at this point */
16036e46472SEugenio Pérez         ~MAKE_64BIT_MASK(VIRTIO_TRANSPORT_F_START,
16136e46472SEugenio Pérez                          VIRTIO_TRANSPORT_F_END - VIRTIO_TRANSPORT_F_START);
16236e46472SEugenio Pérez 
16336e46472SEugenio Pérez     if (invalid_dev_features) {
16436e46472SEugenio Pérez         error_setg(errp, "vdpa svq does not work with features 0x%" PRIx64,
16536e46472SEugenio Pérez                    invalid_dev_features);
166258a0394SEugenio Pérez         return false;
16736e46472SEugenio Pérez     }
16836e46472SEugenio Pérez 
169258a0394SEugenio Pérez     return vhost_svq_valid_features(features, errp);
17036e46472SEugenio Pérez }
17136e46472SEugenio Pérez 
vhost_vdpa_net_check_device_id(struct vhost_net * net)1721e0a84eaSCindy Lu static int vhost_vdpa_net_check_device_id(struct vhost_net *net)
1731e0a84eaSCindy Lu {
1741e0a84eaSCindy Lu     uint32_t device_id;
1751e0a84eaSCindy Lu     int ret;
1761e0a84eaSCindy Lu     struct vhost_dev *hdev;
1771e0a84eaSCindy Lu 
1781e0a84eaSCindy Lu     hdev = (struct vhost_dev *)&net->dev;
1791e0a84eaSCindy Lu     ret = hdev->vhost_ops->vhost_get_device_id(hdev, &device_id);
1801e0a84eaSCindy Lu     if (device_id != VIRTIO_ID_NET) {
1811e0a84eaSCindy Lu         return -ENOTSUP;
1821e0a84eaSCindy Lu     }
1831e0a84eaSCindy Lu     return ret;
1841e0a84eaSCindy Lu }
1851e0a84eaSCindy Lu 
/**
 * Create the vhost_net instance for a vhost-vdpa net client.
 *
 * @ncs: net client; must be of type NET_CLIENT_DRIVER_VHOST_VDPA (asserted)
 * @be: opaque backend pointer handed to vhost_net_init()
 * @queue_pair_index: not used by this function
 * @nvqs: number of virtqueues handled by the new vhost_net
 *
 * Returns 0 on success and -1 on failure.  On failure no vhost_net is left
 * attached to the client state.
 */
static int vhost_vdpa_add(NetClientState *ncs, void *be,
                          int queue_pair_index, int nvqs)
{
    VhostNetOptions options;
    struct vhost_net *net = NULL;
    VhostVDPAState *s;
    int ret;

    options.backend_type = VHOST_BACKEND_TYPE_VDPA;
    assert(ncs->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
    s = DO_UPCAST(VhostVDPAState, nc, ncs);
    options.net_backend = ncs;
    options.opaque      = be;
    options.busyloop_timeout = 0;
    options.nvqs = nvqs;

    net = vhost_net_init(&options);
    if (!net) {
        error_report("failed to init vhost_net for queue");
        goto err_init;
    }
    s->vhost_net = net;
    ret = vhost_vdpa_net_check_device_id(net);
    if (ret) {
        goto err_check;
    }
    return 0;
err_check:
    vhost_net_cleanup(net);
    g_free(net);
    /*
     * Do not leave a dangling pointer behind: vhost_vdpa_cleanup() cleans
     * up and frees s->vhost_net whenever it is not NULL.
     */
    s->vhost_net = NULL;
err_init:
    return -1;
}
2191e0a84eaSCindy Lu 
/**
 * Release the resources owned by a vhost-vdpa net client: the control
 * command shadow buffers, the vhost_net instance and the device fd.
 *
 * @nc: net client being cleaned up
 */
static void vhost_vdpa_cleanup(NetClientState *nc)
{
    VhostVDPAState *state = DO_UPCAST(VhostVDPAState, nc, nc);
    bool peer_is_nic = nc->peer &&
                       nc->peer->info->type == NET_CLIENT_DRIVER_NIC;

    /*
     * If a peer NIC is attached, do not cleanup anything.
     * Cleanup will happen as a part of qemu_cleanup() -> net_cleanup()
     * when the guest is shutting down.
     */
    if (peer_is_nic) {
        return;
    }

    munmap(state->cvq_cmd_out_buffer, vhost_vdpa_net_cvq_cmd_page_len());
    munmap(state->status, vhost_vdpa_net_cvq_cmd_page_len());

    if (state->vhost_net) {
        vhost_net_cleanup(state->vhost_net);
        g_free(state->vhost_net);
        state->vhost_net = NULL;
    }

    if (state->vhost_vdpa.device_fd >= 0) {
        qemu_close(state->vhost_vdpa.device_fd);
        state->vhost_vdpa.device_fd = -1;
    }
}
2441e0a84eaSCindy Lu 
245d1fd2d31SHawkins Jiawei /** Dummy SetSteeringEBPF to support RSS for vhost-vdpa backend  */
vhost_vdpa_set_steering_ebpf(NetClientState * nc,int prog_fd)246d1fd2d31SHawkins Jiawei static bool vhost_vdpa_set_steering_ebpf(NetClientState *nc, int prog_fd)
247d1fd2d31SHawkins Jiawei {
248d1fd2d31SHawkins Jiawei     return true;
249d1fd2d31SHawkins Jiawei }
250d1fd2d31SHawkins Jiawei 
/**
 * has_vnet_hdr callback: always answers true for a vhost-vdpa client.
 *
 * @nc: net client; must be of type NET_CLIENT_DRIVER_VHOST_VDPA (asserted)
 */
static bool vhost_vdpa_has_vnet_hdr(NetClientState *nc)
{
    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
    return true;
}
2571e0a84eaSCindy Lu 
/**
 * has_ufo callback: report whether VIRTIO_NET_F_HOST_UFO survives
 * vhost_net_get_features() for this client's vhost_net.
 *
 * @nc: net client; must be of type NET_CLIENT_DRIVER_VHOST_VDPA (asserted)
 */
static bool vhost_vdpa_has_ufo(NetClientState *nc)
{
    const uint64_t host_ufo = 1ULL << VIRTIO_NET_F_HOST_UFO;
    VhostVDPAState *state;
    uint64_t acked;

    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
    state = DO_UPCAST(VhostVDPAState, nc, nc);

    /* Offer only the UFO bit and see whether it is kept */
    acked = vhost_net_get_features(state->vhost_net, host_ufo);
    return (acked & host_ufo) != 0;
}
2681e0a84eaSCindy Lu 
vhost_vdpa_check_peer_type(NetClientState * nc,ObjectClass * oc,Error ** errp)269ee8a1c63SKevin Wolf static bool vhost_vdpa_check_peer_type(NetClientState *nc, ObjectClass *oc,
270ee8a1c63SKevin Wolf                                        Error **errp)
271ee8a1c63SKevin Wolf {
272ee8a1c63SKevin Wolf     const char *driver = object_class_get_name(oc);
273ee8a1c63SKevin Wolf 
274ee8a1c63SKevin Wolf     if (!g_str_has_prefix(driver, "virtio-net-")) {
275ee8a1c63SKevin Wolf         error_setg(errp, "vhost-vdpa requires frontend driver virtio-net-*");
276ee8a1c63SKevin Wolf         return false;
277ee8a1c63SKevin Wolf     }
278ee8a1c63SKevin Wolf 
279ee8a1c63SKevin Wolf     return true;
280ee8a1c63SKevin Wolf }
281ee8a1c63SKevin Wolf 
282846a1e85SEugenio Pérez /** Dummy receive in case qemu falls back to userland tap networking */
vhost_vdpa_receive(NetClientState * nc,const uint8_t * buf,size_t size)283846a1e85SEugenio Pérez static ssize_t vhost_vdpa_receive(NetClientState *nc, const uint8_t *buf,
284846a1e85SEugenio Pérez                                   size_t size)
285846a1e85SEugenio Pérez {
286bc5add1dSSi-Wei Liu     return size;
287846a1e85SEugenio Pérez }
288846a1e85SEugenio Pérez 
28900ef422eSEugenio Pérez /** From any vdpa net client, get the netclient of the first queue pair */
vhost_vdpa_net_first_nc_vdpa(VhostVDPAState * s)29000ef422eSEugenio Pérez static VhostVDPAState *vhost_vdpa_net_first_nc_vdpa(VhostVDPAState *s)
29100ef422eSEugenio Pérez {
29200ef422eSEugenio Pérez     NICState *nic = qemu_get_nic(s->nc.peer);
29300ef422eSEugenio Pérez     NetClientState *nc0 = qemu_get_peer(nic->ncs, 0);
29400ef422eSEugenio Pérez 
29500ef422eSEugenio Pérez     return DO_UPCAST(VhostVDPAState, nc, nc0);
29600ef422eSEugenio Pérez }
29700ef422eSEugenio Pérez 
/**
 * Switch the device to or from SVQ by restarting vhost net.
 *
 * @s: state of the first data queue pair
 * @enable: desired value of shadow_vqs_enabled
 *
 * Nothing happens if shadow_vqs_enabled already equals @enable or if vhost
 * is not started on the virtio-net device.  A failure to restart is only
 * reported, not returned.
 */
static void vhost_vdpa_net_log_global_enable(VhostVDPAState *s, bool enable)
{
    struct vhost_vdpa *vdpa = &s->vhost_vdpa;
    VirtIODevice *vdev;
    VirtIONet *net;
    int nr_data_pairs, nr_cvq, rc;

    /* We are only called on the first data vqs and only if x-svq is not set */
    if (vdpa->shadow_vqs_enabled == enable) {
        return;
    }

    vdev = vdpa->dev->vdev;
    net = VIRTIO_NET(vdev);
    if (!net->vhost_started) {
        return;
    }

    nr_data_pairs = net->multiqueue ? net->max_queue_pairs : 1;
    nr_cvq = 0;
    if (virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ)) {
        nr_cvq = net->max_ncs - net->max_queue_pairs;
    }

    /*
     * TODO: vhost_net_stop does suspend, get_base and reset. We can be smarter
     * in the future and resume the device if read-only operations between
     * suspend and reset goes wrong.
     */
    vhost_net_stop(vdev, net->nic->ncs, nr_data_pairs, nr_cvq);

    /* Start will check migration setup_or_active to configure or not SVQ */
    rc = vhost_net_start(vdev, net->nic->ncs, nr_data_pairs, nr_cvq);
    if (unlikely(rc < 0)) {
        error_report("unable to start vhost net: %s(%d)", g_strerror(-rc), -rc);
    }
}
33269498430SEugenio Pérez 
/**
 * Migration state notifier: request SVQ when a migration is being set up
 * and drop it again when the migration has failed.
 *
 * @notifier: the migration_state member of a VhostVDPAState
 * @data: the MigrationState the notification is about
 */
static void vdpa_net_migration_state_notifier(Notifier *notifier, void *data)
{
    VhostVDPAState *state = container_of(notifier, VhostVDPAState,
                                         migration_state);
    MigrationState *mig = data;

    if (migration_in_setup(mig)) {
        vhost_vdpa_net_log_global_enable(state, true);
        return;
    }

    if (migration_has_failed(mig)) {
        vhost_vdpa_net_log_global_enable(state, false);
    }
}
34569498430SEugenio Pérez 
/**
 * Start-up work that only the first data queue pair performs: register
 * the migration notifier and, if SVQ is enabled, create the IOVA tree
 * covering the device IOVA range.
 */
static void vhost_vdpa_net_data_start_first(VhostVDPAState *s)
{
    struct vhost_vdpa *vdpa = &s->vhost_vdpa;

    migration_add_notifier(&s->migration_state,
                           vdpa_net_migration_state_notifier);

    if (!vdpa->shadow_vqs_enabled) {
        return;
    }

    vdpa->iova_tree = vhost_iova_tree_new(vdpa->iova_range.first,
                                          vdpa->iova_range.last);
}
35700ef422eSEugenio Pérez 
/**
 * start callback for data queue pairs.
 *
 * Decides whether SVQ is used (always_svq, or a migration that is in setup
 * or active) and makes sure the client has an IOVA tree to work with: the
 * first queue pair creates it, the others borrow the first one's.
 *
 * @nc: net client; must be of type NET_CLIENT_DRIVER_VHOST_VDPA (asserted)
 *
 * Always returns 0.
 */
static int vhost_vdpa_net_data_start(NetClientState *nc)
{
    VhostVDPAState *state = DO_UPCAST(VhostVDPAState, nc, nc);
    struct vhost_vdpa *vdpa = &state->vhost_vdpa;
    bool use_svq;

    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);

    use_svq = state->always_svq ||
              migration_is_setup_or_active(migrate_get_current()->state);
    vdpa->shadow_vqs_enabled = use_svq;
    vdpa->shadow_data = use_svq;

    if (vdpa->index == 0) {
        vhost_vdpa_net_data_start_first(state);
    } else if (vdpa->shadow_vqs_enabled) {
        /* Share the tree owned by the first queue pair */
        VhostVDPAState *first = vhost_vdpa_net_first_nc_vdpa(state);

        vdpa->iova_tree = first->vhost_vdpa.iova_tree;
    }

    return 0;
}
38600ef422eSEugenio Pérez 
/**
 * load callback for data queue pairs.
 *
 * When vq_index_end is odd the device is treated as having a CVQ and
 * nothing is done here.  Otherwise every vring of this vhost device is
 * set ready.
 *
 * Always returns 0.
 */
static int vhost_vdpa_net_data_load(NetClientState *nc)
{
    VhostVDPAState *state = DO_UPCAST(VhostVDPAState, nc, nc);
    struct vhost_vdpa *vdpa = &state->vhost_vdpa;

    /* An odd number of virtqueues means the last one is the CVQ */
    if (vdpa->dev->vq_index_end % 2) {
        return 0;
    }

    for (int vq = 0; vq < vdpa->dev->nvqs; ++vq) {
        vhost_vdpa_set_vring_ready(vdpa, vq + vdpa->dev->vq_index);
    }

    return 0;
}
4026c482547SEugenio Pérez 
/**
 * stop callback.
 *
 * The first queue pair removes the migration notifier.  The client whose
 * virtqueues are the last ones of the device deletes the IOVA tree; every
 * other client only forgets its reference to it.
 *
 * @nc: net client; must be of type NET_CLIENT_DRIVER_VHOST_VDPA (asserted)
 */
static void vhost_vdpa_net_client_stop(NetClientState *nc)
{
    VhostVDPAState *state = DO_UPCAST(VhostVDPAState, nc, nc);
    struct vhost_vdpa *vdpa = &state->vhost_vdpa;
    struct vhost_dev *hdev;

    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);

    if (vdpa->index == 0) {
        migration_remove_notifier(&state->migration_state);
    }

    hdev = vdpa->dev;
    if (hdev->vq_index + hdev->nvqs != hdev->vq_index_end) {
        vdpa->iova_tree = NULL;
        return;
    }

    g_clear_pointer(&vdpa->iova_tree, vhost_iova_tree_delete);
}
42100ef422eSEugenio Pérez 
4221e0a84eaSCindy Lu static NetClientInfo net_vhost_vdpa_info = {
4231e0a84eaSCindy Lu         .type = NET_CLIENT_DRIVER_VHOST_VDPA,
4241e0a84eaSCindy Lu         .size = sizeof(VhostVDPAState),
425846a1e85SEugenio Pérez         .receive = vhost_vdpa_receive,
42600ef422eSEugenio Pérez         .start = vhost_vdpa_net_data_start,
4276c482547SEugenio Pérez         .load = vhost_vdpa_net_data_load,
42800ef422eSEugenio Pérez         .stop = vhost_vdpa_net_client_stop,
4291e0a84eaSCindy Lu         .cleanup = vhost_vdpa_cleanup,
4301e0a84eaSCindy Lu         .has_vnet_hdr = vhost_vdpa_has_vnet_hdr,
4311e0a84eaSCindy Lu         .has_ufo = vhost_vdpa_has_ufo,
432ee8a1c63SKevin Wolf         .check_peer_type = vhost_vdpa_check_peer_type,
433d1fd2d31SHawkins Jiawei         .set_steering_ebpf = vhost_vdpa_set_steering_ebpf,
4341e0a84eaSCindy Lu };
4351e0a84eaSCindy Lu 
vhost_vdpa_get_vring_group(int device_fd,unsigned vq_index,Error ** errp)436152128d6SEugenio Pérez static int64_t vhost_vdpa_get_vring_group(int device_fd, unsigned vq_index,
437152128d6SEugenio Pérez                                           Error **errp)
438c1a10086SEugenio Pérez {
439c1a10086SEugenio Pérez     struct vhost_vring_state state = {
440c1a10086SEugenio Pérez         .index = vq_index,
441c1a10086SEugenio Pérez     };
442c1a10086SEugenio Pérez     int r = ioctl(device_fd, VHOST_VDPA_GET_VRING_GROUP, &state);
443c1a10086SEugenio Pérez 
444c1a10086SEugenio Pérez     if (unlikely(r < 0)) {
4450f2bb0bfSEugenio Pérez         r = -errno;
446152128d6SEugenio Pérez         error_setg_errno(errp, errno, "Cannot get VQ %u group", vq_index);
447c1a10086SEugenio Pérez         return r;
448c1a10086SEugenio Pérez     }
449c1a10086SEugenio Pérez 
450c1a10086SEugenio Pérez     return state.num;
451c1a10086SEugenio Pérez }
452c1a10086SEugenio Pérez 
vhost_vdpa_set_address_space_id(struct vhost_vdpa * v,unsigned vq_group,unsigned asid_num)453c1a10086SEugenio Pérez static int vhost_vdpa_set_address_space_id(struct vhost_vdpa *v,
454c1a10086SEugenio Pérez                                            unsigned vq_group,
455c1a10086SEugenio Pérez                                            unsigned asid_num)
456c1a10086SEugenio Pérez {
457c1a10086SEugenio Pérez     struct vhost_vring_state asid = {
458c1a10086SEugenio Pérez         .index = vq_group,
459c1a10086SEugenio Pérez         .num = asid_num,
460c1a10086SEugenio Pérez     };
461c1a10086SEugenio Pérez     int r;
462c1a10086SEugenio Pérez 
463c1a10086SEugenio Pérez     r = ioctl(v->device_fd, VHOST_VDPA_SET_GROUP_ASID, &asid);
464c1a10086SEugenio Pérez     if (unlikely(r < 0)) {
465c1a10086SEugenio Pérez         error_report("Can't set vq group %u asid %u, errno=%d (%s)",
466c1a10086SEugenio Pérez                      asid.index, asid.num, errno, g_strerror(errno));
467c1a10086SEugenio Pérez     }
468c1a10086SEugenio Pérez     return r;
469c1a10086SEugenio Pérez }
470c1a10086SEugenio Pérez 
vhost_vdpa_cvq_unmap_buf(struct vhost_vdpa * v,void * addr)4712df4dd31SEugenio Pérez static void vhost_vdpa_cvq_unmap_buf(struct vhost_vdpa *v, void *addr)
4722df4dd31SEugenio Pérez {
4732df4dd31SEugenio Pérez     VhostIOVATree *tree = v->iova_tree;
4742df4dd31SEugenio Pérez     DMAMap needle = {
4752df4dd31SEugenio Pérez         /*
4762df4dd31SEugenio Pérez          * No need to specify size or to look for more translations since
4772df4dd31SEugenio Pérez          * this contiguous chunk was allocated by us.
4782df4dd31SEugenio Pérez          */
4792df4dd31SEugenio Pérez         .translated_addr = (hwaddr)(uintptr_t)addr,
4802df4dd31SEugenio Pérez     };
4812df4dd31SEugenio Pérez     const DMAMap *map = vhost_iova_tree_find_iova(tree, &needle);
4822df4dd31SEugenio Pérez     int r;
4832df4dd31SEugenio Pérez 
4842df4dd31SEugenio Pérez     if (unlikely(!map)) {
4852df4dd31SEugenio Pérez         error_report("Cannot locate expected map");
4862df4dd31SEugenio Pérez         return;
4872df4dd31SEugenio Pérez     }
4882df4dd31SEugenio Pérez 
489cd831ed5SEugenio Pérez     r = vhost_vdpa_dma_unmap(v, v->address_space_id, map->iova, map->size + 1);
4902df4dd31SEugenio Pérez     if (unlikely(r != 0)) {
4912df4dd31SEugenio Pérez         error_report("Device cannot unmap: %s(%d)", g_strerror(r), r);
4922df4dd31SEugenio Pérez     }
4932df4dd31SEugenio Pérez 
49469292a8eSEugenio Pérez     vhost_iova_tree_remove(tree, *map);
4952df4dd31SEugenio Pérez }
4962df4dd31SEugenio Pérez 
4977a7f87e9SEugenio Pérez /** Map CVQ buffer. */
vhost_vdpa_cvq_map_buf(struct vhost_vdpa * v,void * buf,size_t size,bool write)4987a7f87e9SEugenio Pérez static int vhost_vdpa_cvq_map_buf(struct vhost_vdpa *v, void *buf, size_t size,
4997a7f87e9SEugenio Pérez                                   bool write)
5002df4dd31SEugenio Pérez {
5012df4dd31SEugenio Pérez     DMAMap map = {};
5022df4dd31SEugenio Pérez     int r;
5032df4dd31SEugenio Pérez 
5042df4dd31SEugenio Pérez     map.translated_addr = (hwaddr)(uintptr_t)buf;
5057a7f87e9SEugenio Pérez     map.size = size - 1;
5062df4dd31SEugenio Pérez     map.perm = write ? IOMMU_RW : IOMMU_RO,
5072df4dd31SEugenio Pérez     r = vhost_iova_tree_map_alloc(v->iova_tree, &map);
5082df4dd31SEugenio Pérez     if (unlikely(r != IOVA_OK)) {
5092df4dd31SEugenio Pérez         error_report("Cannot map injected element");
5107a7f87e9SEugenio Pérez         return r;
5112df4dd31SEugenio Pérez     }
5122df4dd31SEugenio Pérez 
513cd831ed5SEugenio Pérez     r = vhost_vdpa_dma_map(v, v->address_space_id, map.iova,
514cd831ed5SEugenio Pérez                            vhost_vdpa_net_cvq_cmd_page_len(), buf, !write);
5152df4dd31SEugenio Pérez     if (unlikely(r < 0)) {
5162df4dd31SEugenio Pérez         goto dma_map_err;
5172df4dd31SEugenio Pérez     }
5182df4dd31SEugenio Pérez 
5197a7f87e9SEugenio Pérez     return 0;
5202df4dd31SEugenio Pérez 
5212df4dd31SEugenio Pérez dma_map_err:
52269292a8eSEugenio Pérez     vhost_iova_tree_remove(v->iova_tree, map);
5237a7f87e9SEugenio Pérez     return r;
5242df4dd31SEugenio Pérez }
5252df4dd31SEugenio Pérez 
vhost_vdpa_net_cvq_start(NetClientState * nc)5267a7f87e9SEugenio Pérez static int vhost_vdpa_net_cvq_start(NetClientState *nc)
5272df4dd31SEugenio Pérez {
52800ef422eSEugenio Pérez     VhostVDPAState *s, *s0;
529c1a10086SEugenio Pérez     struct vhost_vdpa *v;
530c1a10086SEugenio Pérez     int64_t cvq_group;
531152128d6SEugenio Pérez     int r;
532152128d6SEugenio Pérez     Error *err = NULL;
5332df4dd31SEugenio Pérez 
5347a7f87e9SEugenio Pérez     assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
5357a7f87e9SEugenio Pérez 
5367a7f87e9SEugenio Pérez     s = DO_UPCAST(VhostVDPAState, nc, nc);
537c1a10086SEugenio Pérez     v = &s->vhost_vdpa;
538c1a10086SEugenio Pérez 
53969498430SEugenio Pérez     s0 = vhost_vdpa_net_first_nc_vdpa(s);
54069498430SEugenio Pérez     v->shadow_data = s0->vhost_vdpa.shadow_vqs_enabled;
541b40eba9cSEugenio Pérez     v->shadow_vqs_enabled = s0->vhost_vdpa.shadow_vqs_enabled;
542c1a10086SEugenio Pérez     s->vhost_vdpa.address_space_id = VHOST_VDPA_GUEST_PA_ASID;
543c1a10086SEugenio Pérez 
54469498430SEugenio Pérez     if (s->vhost_vdpa.shadow_data) {
545c1a10086SEugenio Pérez         /* SVQ is already configured for all virtqueues */
546c1a10086SEugenio Pérez         goto out;
547c1a10086SEugenio Pérez     }
548c1a10086SEugenio Pérez 
549c1a10086SEugenio Pérez     /*
550c1a10086SEugenio Pérez      * If we early return in these cases SVQ will not be enabled. The migration
551c1a10086SEugenio Pérez      * will be blocked as long as vhost-vdpa backends will not offer _F_LOG.
552c1a10086SEugenio Pérez      */
553152128d6SEugenio Pérez     if (!vhost_vdpa_net_valid_svq_features(v->dev->features, NULL)) {
554c1a10086SEugenio Pérez         return 0;
555c1a10086SEugenio Pérez     }
556c1a10086SEugenio Pérez 
557152128d6SEugenio Pérez     if (!s->cvq_isolated) {
558152128d6SEugenio Pérez         return 0;
559152128d6SEugenio Pérez     }
560152128d6SEugenio Pérez 
561152128d6SEugenio Pérez     cvq_group = vhost_vdpa_get_vring_group(v->device_fd,
562152128d6SEugenio Pérez                                            v->dev->vq_index_end - 1,
563152128d6SEugenio Pérez                                            &err);
564c1a10086SEugenio Pérez     if (unlikely(cvq_group < 0)) {
565152128d6SEugenio Pérez         error_report_err(err);
566c1a10086SEugenio Pérez         return cvq_group;
567c1a10086SEugenio Pérez     }
568c1a10086SEugenio Pérez 
569c1a10086SEugenio Pérez     r = vhost_vdpa_set_address_space_id(v, cvq_group, VHOST_VDPA_NET_CVQ_ASID);
570c1a10086SEugenio Pérez     if (unlikely(r < 0)) {
571c1a10086SEugenio Pérez         return r;
572c1a10086SEugenio Pérez     }
573c1a10086SEugenio Pérez 
574c1a10086SEugenio Pérez     v->shadow_vqs_enabled = true;
575c1a10086SEugenio Pérez     s->vhost_vdpa.address_space_id = VHOST_VDPA_NET_CVQ_ASID;
576c1a10086SEugenio Pérez 
577c1a10086SEugenio Pérez out:
5787a7f87e9SEugenio Pérez     if (!s->vhost_vdpa.shadow_vqs_enabled) {
5797a7f87e9SEugenio Pérez         return 0;
5802df4dd31SEugenio Pérez     }
5812df4dd31SEugenio Pérez 
58200ef422eSEugenio Pérez     if (s0->vhost_vdpa.iova_tree) {
58300ef422eSEugenio Pérez         /*
58400ef422eSEugenio Pérez          * SVQ is already configured for all virtqueues.  Reuse IOVA tree for
58500ef422eSEugenio Pérez          * simplicity, whether CVQ shares ASID with guest or not, because:
58600ef422eSEugenio Pérez          * - Memory listener need access to guest's memory addresses allocated
58700ef422eSEugenio Pérez          *   in the IOVA tree.
58800ef422eSEugenio Pérez          * - There should be plenty of IOVA address space for both ASID not to
58900ef422eSEugenio Pérez          *   worry about collisions between them.  Guest's translations are
59000ef422eSEugenio Pérez          *   still validated with virtio virtqueue_pop so there is no risk for
59100ef422eSEugenio Pérez          *   the guest to access memory that it shouldn't.
59200ef422eSEugenio Pérez          *
59300ef422eSEugenio Pérez          * To allocate a iova tree per ASID is doable but it complicates the
59400ef422eSEugenio Pérez          * code and it is not worth it for the moment.
59500ef422eSEugenio Pérez          */
59600ef422eSEugenio Pérez         v->iova_tree = s0->vhost_vdpa.iova_tree;
59700ef422eSEugenio Pérez     } else {
59800ef422eSEugenio Pérez         v->iova_tree = vhost_iova_tree_new(v->iova_range.first,
59900ef422eSEugenio Pérez                                            v->iova_range.last);
60000ef422eSEugenio Pérez     }
60100ef422eSEugenio Pérez 
6027a7f87e9SEugenio Pérez     r = vhost_vdpa_cvq_map_buf(&s->vhost_vdpa, s->cvq_cmd_out_buffer,
6037a7f87e9SEugenio Pérez                                vhost_vdpa_net_cvq_cmd_page_len(), false);
6047a7f87e9SEugenio Pérez     if (unlikely(r < 0)) {
6057a7f87e9SEugenio Pérez         return r;
6067a7f87e9SEugenio Pérez     }
6077a7f87e9SEugenio Pérez 
60817fb889fSEugenio Pérez     r = vhost_vdpa_cvq_map_buf(&s->vhost_vdpa, s->status,
6097a7f87e9SEugenio Pérez                                vhost_vdpa_net_cvq_cmd_page_len(), true);
6107a7f87e9SEugenio Pérez     if (unlikely(r < 0)) {
6112df4dd31SEugenio Pérez         vhost_vdpa_cvq_unmap_buf(&s->vhost_vdpa, s->cvq_cmd_out_buffer);
6122df4dd31SEugenio Pérez     }
6132df4dd31SEugenio Pérez 
6147a7f87e9SEugenio Pérez     return r;
6157a7f87e9SEugenio Pérez }
6167a7f87e9SEugenio Pérez 
/*
 * Stop the control virtqueue of a vhost-vdpa net client.
 *
 * If shadow virtqueues are enabled for this client, the two CVQ shadow
 * buffers (command out buffer and status buffer) are unmapped before
 * vhost_vdpa_net_client_stop() is invoked.
 *
 * @nc: The net client; must be of type NET_CLIENT_DRIVER_VHOST_VDPA
 */
static void vhost_vdpa_net_cvq_stop(NetClientState *nc)
{
    VhostVDPAState *state;
    struct vhost_vdpa *vdpa;

    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);

    state = DO_UPCAST(VhostVDPAState, nc, nc);
    vdpa = &state->vhost_vdpa;

    if (vdpa->shadow_vqs_enabled) {
        /* Release the mappings of both control command shadow buffers */
        vhost_vdpa_cvq_unmap_buf(vdpa, state->cvq_cmd_out_buffer);
        vhost_vdpa_cvq_unmap_buf(vdpa, state->status);
    }

    vhost_vdpa_net_client_stop(nc);
}
6302df4dd31SEugenio Pérez 
/*
 * Add one control command to the shadow virtqueue stored at index 0 of
 * s->vhost_vdpa.shadow_vqs.
 *
 * @s: The VhostVDPAState
 * @out_sg: Device-readable scatter-gather list
 * @out_num: Number of entries in @out_sg
 * @in_sg: Device-writable scatter-gather list
 * @in_num: Number of entries in @in_sg
 *
 * Returns the value returned by vhost_svq_add().  A -ENOSPC result is
 * additionally logged as a guest error.
 */
static ssize_t vhost_vdpa_net_cvq_add(VhostVDPAState *s,
                                    const struct iovec *out_sg, size_t out_num,
                                    const struct iovec *in_sg, size_t in_num)
{
    VhostShadowVirtqueue *cvq = g_ptr_array_index(s->vhost_vdpa.shadow_vqs, 0);
    int ret = vhost_svq_add(cvq, out_sg, out_num, in_sg, in_num, NULL);

    /* -ENOSPC is non-zero, so a single comparison covers the failure case */
    if (unlikely(ret == -ENOSPC)) {
        qemu_log_mask(LOG_GUEST_ERROR, "%s: No space on device queue\n",
                      __func__);
    }

    return ret;
}
648be4278b6SEugenio Pérez 
649be4278b6SEugenio Pérez /*
650a864a321SHawkins Jiawei  * Convenience wrapper to poll SVQ for multiple control commands.
651a864a321SHawkins Jiawei  *
652a864a321SHawkins Jiawei  * Caller should hold the BQL when invoking this function, and should take
653a864a321SHawkins Jiawei  * the answer before SVQ pulls by itself when BQL is released.
654be4278b6SEugenio Pérez  */
vhost_vdpa_net_svq_poll(VhostVDPAState * s,size_t cmds_in_flight)655a864a321SHawkins Jiawei static ssize_t vhost_vdpa_net_svq_poll(VhostVDPAState *s, size_t cmds_in_flight)
656a864a321SHawkins Jiawei {
657a864a321SHawkins Jiawei     VhostShadowVirtqueue *svq = g_ptr_array_index(s->vhost_vdpa.shadow_vqs, 0);
658a864a321SHawkins Jiawei     return vhost_svq_poll(svq, cmds_in_flight);
659be4278b6SEugenio Pérez }
660be4278b6SEugenio Pérez 
/*
 * Rewind both load cursors to the start of their shadow buffers.
 *
 * @s: The VhostVDPAState owning the shadow buffers
 * @out_cursor: Cursor over the command out buffer (s->cvq_cmd_out_buffer)
 * @in_cursor: Cursor over the status buffer (s->status)
 *
 * After the call each cursor spans vhost_vdpa_net_cvq_cmd_page_len() bytes
 * from the beginning of its buffer.
 */
static void vhost_vdpa_net_load_cursor_reset(VhostVDPAState *s,
                                             struct iovec *out_cursor,
                                             struct iovec *in_cursor)
{
    const size_t page_len = vhost_vdpa_net_cvq_cmd_page_len();

    /* cursor over the buffer the device reads commands from */
    *out_cursor = (struct iovec) {
        .iov_base = s->cvq_cmd_out_buffer,
        .iov_len = page_len,
    };

    /* cursor over the buffer the device writes acks into */
    *in_cursor = (struct iovec) {
        .iov_base = s->status,
        .iov_len = page_len,
    };
}
6731d7e2a8fSHawkins Jiawei 
674acec5f68SHawkins Jiawei /*
675acec5f68SHawkins Jiawei  * Poll SVQ for multiple pending control commands and check the device's ack.
676acec5f68SHawkins Jiawei  *
677acec5f68SHawkins Jiawei  * Caller should hold the BQL when invoking this function.
678acec5f68SHawkins Jiawei  *
679acec5f68SHawkins Jiawei  * @s: The VhostVDPAState
680acec5f68SHawkins Jiawei  * @len: The length of the pending status shadow buffer
681acec5f68SHawkins Jiawei  */
vhost_vdpa_net_svq_flush(VhostVDPAState * s,size_t len)682acec5f68SHawkins Jiawei static ssize_t vhost_vdpa_net_svq_flush(VhostVDPAState *s, size_t len)
683acec5f68SHawkins Jiawei {
684acec5f68SHawkins Jiawei     /* device uses a one-byte length ack for each control command */
685acec5f68SHawkins Jiawei     ssize_t dev_written = vhost_vdpa_net_svq_poll(s, len);
686acec5f68SHawkins Jiawei     if (unlikely(dev_written != len)) {
687acec5f68SHawkins Jiawei         return -EIO;
688acec5f68SHawkins Jiawei     }
689acec5f68SHawkins Jiawei 
690acec5f68SHawkins Jiawei     /* check the device's ack */
691acec5f68SHawkins Jiawei     for (int i = 0; i < len; ++i) {
692acec5f68SHawkins Jiawei         if (s->status[i] != VIRTIO_NET_OK) {
693acec5f68SHawkins Jiawei             return -EIO;
694acec5f68SHawkins Jiawei         }
695acec5f68SHawkins Jiawei     }
696acec5f68SHawkins Jiawei     return 0;
697acec5f68SHawkins Jiawei }
698acec5f68SHawkins Jiawei 
/*
 * Queue one control command in the CVQ shadow virtqueue.
 *
 * The command (header plus command-specific data) is packed at the current
 * position of @out_cursor and one status byte is reserved at the current
 * position of @in_cursor.  If the shadow virtqueue has fewer than two free
 * slots, or @out_cursor has not enough room left, all pending commands are
 * flushed first and both cursors are reset.
 *
 * @s: The VhostVDPAState
 * @out_cursor: Cursor over the remaining part of the command out buffer
 * @in_cursor: Cursor over the remaining part of the status buffer
 * @class: Control command class
 * @cmd: Control command within @class
 * @data_sg: Scatter-gather list holding the command-specific data
 * @data_num: Number of entries in @data_sg
 *
 * Returns 0 on success, or the negative value reported by the flush or by
 * adding the command to the shadow virtqueue.
 */
static ssize_t vhost_vdpa_net_load_cmd(VhostVDPAState *s,
                                       struct iovec *out_cursor,
                                       struct iovec *in_cursor, uint8_t class,
                                       uint8_t cmd, const struct iovec *data_sg,
                                       size_t data_num)
{
    const struct virtio_net_ctrl_hdr ctrl = {
        .class = class,
        .cmd = cmd,
    };
    const size_t data_size = iov_size(data_sg, data_num);
    const size_t total_len = sizeof(ctrl) + data_size;
    VhostShadowVirtqueue *cvq = g_ptr_array_index(s->vhost_vdpa.shadow_vqs, 0);
    struct iovec out_sg, in_sg;
    unsigned out_cnt = 1, in_cnt = 1;
    ssize_t ret;

    assert(data_size < vhost_vdpa_net_cvq_cmd_page_len() - sizeof(ctrl));

    if (vhost_svq_available_slots(cvq) < 2 ||
        out_cursor->iov_len < total_len) {
        /*
         * Either SVQ or the control commands shadow buffers are full, so
         * every pending control command has to be flushed now.
         *
         * Polling is fine here: the BQL has been held since the descriptor
         * was sent.  The number of pending acks is how far the in cursor
         * has advanced from the start of the status buffer.
         */
        const size_t pending_acks = in_cursor->iov_base - (void *)s->status;

        ret = vhost_vdpa_net_svq_flush(s, pending_acks);
        if (unlikely(ret < 0)) {
            return ret;
        }

        vhost_vdpa_net_load_cursor_reset(s, out_cursor, in_cursor);
    }

    /* write the CVQ command header at the cursor position */
    iov_from_buf(out_cursor, 1, 0, &ctrl, sizeof(ctrl));
    /* followed by the CVQ command-specific data */
    iov_to_buf(data_sg, data_num, 0,
               out_cursor->iov_base + sizeof(ctrl), data_size);

    /* carve the device-readable descriptor out of the out cursor */
    iov_copy(&out_sg, 1, out_cursor, 1, 0, total_len);
    /* carve the device-writable ack descriptor out of the in cursor */
    iov_copy(&in_sg, 1, in_cursor, 1, 0, sizeof(*s->status));

    ret = vhost_vdpa_net_cvq_add(s, &out_sg, 1, &in_sg, 1);
    if (unlikely(ret < 0)) {
        return ret;
    }

    /* advance both cursors past the space just consumed */
    iov_discard_front(&out_cursor, &out_cnt, total_len);
    iov_discard_front(&in_cursor, &in_cnt, sizeof(*s->status));

    return 0;
}
759f73c0c43SEugenio Pérez 
/*
 * Queue the control commands that restore the MAC state of @n.
 *
 * With VIRTIO_NET_F_CTRL_MAC_ADDR negotiated, a MAC_ADDR_SET command carrying
 * n->mac is queued.  With VIRTIO_NET_F_CTRL_RX negotiated and a non-empty
 * MAC filter table, a MAC_TABLE_SET command carrying the unicast and
 * multicast entries is queued as well.
 *
 * @s: The VhostVDPAState
 * @n: The VirtIONet device whose state is loaded
 * @out_cursor: Cursor over the command out buffer
 * @in_cursor: Cursor over the status buffer
 *
 * Returns 0 on success or a negative value from vhost_vdpa_net_load_cmd().
 */
static int vhost_vdpa_net_load_mac(VhostVDPAState *s, const VirtIONet *n,
                                   struct iovec *out_cursor,
                                   struct iovec *in_cursor)
{
    ssize_t ret;

    if (virtio_vdev_has_feature(&n->parent_obj, VIRTIO_NET_F_CTRL_MAC_ADDR)) {
        const struct iovec mac_sg = {
            .iov_base = (void *)n->mac,
            .iov_len = sizeof(n->mac),
        };

        ret = vhost_vdpa_net_load_cmd(s, out_cursor, in_cursor,
                                      VIRTIO_NET_CTRL_MAC,
                                      VIRTIO_NET_CTRL_MAC_ADDR_SET,
                                      &mac_sg, 1);
        if (unlikely(ret < 0)) {
            return ret;
        }
    }

    /*
     * The VirtIO standard states: "The device MUST have an
     * empty MAC filtering table on reset.".
     *
     * So when the driver's MAC filter table is empty too, it already
     * matches the device's defaults and the CVQ command can be skipped.
     *
     * Note that the device's defaults can mismatch the driver's
     * configuration only at live migration.
     */
    if (!virtio_vdev_has_feature(&n->parent_obj, VIRTIO_NET_F_CTRL_RX) ||
        n->mac_table.in_use == 0) {
        return 0;
    }

    /* unicast entries come first in the table, multicast ones follow */
    const uint32_t uni_entries = n->mac_table.first_multi;
    const uint32_t uni_macs_size = uni_entries * ETH_ALEN;
    const uint32_t mul_entries = n->mac_table.in_use - uni_entries;
    const uint32_t mul_macs_size = mul_entries * ETH_ALEN;
    struct virtio_net_ctrl_mac uni = {
        .entries = cpu_to_le32(uni_entries),
    };
    struct virtio_net_ctrl_mac mul = {
        .entries = cpu_to_le32(mul_entries),
    };
    const struct iovec table_sg[] = {
        {
            .iov_base = &uni,
            .iov_len = sizeof(uni),
        }, {
            .iov_base = n->mac_table.macs,
            .iov_len = uni_macs_size,
        }, {
            .iov_base = &mul,
            .iov_len = sizeof(mul),
        }, {
            .iov_base = &n->mac_table.macs[uni_macs_size],
            .iov_len = mul_macs_size,
        },
    };

    ret = vhost_vdpa_net_load_cmd(s, out_cursor, in_cursor,
                                  VIRTIO_NET_CTRL_MAC,
                                  VIRTIO_NET_CTRL_MAC_TABLE_SET,
                                  table_sg, ARRAY_SIZE(table_sg));

    return unlikely(ret < 0) ? ret : 0;
}
829f73c0c43SEugenio Pérez 
/*
 * Queue the control command that restores the RSS or hash-report
 * configuration of @n.
 *
 * @s: The VhostVDPAState
 * @n: The VirtIONet device whose state is loaded
 * @out_cursor: Cursor over the command out buffer
 * @in_cursor: Cursor over the status buffer
 * @do_rss: true to send VIRTIO_NET_CTRL_MQ_RSS_CONFIG, false to send
 *          VIRTIO_NET_CTRL_MQ_HASH_CONFIG
 *
 * Returns 0 on success (including when nothing needs to be sent) or a
 * negative value from vhost_vdpa_net_load_cmd().
 */
static int vhost_vdpa_net_load_rss(VhostVDPAState *s, const VirtIONet *n,
                                   struct iovec *out_cursor,
                                   struct iovec *in_cursor, bool do_rss)
{
    struct virtio_net_rss_config cfg = {};
    g_autofree uint16_t *table = NULL;
    ssize_t ret;

    /*
     * The VirtIO standard states: "Initially the device has all hash
     * types disabled and reports only VIRTIO_NET_HASH_REPORT_NONE.".
     *
     * So when the driver has every hash type disabled, it already matches
     * the device's defaults and the CVQ command can be skipped.
     *
     * Note that the device's defaults can mismatch the driver's
     * configuration only at live migration.
     */
    if (!n->rss_data.enabled ||
        n->rss_data.hash_types == VIRTIO_NET_HASH_REPORT_NONE) {
        return 0;
    }

    table = g_malloc_n(n->rss_data.indirections_len,
                       sizeof(n->rss_data.indirections_table[0]));

    cfg.hash_types = cpu_to_le32(n->rss_data.hash_types);
    /*
     * virtio_net_handle_rss() currently does not keep the hash key length
     * parsed from the guest's CVQ command in n->rss_data, and other code
     * uses the maximum key length, so the maximum key length is used here
     * as well.
     */
    cfg.hash_key_length = sizeof(n->rss_data.key);

    if (!do_rss) {
        /*
         * The VirtIO standard states: "Field reserved MUST contain zeroes.
         * It is defined to make the structure to match the layout of
         * virtio_net_rss_config structure, defined in 5.1.6.5.7.".
         *
         * The fields of struct virtio_net_rss_config that correspond to the
         * `reserved` field of struct virtio_net_hash_config must therefore
         * be zero.
         *
         * All of them are already zeroed by the initializer of `cfg`, except
         * `indirection_table`, whose data lives in `table` to stay
         * compatible with the RSS case.  Hence `table` is zeroed here.
         */
        table[0] = 0;
    } else {
        /*
         * The VirtIO standard states: "Number of entries in
         * indirection_table is (indirection_table_mask + 1)".
         */
        cfg.indirection_table_mask = cpu_to_le16(n->rss_data.indirections_len -
                                                 1);
        cfg.unclassified_queue = cpu_to_le16(n->rss_data.default_queue);
        cfg.max_tx_vq = cpu_to_le16(n->curr_queue_pairs);
        for (size_t idx = 0; idx < n->rss_data.indirections_len; ++idx) {
            table[idx] = cpu_to_le16(n->rss_data.indirections_table[idx]);
        }
    }

    /*
     * Command layout: fixed head of cfg, the indirection table, the part of
     * cfg from max_tx_vq up to hash_key_data, and finally the hash key.
     */
    const struct iovec cfg_sg[] = {
        {
            .iov_base = &cfg,
            .iov_len = offsetof(struct virtio_net_rss_config,
                                indirection_table),
        }, {
            .iov_base = table,
            .iov_len = n->rss_data.indirections_len *
                       sizeof(n->rss_data.indirections_table[0]),
        }, {
            .iov_base = &cfg.max_tx_vq,
            .iov_len = offsetof(struct virtio_net_rss_config, hash_key_data) -
                       offsetof(struct virtio_net_rss_config, max_tx_vq),
        }, {
            .iov_base = (void *)n->rss_data.key,
            .iov_len = sizeof(n->rss_data.key),
        }
    };

    ret = vhost_vdpa_net_load_cmd(s, out_cursor, in_cursor,
                                  VIRTIO_NET_CTRL_MQ,
                                  do_rss ? VIRTIO_NET_CTRL_MQ_RSS_CONFIG :
                                  VIRTIO_NET_CTRL_MQ_HASH_CONFIG,
                                  cfg_sg, ARRAY_SIZE(cfg_sg));

    return unlikely(ret < 0) ? ret : 0;
}
9268b98c15fSHawkins Jiawei 
/*
 * Queue the control commands that restore the multiqueue state of @n.
 *
 * Nothing is sent unless VIRTIO_NET_F_MQ is negotiated.  Otherwise a
 * VQ_PAIRS_SET command carrying n->curr_queue_pairs is queued, followed by
 * the RSS configuration (VIRTIO_NET_F_RSS negotiated) or else the hash
 * configuration (VIRTIO_NET_F_HASH_REPORT negotiated).
 *
 * @s: The VhostVDPAState
 * @n: The VirtIONet device whose state is loaded
 * @out_cursor: Cursor over the command out buffer
 * @in_cursor: Cursor over the status buffer
 *
 * Returns 0 on success or a negative value from the failing helper.
 */
static int vhost_vdpa_net_load_mq(VhostVDPAState *s,
                                  const VirtIONet *n,
                                  struct iovec *out_cursor,
                                  struct iovec *in_cursor)
{
    ssize_t ret;

    if (!virtio_vdev_has_feature(&n->parent_obj, VIRTIO_NET_F_MQ)) {
        return 0;
    }

    struct virtio_net_ctrl_mq mq = {
        .virtqueue_pairs = cpu_to_le16(n->curr_queue_pairs),
    };
    const struct iovec mq_sg = {
        .iov_base = &mq,
        .iov_len = sizeof(mq),
    };

    ret = vhost_vdpa_net_load_cmd(s, out_cursor, in_cursor,
                                  VIRTIO_NET_CTRL_MQ,
                                  VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET,
                                  &mq_sg, 1);
    if (unlikely(ret < 0)) {
        return ret;
    }

    /*
     * RSS takes precedence: load the receive-side scaling state when
     * VIRTIO_NET_F_RSS is negotiated, otherwise load the hash calculation
     * state when only VIRTIO_NET_F_HASH_REPORT is.
     */
    const bool has_rss = virtio_vdev_has_feature(&n->parent_obj,
                                                 VIRTIO_NET_F_RSS);

    if (has_rss || virtio_vdev_has_feature(&n->parent_obj,
                                           VIRTIO_NET_F_HASH_REPORT)) {
        ret = vhost_vdpa_net_load_rss(s, n, out_cursor, in_cursor, has_rss);
        if (unlikely(ret < 0)) {
            return ret;
        }
    }

    return 0;
}
969f64c7cdaSEugenio Pérez 
/*
 * Queue the control command that restores the guest offloads of @n.
 *
 * @s: The VhostVDPAState
 * @n: The VirtIONet device whose state is loaded
 * @out_cursor: Cursor over the command out buffer
 * @in_cursor: Cursor over the status buffer
 *
 * Returns 0 on success (including when nothing needs to be sent) or a
 * negative value from vhost_vdpa_net_load_cmd().
 */
static int vhost_vdpa_net_load_offloads(VhostVDPAState *s,
                                        const VirtIONet *n,
                                        struct iovec *out_cursor,
                                        struct iovec *in_cursor)
{
    /*
     * Nothing to do without VIRTIO_NET_F_CTRL_GUEST_OFFLOADS.
     *
     * Nothing to do either when the driver has all supported offloads
     * enabled.  The VirtIO standard states: "Upon feature negotiation
     * corresponding offload gets enabled to preserve
     * backward compatibility.", so that configuration already matches
     * the device's defaults.
     *
     * Note that the device's defaults can mismatch the driver's
     * configuration only at live migration.
     */
    if (!virtio_vdev_has_feature(&n->parent_obj,
                                 VIRTIO_NET_F_CTRL_GUEST_OFFLOADS) ||
        n->curr_guest_offloads == virtio_net_supported_guest_offloads(n)) {
        return 0;
    }

    uint64_t le_offloads = cpu_to_le64(n->curr_guest_offloads);
    const struct iovec offloads_sg = {
        .iov_base = &le_offloads,
        .iov_len = sizeof(le_offloads),
    };
    const ssize_t ret = vhost_vdpa_net_load_cmd(s, out_cursor, in_cursor,
                                            VIRTIO_NET_CTRL_GUEST_OFFLOADS,
                                            VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET,
                                            &offloads_sg, 1);

    return unlikely(ret < 0) ? ret : 0;
}
10140b58d368SHawkins Jiawei 
/*
 * Queue a single VIRTIO_NET_CTRL_RX control command.
 *
 * @s: The VhostVDPAState
 * @out_cursor: Cursor over the command out buffer
 * @in_cursor: Cursor over the status buffer
 * @cmd: The command within the VIRTIO_NET_CTRL_RX class
 * @on: The one-byte command payload
 *
 * Returns 0 on success or a negative value from vhost_vdpa_net_load_cmd().
 */
static int vhost_vdpa_net_load_rx_mode(VhostVDPAState *s,
                                       struct iovec *out_cursor,
                                       struct iovec *in_cursor,
                                       uint8_t cmd,
                                       uint8_t on)
{
    const struct iovec mode_sg = {
        .iov_base = &on,
        .iov_len = sizeof(on),
    };
    const ssize_t ret = vhost_vdpa_net_load_cmd(s, out_cursor, in_cursor,
                                                VIRTIO_NET_CTRL_RX, cmd,
                                                &mode_sg, 1);

    return unlikely(ret < 0) ? ret : 0;
}
1035b12f907eSHawkins Jiawei 
vhost_vdpa_net_load_rx(VhostVDPAState * s,const VirtIONet * n,struct iovec * out_cursor,struct iovec * in_cursor)1036b12f907eSHawkins Jiawei static int vhost_vdpa_net_load_rx(VhostVDPAState *s,
10371d7e2a8fSHawkins Jiawei                                   const VirtIONet *n,
10381d7e2a8fSHawkins Jiawei                                   struct iovec *out_cursor,
10391d7e2a8fSHawkins Jiawei                                   struct iovec *in_cursor)
1040b12f907eSHawkins Jiawei {
104124e59cfeSHawkins Jiawei     ssize_t r;
1042b12f907eSHawkins Jiawei 
1043b12f907eSHawkins Jiawei     if (!virtio_vdev_has_feature(&n->parent_obj, VIRTIO_NET_F_CTRL_RX)) {
1044b12f907eSHawkins Jiawei         return 0;
1045b12f907eSHawkins Jiawei     }
1046b12f907eSHawkins Jiawei 
1047b12f907eSHawkins Jiawei     /*
1048b12f907eSHawkins Jiawei      * According to virtio_net_reset(), device turns promiscuous mode
1049b12f907eSHawkins Jiawei      * on by default.
1050b12f907eSHawkins Jiawei      *
10510a19d879SMichael Tokarev      * Additionally, according to VirtIO standard, "Since there are
1052b12f907eSHawkins Jiawei      * no guarantees, it can use a hash filter or silently switch to
1053b12f907eSHawkins Jiawei      * allmulti or promiscuous mode if it is given too many addresses.".
1054b12f907eSHawkins Jiawei      * QEMU marks `n->mac_table.uni_overflow` if guest sets too many
1055b12f907eSHawkins Jiawei      * non-multicast MAC addresses, indicating that promiscuous mode
1056b12f907eSHawkins Jiawei      * should be enabled.
1057b12f907eSHawkins Jiawei      *
1058b12f907eSHawkins Jiawei      * Therefore, QEMU should only send this CVQ command if the
1059b12f907eSHawkins Jiawei      * `n->mac_table.uni_overflow` is not marked and `n->promisc` is off,
1060b12f907eSHawkins Jiawei      * which sets promiscuous mode on, different from the device's defaults.
1061b12f907eSHawkins Jiawei      *
1062b12f907eSHawkins Jiawei      * Note that the device's defaults can mismatch the driver's
1063b12f907eSHawkins Jiawei      * configuration only at live migration.
1064b12f907eSHawkins Jiawei      */
1065b12f907eSHawkins Jiawei     if (!n->mac_table.uni_overflow && !n->promisc) {
10661d7e2a8fSHawkins Jiawei         r = vhost_vdpa_net_load_rx_mode(s, out_cursor, in_cursor,
1067b12f907eSHawkins Jiawei                                         VIRTIO_NET_CTRL_RX_PROMISC, 0);
106824e59cfeSHawkins Jiawei         if (unlikely(r < 0)) {
106924e59cfeSHawkins Jiawei             return r;
1070b12f907eSHawkins Jiawei         }
1071b12f907eSHawkins Jiawei     }
1072b12f907eSHawkins Jiawei 
1073b12f907eSHawkins Jiawei     /*
1074b12f907eSHawkins Jiawei      * According to virtio_net_reset(), device turns all-multicast mode
1075b12f907eSHawkins Jiawei      * off by default.
1076b12f907eSHawkins Jiawei      *
1077b12f907eSHawkins Jiawei      * According to VirtIO standard, "Since there are no guarantees,
1078b12f907eSHawkins Jiawei      * it can use a hash filter or silently switch to allmulti or
1079b12f907eSHawkins Jiawei      * promiscuous mode if it is given too many addresses.". QEMU marks
1080b12f907eSHawkins Jiawei      * `n->mac_table.multi_overflow` if guest sets too many
     * multicast MAC addresses.
1082b12f907eSHawkins Jiawei      *
1083b12f907eSHawkins Jiawei      * Therefore, QEMU should only send this CVQ command if the
1084b12f907eSHawkins Jiawei      * `n->mac_table.multi_overflow` is marked or `n->allmulti` is on,
1085b12f907eSHawkins Jiawei      * which sets all-multicast mode on, different from the device's defaults.
1086b12f907eSHawkins Jiawei      *
1087b12f907eSHawkins Jiawei      * Note that the device's defaults can mismatch the driver's
1088b12f907eSHawkins Jiawei      * configuration only at live migration.
1089b12f907eSHawkins Jiawei      */
1090b12f907eSHawkins Jiawei     if (n->mac_table.multi_overflow || n->allmulti) {
10911d7e2a8fSHawkins Jiawei         r = vhost_vdpa_net_load_rx_mode(s, out_cursor, in_cursor,
1092b12f907eSHawkins Jiawei                                         VIRTIO_NET_CTRL_RX_ALLMULTI, 1);
109324e59cfeSHawkins Jiawei         if (unlikely(r < 0)) {
109424e59cfeSHawkins Jiawei             return r;
1095b12f907eSHawkins Jiawei         }
1096b12f907eSHawkins Jiawei     }
1097b12f907eSHawkins Jiawei 
10984fd180c7SHawkins Jiawei     if (!virtio_vdev_has_feature(&n->parent_obj, VIRTIO_NET_F_CTRL_RX_EXTRA)) {
10994fd180c7SHawkins Jiawei         return 0;
11004fd180c7SHawkins Jiawei     }
11014fd180c7SHawkins Jiawei 
11024fd180c7SHawkins Jiawei     /*
11034fd180c7SHawkins Jiawei      * According to virtio_net_reset(), device turns all-unicast mode
11044fd180c7SHawkins Jiawei      * off by default.
11054fd180c7SHawkins Jiawei      *
11064fd180c7SHawkins Jiawei      * Therefore, QEMU should only send this CVQ command if the driver
11074fd180c7SHawkins Jiawei      * sets all-unicast mode on, different from the device's defaults.
11084fd180c7SHawkins Jiawei      *
11094fd180c7SHawkins Jiawei      * Note that the device's defaults can mismatch the driver's
11104fd180c7SHawkins Jiawei      * configuration only at live migration.
11114fd180c7SHawkins Jiawei      */
11124fd180c7SHawkins Jiawei     if (n->alluni) {
11131d7e2a8fSHawkins Jiawei         r = vhost_vdpa_net_load_rx_mode(s, out_cursor, in_cursor,
11144fd180c7SHawkins Jiawei                                         VIRTIO_NET_CTRL_RX_ALLUNI, 1);
111524e59cfeSHawkins Jiawei         if (r < 0) {
111624e59cfeSHawkins Jiawei             return r;
11174fd180c7SHawkins Jiawei         }
11184fd180c7SHawkins Jiawei     }
11194fd180c7SHawkins Jiawei 
11204fd180c7SHawkins Jiawei     /*
11214fd180c7SHawkins Jiawei      * According to virtio_net_reset(), device turns non-multicast mode
11224fd180c7SHawkins Jiawei      * off by default.
11234fd180c7SHawkins Jiawei      *
11244fd180c7SHawkins Jiawei      * Therefore, QEMU should only send this CVQ command if the driver
11254fd180c7SHawkins Jiawei      * sets non-multicast mode on, different from the device's defaults.
11264fd180c7SHawkins Jiawei      *
11274fd180c7SHawkins Jiawei      * Note that the device's defaults can mismatch the driver's
11284fd180c7SHawkins Jiawei      * configuration only at live migration.
11294fd180c7SHawkins Jiawei      */
11304fd180c7SHawkins Jiawei     if (n->nomulti) {
11311d7e2a8fSHawkins Jiawei         r = vhost_vdpa_net_load_rx_mode(s, out_cursor, in_cursor,
11324fd180c7SHawkins Jiawei                                         VIRTIO_NET_CTRL_RX_NOMULTI, 1);
113324e59cfeSHawkins Jiawei         if (r < 0) {
113424e59cfeSHawkins Jiawei             return r;
11354fd180c7SHawkins Jiawei         }
11364fd180c7SHawkins Jiawei     }
11374fd180c7SHawkins Jiawei 
11384fd180c7SHawkins Jiawei     /*
11394fd180c7SHawkins Jiawei      * According to virtio_net_reset(), device turns non-unicast mode
11404fd180c7SHawkins Jiawei      * off by default.
11414fd180c7SHawkins Jiawei      *
11424fd180c7SHawkins Jiawei      * Therefore, QEMU should only send this CVQ command if the driver
11434fd180c7SHawkins Jiawei      * sets non-unicast mode on, different from the device's defaults.
11444fd180c7SHawkins Jiawei      *
11454fd180c7SHawkins Jiawei      * Note that the device's defaults can mismatch the driver's
11464fd180c7SHawkins Jiawei      * configuration only at live migration.
11474fd180c7SHawkins Jiawei      */
11484fd180c7SHawkins Jiawei     if (n->nouni) {
11491d7e2a8fSHawkins Jiawei         r = vhost_vdpa_net_load_rx_mode(s, out_cursor, in_cursor,
11504fd180c7SHawkins Jiawei                                         VIRTIO_NET_CTRL_RX_NOUNI, 1);
115124e59cfeSHawkins Jiawei         if (r < 0) {
115224e59cfeSHawkins Jiawei             return r;
11534fd180c7SHawkins Jiawei         }
11544fd180c7SHawkins Jiawei     }
11554fd180c7SHawkins Jiawei 
11564fd180c7SHawkins Jiawei     /*
11574fd180c7SHawkins Jiawei      * According to virtio_net_reset(), device turns non-broadcast mode
11584fd180c7SHawkins Jiawei      * off by default.
11594fd180c7SHawkins Jiawei      *
11604fd180c7SHawkins Jiawei      * Therefore, QEMU should only send this CVQ command if the driver
11614fd180c7SHawkins Jiawei      * sets non-broadcast mode on, different from the device's defaults.
11624fd180c7SHawkins Jiawei      *
11634fd180c7SHawkins Jiawei      * Note that the device's defaults can mismatch the driver's
11644fd180c7SHawkins Jiawei      * configuration only at live migration.
11654fd180c7SHawkins Jiawei      */
11664fd180c7SHawkins Jiawei     if (n->nobcast) {
11671d7e2a8fSHawkins Jiawei         r = vhost_vdpa_net_load_rx_mode(s, out_cursor, in_cursor,
11684fd180c7SHawkins Jiawei                                         VIRTIO_NET_CTRL_RX_NOBCAST, 1);
116924e59cfeSHawkins Jiawei         if (r < 0) {
117024e59cfeSHawkins Jiawei             return r;
11714fd180c7SHawkins Jiawei         }
11724fd180c7SHawkins Jiawei     }
11734fd180c7SHawkins Jiawei 
1174b12f907eSHawkins Jiawei     return 0;
1175b12f907eSHawkins Jiawei }
1176b12f907eSHawkins Jiawei 
/**
 * Send one VIRTIO_NET_CTRL_VLAN_ADD command through
 * vhost_vdpa_net_load_cmd().
 *
 * @s           vdpa net client state
 * @n           virtio-net device model (not used by this helper)
 * @out_cursor  out cursor forwarded to vhost_vdpa_net_load_cmd()
 * @in_cursor   in cursor forwarded to vhost_vdpa_net_load_cmd()
 * @vid         VLAN id to add, in host byte order
 *
 * Returns 0 on success, or the negative value reported by
 * vhost_vdpa_net_load_cmd().
 */
static int vhost_vdpa_net_load_single_vlan(VhostVDPAState *s,
                                           const VirtIONet *n,
                                           struct iovec *out_cursor,
                                           struct iovec *in_cursor,
                                           uint16_t vid)
{
    /*
     * Multi-byte fields of CVQ commands are little-endian (the MAC table
     * handling in this file converts with le32_to_cpu()/cpu_to_le32() for
     * the same reason), so do not send the id in host byte order.
     */
    uint16_t le_vid = cpu_to_le16(vid);
    const struct iovec data = {
        .iov_base = &le_vid,
        .iov_len = sizeof(le_vid),
    };
    ssize_t r = vhost_vdpa_net_load_cmd(s, out_cursor, in_cursor,
                                        VIRTIO_NET_CTRL_VLAN,
                                        VIRTIO_NET_CTRL_VLAN_ADD,
                                        &data, 1);
    if (unlikely(r < 0)) {
        return r;
    }

    return 0;
}
11978f7e9967SHawkins Jiawei 
/**
 * Replay the VLAN filter table of the device model.
 *
 * Walks the n->vlans bitmap in ascending VLAN id order and issues one
 * vhost_vdpa_net_load_single_vlan() call per set bit. Nothing is sent when
 * VIRTIO_NET_F_CTRL_VLAN is not among the device features.
 *
 * Returns 0 on success, or the first non-zero value returned by
 * vhost_vdpa_net_load_single_vlan().
 */
static int vhost_vdpa_net_load_vlan(VhostVDPAState *s,
                                    const VirtIONet *n,
                                    struct iovec *out_cursor,
                                    struct iovec *in_cursor)
{
    if (!virtio_vdev_has_feature(&n->parent_obj, VIRTIO_NET_F_CTRL_VLAN)) {
        return 0;
    }

    /* Each n->vlans[] word holds 32 VLAN ids, hence the ">> 5" / "<< 5". */
    for (int word = 0; word < MAX_VLAN >> 5; word++) {
        if (!n->vlans[word]) {
            /* No id set in this word, nothing to send. */
            continue;
        }

        for (int bit = 0; bit < 32; bit++) {
            int ret;

            if (!(n->vlans[word] & (1U << bit))) {
                continue;
            }

            ret = vhost_vdpa_net_load_single_vlan(s, n, out_cursor,
                                                  in_cursor,
                                                  (word << 5) + bit);
            if (unlikely(ret != 0)) {
                return ret;
            }
        }
    }

    return 0;
}
12238f7e9967SHawkins Jiawei 
/**
 * NetClientInfo .load callback of the vdpa CVQ net client.
 *
 * Marks the vring at v->dev->vq_index ready first. When shadow virtqueues
 * are enabled it then replays the device model state (MAC, MQ, offloads,
 * RX mode, VLAN) through the load helpers and flushes the pending used
 * buffers. Finally every vring with an index below v->dev->vq_index is
 * marked ready.
 *
 * Returns 0 on success, or the error value of the first helper that fails.
 */
static int vhost_vdpa_net_cvq_load(NetClientState *nc)
{
    VhostVDPAState *state = DO_UPCAST(VhostVDPAState, nc, nc);
    struct vhost_vdpa *vdpa = &state->vhost_vdpa;
    int ret;

    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);

    vhost_vdpa_set_vring_ready(vdpa, vdpa->dev->vq_index);

    if (vdpa->shadow_vqs_enabled) {
        const VirtIONet *net = VIRTIO_NET(vdpa->dev->vdev);
        struct iovec out_cursor, in_cursor;

        vhost_vdpa_net_load_cursor_reset(state, &out_cursor, &in_cursor);

        ret = vhost_vdpa_net_load_mac(state, net, &out_cursor, &in_cursor);
        if (unlikely(ret < 0)) {
            return ret;
        }

        ret = vhost_vdpa_net_load_mq(state, net, &out_cursor, &in_cursor);
        if (unlikely(ret != 0)) {
            return ret;
        }

        ret = vhost_vdpa_net_load_offloads(state, net, &out_cursor,
                                           &in_cursor);
        if (unlikely(ret != 0)) {
            return ret;
        }

        ret = vhost_vdpa_net_load_rx(state, net, &out_cursor, &in_cursor);
        if (unlikely(ret != 0)) {
            return ret;
        }

        ret = vhost_vdpa_net_load_vlan(state, net, &out_cursor, &in_cursor);
        if (unlikely(ret != 0)) {
            return ret;
        }

        /*
         * We need to poll and check all pending device's used buffers.
         * The length handed to the flush is how far in_cursor advanced
         * from the start of the status buffer.
         *
         * We can poll here since we've had BQL from the time
         * we sent the descriptor.
         */
        ret = vhost_vdpa_net_svq_flush(state,
                                       in_cursor.iov_base -
                                       (void *)state->status);
        if (unlikely(ret != 0)) {
            return ret;
        }
    }

    for (int idx = 0; idx < vdpa->dev->vq_index; ++idx) {
        vhost_vdpa_set_vring_ready(vdpa, idx);
    }

    return 0;
}
1278dd036d8dSEugenio Pérez 
1279f8972b56SEugenio Pérez static NetClientInfo net_vhost_vdpa_cvq_info = {
1280f8972b56SEugenio Pérez     .type = NET_CLIENT_DRIVER_VHOST_VDPA,
1281f8972b56SEugenio Pérez     .size = sizeof(VhostVDPAState),
1282f8972b56SEugenio Pérez     .receive = vhost_vdpa_receive,
12837a7f87e9SEugenio Pérez     .start = vhost_vdpa_net_cvq_start,
1284f3fada59SEugenio Pérez     .load = vhost_vdpa_net_cvq_load,
12857a7f87e9SEugenio Pérez     .stop = vhost_vdpa_net_cvq_stop,
1286f8972b56SEugenio Pérez     .cleanup = vhost_vdpa_cleanup,
1287f8972b56SEugenio Pérez     .has_vnet_hdr = vhost_vdpa_has_vnet_hdr,
1288f8972b56SEugenio Pérez     .has_ufo = vhost_vdpa_has_ufo,
1289f8972b56SEugenio Pérez     .check_peer_type = vhost_vdpa_check_peer_type,
1290d1fd2d31SHawkins Jiawei     .set_steering_ebpf = vhost_vdpa_set_steering_ebpf,
1291f8972b56SEugenio Pérez };
1292f8972b56SEugenio Pérez 
1293fee364e4SHawkins Jiawei /*
1294fee364e4SHawkins Jiawei  * Forward the excessive VIRTIO_NET_CTRL_MAC_TABLE_SET CVQ command to
1295fee364e4SHawkins Jiawei  * vdpa device.
1296fee364e4SHawkins Jiawei  *
1297fee364e4SHawkins Jiawei  * Considering that QEMU cannot send the entire filter table to the
1298fee364e4SHawkins Jiawei  * vdpa device, it should send the VIRTIO_NET_CTRL_RX_PROMISC CVQ
1299fee364e4SHawkins Jiawei  * command to enable promiscuous mode to receive all packets,
1300fee364e4SHawkins Jiawei  * according to VirtIO standard, "Since there are no guarantees,
1301fee364e4SHawkins Jiawei  * it can use a hash filter or silently switch to allmulti or
1302fee364e4SHawkins Jiawei  * promiscuous mode if it is given too many addresses.".
1303fee364e4SHawkins Jiawei  *
1304fee364e4SHawkins Jiawei  * Since QEMU ignores MAC addresses beyond `MAC_TABLE_ENTRIES` and
1305fee364e4SHawkins Jiawei  * marks `n->mac_table.x_overflow` accordingly, it should have
1306fee364e4SHawkins Jiawei  * the same effect on the device model to receive
1307fee364e4SHawkins Jiawei  * (`MAC_TABLE_ENTRIES` + 1) or more non-multicast MAC addresses.
1308fee364e4SHawkins Jiawei  * The same applies to multicast MAC addresses.
1309fee364e4SHawkins Jiawei  *
1310fee364e4SHawkins Jiawei  * Therefore, QEMU can provide the device model with a fake
1311fee364e4SHawkins Jiawei  * VIRTIO_NET_CTRL_MAC_TABLE_SET command with (`MAC_TABLE_ENTRIES` + 1)
1312fee364e4SHawkins Jiawei  * non-multicast MAC addresses and (`MAC_TABLE_ENTRIES` + 1) multicast
1313fee364e4SHawkins Jiawei  * MAC addresses. This ensures that the device model marks
1314fee364e4SHawkins Jiawei  * `n->mac_table.uni_overflow` and `n->mac_table.multi_overflow`,
1315fee364e4SHawkins Jiawei  * allowing all packets to be received, which aligns with the
1316fee364e4SHawkins Jiawei  * state of the vdpa device.
1317fee364e4SHawkins Jiawei  */
vhost_vdpa_net_excessive_mac_filter_cvq_add(VhostVDPAState * s,VirtQueueElement * elem,struct iovec * out,const struct iovec * in)1318fee364e4SHawkins Jiawei static int vhost_vdpa_net_excessive_mac_filter_cvq_add(VhostVDPAState *s,
1319fee364e4SHawkins Jiawei                                                        VirtQueueElement *elem,
1320327dedb8SHawkins Jiawei                                                        struct iovec *out,
1321327dedb8SHawkins Jiawei                                                        const struct iovec *in)
1322fee364e4SHawkins Jiawei {
1323fee364e4SHawkins Jiawei     struct virtio_net_ctrl_mac mac_data, *mac_ptr;
1324fee364e4SHawkins Jiawei     struct virtio_net_ctrl_hdr *hdr_ptr;
1325fee364e4SHawkins Jiawei     uint32_t cursor;
1326fee364e4SHawkins Jiawei     ssize_t r;
1327327dedb8SHawkins Jiawei     uint8_t on = 1;
1328fee364e4SHawkins Jiawei 
1329fee364e4SHawkins Jiawei     /* parse the non-multicast MAC address entries from CVQ command */
1330fee364e4SHawkins Jiawei     cursor = sizeof(*hdr_ptr);
1331fee364e4SHawkins Jiawei     r = iov_to_buf(elem->out_sg, elem->out_num, cursor,
1332fee364e4SHawkins Jiawei                    &mac_data, sizeof(mac_data));
1333fee364e4SHawkins Jiawei     if (unlikely(r != sizeof(mac_data))) {
1334fee364e4SHawkins Jiawei         /*
1335fee364e4SHawkins Jiawei          * If the CVQ command is invalid, we should simulate the vdpa device
1336fee364e4SHawkins Jiawei          * to reject the VIRTIO_NET_CTRL_MAC_TABLE_SET CVQ command
1337fee364e4SHawkins Jiawei          */
1338fee364e4SHawkins Jiawei         *s->status = VIRTIO_NET_ERR;
1339fee364e4SHawkins Jiawei         return sizeof(*s->status);
1340fee364e4SHawkins Jiawei     }
1341fee364e4SHawkins Jiawei     cursor += sizeof(mac_data) + le32_to_cpu(mac_data.entries) * ETH_ALEN;
1342fee364e4SHawkins Jiawei 
1343fee364e4SHawkins Jiawei     /* parse the multicast MAC address entries from CVQ command */
1344fee364e4SHawkins Jiawei     r = iov_to_buf(elem->out_sg, elem->out_num, cursor,
1345fee364e4SHawkins Jiawei                    &mac_data, sizeof(mac_data));
1346fee364e4SHawkins Jiawei     if (r != sizeof(mac_data)) {
1347fee364e4SHawkins Jiawei         /*
1348fee364e4SHawkins Jiawei          * If the CVQ command is invalid, we should simulate the vdpa device
1349fee364e4SHawkins Jiawei          * to reject the VIRTIO_NET_CTRL_MAC_TABLE_SET CVQ command
1350fee364e4SHawkins Jiawei          */
1351fee364e4SHawkins Jiawei         *s->status = VIRTIO_NET_ERR;
1352fee364e4SHawkins Jiawei         return sizeof(*s->status);
1353fee364e4SHawkins Jiawei     }
1354fee364e4SHawkins Jiawei     cursor += sizeof(mac_data) + le32_to_cpu(mac_data.entries) * ETH_ALEN;
1355fee364e4SHawkins Jiawei 
1356fee364e4SHawkins Jiawei     /* validate the CVQ command */
1357fee364e4SHawkins Jiawei     if (iov_size(elem->out_sg, elem->out_num) != cursor) {
1358fee364e4SHawkins Jiawei         /*
1359fee364e4SHawkins Jiawei          * If the CVQ command is invalid, we should simulate the vdpa device
1360fee364e4SHawkins Jiawei          * to reject the VIRTIO_NET_CTRL_MAC_TABLE_SET CVQ command
1361fee364e4SHawkins Jiawei          */
1362fee364e4SHawkins Jiawei         *s->status = VIRTIO_NET_ERR;
1363fee364e4SHawkins Jiawei         return sizeof(*s->status);
1364fee364e4SHawkins Jiawei     }
1365fee364e4SHawkins Jiawei 
1366fee364e4SHawkins Jiawei     /*
1367fee364e4SHawkins Jiawei      * According to VirtIO standard, "Since there are no guarantees,
1368fee364e4SHawkins Jiawei      * it can use a hash filter or silently switch to allmulti or
1369fee364e4SHawkins Jiawei      * promiscuous mode if it is given too many addresses.".
1370fee364e4SHawkins Jiawei      *
1371fee364e4SHawkins Jiawei      * Therefore, considering that QEMU is unable to send the entire
1372fee364e4SHawkins Jiawei      * filter table to the vdpa device, it should send the
1373fee364e4SHawkins Jiawei      * VIRTIO_NET_CTRL_RX_PROMISC CVQ command to enable promiscuous mode
1374fee364e4SHawkins Jiawei      */
1375327dedb8SHawkins Jiawei     hdr_ptr = out->iov_base;
1376327dedb8SHawkins Jiawei     out->iov_len = sizeof(*hdr_ptr) + sizeof(on);
1377327dedb8SHawkins Jiawei 
1378327dedb8SHawkins Jiawei     hdr_ptr->class = VIRTIO_NET_CTRL_RX;
1379327dedb8SHawkins Jiawei     hdr_ptr->cmd = VIRTIO_NET_CTRL_RX_PROMISC;
1380327dedb8SHawkins Jiawei     iov_from_buf(out, 1, sizeof(*hdr_ptr), &on, sizeof(on));
1381327dedb8SHawkins Jiawei     r = vhost_vdpa_net_cvq_add(s, out, 1, in, 1);
1382fee364e4SHawkins Jiawei     if (unlikely(r < 0)) {
1383fee364e4SHawkins Jiawei         return r;
1384fee364e4SHawkins Jiawei     }
1385a864a321SHawkins Jiawei 
1386a864a321SHawkins Jiawei     /*
1387a864a321SHawkins Jiawei      * We can poll here since we've had BQL from the time
1388a864a321SHawkins Jiawei      * we sent the descriptor.
1389a864a321SHawkins Jiawei      */
1390a864a321SHawkins Jiawei     r = vhost_vdpa_net_svq_poll(s, 1);
1391a864a321SHawkins Jiawei     if (unlikely(r < sizeof(*s->status))) {
1392a864a321SHawkins Jiawei         return r;
1393a864a321SHawkins Jiawei     }
1394fee364e4SHawkins Jiawei     if (*s->status != VIRTIO_NET_OK) {
1395fee364e4SHawkins Jiawei         return sizeof(*s->status);
1396fee364e4SHawkins Jiawei     }
1397fee364e4SHawkins Jiawei 
1398fee364e4SHawkins Jiawei     /*
1399fee364e4SHawkins Jiawei      * QEMU should also send a fake VIRTIO_NET_CTRL_MAC_TABLE_SET CVQ
1400fee364e4SHawkins Jiawei      * command to the device model, including (`MAC_TABLE_ENTRIES` + 1)
1401fee364e4SHawkins Jiawei      * non-multicast MAC addresses and (`MAC_TABLE_ENTRIES` + 1)
1402fee364e4SHawkins Jiawei      * multicast MAC addresses.
1403fee364e4SHawkins Jiawei      *
1404fee364e4SHawkins Jiawei      * By doing so, the device model can mark `n->mac_table.uni_overflow`
1405fee364e4SHawkins Jiawei      * and `n->mac_table.multi_overflow`, enabling all packets to be
1406fee364e4SHawkins Jiawei      * received, which aligns with the state of the vdpa device.
1407fee364e4SHawkins Jiawei      */
1408fee364e4SHawkins Jiawei     cursor = 0;
1409fee364e4SHawkins Jiawei     uint32_t fake_uni_entries = MAC_TABLE_ENTRIES + 1,
1410fee364e4SHawkins Jiawei              fake_mul_entries = MAC_TABLE_ENTRIES + 1,
1411fee364e4SHawkins Jiawei              fake_cvq_size = sizeof(struct virtio_net_ctrl_hdr) +
1412fee364e4SHawkins Jiawei                              sizeof(mac_data) + fake_uni_entries * ETH_ALEN +
1413fee364e4SHawkins Jiawei                              sizeof(mac_data) + fake_mul_entries * ETH_ALEN;
1414fee364e4SHawkins Jiawei 
1415fee364e4SHawkins Jiawei     assert(fake_cvq_size < vhost_vdpa_net_cvq_cmd_page_len());
1416fee364e4SHawkins Jiawei     out->iov_len = fake_cvq_size;
1417fee364e4SHawkins Jiawei 
1418fee364e4SHawkins Jiawei     /* pack the header for fake CVQ command */
1419fee364e4SHawkins Jiawei     hdr_ptr = out->iov_base + cursor;
1420fee364e4SHawkins Jiawei     hdr_ptr->class = VIRTIO_NET_CTRL_MAC;
1421fee364e4SHawkins Jiawei     hdr_ptr->cmd = VIRTIO_NET_CTRL_MAC_TABLE_SET;
1422fee364e4SHawkins Jiawei     cursor += sizeof(*hdr_ptr);
1423fee364e4SHawkins Jiawei 
1424fee364e4SHawkins Jiawei     /*
1425fee364e4SHawkins Jiawei      * Pack the non-multicast MAC addresses part for fake CVQ command.
1426fee364e4SHawkins Jiawei      *
1427fee364e4SHawkins Jiawei      * According to virtio_net_handle_mac(), QEMU doesn't verify the MAC
14280a19d879SMichael Tokarev      * addresses provided in CVQ command. Therefore, only the entries
1429fee364e4SHawkins Jiawei      * field need to be prepared in the CVQ command.
1430fee364e4SHawkins Jiawei      */
1431fee364e4SHawkins Jiawei     mac_ptr = out->iov_base + cursor;
1432fee364e4SHawkins Jiawei     mac_ptr->entries = cpu_to_le32(fake_uni_entries);
1433fee364e4SHawkins Jiawei     cursor += sizeof(*mac_ptr) + fake_uni_entries * ETH_ALEN;
1434fee364e4SHawkins Jiawei 
1435fee364e4SHawkins Jiawei     /*
1436fee364e4SHawkins Jiawei      * Pack the multicast MAC addresses part for fake CVQ command.
1437fee364e4SHawkins Jiawei      *
1438fee364e4SHawkins Jiawei      * According to virtio_net_handle_mac(), QEMU doesn't verify the MAC
14390a19d879SMichael Tokarev      * addresses provided in CVQ command. Therefore, only the entries
1440fee364e4SHawkins Jiawei      * field need to be prepared in the CVQ command.
1441fee364e4SHawkins Jiawei      */
1442fee364e4SHawkins Jiawei     mac_ptr = out->iov_base + cursor;
1443fee364e4SHawkins Jiawei     mac_ptr->entries = cpu_to_le32(fake_mul_entries);
1444fee364e4SHawkins Jiawei 
1445fee364e4SHawkins Jiawei     /*
1446fee364e4SHawkins Jiawei      * Simulating QEMU poll a vdpa device used buffer
1447fee364e4SHawkins Jiawei      * for VIRTIO_NET_CTRL_MAC_TABLE_SET CVQ command
1448fee364e4SHawkins Jiawei      */
1449fee364e4SHawkins Jiawei     return sizeof(*s->status);
1450fee364e4SHawkins Jiawei }
1451fee364e4SHawkins Jiawei 
14522df4dd31SEugenio Pérez /**
14532df4dd31SEugenio Pérez  * Validate and copy control virtqueue commands.
14542df4dd31SEugenio Pérez  *
14552df4dd31SEugenio Pérez  * Following QEMU guidelines, we offer a copy of the buffers to the device to
14562df4dd31SEugenio Pérez  * prevent TOCTOU bugs.
1457bd907ae4SEugenio Pérez  */
vhost_vdpa_net_handle_ctrl_avail(VhostShadowVirtqueue * svq,VirtQueueElement * elem,void * opaque)1458bd907ae4SEugenio Pérez static int vhost_vdpa_net_handle_ctrl_avail(VhostShadowVirtqueue *svq,
1459bd907ae4SEugenio Pérez                                             VirtQueueElement *elem,
1460bd907ae4SEugenio Pérez                                             void *opaque)
1461bd907ae4SEugenio Pérez {
14622df4dd31SEugenio Pérez     VhostVDPAState *s = opaque;
1463be4278b6SEugenio Pérez     size_t in_len;
146445c41018SHawkins Jiawei     const struct virtio_net_ctrl_hdr *ctrl;
1465bd907ae4SEugenio Pérez     virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
14667a7f87e9SEugenio Pérez     /* Out buffer sent to both the vdpa device and the device model */
14677a7f87e9SEugenio Pérez     struct iovec out = {
14687a7f87e9SEugenio Pérez         .iov_base = s->cvq_cmd_out_buffer,
14697a7f87e9SEugenio Pérez     };
14702df4dd31SEugenio Pérez     /* in buffer used for device model */
14710e6bff0dSHawkins Jiawei     const struct iovec model_in = {
14722df4dd31SEugenio Pérez         .iov_base = &status,
14732df4dd31SEugenio Pérez         .iov_len = sizeof(status),
14742df4dd31SEugenio Pérez     };
14750e6bff0dSHawkins Jiawei     /* in buffer used for vdpa device */
14760e6bff0dSHawkins Jiawei     const struct iovec vdpa_in = {
14770e6bff0dSHawkins Jiawei         .iov_base = s->status,
14780e6bff0dSHawkins Jiawei         .iov_len = sizeof(*s->status),
14790e6bff0dSHawkins Jiawei     };
1480be4278b6SEugenio Pérez     ssize_t dev_written = -EINVAL;
1481bd907ae4SEugenio Pérez 
14827a7f87e9SEugenio Pérez     out.iov_len = iov_to_buf(elem->out_sg, elem->out_num, 0,
14837a7f87e9SEugenio Pérez                              s->cvq_cmd_out_buffer,
1484fee364e4SHawkins Jiawei                              vhost_vdpa_net_cvq_cmd_page_len());
148545c41018SHawkins Jiawei 
148645c41018SHawkins Jiawei     ctrl = s->cvq_cmd_out_buffer;
148745c41018SHawkins Jiawei     if (ctrl->class == VIRTIO_NET_CTRL_ANNOUNCE) {
14883f9a3eebSEugenio Pérez         /*
14893f9a3eebSEugenio Pérez          * Guest announce capability is emulated by qemu, so don't forward to
14903f9a3eebSEugenio Pérez          * the device.
14913f9a3eebSEugenio Pérez          */
14923f9a3eebSEugenio Pérez         dev_written = sizeof(status);
14933f9a3eebSEugenio Pérez         *s->status = VIRTIO_NET_OK;
1494fee364e4SHawkins Jiawei     } else if (unlikely(ctrl->class == VIRTIO_NET_CTRL_MAC &&
1495fee364e4SHawkins Jiawei                         ctrl->cmd == VIRTIO_NET_CTRL_MAC_TABLE_SET &&
1496fee364e4SHawkins Jiawei                         iov_size(elem->out_sg, elem->out_num) > out.iov_len)) {
1497fee364e4SHawkins Jiawei         /*
1498fee364e4SHawkins Jiawei          * Due to the size limitation of the out buffer sent to the vdpa device,
1499fee364e4SHawkins Jiawei          * which is determined by vhost_vdpa_net_cvq_cmd_page_len(), excessive
1500fee364e4SHawkins Jiawei          * MAC addresses set by the driver for the filter table can cause
1501fee364e4SHawkins Jiawei          * truncation of the CVQ command in QEMU. As a result, the vdpa device
1502fee364e4SHawkins Jiawei          * rejects the flawed CVQ command.
1503fee364e4SHawkins Jiawei          *
1504fee364e4SHawkins Jiawei          * Therefore, QEMU must handle this situation instead of sending
15050a19d879SMichael Tokarev          * the CVQ command directly.
1506fee364e4SHawkins Jiawei          */
1507fee364e4SHawkins Jiawei         dev_written = vhost_vdpa_net_excessive_mac_filter_cvq_add(s, elem,
1508327dedb8SHawkins Jiawei                                                             &out, &vdpa_in);
1509fee364e4SHawkins Jiawei         if (unlikely(dev_written < 0)) {
1510fee364e4SHawkins Jiawei             goto out;
1511fee364e4SHawkins Jiawei         }
15123f9a3eebSEugenio Pérez     } else {
1513a864a321SHawkins Jiawei         ssize_t r;
1514a864a321SHawkins Jiawei         r = vhost_vdpa_net_cvq_add(s, &out, 1, &vdpa_in, 1);
1515a864a321SHawkins Jiawei         if (unlikely(r < 0)) {
1516a864a321SHawkins Jiawei             dev_written = r;
1517bd907ae4SEugenio Pérez             goto out;
1518bd907ae4SEugenio Pérez         }
1519a864a321SHawkins Jiawei 
1520a864a321SHawkins Jiawei         /*
1521a864a321SHawkins Jiawei          * We can poll here since we've had BQL from the time
1522a864a321SHawkins Jiawei          * we sent the descriptor.
1523a864a321SHawkins Jiawei          */
1524a864a321SHawkins Jiawei         dev_written = vhost_vdpa_net_svq_poll(s, 1);
15253f9a3eebSEugenio Pérez     }
1526bd907ae4SEugenio Pérez 
1527bd907ae4SEugenio Pérez     if (unlikely(dev_written < sizeof(status))) {
1528bd907ae4SEugenio Pérez         error_report("Insufficient written data (%zu)", dev_written);
15292df4dd31SEugenio Pérez         goto out;
15302df4dd31SEugenio Pérez     }
15312df4dd31SEugenio Pérez 
153217fb889fSEugenio Pérez     if (*s->status != VIRTIO_NET_OK) {
1533d45243bcSEugenio Pérez         goto out;
15342df4dd31SEugenio Pérez     }
15352df4dd31SEugenio Pérez 
15362df4dd31SEugenio Pérez     status = VIRTIO_NET_ERR;
15370e6bff0dSHawkins Jiawei     virtio_net_handle_ctrl_iov(svq->vdev, &model_in, 1, &out, 1);
15382df4dd31SEugenio Pérez     if (status != VIRTIO_NET_OK) {
15392df4dd31SEugenio Pérez         error_report("Bad CVQ processing in model");
1540bd907ae4SEugenio Pérez     }
1541bd907ae4SEugenio Pérez 
1542bd907ae4SEugenio Pérez out:
1543bd907ae4SEugenio Pérez     in_len = iov_from_buf(elem->in_sg, elem->in_num, 0, &status,
1544bd907ae4SEugenio Pérez                           sizeof(status));
1545bd907ae4SEugenio Pérez     if (unlikely(in_len < sizeof(status))) {
1546bd907ae4SEugenio Pérez         error_report("Bad device CVQ written length");
1547bd907ae4SEugenio Pérez     }
1548bd907ae4SEugenio Pérez     vhost_svq_push_elem(svq, elem, MIN(in_len, sizeof(status)));
1549031b1abaSHawkins Jiawei     /*
1550031b1abaSHawkins Jiawei      * `elem` belongs to vhost_vdpa_net_handle_ctrl_avail() only when
1551031b1abaSHawkins Jiawei      * the function successfully forwards the CVQ command, indicated
1552031b1abaSHawkins Jiawei      * by a non-negative value of `dev_written`. Otherwise, it still
1553031b1abaSHawkins Jiawei      * belongs to SVQ.
1554031b1abaSHawkins Jiawei      * This function should only free the `elem` when it owns.
1555031b1abaSHawkins Jiawei      */
1556031b1abaSHawkins Jiawei     if (dev_written >= 0) {
1557bd907ae4SEugenio Pérez         g_free(elem);
1558031b1abaSHawkins Jiawei     }
1559be4278b6SEugenio Pérez     return dev_written < 0 ? dev_written : 0;
1560bd907ae4SEugenio Pérez }
1561bd907ae4SEugenio Pérez 
1562bd907ae4SEugenio Pérez static const VhostShadowVirtqueueOps vhost_vdpa_net_svq_ops = {
1563bd907ae4SEugenio Pérez     .avail_handler = vhost_vdpa_net_handle_ctrl_avail,
1564bd907ae4SEugenio Pérez };
1565bd907ae4SEugenio Pérez 
1566152128d6SEugenio Pérez /**
1567152128d6SEugenio Pérez  * Probe if CVQ is isolated
1568152128d6SEugenio Pérez  *
1569152128d6SEugenio Pérez  * @device_fd         The vdpa device fd
1570152128d6SEugenio Pérez  * @features          Features offered by the device.
1571152128d6SEugenio Pérez  * @cvq_index         The control vq pair index
1572152128d6SEugenio Pérez  *
1573152128d6SEugenio Pérez  * Returns <0 in case of failure, 0 if false and 1 if true.
1574152128d6SEugenio Pérez  */
vhost_vdpa_probe_cvq_isolation(int device_fd,uint64_t features,int cvq_index,Error ** errp)1575152128d6SEugenio Pérez static int vhost_vdpa_probe_cvq_isolation(int device_fd, uint64_t features,
1576152128d6SEugenio Pérez                                           int cvq_index, Error **errp)
1577152128d6SEugenio Pérez {
1578152128d6SEugenio Pérez     uint64_t backend_features;
1579152128d6SEugenio Pérez     int64_t cvq_group;
1580152128d6SEugenio Pérez     uint8_t status = VIRTIO_CONFIG_S_ACKNOWLEDGE |
1581845ec38aSEugenio Pérez                      VIRTIO_CONFIG_S_DRIVER;
1582152128d6SEugenio Pérez     int r;
1583152128d6SEugenio Pérez 
1584152128d6SEugenio Pérez     ERRP_GUARD();
1585152128d6SEugenio Pérez 
1586152128d6SEugenio Pérez     r = ioctl(device_fd, VHOST_GET_BACKEND_FEATURES, &backend_features);
1587152128d6SEugenio Pérez     if (unlikely(r < 0)) {
1588152128d6SEugenio Pérez         error_setg_errno(errp, errno, "Cannot get vdpa backend_features");
1589152128d6SEugenio Pérez         return r;
1590152128d6SEugenio Pérez     }
1591152128d6SEugenio Pérez 
1592152128d6SEugenio Pérez     if (!(backend_features & BIT_ULL(VHOST_BACKEND_F_IOTLB_ASID))) {
1593152128d6SEugenio Pérez         return 0;
1594152128d6SEugenio Pérez     }
1595152128d6SEugenio Pérez 
1596845ec38aSEugenio Pérez     r = ioctl(device_fd, VHOST_VDPA_SET_STATUS, &status);
1597152128d6SEugenio Pérez     if (unlikely(r)) {
1598845ec38aSEugenio Pérez         error_setg_errno(errp, -r, "Cannot set device status");
1599f1085882SEugenio Pérez         goto out;
1600152128d6SEugenio Pérez     }
1601152128d6SEugenio Pérez 
1602845ec38aSEugenio Pérez     r = ioctl(device_fd, VHOST_SET_FEATURES, &features);
1603845ec38aSEugenio Pérez     if (unlikely(r)) {
1604845ec38aSEugenio Pérez         error_setg_errno(errp, -r, "Cannot set features");
1605845ec38aSEugenio Pérez         goto out;
1606845ec38aSEugenio Pérez     }
1607845ec38aSEugenio Pérez 
1608845ec38aSEugenio Pérez     status |= VIRTIO_CONFIG_S_FEATURES_OK;
1609152128d6SEugenio Pérez     r = ioctl(device_fd, VHOST_VDPA_SET_STATUS, &status);
1610152128d6SEugenio Pérez     if (unlikely(r)) {
1611845ec38aSEugenio Pérez         error_setg_errno(errp, -r, "Cannot set device status");
1612152128d6SEugenio Pérez         goto out;
1613152128d6SEugenio Pérez     }
1614152128d6SEugenio Pérez 
1615152128d6SEugenio Pérez     cvq_group = vhost_vdpa_get_vring_group(device_fd, cvq_index, errp);
1616152128d6SEugenio Pérez     if (unlikely(cvq_group < 0)) {
1617152128d6SEugenio Pérez         if (cvq_group != -ENOTSUP) {
1618152128d6SEugenio Pérez             r = cvq_group;
1619152128d6SEugenio Pérez             goto out;
1620152128d6SEugenio Pérez         }
1621152128d6SEugenio Pérez 
1622152128d6SEugenio Pérez         /*
1623152128d6SEugenio Pérez          * The kernel report VHOST_BACKEND_F_IOTLB_ASID if the vdpa frontend
1624152128d6SEugenio Pérez          * support ASID even if the parent driver does not.  The CVQ cannot be
1625152128d6SEugenio Pérez          * isolated in this case.
1626152128d6SEugenio Pérez          */
1627152128d6SEugenio Pérez         error_free(*errp);
1628152128d6SEugenio Pérez         *errp = NULL;
1629152128d6SEugenio Pérez         r = 0;
1630152128d6SEugenio Pérez         goto out;
1631152128d6SEugenio Pérez     }
1632152128d6SEugenio Pérez 
1633152128d6SEugenio Pérez     for (int i = 0; i < cvq_index; ++i) {
1634152128d6SEugenio Pérez         int64_t group = vhost_vdpa_get_vring_group(device_fd, i, errp);
1635152128d6SEugenio Pérez         if (unlikely(group < 0)) {
1636152128d6SEugenio Pérez             r = group;
1637152128d6SEugenio Pérez             goto out;
1638152128d6SEugenio Pérez         }
1639152128d6SEugenio Pérez 
1640152128d6SEugenio Pérez         if (group == (int64_t)cvq_group) {
1641152128d6SEugenio Pérez             r = 0;
1642152128d6SEugenio Pérez             goto out;
1643152128d6SEugenio Pérez         }
1644152128d6SEugenio Pérez     }
1645152128d6SEugenio Pérez 
1646152128d6SEugenio Pérez     r = 1;
1647152128d6SEugenio Pérez 
1648152128d6SEugenio Pérez out:
1649152128d6SEugenio Pérez     status = 0;
1650152128d6SEugenio Pérez     ioctl(device_fd, VHOST_VDPA_SET_STATUS, &status);
1651152128d6SEugenio Pérez     return r;
1652152128d6SEugenio Pérez }
1653152128d6SEugenio Pérez 
net_vhost_vdpa_init(NetClientState * peer,const char * device,const char * name,int vdpa_device_fd,int queue_pair_index,int nvqs,bool is_datapath,bool svq,struct vhost_vdpa_iova_range iova_range,uint64_t features,Error ** errp)1654654790b6SJason Wang static NetClientState *net_vhost_vdpa_init(NetClientState *peer,
1655654790b6SJason Wang                                        const char *device,
1656654790b6SJason Wang                                        const char *name,
165740237840SJason Wang                                        int vdpa_device_fd,
165840237840SJason Wang                                        int queue_pair_index,
165940237840SJason Wang                                        int nvqs,
16601576dbb5SEugenio Pérez                                        bool is_datapath,
16611576dbb5SEugenio Pérez                                        bool svq,
16625c1ebd4cSEugenio Pérez                                        struct vhost_vdpa_iova_range iova_range,
1663152128d6SEugenio Pérez                                        uint64_t features,
1664152128d6SEugenio Pérez                                        Error **errp)
16651e0a84eaSCindy Lu {
16661e0a84eaSCindy Lu     NetClientState *nc = NULL;
16671e0a84eaSCindy Lu     VhostVDPAState *s;
16681e0a84eaSCindy Lu     int ret = 0;
16691e0a84eaSCindy Lu     assert(name);
1670e77db790SStefan Hajnoczi     int cvq_isolated = 0;
1671152128d6SEugenio Pérez 
167240237840SJason Wang     if (is_datapath) {
167340237840SJason Wang         nc = qemu_new_net_client(&net_vhost_vdpa_info, peer, device,
167440237840SJason Wang                                  name);
167540237840SJason Wang     } else {
1676152128d6SEugenio Pérez         cvq_isolated = vhost_vdpa_probe_cvq_isolation(vdpa_device_fd, features,
1677152128d6SEugenio Pérez                                                       queue_pair_index * 2,
1678152128d6SEugenio Pérez                                                       errp);
1679152128d6SEugenio Pérez         if (unlikely(cvq_isolated < 0)) {
1680152128d6SEugenio Pérez             return NULL;
1681152128d6SEugenio Pérez         }
1682152128d6SEugenio Pérez 
1683f8972b56SEugenio Pérez         nc = qemu_new_net_control_client(&net_vhost_vdpa_cvq_info, peer,
168440237840SJason Wang                                          device, name);
168540237840SJason Wang     }
168653b85d95SLaurent Vivier     qemu_set_info_str(nc, TYPE_VHOST_VDPA);
16871e0a84eaSCindy Lu     s = DO_UPCAST(VhostVDPAState, nc, nc);
16887327813dSJason Wang 
16891e0a84eaSCindy Lu     s->vhost_vdpa.device_fd = vdpa_device_fd;
169040237840SJason Wang     s->vhost_vdpa.index = queue_pair_index;
16917f211a28SEugenio Pérez     s->always_svq = svq;
1692d9cda213SSteve Sistare     s->migration_state.notify = NULL;
16931576dbb5SEugenio Pérez     s->vhost_vdpa.shadow_vqs_enabled = svq;
1694a585fad2SEugenio Pérez     s->vhost_vdpa.iova_range = iova_range;
16956188d78aSEugenio Pérez     s->vhost_vdpa.shadow_data = svq;
16965c1ebd4cSEugenio Pérez     if (queue_pair_index == 0) {
16975c1ebd4cSEugenio Pérez         vhost_vdpa_net_valid_svq_features(features,
16985c1ebd4cSEugenio Pérez                                           &s->vhost_vdpa.migration_blocker);
16995c1ebd4cSEugenio Pérez     } else if (!is_datapath) {
1700babf8b87SEugenio Pérez         s->cvq_cmd_out_buffer = mmap(NULL, vhost_vdpa_net_cvq_cmd_page_len(),
1701babf8b87SEugenio Pérez                                      PROT_READ | PROT_WRITE,
1702babf8b87SEugenio Pérez                                      MAP_SHARED | MAP_ANONYMOUS, -1, 0);
1703babf8b87SEugenio Pérez         s->status = mmap(NULL, vhost_vdpa_net_cvq_cmd_page_len(),
1704babf8b87SEugenio Pérez                          PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS,
1705babf8b87SEugenio Pérez                          -1, 0);
17062df4dd31SEugenio Pérez 
1707bd907ae4SEugenio Pérez         s->vhost_vdpa.shadow_vq_ops = &vhost_vdpa_net_svq_ops;
1708bd907ae4SEugenio Pérez         s->vhost_vdpa.shadow_vq_ops_opaque = s;
1709152128d6SEugenio Pérez         s->cvq_isolated = cvq_isolated;
17108bc0049eSEugenio Pérez     }
171140237840SJason Wang     ret = vhost_vdpa_add(nc, (void *)&s->vhost_vdpa, queue_pair_index, nvqs);
171274af5eecSJason Wang     if (ret) {
171374af5eecSJason Wang         qemu_del_net_client(nc);
1714654790b6SJason Wang         return NULL;
171574af5eecSJason Wang     }
1716654790b6SJason Wang     return nc;
17171e0a84eaSCindy Lu }
17181e0a84eaSCindy Lu 
vhost_vdpa_get_features(int fd,uint64_t * features,Error ** errp)17198170ab3fSEugenio Pérez static int vhost_vdpa_get_features(int fd, uint64_t *features, Error **errp)
17208170ab3fSEugenio Pérez {
17218170ab3fSEugenio Pérez     int ret = ioctl(fd, VHOST_GET_FEATURES, features);
17228170ab3fSEugenio Pérez     if (unlikely(ret < 0)) {
17238170ab3fSEugenio Pérez         error_setg_errno(errp, errno,
17248170ab3fSEugenio Pérez                          "Fail to query features from vhost-vDPA device");
17258170ab3fSEugenio Pérez     }
17268170ab3fSEugenio Pérez     return ret;
17278170ab3fSEugenio Pérez }
17288170ab3fSEugenio Pérez 
vhost_vdpa_get_max_queue_pairs(int fd,uint64_t features,int * has_cvq,Error ** errp)17298170ab3fSEugenio Pérez static int vhost_vdpa_get_max_queue_pairs(int fd, uint64_t features,
17308170ab3fSEugenio Pérez                                           int *has_cvq, Error **errp)
173140237840SJason Wang {
173240237840SJason Wang     unsigned long config_size = offsetof(struct vhost_vdpa_config, buf);
1733cd523a41SStefano Garzarella     g_autofree struct vhost_vdpa_config *config = NULL;
173440237840SJason Wang     __virtio16 *max_queue_pairs;
173540237840SJason Wang     int ret;
173640237840SJason Wang 
173740237840SJason Wang     if (features & (1 << VIRTIO_NET_F_CTRL_VQ)) {
173840237840SJason Wang         *has_cvq = 1;
173940237840SJason Wang     } else {
174040237840SJason Wang         *has_cvq = 0;
174140237840SJason Wang     }
174240237840SJason Wang 
174340237840SJason Wang     if (features & (1 << VIRTIO_NET_F_MQ)) {
174440237840SJason Wang         config = g_malloc0(config_size + sizeof(*max_queue_pairs));
174540237840SJason Wang         config->off = offsetof(struct virtio_net_config, max_virtqueue_pairs);
174640237840SJason Wang         config->len = sizeof(*max_queue_pairs);
174740237840SJason Wang 
174840237840SJason Wang         ret = ioctl(fd, VHOST_VDPA_GET_CONFIG, config);
174940237840SJason Wang         if (ret) {
175040237840SJason Wang             error_setg(errp, "Fail to get config from vhost-vDPA device");
175140237840SJason Wang             return -ret;
175240237840SJason Wang         }
175340237840SJason Wang 
175440237840SJason Wang         max_queue_pairs = (__virtio16 *)&config->buf;
175540237840SJason Wang 
175640237840SJason Wang         return lduw_le_p(max_queue_pairs);
175740237840SJason Wang     }
175840237840SJason Wang 
175940237840SJason Wang     return 1;
176040237840SJason Wang }
176140237840SJason Wang 
net_init_vhost_vdpa(const Netdev * netdev,const char * name,NetClientState * peer,Error ** errp)17621e0a84eaSCindy Lu int net_init_vhost_vdpa(const Netdev *netdev, const char *name,
17631e0a84eaSCindy Lu                         NetClientState *peer, Error **errp)
17641e0a84eaSCindy Lu {
17651e0a84eaSCindy Lu     const NetdevVhostVDPAOptions *opts;
17668170ab3fSEugenio Pérez     uint64_t features;
1767654790b6SJason Wang     int vdpa_device_fd;
1768eb3cb751SEugenio Pérez     g_autofree NetClientState **ncs = NULL;
1769a585fad2SEugenio Pérez     struct vhost_vdpa_iova_range iova_range;
1770eb3cb751SEugenio Pérez     NetClientState *nc;
1771aed5da45SEugenio Pérez     int queue_pairs, r, i = 0, has_cvq = 0;
17721e0a84eaSCindy Lu 
17731e0a84eaSCindy Lu     assert(netdev->type == NET_CLIENT_DRIVER_VHOST_VDPA);
17741e0a84eaSCindy Lu     opts = &netdev->u.vhost_vdpa;
17757480874aSMarkus Armbruster     if (!opts->vhostdev && !opts->vhostfd) {
17768801ccd0SSi-Wei Liu         error_setg(errp,
17778801ccd0SSi-Wei Liu                    "vhost-vdpa: neither vhostdev= nor vhostfd= was specified");
1778c8295404SEugenio Pérez         return -1;
1779c8295404SEugenio Pérez     }
17807327813dSJason Wang 
17817480874aSMarkus Armbruster     if (opts->vhostdev && opts->vhostfd) {
17828801ccd0SSi-Wei Liu         error_setg(errp,
17838801ccd0SSi-Wei Liu                    "vhost-vdpa: vhostdev= and vhostfd= are mutually exclusive");
17848801ccd0SSi-Wei Liu         return -1;
17858801ccd0SSi-Wei Liu     }
17868801ccd0SSi-Wei Liu 
17877480874aSMarkus Armbruster     if (opts->vhostdev) {
17880351152bSEugenio Pérez         vdpa_device_fd = qemu_open(opts->vhostdev, O_RDWR, errp);
17897327813dSJason Wang         if (vdpa_device_fd == -1) {
17907327813dSJason Wang             return -errno;
17917327813dSJason Wang         }
17925107fd3eSPeter Maydell     } else {
17935107fd3eSPeter Maydell         /* has_vhostfd */
17948801ccd0SSi-Wei Liu         vdpa_device_fd = monitor_fd_param(monitor_cur(), opts->vhostfd, errp);
17958801ccd0SSi-Wei Liu         if (vdpa_device_fd == -1) {
17968801ccd0SSi-Wei Liu             error_prepend(errp, "vhost-vdpa: unable to parse vhostfd: ");
17978801ccd0SSi-Wei Liu             return -1;
17988801ccd0SSi-Wei Liu         }
17998801ccd0SSi-Wei Liu     }
18007327813dSJason Wang 
18018170ab3fSEugenio Pérez     r = vhost_vdpa_get_features(vdpa_device_fd, &features, errp);
18028170ab3fSEugenio Pérez     if (unlikely(r < 0)) {
1803aed5da45SEugenio Pérez         goto err;
18048170ab3fSEugenio Pérez     }
18058170ab3fSEugenio Pérez 
18068170ab3fSEugenio Pérez     queue_pairs = vhost_vdpa_get_max_queue_pairs(vdpa_device_fd, features,
180740237840SJason Wang                                                  &has_cvq, errp);
180840237840SJason Wang     if (queue_pairs < 0) {
18097327813dSJason Wang         qemu_close(vdpa_device_fd);
181040237840SJason Wang         return queue_pairs;
18117327813dSJason Wang     }
18127327813dSJason Wang 
1813bf7a2ad8SLongpeng     r = vhost_vdpa_get_iova_range(vdpa_device_fd, &iova_range);
1814bf7a2ad8SLongpeng     if (unlikely(r < 0)) {
1815bf7a2ad8SLongpeng         error_setg(errp, "vhost-vdpa: get iova range failed: %s",
1816bf7a2ad8SLongpeng                    strerror(-r));
1817bf7a2ad8SLongpeng         goto err;
1818bf7a2ad8SLongpeng     }
1819bf7a2ad8SLongpeng 
182000ef422eSEugenio Pérez     if (opts->x_svq && !vhost_vdpa_net_valid_svq_features(features, errp)) {
182100ef422eSEugenio Pérez         goto err;
18221576dbb5SEugenio Pérez     }
18231576dbb5SEugenio Pérez 
182440237840SJason Wang     ncs = g_malloc0(sizeof(*ncs) * queue_pairs);
182540237840SJason Wang 
182640237840SJason Wang     for (i = 0; i < queue_pairs; i++) {
182740237840SJason Wang         ncs[i] = net_vhost_vdpa_init(peer, TYPE_VHOST_VDPA, name,
18281576dbb5SEugenio Pérez                                      vdpa_device_fd, i, 2, true, opts->x_svq,
1829152128d6SEugenio Pérez                                      iova_range, features, errp);
183040237840SJason Wang         if (!ncs[i])
183140237840SJason Wang             goto err;
183240237840SJason Wang     }
183340237840SJason Wang 
183440237840SJason Wang     if (has_cvq) {
183540237840SJason Wang         nc = net_vhost_vdpa_init(peer, TYPE_VHOST_VDPA, name,
18361576dbb5SEugenio Pérez                                  vdpa_device_fd, i, 1, false,
1837152128d6SEugenio Pérez                                  opts->x_svq, iova_range, features, errp);
183840237840SJason Wang         if (!nc)
183940237840SJason Wang             goto err;
184040237840SJason Wang     }
184140237840SJason Wang 
1842654790b6SJason Wang     return 0;
184340237840SJason Wang 
184440237840SJason Wang err:
184540237840SJason Wang     if (i) {
18469bd05507SSi-Wei Liu         for (i--; i >= 0; i--) {
18479bd05507SSi-Wei Liu             qemu_del_net_client(ncs[i]);
18489bd05507SSi-Wei Liu         }
184940237840SJason Wang     }
18501576dbb5SEugenio Pérez 
185140237840SJason Wang     qemu_close(vdpa_device_fd);
185240237840SJason Wang 
185340237840SJason Wang     return -1;
18541e0a84eaSCindy Lu }
1855