xref: /openbmc/qemu/net/vhost-vdpa.c (revision 9da16849)
11e0a84eaSCindy Lu /*
21e0a84eaSCindy Lu  * vhost-vdpa.c
31e0a84eaSCindy Lu  *
41e0a84eaSCindy Lu  * Copyright(c) 2017-2018 Intel Corporation.
51e0a84eaSCindy Lu  * Copyright(c) 2020 Red Hat, Inc.
61e0a84eaSCindy Lu  *
71e0a84eaSCindy Lu  * This work is licensed under the terms of the GNU GPL, version 2 or later.
81e0a84eaSCindy Lu  * See the COPYING file in the top-level directory.
91e0a84eaSCindy Lu  *
101e0a84eaSCindy Lu  */
111e0a84eaSCindy Lu 
121e0a84eaSCindy Lu #include "qemu/osdep.h"
131e0a84eaSCindy Lu #include "clients.h"
14bd907ae4SEugenio Pérez #include "hw/virtio/virtio-net.h"
151e0a84eaSCindy Lu #include "net/vhost_net.h"
161e0a84eaSCindy Lu #include "net/vhost-vdpa.h"
171e0a84eaSCindy Lu #include "hw/virtio/vhost-vdpa.h"
181e0a84eaSCindy Lu #include "qemu/config-file.h"
191e0a84eaSCindy Lu #include "qemu/error-report.h"
20bd907ae4SEugenio Pérez #include "qemu/log.h"
21bd907ae4SEugenio Pérez #include "qemu/memalign.h"
221e0a84eaSCindy Lu #include "qemu/option.h"
231e0a84eaSCindy Lu #include "qapi/error.h"
2440237840SJason Wang #include <linux/vhost.h>
251e0a84eaSCindy Lu #include <sys/ioctl.h>
261e0a84eaSCindy Lu #include <err.h>
271e0a84eaSCindy Lu #include "standard-headers/linux/virtio_net.h"
281e0a84eaSCindy Lu #include "monitor/monitor.h"
2969498430SEugenio Pérez #include "migration/migration.h"
3069498430SEugenio Pérez #include "migration/misc.h"
311e0a84eaSCindy Lu #include "hw/virtio/vhost.h"
321e0a84eaSCindy Lu 
331e0a84eaSCindy Lu /* Todo:need to add the multiqueue support here */
341e0a84eaSCindy Lu typedef struct VhostVDPAState {
351e0a84eaSCindy Lu     NetClientState nc;
361e0a84eaSCindy Lu     struct vhost_vdpa vhost_vdpa;
3769498430SEugenio Pérez     Notifier migration_state;
381e0a84eaSCindy Lu     VHostNetState *vhost_net;
392df4dd31SEugenio Pérez 
402df4dd31SEugenio Pérez     /* Control commands shadow buffers */
4117fb889fSEugenio Pérez     void *cvq_cmd_out_buffer;
4217fb889fSEugenio Pérez     virtio_net_ctrl_ack *status;
4317fb889fSEugenio Pérez 
447f211a28SEugenio Pérez     /* The device always have SVQ enabled */
457f211a28SEugenio Pérez     bool always_svq;
46152128d6SEugenio Pérez 
47152128d6SEugenio Pérez     /* The device can isolate CVQ in its own ASID */
48152128d6SEugenio Pérez     bool cvq_isolated;
49152128d6SEugenio Pérez 
501e0a84eaSCindy Lu     bool started;
511e0a84eaSCindy Lu } VhostVDPAState;
521e0a84eaSCindy Lu 
532875a0caSHawkins Jiawei /*
542875a0caSHawkins Jiawei  * The array is sorted alphabetically in ascending order,
552875a0caSHawkins Jiawei  * with the exception of VHOST_INVALID_FEATURE_BIT,
562875a0caSHawkins Jiawei  * which should always be the last entry.
572875a0caSHawkins Jiawei  */
581e0a84eaSCindy Lu const int vdpa_feature_bits[] = {
591e0a84eaSCindy Lu     VIRTIO_F_ANY_LAYOUT,
602875a0caSHawkins Jiawei     VIRTIO_F_IOMMU_PLATFORM,
612875a0caSHawkins Jiawei     VIRTIO_F_NOTIFY_ON_EMPTY,
622875a0caSHawkins Jiawei     VIRTIO_F_RING_PACKED,
632875a0caSHawkins Jiawei     VIRTIO_F_RING_RESET,
641e0a84eaSCindy Lu     VIRTIO_F_VERSION_1,
651e0a84eaSCindy Lu     VIRTIO_NET_F_CSUM,
6651e84244SEugenio Pérez     VIRTIO_NET_F_CTRL_GUEST_OFFLOADS,
672875a0caSHawkins Jiawei     VIRTIO_NET_F_CTRL_MAC_ADDR,
6840237840SJason Wang     VIRTIO_NET_F_CTRL_RX,
6940237840SJason Wang     VIRTIO_NET_F_CTRL_RX_EXTRA,
7040237840SJason Wang     VIRTIO_NET_F_CTRL_VLAN,
7140237840SJason Wang     VIRTIO_NET_F_CTRL_VQ,
722875a0caSHawkins Jiawei     VIRTIO_NET_F_GSO,
732875a0caSHawkins Jiawei     VIRTIO_NET_F_GUEST_CSUM,
742875a0caSHawkins Jiawei     VIRTIO_NET_F_GUEST_ECN,
752875a0caSHawkins Jiawei     VIRTIO_NET_F_GUEST_TSO4,
762875a0caSHawkins Jiawei     VIRTIO_NET_F_GUEST_TSO6,
772875a0caSHawkins Jiawei     VIRTIO_NET_F_GUEST_UFO,
78*9da16849SAndrew Melnychenko     VIRTIO_NET_F_GUEST_USO4,
79*9da16849SAndrew Melnychenko     VIRTIO_NET_F_GUEST_USO6,
800145c393SAndrew Melnychenko     VIRTIO_NET_F_HASH_REPORT,
812875a0caSHawkins Jiawei     VIRTIO_NET_F_HOST_ECN,
822875a0caSHawkins Jiawei     VIRTIO_NET_F_HOST_TSO4,
832875a0caSHawkins Jiawei     VIRTIO_NET_F_HOST_TSO6,
842875a0caSHawkins Jiawei     VIRTIO_NET_F_HOST_UFO,
85*9da16849SAndrew Melnychenko     VIRTIO_NET_F_HOST_USO,
862875a0caSHawkins Jiawei     VIRTIO_NET_F_MQ,
872875a0caSHawkins Jiawei     VIRTIO_NET_F_MRG_RXBUF,
882875a0caSHawkins Jiawei     VIRTIO_NET_F_MTU,
892875a0caSHawkins Jiawei     VIRTIO_NET_F_RSS,
909aa47eddSSi-Wei Liu     VIRTIO_NET_F_STATUS,
912875a0caSHawkins Jiawei     VIRTIO_RING_F_EVENT_IDX,
922875a0caSHawkins Jiawei     VIRTIO_RING_F_INDIRECT_DESC,
932875a0caSHawkins Jiawei 
942875a0caSHawkins Jiawei     /* VHOST_INVALID_FEATURE_BIT should always be the last entry */
951e0a84eaSCindy Lu     VHOST_INVALID_FEATURE_BIT
961e0a84eaSCindy Lu };
971e0a84eaSCindy Lu 
981576dbb5SEugenio Pérez /** Supported device specific feature bits with SVQ */
991576dbb5SEugenio Pérez static const uint64_t vdpa_svq_device_features =
1001576dbb5SEugenio Pérez     BIT_ULL(VIRTIO_NET_F_CSUM) |
1011576dbb5SEugenio Pérez     BIT_ULL(VIRTIO_NET_F_GUEST_CSUM) |
1024b4a1378SHawkins Jiawei     BIT_ULL(VIRTIO_NET_F_CTRL_GUEST_OFFLOADS) |
1031576dbb5SEugenio Pérez     BIT_ULL(VIRTIO_NET_F_MTU) |
1041576dbb5SEugenio Pérez     BIT_ULL(VIRTIO_NET_F_MAC) |
1051576dbb5SEugenio Pérez     BIT_ULL(VIRTIO_NET_F_GUEST_TSO4) |
1061576dbb5SEugenio Pérez     BIT_ULL(VIRTIO_NET_F_GUEST_TSO6) |
1071576dbb5SEugenio Pérez     BIT_ULL(VIRTIO_NET_F_GUEST_ECN) |
1081576dbb5SEugenio Pérez     BIT_ULL(VIRTIO_NET_F_GUEST_UFO) |
1091576dbb5SEugenio Pérez     BIT_ULL(VIRTIO_NET_F_HOST_TSO4) |
1101576dbb5SEugenio Pérez     BIT_ULL(VIRTIO_NET_F_HOST_TSO6) |
1111576dbb5SEugenio Pérez     BIT_ULL(VIRTIO_NET_F_HOST_ECN) |
1121576dbb5SEugenio Pérez     BIT_ULL(VIRTIO_NET_F_HOST_UFO) |
1131576dbb5SEugenio Pérez     BIT_ULL(VIRTIO_NET_F_MRG_RXBUF) |
1141576dbb5SEugenio Pérez     BIT_ULL(VIRTIO_NET_F_STATUS) |
1151576dbb5SEugenio Pérez     BIT_ULL(VIRTIO_NET_F_CTRL_VQ) |
116ea6eec49SHawkins Jiawei     BIT_ULL(VIRTIO_NET_F_CTRL_RX) |
117d669b7bbSHawkins Jiawei     BIT_ULL(VIRTIO_NET_F_CTRL_RX_EXTRA) |
11872b99a87SEugenio Pérez     BIT_ULL(VIRTIO_NET_F_MQ) |
1191576dbb5SEugenio Pérez     BIT_ULL(VIRTIO_F_ANY_LAYOUT) |
1201576dbb5SEugenio Pérez     BIT_ULL(VIRTIO_NET_F_CTRL_MAC_ADDR) |
121609ab4c3SEugenio Pérez     /* VHOST_F_LOG_ALL is exposed by SVQ */
122609ab4c3SEugenio Pérez     BIT_ULL(VHOST_F_LOG_ALL) |
1231576dbb5SEugenio Pérez     BIT_ULL(VIRTIO_NET_F_RSC_EXT) |
1240d74e2b7SEugenio Pérez     BIT_ULL(VIRTIO_NET_F_STANDBY) |
1250d74e2b7SEugenio Pérez     BIT_ULL(VIRTIO_NET_F_SPEED_DUPLEX);
1261576dbb5SEugenio Pérez 
/*
 * Address space id the CVQ group is moved to when the control virtqueue is
 * shadowed in isolation from the data virtqueues.
 */
#define VHOST_VDPA_NET_CVQ_ASID 1
128c1a10086SEugenio Pérez 
1291e0a84eaSCindy Lu VHostNetState *vhost_vdpa_get_vhost_net(NetClientState *nc)
1301e0a84eaSCindy Lu {
1311e0a84eaSCindy Lu     VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
1321e0a84eaSCindy Lu     assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
1331e0a84eaSCindy Lu     return s->vhost_net;
1341e0a84eaSCindy Lu }
1351e0a84eaSCindy Lu 
136915bf6ccSEugenio Pérez static size_t vhost_vdpa_net_cvq_cmd_len(void)
137915bf6ccSEugenio Pérez {
138915bf6ccSEugenio Pérez     /*
139915bf6ccSEugenio Pérez      * MAC_TABLE_SET is the ctrl command that produces the longer out buffer.
140915bf6ccSEugenio Pérez      * In buffer is always 1 byte, so it should fit here
141915bf6ccSEugenio Pérez      */
142915bf6ccSEugenio Pérez     return sizeof(struct virtio_net_ctrl_hdr) +
143915bf6ccSEugenio Pérez            2 * sizeof(struct virtio_net_ctrl_mac) +
144915bf6ccSEugenio Pérez            MAC_TABLE_ENTRIES * ETH_ALEN;
145915bf6ccSEugenio Pérez }
146915bf6ccSEugenio Pérez 
/* Control command buffer length rounded up to a multiple of the host page */
static size_t vhost_vdpa_net_cvq_cmd_page_len(void)
{
    const size_t cmd_len = vhost_vdpa_net_cvq_cmd_len();

    return ROUND_UP(cmd_len, qemu_real_host_page_size());
}
151915bf6ccSEugenio Pérez 
15236e46472SEugenio Pérez static bool vhost_vdpa_net_valid_svq_features(uint64_t features, Error **errp)
15336e46472SEugenio Pérez {
15436e46472SEugenio Pérez     uint64_t invalid_dev_features =
15536e46472SEugenio Pérez         features & ~vdpa_svq_device_features &
15636e46472SEugenio Pérez         /* Transport are all accepted at this point */
15736e46472SEugenio Pérez         ~MAKE_64BIT_MASK(VIRTIO_TRANSPORT_F_START,
15836e46472SEugenio Pérez                          VIRTIO_TRANSPORT_F_END - VIRTIO_TRANSPORT_F_START);
15936e46472SEugenio Pérez 
16036e46472SEugenio Pérez     if (invalid_dev_features) {
16136e46472SEugenio Pérez         error_setg(errp, "vdpa svq does not work with features 0x%" PRIx64,
16236e46472SEugenio Pérez                    invalid_dev_features);
163258a0394SEugenio Pérez         return false;
16436e46472SEugenio Pérez     }
16536e46472SEugenio Pérez 
166258a0394SEugenio Pérez     return vhost_svq_valid_features(features, errp);
16736e46472SEugenio Pérez }
16836e46472SEugenio Pérez 
1691e0a84eaSCindy Lu static int vhost_vdpa_net_check_device_id(struct vhost_net *net)
1701e0a84eaSCindy Lu {
1711e0a84eaSCindy Lu     uint32_t device_id;
1721e0a84eaSCindy Lu     int ret;
1731e0a84eaSCindy Lu     struct vhost_dev *hdev;
1741e0a84eaSCindy Lu 
1751e0a84eaSCindy Lu     hdev = (struct vhost_dev *)&net->dev;
1761e0a84eaSCindy Lu     ret = hdev->vhost_ops->vhost_get_device_id(hdev, &device_id);
1771e0a84eaSCindy Lu     if (device_id != VIRTIO_ID_NET) {
1781e0a84eaSCindy Lu         return -ENOTSUP;
1791e0a84eaSCindy Lu     }
1801e0a84eaSCindy Lu     return ret;
1811e0a84eaSCindy Lu }
1821e0a84eaSCindy Lu 
18340237840SJason Wang static int vhost_vdpa_add(NetClientState *ncs, void *be,
18440237840SJason Wang                           int queue_pair_index, int nvqs)
1851e0a84eaSCindy Lu {
1861e0a84eaSCindy Lu     VhostNetOptions options;
1871e0a84eaSCindy Lu     struct vhost_net *net = NULL;
1881e0a84eaSCindy Lu     VhostVDPAState *s;
1891e0a84eaSCindy Lu     int ret;
1901e0a84eaSCindy Lu 
1911e0a84eaSCindy Lu     options.backend_type = VHOST_BACKEND_TYPE_VDPA;
1921e0a84eaSCindy Lu     assert(ncs->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
1931e0a84eaSCindy Lu     s = DO_UPCAST(VhostVDPAState, nc, ncs);
1941e0a84eaSCindy Lu     options.net_backend = ncs;
1951e0a84eaSCindy Lu     options.opaque      = be;
1961e0a84eaSCindy Lu     options.busyloop_timeout = 0;
19740237840SJason Wang     options.nvqs = nvqs;
1981e0a84eaSCindy Lu 
1991e0a84eaSCindy Lu     net = vhost_net_init(&options);
2001e0a84eaSCindy Lu     if (!net) {
2011e0a84eaSCindy Lu         error_report("failed to init vhost_net for queue");
202a97ef87aSJason Wang         goto err_init;
2031e0a84eaSCindy Lu     }
2041e0a84eaSCindy Lu     s->vhost_net = net;
2051e0a84eaSCindy Lu     ret = vhost_vdpa_net_check_device_id(net);
2061e0a84eaSCindy Lu     if (ret) {
207a97ef87aSJason Wang         goto err_check;
2081e0a84eaSCindy Lu     }
2091e0a84eaSCindy Lu     return 0;
210a97ef87aSJason Wang err_check:
2111e0a84eaSCindy Lu     vhost_net_cleanup(net);
212ab36edcfSJason Wang     g_free(net);
213a97ef87aSJason Wang err_init:
2141e0a84eaSCindy Lu     return -1;
2151e0a84eaSCindy Lu }
2161e0a84eaSCindy Lu 
2171e0a84eaSCindy Lu static void vhost_vdpa_cleanup(NetClientState *nc)
2181e0a84eaSCindy Lu {
2191e0a84eaSCindy Lu     VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
2201e0a84eaSCindy Lu 
221a0d7215eSAni Sinha     /*
222a0d7215eSAni Sinha      * If a peer NIC is attached, do not cleanup anything.
223a0d7215eSAni Sinha      * Cleanup will happen as a part of qemu_cleanup() -> net_cleanup()
224a0d7215eSAni Sinha      * when the guest is shutting down.
225a0d7215eSAni Sinha      */
226a0d7215eSAni Sinha     if (nc->peer && nc->peer->info->type == NET_CLIENT_DRIVER_NIC) {
227a0d7215eSAni Sinha         return;
228a0d7215eSAni Sinha     }
229babf8b87SEugenio Pérez     munmap(s->cvq_cmd_out_buffer, vhost_vdpa_net_cvq_cmd_page_len());
230babf8b87SEugenio Pérez     munmap(s->status, vhost_vdpa_net_cvq_cmd_page_len());
2311e0a84eaSCindy Lu     if (s->vhost_net) {
2321e0a84eaSCindy Lu         vhost_net_cleanup(s->vhost_net);
2331e0a84eaSCindy Lu         g_free(s->vhost_net);
2341e0a84eaSCindy Lu         s->vhost_net = NULL;
2351e0a84eaSCindy Lu     }
23657b3a7d8SCindy Lu      if (s->vhost_vdpa.device_fd >= 0) {
23757b3a7d8SCindy Lu         qemu_close(s->vhost_vdpa.device_fd);
23857b3a7d8SCindy Lu         s->vhost_vdpa.device_fd = -1;
23957b3a7d8SCindy Lu     }
2401e0a84eaSCindy Lu }
2411e0a84eaSCindy Lu 
2421e0a84eaSCindy Lu static bool vhost_vdpa_has_vnet_hdr(NetClientState *nc)
2431e0a84eaSCindy Lu {
2441e0a84eaSCindy Lu     assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
2451e0a84eaSCindy Lu 
2461e0a84eaSCindy Lu     return true;
2471e0a84eaSCindy Lu }
2481e0a84eaSCindy Lu 
2491e0a84eaSCindy Lu static bool vhost_vdpa_has_ufo(NetClientState *nc)
2501e0a84eaSCindy Lu {
2511e0a84eaSCindy Lu     assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
2521e0a84eaSCindy Lu     VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
2531e0a84eaSCindy Lu     uint64_t features = 0;
2541e0a84eaSCindy Lu     features |= (1ULL << VIRTIO_NET_F_HOST_UFO);
2551e0a84eaSCindy Lu     features = vhost_net_get_features(s->vhost_net, features);
2561e0a84eaSCindy Lu     return !!(features & (1ULL << VIRTIO_NET_F_HOST_UFO));
2571e0a84eaSCindy Lu 
2581e0a84eaSCindy Lu }
2591e0a84eaSCindy Lu 
260ee8a1c63SKevin Wolf static bool vhost_vdpa_check_peer_type(NetClientState *nc, ObjectClass *oc,
261ee8a1c63SKevin Wolf                                        Error **errp)
262ee8a1c63SKevin Wolf {
263ee8a1c63SKevin Wolf     const char *driver = object_class_get_name(oc);
264ee8a1c63SKevin Wolf 
265ee8a1c63SKevin Wolf     if (!g_str_has_prefix(driver, "virtio-net-")) {
266ee8a1c63SKevin Wolf         error_setg(errp, "vhost-vdpa requires frontend driver virtio-net-*");
267ee8a1c63SKevin Wolf         return false;
268ee8a1c63SKevin Wolf     }
269ee8a1c63SKevin Wolf 
270ee8a1c63SKevin Wolf     return true;
271ee8a1c63SKevin Wolf }
272ee8a1c63SKevin Wolf 
273846a1e85SEugenio Pérez /** Dummy receive in case qemu falls back to userland tap networking */
274846a1e85SEugenio Pérez static ssize_t vhost_vdpa_receive(NetClientState *nc, const uint8_t *buf,
275846a1e85SEugenio Pérez                                   size_t size)
276846a1e85SEugenio Pérez {
277bc5add1dSSi-Wei Liu     return size;
278846a1e85SEugenio Pérez }
279846a1e85SEugenio Pérez 
28000ef422eSEugenio Pérez /** From any vdpa net client, get the netclient of the first queue pair */
28100ef422eSEugenio Pérez static VhostVDPAState *vhost_vdpa_net_first_nc_vdpa(VhostVDPAState *s)
28200ef422eSEugenio Pérez {
28300ef422eSEugenio Pérez     NICState *nic = qemu_get_nic(s->nc.peer);
28400ef422eSEugenio Pérez     NetClientState *nc0 = qemu_get_peer(nic->ncs, 0);
28500ef422eSEugenio Pérez 
28600ef422eSEugenio Pérez     return DO_UPCAST(VhostVDPAState, nc, nc0);
28700ef422eSEugenio Pérez }
28800ef422eSEugenio Pérez 
28969498430SEugenio Pérez static void vhost_vdpa_net_log_global_enable(VhostVDPAState *s, bool enable)
29069498430SEugenio Pérez {
29169498430SEugenio Pérez     struct vhost_vdpa *v = &s->vhost_vdpa;
29269498430SEugenio Pérez     VirtIONet *n;
29369498430SEugenio Pérez     VirtIODevice *vdev;
29469498430SEugenio Pérez     int data_queue_pairs, cvq, r;
29569498430SEugenio Pérez 
29669498430SEugenio Pérez     /* We are only called on the first data vqs and only if x-svq is not set */
29769498430SEugenio Pérez     if (s->vhost_vdpa.shadow_vqs_enabled == enable) {
29869498430SEugenio Pérez         return;
29969498430SEugenio Pérez     }
30069498430SEugenio Pérez 
30169498430SEugenio Pérez     vdev = v->dev->vdev;
30269498430SEugenio Pérez     n = VIRTIO_NET(vdev);
30369498430SEugenio Pérez     if (!n->vhost_started) {
30469498430SEugenio Pérez         return;
30569498430SEugenio Pérez     }
30669498430SEugenio Pérez 
30769498430SEugenio Pérez     data_queue_pairs = n->multiqueue ? n->max_queue_pairs : 1;
30869498430SEugenio Pérez     cvq = virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ) ?
30969498430SEugenio Pérez                                   n->max_ncs - n->max_queue_pairs : 0;
31069498430SEugenio Pérez     /*
31169498430SEugenio Pérez      * TODO: vhost_net_stop does suspend, get_base and reset. We can be smarter
31269498430SEugenio Pérez      * in the future and resume the device if read-only operations between
31369498430SEugenio Pérez      * suspend and reset goes wrong.
31469498430SEugenio Pérez      */
31569498430SEugenio Pérez     vhost_net_stop(vdev, n->nic->ncs, data_queue_pairs, cvq);
31669498430SEugenio Pérez 
31769498430SEugenio Pérez     /* Start will check migration setup_or_active to configure or not SVQ */
31869498430SEugenio Pérez     r = vhost_net_start(vdev, n->nic->ncs, data_queue_pairs, cvq);
31969498430SEugenio Pérez     if (unlikely(r < 0)) {
32069498430SEugenio Pérez         error_report("unable to start vhost net: %s(%d)", g_strerror(-r), -r);
32169498430SEugenio Pérez     }
32269498430SEugenio Pérez }
32369498430SEugenio Pérez 
32469498430SEugenio Pérez static void vdpa_net_migration_state_notifier(Notifier *notifier, void *data)
32569498430SEugenio Pérez {
32669498430SEugenio Pérez     MigrationState *migration = data;
32769498430SEugenio Pérez     VhostVDPAState *s = container_of(notifier, VhostVDPAState,
32869498430SEugenio Pérez                                      migration_state);
32969498430SEugenio Pérez 
33069498430SEugenio Pérez     if (migration_in_setup(migration)) {
33169498430SEugenio Pérez         vhost_vdpa_net_log_global_enable(s, true);
33269498430SEugenio Pérez     } else if (migration_has_failed(migration)) {
33369498430SEugenio Pérez         vhost_vdpa_net_log_global_enable(s, false);
33469498430SEugenio Pérez     }
33569498430SEugenio Pérez }
33669498430SEugenio Pérez 
33700ef422eSEugenio Pérez static void vhost_vdpa_net_data_start_first(VhostVDPAState *s)
33800ef422eSEugenio Pérez {
33900ef422eSEugenio Pérez     struct vhost_vdpa *v = &s->vhost_vdpa;
34000ef422eSEugenio Pérez 
34169498430SEugenio Pérez     add_migration_state_change_notifier(&s->migration_state);
34200ef422eSEugenio Pérez     if (v->shadow_vqs_enabled) {
34300ef422eSEugenio Pérez         v->iova_tree = vhost_iova_tree_new(v->iova_range.first,
34400ef422eSEugenio Pérez                                            v->iova_range.last);
34500ef422eSEugenio Pérez     }
34600ef422eSEugenio Pérez }
34700ef422eSEugenio Pérez 
34800ef422eSEugenio Pérez static int vhost_vdpa_net_data_start(NetClientState *nc)
34900ef422eSEugenio Pérez {
35000ef422eSEugenio Pérez     VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
35100ef422eSEugenio Pérez     struct vhost_vdpa *v = &s->vhost_vdpa;
35200ef422eSEugenio Pérez 
35300ef422eSEugenio Pérez     assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
35400ef422eSEugenio Pérez 
35569498430SEugenio Pérez     if (s->always_svq ||
35669498430SEugenio Pérez         migration_is_setup_or_active(migrate_get_current()->state)) {
35769498430SEugenio Pérez         v->shadow_vqs_enabled = true;
35869498430SEugenio Pérez         v->shadow_data = true;
35969498430SEugenio Pérez     } else {
36069498430SEugenio Pérez         v->shadow_vqs_enabled = false;
36169498430SEugenio Pérez         v->shadow_data = false;
36269498430SEugenio Pérez     }
36369498430SEugenio Pérez 
36400ef422eSEugenio Pérez     if (v->index == 0) {
36500ef422eSEugenio Pérez         vhost_vdpa_net_data_start_first(s);
36600ef422eSEugenio Pérez         return 0;
36700ef422eSEugenio Pérez     }
36800ef422eSEugenio Pérez 
36900ef422eSEugenio Pérez     if (v->shadow_vqs_enabled) {
37000ef422eSEugenio Pérez         VhostVDPAState *s0 = vhost_vdpa_net_first_nc_vdpa(s);
37100ef422eSEugenio Pérez         v->iova_tree = s0->vhost_vdpa.iova_tree;
37200ef422eSEugenio Pérez     }
37300ef422eSEugenio Pérez 
37400ef422eSEugenio Pérez     return 0;
37500ef422eSEugenio Pérez }
37600ef422eSEugenio Pérez 
37700ef422eSEugenio Pérez static void vhost_vdpa_net_client_stop(NetClientState *nc)
37800ef422eSEugenio Pérez {
37900ef422eSEugenio Pérez     VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
38000ef422eSEugenio Pérez     struct vhost_dev *dev;
38100ef422eSEugenio Pérez 
38200ef422eSEugenio Pérez     assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
38300ef422eSEugenio Pérez 
38469498430SEugenio Pérez     if (s->vhost_vdpa.index == 0) {
38569498430SEugenio Pérez         remove_migration_state_change_notifier(&s->migration_state);
38669498430SEugenio Pérez     }
38769498430SEugenio Pérez 
38800ef422eSEugenio Pérez     dev = s->vhost_vdpa.dev;
38900ef422eSEugenio Pérez     if (dev->vq_index + dev->nvqs == dev->vq_index_end) {
39000ef422eSEugenio Pérez         g_clear_pointer(&s->vhost_vdpa.iova_tree, vhost_iova_tree_delete);
39100ef422eSEugenio Pérez     }
39200ef422eSEugenio Pérez }
39300ef422eSEugenio Pérez 
3941e0a84eaSCindy Lu static NetClientInfo net_vhost_vdpa_info = {
3951e0a84eaSCindy Lu         .type = NET_CLIENT_DRIVER_VHOST_VDPA,
3961e0a84eaSCindy Lu         .size = sizeof(VhostVDPAState),
397846a1e85SEugenio Pérez         .receive = vhost_vdpa_receive,
39800ef422eSEugenio Pérez         .start = vhost_vdpa_net_data_start,
39900ef422eSEugenio Pérez         .stop = vhost_vdpa_net_client_stop,
4001e0a84eaSCindy Lu         .cleanup = vhost_vdpa_cleanup,
4011e0a84eaSCindy Lu         .has_vnet_hdr = vhost_vdpa_has_vnet_hdr,
4021e0a84eaSCindy Lu         .has_ufo = vhost_vdpa_has_ufo,
403ee8a1c63SKevin Wolf         .check_peer_type = vhost_vdpa_check_peer_type,
4041e0a84eaSCindy Lu };
4051e0a84eaSCindy Lu 
406152128d6SEugenio Pérez static int64_t vhost_vdpa_get_vring_group(int device_fd, unsigned vq_index,
407152128d6SEugenio Pérez                                           Error **errp)
408c1a10086SEugenio Pérez {
409c1a10086SEugenio Pérez     struct vhost_vring_state state = {
410c1a10086SEugenio Pérez         .index = vq_index,
411c1a10086SEugenio Pérez     };
412c1a10086SEugenio Pérez     int r = ioctl(device_fd, VHOST_VDPA_GET_VRING_GROUP, &state);
413c1a10086SEugenio Pérez 
414c1a10086SEugenio Pérez     if (unlikely(r < 0)) {
4150f2bb0bfSEugenio Pérez         r = -errno;
416152128d6SEugenio Pérez         error_setg_errno(errp, errno, "Cannot get VQ %u group", vq_index);
417c1a10086SEugenio Pérez         return r;
418c1a10086SEugenio Pérez     }
419c1a10086SEugenio Pérez 
420c1a10086SEugenio Pérez     return state.num;
421c1a10086SEugenio Pérez }
422c1a10086SEugenio Pérez 
423c1a10086SEugenio Pérez static int vhost_vdpa_set_address_space_id(struct vhost_vdpa *v,
424c1a10086SEugenio Pérez                                            unsigned vq_group,
425c1a10086SEugenio Pérez                                            unsigned asid_num)
426c1a10086SEugenio Pérez {
427c1a10086SEugenio Pérez     struct vhost_vring_state asid = {
428c1a10086SEugenio Pérez         .index = vq_group,
429c1a10086SEugenio Pérez         .num = asid_num,
430c1a10086SEugenio Pérez     };
431c1a10086SEugenio Pérez     int r;
432c1a10086SEugenio Pérez 
433c1a10086SEugenio Pérez     r = ioctl(v->device_fd, VHOST_VDPA_SET_GROUP_ASID, &asid);
434c1a10086SEugenio Pérez     if (unlikely(r < 0)) {
435c1a10086SEugenio Pérez         error_report("Can't set vq group %u asid %u, errno=%d (%s)",
436c1a10086SEugenio Pérez                      asid.index, asid.num, errno, g_strerror(errno));
437c1a10086SEugenio Pérez     }
438c1a10086SEugenio Pérez     return r;
439c1a10086SEugenio Pérez }
440c1a10086SEugenio Pérez 
4412df4dd31SEugenio Pérez static void vhost_vdpa_cvq_unmap_buf(struct vhost_vdpa *v, void *addr)
4422df4dd31SEugenio Pérez {
4432df4dd31SEugenio Pérez     VhostIOVATree *tree = v->iova_tree;
4442df4dd31SEugenio Pérez     DMAMap needle = {
4452df4dd31SEugenio Pérez         /*
4462df4dd31SEugenio Pérez          * No need to specify size or to look for more translations since
4472df4dd31SEugenio Pérez          * this contiguous chunk was allocated by us.
4482df4dd31SEugenio Pérez          */
4492df4dd31SEugenio Pérez         .translated_addr = (hwaddr)(uintptr_t)addr,
4502df4dd31SEugenio Pérez     };
4512df4dd31SEugenio Pérez     const DMAMap *map = vhost_iova_tree_find_iova(tree, &needle);
4522df4dd31SEugenio Pérez     int r;
4532df4dd31SEugenio Pérez 
4542df4dd31SEugenio Pérez     if (unlikely(!map)) {
4552df4dd31SEugenio Pérez         error_report("Cannot locate expected map");
4562df4dd31SEugenio Pérez         return;
4572df4dd31SEugenio Pérez     }
4582df4dd31SEugenio Pérez 
459cd831ed5SEugenio Pérez     r = vhost_vdpa_dma_unmap(v, v->address_space_id, map->iova, map->size + 1);
4602df4dd31SEugenio Pérez     if (unlikely(r != 0)) {
4612df4dd31SEugenio Pérez         error_report("Device cannot unmap: %s(%d)", g_strerror(r), r);
4622df4dd31SEugenio Pérez     }
4632df4dd31SEugenio Pérez 
46469292a8eSEugenio Pérez     vhost_iova_tree_remove(tree, *map);
4652df4dd31SEugenio Pérez }
4662df4dd31SEugenio Pérez 
4677a7f87e9SEugenio Pérez /** Map CVQ buffer. */
4687a7f87e9SEugenio Pérez static int vhost_vdpa_cvq_map_buf(struct vhost_vdpa *v, void *buf, size_t size,
4697a7f87e9SEugenio Pérez                                   bool write)
4702df4dd31SEugenio Pérez {
4712df4dd31SEugenio Pérez     DMAMap map = {};
4722df4dd31SEugenio Pérez     int r;
4732df4dd31SEugenio Pérez 
4742df4dd31SEugenio Pérez     map.translated_addr = (hwaddr)(uintptr_t)buf;
4757a7f87e9SEugenio Pérez     map.size = size - 1;
4762df4dd31SEugenio Pérez     map.perm = write ? IOMMU_RW : IOMMU_RO,
4772df4dd31SEugenio Pérez     r = vhost_iova_tree_map_alloc(v->iova_tree, &map);
4782df4dd31SEugenio Pérez     if (unlikely(r != IOVA_OK)) {
4792df4dd31SEugenio Pérez         error_report("Cannot map injected element");
4807a7f87e9SEugenio Pérez         return r;
4812df4dd31SEugenio Pérez     }
4822df4dd31SEugenio Pérez 
483cd831ed5SEugenio Pérez     r = vhost_vdpa_dma_map(v, v->address_space_id, map.iova,
484cd831ed5SEugenio Pérez                            vhost_vdpa_net_cvq_cmd_page_len(), buf, !write);
4852df4dd31SEugenio Pérez     if (unlikely(r < 0)) {
4862df4dd31SEugenio Pérez         goto dma_map_err;
4872df4dd31SEugenio Pérez     }
4882df4dd31SEugenio Pérez 
4897a7f87e9SEugenio Pérez     return 0;
4902df4dd31SEugenio Pérez 
4912df4dd31SEugenio Pérez dma_map_err:
49269292a8eSEugenio Pérez     vhost_iova_tree_remove(v->iova_tree, map);
4937a7f87e9SEugenio Pérez     return r;
4942df4dd31SEugenio Pérez }
4952df4dd31SEugenio Pérez 
4967a7f87e9SEugenio Pérez static int vhost_vdpa_net_cvq_start(NetClientState *nc)
4972df4dd31SEugenio Pérez {
49800ef422eSEugenio Pérez     VhostVDPAState *s, *s0;
499c1a10086SEugenio Pérez     struct vhost_vdpa *v;
500c1a10086SEugenio Pérez     int64_t cvq_group;
501152128d6SEugenio Pérez     int r;
502152128d6SEugenio Pérez     Error *err = NULL;
5032df4dd31SEugenio Pérez 
5047a7f87e9SEugenio Pérez     assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
5057a7f87e9SEugenio Pérez 
5067a7f87e9SEugenio Pérez     s = DO_UPCAST(VhostVDPAState, nc, nc);
507c1a10086SEugenio Pérez     v = &s->vhost_vdpa;
508c1a10086SEugenio Pérez 
50969498430SEugenio Pérez     s0 = vhost_vdpa_net_first_nc_vdpa(s);
51069498430SEugenio Pérez     v->shadow_data = s0->vhost_vdpa.shadow_vqs_enabled;
511c1a10086SEugenio Pérez     v->shadow_vqs_enabled = s->always_svq;
512c1a10086SEugenio Pérez     s->vhost_vdpa.address_space_id = VHOST_VDPA_GUEST_PA_ASID;
513c1a10086SEugenio Pérez 
51469498430SEugenio Pérez     if (s->vhost_vdpa.shadow_data) {
515c1a10086SEugenio Pérez         /* SVQ is already configured for all virtqueues */
516c1a10086SEugenio Pérez         goto out;
517c1a10086SEugenio Pérez     }
518c1a10086SEugenio Pérez 
519c1a10086SEugenio Pérez     /*
520c1a10086SEugenio Pérez      * If we early return in these cases SVQ will not be enabled. The migration
521c1a10086SEugenio Pérez      * will be blocked as long as vhost-vdpa backends will not offer _F_LOG.
522c1a10086SEugenio Pérez      */
523152128d6SEugenio Pérez     if (!vhost_vdpa_net_valid_svq_features(v->dev->features, NULL)) {
524c1a10086SEugenio Pérez         return 0;
525c1a10086SEugenio Pérez     }
526c1a10086SEugenio Pérez 
527152128d6SEugenio Pérez     if (!s->cvq_isolated) {
528152128d6SEugenio Pérez         return 0;
529152128d6SEugenio Pérez     }
530152128d6SEugenio Pérez 
531152128d6SEugenio Pérez     cvq_group = vhost_vdpa_get_vring_group(v->device_fd,
532152128d6SEugenio Pérez                                            v->dev->vq_index_end - 1,
533152128d6SEugenio Pérez                                            &err);
534c1a10086SEugenio Pérez     if (unlikely(cvq_group < 0)) {
535152128d6SEugenio Pérez         error_report_err(err);
536c1a10086SEugenio Pérez         return cvq_group;
537c1a10086SEugenio Pérez     }
538c1a10086SEugenio Pérez 
539c1a10086SEugenio Pérez     r = vhost_vdpa_set_address_space_id(v, cvq_group, VHOST_VDPA_NET_CVQ_ASID);
540c1a10086SEugenio Pérez     if (unlikely(r < 0)) {
541c1a10086SEugenio Pérez         return r;
542c1a10086SEugenio Pérez     }
543c1a10086SEugenio Pérez 
544c1a10086SEugenio Pérez     v->shadow_vqs_enabled = true;
545c1a10086SEugenio Pérez     s->vhost_vdpa.address_space_id = VHOST_VDPA_NET_CVQ_ASID;
546c1a10086SEugenio Pérez 
547c1a10086SEugenio Pérez out:
5487a7f87e9SEugenio Pérez     if (!s->vhost_vdpa.shadow_vqs_enabled) {
5497a7f87e9SEugenio Pérez         return 0;
5502df4dd31SEugenio Pérez     }
5512df4dd31SEugenio Pérez 
55200ef422eSEugenio Pérez     if (s0->vhost_vdpa.iova_tree) {
55300ef422eSEugenio Pérez         /*
55400ef422eSEugenio Pérez          * SVQ is already configured for all virtqueues.  Reuse IOVA tree for
55500ef422eSEugenio Pérez          * simplicity, whether CVQ shares ASID with guest or not, because:
55600ef422eSEugenio Pérez          * - Memory listener need access to guest's memory addresses allocated
55700ef422eSEugenio Pérez          *   in the IOVA tree.
55800ef422eSEugenio Pérez          * - There should be plenty of IOVA address space for both ASID not to
55900ef422eSEugenio Pérez          *   worry about collisions between them.  Guest's translations are
56000ef422eSEugenio Pérez          *   still validated with virtio virtqueue_pop so there is no risk for
56100ef422eSEugenio Pérez          *   the guest to access memory that it shouldn't.
56200ef422eSEugenio Pérez          *
56300ef422eSEugenio Pérez          * To allocate a iova tree per ASID is doable but it complicates the
56400ef422eSEugenio Pérez          * code and it is not worth it for the moment.
56500ef422eSEugenio Pérez          */
56600ef422eSEugenio Pérez         v->iova_tree = s0->vhost_vdpa.iova_tree;
56700ef422eSEugenio Pérez     } else {
56800ef422eSEugenio Pérez         v->iova_tree = vhost_iova_tree_new(v->iova_range.first,
56900ef422eSEugenio Pérez                                            v->iova_range.last);
57000ef422eSEugenio Pérez     }
57100ef422eSEugenio Pérez 
5727a7f87e9SEugenio Pérez     r = vhost_vdpa_cvq_map_buf(&s->vhost_vdpa, s->cvq_cmd_out_buffer,
5737a7f87e9SEugenio Pérez                                vhost_vdpa_net_cvq_cmd_page_len(), false);
5747a7f87e9SEugenio Pérez     if (unlikely(r < 0)) {
5757a7f87e9SEugenio Pérez         return r;
5767a7f87e9SEugenio Pérez     }
5777a7f87e9SEugenio Pérez 
57817fb889fSEugenio Pérez     r = vhost_vdpa_cvq_map_buf(&s->vhost_vdpa, s->status,
5797a7f87e9SEugenio Pérez                                vhost_vdpa_net_cvq_cmd_page_len(), true);
5807a7f87e9SEugenio Pérez     if (unlikely(r < 0)) {
5812df4dd31SEugenio Pérez         vhost_vdpa_cvq_unmap_buf(&s->vhost_vdpa, s->cvq_cmd_out_buffer);
5822df4dd31SEugenio Pérez     }
5832df4dd31SEugenio Pérez 
5847a7f87e9SEugenio Pérez     return r;
5857a7f87e9SEugenio Pérez }
5867a7f87e9SEugenio Pérez 
5877a7f87e9SEugenio Pérez static void vhost_vdpa_net_cvq_stop(NetClientState *nc)
5887a7f87e9SEugenio Pérez {
5897a7f87e9SEugenio Pérez     VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
5907a7f87e9SEugenio Pérez 
5917a7f87e9SEugenio Pérez     assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
5927a7f87e9SEugenio Pérez 
5937a7f87e9SEugenio Pérez     if (s->vhost_vdpa.shadow_vqs_enabled) {
5947a7f87e9SEugenio Pérez         vhost_vdpa_cvq_unmap_buf(&s->vhost_vdpa, s->cvq_cmd_out_buffer);
59517fb889fSEugenio Pérez         vhost_vdpa_cvq_unmap_buf(&s->vhost_vdpa, s->status);
596c1a10086SEugenio Pérez     }
59700ef422eSEugenio Pérez 
59800ef422eSEugenio Pérez     vhost_vdpa_net_client_stop(nc);
5992df4dd31SEugenio Pérez }
6002df4dd31SEugenio Pérez 
601be4278b6SEugenio Pérez static ssize_t vhost_vdpa_net_cvq_add(VhostVDPAState *s, size_t out_len,
602be4278b6SEugenio Pérez                                       size_t in_len)
603be4278b6SEugenio Pérez {
604be4278b6SEugenio Pérez     /* Buffers for the device */
605be4278b6SEugenio Pérez     const struct iovec out = {
606be4278b6SEugenio Pérez         .iov_base = s->cvq_cmd_out_buffer,
607be4278b6SEugenio Pérez         .iov_len = out_len,
608be4278b6SEugenio Pérez     };
609be4278b6SEugenio Pérez     const struct iovec in = {
61017fb889fSEugenio Pérez         .iov_base = s->status,
611be4278b6SEugenio Pérez         .iov_len = sizeof(virtio_net_ctrl_ack),
612be4278b6SEugenio Pérez     };
613be4278b6SEugenio Pérez     VhostShadowVirtqueue *svq = g_ptr_array_index(s->vhost_vdpa.shadow_vqs, 0);
614be4278b6SEugenio Pérez     int r;
615be4278b6SEugenio Pérez 
616be4278b6SEugenio Pérez     r = vhost_svq_add(svq, &out, 1, &in, 1, NULL);
617be4278b6SEugenio Pérez     if (unlikely(r != 0)) {
618be4278b6SEugenio Pérez         if (unlikely(r == -ENOSPC)) {
619be4278b6SEugenio Pérez             qemu_log_mask(LOG_GUEST_ERROR, "%s: No space on device queue\n",
620be4278b6SEugenio Pérez                           __func__);
621be4278b6SEugenio Pérez         }
622be4278b6SEugenio Pérez         return r;
623be4278b6SEugenio Pérez     }
624be4278b6SEugenio Pérez 
625be4278b6SEugenio Pérez     /*
626be4278b6SEugenio Pérez      * We can poll here since we've had BQL from the time we sent the
627be4278b6SEugenio Pérez      * descriptor. Also, we need to take the answer before SVQ pulls by itself,
628be4278b6SEugenio Pérez      * when BQL is released
629be4278b6SEugenio Pérez      */
630be4278b6SEugenio Pérez     return vhost_svq_poll(svq);
631be4278b6SEugenio Pérez }
632be4278b6SEugenio Pérez 
633f73c0c43SEugenio Pérez static ssize_t vhost_vdpa_net_load_cmd(VhostVDPAState *s, uint8_t class,
6342848c6aaSHawkins Jiawei                                        uint8_t cmd, const struct iovec *data_sg,
6352848c6aaSHawkins Jiawei                                        size_t data_num)
636f73c0c43SEugenio Pérez {
637f73c0c43SEugenio Pérez     const struct virtio_net_ctrl_hdr ctrl = {
638f73c0c43SEugenio Pérez         .class = class,
639f73c0c43SEugenio Pérez         .cmd = cmd,
640f73c0c43SEugenio Pérez     };
6412848c6aaSHawkins Jiawei     size_t data_size = iov_size(data_sg, data_num);
642f73c0c43SEugenio Pérez 
643f73c0c43SEugenio Pérez     assert(data_size < vhost_vdpa_net_cvq_cmd_page_len() - sizeof(ctrl));
644f73c0c43SEugenio Pérez 
6452848c6aaSHawkins Jiawei     /* pack the CVQ command header */
646f73c0c43SEugenio Pérez     memcpy(s->cvq_cmd_out_buffer, &ctrl, sizeof(ctrl));
647f73c0c43SEugenio Pérez 
6482848c6aaSHawkins Jiawei     /* pack the CVQ command command-specific-data */
6492848c6aaSHawkins Jiawei     iov_to_buf(data_sg, data_num, 0,
6502848c6aaSHawkins Jiawei                s->cvq_cmd_out_buffer + sizeof(ctrl), data_size);
6512848c6aaSHawkins Jiawei 
6522848c6aaSHawkins Jiawei     return vhost_vdpa_net_cvq_add(s, data_size + sizeof(ctrl),
653f73c0c43SEugenio Pérez                                   sizeof(virtio_net_ctrl_ack));
654f73c0c43SEugenio Pérez }
655f73c0c43SEugenio Pérez 
656f73c0c43SEugenio Pérez static int vhost_vdpa_net_load_mac(VhostVDPAState *s, const VirtIONet *n)
657f73c0c43SEugenio Pérez {
65802d3bf09SHawkins Jiawei     if (virtio_vdev_has_feature(&n->parent_obj, VIRTIO_NET_F_CTRL_MAC_ADDR)) {
6592848c6aaSHawkins Jiawei         const struct iovec data = {
6602848c6aaSHawkins Jiawei             .iov_base = (void *)n->mac,
6612848c6aaSHawkins Jiawei             .iov_len = sizeof(n->mac),
6622848c6aaSHawkins Jiawei         };
663f73c0c43SEugenio Pérez         ssize_t dev_written = vhost_vdpa_net_load_cmd(s, VIRTIO_NET_CTRL_MAC,
664f73c0c43SEugenio Pérez                                                   VIRTIO_NET_CTRL_MAC_ADDR_SET,
6652848c6aaSHawkins Jiawei                                                   &data, 1);
666f73c0c43SEugenio Pérez         if (unlikely(dev_written < 0)) {
667f73c0c43SEugenio Pérez             return dev_written;
668f73c0c43SEugenio Pérez         }
669b479bc3cSHawkins Jiawei         if (*s->status != VIRTIO_NET_OK) {
670b479bc3cSHawkins Jiawei             return -EIO;
671b479bc3cSHawkins Jiawei         }
672f73c0c43SEugenio Pérez     }
673f73c0c43SEugenio Pérez 
6740ddcecb8SHawkins Jiawei     /*
6750ddcecb8SHawkins Jiawei      * According to VirtIO standard, "The device MUST have an
6760ddcecb8SHawkins Jiawei      * empty MAC filtering table on reset.".
6770ddcecb8SHawkins Jiawei      *
6780ddcecb8SHawkins Jiawei      * Therefore, there is no need to send this CVQ command if the
6790ddcecb8SHawkins Jiawei      * driver also sets an empty MAC filter table, which aligns with
6800ddcecb8SHawkins Jiawei      * the device's defaults.
6810ddcecb8SHawkins Jiawei      *
6820ddcecb8SHawkins Jiawei      * Note that the device's defaults can mismatch the driver's
6830ddcecb8SHawkins Jiawei      * configuration only at live migration.
6840ddcecb8SHawkins Jiawei      */
6850ddcecb8SHawkins Jiawei     if (!virtio_vdev_has_feature(&n->parent_obj, VIRTIO_NET_F_CTRL_RX) ||
6860ddcecb8SHawkins Jiawei         n->mac_table.in_use == 0) {
6870ddcecb8SHawkins Jiawei         return 0;
6880ddcecb8SHawkins Jiawei     }
6890ddcecb8SHawkins Jiawei 
6900ddcecb8SHawkins Jiawei     uint32_t uni_entries = n->mac_table.first_multi,
6910ddcecb8SHawkins Jiawei              uni_macs_size = uni_entries * ETH_ALEN,
6920ddcecb8SHawkins Jiawei              mul_entries = n->mac_table.in_use - uni_entries,
6930ddcecb8SHawkins Jiawei              mul_macs_size = mul_entries * ETH_ALEN;
6940ddcecb8SHawkins Jiawei     struct virtio_net_ctrl_mac uni = {
6950ddcecb8SHawkins Jiawei         .entries = cpu_to_le32(uni_entries),
6960ddcecb8SHawkins Jiawei     };
6970ddcecb8SHawkins Jiawei     struct virtio_net_ctrl_mac mul = {
6980ddcecb8SHawkins Jiawei         .entries = cpu_to_le32(mul_entries),
6990ddcecb8SHawkins Jiawei     };
7000ddcecb8SHawkins Jiawei     const struct iovec data[] = {
7010ddcecb8SHawkins Jiawei         {
7020ddcecb8SHawkins Jiawei             .iov_base = &uni,
7030ddcecb8SHawkins Jiawei             .iov_len = sizeof(uni),
7040ddcecb8SHawkins Jiawei         }, {
7050ddcecb8SHawkins Jiawei             .iov_base = n->mac_table.macs,
7060ddcecb8SHawkins Jiawei             .iov_len = uni_macs_size,
7070ddcecb8SHawkins Jiawei         }, {
7080ddcecb8SHawkins Jiawei             .iov_base = &mul,
7090ddcecb8SHawkins Jiawei             .iov_len = sizeof(mul),
7100ddcecb8SHawkins Jiawei         }, {
7110ddcecb8SHawkins Jiawei             .iov_base = &n->mac_table.macs[uni_macs_size],
7120ddcecb8SHawkins Jiawei             .iov_len = mul_macs_size,
7130ddcecb8SHawkins Jiawei         },
7140ddcecb8SHawkins Jiawei     };
7150ddcecb8SHawkins Jiawei     ssize_t dev_written = vhost_vdpa_net_load_cmd(s,
7160ddcecb8SHawkins Jiawei                                 VIRTIO_NET_CTRL_MAC,
7170ddcecb8SHawkins Jiawei                                 VIRTIO_NET_CTRL_MAC_TABLE_SET,
7180ddcecb8SHawkins Jiawei                                 data, ARRAY_SIZE(data));
7190ddcecb8SHawkins Jiawei     if (unlikely(dev_written < 0)) {
7200ddcecb8SHawkins Jiawei         return dev_written;
7210ddcecb8SHawkins Jiawei     }
7220ddcecb8SHawkins Jiawei     if (*s->status != VIRTIO_NET_OK) {
7230ddcecb8SHawkins Jiawei         return -EIO;
7240ddcecb8SHawkins Jiawei     }
7250ddcecb8SHawkins Jiawei 
726f73c0c43SEugenio Pérez     return 0;
727f73c0c43SEugenio Pérez }
728f73c0c43SEugenio Pérez 
729f64c7cdaSEugenio Pérez static int vhost_vdpa_net_load_mq(VhostVDPAState *s,
730f64c7cdaSEugenio Pérez                                   const VirtIONet *n)
731f64c7cdaSEugenio Pérez {
732f64c7cdaSEugenio Pérez     struct virtio_net_ctrl_mq mq;
733f64c7cdaSEugenio Pérez     ssize_t dev_written;
734f64c7cdaSEugenio Pérez 
73502d3bf09SHawkins Jiawei     if (!virtio_vdev_has_feature(&n->parent_obj, VIRTIO_NET_F_MQ)) {
736f64c7cdaSEugenio Pérez         return 0;
737f64c7cdaSEugenio Pérez     }
738f64c7cdaSEugenio Pérez 
739f64c7cdaSEugenio Pérez     mq.virtqueue_pairs = cpu_to_le16(n->curr_queue_pairs);
7402848c6aaSHawkins Jiawei     const struct iovec data = {
7412848c6aaSHawkins Jiawei         .iov_base = &mq,
7422848c6aaSHawkins Jiawei         .iov_len = sizeof(mq),
7432848c6aaSHawkins Jiawei     };
744f64c7cdaSEugenio Pérez     dev_written = vhost_vdpa_net_load_cmd(s, VIRTIO_NET_CTRL_MQ,
7452848c6aaSHawkins Jiawei                                           VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET,
7462848c6aaSHawkins Jiawei                                           &data, 1);
747f64c7cdaSEugenio Pérez     if (unlikely(dev_written < 0)) {
748f64c7cdaSEugenio Pérez         return dev_written;
749f64c7cdaSEugenio Pérez     }
750f45fd95eSHawkins Jiawei     if (*s->status != VIRTIO_NET_OK) {
751f45fd95eSHawkins Jiawei         return -EIO;
752f45fd95eSHawkins Jiawei     }
753f64c7cdaSEugenio Pérez 
754f45fd95eSHawkins Jiawei     return 0;
755f64c7cdaSEugenio Pérez }
756f64c7cdaSEugenio Pérez 
7570b58d368SHawkins Jiawei static int vhost_vdpa_net_load_offloads(VhostVDPAState *s,
7580b58d368SHawkins Jiawei                                         const VirtIONet *n)
7590b58d368SHawkins Jiawei {
7600b58d368SHawkins Jiawei     uint64_t offloads;
7610b58d368SHawkins Jiawei     ssize_t dev_written;
7620b58d368SHawkins Jiawei 
7630b58d368SHawkins Jiawei     if (!virtio_vdev_has_feature(&n->parent_obj,
7640b58d368SHawkins Jiawei                                  VIRTIO_NET_F_CTRL_GUEST_OFFLOADS)) {
7650b58d368SHawkins Jiawei         return 0;
7660b58d368SHawkins Jiawei     }
7670b58d368SHawkins Jiawei 
7680b58d368SHawkins Jiawei     if (n->curr_guest_offloads == virtio_net_supported_guest_offloads(n)) {
7690b58d368SHawkins Jiawei         /*
7700b58d368SHawkins Jiawei          * According to VirtIO standard, "Upon feature negotiation
7710b58d368SHawkins Jiawei          * corresponding offload gets enabled to preserve
7720b58d368SHawkins Jiawei          * backward compatibility.".
7730b58d368SHawkins Jiawei          *
7740b58d368SHawkins Jiawei          * Therefore, there is no need to send this CVQ command if the
7750b58d368SHawkins Jiawei          * driver also enables all supported offloads, which aligns with
7760b58d368SHawkins Jiawei          * the device's defaults.
7770b58d368SHawkins Jiawei          *
7780b58d368SHawkins Jiawei          * Note that the device's defaults can mismatch the driver's
7790b58d368SHawkins Jiawei          * configuration only at live migration.
7800b58d368SHawkins Jiawei          */
7810b58d368SHawkins Jiawei         return 0;
7820b58d368SHawkins Jiawei     }
7830b58d368SHawkins Jiawei 
7840b58d368SHawkins Jiawei     offloads = cpu_to_le64(n->curr_guest_offloads);
7852848c6aaSHawkins Jiawei     const struct iovec data = {
7862848c6aaSHawkins Jiawei         .iov_base = &offloads,
7872848c6aaSHawkins Jiawei         .iov_len = sizeof(offloads),
7882848c6aaSHawkins Jiawei     };
7890b58d368SHawkins Jiawei     dev_written = vhost_vdpa_net_load_cmd(s, VIRTIO_NET_CTRL_GUEST_OFFLOADS,
7900b58d368SHawkins Jiawei                                           VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET,
7912848c6aaSHawkins Jiawei                                           &data, 1);
7920b58d368SHawkins Jiawei     if (unlikely(dev_written < 0)) {
7930b58d368SHawkins Jiawei         return dev_written;
7940b58d368SHawkins Jiawei     }
7956f348071SHawkins Jiawei     if (*s->status != VIRTIO_NET_OK) {
7966f348071SHawkins Jiawei         return -EIO;
7976f348071SHawkins Jiawei     }
7980b58d368SHawkins Jiawei 
7996f348071SHawkins Jiawei     return 0;
8000b58d368SHawkins Jiawei }
8010b58d368SHawkins Jiawei 
802b12f907eSHawkins Jiawei static int vhost_vdpa_net_load_rx_mode(VhostVDPAState *s,
803b12f907eSHawkins Jiawei                                        uint8_t cmd,
804b12f907eSHawkins Jiawei                                        uint8_t on)
805b12f907eSHawkins Jiawei {
806b12f907eSHawkins Jiawei     const struct iovec data = {
807b12f907eSHawkins Jiawei         .iov_base = &on,
808b12f907eSHawkins Jiawei         .iov_len = sizeof(on),
809b12f907eSHawkins Jiawei     };
810b12f907eSHawkins Jiawei     return vhost_vdpa_net_load_cmd(s, VIRTIO_NET_CTRL_RX,
811b12f907eSHawkins Jiawei                                    cmd, &data, 1);
812b12f907eSHawkins Jiawei }
813b12f907eSHawkins Jiawei 
814b12f907eSHawkins Jiawei static int vhost_vdpa_net_load_rx(VhostVDPAState *s,
815b12f907eSHawkins Jiawei                                   const VirtIONet *n)
816b12f907eSHawkins Jiawei {
817b12f907eSHawkins Jiawei     ssize_t dev_written;
818b12f907eSHawkins Jiawei 
819b12f907eSHawkins Jiawei     if (!virtio_vdev_has_feature(&n->parent_obj, VIRTIO_NET_F_CTRL_RX)) {
820b12f907eSHawkins Jiawei         return 0;
821b12f907eSHawkins Jiawei     }
822b12f907eSHawkins Jiawei 
823b12f907eSHawkins Jiawei     /*
824b12f907eSHawkins Jiawei      * According to virtio_net_reset(), device turns promiscuous mode
825b12f907eSHawkins Jiawei      * on by default.
826b12f907eSHawkins Jiawei      *
8270a19d879SMichael Tokarev      * Additionally, according to VirtIO standard, "Since there are
828b12f907eSHawkins Jiawei      * no guarantees, it can use a hash filter or silently switch to
829b12f907eSHawkins Jiawei      * allmulti or promiscuous mode if it is given too many addresses.".
830b12f907eSHawkins Jiawei      * QEMU marks `n->mac_table.uni_overflow` if guest sets too many
831b12f907eSHawkins Jiawei      * non-multicast MAC addresses, indicating that promiscuous mode
832b12f907eSHawkins Jiawei      * should be enabled.
833b12f907eSHawkins Jiawei      *
834b12f907eSHawkins Jiawei      * Therefore, QEMU should only send this CVQ command if the
835b12f907eSHawkins Jiawei      * `n->mac_table.uni_overflow` is not marked and `n->promisc` is off,
836b12f907eSHawkins Jiawei      * which sets promiscuous mode on, different from the device's defaults.
837b12f907eSHawkins Jiawei      *
838b12f907eSHawkins Jiawei      * Note that the device's defaults can mismatch the driver's
839b12f907eSHawkins Jiawei      * configuration only at live migration.
840b12f907eSHawkins Jiawei      */
841b12f907eSHawkins Jiawei     if (!n->mac_table.uni_overflow && !n->promisc) {
842b12f907eSHawkins Jiawei         dev_written = vhost_vdpa_net_load_rx_mode(s,
843b12f907eSHawkins Jiawei                                             VIRTIO_NET_CTRL_RX_PROMISC, 0);
844b12f907eSHawkins Jiawei         if (unlikely(dev_written < 0)) {
845b12f907eSHawkins Jiawei             return dev_written;
846b12f907eSHawkins Jiawei         }
847b12f907eSHawkins Jiawei         if (*s->status != VIRTIO_NET_OK) {
848b12f907eSHawkins Jiawei             return -EIO;
849b12f907eSHawkins Jiawei         }
850b12f907eSHawkins Jiawei     }
851b12f907eSHawkins Jiawei 
852b12f907eSHawkins Jiawei     /*
853b12f907eSHawkins Jiawei      * According to virtio_net_reset(), device turns all-multicast mode
854b12f907eSHawkins Jiawei      * off by default.
855b12f907eSHawkins Jiawei      *
856b12f907eSHawkins Jiawei      * According to VirtIO standard, "Since there are no guarantees,
857b12f907eSHawkins Jiawei      * it can use a hash filter or silently switch to allmulti or
858b12f907eSHawkins Jiawei      * promiscuous mode if it is given too many addresses.". QEMU marks
859b12f907eSHawkins Jiawei      * `n->mac_table.multi_overflow` if guest sets too many
860b12f907eSHawkins Jiawei      * non-multicast MAC addresses.
861b12f907eSHawkins Jiawei      *
862b12f907eSHawkins Jiawei      * Therefore, QEMU should only send this CVQ command if the
863b12f907eSHawkins Jiawei      * `n->mac_table.multi_overflow` is marked or `n->allmulti` is on,
864b12f907eSHawkins Jiawei      * which sets all-multicast mode on, different from the device's defaults.
865b12f907eSHawkins Jiawei      *
866b12f907eSHawkins Jiawei      * Note that the device's defaults can mismatch the driver's
867b12f907eSHawkins Jiawei      * configuration only at live migration.
868b12f907eSHawkins Jiawei      */
869b12f907eSHawkins Jiawei     if (n->mac_table.multi_overflow || n->allmulti) {
870b12f907eSHawkins Jiawei         dev_written = vhost_vdpa_net_load_rx_mode(s,
871b12f907eSHawkins Jiawei                                             VIRTIO_NET_CTRL_RX_ALLMULTI, 1);
872b12f907eSHawkins Jiawei         if (unlikely(dev_written < 0)) {
873b12f907eSHawkins Jiawei             return dev_written;
874b12f907eSHawkins Jiawei         }
875b12f907eSHawkins Jiawei         if (*s->status != VIRTIO_NET_OK) {
876b12f907eSHawkins Jiawei             return -EIO;
877b12f907eSHawkins Jiawei         }
878b12f907eSHawkins Jiawei     }
879b12f907eSHawkins Jiawei 
8804fd180c7SHawkins Jiawei     if (!virtio_vdev_has_feature(&n->parent_obj, VIRTIO_NET_F_CTRL_RX_EXTRA)) {
8814fd180c7SHawkins Jiawei         return 0;
8824fd180c7SHawkins Jiawei     }
8834fd180c7SHawkins Jiawei 
8844fd180c7SHawkins Jiawei     /*
8854fd180c7SHawkins Jiawei      * According to virtio_net_reset(), device turns all-unicast mode
8864fd180c7SHawkins Jiawei      * off by default.
8874fd180c7SHawkins Jiawei      *
8884fd180c7SHawkins Jiawei      * Therefore, QEMU should only send this CVQ command if the driver
8894fd180c7SHawkins Jiawei      * sets all-unicast mode on, different from the device's defaults.
8904fd180c7SHawkins Jiawei      *
8914fd180c7SHawkins Jiawei      * Note that the device's defaults can mismatch the driver's
8924fd180c7SHawkins Jiawei      * configuration only at live migration.
8934fd180c7SHawkins Jiawei      */
8944fd180c7SHawkins Jiawei     if (n->alluni) {
8954fd180c7SHawkins Jiawei         dev_written = vhost_vdpa_net_load_rx_mode(s,
8964fd180c7SHawkins Jiawei                                             VIRTIO_NET_CTRL_RX_ALLUNI, 1);
8974fd180c7SHawkins Jiawei         if (dev_written < 0) {
8984fd180c7SHawkins Jiawei             return dev_written;
8994fd180c7SHawkins Jiawei         }
9004fd180c7SHawkins Jiawei         if (*s->status != VIRTIO_NET_OK) {
9014fd180c7SHawkins Jiawei             return -EIO;
9024fd180c7SHawkins Jiawei         }
9034fd180c7SHawkins Jiawei     }
9044fd180c7SHawkins Jiawei 
9054fd180c7SHawkins Jiawei     /*
9064fd180c7SHawkins Jiawei      * According to virtio_net_reset(), device turns non-multicast mode
9074fd180c7SHawkins Jiawei      * off by default.
9084fd180c7SHawkins Jiawei      *
9094fd180c7SHawkins Jiawei      * Therefore, QEMU should only send this CVQ command if the driver
9104fd180c7SHawkins Jiawei      * sets non-multicast mode on, different from the device's defaults.
9114fd180c7SHawkins Jiawei      *
9124fd180c7SHawkins Jiawei      * Note that the device's defaults can mismatch the driver's
9134fd180c7SHawkins Jiawei      * configuration only at live migration.
9144fd180c7SHawkins Jiawei      */
9154fd180c7SHawkins Jiawei     if (n->nomulti) {
9164fd180c7SHawkins Jiawei         dev_written = vhost_vdpa_net_load_rx_mode(s,
9174fd180c7SHawkins Jiawei                                             VIRTIO_NET_CTRL_RX_NOMULTI, 1);
9184fd180c7SHawkins Jiawei         if (dev_written < 0) {
9194fd180c7SHawkins Jiawei             return dev_written;
9204fd180c7SHawkins Jiawei         }
9214fd180c7SHawkins Jiawei         if (*s->status != VIRTIO_NET_OK) {
9224fd180c7SHawkins Jiawei             return -EIO;
9234fd180c7SHawkins Jiawei         }
9244fd180c7SHawkins Jiawei     }
9254fd180c7SHawkins Jiawei 
9264fd180c7SHawkins Jiawei     /*
9274fd180c7SHawkins Jiawei      * According to virtio_net_reset(), device turns non-unicast mode
9284fd180c7SHawkins Jiawei      * off by default.
9294fd180c7SHawkins Jiawei      *
9304fd180c7SHawkins Jiawei      * Therefore, QEMU should only send this CVQ command if the driver
9314fd180c7SHawkins Jiawei      * sets non-unicast mode on, different from the device's defaults.
9324fd180c7SHawkins Jiawei      *
9334fd180c7SHawkins Jiawei      * Note that the device's defaults can mismatch the driver's
9344fd180c7SHawkins Jiawei      * configuration only at live migration.
9354fd180c7SHawkins Jiawei      */
9364fd180c7SHawkins Jiawei     if (n->nouni) {
9374fd180c7SHawkins Jiawei         dev_written = vhost_vdpa_net_load_rx_mode(s,
9384fd180c7SHawkins Jiawei                                             VIRTIO_NET_CTRL_RX_NOUNI, 1);
9394fd180c7SHawkins Jiawei         if (dev_written < 0) {
9404fd180c7SHawkins Jiawei             return dev_written;
9414fd180c7SHawkins Jiawei         }
9424fd180c7SHawkins Jiawei         if (*s->status != VIRTIO_NET_OK) {
9434fd180c7SHawkins Jiawei             return -EIO;
9444fd180c7SHawkins Jiawei         }
9454fd180c7SHawkins Jiawei     }
9464fd180c7SHawkins Jiawei 
9474fd180c7SHawkins Jiawei     /*
9484fd180c7SHawkins Jiawei      * According to virtio_net_reset(), device turns non-broadcast mode
9494fd180c7SHawkins Jiawei      * off by default.
9504fd180c7SHawkins Jiawei      *
9514fd180c7SHawkins Jiawei      * Therefore, QEMU should only send this CVQ command if the driver
9524fd180c7SHawkins Jiawei      * sets non-broadcast mode on, different from the device's defaults.
9534fd180c7SHawkins Jiawei      *
9544fd180c7SHawkins Jiawei      * Note that the device's defaults can mismatch the driver's
9554fd180c7SHawkins Jiawei      * configuration only at live migration.
9564fd180c7SHawkins Jiawei      */
9574fd180c7SHawkins Jiawei     if (n->nobcast) {
9584fd180c7SHawkins Jiawei         dev_written = vhost_vdpa_net_load_rx_mode(s,
9594fd180c7SHawkins Jiawei                                             VIRTIO_NET_CTRL_RX_NOBCAST, 1);
9604fd180c7SHawkins Jiawei         if (dev_written < 0) {
9614fd180c7SHawkins Jiawei             return dev_written;
9624fd180c7SHawkins Jiawei         }
9634fd180c7SHawkins Jiawei         if (*s->status != VIRTIO_NET_OK) {
9644fd180c7SHawkins Jiawei             return -EIO;
9654fd180c7SHawkins Jiawei         }
9664fd180c7SHawkins Jiawei     }
9674fd180c7SHawkins Jiawei 
968b12f907eSHawkins Jiawei     return 0;
969b12f907eSHawkins Jiawei }
970b12f907eSHawkins Jiawei 
971dd036d8dSEugenio Pérez static int vhost_vdpa_net_load(NetClientState *nc)
972dd036d8dSEugenio Pérez {
973dd036d8dSEugenio Pérez     VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
974f73c0c43SEugenio Pérez     struct vhost_vdpa *v = &s->vhost_vdpa;
975dd036d8dSEugenio Pérez     const VirtIONet *n;
976f73c0c43SEugenio Pérez     int r;
977dd036d8dSEugenio Pérez 
978dd036d8dSEugenio Pérez     assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
979dd036d8dSEugenio Pérez 
980dd036d8dSEugenio Pérez     if (!v->shadow_vqs_enabled) {
981dd036d8dSEugenio Pérez         return 0;
982dd036d8dSEugenio Pérez     }
983dd036d8dSEugenio Pérez 
984dd036d8dSEugenio Pérez     n = VIRTIO_NET(v->dev->vdev);
985f73c0c43SEugenio Pérez     r = vhost_vdpa_net_load_mac(s, n);
986f73c0c43SEugenio Pérez     if (unlikely(r < 0)) {
987f73c0c43SEugenio Pérez         return r;
988dd036d8dSEugenio Pérez     }
989f64c7cdaSEugenio Pérez     r = vhost_vdpa_net_load_mq(s, n);
990f64c7cdaSEugenio Pérez     if (unlikely(r)) {
991f64c7cdaSEugenio Pérez         return r;
992f64c7cdaSEugenio Pérez     }
9930b58d368SHawkins Jiawei     r = vhost_vdpa_net_load_offloads(s, n);
9940b58d368SHawkins Jiawei     if (unlikely(r)) {
9950b58d368SHawkins Jiawei         return r;
9960b58d368SHawkins Jiawei     }
997b12f907eSHawkins Jiawei     r = vhost_vdpa_net_load_rx(s, n);
998b12f907eSHawkins Jiawei     if (unlikely(r)) {
999b12f907eSHawkins Jiawei         return r;
1000b12f907eSHawkins Jiawei     }
1001dd036d8dSEugenio Pérez 
1002dd036d8dSEugenio Pérez     return 0;
1003dd036d8dSEugenio Pérez }
1004dd036d8dSEugenio Pérez 
1005f8972b56SEugenio Pérez static NetClientInfo net_vhost_vdpa_cvq_info = {
1006f8972b56SEugenio Pérez     .type = NET_CLIENT_DRIVER_VHOST_VDPA,
1007f8972b56SEugenio Pérez     .size = sizeof(VhostVDPAState),
1008f8972b56SEugenio Pérez     .receive = vhost_vdpa_receive,
10097a7f87e9SEugenio Pérez     .start = vhost_vdpa_net_cvq_start,
1010dd036d8dSEugenio Pérez     .load = vhost_vdpa_net_load,
10117a7f87e9SEugenio Pérez     .stop = vhost_vdpa_net_cvq_stop,
1012f8972b56SEugenio Pérez     .cleanup = vhost_vdpa_cleanup,
1013f8972b56SEugenio Pérez     .has_vnet_hdr = vhost_vdpa_has_vnet_hdr,
1014f8972b56SEugenio Pérez     .has_ufo = vhost_vdpa_has_ufo,
1015f8972b56SEugenio Pérez     .check_peer_type = vhost_vdpa_check_peer_type,
1016f8972b56SEugenio Pérez };
1017f8972b56SEugenio Pérez 
1018fee364e4SHawkins Jiawei /*
1019fee364e4SHawkins Jiawei  * Forward the excessive VIRTIO_NET_CTRL_MAC_TABLE_SET CVQ command to
1020fee364e4SHawkins Jiawei  * vdpa device.
1021fee364e4SHawkins Jiawei  *
1022fee364e4SHawkins Jiawei  * Considering that QEMU cannot send the entire filter table to the
1023fee364e4SHawkins Jiawei  * vdpa device, it should send the VIRTIO_NET_CTRL_RX_PROMISC CVQ
1024fee364e4SHawkins Jiawei  * command to enable promiscuous mode to receive all packets,
1025fee364e4SHawkins Jiawei  * according to VirtIO standard, "Since there are no guarantees,
1026fee364e4SHawkins Jiawei  * it can use a hash filter or silently switch to allmulti or
1027fee364e4SHawkins Jiawei  * promiscuous mode if it is given too many addresses.".
1028fee364e4SHawkins Jiawei  *
1029fee364e4SHawkins Jiawei  * Since QEMU ignores MAC addresses beyond `MAC_TABLE_ENTRIES` and
1030fee364e4SHawkins Jiawei  * marks `n->mac_table.x_overflow` accordingly, it should have
1031fee364e4SHawkins Jiawei  * the same effect on the device model to receive
1032fee364e4SHawkins Jiawei  * (`MAC_TABLE_ENTRIES` + 1) or more non-multicast MAC addresses.
1033fee364e4SHawkins Jiawei  * The same applies to multicast MAC addresses.
1034fee364e4SHawkins Jiawei  *
1035fee364e4SHawkins Jiawei  * Therefore, QEMU can provide the device model with a fake
1036fee364e4SHawkins Jiawei  * VIRTIO_NET_CTRL_MAC_TABLE_SET command with (`MAC_TABLE_ENTRIES` + 1)
1037fee364e4SHawkins Jiawei  * non-multicast MAC addresses and (`MAC_TABLE_ENTRIES` + 1) multicast
1038fee364e4SHawkins Jiawei  * MAC addresses. This ensures that the device model marks
1039fee364e4SHawkins Jiawei  * `n->mac_table.uni_overflow` and `n->mac_table.multi_overflow`,
1040fee364e4SHawkins Jiawei  * allowing all packets to be received, which aligns with the
1041fee364e4SHawkins Jiawei  * state of the vdpa device.
1042fee364e4SHawkins Jiawei  */
1043fee364e4SHawkins Jiawei static int vhost_vdpa_net_excessive_mac_filter_cvq_add(VhostVDPAState *s,
1044fee364e4SHawkins Jiawei                                                        VirtQueueElement *elem,
1045fee364e4SHawkins Jiawei                                                        struct iovec *out)
1046fee364e4SHawkins Jiawei {
1047fee364e4SHawkins Jiawei     struct virtio_net_ctrl_mac mac_data, *mac_ptr;
1048fee364e4SHawkins Jiawei     struct virtio_net_ctrl_hdr *hdr_ptr;
1049fee364e4SHawkins Jiawei     uint32_t cursor;
1050fee364e4SHawkins Jiawei     ssize_t r;
1051fee364e4SHawkins Jiawei 
1052fee364e4SHawkins Jiawei     /* parse the non-multicast MAC address entries from CVQ command */
1053fee364e4SHawkins Jiawei     cursor = sizeof(*hdr_ptr);
1054fee364e4SHawkins Jiawei     r = iov_to_buf(elem->out_sg, elem->out_num, cursor,
1055fee364e4SHawkins Jiawei                    &mac_data, sizeof(mac_data));
1056fee364e4SHawkins Jiawei     if (unlikely(r != sizeof(mac_data))) {
1057fee364e4SHawkins Jiawei         /*
1058fee364e4SHawkins Jiawei          * If the CVQ command is invalid, we should simulate the vdpa device
1059fee364e4SHawkins Jiawei          * to reject the VIRTIO_NET_CTRL_MAC_TABLE_SET CVQ command
1060fee364e4SHawkins Jiawei          */
1061fee364e4SHawkins Jiawei         *s->status = VIRTIO_NET_ERR;
1062fee364e4SHawkins Jiawei         return sizeof(*s->status);
1063fee364e4SHawkins Jiawei     }
1064fee364e4SHawkins Jiawei     cursor += sizeof(mac_data) + le32_to_cpu(mac_data.entries) * ETH_ALEN;
1065fee364e4SHawkins Jiawei 
1066fee364e4SHawkins Jiawei     /* parse the multicast MAC address entries from CVQ command */
1067fee364e4SHawkins Jiawei     r = iov_to_buf(elem->out_sg, elem->out_num, cursor,
1068fee364e4SHawkins Jiawei                    &mac_data, sizeof(mac_data));
1069fee364e4SHawkins Jiawei     if (r != sizeof(mac_data)) {
1070fee364e4SHawkins Jiawei         /*
1071fee364e4SHawkins Jiawei          * If the CVQ command is invalid, we should simulate the vdpa device
1072fee364e4SHawkins Jiawei          * to reject the VIRTIO_NET_CTRL_MAC_TABLE_SET CVQ command
1073fee364e4SHawkins Jiawei          */
1074fee364e4SHawkins Jiawei         *s->status = VIRTIO_NET_ERR;
1075fee364e4SHawkins Jiawei         return sizeof(*s->status);
1076fee364e4SHawkins Jiawei     }
1077fee364e4SHawkins Jiawei     cursor += sizeof(mac_data) + le32_to_cpu(mac_data.entries) * ETH_ALEN;
1078fee364e4SHawkins Jiawei 
1079fee364e4SHawkins Jiawei     /* validate the CVQ command */
1080fee364e4SHawkins Jiawei     if (iov_size(elem->out_sg, elem->out_num) != cursor) {
1081fee364e4SHawkins Jiawei         /*
1082fee364e4SHawkins Jiawei          * If the CVQ command is invalid, we should simulate the vdpa device
1083fee364e4SHawkins Jiawei          * to reject the VIRTIO_NET_CTRL_MAC_TABLE_SET CVQ command
1084fee364e4SHawkins Jiawei          */
1085fee364e4SHawkins Jiawei         *s->status = VIRTIO_NET_ERR;
1086fee364e4SHawkins Jiawei         return sizeof(*s->status);
1087fee364e4SHawkins Jiawei     }
1088fee364e4SHawkins Jiawei 
1089fee364e4SHawkins Jiawei     /*
1090fee364e4SHawkins Jiawei      * According to VirtIO standard, "Since there are no guarantees,
1091fee364e4SHawkins Jiawei      * it can use a hash filter or silently switch to allmulti or
1092fee364e4SHawkins Jiawei      * promiscuous mode if it is given too many addresses.".
1093fee364e4SHawkins Jiawei      *
1094fee364e4SHawkins Jiawei      * Therefore, considering that QEMU is unable to send the entire
1095fee364e4SHawkins Jiawei      * filter table to the vdpa device, it should send the
1096fee364e4SHawkins Jiawei      * VIRTIO_NET_CTRL_RX_PROMISC CVQ command to enable promiscuous mode
1097fee364e4SHawkins Jiawei      */
1098fee364e4SHawkins Jiawei     r = vhost_vdpa_net_load_rx_mode(s, VIRTIO_NET_CTRL_RX_PROMISC, 1);
1099fee364e4SHawkins Jiawei     if (unlikely(r < 0)) {
1100fee364e4SHawkins Jiawei         return r;
1101fee364e4SHawkins Jiawei     }
1102fee364e4SHawkins Jiawei     if (*s->status != VIRTIO_NET_OK) {
1103fee364e4SHawkins Jiawei         return sizeof(*s->status);
1104fee364e4SHawkins Jiawei     }
1105fee364e4SHawkins Jiawei 
1106fee364e4SHawkins Jiawei     /*
1107fee364e4SHawkins Jiawei      * QEMU should also send a fake VIRTIO_NET_CTRL_MAC_TABLE_SET CVQ
1108fee364e4SHawkins Jiawei      * command to the device model, including (`MAC_TABLE_ENTRIES` + 1)
1109fee364e4SHawkins Jiawei      * non-multicast MAC addresses and (`MAC_TABLE_ENTRIES` + 1)
1110fee364e4SHawkins Jiawei      * multicast MAC addresses.
1111fee364e4SHawkins Jiawei      *
1112fee364e4SHawkins Jiawei      * By doing so, the device model can mark `n->mac_table.uni_overflow`
1113fee364e4SHawkins Jiawei      * and `n->mac_table.multi_overflow`, enabling all packets to be
1114fee364e4SHawkins Jiawei      * received, which aligns with the state of the vdpa device.
1115fee364e4SHawkins Jiawei      */
1116fee364e4SHawkins Jiawei     cursor = 0;
1117fee364e4SHawkins Jiawei     uint32_t fake_uni_entries = MAC_TABLE_ENTRIES + 1,
1118fee364e4SHawkins Jiawei              fake_mul_entries = MAC_TABLE_ENTRIES + 1,
1119fee364e4SHawkins Jiawei              fake_cvq_size = sizeof(struct virtio_net_ctrl_hdr) +
1120fee364e4SHawkins Jiawei                              sizeof(mac_data) + fake_uni_entries * ETH_ALEN +
1121fee364e4SHawkins Jiawei                              sizeof(mac_data) + fake_mul_entries * ETH_ALEN;
1122fee364e4SHawkins Jiawei 
1123fee364e4SHawkins Jiawei     assert(fake_cvq_size < vhost_vdpa_net_cvq_cmd_page_len());
1124fee364e4SHawkins Jiawei     out->iov_len = fake_cvq_size;
1125fee364e4SHawkins Jiawei 
1126fee364e4SHawkins Jiawei     /* pack the header for fake CVQ command */
1127fee364e4SHawkins Jiawei     hdr_ptr = out->iov_base + cursor;
1128fee364e4SHawkins Jiawei     hdr_ptr->class = VIRTIO_NET_CTRL_MAC;
1129fee364e4SHawkins Jiawei     hdr_ptr->cmd = VIRTIO_NET_CTRL_MAC_TABLE_SET;
1130fee364e4SHawkins Jiawei     cursor += sizeof(*hdr_ptr);
1131fee364e4SHawkins Jiawei 
1132fee364e4SHawkins Jiawei     /*
1133fee364e4SHawkins Jiawei      * Pack the non-multicast MAC addresses part for fake CVQ command.
1134fee364e4SHawkins Jiawei      *
1135fee364e4SHawkins Jiawei      * According to virtio_net_handle_mac(), QEMU doesn't verify the MAC
11360a19d879SMichael Tokarev      * addresses provided in CVQ command. Therefore, only the entries
1137fee364e4SHawkins Jiawei      * field need to be prepared in the CVQ command.
1138fee364e4SHawkins Jiawei      */
1139fee364e4SHawkins Jiawei     mac_ptr = out->iov_base + cursor;
1140fee364e4SHawkins Jiawei     mac_ptr->entries = cpu_to_le32(fake_uni_entries);
1141fee364e4SHawkins Jiawei     cursor += sizeof(*mac_ptr) + fake_uni_entries * ETH_ALEN;
1142fee364e4SHawkins Jiawei 
1143fee364e4SHawkins Jiawei     /*
1144fee364e4SHawkins Jiawei      * Pack the multicast MAC addresses part for fake CVQ command.
1145fee364e4SHawkins Jiawei      *
1146fee364e4SHawkins Jiawei      * According to virtio_net_handle_mac(), QEMU doesn't verify the MAC
11470a19d879SMichael Tokarev      * addresses provided in CVQ command. Therefore, only the entries
1148fee364e4SHawkins Jiawei      * field need to be prepared in the CVQ command.
1149fee364e4SHawkins Jiawei      */
1150fee364e4SHawkins Jiawei     mac_ptr = out->iov_base + cursor;
1151fee364e4SHawkins Jiawei     mac_ptr->entries = cpu_to_le32(fake_mul_entries);
1152fee364e4SHawkins Jiawei 
1153fee364e4SHawkins Jiawei     /*
1154fee364e4SHawkins Jiawei      * Simulating QEMU poll a vdpa device used buffer
1155fee364e4SHawkins Jiawei      * for VIRTIO_NET_CTRL_MAC_TABLE_SET CVQ command
1156fee364e4SHawkins Jiawei      */
1157fee364e4SHawkins Jiawei     return sizeof(*s->status);
1158fee364e4SHawkins Jiawei }
1159fee364e4SHawkins Jiawei 
11602df4dd31SEugenio Pérez /**
11612df4dd31SEugenio Pérez  * Validate and copy control virtqueue commands.
11622df4dd31SEugenio Pérez  *
11632df4dd31SEugenio Pérez  * Following QEMU guidelines, we offer a copy of the buffers to the device to
11642df4dd31SEugenio Pérez  * prevent TOCTOU bugs.
1165bd907ae4SEugenio Pérez  */
1166bd907ae4SEugenio Pérez static int vhost_vdpa_net_handle_ctrl_avail(VhostShadowVirtqueue *svq,
1167bd907ae4SEugenio Pérez                                             VirtQueueElement *elem,
1168bd907ae4SEugenio Pérez                                             void *opaque)
1169bd907ae4SEugenio Pérez {
11702df4dd31SEugenio Pérez     VhostVDPAState *s = opaque;
1171be4278b6SEugenio Pérez     size_t in_len;
117245c41018SHawkins Jiawei     const struct virtio_net_ctrl_hdr *ctrl;
1173bd907ae4SEugenio Pérez     virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
11747a7f87e9SEugenio Pérez     /* Out buffer sent to both the vdpa device and the device model */
11757a7f87e9SEugenio Pérez     struct iovec out = {
11767a7f87e9SEugenio Pérez         .iov_base = s->cvq_cmd_out_buffer,
11777a7f87e9SEugenio Pérez     };
11782df4dd31SEugenio Pérez     /* in buffer used for device model */
11792df4dd31SEugenio Pérez     const struct iovec in = {
11802df4dd31SEugenio Pérez         .iov_base = &status,
11812df4dd31SEugenio Pérez         .iov_len = sizeof(status),
11822df4dd31SEugenio Pérez     };
1183be4278b6SEugenio Pérez     ssize_t dev_written = -EINVAL;
1184bd907ae4SEugenio Pérez 
11857a7f87e9SEugenio Pérez     out.iov_len = iov_to_buf(elem->out_sg, elem->out_num, 0,
11867a7f87e9SEugenio Pérez                              s->cvq_cmd_out_buffer,
1187fee364e4SHawkins Jiawei                              vhost_vdpa_net_cvq_cmd_page_len());
118845c41018SHawkins Jiawei 
118945c41018SHawkins Jiawei     ctrl = s->cvq_cmd_out_buffer;
119045c41018SHawkins Jiawei     if (ctrl->class == VIRTIO_NET_CTRL_ANNOUNCE) {
11913f9a3eebSEugenio Pérez         /*
11923f9a3eebSEugenio Pérez          * Guest announce capability is emulated by qemu, so don't forward to
11933f9a3eebSEugenio Pérez          * the device.
11943f9a3eebSEugenio Pérez          */
11953f9a3eebSEugenio Pérez         dev_written = sizeof(status);
11963f9a3eebSEugenio Pérez         *s->status = VIRTIO_NET_OK;
1197fee364e4SHawkins Jiawei     } else if (unlikely(ctrl->class == VIRTIO_NET_CTRL_MAC &&
1198fee364e4SHawkins Jiawei                         ctrl->cmd == VIRTIO_NET_CTRL_MAC_TABLE_SET &&
1199fee364e4SHawkins Jiawei                         iov_size(elem->out_sg, elem->out_num) > out.iov_len)) {
1200fee364e4SHawkins Jiawei         /*
1201fee364e4SHawkins Jiawei          * Due to the size limitation of the out buffer sent to the vdpa device,
1202fee364e4SHawkins Jiawei          * which is determined by vhost_vdpa_net_cvq_cmd_page_len(), excessive
1203fee364e4SHawkins Jiawei          * MAC addresses set by the driver for the filter table can cause
1204fee364e4SHawkins Jiawei          * truncation of the CVQ command in QEMU. As a result, the vdpa device
1205fee364e4SHawkins Jiawei          * rejects the flawed CVQ command.
1206fee364e4SHawkins Jiawei          *
1207fee364e4SHawkins Jiawei          * Therefore, QEMU must handle this situation instead of sending
12080a19d879SMichael Tokarev          * the CVQ command directly.
1209fee364e4SHawkins Jiawei          */
1210fee364e4SHawkins Jiawei         dev_written = vhost_vdpa_net_excessive_mac_filter_cvq_add(s, elem,
1211fee364e4SHawkins Jiawei                                                                   &out);
1212fee364e4SHawkins Jiawei         if (unlikely(dev_written < 0)) {
1213fee364e4SHawkins Jiawei             goto out;
1214fee364e4SHawkins Jiawei         }
12153f9a3eebSEugenio Pérez     } else {
1216be4278b6SEugenio Pérez         dev_written = vhost_vdpa_net_cvq_add(s, out.iov_len, sizeof(status));
1217be4278b6SEugenio Pérez         if (unlikely(dev_written < 0)) {
1218bd907ae4SEugenio Pérez             goto out;
1219bd907ae4SEugenio Pérez         }
12203f9a3eebSEugenio Pérez     }
1221bd907ae4SEugenio Pérez 
1222bd907ae4SEugenio Pérez     if (unlikely(dev_written < sizeof(status))) {
1223bd907ae4SEugenio Pérez         error_report("Insufficient written data (%zu)", dev_written);
12242df4dd31SEugenio Pérez         goto out;
12252df4dd31SEugenio Pérez     }
12262df4dd31SEugenio Pérez 
122717fb889fSEugenio Pérez     if (*s->status != VIRTIO_NET_OK) {
1228d45243bcSEugenio Pérez         goto out;
12292df4dd31SEugenio Pérez     }
12302df4dd31SEugenio Pérez 
12312df4dd31SEugenio Pérez     status = VIRTIO_NET_ERR;
12327a7f87e9SEugenio Pérez     virtio_net_handle_ctrl_iov(svq->vdev, &in, 1, &out, 1);
12332df4dd31SEugenio Pérez     if (status != VIRTIO_NET_OK) {
12342df4dd31SEugenio Pérez         error_report("Bad CVQ processing in model");
1235bd907ae4SEugenio Pérez     }
1236bd907ae4SEugenio Pérez 
1237bd907ae4SEugenio Pérez out:
1238bd907ae4SEugenio Pérez     in_len = iov_from_buf(elem->in_sg, elem->in_num, 0, &status,
1239bd907ae4SEugenio Pérez                           sizeof(status));
1240bd907ae4SEugenio Pérez     if (unlikely(in_len < sizeof(status))) {
1241bd907ae4SEugenio Pérez         error_report("Bad device CVQ written length");
1242bd907ae4SEugenio Pérez     }
1243bd907ae4SEugenio Pérez     vhost_svq_push_elem(svq, elem, MIN(in_len, sizeof(status)));
1244031b1abaSHawkins Jiawei     /*
1245031b1abaSHawkins Jiawei      * `elem` belongs to vhost_vdpa_net_handle_ctrl_avail() only when
1246031b1abaSHawkins Jiawei      * the function successfully forwards the CVQ command, indicated
1247031b1abaSHawkins Jiawei      * by a non-negative value of `dev_written`. Otherwise, it still
1248031b1abaSHawkins Jiawei      * belongs to SVQ.
1249031b1abaSHawkins Jiawei      * This function should only free the `elem` when it owns.
1250031b1abaSHawkins Jiawei      */
1251031b1abaSHawkins Jiawei     if (dev_written >= 0) {
1252bd907ae4SEugenio Pérez         g_free(elem);
1253031b1abaSHawkins Jiawei     }
1254be4278b6SEugenio Pérez     return dev_written < 0 ? dev_written : 0;
1255bd907ae4SEugenio Pérez }
1256bd907ae4SEugenio Pérez 
1257bd907ae4SEugenio Pérez static const VhostShadowVirtqueueOps vhost_vdpa_net_svq_ops = {
1258bd907ae4SEugenio Pérez     .avail_handler = vhost_vdpa_net_handle_ctrl_avail,
1259bd907ae4SEugenio Pérez };
1260bd907ae4SEugenio Pérez 
1261152128d6SEugenio Pérez /**
1262152128d6SEugenio Pérez  * Probe if CVQ is isolated
1263152128d6SEugenio Pérez  *
1264152128d6SEugenio Pérez  * @device_fd         The vdpa device fd
1265152128d6SEugenio Pérez  * @features          Features offered by the device.
1266152128d6SEugenio Pérez  * @cvq_index         The control vq pair index
1267152128d6SEugenio Pérez  *
1268152128d6SEugenio Pérez  * Returns <0 in case of failure, 0 if false and 1 if true.
1269152128d6SEugenio Pérez  */
1270152128d6SEugenio Pérez static int vhost_vdpa_probe_cvq_isolation(int device_fd, uint64_t features,
1271152128d6SEugenio Pérez                                           int cvq_index, Error **errp)
1272152128d6SEugenio Pérez {
1273152128d6SEugenio Pérez     uint64_t backend_features;
1274152128d6SEugenio Pérez     int64_t cvq_group;
1275152128d6SEugenio Pérez     uint8_t status = VIRTIO_CONFIG_S_ACKNOWLEDGE |
1276152128d6SEugenio Pérez                      VIRTIO_CONFIG_S_DRIVER |
1277152128d6SEugenio Pérez                      VIRTIO_CONFIG_S_FEATURES_OK;
1278152128d6SEugenio Pérez     int r;
1279152128d6SEugenio Pérez 
1280152128d6SEugenio Pérez     ERRP_GUARD();
1281152128d6SEugenio Pérez 
1282152128d6SEugenio Pérez     r = ioctl(device_fd, VHOST_GET_BACKEND_FEATURES, &backend_features);
1283152128d6SEugenio Pérez     if (unlikely(r < 0)) {
1284152128d6SEugenio Pérez         error_setg_errno(errp, errno, "Cannot get vdpa backend_features");
1285152128d6SEugenio Pérez         return r;
1286152128d6SEugenio Pérez     }
1287152128d6SEugenio Pérez 
1288152128d6SEugenio Pérez     if (!(backend_features & BIT_ULL(VHOST_BACKEND_F_IOTLB_ASID))) {
1289152128d6SEugenio Pérez         return 0;
1290152128d6SEugenio Pérez     }
1291152128d6SEugenio Pérez 
1292152128d6SEugenio Pérez     r = ioctl(device_fd, VHOST_SET_FEATURES, &features);
1293152128d6SEugenio Pérez     if (unlikely(r)) {
1294152128d6SEugenio Pérez         error_setg_errno(errp, errno, "Cannot set features");
1295152128d6SEugenio Pérez     }
1296152128d6SEugenio Pérez 
1297152128d6SEugenio Pérez     r = ioctl(device_fd, VHOST_VDPA_SET_STATUS, &status);
1298152128d6SEugenio Pérez     if (unlikely(r)) {
1299152128d6SEugenio Pérez         error_setg_errno(errp, -r, "Cannot set device features");
1300152128d6SEugenio Pérez         goto out;
1301152128d6SEugenio Pérez     }
1302152128d6SEugenio Pérez 
1303152128d6SEugenio Pérez     cvq_group = vhost_vdpa_get_vring_group(device_fd, cvq_index, errp);
1304152128d6SEugenio Pérez     if (unlikely(cvq_group < 0)) {
1305152128d6SEugenio Pérez         if (cvq_group != -ENOTSUP) {
1306152128d6SEugenio Pérez             r = cvq_group;
1307152128d6SEugenio Pérez             goto out;
1308152128d6SEugenio Pérez         }
1309152128d6SEugenio Pérez 
1310152128d6SEugenio Pérez         /*
1311152128d6SEugenio Pérez          * The kernel report VHOST_BACKEND_F_IOTLB_ASID if the vdpa frontend
1312152128d6SEugenio Pérez          * support ASID even if the parent driver does not.  The CVQ cannot be
1313152128d6SEugenio Pérez          * isolated in this case.
1314152128d6SEugenio Pérez          */
1315152128d6SEugenio Pérez         error_free(*errp);
1316152128d6SEugenio Pérez         *errp = NULL;
1317152128d6SEugenio Pérez         r = 0;
1318152128d6SEugenio Pérez         goto out;
1319152128d6SEugenio Pérez     }
1320152128d6SEugenio Pérez 
1321152128d6SEugenio Pérez     for (int i = 0; i < cvq_index; ++i) {
1322152128d6SEugenio Pérez         int64_t group = vhost_vdpa_get_vring_group(device_fd, i, errp);
1323152128d6SEugenio Pérez         if (unlikely(group < 0)) {
1324152128d6SEugenio Pérez             r = group;
1325152128d6SEugenio Pérez             goto out;
1326152128d6SEugenio Pérez         }
1327152128d6SEugenio Pérez 
1328152128d6SEugenio Pérez         if (group == (int64_t)cvq_group) {
1329152128d6SEugenio Pérez             r = 0;
1330152128d6SEugenio Pérez             goto out;
1331152128d6SEugenio Pérez         }
1332152128d6SEugenio Pérez     }
1333152128d6SEugenio Pérez 
1334152128d6SEugenio Pérez     r = 1;
1335152128d6SEugenio Pérez 
1336152128d6SEugenio Pérez out:
1337152128d6SEugenio Pérez     status = 0;
1338152128d6SEugenio Pérez     ioctl(device_fd, VHOST_VDPA_SET_STATUS, &status);
1339152128d6SEugenio Pérez     return r;
1340152128d6SEugenio Pérez }
1341152128d6SEugenio Pérez 
1342654790b6SJason Wang static NetClientState *net_vhost_vdpa_init(NetClientState *peer,
1343654790b6SJason Wang                                        const char *device,
1344654790b6SJason Wang                                        const char *name,
134540237840SJason Wang                                        int vdpa_device_fd,
134640237840SJason Wang                                        int queue_pair_index,
134740237840SJason Wang                                        int nvqs,
13481576dbb5SEugenio Pérez                                        bool is_datapath,
13491576dbb5SEugenio Pérez                                        bool svq,
13505c1ebd4cSEugenio Pérez                                        struct vhost_vdpa_iova_range iova_range,
1351152128d6SEugenio Pérez                                        uint64_t features,
1352152128d6SEugenio Pérez                                        Error **errp)
13531e0a84eaSCindy Lu {
13541e0a84eaSCindy Lu     NetClientState *nc = NULL;
13551e0a84eaSCindy Lu     VhostVDPAState *s;
13561e0a84eaSCindy Lu     int ret = 0;
13571e0a84eaSCindy Lu     assert(name);
1358152128d6SEugenio Pérez     int cvq_isolated;
1359152128d6SEugenio Pérez 
136040237840SJason Wang     if (is_datapath) {
136140237840SJason Wang         nc = qemu_new_net_client(&net_vhost_vdpa_info, peer, device,
136240237840SJason Wang                                  name);
136340237840SJason Wang     } else {
1364152128d6SEugenio Pérez         cvq_isolated = vhost_vdpa_probe_cvq_isolation(vdpa_device_fd, features,
1365152128d6SEugenio Pérez                                                       queue_pair_index * 2,
1366152128d6SEugenio Pérez                                                       errp);
1367152128d6SEugenio Pérez         if (unlikely(cvq_isolated < 0)) {
1368152128d6SEugenio Pérez             return NULL;
1369152128d6SEugenio Pérez         }
1370152128d6SEugenio Pérez 
1371f8972b56SEugenio Pérez         nc = qemu_new_net_control_client(&net_vhost_vdpa_cvq_info, peer,
137240237840SJason Wang                                          device, name);
137340237840SJason Wang     }
137453b85d95SLaurent Vivier     qemu_set_info_str(nc, TYPE_VHOST_VDPA);
13751e0a84eaSCindy Lu     s = DO_UPCAST(VhostVDPAState, nc, nc);
13767327813dSJason Wang 
13771e0a84eaSCindy Lu     s->vhost_vdpa.device_fd = vdpa_device_fd;
137840237840SJason Wang     s->vhost_vdpa.index = queue_pair_index;
13797f211a28SEugenio Pérez     s->always_svq = svq;
138069498430SEugenio Pérez     s->migration_state.notify = vdpa_net_migration_state_notifier;
13811576dbb5SEugenio Pérez     s->vhost_vdpa.shadow_vqs_enabled = svq;
1382a585fad2SEugenio Pérez     s->vhost_vdpa.iova_range = iova_range;
13836188d78aSEugenio Pérez     s->vhost_vdpa.shadow_data = svq;
13845c1ebd4cSEugenio Pérez     if (queue_pair_index == 0) {
13855c1ebd4cSEugenio Pérez         vhost_vdpa_net_valid_svq_features(features,
13865c1ebd4cSEugenio Pérez                                           &s->vhost_vdpa.migration_blocker);
13875c1ebd4cSEugenio Pérez     } else if (!is_datapath) {
1388babf8b87SEugenio Pérez         s->cvq_cmd_out_buffer = mmap(NULL, vhost_vdpa_net_cvq_cmd_page_len(),
1389babf8b87SEugenio Pérez                                      PROT_READ | PROT_WRITE,
1390babf8b87SEugenio Pérez                                      MAP_SHARED | MAP_ANONYMOUS, -1, 0);
1391babf8b87SEugenio Pérez         s->status = mmap(NULL, vhost_vdpa_net_cvq_cmd_page_len(),
1392babf8b87SEugenio Pérez                          PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS,
1393babf8b87SEugenio Pérez                          -1, 0);
13942df4dd31SEugenio Pérez 
1395bd907ae4SEugenio Pérez         s->vhost_vdpa.shadow_vq_ops = &vhost_vdpa_net_svq_ops;
1396bd907ae4SEugenio Pérez         s->vhost_vdpa.shadow_vq_ops_opaque = s;
1397152128d6SEugenio Pérez         s->cvq_isolated = cvq_isolated;
13989c363cf6SEugenio Pérez 
13999c363cf6SEugenio Pérez         /*
14008bc0049eSEugenio Pérez          * TODO: We cannot migrate devices with CVQ and no x-svq enabled as
14018bc0049eSEugenio Pérez          * there is no way to set the device state (MAC, MQ, etc) before
14028bc0049eSEugenio Pérez          * starting the datapath.
14039c363cf6SEugenio Pérez          *
14049c363cf6SEugenio Pérez          * Migration blocker ownership now belongs to s->vhost_vdpa.
14059c363cf6SEugenio Pérez          */
14068bc0049eSEugenio Pérez         if (!svq) {
14079c363cf6SEugenio Pérez             error_setg(&s->vhost_vdpa.migration_blocker,
14089c363cf6SEugenio Pérez                        "net vdpa cannot migrate with CVQ feature");
1409bd907ae4SEugenio Pérez         }
14108bc0049eSEugenio Pérez     }
141140237840SJason Wang     ret = vhost_vdpa_add(nc, (void *)&s->vhost_vdpa, queue_pair_index, nvqs);
141274af5eecSJason Wang     if (ret) {
141374af5eecSJason Wang         qemu_del_net_client(nc);
1414654790b6SJason Wang         return NULL;
141574af5eecSJason Wang     }
1416654790b6SJason Wang     return nc;
14171e0a84eaSCindy Lu }
14181e0a84eaSCindy Lu 
14198170ab3fSEugenio Pérez static int vhost_vdpa_get_features(int fd, uint64_t *features, Error **errp)
14208170ab3fSEugenio Pérez {
14218170ab3fSEugenio Pérez     int ret = ioctl(fd, VHOST_GET_FEATURES, features);
14228170ab3fSEugenio Pérez     if (unlikely(ret < 0)) {
14238170ab3fSEugenio Pérez         error_setg_errno(errp, errno,
14248170ab3fSEugenio Pérez                          "Fail to query features from vhost-vDPA device");
14258170ab3fSEugenio Pérez     }
14268170ab3fSEugenio Pérez     return ret;
14278170ab3fSEugenio Pérez }
14288170ab3fSEugenio Pérez 
14298170ab3fSEugenio Pérez static int vhost_vdpa_get_max_queue_pairs(int fd, uint64_t features,
14308170ab3fSEugenio Pérez                                           int *has_cvq, Error **errp)
143140237840SJason Wang {
143240237840SJason Wang     unsigned long config_size = offsetof(struct vhost_vdpa_config, buf);
1433cd523a41SStefano Garzarella     g_autofree struct vhost_vdpa_config *config = NULL;
143440237840SJason Wang     __virtio16 *max_queue_pairs;
143540237840SJason Wang     int ret;
143640237840SJason Wang 
143740237840SJason Wang     if (features & (1 << VIRTIO_NET_F_CTRL_VQ)) {
143840237840SJason Wang         *has_cvq = 1;
143940237840SJason Wang     } else {
144040237840SJason Wang         *has_cvq = 0;
144140237840SJason Wang     }
144240237840SJason Wang 
144340237840SJason Wang     if (features & (1 << VIRTIO_NET_F_MQ)) {
144440237840SJason Wang         config = g_malloc0(config_size + sizeof(*max_queue_pairs));
144540237840SJason Wang         config->off = offsetof(struct virtio_net_config, max_virtqueue_pairs);
144640237840SJason Wang         config->len = sizeof(*max_queue_pairs);
144740237840SJason Wang 
144840237840SJason Wang         ret = ioctl(fd, VHOST_VDPA_GET_CONFIG, config);
144940237840SJason Wang         if (ret) {
145040237840SJason Wang             error_setg(errp, "Fail to get config from vhost-vDPA device");
145140237840SJason Wang             return -ret;
145240237840SJason Wang         }
145340237840SJason Wang 
145440237840SJason Wang         max_queue_pairs = (__virtio16 *)&config->buf;
145540237840SJason Wang 
145640237840SJason Wang         return lduw_le_p(max_queue_pairs);
145740237840SJason Wang     }
145840237840SJason Wang 
145940237840SJason Wang     return 1;
146040237840SJason Wang }
146140237840SJason Wang 
14621e0a84eaSCindy Lu int net_init_vhost_vdpa(const Netdev *netdev, const char *name,
14631e0a84eaSCindy Lu                         NetClientState *peer, Error **errp)
14641e0a84eaSCindy Lu {
14651e0a84eaSCindy Lu     const NetdevVhostVDPAOptions *opts;
14668170ab3fSEugenio Pérez     uint64_t features;
1467654790b6SJason Wang     int vdpa_device_fd;
1468eb3cb751SEugenio Pérez     g_autofree NetClientState **ncs = NULL;
1469a585fad2SEugenio Pérez     struct vhost_vdpa_iova_range iova_range;
1470eb3cb751SEugenio Pérez     NetClientState *nc;
1471aed5da45SEugenio Pérez     int queue_pairs, r, i = 0, has_cvq = 0;
14721e0a84eaSCindy Lu 
14731e0a84eaSCindy Lu     assert(netdev->type == NET_CLIENT_DRIVER_VHOST_VDPA);
14741e0a84eaSCindy Lu     opts = &netdev->u.vhost_vdpa;
14757480874aSMarkus Armbruster     if (!opts->vhostdev && !opts->vhostfd) {
14768801ccd0SSi-Wei Liu         error_setg(errp,
14778801ccd0SSi-Wei Liu                    "vhost-vdpa: neither vhostdev= nor vhostfd= was specified");
1478c8295404SEugenio Pérez         return -1;
1479c8295404SEugenio Pérez     }
14807327813dSJason Wang 
14817480874aSMarkus Armbruster     if (opts->vhostdev && opts->vhostfd) {
14828801ccd0SSi-Wei Liu         error_setg(errp,
14838801ccd0SSi-Wei Liu                    "vhost-vdpa: vhostdev= and vhostfd= are mutually exclusive");
14848801ccd0SSi-Wei Liu         return -1;
14858801ccd0SSi-Wei Liu     }
14868801ccd0SSi-Wei Liu 
14877480874aSMarkus Armbruster     if (opts->vhostdev) {
14880351152bSEugenio Pérez         vdpa_device_fd = qemu_open(opts->vhostdev, O_RDWR, errp);
14897327813dSJason Wang         if (vdpa_device_fd == -1) {
14907327813dSJason Wang             return -errno;
14917327813dSJason Wang         }
14925107fd3eSPeter Maydell     } else {
14935107fd3eSPeter Maydell         /* has_vhostfd */
14948801ccd0SSi-Wei Liu         vdpa_device_fd = monitor_fd_param(monitor_cur(), opts->vhostfd, errp);
14958801ccd0SSi-Wei Liu         if (vdpa_device_fd == -1) {
14968801ccd0SSi-Wei Liu             error_prepend(errp, "vhost-vdpa: unable to parse vhostfd: ");
14978801ccd0SSi-Wei Liu             return -1;
14988801ccd0SSi-Wei Liu         }
14998801ccd0SSi-Wei Liu     }
15007327813dSJason Wang 
15018170ab3fSEugenio Pérez     r = vhost_vdpa_get_features(vdpa_device_fd, &features, errp);
15028170ab3fSEugenio Pérez     if (unlikely(r < 0)) {
1503aed5da45SEugenio Pérez         goto err;
15048170ab3fSEugenio Pérez     }
15058170ab3fSEugenio Pérez 
15068170ab3fSEugenio Pérez     queue_pairs = vhost_vdpa_get_max_queue_pairs(vdpa_device_fd, features,
150740237840SJason Wang                                                  &has_cvq, errp);
150840237840SJason Wang     if (queue_pairs < 0) {
15097327813dSJason Wang         qemu_close(vdpa_device_fd);
151040237840SJason Wang         return queue_pairs;
15117327813dSJason Wang     }
15127327813dSJason Wang 
1513bf7a2ad8SLongpeng     r = vhost_vdpa_get_iova_range(vdpa_device_fd, &iova_range);
1514bf7a2ad8SLongpeng     if (unlikely(r < 0)) {
1515bf7a2ad8SLongpeng         error_setg(errp, "vhost-vdpa: get iova range failed: %s",
1516bf7a2ad8SLongpeng                    strerror(-r));
1517bf7a2ad8SLongpeng         goto err;
1518bf7a2ad8SLongpeng     }
1519bf7a2ad8SLongpeng 
152000ef422eSEugenio Pérez     if (opts->x_svq && !vhost_vdpa_net_valid_svq_features(features, errp)) {
152100ef422eSEugenio Pérez         goto err;
15221576dbb5SEugenio Pérez     }
15231576dbb5SEugenio Pérez 
152440237840SJason Wang     ncs = g_malloc0(sizeof(*ncs) * queue_pairs);
152540237840SJason Wang 
152640237840SJason Wang     for (i = 0; i < queue_pairs; i++) {
152740237840SJason Wang         ncs[i] = net_vhost_vdpa_init(peer, TYPE_VHOST_VDPA, name,
15281576dbb5SEugenio Pérez                                      vdpa_device_fd, i, 2, true, opts->x_svq,
1529152128d6SEugenio Pérez                                      iova_range, features, errp);
153040237840SJason Wang         if (!ncs[i])
153140237840SJason Wang             goto err;
153240237840SJason Wang     }
153340237840SJason Wang 
153440237840SJason Wang     if (has_cvq) {
153540237840SJason Wang         nc = net_vhost_vdpa_init(peer, TYPE_VHOST_VDPA, name,
15361576dbb5SEugenio Pérez                                  vdpa_device_fd, i, 1, false,
1537152128d6SEugenio Pérez                                  opts->x_svq, iova_range, features, errp);
153840237840SJason Wang         if (!nc)
153940237840SJason Wang             goto err;
154040237840SJason Wang     }
154140237840SJason Wang 
1542654790b6SJason Wang     return 0;
154340237840SJason Wang 
154440237840SJason Wang err:
154540237840SJason Wang     if (i) {
15469bd05507SSi-Wei Liu         for (i--; i >= 0; i--) {
15479bd05507SSi-Wei Liu             qemu_del_net_client(ncs[i]);
15489bd05507SSi-Wei Liu         }
154940237840SJason Wang     }
15501576dbb5SEugenio Pérez 
155140237840SJason Wang     qemu_close(vdpa_device_fd);
155240237840SJason Wang 
155340237840SJason Wang     return -1;
15541e0a84eaSCindy Lu }
1555