1 /*
2 * vhost-vdpa.c
3 *
4 * Copyright(c) 2017-2018 Intel Corporation.
5 * Copyright(c) 2020 Red Hat, Inc.
6 *
7 * This work is licensed under the terms of the GNU GPL, version 2 or later.
8 * See the COPYING file in the top-level directory.
9 *
10 */
11
12 #include "qemu/osdep.h"
13 #include "clients.h"
14 #include "hw/virtio/virtio-net.h"
15 #include "net/vhost_net.h"
16 #include "net/vhost-vdpa.h"
17 #include "hw/virtio/vhost-vdpa.h"
18 #include "qemu/config-file.h"
19 #include "qemu/error-report.h"
20 #include "qemu/log.h"
21 #include "qemu/memalign.h"
22 #include "qemu/option.h"
23 #include "qapi/error.h"
24 #include <linux/vhost.h>
25 #include <sys/ioctl.h>
26 #include <err.h>
27 #include "standard-headers/linux/virtio_net.h"
28 #include "monitor/monitor.h"
29 #include "migration/misc.h"
30 #include "hw/virtio/vhost.h"
31 #include "trace.h"
32
33 /* TODO: need to add multiqueue support here */
34 typedef struct VhostVDPAState {
35 NetClientState nc;
36 struct vhost_vdpa vhost_vdpa;
37 NotifierWithReturn migration_state;
38 VHostNetState *vhost_net;
39
40 /* Control commands shadow buffers */
41 void *cvq_cmd_out_buffer;
42 virtio_net_ctrl_ack *status;
43
44 /* The device always has SVQ enabled */
45 bool always_svq;
46
47 /* The device can isolate CVQ in its own ASID */
48 bool cvq_isolated;
49
50 bool started;
51 } VhostVDPAState;
52
53 /*
54 * The array is sorted alphabetically in ascending order,
55 * with the exception of VHOST_INVALID_FEATURE_BIT,
56 * which should always be the last entry.
57 */
58 static const int vdpa_feature_bits[] = {
59 VIRTIO_F_ANY_LAYOUT,
60 VIRTIO_F_IOMMU_PLATFORM,
61 VIRTIO_F_NOTIFY_ON_EMPTY,
62 VIRTIO_F_RING_PACKED,
63 VIRTIO_F_RING_RESET,
64 VIRTIO_F_VERSION_1,
65 VIRTIO_F_IN_ORDER,
66 VIRTIO_F_NOTIFICATION_DATA,
67 VIRTIO_NET_F_CSUM,
68 VIRTIO_NET_F_CTRL_GUEST_OFFLOADS,
69 VIRTIO_NET_F_CTRL_MAC_ADDR,
70 VIRTIO_NET_F_CTRL_RX,
71 VIRTIO_NET_F_CTRL_RX_EXTRA,
72 VIRTIO_NET_F_CTRL_VLAN,
73 VIRTIO_NET_F_CTRL_VQ,
74 VIRTIO_NET_F_GSO,
75 VIRTIO_NET_F_GUEST_CSUM,
76 VIRTIO_NET_F_GUEST_ECN,
77 VIRTIO_NET_F_GUEST_TSO4,
78 VIRTIO_NET_F_GUEST_TSO6,
79 VIRTIO_NET_F_GUEST_UFO,
80 VIRTIO_NET_F_GUEST_USO4,
81 VIRTIO_NET_F_GUEST_USO6,
82 VIRTIO_NET_F_HASH_REPORT,
83 VIRTIO_NET_F_HOST_ECN,
84 VIRTIO_NET_F_HOST_TSO4,
85 VIRTIO_NET_F_HOST_TSO6,
86 VIRTIO_NET_F_HOST_UFO,
87 VIRTIO_NET_F_HOST_USO,
88 VIRTIO_NET_F_MQ,
89 VIRTIO_NET_F_MRG_RXBUF,
90 VIRTIO_NET_F_MTU,
91 VIRTIO_NET_F_RSC_EXT,
92 VIRTIO_NET_F_RSS,
93 VIRTIO_NET_F_STATUS,
94 VIRTIO_RING_F_EVENT_IDX,
95 VIRTIO_RING_F_INDIRECT_DESC,
96
97 /* VHOST_INVALID_FEATURE_BIT should always be the last entry */
98 VHOST_INVALID_FEATURE_BIT
99 };
100
101 /** Supported device-specific feature bits with SVQ */
102 static const uint64_t vdpa_svq_device_features =
103 BIT_ULL(VIRTIO_NET_F_CSUM) |
104 BIT_ULL(VIRTIO_NET_F_GUEST_CSUM) |
105 BIT_ULL(VIRTIO_NET_F_CTRL_GUEST_OFFLOADS) |
106 BIT_ULL(VIRTIO_NET_F_MTU) |
107 BIT_ULL(VIRTIO_NET_F_MAC) |
108 BIT_ULL(VIRTIO_NET_F_GUEST_TSO4) |
109 BIT_ULL(VIRTIO_NET_F_GUEST_TSO6) |
110 BIT_ULL(VIRTIO_NET_F_GUEST_ECN) |
111 BIT_ULL(VIRTIO_NET_F_GUEST_UFO) |
112 BIT_ULL(VIRTIO_NET_F_HOST_TSO4) |
113 BIT_ULL(VIRTIO_NET_F_HOST_TSO6) |
114 BIT_ULL(VIRTIO_NET_F_HOST_ECN) |
115 BIT_ULL(VIRTIO_NET_F_HOST_UFO) |
116 BIT_ULL(VIRTIO_NET_F_MRG_RXBUF) |
117 BIT_ULL(VIRTIO_NET_F_STATUS) |
118 BIT_ULL(VIRTIO_NET_F_CTRL_VQ) |
119 BIT_ULL(VIRTIO_NET_F_CTRL_RX) |
120 BIT_ULL(VIRTIO_NET_F_CTRL_VLAN) |
121 BIT_ULL(VIRTIO_NET_F_CTRL_RX_EXTRA) |
122 BIT_ULL(VIRTIO_NET_F_MQ) |
123 BIT_ULL(VIRTIO_F_ANY_LAYOUT) |
124 BIT_ULL(VIRTIO_NET_F_CTRL_MAC_ADDR) |
125 /* VHOST_F_LOG_ALL is exposed by SVQ */
126 BIT_ULL(VHOST_F_LOG_ALL) |
127 BIT_ULL(VIRTIO_NET_F_HASH_REPORT) |
128 BIT_ULL(VIRTIO_NET_F_RSS) |
129 BIT_ULL(VIRTIO_NET_F_RSC_EXT) |
130 BIT_ULL(VIRTIO_NET_F_STANDBY) |
131 BIT_ULL(VIRTIO_NET_F_SPEED_DUPLEX);
132
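/* Address space id used for the CVQ when it can be isolated from the data vqs */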
133 #define VHOST_VDPA_NET_CVQ_ASID 1
134
135 static struct vhost_net *vhost_vdpa_get_vhost_net(NetClientState *nc)
136 {
137 VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
138 assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
139 return s->vhost_net;
140 }
141
142 static size_t vhost_vdpa_net_cvq_cmd_len(void)
143 {
144 /*
145 * MAC_TABLE_SET is the ctrl command that produces the longest out buffer.
146 * The in buffer is always 1 byte, so it fits here.
147 */
148 return sizeof(struct virtio_net_ctrl_hdr) +
149 2 * sizeof(struct virtio_net_ctrl_mac) +
150 MAC_TABLE_ENTRIES * ETH_ALEN;
151 }
152
153 static size_t vhost_vdpa_net_cvq_cmd_page_len(void)
154 {
155 return ROUND_UP(vhost_vdpa_net_cvq_cmd_len(), qemu_real_host_page_size());
156 }
157
158 static bool vhost_vdpa_net_valid_svq_features(uint64_t features, Error **errp)
159 {
160 uint64_t invalid_dev_features =
161 features & ~vdpa_svq_device_features &
162 /* Transport features are all accepted at this point */
163 ~MAKE_64BIT_MASK(VIRTIO_TRANSPORT_F_START,
164 VIRTIO_TRANSPORT_F_END - VIRTIO_TRANSPORT_F_START);
165
166 if (invalid_dev_features) {
167 error_setg(errp, "vdpa svq does not work with features 0x%" PRIx64,
168 invalid_dev_features);
169 return false;
170 }
171
172 return vhost_svq_valid_features(features, errp);
173 }
174
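/* Sanity-check that the vhost-vdpa backend really exposes a virtio-net device */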
175 static int vhost_vdpa_net_check_device_id(struct vhost_net *net)
176 {
177 uint32_t device_id;
178 int ret;
179 struct vhost_dev *hdev;
180
181 hdev = (struct vhost_dev *)&net->dev;
182 ret = hdev->vhost_ops->vhost_get_device_id(hdev, &device_id);
183 if (device_id != VIRTIO_ID_NET) {
184 return -ENOTSUP;
185 }
186 return ret;
187 }
188
189 static int vhost_vdpa_add(NetClientState *ncs, void *be,
190 int queue_pair_index, int nvqs)
191 {
192 VhostNetOptions options;
193 struct vhost_net *net = NULL;
194 VhostVDPAState *s;
195 int ret;
196
197 options.backend_type = VHOST_BACKEND_TYPE_VDPA;
198 assert(ncs->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
199 s = DO_UPCAST(VhostVDPAState, nc, ncs);
200 options.net_backend = ncs;
201 options.opaque = be;
202 options.busyloop_timeout = 0;
203 options.nvqs = nvqs;
204 options.feature_bits = vdpa_feature_bits;
205 options.get_acked_features = NULL;
206 options.save_acked_features = NULL;
207 options.max_tx_queue_size = VIRTQUEUE_MAX_SIZE;
208 options.is_vhost_user = false;
209
210 net = vhost_net_init(&options);
211 if (!net) {
212 error_report("failed to init vhost_net for queue");
213 goto err_init;
214 }
215 s->vhost_net = net;
216 ret = vhost_vdpa_net_check_device_id(net);
217 if (ret) {
218 goto err_check;
219 }
220 return 0;
221 err_check:
222 vhost_net_cleanup(net);
223 g_free(net);
224 err_init:
225 return -1;
226 }
227
228 static void vhost_vdpa_cleanup(NetClientState *nc)
229 {
230 VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
231
232 munmap(s->cvq_cmd_out_buffer, vhost_vdpa_net_cvq_cmd_page_len());
233 munmap(s->status, vhost_vdpa_net_cvq_cmd_page_len());
234 if (s->vhost_net) {
235 vhost_net_cleanup(s->vhost_net);
236 g_free(s->vhost_net);
237 s->vhost_net = NULL;
238 }
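/* Only the first queue pair owns the shared state; the other clients must not free it */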
239 if (s->vhost_vdpa.index != 0) {
240 return;
241 }
242 qemu_close(s->vhost_vdpa.shared->device_fd);
243 g_clear_pointer(&s->vhost_vdpa.shared->iova_tree, vhost_iova_tree_delete);
244 g_free(s->vhost_vdpa.shared);
245 }
246
247 static bool vhost_vdpa_has_vnet_hdr(NetClientState *nc)
248 {
249 assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
250
251 return true;
252 }
253
254 static bool vhost_vdpa_get_vnet_hash_supported_types(NetClientState *nc,
255 uint32_t *types)
256 {
257 assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
258 VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
259 uint64_t features = s->vhost_vdpa.dev->features;
260 int fd = s->vhost_vdpa.shared->device_fd;
261 struct {
262 struct vhost_vdpa_config hdr;
263 uint32_t supported_hash_types;
264 } config;
265
266 if (!virtio_has_feature(features, VIRTIO_NET_F_HASH_REPORT) &&
267 !virtio_has_feature(features, VIRTIO_NET_F_RSS)) {
268 return false;
269 }
270
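/* Read the supported hash types from the device's virtio-net config space */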
271 config.hdr.off = offsetof(struct virtio_net_config, supported_hash_types);
272 config.hdr.len = sizeof(config.supported_hash_types);
273
274 assert(!ioctl(fd, VHOST_VDPA_GET_CONFIG, &config));
275 *types = le32_to_cpu(config.supported_hash_types);
276
277 return true;
278 }
279
280 static bool vhost_vdpa_has_ufo(NetClientState *nc)
281 {
282 assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
283 VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
284 uint64_t features = 0;
285 features |= (1ULL << VIRTIO_NET_F_HOST_UFO);
286 features = vhost_net_get_features(s->vhost_net, features);
287 return !!(features & (1ULL << VIRTIO_NET_F_HOST_UFO));
288
289 }
290
291 /*
292 * FIXME: vhost_vdpa doesn't have an API to "set h/w endianness". But it's
293 * reasonable to assume that h/w is LE by default, because LE is what
294 * virtio 1.0 and later ask for. So, this function just says "yes, the h/w is
295 * LE". Otherwise, on a BE machine, higher-level code would mistakely think
296 * the h/w is BE and can't support VDPA for a virtio 1.0 client.
297 */
298 static int vhost_vdpa_set_vnet_le(NetClientState *nc, bool enable)
299 {
300 return 0;
301 }
302
303 static bool vhost_vdpa_check_peer_type(NetClientState *nc, ObjectClass *oc,
304 Error **errp)
305 {
306 const char *driver = object_class_get_name(oc);
307
308 if (!g_str_has_prefix(driver, "virtio-net-")) {
309 error_setg(errp, "vhost-vdpa requires frontend driver virtio-net-*");
310 return false;
311 }
312
313 return true;
314 }
315
316 /** Dummy receive in case qemu falls back to userland tap networking */
317 static ssize_t vhost_vdpa_receive(NetClientState *nc, const uint8_t *buf,
318 size_t size)
319 {
320 return size;
321 }
322
323
324 /** From any vdpa net client, get the netclient of the i-th queue pair */
325 static VhostVDPAState *vhost_vdpa_net_get_nc_vdpa(VhostVDPAState *s, int i)
326 {
327 NICState *nic = qemu_get_nic(s->nc.peer);
328 NetClientState *nc_i = qemu_get_peer(nic->ncs, i);
329
330 return DO_UPCAST(VhostVDPAState, nc, nc_i);
331 }
332
333 static VhostVDPAState *vhost_vdpa_net_first_nc_vdpa(VhostVDPAState *s)
334 {
335 return vhost_vdpa_net_get_nc_vdpa(s, 0);
336 }
337
338 static void vhost_vdpa_net_log_global_enable(VhostVDPAState *s, bool enable)
339 {
340 struct vhost_vdpa *v = &s->vhost_vdpa;
341 VirtIONet *n;
342 VirtIODevice *vdev;
343 int data_queue_pairs, cvq, r;
344
345 /* We are only called on the first data vqs and only if x-svq is not set */
346 if (s->vhost_vdpa.shadow_vqs_enabled == enable) {
347 return;
348 }
349
350 vdev = v->dev->vdev;
351 n = VIRTIO_NET(vdev);
352 if (!n->vhost_started) {
353 return;
354 }
355
356 data_queue_pairs = n->multiqueue ? n->max_queue_pairs : 1;
357 cvq = virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ) ?
358 n->max_ncs - n->max_queue_pairs : 0;
359 v->shared->svq_switching = enable ?
360 SVQ_TSTATE_ENABLING : SVQ_TSTATE_DISABLING;
361 /*
362 * TODO: vhost_net_stop does suspend, get_base and reset. We can be smarter
363 * in the future and resume the device if read-only operations between
364 * suspend and reset go wrong.
365 */
366 vhost_net_stop(vdev, n->nic->ncs, data_queue_pairs, cvq);
367
368 /* Start will check whether migration is in setup or active to decide whether to configure SVQ */
369 r = vhost_net_start(vdev, n->nic->ncs, data_queue_pairs, cvq);
370 if (unlikely(r < 0)) {
371 error_report("unable to start vhost net: %s(%d)", g_strerror(-r), -r);
372 }
373 v->shared->svq_switching = SVQ_TSTATE_DONE;
374 }
375
376 static int vdpa_net_migration_state_notifier(NotifierWithReturn *notifier,
377 MigrationEvent *e, Error **errp)
378 {
379 VhostVDPAState *s = container_of(notifier, VhostVDPAState, migration_state);
380
381 if (e->type == MIG_EVENT_PRECOPY_SETUP) {
382 vhost_vdpa_net_log_global_enable(s, true);
383 } else if (e->type == MIG_EVENT_PRECOPY_FAILED) {
384 vhost_vdpa_net_log_global_enable(s, false);
385 }
386 return 0;
387 }
388
389 static void vhost_vdpa_net_data_start_first(VhostVDPAState *s)
390 {
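/* Register for migration events so SVQ can be switched on and off dynamically */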
391 migration_add_notifier(&s->migration_state,
392 vdpa_net_migration_state_notifier);
393 }
394
395 static int vhost_vdpa_net_data_start(NetClientState *nc)
396 {
397 VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
398 struct vhost_vdpa *v = &s->vhost_vdpa;
399
400 assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
401
402 if (s->always_svq || migration_is_running()) {
403 v->shadow_vqs_enabled = true;
404 } else {
405 v->shadow_vqs_enabled = false;
406 }
407
408 if (v->index == 0) {
409 v->shared->shadow_data = v->shadow_vqs_enabled;
410 vhost_vdpa_net_data_start_first(s);
411 return 0;
412 }
413
414 return 0;
415 }
416
417 static int vhost_vdpa_net_data_load(NetClientState *nc)
418 {
419 VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
420 struct vhost_vdpa *v = &s->vhost_vdpa;
421 bool has_cvq = v->dev->vq_index_end % 2;
422
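/* If the device has a CVQ, the rings are enabled later from the CVQ client's load */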
423 if (has_cvq) {
424 return 0;
425 }
426
427 for (int i = 0; i < v->dev->nvqs; ++i) {
428 int ret = vhost_vdpa_set_vring_ready(v, i + v->dev->vq_index);
429 if (ret < 0) {
430 return ret;
431 }
432 }
433 return 0;
434 }
435
436 static void vhost_vdpa_net_client_stop(NetClientState *nc)
437 {
438 VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
439
440 assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
441
442 if (s->vhost_vdpa.index == 0) {
443 migration_remove_notifier(&s->migration_state);
444 }
445 }
446
447 static NetClientInfo net_vhost_vdpa_info = {
448 .type = NET_CLIENT_DRIVER_VHOST_VDPA,
449 .size = sizeof(VhostVDPAState),
450 .receive = vhost_vdpa_receive,
451 .start = vhost_vdpa_net_data_start,
452 .load = vhost_vdpa_net_data_load,
453 .stop = vhost_vdpa_net_client_stop,
454 .cleanup = vhost_vdpa_cleanup,
455 .has_vnet_hdr = vhost_vdpa_has_vnet_hdr,
456 .get_vnet_hash_supported_types = vhost_vdpa_get_vnet_hash_supported_types,
457 .has_ufo = vhost_vdpa_has_ufo,
458 .set_vnet_le = vhost_vdpa_set_vnet_le,
459 .check_peer_type = vhost_vdpa_check_peer_type,
460 .get_vhost_net = vhost_vdpa_get_vhost_net,
461 };
462
463 static int64_t vhost_vdpa_get_vring_group(int device_fd, unsigned vq_index,
464 Error **errp)
465 {
466 struct vhost_vring_state state = {
467 .index = vq_index,
468 };
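/* Ask the device which virtqueue group this vq index belongs to */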
469 int r = ioctl(device_fd, VHOST_VDPA_GET_VRING_GROUP, &state);
470
471 if (unlikely(r < 0)) {
472 r = -errno;
473 error_setg_errno(errp, errno, "Cannot get VQ %u group", vq_index);
474 return r;
475 }
476
477 return state.num;
478 }
479
480 static int vhost_vdpa_set_address_space_id(struct vhost_vdpa *v,
481 unsigned vq_group,
482 unsigned asid_num)
483 {
484 struct vhost_vring_state asid = {
485 .index = vq_group,
486 .num = asid_num,
487 };
488 int r;
489
490 trace_vhost_vdpa_set_address_space_id(v, vq_group, asid_num);
491
492 r = ioctl(v->shared->device_fd, VHOST_VDPA_SET_GROUP_ASID, &asid);
493 if (unlikely(r < 0)) {
494 error_report("Can't set vq group %u asid %u, errno=%d (%s)",
495 asid.index, asid.num, errno, g_strerror(errno));
496 }
497 return r;
498 }
499
500 static void vhost_vdpa_cvq_unmap_buf(struct vhost_vdpa *v, void *addr)
501 {
502 VhostIOVATree *tree = v->shared->iova_tree;
503 DMAMap needle = {
504 /*
505 * No need to specify size or to look for more translations since
506 * this contiguous chunk was allocated by us.
507 */
508 .translated_addr = (hwaddr)(uintptr_t)addr,
509 };
510 const DMAMap *map = vhost_iova_tree_find_iova(tree, &needle);
511 int r;
512
513 if (unlikely(!map)) {
514 error_report("Cannot locate expected map");
515 return;
516 }
517
518 r = vhost_vdpa_dma_unmap(v->shared, v->address_space_id, map->iova,
519 map->size + 1);
520 if (unlikely(r != 0)) {
521 error_report("Device cannot unmap: %s(%d)", g_strerror(r), r);
522 }
523
524 vhost_iova_tree_remove(tree, *map);
525 }
526
527 /** Map CVQ buffer. */
528 static int vhost_vdpa_cvq_map_buf(struct vhost_vdpa *v, void *buf, size_t size,
529 bool write)
530 {
531 DMAMap map = {};
532 hwaddr taddr = (hwaddr)(uintptr_t)buf;
533 int r;
534
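/* The IOVA tree stores inclusive ranges, hence the size - 1 */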
535 map.size = size - 1;
536 map.perm = write ? IOMMU_RW : IOMMU_RO;
537 r = vhost_iova_tree_map_alloc(v->shared->iova_tree, &map, taddr);
538 if (unlikely(r != IOVA_OK)) {
539 error_report("Cannot map injected element");
540
541 if (map.translated_addr == taddr) {
542 error_report("Insertion to IOVA->HVA tree failed");
543 /* Remove the mapping from the IOVA-only tree */
544 goto dma_map_err;
545 }
546 return r;
547 }
548
549 r = vhost_vdpa_dma_map(v->shared, v->address_space_id, map.iova,
550 vhost_vdpa_net_cvq_cmd_page_len(), buf, !write);
551 if (unlikely(r < 0)) {
552 goto dma_map_err;
553 }
554
555 return 0;
556
557 dma_map_err:
558 vhost_iova_tree_remove(v->shared->iova_tree, map);
559 return r;
560 }
561
562 static int vhost_vdpa_net_cvq_start(NetClientState *nc)
563 {
564 VhostVDPAState *s, *s0;
565 struct vhost_vdpa *v;
566 int64_t cvq_group;
567 int r;
568 Error *err = NULL;
569
570 assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
571
572 s = DO_UPCAST(VhostVDPAState, nc, nc);
573 v = &s->vhost_vdpa;
574
575 s0 = vhost_vdpa_net_first_nc_vdpa(s);
576 v->shadow_vqs_enabled = s0->vhost_vdpa.shadow_vqs_enabled;
577 s->vhost_vdpa.address_space_id = VHOST_VDPA_GUEST_PA_ASID;
578
579 if (v->shared->shadow_data) {
580 /* SVQ is already configured for all virtqueues */
581 goto out;
582 }
583
584 /*
585 * If we return early in these cases, SVQ will not be enabled. Migration
586 * will be blocked as long as the vhost-vdpa backend does not offer _F_LOG.
587 */
588 if (!vhost_vdpa_net_valid_svq_features(v->dev->features, NULL)) {
589 return 0;
590 }
591
592 if (!s->cvq_isolated) {
593 return 0;
594 }
595
596 cvq_group = vhost_vdpa_get_vring_group(v->shared->device_fd,
597 v->dev->vq_index_end - 1,
598 &err);
599 if (unlikely(cvq_group < 0)) {
600 error_report_err(err);
601 return cvq_group;
602 }
603
604 r = vhost_vdpa_set_address_space_id(v, cvq_group, VHOST_VDPA_NET_CVQ_ASID);
605 if (unlikely(r < 0)) {
606 return r;
607 }
608
609 v->shadow_vqs_enabled = true;
610 s->vhost_vdpa.address_space_id = VHOST_VDPA_NET_CVQ_ASID;
611
612 out:
613 if (!s->vhost_vdpa.shadow_vqs_enabled) {
614 return 0;
615 }
616
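/* Map the shadow CVQ out and status buffers so SVQ can forward control commands */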
617 r = vhost_vdpa_cvq_map_buf(&s->vhost_vdpa, s->cvq_cmd_out_buffer,
618 vhost_vdpa_net_cvq_cmd_page_len(), false);
619 if (unlikely(r < 0)) {
620 return r;
621 }
622
623 r = vhost_vdpa_cvq_map_buf(&s->vhost_vdpa, s->status,
624 vhost_vdpa_net_cvq_cmd_page_len(), true);
625 if (unlikely(r < 0)) {
626 vhost_vdpa_cvq_unmap_buf(&s->vhost_vdpa, s->cvq_cmd_out_buffer);
627 }
628
629 return r;
630 }
631
632 static void vhost_vdpa_net_cvq_stop(NetClientState *nc)
633 {
634 VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
635
636 assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
637
638 if (s->vhost_vdpa.shadow_vqs_enabled) {
639 vhost_vdpa_cvq_unmap_buf(&s->vhost_vdpa, s->cvq_cmd_out_buffer);
640 vhost_vdpa_cvq_unmap_buf(&s->vhost_vdpa, s->status);
641 }
642
643 vhost_vdpa_net_client_stop(nc);
644 }
645
646 static ssize_t vhost_vdpa_net_cvq_add(VhostVDPAState *s,
647 const struct iovec *out_sg, size_t out_num,
648 const struct iovec *in_sg, size_t in_num)
649 {
650 VhostShadowVirtqueue *svq = g_ptr_array_index(s->vhost_vdpa.shadow_vqs, 0);
651 int r;
652
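/* Add the command to the shadow vq; the caller polls for its completion */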
653 r = vhost_svq_add(svq, out_sg, out_num, NULL, in_sg, in_num, NULL, NULL);
654 if (unlikely(r != 0)) {
655 if (unlikely(r == -ENOSPC)) {
656 qemu_log_mask(LOG_GUEST_ERROR, "%s: No space on device queue\n",
657 __func__);
658 }
659 }
660
661 return r;
662 }
663
664 /*
665 * Convenience wrapper to poll SVQ for multiple control commands.
666 *
667 * The caller should hold the BQL when invoking this function, and should
668 * take the answer before SVQ polls it by itself once the BQL is released.
669 */
670 static ssize_t vhost_vdpa_net_svq_poll(VhostVDPAState *s, size_t cmds_in_flight)
671 {
672 VhostShadowVirtqueue *svq = g_ptr_array_index(s->vhost_vdpa.shadow_vqs, 0);
673 return vhost_svq_poll(svq, cmds_in_flight);
674 }
675
676 static void vhost_vdpa_net_load_cursor_reset(VhostVDPAState *s,
677 struct iovec *out_cursor,
678 struct iovec *in_cursor)
679 {
680 /* reset the cursor of the output buffer for the device */
681 out_cursor->iov_base = s->cvq_cmd_out_buffer;
682 out_cursor->iov_len = vhost_vdpa_net_cvq_cmd_page_len();
683
684 /* reset the cursor of the in buffer for the device */
685 in_cursor->iov_base = s->status;
686 in_cursor->iov_len = vhost_vdpa_net_cvq_cmd_page_len();
687 }
688
689 /*
690 * Poll SVQ for multiple pending control commands and check the device's ack.
691 *
692 * Caller should hold the BQL when invoking this function.
693 *
694 * @s: The VhostVDPAState
695 * @len: The length of the pending status shadow buffer
696 */
697 static ssize_t vhost_vdpa_net_svq_flush(VhostVDPAState *s, size_t len)
698 {
699 /* device uses a one-byte length ack for each control command */
700 ssize_t dev_written = vhost_vdpa_net_svq_poll(s, len);
701 if (unlikely(dev_written != len)) {
702 return -EIO;
703 }
704
705 /* check the device's ack */
706 for (int i = 0; i < len; ++i) {
707 if (s->status[i] != VIRTIO_NET_OK) {
708 return -EIO;
709 }
710 }
711 return 0;
712 }
713
714 static ssize_t vhost_vdpa_net_load_cmd(VhostVDPAState *s,
715 struct iovec *out_cursor,
716 struct iovec *in_cursor, uint8_t class,
717 uint8_t cmd, const struct iovec *data_sg,
718 size_t data_num)
719 {
720 const struct virtio_net_ctrl_hdr ctrl = {
721 .class = class,
722 .cmd = cmd,
723 };
724 size_t data_size = iov_size(data_sg, data_num), cmd_size;
725 struct iovec out, in;
726 ssize_t r;
727 unsigned dummy_cursor_iov_cnt;
728 VhostShadowVirtqueue *svq = g_ptr_array_index(s->vhost_vdpa.shadow_vqs, 0);
729
730 assert(data_size < vhost_vdpa_net_cvq_cmd_page_len() - sizeof(ctrl));
731 cmd_size = sizeof(ctrl) + data_size;
732 trace_vhost_vdpa_net_load_cmd(s, class, cmd, data_num, data_size);
733 if (vhost_svq_available_slots(svq) < 2 ||
734 iov_size(out_cursor, 1) < cmd_size) {
735 /*
736 * It is time to flush all pending control commands if SVQ is full
737 * or the control command shadow buffers are full.
738 *
739 * We can poll here since we've had BQL from the time
740 * we sent the descriptor.
741 */
742 r = vhost_vdpa_net_svq_flush(s, in_cursor->iov_base -
743 (void *)s->status);
744 if (unlikely(r < 0)) {
745 return r;
746 }
747
748 vhost_vdpa_net_load_cursor_reset(s, out_cursor, in_cursor);
749 }
750
751 /* pack the CVQ command header */
752 iov_from_buf(out_cursor, 1, 0, &ctrl, sizeof(ctrl));
753 /* pack the CVQ command's command-specific data */
754 iov_to_buf(data_sg, data_num, 0,
755 out_cursor->iov_base + sizeof(ctrl), data_size);
756
757 /* extract the required buffer from the cursor for output */
758 iov_copy(&out, 1, out_cursor, 1, 0, cmd_size);
759 /* extract the required buffer from the cursor for input */
760 iov_copy(&in, 1, in_cursor, 1, 0, sizeof(*s->status));
761
762 r = vhost_vdpa_net_cvq_add(s, &out, 1, &in, 1);
763 if (unlikely(r < 0)) {
764 trace_vhost_vdpa_net_load_cmd_retval(s, class, cmd, r);
765 return r;
766 }
767
768 /* iterate the cursors */
769 dummy_cursor_iov_cnt = 1;
770 iov_discard_front(&out_cursor, &dummy_cursor_iov_cnt, cmd_size);
771 dummy_cursor_iov_cnt = 1;
772 iov_discard_front(&in_cursor, &dummy_cursor_iov_cnt, sizeof(*s->status));
773
774 return 0;
775 }
776
777 static int vhost_vdpa_net_load_mac(VhostVDPAState *s, const VirtIONet *n,
778 struct iovec *out_cursor,
779 struct iovec *in_cursor)
780 {
781 if (virtio_vdev_has_feature(&n->parent_obj, VIRTIO_NET_F_CTRL_MAC_ADDR)) {
782 const struct iovec data = {
783 .iov_base = (void *)n->mac,
784 .iov_len = sizeof(n->mac),
785 };
786 ssize_t r = vhost_vdpa_net_load_cmd(s, out_cursor, in_cursor,
787 VIRTIO_NET_CTRL_MAC,
788 VIRTIO_NET_CTRL_MAC_ADDR_SET,
789 &data, 1);
790 if (unlikely(r < 0)) {
791 return r;
792 }
793 }
794
795 /*
796 * According to VirtIO standard, "The device MUST have an
797 * empty MAC filtering table on reset.".
798 *
799 * Therefore, there is no need to send this CVQ command if the
800 * driver also sets an empty MAC filter table, which aligns with
801 * the device's defaults.
802 *
803 * Note that the device's defaults can mismatch the driver's
804 * configuration only at live migration.
805 */
806 if (!virtio_vdev_has_feature(&n->parent_obj, VIRTIO_NET_F_CTRL_RX) ||
807 n->mac_table.in_use == 0) {
808 return 0;
809 }
810
811 uint32_t uni_entries = n->mac_table.first_multi,
812 uni_macs_size = uni_entries * ETH_ALEN,
813 mul_entries = n->mac_table.in_use - uni_entries,
814 mul_macs_size = mul_entries * ETH_ALEN;
815 struct virtio_net_ctrl_mac uni = {
816 .entries = cpu_to_le32(uni_entries),
817 };
818 struct virtio_net_ctrl_mac mul = {
819 .entries = cpu_to_le32(mul_entries),
820 };
821 const struct iovec data[] = {
822 {
823 .iov_base = &uni,
824 .iov_len = sizeof(uni),
825 }, {
826 .iov_base = n->mac_table.macs,
827 .iov_len = uni_macs_size,
828 }, {
829 .iov_base = &mul,
830 .iov_len = sizeof(mul),
831 }, {
832 .iov_base = &n->mac_table.macs[uni_macs_size],
833 .iov_len = mul_macs_size,
834 },
835 };
836 ssize_t r = vhost_vdpa_net_load_cmd(s, out_cursor, in_cursor,
837 VIRTIO_NET_CTRL_MAC,
838 VIRTIO_NET_CTRL_MAC_TABLE_SET,
839 data, ARRAY_SIZE(data));
840 if (unlikely(r < 0)) {
841 return r;
842 }
843
844 return 0;
845 }
846
847 static int vhost_vdpa_net_load_rss(VhostVDPAState *s, const VirtIONet *n,
848 struct iovec *out_cursor,
849 struct iovec *in_cursor, bool do_rss)
850 {
851 struct virtio_net_rss_config cfg = {};
852 ssize_t r;
853 g_autofree uint16_t *table = NULL;
854
855 /*
856 * According to VirtIO standard, "Initially the device has all hash
857 * types disabled and reports only VIRTIO_NET_HASH_REPORT_NONE.".
858 *
859 * Therefore, there is no need to send this CVQ command if the
860 * driver disables all hash types, which aligns with
861 * the device's defaults.
862 *
863 * Note that the device's defaults can mismatch the driver's
864 * configuration only at live migration.
865 */
866 if (!n->rss_data.enabled ||
867 n->rss_data.runtime_hash_types == VIRTIO_NET_HASH_REPORT_NONE) {
868 return 0;
869 }
870
871 table = g_malloc_n(n->rss_data.indirections_len,
872 sizeof(n->rss_data.indirections_table[0]));
873 cfg.hash_types = cpu_to_le32(n->rss_data.runtime_hash_types);
874
875 if (do_rss) {
876 /*
877 * According to VirtIO standard, "Number of entries in indirection_table
878 * is (indirection_table_mask + 1)".
879 */
880 cfg.indirection_table_mask = cpu_to_le16(n->rss_data.indirections_len -
881 1);
882 cfg.unclassified_queue = cpu_to_le16(n->rss_data.default_queue);
883 for (int i = 0; i < n->rss_data.indirections_len; ++i) {
884 table[i] = cpu_to_le16(n->rss_data.indirections_table[i]);
885 }
886 cfg.max_tx_vq = cpu_to_le16(n->curr_queue_pairs);
887 } else {
888 /*
889 * According to VirtIO standard, "Field reserved MUST contain zeroes.
890 * It is defined to make the structure to match the layout of
891 * virtio_net_rss_config structure, defined in 5.1.6.5.7.".
892 *
893 * Therefore, we need to zero the fields in
894 * struct virtio_net_rss_config, which correspond to the
895 * `reserved` field in struct virtio_net_hash_config.
896 *
897 * Note that all other fields are zeroed at their definitions,
898 * except for the `indirection_table` field, where the actual data
899 * is stored in the `table` variable to ensure compatibility
900 * with the RSS case. Therefore, we need to zero the `table` variable here.
901 */
902 table[0] = 0;
903 }
904
905 /*
906 * Considering that virtio_net_handle_rss() currently does not restore
907 * the hash key length parsed from the CVQ command sent from the guest
908 * into n->rss_data and uses the maximum key length in other code,
909 * we also employ the maximum key length here.
910 */
911 cfg.hash_key_length = sizeof(n->rss_data.key);
912
913 const struct iovec data[] = {
914 {
915 .iov_base = &cfg,
916 .iov_len = offsetof(struct virtio_net_rss_config,
917 indirection_table),
918 }, {
919 .iov_base = table,
920 .iov_len = n->rss_data.indirections_len *
921 sizeof(n->rss_data.indirections_table[0]),
922 }, {
923 .iov_base = &cfg.max_tx_vq,
924 .iov_len = offsetof(struct virtio_net_rss_config, hash_key_data) -
925 offsetof(struct virtio_net_rss_config, max_tx_vq),
926 }, {
927 .iov_base = (void *)n->rss_data.key,
928 .iov_len = sizeof(n->rss_data.key),
929 }
930 };
931
932 r = vhost_vdpa_net_load_cmd(s, out_cursor, in_cursor,
933 VIRTIO_NET_CTRL_MQ,
934 do_rss ? VIRTIO_NET_CTRL_MQ_RSS_CONFIG :
935 VIRTIO_NET_CTRL_MQ_HASH_CONFIG,
936 data, ARRAY_SIZE(data));
937 if (unlikely(r < 0)) {
938 return r;
939 }
940
941 return 0;
942 }
943
944 static int vhost_vdpa_net_load_mq(VhostVDPAState *s,
945 const VirtIONet *n,
946 struct iovec *out_cursor,
947 struct iovec *in_cursor)
948 {
949 struct virtio_net_ctrl_mq mq;
950 ssize_t r;
951
952 if (!virtio_vdev_has_feature(&n->parent_obj, VIRTIO_NET_F_MQ)) {
953 return 0;
954 }
955
956 trace_vhost_vdpa_net_load_mq(s, n->curr_queue_pairs);
957
958 mq.virtqueue_pairs = cpu_to_le16(n->curr_queue_pairs);
959 const struct iovec data = {
960 .iov_base = &mq,
961 .iov_len = sizeof(mq),
962 };
963 r = vhost_vdpa_net_load_cmd(s, out_cursor, in_cursor,
964 VIRTIO_NET_CTRL_MQ,
965 VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET,
966 &data, 1);
967 if (unlikely(r < 0)) {
968 return r;
969 }
970
971 if (virtio_vdev_has_feature(&n->parent_obj, VIRTIO_NET_F_RSS)) {
972 /* load the receive-side scaling state */
973 r = vhost_vdpa_net_load_rss(s, n, out_cursor, in_cursor, true);
974 if (unlikely(r < 0)) {
975 return r;
976 }
977 } else if (virtio_vdev_has_feature(&n->parent_obj,
978 VIRTIO_NET_F_HASH_REPORT)) {
979 /* load the hash calculation state */
980 r = vhost_vdpa_net_load_rss(s, n, out_cursor, in_cursor, false);
981 if (unlikely(r < 0)) {
982 return r;
983 }
984 }
985
986 return 0;
987 }
988
989 static int vhost_vdpa_net_load_offloads(VhostVDPAState *s,
990 const VirtIONet *n,
991 struct iovec *out_cursor,
992 struct iovec *in_cursor)
993 {
994 uint64_t offloads;
995 ssize_t r;
996
997 if (!virtio_vdev_has_feature(&n->parent_obj,
998 VIRTIO_NET_F_CTRL_GUEST_OFFLOADS)) {
999 return 0;
1000 }
1001
1002 if (n->curr_guest_offloads == virtio_net_supported_guest_offloads(n)) {
1003 /*
1004 * According to VirtIO standard, "Upon feature negotiation
1005 * corresponding offload gets enabled to preserve
1006 * backward compatibility.".
1007 *
1008 * Therefore, there is no need to send this CVQ command if the
1009 * driver also enables all supported offloads, which aligns with
1010 * the device's defaults.
1011 *
1012 * Note that the device's defaults can mismatch the driver's
1013 * configuration only at live migration.
1014 */
1015 return 0;
1016 }
1017
1018 offloads = cpu_to_le64(n->curr_guest_offloads);
1019 const struct iovec data = {
1020 .iov_base = &offloads,
1021 .iov_len = sizeof(offloads),
1022 };
1023 r = vhost_vdpa_net_load_cmd(s, out_cursor, in_cursor,
1024 VIRTIO_NET_CTRL_GUEST_OFFLOADS,
1025 VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET,
1026 &data, 1);
1027 if (unlikely(r < 0)) {
1028 return r;
1029 }
1030
1031 return 0;
1032 }
1033
1034 static int vhost_vdpa_net_load_rx_mode(VhostVDPAState *s,
1035 struct iovec *out_cursor,
1036 struct iovec *in_cursor,
1037 uint8_t cmd,
1038 uint8_t on)
1039 {
1040 const struct iovec data = {
1041 .iov_base = &on,
1042 .iov_len = sizeof(on),
1043 };
1044 ssize_t r;
1045
1046 r = vhost_vdpa_net_load_cmd(s, out_cursor, in_cursor,
1047 VIRTIO_NET_CTRL_RX, cmd, &data, 1);
1048 if (unlikely(r < 0)) {
1049 return r;
1050 }
1051
1052 return 0;
1053 }
1054
1055 static int vhost_vdpa_net_load_rx(VhostVDPAState *s,
1056 const VirtIONet *n,
1057 struct iovec *out_cursor,
1058 struct iovec *in_cursor)
1059 {
1060 ssize_t r;
1061
1062 if (!virtio_vdev_has_feature(&n->parent_obj, VIRTIO_NET_F_CTRL_RX)) {
1063 return 0;
1064 }
1065
1066 /*
1067 * According to virtio_net_reset(), device turns promiscuous mode
1068 * on by default.
1069 *
1070 * Additionally, according to VirtIO standard, "Since there are
1071 * no guarantees, it can use a hash filter or silently switch to
1072 * allmulti or promiscuous mode if it is given too many addresses.".
1073 * QEMU marks `n->mac_table.uni_overflow` if guest sets too many
1074 * non-multicast MAC addresses, indicating that promiscuous mode
1075 * should be enabled.
1076 *
1077 * Therefore, QEMU should only send this CVQ command if the
1078 * `n->mac_table.uni_overflow` is not marked and `n->promisc` is off,
1079 * which sets promiscuous mode on, different from the device's defaults.
1080 *
1081 * Note that the device's defaults can mismatch the driver's
1082 * configuration only at live migration.
1083 */
1084 if (!n->mac_table.uni_overflow && !n->promisc) {
1085 r = vhost_vdpa_net_load_rx_mode(s, out_cursor, in_cursor,
1086 VIRTIO_NET_CTRL_RX_PROMISC, 0);
1087 if (unlikely(r < 0)) {
1088 return r;
1089 }
1090 }
1091
1092 /*
1093 * According to virtio_net_reset(), device turns all-multicast mode
1094 * off by default.
1095 *
1096 * According to VirtIO standard, "Since there are no guarantees,
1097 * it can use a hash filter or silently switch to allmulti or
1098 * promiscuous mode if it is given too many addresses.". QEMU marks
1099 * `n->mac_table.multi_overflow` if guest sets too many
1100 * non-multicast MAC addresses.
1101 *
1102 * Therefore, QEMU should only send this CVQ command if the
1103 * `n->mac_table.multi_overflow` is marked or `n->allmulti` is on,
1104 * which sets all-multicast mode on, different from the device's defaults.
1105 *
1106 * Note that the device's defaults can mismatch the driver's
1107 * configuration only at live migration.
1108 */
1109 if (n->mac_table.multi_overflow || n->allmulti) {
1110 r = vhost_vdpa_net_load_rx_mode(s, out_cursor, in_cursor,
1111 VIRTIO_NET_CTRL_RX_ALLMULTI, 1);
1112 if (unlikely(r < 0)) {
1113 return r;
1114 }
1115 }
1116
1117 if (!virtio_vdev_has_feature(&n->parent_obj, VIRTIO_NET_F_CTRL_RX_EXTRA)) {
1118 return 0;
1119 }
1120
1121 /*
1122 * According to virtio_net_reset(), device turns all-unicast mode
1123 * off by default.
1124 *
1125 * Therefore, QEMU should only send this CVQ command if the driver
1126 * sets all-unicast mode on, different from the device's defaults.
1127 *
1128 * Note that the device's defaults can mismatch the driver's
1129 * configuration only at live migration.
1130 */
1131 if (n->alluni) {
1132 r = vhost_vdpa_net_load_rx_mode(s, out_cursor, in_cursor,
1133 VIRTIO_NET_CTRL_RX_ALLUNI, 1);
1134 if (r < 0) {
1135 return r;
1136 }
1137 }
1138
1139 /*
1140 * According to virtio_net_reset(), device turns non-multicast mode
1141 * off by default.
1142 *
1143 * Therefore, QEMU should only send this CVQ command if the driver
1144 * sets non-multicast mode on, different from the device's defaults.
1145 *
1146 * Note that the device's defaults can mismatch the driver's
1147 * configuration only at live migration.
1148 */
1149 if (n->nomulti) {
1150 r = vhost_vdpa_net_load_rx_mode(s, out_cursor, in_cursor,
1151 VIRTIO_NET_CTRL_RX_NOMULTI, 1);
1152 if (r < 0) {
1153 return r;
1154 }
1155 }
1156
1157 /*
1158 * According to virtio_net_reset(), device turns non-unicast mode
1159 * off by default.
1160 *
1161 * Therefore, QEMU should only send this CVQ command if the driver
1162 * sets non-unicast mode on, different from the device's defaults.
1163 *
1164 * Note that the device's defaults can mismatch the driver's
1165 * configuration only at live migration.
1166 */
1167 if (n->nouni) {
1168 r = vhost_vdpa_net_load_rx_mode(s, out_cursor, in_cursor,
1169 VIRTIO_NET_CTRL_RX_NOUNI, 1);
1170 if (r < 0) {
1171 return r;
1172 }
1173 }
1174
1175 /*
1176 * According to virtio_net_reset(), device turns non-broadcast mode
1177 * off by default.
1178 *
1179 * Therefore, QEMU should only send this CVQ command if the driver
1180 * sets non-broadcast mode on, different from the device's defaults.
1181 *
1182 * Note that the device's defaults can mismatch the driver's
1183 * configuration only at live migration.
1184 */
1185 if (n->nobcast) {
1186 r = vhost_vdpa_net_load_rx_mode(s, out_cursor, in_cursor,
1187 VIRTIO_NET_CTRL_RX_NOBCAST, 1);
1188 if (r < 0) {
1189 return r;
1190 }
1191 }
1192
1193 return 0;
1194 }
1195
1196 static int vhost_vdpa_net_load_single_vlan(VhostVDPAState *s,
1197 const VirtIONet *n,
1198 struct iovec *out_cursor,
1199 struct iovec *in_cursor,
1200 uint16_t vid)
1201 {
1202 const struct iovec data = {
1203 .iov_base = &vid,
1204 .iov_len = sizeof(vid),
1205 };
1206 ssize_t r = vhost_vdpa_net_load_cmd(s, out_cursor, in_cursor,
1207 VIRTIO_NET_CTRL_VLAN,
1208 VIRTIO_NET_CTRL_VLAN_ADD,
1209 &data, 1);
1210 if (unlikely(r < 0)) {
1211 return r;
1212 }
1213
1214 return 0;
1215 }
1216
1217 static int vhost_vdpa_net_load_vlan(VhostVDPAState *s,
1218 const VirtIONet *n,
1219 struct iovec *out_cursor,
1220 struct iovec *in_cursor)
1221 {
1222 int r;
1223
1224 if (!virtio_vdev_has_feature(&n->parent_obj, VIRTIO_NET_F_CTRL_VLAN)) {
1225 return 0;
1226 }
1227
1228 for (int i = 0; i < MAX_VLAN >> 5; i++) {
1229 for (int j = 0; n->vlans[i] && j <= 0x1f; j++) {
1230 if (n->vlans[i] & (1U << j)) {
1231 r = vhost_vdpa_net_load_single_vlan(s, n, out_cursor,
1232 in_cursor, (i << 5) + j);
1233 if (unlikely(r != 0)) {
1234 return r;
1235 }
1236 }
1237 }
1238 }
1239
1240 return 0;
1241 }
1242
1243 static int vhost_vdpa_net_cvq_load(NetClientState *nc)
1244 {
1245 VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
1246 struct vhost_vdpa *v = &s->vhost_vdpa;
1247 const VirtIONet *n;
1248 int r;
1249 struct iovec out_cursor, in_cursor;
1250
1251 assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
1252
1253 r = vhost_vdpa_set_vring_ready(v, v->dev->vq_index);
1254 if (unlikely(r < 0)) {
1255 return r;
1256 }
1257
1258 if (v->shadow_vqs_enabled) {
1259 n = VIRTIO_NET(v->dev->vdev);
1260 vhost_vdpa_net_load_cursor_reset(s, &out_cursor, &in_cursor);
1261 r = vhost_vdpa_net_load_mac(s, n, &out_cursor, &in_cursor);
1262 if (unlikely(r < 0)) {
1263 return r;
1264 }
1265 r = vhost_vdpa_net_load_mq(s, n, &out_cursor, &in_cursor);
1266 if (unlikely(r)) {
1267 return r;
1268 }
1269 r = vhost_vdpa_net_load_offloads(s, n, &out_cursor, &in_cursor);
1270 if (unlikely(r)) {
1271 return r;
1272 }
1273 r = vhost_vdpa_net_load_rx(s, n, &out_cursor, &in_cursor);
1274 if (unlikely(r)) {
1275 return r;
1276 }
1277 r = vhost_vdpa_net_load_vlan(s, n, &out_cursor, &in_cursor);
1278 if (unlikely(r)) {
1279 return r;
1280 }
1281
1282 /*
1283 * We need to poll and check all the device's pending used buffers.
1284 *
1285 * We can poll here since we've had BQL from the time
1286 * we sent the descriptor.
1287 */
1288 r = vhost_vdpa_net_svq_flush(s, in_cursor.iov_base - (void *)s->status);
1289 if (unlikely(r)) {
1290 return r;
1291 }
1292 }
1293
1294 for (int i = 0; i < v->dev->vq_index; ++i) {
1295 r = vhost_vdpa_set_vring_ready(v, i);
1296 if (unlikely(r < 0)) {
1297 return r;
1298 }
1299 }
1300
1301 return 0;
1302 }
1303
1304 static NetClientInfo net_vhost_vdpa_cvq_info = {
1305 .type = NET_CLIENT_DRIVER_VHOST_VDPA,
1306 .size = sizeof(VhostVDPAState),
1307 .receive = vhost_vdpa_receive,
1308 .start = vhost_vdpa_net_cvq_start,
1309 .load = vhost_vdpa_net_cvq_load,
1310 .stop = vhost_vdpa_net_cvq_stop,
1311 .cleanup = vhost_vdpa_cleanup,
1312 .has_vnet_hdr = vhost_vdpa_has_vnet_hdr,
1313 .get_vnet_hash_supported_types = vhost_vdpa_get_vnet_hash_supported_types,
1314 .has_ufo = vhost_vdpa_has_ufo,
1315 .check_peer_type = vhost_vdpa_check_peer_type,
1316 .get_vhost_net = vhost_vdpa_get_vhost_net,
1317 };
1318
1319 /*
1320 * Forward the excessive VIRTIO_NET_CTRL_MAC_TABLE_SET CVQ command to
1321 * vdpa device.
1322 *
1323 * Considering that QEMU cannot send the entire filter table to the
1324 * vdpa device, it should send the VIRTIO_NET_CTRL_RX_PROMISC CVQ
1325 * command to enable promiscuous mode to receive all packets,
1326 * according to VirtIO standard, "Since there are no guarantees,
1327 * it can use a hash filter or silently switch to allmulti or
1328 * promiscuous mode if it is given too many addresses.".
1329 *
1330 * Since QEMU ignores MAC addresses beyond `MAC_TABLE_ENTRIES` and
1331 * marks `n->mac_table.x_overflow` accordingly, it should have
1332 * the same effect on the device model to receive
1333 * (`MAC_TABLE_ENTRIES` + 1) or more non-multicast MAC addresses.
1334 * The same applies to multicast MAC addresses.
1335 *
1336 * Therefore, QEMU can provide the device model with a fake
1337 * VIRTIO_NET_CTRL_MAC_TABLE_SET command with (`MAC_TABLE_ENTRIES` + 1)
1338 * non-multicast MAC addresses and (`MAC_TABLE_ENTRIES` + 1) multicast
1339 * MAC addresses. This ensures that the device model marks
1340 * `n->mac_table.uni_overflow` and `n->mac_table.multi_overflow`,
1341 * allowing all packets to be received, which aligns with the
1342 * state of the vdpa device.
1343 */
1344 static int vhost_vdpa_net_excessive_mac_filter_cvq_add(VhostVDPAState *s,
1345 VirtQueueElement *elem,
1346 struct iovec *out,
1347 const struct iovec *in)
1348 {
1349 struct virtio_net_ctrl_mac mac_data, *mac_ptr;
1350 struct virtio_net_ctrl_hdr *hdr_ptr;
1351 uint32_t cursor;
1352 ssize_t r;
1353 uint8_t on = 1;
1354
1355 /* parse the non-multicast MAC address entries from CVQ command */
1356 cursor = sizeof(*hdr_ptr);
1357 r = iov_to_buf(elem->out_sg, elem->out_num, cursor,
1358 &mac_data, sizeof(mac_data));
1359 if (unlikely(r != sizeof(mac_data))) {
1360 /*
1361 * If the CVQ command is invalid, we should simulate the vdpa device
1362 * rejecting the VIRTIO_NET_CTRL_MAC_TABLE_SET CVQ command
1363 */
1364 *s->status = VIRTIO_NET_ERR;
1365 return sizeof(*s->status);
1366 }
1367 cursor += sizeof(mac_data) + le32_to_cpu(mac_data.entries) * ETH_ALEN;
1368
1369 /* parse the multicast MAC address entries from CVQ command */
1370 r = iov_to_buf(elem->out_sg, elem->out_num, cursor,
1371 &mac_data, sizeof(mac_data));
1372 if (r != sizeof(mac_data)) {
1373 /*
1374 * If the CVQ command is invalid, we should simulate the vdpa device
1375 * rejecting the VIRTIO_NET_CTRL_MAC_TABLE_SET CVQ command
1376 */
1377 *s->status = VIRTIO_NET_ERR;
1378 return sizeof(*s->status);
1379 }
1380 cursor += sizeof(mac_data) + le32_to_cpu(mac_data.entries) * ETH_ALEN;
1381
1382 /* validate the CVQ command */
1383 if (iov_size(elem->out_sg, elem->out_num) != cursor) {
1384 /*
1385 * If the CVQ command is invalid, we should simulate the vdpa device
1386 * rejecting the VIRTIO_NET_CTRL_MAC_TABLE_SET CVQ command
1387 */
1388 *s->status = VIRTIO_NET_ERR;
1389 return sizeof(*s->status);
1390 }
1391
1392 /*
1393 * According to VirtIO standard, "Since there are no guarantees,
1394 * it can use a hash filter or silently switch to allmulti or
1395 * promiscuous mode if it is given too many addresses.".
1396 *
1397 * Therefore, considering that QEMU is unable to send the entire
1398 * filter table to the vdpa device, it should send the
1399 * VIRTIO_NET_CTRL_RX_PROMISC CVQ command to enable promiscuous mode
1400 */
1401 hdr_ptr = out->iov_base;
1402 out->iov_len = sizeof(*hdr_ptr) + sizeof(on);
1403
1404 hdr_ptr->class = VIRTIO_NET_CTRL_RX;
1405 hdr_ptr->cmd = VIRTIO_NET_CTRL_RX_PROMISC;
1406 iov_from_buf(out, 1, sizeof(*hdr_ptr), &on, sizeof(on));
1407 r = vhost_vdpa_net_cvq_add(s, out, 1, in, 1);
1408 if (unlikely(r < 0)) {
1409 return r;
1410 }
1411
1412 /*
1413 * We can poll here since we've had BQL from the time
1414 * we sent the descriptor.
1415 */
1416 r = vhost_vdpa_net_svq_poll(s, 1);
1417 if (unlikely(r < sizeof(*s->status))) {
1418 return r;
1419 }
1420 if (*s->status != VIRTIO_NET_OK) {
1421 return sizeof(*s->status);
1422 }
1423
1424 /*
1425 * QEMU should also send a fake VIRTIO_NET_CTRL_MAC_TABLE_SET CVQ
1426 * command to the device model, including (`MAC_TABLE_ENTRIES` + 1)
1427 * non-multicast MAC addresses and (`MAC_TABLE_ENTRIES` + 1)
1428 * multicast MAC addresses.
1429 *
1430 * By doing so, the device model can mark `n->mac_table.uni_overflow`
1431 * and `n->mac_table.multi_overflow`, enabling all packets to be
1432 * received, which aligns with the state of the vdpa device.
1433 */
1434 cursor = 0;
1435 uint32_t fake_uni_entries = MAC_TABLE_ENTRIES + 1,
1436 fake_mul_entries = MAC_TABLE_ENTRIES + 1,
1437 fake_cvq_size = sizeof(struct virtio_net_ctrl_hdr) +
1438 sizeof(mac_data) + fake_uni_entries * ETH_ALEN +
1439 sizeof(mac_data) + fake_mul_entries * ETH_ALEN;
1440
1441 assert(fake_cvq_size < vhost_vdpa_net_cvq_cmd_page_len());
1442 out->iov_len = fake_cvq_size;
1443
1444 /* pack the header for fake CVQ command */
1445 hdr_ptr = out->iov_base + cursor;
1446 hdr_ptr->class = VIRTIO_NET_CTRL_MAC;
1447 hdr_ptr->cmd = VIRTIO_NET_CTRL_MAC_TABLE_SET;
1448 cursor += sizeof(*hdr_ptr);
1449
1450 /*
1451 * Pack the non-multicast MAC addresses part for fake CVQ command.
1452 *
1453 * According to virtio_net_handle_mac(), QEMU doesn't verify the MAC
1454 * addresses provided in the CVQ command. Therefore, only the entries
1455 * field needs to be prepared in the CVQ command.
1456 */
1457 mac_ptr = out->iov_base + cursor;
1458 mac_ptr->entries = cpu_to_le32(fake_uni_entries);
1459 cursor += sizeof(*mac_ptr) + fake_uni_entries * ETH_ALEN;
1460
1461 /*
1462 * Pack the multicast MAC addresses part for fake CVQ command.
1463 *
1464 * According to virtio_net_handle_mac(), QEMU doesn't verify the MAC
1465 * addresses provided in the CVQ command. Therefore, only the entries
1466 * field needs to be prepared in the CVQ command.
1467 */
1468 mac_ptr = out->iov_base + cursor;
1469 mac_ptr->entries = cpu_to_le32(fake_mul_entries);
1470
1471 /*
1472 * Simulate QEMU polling a vdpa device's used buffer
1473 * for the VIRTIO_NET_CTRL_MAC_TABLE_SET CVQ command
1474 */
1475 return sizeof(*s->status);
1476 }
1477
1478 /**
1479 * Validate and copy control virtqueue commands.
1480 *
1481 * Following QEMU guidelines, we offer a copy of the buffers to the device to
1482 * prevent TOCTOU bugs.
1483 */
1484 static int vhost_vdpa_net_handle_ctrl_avail(VhostShadowVirtqueue *svq,
1485 VirtQueueElement *elem,
1486 void *opaque)
1487 {
1488 VhostVDPAState *s = opaque;
1489 size_t in_len;
1490 const struct virtio_net_ctrl_hdr *ctrl;
1491 virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
1492 /* Out buffer sent to both the vdpa device and the device model */
1493 struct iovec out = {
1494 .iov_base = s->cvq_cmd_out_buffer,
1495 };
1496 /* in buffer used for device model */
1497 const struct iovec model_in = {
1498 .iov_base = &status,
1499 .iov_len = sizeof(status),
1500 };
1501 /* in buffer used for vdpa device */
1502 const struct iovec vdpa_in = {
1503 .iov_base = s->status,
1504 .iov_len = sizeof(*s->status),
1505 };
1506 ssize_t dev_written = -EINVAL;
1507
1508 out.iov_len = iov_to_buf(elem->out_sg, elem->out_num, 0,
1509 s->cvq_cmd_out_buffer,
1510 vhost_vdpa_net_cvq_cmd_page_len());
1511
1512 ctrl = s->cvq_cmd_out_buffer;
1513 if (ctrl->class == VIRTIO_NET_CTRL_ANNOUNCE) {
1514 /*
1515 * Guest announce capability is emulated by qemu, so don't forward to
1516 * the device.
1517 */
1518 dev_written = sizeof(status);
1519 *s->status = VIRTIO_NET_OK;
1520 } else if (unlikely(ctrl->class == VIRTIO_NET_CTRL_MAC &&
1521 ctrl->cmd == VIRTIO_NET_CTRL_MAC_TABLE_SET &&
1522 iov_size(elem->out_sg, elem->out_num) > out.iov_len)) {
1523 /*
1524 * Due to the size limitation of the out buffer sent to the vdpa device,
1525 * which is determined by vhost_vdpa_net_cvq_cmd_page_len(), excessive
1526 * MAC addresses set by the driver for the filter table can cause
1527 * truncation of the CVQ command in QEMU. As a result, the vdpa device
1528 * rejects the flawed CVQ command.
1529 *
1530 * Therefore, QEMU must handle this situation instead of sending
1531 * the CVQ command directly.
1532 */
1533 dev_written = vhost_vdpa_net_excessive_mac_filter_cvq_add(s, elem,
1534 &out, &vdpa_in);
1535 if (unlikely(dev_written < 0)) {
1536 goto out;
1537 }
1538 } else {
1539 ssize_t r;
1540 r = vhost_vdpa_net_cvq_add(s, &out, 1, &vdpa_in, 1);
1541 if (unlikely(r < 0)) {
1542 dev_written = r;
1543 goto out;
1544 }
1545
1546 /*
1547 * We can poll here since we've had BQL from the time
1548 * we sent the descriptor.
1549 */
1550 dev_written = vhost_vdpa_net_svq_poll(s, 1);
1551 }
1552
1553 if (unlikely(dev_written < sizeof(status))) {
1554 error_report("Insufficient written data (%zu)", dev_written);
1555 goto out;
1556 }
1557
1558 if (*s->status != VIRTIO_NET_OK) {
1559 goto out;
1560 }
1561
1562 status = VIRTIO_NET_ERR;
1563 virtio_net_handle_ctrl_iov(svq->vdev, &model_in, 1, &out, 1);
1564 if (status != VIRTIO_NET_OK) {
1565 error_report("Bad CVQ processing in model");
1566 }
1567
1568 out:
1569 in_len = iov_from_buf(elem->in_sg, elem->in_num, 0, &status,
1570 sizeof(status));
1571 if (unlikely(in_len < sizeof(status))) {
1572 error_report("Bad device CVQ written length");
1573 }
1574 vhost_svq_push_elem(svq, elem, MIN(in_len, sizeof(status)));
1575 /*
1576 * `elem` belongs to vhost_vdpa_net_handle_ctrl_avail() only when
1577 * the function successfully forwards the CVQ command, indicated
1578 * by a non-negative value of `dev_written`. Otherwise, it still
1579 * belongs to SVQ.
1580 * This function should only free the `elem` when it owns it.
1581 */
1582 if (dev_written >= 0) {
1583 g_free(elem);
1584 }
1585 return dev_written < 0 ? dev_written : 0;
1586 }
1587
1588 static const VhostShadowVirtqueueOps vhost_vdpa_net_svq_ops = {
1589 .avail_handler = vhost_vdpa_net_handle_ctrl_avail,
1590 };
1591
1592 /**
1593 * Probe if CVQ is isolated
1594 *
1595 * @device_fd The vdpa device fd
1596 * @features Features offered by the device.
1597 * @cvq_index The control vq pair index
1598 *
1599 * Returns <0 in case of failure, 0 if false and 1 if true.
1600 */
1601 static int vhost_vdpa_probe_cvq_isolation(int device_fd, uint64_t features,
1602 int cvq_index, Error **errp)
1603 {
1604 ERRP_GUARD();
1605 uint64_t backend_features;
1606 int64_t cvq_group;
1607 uint8_t status = VIRTIO_CONFIG_S_ACKNOWLEDGE |
1608 VIRTIO_CONFIG_S_DRIVER;
1609 int r;
1610
1611 r = ioctl(device_fd, VHOST_GET_BACKEND_FEATURES, &backend_features);
1612 if (unlikely(r < 0)) {
1613 error_setg_errno(errp, errno, "Cannot get vdpa backend_features");
1614 return r;
1615 }
1616
1617 if (!(backend_features & BIT_ULL(VHOST_BACKEND_F_IOTLB_ASID))) {
1618 return 0;
1619 }
1620
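/* Walk the device through ACKNOWLEDGE, DRIVER and FEATURES_OK before querying vring groups */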
1621 r = ioctl(device_fd, VHOST_VDPA_SET_STATUS, &status);
1622 if (unlikely(r)) {
1623 error_setg_errno(errp, -r, "Cannot set device status");
1624 goto out;
1625 }
1626
1627 r = ioctl(device_fd, VHOST_SET_FEATURES, &features);
1628 if (unlikely(r)) {
1629 error_setg_errno(errp, -r, "Cannot set features");
1630 goto out;
1631 }
1632
1633 status |= VIRTIO_CONFIG_S_FEATURES_OK;
1634 r = ioctl(device_fd, VHOST_VDPA_SET_STATUS, &status);
1635 if (unlikely(r)) {
1636 error_setg_errno(errp, -r, "Cannot set device status");
1637 goto out;
1638 }
1639
1640 cvq_group = vhost_vdpa_get_vring_group(device_fd, cvq_index, errp);
1641 if (unlikely(cvq_group < 0)) {
1642 if (cvq_group != -ENOTSUP) {
1643 r = cvq_group;
1644 goto out;
1645 }
1646
1647 /*
1648 * The kernel reports VHOST_BACKEND_F_IOTLB_ASID if the vdpa frontend
1649 * supports ASID even if the parent driver does not. The CVQ cannot be
1650 * isolated in this case.
1651 */
1652 error_free(*errp);
1653 *errp = NULL;
1654 r = 0;
1655 goto out;
1656 }
1657
1658 for (int i = 0; i < cvq_index; ++i) {
1659 int64_t group = vhost_vdpa_get_vring_group(device_fd, i, errp);
1660 if (unlikely(group < 0)) {
1661 r = group;
1662 goto out;
1663 }
1664
1665 if (group == (int64_t)cvq_group) {
1666 r = 0;
1667 goto out;
1668 }
1669 }
1670
1671 r = 1;
1672
1673 out:
1674 status = 0;
1675 ioctl(device_fd, VHOST_VDPA_SET_STATUS, &status);
1676 return r;
1677 }
1678
1679 static NetClientState *net_vhost_vdpa_init(NetClientState *peer,
1680 const char *device,
1681 const char *name,
1682 int vdpa_device_fd,
1683 int queue_pair_index,
1684 int nvqs,
1685 bool is_datapath,
1686 bool svq,
1687 struct vhost_vdpa_iova_range iova_range,
1688 uint64_t features,
1689 VhostVDPAShared *shared,
1690 Error **errp)
1691 {
1692 NetClientState *nc = NULL;
1693 VhostVDPAState *s;
1694 int ret = 0;
1695 assert(name);
1696 int cvq_isolated = 0;
1697
1698 if (is_datapath) {
1699 nc = qemu_new_net_client(&net_vhost_vdpa_info, peer, device,
1700 name);
1701 } else {
1702 cvq_isolated = vhost_vdpa_probe_cvq_isolation(vdpa_device_fd, features,
1703 queue_pair_index * 2,
1704 errp);
1705 if (unlikely(cvq_isolated < 0)) {
1706 return NULL;
1707 }
1708
1709 nc = qemu_new_net_control_client(&net_vhost_vdpa_cvq_info, peer,
1710 device, name);
1711 }
1712 qemu_set_info_str(nc, TYPE_VHOST_VDPA);
1713 s = DO_UPCAST(VhostVDPAState, nc, nc);
1714
1715 s->vhost_vdpa.index = queue_pair_index;
1716 s->always_svq = svq;
1717 s->migration_state.notify = NULL;
1718 s->vhost_vdpa.shadow_vqs_enabled = svq;
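/* The first queue pair allocates the state shared with the rest of the clients */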
1719 if (queue_pair_index == 0) {
1720 vhost_vdpa_net_valid_svq_features(features,
1721 &s->vhost_vdpa.migration_blocker);
1722 s->vhost_vdpa.shared = g_new0(VhostVDPAShared, 1);
1723 s->vhost_vdpa.shared->device_fd = vdpa_device_fd;
1724 s->vhost_vdpa.shared->iova_range = iova_range;
1725 s->vhost_vdpa.shared->shadow_data = svq;
1726 s->vhost_vdpa.shared->iova_tree = vhost_iova_tree_new(iova_range.first,
1727 iova_range.last);
1728 } else if (!is_datapath) {
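/* CVQ client: allocate page-aligned shadow buffers for the command and its status reply */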
1729 s->cvq_cmd_out_buffer = mmap(NULL, vhost_vdpa_net_cvq_cmd_page_len(),
1730 PROT_READ | PROT_WRITE,
1731 MAP_SHARED | MAP_ANONYMOUS, -1, 0);
1732 s->status = mmap(NULL, vhost_vdpa_net_cvq_cmd_page_len(),
1733 PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS,
1734 -1, 0);
1735
1736 s->vhost_vdpa.shadow_vq_ops = &vhost_vdpa_net_svq_ops;
1737 s->vhost_vdpa.shadow_vq_ops_opaque = s;
1738 s->cvq_isolated = cvq_isolated;
1739 }
1740 if (queue_pair_index != 0) {
1741 s->vhost_vdpa.shared = shared;
1742 }
1743
1744 ret = vhost_vdpa_add(nc, (void *)&s->vhost_vdpa, queue_pair_index, nvqs);
1745 if (ret) {
1746 qemu_del_net_client(nc);
1747 return NULL;
1748 }
1749
1750 return nc;
1751 }
1752
1753 static int vhost_vdpa_get_features(int fd, uint64_t *features, Error **errp)
1754 {
1755 int ret = ioctl(fd, VHOST_GET_FEATURES, features);
1756 if (unlikely(ret < 0)) {
1757 error_setg_errno(errp, errno,
1758 "Fail to query features from vhost-vDPA device");
1759 }
1760 return ret;
1761 }
1762
1763 static int vhost_vdpa_get_max_queue_pairs(int fd, uint64_t features,
1764 int *has_cvq, Error **errp)
1765 {
1766 unsigned long config_size = offsetof(struct vhost_vdpa_config, buf);
1767 g_autofree struct vhost_vdpa_config *config = NULL;
1768 __virtio16 *max_queue_pairs;
1769 int ret;
1770
1771 if (features & (1 << VIRTIO_NET_F_CTRL_VQ)) {
1772 *has_cvq = 1;
1773 } else {
1774 *has_cvq = 0;
1775 }
1776
1777 if (features & (1 << VIRTIO_NET_F_MQ)) {
1778 config = g_malloc0(config_size + sizeof(*max_queue_pairs));
1779 config->off = offsetof(struct virtio_net_config, max_virtqueue_pairs);
1780 config->len = sizeof(*max_queue_pairs);
1781
1782 ret = ioctl(fd, VHOST_VDPA_GET_CONFIG, config);
1783 if (ret) {
1784 error_setg(errp, "Fail to get config from vhost-vDPA device");
1785 return -ret;
1786 }
1787
1788 max_queue_pairs = (__virtio16 *)&config->buf;
1789
1790 return lduw_le_p(max_queue_pairs);
1791 }
1792
1793 return 1;
1794 }
1795
1796 int net_init_vhost_vdpa(const Netdev *netdev, const char *name,
1797 NetClientState *peer, Error **errp)
1798 {
1799 ERRP_GUARD();
1800 const NetdevVhostVDPAOptions *opts;
1801 uint64_t features;
1802 int vdpa_device_fd;
1803 g_autofree NetClientState **ncs = NULL;
1804 struct vhost_vdpa_iova_range iova_range;
1805 NetClientState *nc;
1806 int queue_pairs, r, i = 0, has_cvq = 0;
1807
1808 assert(netdev->type == NET_CLIENT_DRIVER_VHOST_VDPA);
1809 opts = &netdev->u.vhost_vdpa;
1810 if (!opts->vhostdev && !opts->vhostfd) {
1811 error_setg(errp,
1812 "vhost-vdpa: neither vhostdev= nor vhostfd= was specified");
1813 return -1;
1814 }
1815
1816 if (opts->vhostdev && opts->vhostfd) {
1817 error_setg(errp,
1818 "vhost-vdpa: vhostdev= and vhostfd= are mutually exclusive");
1819 return -1;
1820 }
1821
1822 if (opts->vhostdev) {
1823 vdpa_device_fd = qemu_open(opts->vhostdev, O_RDWR, errp);
1824 if (vdpa_device_fd == -1) {
1825 return -errno;
1826 }
1827 } else {
1828 /* has_vhostfd */
1829 vdpa_device_fd = monitor_fd_param(monitor_cur(), opts->vhostfd, errp);
1830 if (vdpa_device_fd == -1) {
1831 error_prepend(errp, "vhost-vdpa: unable to parse vhostfd: ");
1832 return -1;
1833 }
1834 }
1835
1836 r = vhost_vdpa_get_features(vdpa_device_fd, &features, errp);
1837 if (unlikely(r < 0)) {
1838 goto err;
1839 }
1840
1841 queue_pairs = vhost_vdpa_get_max_queue_pairs(vdpa_device_fd, features,
1842 &has_cvq, errp);
1843 if (queue_pairs <= 0) {
1844 goto err;
1845 }
1846
1847 r = vhost_vdpa_get_iova_range(vdpa_device_fd, &iova_range);
1848 if (unlikely(r < 0)) {
1849 error_setg(errp, "vhost-vdpa: get iova range failed: %s",
1850 strerror(-r));
1851 goto err;
1852 }
1853
1854 if (opts->x_svq && !vhost_vdpa_net_valid_svq_features(features, errp)) {
1855 goto err;
1856 }
1857
1858 ncs = g_malloc0(sizeof(*ncs) * queue_pairs);
1859
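/* Create one data client per queue pair; they all share the first client's VhostVDPAShared */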
1860 for (i = 0; i < queue_pairs; i++) {
1861 VhostVDPAShared *shared = NULL;
1862
1863 if (i) {
1864 shared = DO_UPCAST(VhostVDPAState, nc, ncs[0])->vhost_vdpa.shared;
1865 }
1866 ncs[i] = net_vhost_vdpa_init(peer, TYPE_VHOST_VDPA, name,
1867 vdpa_device_fd, i, 2, true, opts->x_svq,
1868 iova_range, features, shared, errp);
1869 if (!ncs[i])
1870 goto err;
1871 }
1872
1873 if (has_cvq) {
1874 VhostVDPAState *s0 = DO_UPCAST(VhostVDPAState, nc, ncs[0]);
1875 VhostVDPAShared *shared = s0->vhost_vdpa.shared;
1876
1877 nc = net_vhost_vdpa_init(peer, TYPE_VHOST_VDPA, name,
1878 vdpa_device_fd, i, 1, false,
1879 opts->x_svq, iova_range, features, shared,
1880 errp);
1881 if (!nc)
1882 goto err;
1883 }
1884
1885 return 0;
1886
1887 err:
1888 if (i) {
1889 for (i--; i >= 0; i--) {
1890 qemu_del_net_client(ncs[i]);
1891 }
1892 }
1893
1894 qemu_close(vdpa_device_fd);
1895
1896 return -1;
1897 }
1898