/*
 * vhost-vdpa.c
 *
 * Copyright(c) 2017-2018 Intel Corporation.
 * Copyright(c) 2020 Red Hat, Inc.
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include "qemu/osdep.h"
#include "clients.h"
#include "hw/virtio/virtio-net.h"
#include "net/vhost_net.h"
#include "net/vhost-vdpa.h"
#include "hw/virtio/vhost-vdpa.h"
#include "qemu/config-file.h"
#include "qemu/error-report.h"
#include "qemu/log.h"
#include "qemu/memalign.h"
#include "qemu/option.h"
#include "qapi/error.h"
#include <linux/vhost.h>
#include <sys/ioctl.h>
#include <err.h>
#include "standard-headers/linux/virtio_net.h"
#include "monitor/monitor.h"
#include "migration/migration.h"
#include "migration/misc.h"
#include "hw/virtio/vhost.h"

/* Todo: need to add the multiqueue support here */
typedef struct VhostVDPAState {
    NetClientState nc;
    struct vhost_vdpa vhost_vdpa;
    Notifier migration_state;
    VHostNetState *vhost_net;

    /* Control commands shadow buffers */
    void *cvq_cmd_out_buffer;
    virtio_net_ctrl_ack *status;

    /* The device always has SVQ enabled */
    bool always_svq;

    /* The device can isolate CVQ in its own ASID */
    bool cvq_isolated;

    bool started;
} VhostVDPAState;

/*
 * The array is sorted alphabetically in ascending order,
 * with the exception of VHOST_INVALID_FEATURE_BIT,
 * which should always be the last entry.
 */
const int vdpa_feature_bits[] = {
    VIRTIO_F_ANY_LAYOUT,
    VIRTIO_F_IOMMU_PLATFORM,
    VIRTIO_F_NOTIFY_ON_EMPTY,
    VIRTIO_F_RING_PACKED,
    VIRTIO_F_RING_RESET,
    VIRTIO_F_VERSION_1,
    VIRTIO_NET_F_CSUM,
    VIRTIO_NET_F_CTRL_GUEST_OFFLOADS,
    VIRTIO_NET_F_CTRL_MAC_ADDR,
    VIRTIO_NET_F_CTRL_RX,
    VIRTIO_NET_F_CTRL_RX_EXTRA,
    VIRTIO_NET_F_CTRL_VLAN,
    VIRTIO_NET_F_CTRL_VQ,
    VIRTIO_NET_F_GSO,
    VIRTIO_NET_F_GUEST_CSUM,
    VIRTIO_NET_F_GUEST_ECN,
    VIRTIO_NET_F_GUEST_TSO4,
    VIRTIO_NET_F_GUEST_TSO6,
    VIRTIO_NET_F_GUEST_UFO,
    VIRTIO_NET_F_GUEST_USO4,
    VIRTIO_NET_F_GUEST_USO6,
    VIRTIO_NET_F_HASH_REPORT,
    VIRTIO_NET_F_HOST_ECN,
    VIRTIO_NET_F_HOST_TSO4,
    VIRTIO_NET_F_HOST_TSO6,
    VIRTIO_NET_F_HOST_UFO,
    VIRTIO_NET_F_HOST_USO,
    VIRTIO_NET_F_MQ,
    VIRTIO_NET_F_MRG_RXBUF,
    VIRTIO_NET_F_MTU,
    VIRTIO_NET_F_RSS,
    VIRTIO_NET_F_STATUS,
    VIRTIO_RING_F_EVENT_IDX,
    VIRTIO_RING_F_INDIRECT_DESC,

    /* VHOST_INVALID_FEATURE_BIT should always be the last entry */
    VHOST_INVALID_FEATURE_BIT
};

/** Supported device specific feature bits with SVQ */
static const uint64_t vdpa_svq_device_features =
    BIT_ULL(VIRTIO_NET_F_CSUM) |
    BIT_ULL(VIRTIO_NET_F_GUEST_CSUM) |
    BIT_ULL(VIRTIO_NET_F_CTRL_GUEST_OFFLOADS) |
    BIT_ULL(VIRTIO_NET_F_MTU) |
    BIT_ULL(VIRTIO_NET_F_MAC) |
    BIT_ULL(VIRTIO_NET_F_GUEST_TSO4) |
    BIT_ULL(VIRTIO_NET_F_GUEST_TSO6) |
    BIT_ULL(VIRTIO_NET_F_GUEST_ECN) |
    BIT_ULL(VIRTIO_NET_F_GUEST_UFO) |
    BIT_ULL(VIRTIO_NET_F_HOST_TSO4) |
    BIT_ULL(VIRTIO_NET_F_HOST_TSO6) |
    BIT_ULL(VIRTIO_NET_F_HOST_ECN) |
    BIT_ULL(VIRTIO_NET_F_HOST_UFO) |
    BIT_ULL(VIRTIO_NET_F_MRG_RXBUF) |
    BIT_ULL(VIRTIO_NET_F_STATUS) |
    BIT_ULL(VIRTIO_NET_F_CTRL_VQ) |
    BIT_ULL(VIRTIO_NET_F_CTRL_RX) |
    BIT_ULL(VIRTIO_NET_F_CTRL_VLAN) |
    BIT_ULL(VIRTIO_NET_F_CTRL_RX_EXTRA) |
    BIT_ULL(VIRTIO_NET_F_MQ) |
    BIT_ULL(VIRTIO_F_ANY_LAYOUT) |
    BIT_ULL(VIRTIO_NET_F_CTRL_MAC_ADDR) |
    /* VHOST_F_LOG_ALL is exposed by SVQ */
    BIT_ULL(VHOST_F_LOG_ALL) |
    BIT_ULL(VIRTIO_NET_F_HASH_REPORT) |
    BIT_ULL(VIRTIO_NET_F_RSC_EXT) |
    BIT_ULL(VIRTIO_NET_F_STANDBY) |
    BIT_ULL(VIRTIO_NET_F_SPEED_DUPLEX);

#define VHOST_VDPA_NET_CVQ_ASID 1

VHostNetState *vhost_vdpa_get_vhost_net(NetClientState *nc)
{
    VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
    return s->vhost_net;
}

static size_t vhost_vdpa_net_cvq_cmd_len(void)
{
    /*
     * MAC_TABLE_SET is the ctrl command that produces the longest out buffer.
     * The in buffer is always 1 byte, so it should fit here.
     */
    return sizeof(struct virtio_net_ctrl_hdr) +
           2 * sizeof(struct virtio_net_ctrl_mac) +
           MAC_TABLE_ENTRIES * ETH_ALEN;
}

static size_t vhost_vdpa_net_cvq_cmd_page_len(void)
{
    return ROUND_UP(vhost_vdpa_net_cvq_cmd_len(), qemu_real_host_page_size());
}
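
/*
 * Rough sizing sketch for the two helpers above, assuming the usual
 * definitions (MAC_TABLE_ENTRIES == 64, ETH_ALEN == 6, 2-byte ctrl header,
 * 4-byte virtio_net_ctrl_mac counter): 2 + 2 * 4 + 64 * 6 = 394 bytes for
 * the worst-case out buffer, which vhost_vdpa_net_cvq_cmd_page_len() rounds
 * up to a single host page (typically 4 KiB).
 */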

static bool vhost_vdpa_net_valid_svq_features(uint64_t features, Error **errp)
{
    uint64_t invalid_dev_features =
        features & ~vdpa_svq_device_features &
        /* Transport features are all accepted at this point */
        ~MAKE_64BIT_MASK(VIRTIO_TRANSPORT_F_START,
                         VIRTIO_TRANSPORT_F_END - VIRTIO_TRANSPORT_F_START);

    if (invalid_dev_features) {
        error_setg(errp, "vdpa svq does not work with features 0x%" PRIx64,
                   invalid_dev_features);
        return false;
    }

    return vhost_svq_valid_features(features, errp);
}

static int vhost_vdpa_net_check_device_id(struct vhost_net *net)
{
    uint32_t device_id;
    int ret;
    struct vhost_dev *hdev;

    hdev = (struct vhost_dev *)&net->dev;
    ret = hdev->vhost_ops->vhost_get_device_id(hdev, &device_id);
    if (device_id != VIRTIO_ID_NET) {
        return -ENOTSUP;
    }
    return ret;
}

static int vhost_vdpa_add(NetClientState *ncs, void *be,
                          int queue_pair_index, int nvqs)
{
    VhostNetOptions options;
    struct vhost_net *net = NULL;
    VhostVDPAState *s;
    int ret;

    options.backend_type = VHOST_BACKEND_TYPE_VDPA;
    assert(ncs->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
    s = DO_UPCAST(VhostVDPAState, nc, ncs);
    options.net_backend = ncs;
    options.opaque = be;
    options.busyloop_timeout = 0;
    options.nvqs = nvqs;

    net = vhost_net_init(&options);
    if (!net) {
        error_report("failed to init vhost_net for queue");
        goto err_init;
    }
    s->vhost_net = net;
    ret = vhost_vdpa_net_check_device_id(net);
    if (ret) {
        goto err_check;
    }
    return 0;
err_check:
    vhost_net_cleanup(net);
    g_free(net);
err_init:
    return -1;
}

static void vhost_vdpa_cleanup(NetClientState *nc)
{
    VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);

    /*
     * If a peer NIC is attached, do not clean up anything.
     * Cleanup will happen as a part of qemu_cleanup() -> net_cleanup()
     * when the guest is shutting down.
     */
    if (nc->peer && nc->peer->info->type == NET_CLIENT_DRIVER_NIC) {
        return;
    }
    munmap(s->cvq_cmd_out_buffer, vhost_vdpa_net_cvq_cmd_page_len());
    munmap(s->status, vhost_vdpa_net_cvq_cmd_page_len());
    if (s->vhost_net) {
        vhost_net_cleanup(s->vhost_net);
        g_free(s->vhost_net);
        s->vhost_net = NULL;
    }
    if (s->vhost_vdpa.device_fd >= 0) {
        qemu_close(s->vhost_vdpa.device_fd);
        s->vhost_vdpa.device_fd = -1;
    }
}

static bool vhost_vdpa_has_vnet_hdr(NetClientState *nc)
{
    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);

    return true;
}

static bool vhost_vdpa_has_ufo(NetClientState *nc)
{
    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
    VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
    uint64_t features = 0;
    features |= (1ULL << VIRTIO_NET_F_HOST_UFO);
    features = vhost_net_get_features(s->vhost_net, features);
    return !!(features & (1ULL << VIRTIO_NET_F_HOST_UFO));
}

static bool vhost_vdpa_check_peer_type(NetClientState *nc, ObjectClass *oc,
                                       Error **errp)
{
    const char *driver = object_class_get_name(oc);

    if (!g_str_has_prefix(driver, "virtio-net-")) {
        error_setg(errp, "vhost-vdpa requires frontend driver virtio-net-*");
        return false;
    }

    return true;
}

/** Dummy receive in case qemu falls back to userland tap networking */
static ssize_t vhost_vdpa_receive(NetClientState *nc, const uint8_t *buf,
                                  size_t size)
{
    return size;
}

/** From any vdpa net client, get the netclient of the first queue pair */
static VhostVDPAState *vhost_vdpa_net_first_nc_vdpa(VhostVDPAState *s)
{
    NICState *nic = qemu_get_nic(s->nc.peer);
    NetClientState *nc0 = qemu_get_peer(nic->ncs, 0);

    return DO_UPCAST(VhostVDPAState, nc, nc0);
}

static void vhost_vdpa_net_log_global_enable(VhostVDPAState *s, bool enable)
{
    struct vhost_vdpa *v = &s->vhost_vdpa;
    VirtIONet *n;
    VirtIODevice *vdev;
    int data_queue_pairs, cvq, r;

    /* We are only called on the first data vqs and only if x-svq is not set */
    if (s->vhost_vdpa.shadow_vqs_enabled == enable) {
        return;
    }

    vdev = v->dev->vdev;
    n = VIRTIO_NET(vdev);
    if (!n->vhost_started) {
        return;
    }

    data_queue_pairs = n->multiqueue ? n->max_queue_pairs : 1;
    cvq = virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ) ?
          n->max_ncs - n->max_queue_pairs : 0;
    /*
     * TODO: vhost_net_stop does suspend, get_base and reset. We can be smarter
     * in the future and resume the device if read-only operations between
     * suspend and reset go wrong.
     */
    vhost_net_stop(vdev, n->nic->ncs, data_queue_pairs, cvq);

    /*
     * Start will check migration setup_or_active to decide whether to
     * configure SVQ.
     */
    r = vhost_net_start(vdev, n->nic->ncs, data_queue_pairs, cvq);
    if (unlikely(r < 0)) {
        error_report("unable to start vhost net: %s(%d)", g_strerror(-r), -r);
    }
}

static void vdpa_net_migration_state_notifier(Notifier *notifier, void *data)
{
    MigrationState *migration = data;
    VhostVDPAState *s = container_of(notifier, VhostVDPAState,
                                     migration_state);

    if (migration_in_setup(migration)) {
        vhost_vdpa_net_log_global_enable(s, true);
    } else if (migration_has_failed(migration)) {
        vhost_vdpa_net_log_global_enable(s, false);
    }
}

static void vhost_vdpa_net_data_start_first(VhostVDPAState *s)
{
    struct vhost_vdpa *v = &s->vhost_vdpa;

    migration_add_notifier(&s->migration_state,
                           vdpa_net_migration_state_notifier);
    if (v->shadow_vqs_enabled) {
        v->iova_tree = vhost_iova_tree_new(v->iova_range.first,
                                           v->iova_range.last);
    }
}

static int vhost_vdpa_net_data_start(NetClientState *nc)
{
    VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
    struct vhost_vdpa *v = &s->vhost_vdpa;

    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);

    if (s->always_svq ||
        migration_is_setup_or_active(migrate_get_current()->state)) {
        v->shadow_vqs_enabled = true;
        v->shadow_data = true;
    } else {
        v->shadow_vqs_enabled = false;
        v->shadow_data = false;
    }

    if (v->index == 0) {
        vhost_vdpa_net_data_start_first(s);
        return 0;
    }

    if (v->shadow_vqs_enabled) {
        VhostVDPAState *s0 = vhost_vdpa_net_first_nc_vdpa(s);
        v->iova_tree = s0->vhost_vdpa.iova_tree;
    }

    return 0;
}

static int vhost_vdpa_net_data_load(NetClientState *nc)
{
    VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
    struct vhost_vdpa *v = &s->vhost_vdpa;
    bool has_cvq = v->dev->vq_index_end % 2;

    if (has_cvq) {
        return 0;
    }

    for (int i = 0; i < v->dev->nvqs; ++i) {
        vhost_vdpa_set_vring_ready(v, i + v->dev->vq_index);
    }
    return 0;
}

static void vhost_vdpa_net_client_stop(NetClientState *nc)
{
    VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
    struct vhost_dev *dev;

    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);

    if (s->vhost_vdpa.index == 0) {
        migration_remove_notifier(&s->migration_state);
    }

    dev = s->vhost_vdpa.dev;
    if (dev->vq_index + dev->nvqs == dev->vq_index_end) {
        g_clear_pointer(&s->vhost_vdpa.iova_tree, vhost_iova_tree_delete);
    } else {
        s->vhost_vdpa.iova_tree = NULL;
    }
}

static NetClientInfo net_vhost_vdpa_info = {
    .type = NET_CLIENT_DRIVER_VHOST_VDPA,
    .size = sizeof(VhostVDPAState),
    .receive = vhost_vdpa_receive,
    .start = vhost_vdpa_net_data_start,
    .load = vhost_vdpa_net_data_load,
    .stop = vhost_vdpa_net_client_stop,
    .cleanup = vhost_vdpa_cleanup,
    .has_vnet_hdr = vhost_vdpa_has_vnet_hdr,
    .has_ufo = vhost_vdpa_has_ufo,
    .check_peer_type = vhost_vdpa_check_peer_type,
};
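
/*
 * Rough lifecycle of the callbacks above, as used by the data queue clients
 * in this file: .start (vhost_vdpa_net_data_start) decides whether the data
 * virtqueues are shadowed and sets up or reuses the IOVA tree, .load
 * (vhost_vdpa_net_data_load) enables the vrings once the device has been
 * started, and .stop/.cleanup undo that state and release the device fd.
 */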

static int64_t vhost_vdpa_get_vring_group(int device_fd, unsigned vq_index,
                                          Error **errp)
{
    struct vhost_vring_state state = {
        .index = vq_index,
    };
    int r = ioctl(device_fd, VHOST_VDPA_GET_VRING_GROUP, &state);

    if (unlikely(r < 0)) {
        r = -errno;
        error_setg_errno(errp, errno, "Cannot get VQ %u group", vq_index);
        return r;
    }

    return state.num;
}

static int vhost_vdpa_set_address_space_id(struct vhost_vdpa *v,
                                           unsigned vq_group,
                                           unsigned asid_num)
{
    struct vhost_vring_state asid = {
        .index = vq_group,
        .num = asid_num,
    };
    int r;

    r = ioctl(v->device_fd, VHOST_VDPA_SET_GROUP_ASID, &asid);
    if (unlikely(r < 0)) {
        error_report("Can't set vq group %u asid %u, errno=%d (%s)",
                     asid.index, asid.num, errno, g_strerror(errno));
    }
    return r;
}

static void vhost_vdpa_cvq_unmap_buf(struct vhost_vdpa *v, void *addr)
{
    VhostIOVATree *tree = v->iova_tree;
    DMAMap needle = {
        /*
         * No need to specify size or to look for more translations since
         * this contiguous chunk was allocated by us.
         */
        .translated_addr = (hwaddr)(uintptr_t)addr,
    };
    const DMAMap *map = vhost_iova_tree_find_iova(tree, &needle);
    int r;

    if (unlikely(!map)) {
        error_report("Cannot locate expected map");
        return;
    }

    r = vhost_vdpa_dma_unmap(v, v->address_space_id, map->iova, map->size + 1);
    if (unlikely(r != 0)) {
        error_report("Device cannot unmap: %s(%d)", g_strerror(r), r);
    }

    vhost_iova_tree_remove(tree, *map);
}

/** Map CVQ buffer. */
static int vhost_vdpa_cvq_map_buf(struct vhost_vdpa *v, void *buf, size_t size,
                                  bool write)
{
    DMAMap map = {};
    int r;

    map.translated_addr = (hwaddr)(uintptr_t)buf;
    map.size = size - 1;
    map.perm = write ? IOMMU_RW : IOMMU_RO;
    r = vhost_iova_tree_map_alloc(v->iova_tree, &map);
    if (unlikely(r != IOVA_OK)) {
        error_report("Cannot map injected element");
        return r;
    }

    r = vhost_vdpa_dma_map(v, v->address_space_id, map.iova,
                           vhost_vdpa_net_cvq_cmd_page_len(), buf, !write);
    if (unlikely(r < 0)) {
        goto dma_map_err;
    }

    return 0;

dma_map_err:
    vhost_iova_tree_remove(v->iova_tree, map);
    return r;
}
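
/*
 * Sizing convention used by the two helpers above: DMAMap ranges in the
 * VhostIOVATree are inclusive, so the allocation uses map.size = size - 1
 * and the corresponding unmap passes map->size + 1 bytes back to
 * vhost_vdpa_dma_unmap().
 */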

static int vhost_vdpa_net_cvq_start(NetClientState *nc)
{
    VhostVDPAState *s, *s0;
    struct vhost_vdpa *v;
    int64_t cvq_group;
    int r;
    Error *err = NULL;

    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);

    s = DO_UPCAST(VhostVDPAState, nc, nc);
    v = &s->vhost_vdpa;

    s0 = vhost_vdpa_net_first_nc_vdpa(s);
    v->shadow_data = s0->vhost_vdpa.shadow_vqs_enabled;
    v->shadow_vqs_enabled = s0->vhost_vdpa.shadow_vqs_enabled;
    s->vhost_vdpa.address_space_id = VHOST_VDPA_GUEST_PA_ASID;

    if (s->vhost_vdpa.shadow_data) {
        /* SVQ is already configured for all virtqueues */
        goto out;
    }

    /*
     * If we return early in these cases, SVQ will not be enabled. Migration
     * will be blocked as long as the vhost-vdpa backend does not offer
     * _F_LOG.
     */
    if (!vhost_vdpa_net_valid_svq_features(v->dev->features, NULL)) {
        return 0;
    }

    if (!s->cvq_isolated) {
        return 0;
    }

    cvq_group = vhost_vdpa_get_vring_group(v->device_fd,
                                           v->dev->vq_index_end - 1,
                                           &err);
    if (unlikely(cvq_group < 0)) {
        error_report_err(err);
        return cvq_group;
    }

    r = vhost_vdpa_set_address_space_id(v, cvq_group, VHOST_VDPA_NET_CVQ_ASID);
    if (unlikely(r < 0)) {
        return r;
    }

    v->shadow_vqs_enabled = true;
    s->vhost_vdpa.address_space_id = VHOST_VDPA_NET_CVQ_ASID;

out:
    if (!s->vhost_vdpa.shadow_vqs_enabled) {
        return 0;
    }

    if (s0->vhost_vdpa.iova_tree) {
        /*
         * SVQ is already configured for all virtqueues. Reuse the IOVA tree
         * for simplicity, whether CVQ shares ASID with the guest or not,
         * because:
         * - The memory listener needs access to guest's memory addresses
         *   allocated in the IOVA tree.
         * - There should be plenty of IOVA address space for both ASIDs not
         *   to worry about collisions between them. Guest's translations are
         *   still validated with virtio virtqueue_pop so there is no risk for
         *   the guest to access memory that it shouldn't.
         *
         * Allocating an IOVA tree per ASID is doable, but it complicates the
         * code and it is not worth it for the moment.
         */
        v->iova_tree = s0->vhost_vdpa.iova_tree;
    } else {
        v->iova_tree = vhost_iova_tree_new(v->iova_range.first,
                                           v->iova_range.last);
    }

    r = vhost_vdpa_cvq_map_buf(&s->vhost_vdpa, s->cvq_cmd_out_buffer,
                               vhost_vdpa_net_cvq_cmd_page_len(), false);
    if (unlikely(r < 0)) {
        return r;
    }

    r = vhost_vdpa_cvq_map_buf(&s->vhost_vdpa, s->status,
                               vhost_vdpa_net_cvq_cmd_page_len(), true);
    if (unlikely(r < 0)) {
        vhost_vdpa_cvq_unmap_buf(&s->vhost_vdpa, s->cvq_cmd_out_buffer);
    }

    return r;
}
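
/*
 * Resulting address space layout when CVQ can be isolated (a sketch based on
 * the logic above): the data virtqueue groups stay in
 * VHOST_VDPA_GUEST_PA_ASID and keep using guest physical addresses, while
 * the CVQ group is moved to VHOST_VDPA_NET_CVQ_ASID, so only the control
 * virtqueue is shadowed and its buffers are remapped through the IOVA tree.
 */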

static void vhost_vdpa_net_cvq_stop(NetClientState *nc)
{
    VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);

    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);

    if (s->vhost_vdpa.shadow_vqs_enabled) {
        vhost_vdpa_cvq_unmap_buf(&s->vhost_vdpa, s->cvq_cmd_out_buffer);
        vhost_vdpa_cvq_unmap_buf(&s->vhost_vdpa, s->status);
    }

    vhost_vdpa_net_client_stop(nc);
}

static ssize_t vhost_vdpa_net_cvq_add(VhostVDPAState *s,
                                      const struct iovec *out_sg, size_t out_num,
                                      const struct iovec *in_sg, size_t in_num)
{
    VhostShadowVirtqueue *svq = g_ptr_array_index(s->vhost_vdpa.shadow_vqs, 0);
    int r;

    r = vhost_svq_add(svq, out_sg, out_num, in_sg, in_num, NULL);
    if (unlikely(r != 0)) {
        if (unlikely(r == -ENOSPC)) {
            qemu_log_mask(LOG_GUEST_ERROR, "%s: No space on device queue\n",
                          __func__);
        }
    }

    return r;
}

/*
 * Convenience wrapper to poll SVQ for multiple control commands.
 *
 * The caller should hold the BQL when invoking this function, and should take
 * the answer before SVQ polls by itself once the BQL is released.
 */
static ssize_t vhost_vdpa_net_svq_poll(VhostVDPAState *s, size_t cmds_in_flight)
{
    VhostShadowVirtqueue *svq = g_ptr_array_index(s->vhost_vdpa.shadow_vqs, 0);
    return vhost_svq_poll(svq, cmds_in_flight);
}

static void vhost_vdpa_net_load_cursor_reset(VhostVDPAState *s,
                                             struct iovec *out_cursor,
                                             struct iovec *in_cursor)
{
    /* reset the cursor of the output buffer for the device */
    out_cursor->iov_base = s->cvq_cmd_out_buffer;
    out_cursor->iov_len = vhost_vdpa_net_cvq_cmd_page_len();

    /* reset the cursor of the in buffer for the device */
    in_cursor->iov_base = s->status;
    in_cursor->iov_len = vhost_vdpa_net_cvq_cmd_page_len();
}

/*
 * Poll SVQ for multiple pending control commands and check the device's acks.
 *
 * The caller should hold the BQL when invoking this function.
 *
 * @s: The VhostVDPAState
 * @len: The length of the pending status shadow buffer
 */
static ssize_t vhost_vdpa_net_svq_flush(VhostVDPAState *s, size_t len)
{
    /* the device writes a one-byte ack for each control command */
    ssize_t dev_written = vhost_vdpa_net_svq_poll(s, len);
    if (unlikely(dev_written != len)) {
        return -EIO;
    }

    /* check the device's acks */
    for (int i = 0; i < len; ++i) {
        if (s->status[i] != VIRTIO_NET_OK) {
            return -EIO;
        }
    }
    return 0;
}
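
/*
 * Batching layout used by the load helpers below (illustrative): control
 * commands are packed back to back into the shadow out buffer and each one
 * reserves a single ack byte in the shadow status buffer, e.g.
 *
 *   out: [hdr|data][hdr|data][hdr|data]...
 *   in:  [ack]     [ack]     [ack]     ...
 *
 * The cursors advance after every queued command, and everything is flushed
 * with vhost_vdpa_net_svq_flush() when the SVQ or the shadow buffers run out
 * of room.
 */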

static ssize_t vhost_vdpa_net_load_cmd(VhostVDPAState *s,
                                       struct iovec *out_cursor,
                                       struct iovec *in_cursor, uint8_t class,
                                       uint8_t cmd, const struct iovec *data_sg,
                                       size_t data_num)
{
    const struct virtio_net_ctrl_hdr ctrl = {
        .class = class,
        .cmd = cmd,
    };
    size_t data_size = iov_size(data_sg, data_num), cmd_size;
    struct iovec out, in;
    ssize_t r;
    unsigned dummy_cursor_iov_cnt;
    VhostShadowVirtqueue *svq = g_ptr_array_index(s->vhost_vdpa.shadow_vqs, 0);

    assert(data_size < vhost_vdpa_net_cvq_cmd_page_len() - sizeof(ctrl));
    cmd_size = sizeof(ctrl) + data_size;
    if (vhost_svq_available_slots(svq) < 2 ||
        iov_size(out_cursor, 1) < cmd_size) {
        /*
         * It is time to flush all pending control commands if SVQ is full
         * or the control command shadow buffers are full.
         *
         * We can poll here since we've had the BQL from the time
         * we sent the descriptor.
         */
        r = vhost_vdpa_net_svq_flush(s, in_cursor->iov_base -
                                     (void *)s->status);
        if (unlikely(r < 0)) {
            return r;
        }

        vhost_vdpa_net_load_cursor_reset(s, out_cursor, in_cursor);
    }

    /* pack the CVQ command header */
    iov_from_buf(out_cursor, 1, 0, &ctrl, sizeof(ctrl));
    /* pack the CVQ command's command-specific data */
    iov_to_buf(data_sg, data_num, 0,
               out_cursor->iov_base + sizeof(ctrl), data_size);

    /* extract the required buffer from the cursor for output */
    iov_copy(&out, 1, out_cursor, 1, 0, cmd_size);
    /* extract the required buffer from the cursor for input */
    iov_copy(&in, 1, in_cursor, 1, 0, sizeof(*s->status));

    r = vhost_vdpa_net_cvq_add(s, &out, 1, &in, 1);
    if (unlikely(r < 0)) {
        return r;
    }

    /* iterate the cursors */
    dummy_cursor_iov_cnt = 1;
    iov_discard_front(&out_cursor, &dummy_cursor_iov_cnt, cmd_size);
    dummy_cursor_iov_cnt = 1;
    iov_discard_front(&in_cursor, &dummy_cursor_iov_cnt, sizeof(*s->status));

    return 0;
}

static int vhost_vdpa_net_load_mac(VhostVDPAState *s, const VirtIONet *n,
                                   struct iovec *out_cursor,
                                   struct iovec *in_cursor)
{
    if (virtio_vdev_has_feature(&n->parent_obj, VIRTIO_NET_F_CTRL_MAC_ADDR)) {
        const struct iovec data = {
            .iov_base = (void *)n->mac,
            .iov_len = sizeof(n->mac),
        };
        ssize_t r = vhost_vdpa_net_load_cmd(s, out_cursor, in_cursor,
                                            VIRTIO_NET_CTRL_MAC,
                                            VIRTIO_NET_CTRL_MAC_ADDR_SET,
                                            &data, 1);
        if (unlikely(r < 0)) {
            return r;
        }
    }

    /*
     * According to VirtIO standard, "The device MUST have an
     * empty MAC filtering table on reset.".
     *
     * Therefore, there is no need to send this CVQ command if the
     * driver also sets an empty MAC filter table, which aligns with
     * the device's defaults.
     *
     * Note that the device's defaults can mismatch the driver's
     * configuration only at live migration.
     */
    if (!virtio_vdev_has_feature(&n->parent_obj, VIRTIO_NET_F_CTRL_RX) ||
        n->mac_table.in_use == 0) {
        return 0;
    }

    uint32_t uni_entries = n->mac_table.first_multi,
             uni_macs_size = uni_entries * ETH_ALEN,
             mul_entries = n->mac_table.in_use - uni_entries,
             mul_macs_size = mul_entries * ETH_ALEN;
    struct virtio_net_ctrl_mac uni = {
        .entries = cpu_to_le32(uni_entries),
    };
    struct virtio_net_ctrl_mac mul = {
        .entries = cpu_to_le32(mul_entries),
    };
    const struct iovec data[] = {
        {
            .iov_base = &uni,
            .iov_len = sizeof(uni),
        }, {
            .iov_base = n->mac_table.macs,
            .iov_len = uni_macs_size,
        }, {
            .iov_base = &mul,
            .iov_len = sizeof(mul),
        }, {
            .iov_base = &n->mac_table.macs[uni_macs_size],
            .iov_len = mul_macs_size,
        },
    };
    ssize_t r = vhost_vdpa_net_load_cmd(s, out_cursor, in_cursor,
                                        VIRTIO_NET_CTRL_MAC,
                                        VIRTIO_NET_CTRL_MAC_TABLE_SET,
                                        data, ARRAY_SIZE(data));
    if (unlikely(r < 0)) {
        return r;
    }

    return 0;
}
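
/*
 * Illustrative VIRTIO_NET_CTRL_MAC_TABLE_SET payload built above: a
 * little-endian count of unicast entries followed by those MACs, then a
 * count of multicast entries followed by those MACs, e.g. for one unicast
 * and two multicast addresses:
 *
 *   { le32 1 } { 6-byte MAC } { le32 2 } { 6-byte MAC } { 6-byte MAC }
 */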

static int vhost_vdpa_net_load_rss(VhostVDPAState *s, const VirtIONet *n,
                                   struct iovec *out_cursor,
                                   struct iovec *in_cursor)
{
    struct virtio_net_rss_config cfg = {};
    ssize_t r;
    g_autofree uint16_t *table = NULL;

    /*
     * According to VirtIO standard, "Initially the device has all hash
     * types disabled and reports only VIRTIO_NET_HASH_REPORT_NONE.".
     *
     * Therefore, there is no need to send this CVQ command if the
     * driver disables all hash types, which aligns with
     * the device's defaults.
     *
     * Note that the device's defaults can mismatch the driver's
     * configuration only at live migration.
     */
    if (!n->rss_data.enabled ||
        n->rss_data.hash_types == VIRTIO_NET_HASH_REPORT_NONE) {
        return 0;
    }

    table = g_malloc_n(n->rss_data.indirections_len,
                       sizeof(n->rss_data.indirections_table[0]));
    cfg.hash_types = cpu_to_le32(n->rss_data.hash_types);

    /*
     * According to VirtIO standard, "Field reserved MUST contain zeroes.
     * It is defined to make the structure to match the layout of
     * virtio_net_rss_config structure, defined in 5.1.6.5.7.".
     *
     * Therefore, we need to zero the fields in
     * struct virtio_net_rss_config, which correspond to the
     * `reserved` field in struct virtio_net_hash_config.
     *
     * Note that all other fields are zeroed at their definitions,
     * except for the `indirection_table` field, where the actual data
     * is stored in the `table` variable to ensure compatibility
     * with the RSS case. Therefore, we need to zero the `table` variable here.
     */
    table[0] = 0;

    /*
     * Since virtio_net_handle_rss() currently does not restore the hash key
     * length parsed from the CVQ command sent from the guest into
     * n->rss_data and uses the maximum key length in other code, we also
     * employ the maximum key length here.
     */
    cfg.hash_key_length = sizeof(n->rss_data.key);

    const struct iovec data[] = {
        {
            .iov_base = &cfg,
            .iov_len = offsetof(struct virtio_net_rss_config,
                                indirection_table),
        }, {
            .iov_base = table,
            .iov_len = n->rss_data.indirections_len *
                       sizeof(n->rss_data.indirections_table[0]),
        }, {
            .iov_base = &cfg.max_tx_vq,
            .iov_len = offsetof(struct virtio_net_rss_config, hash_key_data) -
                       offsetof(struct virtio_net_rss_config, max_tx_vq),
        }, {
            .iov_base = (void *)n->rss_data.key,
            .iov_len = sizeof(n->rss_data.key),
        }
    };

    r = vhost_vdpa_net_load_cmd(s, out_cursor, in_cursor,
                                VIRTIO_NET_CTRL_MQ,
                                VIRTIO_NET_CTRL_MQ_HASH_CONFIG,
                                data, ARRAY_SIZE(data));
    if (unlikely(r < 0)) {
        return r;
    }

    return 0;
}
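
/*
 * The four iovecs above splice the command payload together without copying
 * cfg.indirection_table itself: the head of struct virtio_net_rss_config up
 * to indirection_table, then the separately allocated table, then the
 * max_tx_vq..hash_key_length fields, and finally the key. The resulting
 * bytes match the layout expected by VIRTIO_NET_CTRL_MQ_HASH_CONFIG.
 */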

static int vhost_vdpa_net_load_mq(VhostVDPAState *s,
                                  const VirtIONet *n,
                                  struct iovec *out_cursor,
                                  struct iovec *in_cursor)
{
    struct virtio_net_ctrl_mq mq;
    ssize_t r;

    if (!virtio_vdev_has_feature(&n->parent_obj, VIRTIO_NET_F_MQ)) {
        return 0;
    }

    mq.virtqueue_pairs = cpu_to_le16(n->curr_queue_pairs);
    const struct iovec data = {
        .iov_base = &mq,
        .iov_len = sizeof(mq),
    };
    r = vhost_vdpa_net_load_cmd(s, out_cursor, in_cursor,
                                VIRTIO_NET_CTRL_MQ,
                                VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET,
                                &data, 1);
    if (unlikely(r < 0)) {
        return r;
    }

    if (!virtio_vdev_has_feature(&n->parent_obj, VIRTIO_NET_F_HASH_REPORT)) {
        return 0;
    }

    r = vhost_vdpa_net_load_rss(s, n, out_cursor, in_cursor);
    if (unlikely(r < 0)) {
        return r;
    }

    return 0;
}

static int vhost_vdpa_net_load_offloads(VhostVDPAState *s,
                                        const VirtIONet *n,
                                        struct iovec *out_cursor,
                                        struct iovec *in_cursor)
{
    uint64_t offloads;
    ssize_t r;

    if (!virtio_vdev_has_feature(&n->parent_obj,
                                 VIRTIO_NET_F_CTRL_GUEST_OFFLOADS)) {
        return 0;
    }

    if (n->curr_guest_offloads == virtio_net_supported_guest_offloads(n)) {
        /*
         * According to VirtIO standard, "Upon feature negotiation
         * corresponding offload gets enabled to preserve
         * backward compatibility.".
         *
         * Therefore, there is no need to send this CVQ command if the
         * driver also enables all supported offloads, which aligns with
         * the device's defaults.
         *
         * Note that the device's defaults can mismatch the driver's
         * configuration only at live migration.
         */
        return 0;
    }

    offloads = cpu_to_le64(n->curr_guest_offloads);
    const struct iovec data = {
        .iov_base = &offloads,
        .iov_len = sizeof(offloads),
    };
    r = vhost_vdpa_net_load_cmd(s, out_cursor, in_cursor,
                                VIRTIO_NET_CTRL_GUEST_OFFLOADS,
                                VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET,
                                &data, 1);
    if (unlikely(r < 0)) {
        return r;
    }

    return 0;
}

static int vhost_vdpa_net_load_rx_mode(VhostVDPAState *s,
                                       struct iovec *out_cursor,
                                       struct iovec *in_cursor,
                                       uint8_t cmd,
                                       uint8_t on)
{
    const struct iovec data = {
        .iov_base = &on,
        .iov_len = sizeof(on),
    };
    ssize_t r;

    r = vhost_vdpa_net_load_cmd(s, out_cursor, in_cursor,
                                VIRTIO_NET_CTRL_RX, cmd, &data, 1);
    if (unlikely(r < 0)) {
        return r;
    }

    return 0;
}
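
/*
 * Shape of an RX mode command queued by the helper above (illustrative): a
 * 2-byte header { VIRTIO_NET_CTRL_RX, cmd } followed by a single on/off
 * byte, acknowledged by the usual one-byte status.
 */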

static int vhost_vdpa_net_load_rx(VhostVDPAState *s,
                                  const VirtIONet *n,
                                  struct iovec *out_cursor,
                                  struct iovec *in_cursor)
{
    ssize_t r;

    if (!virtio_vdev_has_feature(&n->parent_obj, VIRTIO_NET_F_CTRL_RX)) {
        return 0;
    }

    /*
     * According to virtio_net_reset(), device turns promiscuous mode
     * on by default.
     *
     * Additionally, according to VirtIO standard, "Since there are
     * no guarantees, it can use a hash filter or silently switch to
     * allmulti or promiscuous mode if it is given too many addresses.".
     * QEMU marks `n->mac_table.uni_overflow` if guest sets too many
     * non-multicast MAC addresses, indicating that promiscuous mode
     * should be enabled.
     *
     * Therefore, QEMU should only send this CVQ command if the
     * `n->mac_table.uni_overflow` is not marked and `n->promisc` is off,
     * which sets promiscuous mode off, different from the device's defaults.
     *
     * Note that the device's defaults can mismatch the driver's
     * configuration only at live migration.
     */
    if (!n->mac_table.uni_overflow && !n->promisc) {
        r = vhost_vdpa_net_load_rx_mode(s, out_cursor, in_cursor,
                                        VIRTIO_NET_CTRL_RX_PROMISC, 0);
        if (unlikely(r < 0)) {
            return r;
        }
    }

    /*
     * According to virtio_net_reset(), device turns all-multicast mode
     * off by default.
     *
     * According to VirtIO standard, "Since there are no guarantees,
     * it can use a hash filter or silently switch to allmulti or
     * promiscuous mode if it is given too many addresses.". QEMU marks
     * `n->mac_table.multi_overflow` if guest sets too many
     * multicast MAC addresses.
     *
     * Therefore, QEMU should only send this CVQ command if the
     * `n->mac_table.multi_overflow` is marked or `n->allmulti` is on,
     * which sets all-multicast mode on, different from the device's defaults.
     *
     * Note that the device's defaults can mismatch the driver's
     * configuration only at live migration.
     */
    if (n->mac_table.multi_overflow || n->allmulti) {
        r = vhost_vdpa_net_load_rx_mode(s, out_cursor, in_cursor,
                                        VIRTIO_NET_CTRL_RX_ALLMULTI, 1);
        if (unlikely(r < 0)) {
            return r;
        }
    }

    if (!virtio_vdev_has_feature(&n->parent_obj, VIRTIO_NET_F_CTRL_RX_EXTRA)) {
        return 0;
    }

    /*
     * According to virtio_net_reset(), device turns all-unicast mode
     * off by default.
     *
     * Therefore, QEMU should only send this CVQ command if the driver
     * sets all-unicast mode on, different from the device's defaults.
     *
     * Note that the device's defaults can mismatch the driver's
     * configuration only at live migration.
     */
    if (n->alluni) {
        r = vhost_vdpa_net_load_rx_mode(s, out_cursor, in_cursor,
                                        VIRTIO_NET_CTRL_RX_ALLUNI, 1);
        if (r < 0) {
            return r;
        }
    }

    /*
     * According to virtio_net_reset(), device turns non-multicast mode
     * off by default.
     *
     * Therefore, QEMU should only send this CVQ command if the driver
     * sets non-multicast mode on, different from the device's defaults.
     *
     * Note that the device's defaults can mismatch the driver's
     * configuration only at live migration.
     */
    if (n->nomulti) {
        r = vhost_vdpa_net_load_rx_mode(s, out_cursor, in_cursor,
                                        VIRTIO_NET_CTRL_RX_NOMULTI, 1);
        if (r < 0) {
            return r;
        }
    }

    /*
     * According to virtio_net_reset(), device turns non-unicast mode
     * off by default.
     *
     * Therefore, QEMU should only send this CVQ command if the driver
     * sets non-unicast mode on, different from the device's defaults.
     *
     * Note that the device's defaults can mismatch the driver's
     * configuration only at live migration.
     */
    if (n->nouni) {
        r = vhost_vdpa_net_load_rx_mode(s, out_cursor, in_cursor,
                                        VIRTIO_NET_CTRL_RX_NOUNI, 1);
        if (r < 0) {
            return r;
        }
    }

    /*
     * According to virtio_net_reset(), device turns non-broadcast mode
     * off by default.
     *
     * Therefore, QEMU should only send this CVQ command if the driver
     * sets non-broadcast mode on, different from the device's defaults.
     *
     * Note that the device's defaults can mismatch the driver's
     * configuration only at live migration.
     */
    if (n->nobcast) {
        r = vhost_vdpa_net_load_rx_mode(s, out_cursor, in_cursor,
                                        VIRTIO_NET_CTRL_RX_NOBCAST, 1);
        if (r < 0) {
            return r;
        }
    }

    return 0;
}

static int vhost_vdpa_net_load_single_vlan(VhostVDPAState *s,
                                           const VirtIONet *n,
                                           struct iovec *out_cursor,
                                           struct iovec *in_cursor,
                                           uint16_t vid)
{
    const struct iovec data = {
        .iov_base = &vid,
        .iov_len = sizeof(vid),
    };
    ssize_t r = vhost_vdpa_net_load_cmd(s, out_cursor, in_cursor,
                                        VIRTIO_NET_CTRL_VLAN,
                                        VIRTIO_NET_CTRL_VLAN_ADD,
                                        &data, 1);
    if (unlikely(r < 0)) {
        return r;
    }

    return 0;
}

static int vhost_vdpa_net_load_vlan(VhostVDPAState *s,
                                    const VirtIONet *n,
                                    struct iovec *out_cursor,
                                    struct iovec *in_cursor)
{
    int r;

    if (!virtio_vdev_has_feature(&n->parent_obj, VIRTIO_NET_F_CTRL_VLAN)) {
        return 0;
    }

    for (int i = 0; i < MAX_VLAN >> 5; i++) {
        for (int j = 0; n->vlans[i] && j <= 0x1f; j++) {
            if (n->vlans[i] & (1U << j)) {
                r = vhost_vdpa_net_load_single_vlan(s, n, out_cursor,
                                                    in_cursor, (i << 5) + j);
                if (unlikely(r != 0)) {
                    return r;
                }
            }
        }
    }

    return 0;
}
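
/*
 * Worked example for the bitmap walk above: n->vlans[] stores one bit per
 * VLAN ID in 32-bit words, so bit j of word i corresponds to
 * VID = (i << 5) + j; e.g. bit 5 of n->vlans[2] replays
 * VIRTIO_NET_CTRL_VLAN_ADD for VID 69.
 */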

static int vhost_vdpa_net_cvq_load(NetClientState *nc)
{
    VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
    struct vhost_vdpa *v = &s->vhost_vdpa;
    const VirtIONet *n;
    int r;
    struct iovec out_cursor, in_cursor;

    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);

    vhost_vdpa_set_vring_ready(v, v->dev->vq_index);

    if (v->shadow_vqs_enabled) {
        n = VIRTIO_NET(v->dev->vdev);
        vhost_vdpa_net_load_cursor_reset(s, &out_cursor, &in_cursor);
        r = vhost_vdpa_net_load_mac(s, n, &out_cursor, &in_cursor);
        if (unlikely(r < 0)) {
            return r;
        }
        r = vhost_vdpa_net_load_mq(s, n, &out_cursor, &in_cursor);
        if (unlikely(r)) {
            return r;
        }
        r = vhost_vdpa_net_load_offloads(s, n, &out_cursor, &in_cursor);
        if (unlikely(r)) {
            return r;
        }
        r = vhost_vdpa_net_load_rx(s, n, &out_cursor, &in_cursor);
        if (unlikely(r)) {
            return r;
        }
        r = vhost_vdpa_net_load_vlan(s, n, &out_cursor, &in_cursor);
        if (unlikely(r)) {
            return r;
        }

        /*
         * We need to poll and check all of the device's pending used buffers.
         *
         * We can poll here since we've had the BQL from the time
         * we sent the descriptor.
         */
        r = vhost_vdpa_net_svq_flush(s, in_cursor.iov_base -
                                     (void *)s->status);
        if (unlikely(r)) {
            return r;
        }
    }

    for (int i = 0; i < v->dev->vq_index; ++i) {
        vhost_vdpa_set_vring_ready(v, i);
    }

    return 0;
}

static NetClientInfo net_vhost_vdpa_cvq_info = {
    .type = NET_CLIENT_DRIVER_VHOST_VDPA,
    .size = sizeof(VhostVDPAState),
    .receive = vhost_vdpa_receive,
    .start = vhost_vdpa_net_cvq_start,
    .load = vhost_vdpa_net_cvq_load,
    .stop = vhost_vdpa_net_cvq_stop,
    .cleanup = vhost_vdpa_cleanup,
    .has_vnet_hdr = vhost_vdpa_has_vnet_hdr,
    .has_ufo = vhost_vdpa_has_ufo,
    .check_peer_type = vhost_vdpa_check_peer_type,
};

/*
 * Forward the excessive VIRTIO_NET_CTRL_MAC_TABLE_SET CVQ command to
 * the vdpa device.
 *
 * Considering that QEMU cannot send the entire filter table to the
 * vdpa device, it should send the VIRTIO_NET_CTRL_RX_PROMISC CVQ
 * command to enable promiscuous mode to receive all packets,
 * according to VirtIO standard, "Since there are no guarantees,
 * it can use a hash filter or silently switch to allmulti or
 * promiscuous mode if it is given too many addresses.".
 *
 * Since QEMU ignores MAC addresses beyond `MAC_TABLE_ENTRIES` and
 * marks `n->mac_table.x_overflow` accordingly, receiving
 * (`MAC_TABLE_ENTRIES` + 1) or more non-multicast MAC addresses has
 * the same effect on the device model. The same applies to multicast
 * MAC addresses.
 *
 * Therefore, QEMU can provide the device model with a fake
 * VIRTIO_NET_CTRL_MAC_TABLE_SET command with (`MAC_TABLE_ENTRIES` + 1)
 * non-multicast MAC addresses and (`MAC_TABLE_ENTRIES` + 1) multicast
 * MAC addresses. This ensures that the device model marks
 * `n->mac_table.uni_overflow` and `n->mac_table.multi_overflow`,
 * allowing all packets to be received, which aligns with the
 * state of the vdpa device.
 */
static int vhost_vdpa_net_excessive_mac_filter_cvq_add(VhostVDPAState *s,
                                                       VirtQueueElement *elem,
                                                       struct iovec *out,
                                                       const struct iovec *in)
{
    struct virtio_net_ctrl_mac mac_data, *mac_ptr;
    struct virtio_net_ctrl_hdr *hdr_ptr;
    uint32_t cursor;
    ssize_t r;
    uint8_t on = 1;

    /* parse the non-multicast MAC address entries from the CVQ command */
    cursor = sizeof(*hdr_ptr);
    r = iov_to_buf(elem->out_sg, elem->out_num, cursor,
                   &mac_data, sizeof(mac_data));
    if (unlikely(r != sizeof(mac_data))) {
        /*
         * If the CVQ command is invalid, we simulate the vdpa device
         * rejecting the VIRTIO_NET_CTRL_MAC_TABLE_SET CVQ command.
         */
        *s->status = VIRTIO_NET_ERR;
        return sizeof(*s->status);
    }
    cursor += sizeof(mac_data) + le32_to_cpu(mac_data.entries) * ETH_ALEN;

    /* parse the multicast MAC address entries from the CVQ command */
    r = iov_to_buf(elem->out_sg, elem->out_num, cursor,
                   &mac_data, sizeof(mac_data));
    if (r != sizeof(mac_data)) {
        /*
         * If the CVQ command is invalid, we simulate the vdpa device
         * rejecting the VIRTIO_NET_CTRL_MAC_TABLE_SET CVQ command.
         */
        *s->status = VIRTIO_NET_ERR;
        return sizeof(*s->status);
    }
    cursor += sizeof(mac_data) + le32_to_cpu(mac_data.entries) * ETH_ALEN;

    /* validate the CVQ command */
    if (iov_size(elem->out_sg, elem->out_num) != cursor) {
        /*
         * If the CVQ command is invalid, we simulate the vdpa device
         * rejecting the VIRTIO_NET_CTRL_MAC_TABLE_SET CVQ command.
         */
        *s->status = VIRTIO_NET_ERR;
        return sizeof(*s->status);
    }

    /*
     * According to VirtIO standard, "Since there are no guarantees,
     * it can use a hash filter or silently switch to allmulti or
     * promiscuous mode if it is given too many addresses.".
     *
     * Therefore, considering that QEMU is unable to send the entire
     * filter table to the vdpa device, it should send the
     * VIRTIO_NET_CTRL_RX_PROMISC CVQ command to enable promiscuous mode.
     */
    hdr_ptr = out->iov_base;
    out->iov_len = sizeof(*hdr_ptr) + sizeof(on);

    hdr_ptr->class = VIRTIO_NET_CTRL_RX;
    hdr_ptr->cmd = VIRTIO_NET_CTRL_RX_PROMISC;
    iov_from_buf(out, 1, sizeof(*hdr_ptr), &on, sizeof(on));
    r = vhost_vdpa_net_cvq_add(s, out, 1, in, 1);
    if (unlikely(r < 0)) {
        return r;
    }

    /*
     * We can poll here since we've had the BQL from the time
     * we sent the descriptor.
     */
    r = vhost_vdpa_net_svq_poll(s, 1);
    if (unlikely(r < sizeof(*s->status))) {
        return r;
    }
    if (*s->status != VIRTIO_NET_OK) {
        return sizeof(*s->status);
    }

    /*
     * QEMU should also send a fake VIRTIO_NET_CTRL_MAC_TABLE_SET CVQ
     * command to the device model, including (`MAC_TABLE_ENTRIES` + 1)
     * non-multicast MAC addresses and (`MAC_TABLE_ENTRIES` + 1)
     * multicast MAC addresses.
     *
     * By doing so, the device model can mark `n->mac_table.uni_overflow`
     * and `n->mac_table.multi_overflow`, enabling all packets to be
     * received, which aligns with the state of the vdpa device.
     */
    cursor = 0;
    uint32_t fake_uni_entries = MAC_TABLE_ENTRIES + 1,
             fake_mul_entries = MAC_TABLE_ENTRIES + 1,
             fake_cvq_size = sizeof(struct virtio_net_ctrl_hdr) +
                             sizeof(mac_data) + fake_uni_entries * ETH_ALEN +
                             sizeof(mac_data) + fake_mul_entries * ETH_ALEN;

    assert(fake_cvq_size < vhost_vdpa_net_cvq_cmd_page_len());
    out->iov_len = fake_cvq_size;

    /* pack the header for the fake CVQ command */
    hdr_ptr = out->iov_base + cursor;
    hdr_ptr->class = VIRTIO_NET_CTRL_MAC;
    hdr_ptr->cmd = VIRTIO_NET_CTRL_MAC_TABLE_SET;
    cursor += sizeof(*hdr_ptr);

    /*
     * Pack the non-multicast MAC addresses part for the fake CVQ command.
     *
     * According to virtio_net_handle_mac(), QEMU doesn't verify the MAC
     * addresses provided in the CVQ command. Therefore, only the entries
     * field needs to be prepared in the CVQ command.
     */
    mac_ptr = out->iov_base + cursor;
    mac_ptr->entries = cpu_to_le32(fake_uni_entries);
    cursor += sizeof(*mac_ptr) + fake_uni_entries * ETH_ALEN;

    /*
     * Pack the multicast MAC addresses part for the fake CVQ command.
     *
     * According to virtio_net_handle_mac(), QEMU doesn't verify the MAC
     * addresses provided in the CVQ command. Therefore, only the entries
     * field needs to be prepared in the CVQ command.
     */
    mac_ptr = out->iov_base + cursor;
    mac_ptr->entries = cpu_to_le32(fake_mul_entries);

    /*
     * Simulate QEMU polling a used buffer from the vdpa device for the
     * VIRTIO_NET_CTRL_MAC_TABLE_SET CVQ command.
     */
    return sizeof(*s->status);
}
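
/*
 * Sizing sketch for the fake command built above, assuming
 * MAC_TABLE_ENTRIES is 64: 2 (ctrl hdr) + 4 + 65 * 6 + 4 + 65 * 6 = 790
 * bytes, comfortably below vhost_vdpa_net_cvq_cmd_page_len(). The MAC bytes
 * themselves are left as-is in the shadow buffer because only the entry
 * counts matter to the device model.
 */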

/**
 * Validate and copy control virtqueue commands.
 *
 * Following QEMU guidelines, we offer a copy of the buffers to the device to
 * prevent TOCTOU bugs.
 */
static int vhost_vdpa_net_handle_ctrl_avail(VhostShadowVirtqueue *svq,
                                            VirtQueueElement *elem,
                                            void *opaque)
{
    VhostVDPAState *s = opaque;
    size_t in_len;
    const struct virtio_net_ctrl_hdr *ctrl;
    virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
    /* Out buffer sent to both the vdpa device and the device model */
    struct iovec out = {
        .iov_base = s->cvq_cmd_out_buffer,
    };
    /* in buffer used for the device model */
    const struct iovec model_in = {
        .iov_base = &status,
        .iov_len = sizeof(status),
    };
    /* in buffer used for the vdpa device */
    const struct iovec vdpa_in = {
        .iov_base = s->status,
        .iov_len = sizeof(*s->status),
    };
    ssize_t dev_written = -EINVAL;

    out.iov_len = iov_to_buf(elem->out_sg, elem->out_num, 0,
                             s->cvq_cmd_out_buffer,
                             vhost_vdpa_net_cvq_cmd_page_len());

    ctrl = s->cvq_cmd_out_buffer;
    if (ctrl->class == VIRTIO_NET_CTRL_ANNOUNCE) {
        /*
         * Guest announce capability is emulated by qemu, so don't forward to
         * the device.
         */
        dev_written = sizeof(status);
        *s->status = VIRTIO_NET_OK;
    } else if (unlikely(ctrl->class == VIRTIO_NET_CTRL_MAC &&
                        ctrl->cmd == VIRTIO_NET_CTRL_MAC_TABLE_SET &&
                        iov_size(elem->out_sg, elem->out_num) > out.iov_len)) {
        /*
         * Due to the size limitation of the out buffer sent to the vdpa device,
         * which is determined by vhost_vdpa_net_cvq_cmd_page_len(), excessive
         * MAC addresses set by the driver for the filter table can cause
         * truncation of the CVQ command in QEMU. As a result, the vdpa device
         * rejects the flawed CVQ command.
         *
         * Therefore, QEMU must handle this situation instead of sending
         * the CVQ command directly.
         */
        dev_written = vhost_vdpa_net_excessive_mac_filter_cvq_add(s, elem,
                                                                  &out,
                                                                  &vdpa_in);
        if (unlikely(dev_written < 0)) {
            goto out;
        }
    } else {
        ssize_t r;
        r = vhost_vdpa_net_cvq_add(s, &out, 1, &vdpa_in, 1);
        if (unlikely(r < 0)) {
            dev_written = r;
            goto out;
        }

        /*
         * We can poll here since we've had the BQL from the time
         * we sent the descriptor.
         */
        dev_written = vhost_vdpa_net_svq_poll(s, 1);
    }

    if (unlikely(dev_written < sizeof(status))) {
        error_report("Insufficient written data (%zd)", dev_written);
        goto out;
    }

    if (*s->status != VIRTIO_NET_OK) {
        goto out;
    }

    status = VIRTIO_NET_ERR;
    virtio_net_handle_ctrl_iov(svq->vdev, &model_in, 1, &out, 1);
    if (status != VIRTIO_NET_OK) {
        error_report("Bad CVQ processing in model");
    }

out:
    in_len = iov_from_buf(elem->in_sg, elem->in_num, 0, &status,
                          sizeof(status));
    if (unlikely(in_len < sizeof(status))) {
        error_report("Bad device CVQ written length");
    }
    vhost_svq_push_elem(svq, elem, MIN(in_len, sizeof(status)));
    /*
     * `elem` belongs to vhost_vdpa_net_handle_ctrl_avail() only when
     * the function successfully forwards the CVQ command, indicated
     * by a non-negative value of `dev_written`. Otherwise, it still
     * belongs to SVQ.
     * This function should only free the `elem` when it owns it.
     */
    if (dev_written >= 0) {
        g_free(elem);
    }
    return dev_written < 0 ? dev_written : 0;
}

static const VhostShadowVirtqueueOps vhost_vdpa_net_svq_ops = {
    .avail_handler = vhost_vdpa_net_handle_ctrl_avail,
};

/**
 * Probe if CVQ is isolated
 *
 * @device_fd: The vdpa device fd
 * @features: Features offered by the device.
 * @cvq_index: The control vq pair index
 *
 * Returns <0 in case of failure, 0 if false and 1 if true.
 */
static int vhost_vdpa_probe_cvq_isolation(int device_fd, uint64_t features,
                                          int cvq_index, Error **errp)
{
    uint64_t backend_features;
    int64_t cvq_group;
    uint8_t status = VIRTIO_CONFIG_S_ACKNOWLEDGE |
                     VIRTIO_CONFIG_S_DRIVER;
    int r;

    ERRP_GUARD();

    r = ioctl(device_fd, VHOST_GET_BACKEND_FEATURES, &backend_features);
    if (unlikely(r < 0)) {
        error_setg_errno(errp, errno, "Cannot get vdpa backend_features");
        return r;
    }

    if (!(backend_features & BIT_ULL(VHOST_BACKEND_F_IOTLB_ASID))) {
        return 0;
    }

    r = ioctl(device_fd, VHOST_VDPA_SET_STATUS, &status);
    if (unlikely(r)) {
        error_setg_errno(errp, -r, "Cannot set device status");
        goto out;
    }

    r = ioctl(device_fd, VHOST_SET_FEATURES, &features);
    if (unlikely(r)) {
        error_setg_errno(errp, -r, "Cannot set features");
        goto out;
    }

    status |= VIRTIO_CONFIG_S_FEATURES_OK;
    r = ioctl(device_fd, VHOST_VDPA_SET_STATUS, &status);
    if (unlikely(r)) {
        error_setg_errno(errp, -r, "Cannot set device status");
        goto out;
    }

    cvq_group = vhost_vdpa_get_vring_group(device_fd, cvq_index, errp);
    if (unlikely(cvq_group < 0)) {
        if (cvq_group != -ENOTSUP) {
            r = cvq_group;
            goto out;
        }

        /*
         * The kernel reports VHOST_BACKEND_F_IOTLB_ASID if the vdpa frontend
         * supports ASID even if the parent driver does not. The CVQ cannot be
         * isolated in this case.
         */
        error_free(*errp);
        *errp = NULL;
        r = 0;
        goto out;
    }

    for (int i = 0; i < cvq_index; ++i) {
        int64_t group = vhost_vdpa_get_vring_group(device_fd, i, errp);
        if (unlikely(group < 0)) {
            r = group;
            goto out;
        }

        if (group == (int64_t)cvq_group) {
            r = 0;
            goto out;
        }
    }

    r = 1;

out:
    status = 0;
    ioctl(device_fd, VHOST_VDPA_SET_STATUS, &status);
    return r;
}
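
/*
 * Example outcome of the probe above (illustrative): a device exposing two
 * data queue pairs in vring group 0 and the control virtqueue in group 1
 * returns 1 (isolated); if any data vq shares the CVQ's group, it returns 0
 * and the CVQ stays in the guest's address space.
 */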

static NetClientState *net_vhost_vdpa_init(NetClientState *peer,
                                           const char *device,
                                           const char *name,
                                           int vdpa_device_fd,
                                           int queue_pair_index,
                                           int nvqs,
                                           bool is_datapath,
                                           bool svq,
                                           struct vhost_vdpa_iova_range iova_range,
                                           uint64_t features,
                                           Error **errp)
{
    NetClientState *nc = NULL;
    VhostVDPAState *s;
    int ret = 0;
    assert(name);
    int cvq_isolated = 0;

    if (is_datapath) {
        nc = qemu_new_net_client(&net_vhost_vdpa_info, peer, device,
                                 name);
    } else {
        cvq_isolated = vhost_vdpa_probe_cvq_isolation(vdpa_device_fd, features,
                                                      queue_pair_index * 2,
                                                      errp);
        if (unlikely(cvq_isolated < 0)) {
            return NULL;
        }

        nc = qemu_new_net_control_client(&net_vhost_vdpa_cvq_info, peer,
                                         device, name);
    }
    qemu_set_info_str(nc, TYPE_VHOST_VDPA);
    s = DO_UPCAST(VhostVDPAState, nc, nc);

    s->vhost_vdpa.device_fd = vdpa_device_fd;
    s->vhost_vdpa.index = queue_pair_index;
    s->always_svq = svq;
    s->migration_state.notify = NULL;
    s->vhost_vdpa.shadow_vqs_enabled = svq;
    s->vhost_vdpa.iova_range = iova_range;
    s->vhost_vdpa.shadow_data = svq;
    if (queue_pair_index == 0) {
        vhost_vdpa_net_valid_svq_features(features,
                                          &s->vhost_vdpa.migration_blocker);
    } else if (!is_datapath) {
        s->cvq_cmd_out_buffer = mmap(NULL, vhost_vdpa_net_cvq_cmd_page_len(),
                                     PROT_READ | PROT_WRITE,
                                     MAP_SHARED | MAP_ANONYMOUS, -1, 0);
        s->status = mmap(NULL, vhost_vdpa_net_cvq_cmd_page_len(),
                         PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS,
                         -1, 0);

        s->vhost_vdpa.shadow_vq_ops = &vhost_vdpa_net_svq_ops;
        s->vhost_vdpa.shadow_vq_ops_opaque = s;
        s->cvq_isolated = cvq_isolated;
    }
    ret = vhost_vdpa_add(nc, (void *)&s->vhost_vdpa, queue_pair_index, nvqs);
    if (ret) {
        qemu_del_net_client(nc);
        return NULL;
    }
    return nc;
}

static int vhost_vdpa_get_features(int fd, uint64_t *features, Error **errp)
{
    int ret = ioctl(fd, VHOST_GET_FEATURES, features);
    if (unlikely(ret < 0)) {
        error_setg_errno(errp, errno,
                         "Fail to query features from vhost-vDPA device");
    }
    return ret;
}

static int vhost_vdpa_get_max_queue_pairs(int fd, uint64_t features,
                                          int *has_cvq, Error **errp)
{
    unsigned long config_size = offsetof(struct vhost_vdpa_config, buf);
    g_autofree struct vhost_vdpa_config *config = NULL;
    __virtio16 *max_queue_pairs;
    int ret;

    if (features & (1 << VIRTIO_NET_F_CTRL_VQ)) {
        *has_cvq = 1;
    } else {
        *has_cvq = 0;
    }

    if (features & (1 << VIRTIO_NET_F_MQ)) {
        config = g_malloc0(config_size + sizeof(*max_queue_pairs));
        config->off = offsetof(struct virtio_net_config, max_virtqueue_pairs);
        config->len = sizeof(*max_queue_pairs);

        ret = ioctl(fd, VHOST_VDPA_GET_CONFIG, config);
        if (ret) {
            error_setg(errp, "Fail to get config from vhost-vDPA device");
            return -ret;
        }

        max_queue_pairs = (__virtio16 *)&config->buf;

        return lduw_le_p(max_queue_pairs);
    }

    return 1;
}
device"); 1721 return -ret; 1722 } 1723 1724 max_queue_pairs = (__virtio16 *)&config->buf; 1725 1726 return lduw_le_p(max_queue_pairs); 1727 } 1728 1729 return 1; 1730 } 1731 1732 int net_init_vhost_vdpa(const Netdev *netdev, const char *name, 1733 NetClientState *peer, Error **errp) 1734 { 1735 const NetdevVhostVDPAOptions *opts; 1736 uint64_t features; 1737 int vdpa_device_fd; 1738 g_autofree NetClientState **ncs = NULL; 1739 struct vhost_vdpa_iova_range iova_range; 1740 NetClientState *nc; 1741 int queue_pairs, r, i = 0, has_cvq = 0; 1742 1743 assert(netdev->type == NET_CLIENT_DRIVER_VHOST_VDPA); 1744 opts = &netdev->u.vhost_vdpa; 1745 if (!opts->vhostdev && !opts->vhostfd) { 1746 error_setg(errp, 1747 "vhost-vdpa: neither vhostdev= nor vhostfd= was specified"); 1748 return -1; 1749 } 1750 1751 if (opts->vhostdev && opts->vhostfd) { 1752 error_setg(errp, 1753 "vhost-vdpa: vhostdev= and vhostfd= are mutually exclusive"); 1754 return -1; 1755 } 1756 1757 if (opts->vhostdev) { 1758 vdpa_device_fd = qemu_open(opts->vhostdev, O_RDWR, errp); 1759 if (vdpa_device_fd == -1) { 1760 return -errno; 1761 } 1762 } else { 1763 /* has_vhostfd */ 1764 vdpa_device_fd = monitor_fd_param(monitor_cur(), opts->vhostfd, errp); 1765 if (vdpa_device_fd == -1) { 1766 error_prepend(errp, "vhost-vdpa: unable to parse vhostfd: "); 1767 return -1; 1768 } 1769 } 1770 1771 r = vhost_vdpa_get_features(vdpa_device_fd, &features, errp); 1772 if (unlikely(r < 0)) { 1773 goto err; 1774 } 1775 1776 queue_pairs = vhost_vdpa_get_max_queue_pairs(vdpa_device_fd, features, 1777 &has_cvq, errp); 1778 if (queue_pairs < 0) { 1779 qemu_close(vdpa_device_fd); 1780 return queue_pairs; 1781 } 1782 1783 r = vhost_vdpa_get_iova_range(vdpa_device_fd, &iova_range); 1784 if (unlikely(r < 0)) { 1785 error_setg(errp, "vhost-vdpa: get iova range failed: %s", 1786 strerror(-r)); 1787 goto err; 1788 } 1789 1790 if (opts->x_svq && !vhost_vdpa_net_valid_svq_features(features, errp)) { 1791 goto err; 1792 } 1793 1794 ncs = g_malloc0(sizeof(*ncs) * queue_pairs); 1795 1796 for (i = 0; i < queue_pairs; i++) { 1797 ncs[i] = net_vhost_vdpa_init(peer, TYPE_VHOST_VDPA, name, 1798 vdpa_device_fd, i, 2, true, opts->x_svq, 1799 iova_range, features, errp); 1800 if (!ncs[i]) 1801 goto err; 1802 } 1803 1804 if (has_cvq) { 1805 nc = net_vhost_vdpa_init(peer, TYPE_VHOST_VDPA, name, 1806 vdpa_device_fd, i, 1, false, 1807 opts->x_svq, iova_range, features, errp); 1808 if (!nc) 1809 goto err; 1810 } 1811 1812 return 0; 1813 1814 err: 1815 if (i) { 1816 for (i--; i >= 0; i--) { 1817 qemu_del_net_client(ncs[i]); 1818 } 1819 } 1820 1821 qemu_close(vdpa_device_fd); 1822 1823 return -1; 1824 } 1825