/*
 * vhost-vdpa.c
 *
 * Copyright(c) 2017-2018 Intel Corporation.
 * Copyright(c) 2020 Red Hat, Inc.
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include "qemu/osdep.h"
#include "clients.h"
#include "hw/virtio/virtio-net.h"
#include "net/vhost_net.h"
#include "net/vhost-vdpa.h"
#include "hw/virtio/vhost-vdpa.h"
#include "qemu/config-file.h"
#include "qemu/error-report.h"
#include "qemu/log.h"
#include "qemu/memalign.h"
#include "qemu/option.h"
#include "qapi/error.h"
#include <linux/vhost.h>
#include <sys/ioctl.h>
#include <err.h>
#include "standard-headers/linux/virtio_net.h"
#include "monitor/monitor.h"
#include "migration/migration.h"
#include "migration/misc.h"
#include "hw/virtio/vhost.h"

/* TODO: add multiqueue support here */
typedef struct VhostVDPAState {
    NetClientState nc;
    struct vhost_vdpa vhost_vdpa;
    Notifier migration_state;
    VHostNetState *vhost_net;

    /* Control commands shadow buffers */
    void *cvq_cmd_out_buffer;
    virtio_net_ctrl_ack *status;

    /* The device always has SVQ enabled */
    bool always_svq;

    /* The device can isolate CVQ in its own ASID */
    bool cvq_isolated;

    bool started;
} VhostVDPAState;

/*
 * The array is sorted alphabetically in ascending order,
 * with the exception of VHOST_INVALID_FEATURE_BIT,
 * which should always be the last entry.
 */
const int vdpa_feature_bits[] = {
    VIRTIO_F_ANY_LAYOUT,
    VIRTIO_F_IOMMU_PLATFORM,
    VIRTIO_F_NOTIFY_ON_EMPTY,
    VIRTIO_F_RING_PACKED,
    VIRTIO_F_RING_RESET,
    VIRTIO_F_VERSION_1,
    VIRTIO_NET_F_CSUM,
    VIRTIO_NET_F_CTRL_GUEST_OFFLOADS,
    VIRTIO_NET_F_CTRL_MAC_ADDR,
    VIRTIO_NET_F_CTRL_RX,
    VIRTIO_NET_F_CTRL_RX_EXTRA,
    VIRTIO_NET_F_CTRL_VLAN,
    VIRTIO_NET_F_CTRL_VQ,
    VIRTIO_NET_F_GSO,
    VIRTIO_NET_F_GUEST_CSUM,
    VIRTIO_NET_F_GUEST_ECN,
    VIRTIO_NET_F_GUEST_TSO4,
    VIRTIO_NET_F_GUEST_TSO6,
    VIRTIO_NET_F_GUEST_UFO,
    VIRTIO_NET_F_GUEST_USO4,
    VIRTIO_NET_F_GUEST_USO6,
    VIRTIO_NET_F_HASH_REPORT,
    VIRTIO_NET_F_HOST_ECN,
    VIRTIO_NET_F_HOST_TSO4,
    VIRTIO_NET_F_HOST_TSO6,
    VIRTIO_NET_F_HOST_UFO,
    VIRTIO_NET_F_HOST_USO,
    VIRTIO_NET_F_MQ,
    VIRTIO_NET_F_MRG_RXBUF,
    VIRTIO_NET_F_MTU,
    VIRTIO_NET_F_RSS,
    VIRTIO_NET_F_STATUS,
    VIRTIO_RING_F_EVENT_IDX,
    VIRTIO_RING_F_INDIRECT_DESC,

    /* VHOST_INVALID_FEATURE_BIT should always be the last entry */
    VHOST_INVALID_FEATURE_BIT
};

/** Supported device specific feature bits with SVQ */
static const uint64_t vdpa_svq_device_features =
    BIT_ULL(VIRTIO_NET_F_CSUM) |
    BIT_ULL(VIRTIO_NET_F_GUEST_CSUM) |
    BIT_ULL(VIRTIO_NET_F_CTRL_GUEST_OFFLOADS) |
    BIT_ULL(VIRTIO_NET_F_MTU) |
    BIT_ULL(VIRTIO_NET_F_MAC) |
    BIT_ULL(VIRTIO_NET_F_GUEST_TSO4) |
    BIT_ULL(VIRTIO_NET_F_GUEST_TSO6) |
    BIT_ULL(VIRTIO_NET_F_GUEST_ECN) |
    BIT_ULL(VIRTIO_NET_F_GUEST_UFO) |
    BIT_ULL(VIRTIO_NET_F_HOST_TSO4) |
    BIT_ULL(VIRTIO_NET_F_HOST_TSO6) |
    BIT_ULL(VIRTIO_NET_F_HOST_ECN) |
    BIT_ULL(VIRTIO_NET_F_HOST_UFO) |
    BIT_ULL(VIRTIO_NET_F_MRG_RXBUF) |
    BIT_ULL(VIRTIO_NET_F_STATUS) |
    BIT_ULL(VIRTIO_NET_F_CTRL_VQ) |
    BIT_ULL(VIRTIO_NET_F_CTRL_RX) |
    BIT_ULL(VIRTIO_NET_F_CTRL_VLAN) |
    BIT_ULL(VIRTIO_NET_F_CTRL_RX_EXTRA) |
    BIT_ULL(VIRTIO_NET_F_MQ) |
    BIT_ULL(VIRTIO_F_ANY_LAYOUT) |
    BIT_ULL(VIRTIO_NET_F_CTRL_MAC_ADDR) |
    /* VHOST_F_LOG_ALL is exposed by SVQ */
    BIT_ULL(VHOST_F_LOG_ALL) |
    BIT_ULL(VIRTIO_NET_F_HASH_REPORT) |
    BIT_ULL(VIRTIO_NET_F_RSS) |
    BIT_ULL(VIRTIO_NET_F_RSC_EXT) |
    BIT_ULL(VIRTIO_NET_F_STANDBY) |
    BIT_ULL(VIRTIO_NET_F_SPEED_DUPLEX);

#define VHOST_VDPA_NET_CVQ_ASID 1

VHostNetState *vhost_vdpa_get_vhost_net(NetClientState *nc)
{
    VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
    return s->vhost_net;
}

static size_t vhost_vdpa_net_cvq_cmd_len(void)
{
    /*
     * MAC_TABLE_SET is the ctrl command that produces the longest out buffer.
     * The in buffer is always 1 byte, so it fits here.
     */
    return sizeof(struct virtio_net_ctrl_hdr) +
           2 * sizeof(struct virtio_net_ctrl_mac) +
           MAC_TABLE_ENTRIES * ETH_ALEN;
}

static size_t vhost_vdpa_net_cvq_cmd_page_len(void)
{
    return ROUND_UP(vhost_vdpa_net_cvq_cmd_len(), qemu_real_host_page_size());
}

static bool vhost_vdpa_net_valid_svq_features(uint64_t features, Error **errp)
{
    uint64_t invalid_dev_features =
        features & ~vdpa_svq_device_features &
        /* Transport features are all accepted at this point */
        ~MAKE_64BIT_MASK(VIRTIO_TRANSPORT_F_START,
                         VIRTIO_TRANSPORT_F_END - VIRTIO_TRANSPORT_F_START);

    if (invalid_dev_features) {
        error_setg(errp, "vdpa svq does not work with features 0x%" PRIx64,
                   invalid_dev_features);
        return false;
    }

    return vhost_svq_valid_features(features, errp);
}

static int vhost_vdpa_net_check_device_id(struct vhost_net *net)
{
    uint32_t device_id;
    int ret;
    struct vhost_dev *hdev;

    hdev = (struct vhost_dev *)&net->dev;
    ret = hdev->vhost_ops->vhost_get_device_id(hdev, &device_id);
    if (device_id != VIRTIO_ID_NET) {
        return -ENOTSUP;
    }
    return ret;
}

static int vhost_vdpa_add(NetClientState *ncs, void *be,
                          int queue_pair_index, int nvqs)
{
    VhostNetOptions options;
    struct vhost_net *net = NULL;
    VhostVDPAState *s;
    int ret;

    options.backend_type = VHOST_BACKEND_TYPE_VDPA;
    assert(ncs->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
    s = DO_UPCAST(VhostVDPAState, nc, ncs);
    options.net_backend = ncs;
    options.opaque = be;
    options.busyloop_timeout = 0;
    options.nvqs = nvqs;

    net = vhost_net_init(&options);
    if (!net) {
        error_report("failed to init vhost_net for queue");
        goto err_init;
    }
    s->vhost_net = net;
    ret = vhost_vdpa_net_check_device_id(net);
    if (ret) {
        goto err_check;
    }
    return 0;
err_check:
    vhost_net_cleanup(net);
    g_free(net);
err_init:
    return -1;
}

static void vhost_vdpa_cleanup(NetClientState *nc)
{
    VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);

    /*
     * If a peer NIC is attached, do not clean up anything.
     * Cleanup will happen as a part of qemu_cleanup() -> net_cleanup()
     * when the guest is shutting down.
     */
    if (nc->peer && nc->peer->info->type == NET_CLIENT_DRIVER_NIC) {
        return;
    }
    munmap(s->cvq_cmd_out_buffer, vhost_vdpa_net_cvq_cmd_page_len());
    munmap(s->status, vhost_vdpa_net_cvq_cmd_page_len());
    if (s->vhost_net) {
        vhost_net_cleanup(s->vhost_net);
        g_free(s->vhost_net);
        s->vhost_net = NULL;
    }
    if (s->vhost_vdpa.index != 0) {
        return;
    }
    qemu_close(s->vhost_vdpa.shared->device_fd);
    g_free(s->vhost_vdpa.shared);
}

/** Dummy SetSteeringEBPF to support RSS for vhost-vdpa backend */
static bool vhost_vdpa_set_steering_ebpf(NetClientState *nc, int prog_fd)
{
    return true;
}

static bool vhost_vdpa_has_vnet_hdr(NetClientState *nc)
{
    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);

    return true;
}

static bool vhost_vdpa_has_ufo(NetClientState *nc)
{
    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
    VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
    uint64_t features = 0;
    features |= (1ULL << VIRTIO_NET_F_HOST_UFO);
    features = vhost_net_get_features(s->vhost_net, features);
    return !!(features & (1ULL << VIRTIO_NET_F_HOST_UFO));

}

static bool vhost_vdpa_check_peer_type(NetClientState *nc, ObjectClass *oc,
                                       Error **errp)
{
    const char *driver = object_class_get_name(oc);

    if (!g_str_has_prefix(driver, "virtio-net-")) {
        error_setg(errp, "vhost-vdpa requires frontend driver virtio-net-*");
        return false;
    }

    return true;
}

/** Dummy receive in case qemu falls back to userland tap networking */
static ssize_t vhost_vdpa_receive(NetClientState *nc, const uint8_t *buf,
                                  size_t size)
{
    return size;
}

static void vhost_vdpa_net_log_global_enable(VhostVDPAState *s, bool enable)
{
    struct vhost_vdpa *v = &s->vhost_vdpa;
    VirtIONet *n;
    VirtIODevice *vdev;
    int data_queue_pairs, cvq, r;

    /* We are only called on the first data vqs and only if x-svq is not set */
    if (s->vhost_vdpa.shadow_vqs_enabled == enable) {
        return;
    }

    vdev = v->dev->vdev;
    n = VIRTIO_NET(vdev);
    if (!n->vhost_started) {
        return;
    }

    data_queue_pairs = n->multiqueue ? n->max_queue_pairs : 1;
    cvq = virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ) ?
                                  n->max_ncs - n->max_queue_pairs : 0;
    /*
     * TODO: vhost_net_stop does suspend, get_base and reset. We can be smarter
     * in the future and resume the device if read-only operations between
     * suspend and reset go wrong.
     */
    vhost_net_stop(vdev, n->nic->ncs, data_queue_pairs, cvq);

    /* Start will check migration setup_or_active to decide whether to configure SVQ */
    r = vhost_net_start(vdev, n->nic->ncs, data_queue_pairs, cvq);
    if (unlikely(r < 0)) {
        error_report("unable to start vhost net: %s(%d)", g_strerror(-r), -r);
    }
}

static void vdpa_net_migration_state_notifier(Notifier *notifier, void *data)
{
    MigrationState *migration = data;
    VhostVDPAState *s = container_of(notifier, VhostVDPAState,
                                     migration_state);

    if (migration_in_setup(migration)) {
        vhost_vdpa_net_log_global_enable(s, true);
    } else if (migration_has_failed(migration)) {
        vhost_vdpa_net_log_global_enable(s, false);
    }
}

static void vhost_vdpa_net_data_start_first(VhostVDPAState *s)
{
    struct vhost_vdpa *v = &s->vhost_vdpa;

    migration_add_notifier(&s->migration_state,
                           vdpa_net_migration_state_notifier);
    if (v->shadow_vqs_enabled) {
        v->shared->iova_tree = vhost_iova_tree_new(v->shared->iova_range.first,
                                                   v->shared->iova_range.last);
    }
}

static int vhost_vdpa_net_data_start(NetClientState *nc)
{
    VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
    struct vhost_vdpa *v = &s->vhost_vdpa;

    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);

    if (s->always_svq ||
        migration_is_setup_or_active(migrate_get_current()->state)) {
        v->shadow_vqs_enabled = true;
    } else {
        v->shadow_vqs_enabled = false;
    }

    if (v->index == 0) {
        v->shared->shadow_data = v->shadow_vqs_enabled;
        vhost_vdpa_net_data_start_first(s);
        return 0;
    }

    return 0;
}

static int vhost_vdpa_net_data_load(NetClientState *nc)
{
    VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
    struct vhost_vdpa *v = &s->vhost_vdpa;
    bool has_cvq = v->dev->vq_index_end % 2;

    if (has_cvq) {
        return 0;
    }

    for (int i = 0; i < v->dev->nvqs; ++i) {
        vhost_vdpa_set_vring_ready(v, i + v->dev->vq_index);
    }
    return 0;
}

static void vhost_vdpa_net_client_stop(NetClientState *nc)
{
    VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
    struct vhost_dev *dev;

    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);

    if (s->vhost_vdpa.index == 0) {
        migration_remove_notifier(&s->migration_state);
    }

    dev = s->vhost_vdpa.dev;
    if (dev->vq_index + dev->nvqs == dev->vq_index_end) {
        g_clear_pointer(&s->vhost_vdpa.shared->iova_tree,
                        vhost_iova_tree_delete);
    }
}

static NetClientInfo net_vhost_vdpa_info = {
    .type = NET_CLIENT_DRIVER_VHOST_VDPA,
    .size = sizeof(VhostVDPAState),
    .receive = vhost_vdpa_receive,
    .start = vhost_vdpa_net_data_start,
    .load = vhost_vdpa_net_data_load,
    .stop = vhost_vdpa_net_client_stop,
    .cleanup = vhost_vdpa_cleanup,
    .has_vnet_hdr = vhost_vdpa_has_vnet_hdr,
    .has_ufo = vhost_vdpa_has_ufo,
    .check_peer_type = vhost_vdpa_check_peer_type,
    .set_steering_ebpf = vhost_vdpa_set_steering_ebpf,
};

static int64_t vhost_vdpa_get_vring_group(int device_fd, unsigned vq_index,
                                          Error **errp)
{
    struct vhost_vring_state state = {
        .index = vq_index,
    };
    int r = ioctl(device_fd, VHOST_VDPA_GET_VRING_GROUP, &state);

    if (unlikely(r < 0)) {
        r = -errno;
        error_setg_errno(errp, errno, "Cannot get VQ %u group", vq_index);
        return r;
    }

    return state.num;
}

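/*
 * Bind virtqueue group @vq_group to the address space identified by
 * @asid_num through the VHOST_VDPA_SET_GROUP_ASID ioctl.
 *
 * Returns the ioctl result: 0 on success, negative on failure with the
 * error already reported.
 */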
static int vhost_vdpa_set_address_space_id(struct vhost_vdpa *v,
                                           unsigned vq_group,
                                           unsigned asid_num)
{
    struct vhost_vring_state asid = {
        .index = vq_group,
        .num = asid_num,
    };
    int r;

    r = ioctl(v->shared->device_fd, VHOST_VDPA_SET_GROUP_ASID, &asid);
    if (unlikely(r < 0)) {
        error_report("Can't set vq group %u asid %u, errno=%d (%s)",
                     asid.index, asid.num, errno, g_strerror(errno));
    }
    return r;
}

static void vhost_vdpa_cvq_unmap_buf(struct vhost_vdpa *v, void *addr)
{
    VhostIOVATree *tree = v->shared->iova_tree;
    DMAMap needle = {
        /*
         * No need to specify size or to look for more translations since
         * this contiguous chunk was allocated by us.
         */
        .translated_addr = (hwaddr)(uintptr_t)addr,
    };
    const DMAMap *map = vhost_iova_tree_find_iova(tree, &needle);
    int r;

    if (unlikely(!map)) {
        error_report("Cannot locate expected map");
        return;
    }

    r = vhost_vdpa_dma_unmap(v, v->address_space_id, map->iova, map->size + 1);
    if (unlikely(r != 0)) {
        error_report("Device cannot unmap: %s(%d)", g_strerror(r), r);
    }

    vhost_iova_tree_remove(tree, *map);
}

/** Map CVQ buffer. */
static int vhost_vdpa_cvq_map_buf(struct vhost_vdpa *v, void *buf, size_t size,
                                  bool write)
{
    DMAMap map = {};
    int r;

    map.translated_addr = (hwaddr)(uintptr_t)buf;
    map.size = size - 1;
    map.perm = write ? IOMMU_RW : IOMMU_RO;
    r = vhost_iova_tree_map_alloc(v->shared->iova_tree, &map);
    if (unlikely(r != IOVA_OK)) {
        error_report("Cannot map injected element");
        return r;
    }

    r = vhost_vdpa_dma_map(v, v->address_space_id, map.iova,
                           vhost_vdpa_net_cvq_cmd_page_len(), buf, !write);
    if (unlikely(r < 0)) {
        goto dma_map_err;
    }

    return 0;

dma_map_err:
    vhost_iova_tree_remove(v->shared->iova_tree, map);
    return r;
}

static int vhost_vdpa_net_cvq_start(NetClientState *nc)
{
    VhostVDPAState *s;
    struct vhost_vdpa *v;
    int64_t cvq_group;
    int r;
    Error *err = NULL;

    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);

    s = DO_UPCAST(VhostVDPAState, nc, nc);
    v = &s->vhost_vdpa;

    v->shadow_vqs_enabled = v->shared->shadow_data;
    s->vhost_vdpa.address_space_id = VHOST_VDPA_GUEST_PA_ASID;

    if (v->shared->shadow_data) {
        /* SVQ is already configured for all virtqueues */
        goto out;
    }

    /*
     * If we return early in these cases, SVQ is not enabled and migration
     * will be blocked as long as the vhost-vdpa backend does not offer
     * _F_LOG.
     */
    if (!vhost_vdpa_net_valid_svq_features(v->dev->features, NULL)) {
        return 0;
    }

    if (!s->cvq_isolated) {
        return 0;
    }

    cvq_group = vhost_vdpa_get_vring_group(v->shared->device_fd,
                                           v->dev->vq_index_end - 1,
                                           &err);
    if (unlikely(cvq_group < 0)) {
        error_report_err(err);
        return cvq_group;
    }

    r = vhost_vdpa_set_address_space_id(v, cvq_group, VHOST_VDPA_NET_CVQ_ASID);
    if (unlikely(r < 0)) {
        return r;
    }

    v->shadow_vqs_enabled = true;
    s->vhost_vdpa.address_space_id = VHOST_VDPA_NET_CVQ_ASID;

out:
    if (!s->vhost_vdpa.shadow_vqs_enabled) {
        return 0;
    }

    /*
     * If another vhost_vdpa instance already has an iova_tree, reuse it for
     * simplicity, whether CVQ shares the ASID with the guest or not, because:
     * - The memory listener needs access to the guest's memory addresses
     *   allocated in the IOVA tree.
     * - There should be plenty of IOVA address space for both ASIDs, so
     *   collisions between them are not a concern. The guest's translations
     *   are still validated with virtio virtqueue_pop, so there is no risk of
     *   the guest accessing memory it shouldn't.
     *
     * Allocating an IOVA tree per ASID is doable but it complicates the code
     * and is not worth it for the moment.
     */
    if (!v->shared->iova_tree) {
        v->shared->iova_tree = vhost_iova_tree_new(v->shared->iova_range.first,
                                                   v->shared->iova_range.last);
    }

    r = vhost_vdpa_cvq_map_buf(&s->vhost_vdpa, s->cvq_cmd_out_buffer,
                               vhost_vdpa_net_cvq_cmd_page_len(), false);
    if (unlikely(r < 0)) {
        return r;
    }

    r = vhost_vdpa_cvq_map_buf(&s->vhost_vdpa, s->status,
                               vhost_vdpa_net_cvq_cmd_page_len(), true);
    if (unlikely(r < 0)) {
        vhost_vdpa_cvq_unmap_buf(&s->vhost_vdpa, s->cvq_cmd_out_buffer);
    }

    return r;
}

static void vhost_vdpa_net_cvq_stop(NetClientState *nc)
{
    VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);

    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);

    if (s->vhost_vdpa.shadow_vqs_enabled) {
        vhost_vdpa_cvq_unmap_buf(&s->vhost_vdpa, s->cvq_cmd_out_buffer);
        vhost_vdpa_cvq_unmap_buf(&s->vhost_vdpa, s->status);
    }

    vhost_vdpa_net_client_stop(nc);
}

static ssize_t vhost_vdpa_net_cvq_add(VhostVDPAState *s,
                                      const struct iovec *out_sg, size_t out_num,
                                      const struct iovec *in_sg, size_t in_num)
{
    VhostShadowVirtqueue *svq = g_ptr_array_index(s->vhost_vdpa.shadow_vqs, 0);
    int r;

    r = vhost_svq_add(svq, out_sg, out_num, in_sg, in_num, NULL);
    if (unlikely(r != 0)) {
        if (unlikely(r == -ENOSPC)) {
            qemu_log_mask(LOG_GUEST_ERROR, "%s: No space on device queue\n",
                          __func__);
        }
    }

    return r;
}

/*
 * Convenience wrapper to poll SVQ for multiple control commands.
 *
 * Caller should hold the BQL when invoking this function, and should take
 * the answer before SVQ pulls by itself when BQL is released.
 */
static ssize_t vhost_vdpa_net_svq_poll(VhostVDPAState *s, size_t cmds_in_flight)
{
    VhostShadowVirtqueue *svq = g_ptr_array_index(s->vhost_vdpa.shadow_vqs, 0);
    return vhost_svq_poll(svq, cmds_in_flight);
}

static void vhost_vdpa_net_load_cursor_reset(VhostVDPAState *s,
                                             struct iovec *out_cursor,
                                             struct iovec *in_cursor)
{
    /* reset the cursor of the output buffer for the device */
    out_cursor->iov_base = s->cvq_cmd_out_buffer;
    out_cursor->iov_len = vhost_vdpa_net_cvq_cmd_page_len();

    /* reset the cursor of the in buffer for the device */
    in_cursor->iov_base = s->status;
    in_cursor->iov_len = vhost_vdpa_net_cvq_cmd_page_len();
}

/*
 * Poll SVQ for multiple pending control commands and check the device's ack.
 *
 * Caller should hold the BQL when invoking this function.
 *
 * @s: The VhostVDPAState
 * @len: The length of the pending status shadow buffer
 */
static ssize_t vhost_vdpa_net_svq_flush(VhostVDPAState *s, size_t len)
{
    /* device uses a one-byte length ack for each control command */
    ssize_t dev_written = vhost_vdpa_net_svq_poll(s, len);
    if (unlikely(dev_written != len)) {
        return -EIO;
    }

    /* check the device's ack */
    for (int i = 0; i < len; ++i) {
        if (s->status[i] != VIRTIO_NET_OK) {
            return -EIO;
        }
    }
    return 0;
}

static ssize_t vhost_vdpa_net_load_cmd(VhostVDPAState *s,
                                       struct iovec *out_cursor,
                                       struct iovec *in_cursor, uint8_t class,
                                       uint8_t cmd, const struct iovec *data_sg,
                                       size_t data_num)
{
    const struct virtio_net_ctrl_hdr ctrl = {
        .class = class,
        .cmd = cmd,
    };
    size_t data_size = iov_size(data_sg, data_num), cmd_size;
    struct iovec out, in;
    ssize_t r;
    unsigned dummy_cursor_iov_cnt;
    VhostShadowVirtqueue *svq = g_ptr_array_index(s->vhost_vdpa.shadow_vqs, 0);

    assert(data_size < vhost_vdpa_net_cvq_cmd_page_len() - sizeof(ctrl));
    cmd_size = sizeof(ctrl) + data_size;
    if (vhost_svq_available_slots(svq) < 2 ||
        iov_size(out_cursor, 1) < cmd_size) {
        /*
         * It is time to flush all pending control commands if SVQ is full
         * or control commands shadow buffers are full.
         *
         * We can poll here since we've had BQL from the time
         * we sent the descriptor.
         */
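        /*
         * The flush length is how far the in cursor has advanced over the
         * status shadow buffer, i.e. the number of commands still in flight.
         */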
        r = vhost_vdpa_net_svq_flush(s, in_cursor->iov_base -
                                     (void *)s->status);
        if (unlikely(r < 0)) {
            return r;
        }

        vhost_vdpa_net_load_cursor_reset(s, out_cursor, in_cursor);
    }

    /* pack the CVQ command header */
    iov_from_buf(out_cursor, 1, 0, &ctrl, sizeof(ctrl));
    /* pack the CVQ command command-specific-data */
    iov_to_buf(data_sg, data_num, 0,
               out_cursor->iov_base + sizeof(ctrl), data_size);

    /* extract the required buffer from the cursor for output */
    iov_copy(&out, 1, out_cursor, 1, 0, cmd_size);
    /* extract the required buffer from the cursor for input */
    iov_copy(&in, 1, in_cursor, 1, 0, sizeof(*s->status));

    r = vhost_vdpa_net_cvq_add(s, &out, 1, &in, 1);
    if (unlikely(r < 0)) {
        return r;
    }

    /* iterate the cursors */
    dummy_cursor_iov_cnt = 1;
    iov_discard_front(&out_cursor, &dummy_cursor_iov_cnt, cmd_size);
    dummy_cursor_iov_cnt = 1;
    iov_discard_front(&in_cursor, &dummy_cursor_iov_cnt, sizeof(*s->status));

    return 0;
}

static int vhost_vdpa_net_load_mac(VhostVDPAState *s, const VirtIONet *n,
                                   struct iovec *out_cursor,
                                   struct iovec *in_cursor)
{
    if (virtio_vdev_has_feature(&n->parent_obj, VIRTIO_NET_F_CTRL_MAC_ADDR)) {
        const struct iovec data = {
            .iov_base = (void *)n->mac,
            .iov_len = sizeof(n->mac),
        };
        ssize_t r = vhost_vdpa_net_load_cmd(s, out_cursor, in_cursor,
                                            VIRTIO_NET_CTRL_MAC,
                                            VIRTIO_NET_CTRL_MAC_ADDR_SET,
                                            &data, 1);
        if (unlikely(r < 0)) {
            return r;
        }
    }

    /*
     * According to VirtIO standard, "The device MUST have an
     * empty MAC filtering table on reset.".
     *
     * Therefore, there is no need to send this CVQ command if the
     * driver also sets an empty MAC filter table, which aligns with
     * the device's defaults.
     *
     * Note that the device's defaults can mismatch the driver's
     * configuration only at live migration.
     */
    if (!virtio_vdev_has_feature(&n->parent_obj, VIRTIO_NET_F_CTRL_RX) ||
        n->mac_table.in_use == 0) {
        return 0;
    }

    uint32_t uni_entries = n->mac_table.first_multi,
             uni_macs_size = uni_entries * ETH_ALEN,
             mul_entries = n->mac_table.in_use - uni_entries,
             mul_macs_size = mul_entries * ETH_ALEN;
    struct virtio_net_ctrl_mac uni = {
        .entries = cpu_to_le32(uni_entries),
    };
    struct virtio_net_ctrl_mac mul = {
        .entries = cpu_to_le32(mul_entries),
    };
    const struct iovec data[] = {
        {
            .iov_base = &uni,
            .iov_len = sizeof(uni),
        }, {
            .iov_base = n->mac_table.macs,
            .iov_len = uni_macs_size,
        }, {
            .iov_base = &mul,
            .iov_len = sizeof(mul),
        }, {
            .iov_base = &n->mac_table.macs[uni_macs_size],
            .iov_len = mul_macs_size,
        },
    };
    ssize_t r = vhost_vdpa_net_load_cmd(s, out_cursor, in_cursor,
                                        VIRTIO_NET_CTRL_MAC,
                                        VIRTIO_NET_CTRL_MAC_TABLE_SET,
                                        data, ARRAY_SIZE(data));
    if (unlikely(r < 0)) {
        return r;
    }

    return 0;
}

static int vhost_vdpa_net_load_rss(VhostVDPAState *s, const VirtIONet *n,
                                   struct iovec *out_cursor,
                                   struct iovec *in_cursor, bool do_rss)
{
    struct virtio_net_rss_config cfg = {};
    ssize_t r;
    g_autofree uint16_t *table = NULL;

    /*
     * According to VirtIO standard, "Initially the device has all hash
     * types disabled and reports only VIRTIO_NET_HASH_REPORT_NONE.".
     *
     * Therefore, there is no need to send this CVQ command if the
     * driver disables all hash types, which aligns with
     * the device's defaults.
     *
     * Note that the device's defaults can mismatch the driver's
     * configuration only at live migration.
     */
    if (!n->rss_data.enabled ||
        n->rss_data.hash_types == VIRTIO_NET_HASH_REPORT_NONE) {
        return 0;
    }

    table = g_malloc_n(n->rss_data.indirections_len,
                       sizeof(n->rss_data.indirections_table[0]));
    cfg.hash_types = cpu_to_le32(n->rss_data.hash_types);

    if (do_rss) {
        /*
         * According to VirtIO standard, "Number of entries in indirection_table
         * is (indirection_table_mask + 1)".
         */
        cfg.indirection_table_mask = cpu_to_le16(n->rss_data.indirections_len -
                                                 1);
        cfg.unclassified_queue = cpu_to_le16(n->rss_data.default_queue);
        for (int i = 0; i < n->rss_data.indirections_len; ++i) {
            table[i] = cpu_to_le16(n->rss_data.indirections_table[i]);
        }
        cfg.max_tx_vq = cpu_to_le16(n->curr_queue_pairs);
    } else {
        /*
         * According to VirtIO standard, "Field reserved MUST contain zeroes.
         * It is defined to make the structure to match the layout of
         * virtio_net_rss_config structure, defined in 5.1.6.5.7.".
         *
         * Therefore, we need to zero the fields in
         * struct virtio_net_rss_config, which correspond to the
         * `reserved` field in struct virtio_net_hash_config.
         *
         * Note that all other fields are zeroed at their definitions,
         * except for the `indirection_table` field, where the actual data
         * is stored in the `table` variable to ensure compatibility
         * with the RSS case. Therefore, we need to zero the `table` variable
         * here.
         */
        table[0] = 0;
    }

    /*
     * Considering that virtio_net_handle_rss() currently does not restore
     * the hash key length parsed from the CVQ command sent from the guest
     * into n->rss_data and uses the maximum key length in other code, we
     * also employ the maximum key length here.
     */
    cfg.hash_key_length = sizeof(n->rss_data.key);

    const struct iovec data[] = {
        {
            .iov_base = &cfg,
            .iov_len = offsetof(struct virtio_net_rss_config,
                                indirection_table),
        }, {
            .iov_base = table,
            .iov_len = n->rss_data.indirections_len *
                       sizeof(n->rss_data.indirections_table[0]),
        }, {
            .iov_base = &cfg.max_tx_vq,
            .iov_len = offsetof(struct virtio_net_rss_config, hash_key_data) -
                       offsetof(struct virtio_net_rss_config, max_tx_vq),
        }, {
            .iov_base = (void *)n->rss_data.key,
            .iov_len = sizeof(n->rss_data.key),
        }
    };

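    /*
     * The same buffer layout is used for both commands: do_rss selects
     * VIRTIO_NET_CTRL_MQ_RSS_CONFIG, otherwise VIRTIO_NET_CTRL_MQ_HASH_CONFIG
     * is sent, whose virtio_net_hash_config layout matches the
     * virtio_net_rss_config layout prepared above.
     */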
    r = vhost_vdpa_net_load_cmd(s, out_cursor, in_cursor,
                                VIRTIO_NET_CTRL_MQ,
                                do_rss ? VIRTIO_NET_CTRL_MQ_RSS_CONFIG :
                                VIRTIO_NET_CTRL_MQ_HASH_CONFIG,
                                data, ARRAY_SIZE(data));
    if (unlikely(r < 0)) {
        return r;
    }

    return 0;
}

static int vhost_vdpa_net_load_mq(VhostVDPAState *s,
                                  const VirtIONet *n,
                                  struct iovec *out_cursor,
                                  struct iovec *in_cursor)
{
    struct virtio_net_ctrl_mq mq;
    ssize_t r;

    if (!virtio_vdev_has_feature(&n->parent_obj, VIRTIO_NET_F_MQ)) {
        return 0;
    }

    mq.virtqueue_pairs = cpu_to_le16(n->curr_queue_pairs);
    const struct iovec data = {
        .iov_base = &mq,
        .iov_len = sizeof(mq),
    };
    r = vhost_vdpa_net_load_cmd(s, out_cursor, in_cursor,
                                VIRTIO_NET_CTRL_MQ,
                                VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET,
                                &data, 1);
    if (unlikely(r < 0)) {
        return r;
    }

    if (virtio_vdev_has_feature(&n->parent_obj, VIRTIO_NET_F_RSS)) {
        /* load the receive-side scaling state */
        r = vhost_vdpa_net_load_rss(s, n, out_cursor, in_cursor, true);
        if (unlikely(r < 0)) {
            return r;
        }
    } else if (virtio_vdev_has_feature(&n->parent_obj,
                                       VIRTIO_NET_F_HASH_REPORT)) {
        /* load the hash calculation state */
        r = vhost_vdpa_net_load_rss(s, n, out_cursor, in_cursor, false);
        if (unlikely(r < 0)) {
            return r;
        }
    }

    return 0;
}

static int vhost_vdpa_net_load_offloads(VhostVDPAState *s,
                                        const VirtIONet *n,
                                        struct iovec *out_cursor,
                                        struct iovec *in_cursor)
{
    uint64_t offloads;
    ssize_t r;

    if (!virtio_vdev_has_feature(&n->parent_obj,
                                 VIRTIO_NET_F_CTRL_GUEST_OFFLOADS)) {
        return 0;
    }

    if (n->curr_guest_offloads == virtio_net_supported_guest_offloads(n)) {
        /*
         * According to VirtIO standard, "Upon feature negotiation
         * corresponding offload gets enabled to preserve
         * backward compatibility.".
         *
         * Therefore, there is no need to send this CVQ command if the
         * driver also enables all supported offloads, which aligns with
         * the device's defaults.
         *
         * Note that the device's defaults can mismatch the driver's
         * configuration only at live migration.
         */
        return 0;
    }

    offloads = cpu_to_le64(n->curr_guest_offloads);
    const struct iovec data = {
        .iov_base = &offloads,
        .iov_len = sizeof(offloads),
    };
    r = vhost_vdpa_net_load_cmd(s, out_cursor, in_cursor,
                                VIRTIO_NET_CTRL_GUEST_OFFLOADS,
                                VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET,
                                &data, 1);
    if (unlikely(r < 0)) {
        return r;
    }

    return 0;
}

static int vhost_vdpa_net_load_rx_mode(VhostVDPAState *s,
                                       struct iovec *out_cursor,
                                       struct iovec *in_cursor,
                                       uint8_t cmd,
                                       uint8_t on)
{
    const struct iovec data = {
        .iov_base = &on,
        .iov_len = sizeof(on),
    };
    ssize_t r;

    r = vhost_vdpa_net_load_cmd(s, out_cursor, in_cursor,
                                VIRTIO_NET_CTRL_RX, cmd, &data, 1);
    if (unlikely(r < 0)) {
        return r;
    }

    return 0;
}

static int vhost_vdpa_net_load_rx(VhostVDPAState *s,
                                  const VirtIONet *n,
                                  struct iovec *out_cursor,
                                  struct iovec *in_cursor)
{
    ssize_t r;

    if (!virtio_vdev_has_feature(&n->parent_obj, VIRTIO_NET_F_CTRL_RX)) {
        return 0;
    }

    /*
     * According to virtio_net_reset(), device turns promiscuous mode
     * on by default.
     *
     * Additionally, according to VirtIO standard, "Since there are
     * no guarantees, it can use a hash filter or silently switch to
     * allmulti or promiscuous mode if it is given too many addresses.".
     * QEMU marks `n->mac_table.uni_overflow` if guest sets too many
     * non-multicast MAC addresses, indicating that promiscuous mode
     * should be enabled.
     *
     * Therefore, QEMU should only send this CVQ command if the
     * `n->mac_table.uni_overflow` is not marked and `n->promisc` is off,
     * which sets promiscuous mode off, different from the device's defaults.
     *
     * Note that the device's defaults can mismatch the driver's
     * configuration only at live migration.
     */
    if (!n->mac_table.uni_overflow && !n->promisc) {
        r = vhost_vdpa_net_load_rx_mode(s, out_cursor, in_cursor,
                                        VIRTIO_NET_CTRL_RX_PROMISC, 0);
        if (unlikely(r < 0)) {
            return r;
        }
    }

    /*
     * According to virtio_net_reset(), device turns all-multicast mode
     * off by default.
     *
     * According to VirtIO standard, "Since there are no guarantees,
     * it can use a hash filter or silently switch to allmulti or
     * promiscuous mode if it is given too many addresses.". QEMU marks
     * `n->mac_table.multi_overflow` if guest sets too many
     * multicast MAC addresses.
     *
     * Therefore, QEMU should only send this CVQ command if the
     * `n->mac_table.multi_overflow` is marked or `n->allmulti` is on,
     * which sets all-multicast mode on, different from the device's defaults.
     *
     * Note that the device's defaults can mismatch the driver's
     * configuration only at live migration.
     */
    if (n->mac_table.multi_overflow || n->allmulti) {
        r = vhost_vdpa_net_load_rx_mode(s, out_cursor, in_cursor,
                                        VIRTIO_NET_CTRL_RX_ALLMULTI, 1);
        if (unlikely(r < 0)) {
            return r;
        }
    }

    if (!virtio_vdev_has_feature(&n->parent_obj, VIRTIO_NET_F_CTRL_RX_EXTRA)) {
        return 0;
    }

    /*
     * According to virtio_net_reset(), device turns all-unicast mode
     * off by default.
     *
     * Therefore, QEMU should only send this CVQ command if the driver
     * sets all-unicast mode on, different from the device's defaults.
     *
     * Note that the device's defaults can mismatch the driver's
     * configuration only at live migration.
     */
    if (n->alluni) {
        r = vhost_vdpa_net_load_rx_mode(s, out_cursor, in_cursor,
                                        VIRTIO_NET_CTRL_RX_ALLUNI, 1);
        if (r < 0) {
            return r;
        }
    }

    /*
     * According to virtio_net_reset(), device turns non-multicast mode
     * off by default.
     *
     * Therefore, QEMU should only send this CVQ command if the driver
     * sets non-multicast mode on, different from the device's defaults.
     *
     * Note that the device's defaults can mismatch the driver's
     * configuration only at live migration.
     */
    if (n->nomulti) {
        r = vhost_vdpa_net_load_rx_mode(s, out_cursor, in_cursor,
                                        VIRTIO_NET_CTRL_RX_NOMULTI, 1);
        if (r < 0) {
            return r;
        }
    }

    /*
     * According to virtio_net_reset(), device turns non-unicast mode
     * off by default.
     *
     * Therefore, QEMU should only send this CVQ command if the driver
     * sets non-unicast mode on, different from the device's defaults.
     *
     * Note that the device's defaults can mismatch the driver's
     * configuration only at live migration.
     */
    if (n->nouni) {
        r = vhost_vdpa_net_load_rx_mode(s, out_cursor, in_cursor,
                                        VIRTIO_NET_CTRL_RX_NOUNI, 1);
        if (r < 0) {
            return r;
        }
    }

    /*
     * According to virtio_net_reset(), device turns non-broadcast mode
     * off by default.
     *
     * Therefore, QEMU should only send this CVQ command if the driver
     * sets non-broadcast mode on, different from the device's defaults.
     *
     * Note that the device's defaults can mismatch the driver's
     * configuration only at live migration.
     */
    if (n->nobcast) {
        r = vhost_vdpa_net_load_rx_mode(s, out_cursor, in_cursor,
                                        VIRTIO_NET_CTRL_RX_NOBCAST, 1);
        if (r < 0) {
            return r;
        }
    }

    return 0;
}

static int vhost_vdpa_net_load_single_vlan(VhostVDPAState *s,
                                           const VirtIONet *n,
                                           struct iovec *out_cursor,
                                           struct iovec *in_cursor,
                                           uint16_t vid)
{
    const struct iovec data = {
        .iov_base = &vid,
        .iov_len = sizeof(vid),
    };
    ssize_t r = vhost_vdpa_net_load_cmd(s, out_cursor, in_cursor,
                                        VIRTIO_NET_CTRL_VLAN,
                                        VIRTIO_NET_CTRL_VLAN_ADD,
                                        &data, 1);
    if (unlikely(r < 0)) {
        return r;
    }

    return 0;
}

static int vhost_vdpa_net_load_vlan(VhostVDPAState *s,
                                    const VirtIONet *n,
                                    struct iovec *out_cursor,
                                    struct iovec *in_cursor)
{
    int r;

    if (!virtio_vdev_has_feature(&n->parent_obj, VIRTIO_NET_F_CTRL_VLAN)) {
        return 0;
    }

    for (int i = 0; i < MAX_VLAN >> 5; i++) {
        for (int j = 0; n->vlans[i] && j <= 0x1f; j++) {
            if (n->vlans[i] & (1U << j)) {
                r = vhost_vdpa_net_load_single_vlan(s, n, out_cursor,
                                                    in_cursor, (i << 5) + j);
                if (unlikely(r != 0)) {
                    return r;
                }
            }
        }
    }

    return 0;
}

static int vhost_vdpa_net_cvq_load(NetClientState *nc)
{
    VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
    struct vhost_vdpa *v = &s->vhost_vdpa;
    const VirtIONet *n;
    int r;
    struct iovec out_cursor, in_cursor;

    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);

    vhost_vdpa_set_vring_ready(v, v->dev->vq_index);

    if (v->shadow_vqs_enabled) {
        n = VIRTIO_NET(v->dev->vdev);
        vhost_vdpa_net_load_cursor_reset(s, &out_cursor, &in_cursor);
        r = vhost_vdpa_net_load_mac(s, n, &out_cursor, &in_cursor);
        if (unlikely(r < 0)) {
            return r;
        }
        r = vhost_vdpa_net_load_mq(s, n, &out_cursor, &in_cursor);
        if (unlikely(r)) {
            return r;
        }
        r = vhost_vdpa_net_load_offloads(s, n, &out_cursor, &in_cursor);
        if (unlikely(r)) {
            return r;
        }
        r = vhost_vdpa_net_load_rx(s, n, &out_cursor, &in_cursor);
        if (unlikely(r)) {
            return r;
        }
        r = vhost_vdpa_net_load_vlan(s, n, &out_cursor, &in_cursor);
        if (unlikely(r)) {
            return r;
        }

        /*
         * We need to poll and check all the device's pending used buffers.
         *
         * We can poll here since we've had BQL from the time
         * we sent the descriptor.
         */
        r = vhost_vdpa_net_svq_flush(s, in_cursor.iov_base - (void *)s->status);
        if (unlikely(r)) {
            return r;
        }
    }

    for (int i = 0; i < v->dev->vq_index; ++i) {
        vhost_vdpa_set_vring_ready(v, i);
    }

    return 0;
}

static NetClientInfo net_vhost_vdpa_cvq_info = {
    .type = NET_CLIENT_DRIVER_VHOST_VDPA,
    .size = sizeof(VhostVDPAState),
    .receive = vhost_vdpa_receive,
    .start = vhost_vdpa_net_cvq_start,
    .load = vhost_vdpa_net_cvq_load,
    .stop = vhost_vdpa_net_cvq_stop,
    .cleanup = vhost_vdpa_cleanup,
    .has_vnet_hdr = vhost_vdpa_has_vnet_hdr,
    .has_ufo = vhost_vdpa_has_ufo,
    .check_peer_type = vhost_vdpa_check_peer_type,
    .set_steering_ebpf = vhost_vdpa_set_steering_ebpf,
};

/*
 * Forward the excessive VIRTIO_NET_CTRL_MAC_TABLE_SET CVQ command to
 * the vdpa device.
 *
 * Considering that QEMU cannot send the entire filter table to the
 * vdpa device, it should send the VIRTIO_NET_CTRL_RX_PROMISC CVQ
 * command to enable promiscuous mode to receive all packets,
 * according to VirtIO standard, "Since there are no guarantees,
 * it can use a hash filter or silently switch to allmulti or
 * promiscuous mode if it is given too many addresses.".
 *
 * Since QEMU ignores MAC addresses beyond `MAC_TABLE_ENTRIES` and
 * marks `n->mac_table.x_overflow` accordingly, it should have
 * the same effect on the device model to receive
 * (`MAC_TABLE_ENTRIES` + 1) or more non-multicast MAC addresses.
 * The same applies to multicast MAC addresses.
 *
 * Therefore, QEMU can provide the device model with a fake
 * VIRTIO_NET_CTRL_MAC_TABLE_SET command with (`MAC_TABLE_ENTRIES` + 1)
 * non-multicast MAC addresses and (`MAC_TABLE_ENTRIES` + 1) multicast
 * MAC addresses. This ensures that the device model marks
 * `n->mac_table.uni_overflow` and `n->mac_table.multi_overflow`,
 * allowing all packets to be received, which aligns with the
 * state of the vdpa device.
 */
static int vhost_vdpa_net_excessive_mac_filter_cvq_add(VhostVDPAState *s,
                                                       VirtQueueElement *elem,
                                                       struct iovec *out,
                                                       const struct iovec *in)
{
    struct virtio_net_ctrl_mac mac_data, *mac_ptr;
    struct virtio_net_ctrl_hdr *hdr_ptr;
    uint32_t cursor;
    ssize_t r;
    uint8_t on = 1;

    /* parse the non-multicast MAC address entries from CVQ command */
    cursor = sizeof(*hdr_ptr);
    r = iov_to_buf(elem->out_sg, elem->out_num, cursor,
                   &mac_data, sizeof(mac_data));
    if (unlikely(r != sizeof(mac_data))) {
        /*
         * If the CVQ command is invalid, we should simulate the vdpa device
         * rejecting the VIRTIO_NET_CTRL_MAC_TABLE_SET CVQ command.
         */
        *s->status = VIRTIO_NET_ERR;
        return sizeof(*s->status);
    }
    cursor += sizeof(mac_data) + le32_to_cpu(mac_data.entries) * ETH_ALEN;

    /* parse the multicast MAC address entries from CVQ command */
    r = iov_to_buf(elem->out_sg, elem->out_num, cursor,
                   &mac_data, sizeof(mac_data));
    if (r != sizeof(mac_data)) {
        /*
         * If the CVQ command is invalid, we should simulate the vdpa device
         * rejecting the VIRTIO_NET_CTRL_MAC_TABLE_SET CVQ command.
         */
        *s->status = VIRTIO_NET_ERR;
        return sizeof(*s->status);
    }
    cursor += sizeof(mac_data) + le32_to_cpu(mac_data.entries) * ETH_ALEN;

    /* validate the CVQ command */
    if (iov_size(elem->out_sg, elem->out_num) != cursor) {
        /*
         * If the CVQ command is invalid, we should simulate the vdpa device
         * rejecting the VIRTIO_NET_CTRL_MAC_TABLE_SET CVQ command.
         */
        *s->status = VIRTIO_NET_ERR;
        return sizeof(*s->status);
    }

    /*
     * According to VirtIO standard, "Since there are no guarantees,
     * it can use a hash filter or silently switch to allmulti or
     * promiscuous mode if it is given too many addresses.".
     *
     * Therefore, considering that QEMU is unable to send the entire
     * filter table to the vdpa device, it should send the
     * VIRTIO_NET_CTRL_RX_PROMISC CVQ command to enable promiscuous mode.
     */
    hdr_ptr = out->iov_base;
    out->iov_len = sizeof(*hdr_ptr) + sizeof(on);

    hdr_ptr->class = VIRTIO_NET_CTRL_RX;
    hdr_ptr->cmd = VIRTIO_NET_CTRL_RX_PROMISC;
    iov_from_buf(out, 1, sizeof(*hdr_ptr), &on, sizeof(on));
    r = vhost_vdpa_net_cvq_add(s, out, 1, in, 1);
    if (unlikely(r < 0)) {
        return r;
    }

    /*
     * We can poll here since we've had BQL from the time
     * we sent the descriptor.
     */
    r = vhost_vdpa_net_svq_poll(s, 1);
    if (unlikely(r < sizeof(*s->status))) {
        return r;
    }
    if (*s->status != VIRTIO_NET_OK) {
        return sizeof(*s->status);
    }

    /*
     * QEMU should also send a fake VIRTIO_NET_CTRL_MAC_TABLE_SET CVQ
     * command to the device model, including (`MAC_TABLE_ENTRIES` + 1)
     * non-multicast MAC addresses and (`MAC_TABLE_ENTRIES` + 1)
     * multicast MAC addresses.
     *
     * By doing so, the device model can mark `n->mac_table.uni_overflow`
     * and `n->mac_table.multi_overflow`, enabling all packets to be
     * received, which aligns with the state of the vdpa device.
     */
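    /*
     * Rebuild the out buffer in place with the fake command; the assert
     * below double-checks that it still fits in the page-sized shadow
     * out buffer.
     */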
    cursor = 0;
    uint32_t fake_uni_entries = MAC_TABLE_ENTRIES + 1,
             fake_mul_entries = MAC_TABLE_ENTRIES + 1,
             fake_cvq_size = sizeof(struct virtio_net_ctrl_hdr) +
                             sizeof(mac_data) + fake_uni_entries * ETH_ALEN +
                             sizeof(mac_data) + fake_mul_entries * ETH_ALEN;

    assert(fake_cvq_size < vhost_vdpa_net_cvq_cmd_page_len());
    out->iov_len = fake_cvq_size;

    /* pack the header for fake CVQ command */
    hdr_ptr = out->iov_base + cursor;
    hdr_ptr->class = VIRTIO_NET_CTRL_MAC;
    hdr_ptr->cmd = VIRTIO_NET_CTRL_MAC_TABLE_SET;
    cursor += sizeof(*hdr_ptr);

    /*
     * Pack the non-multicast MAC addresses part for fake CVQ command.
     *
     * According to virtio_net_handle_mac(), QEMU doesn't verify the MAC
     * addresses provided in CVQ command. Therefore, only the entries
     * field needs to be prepared in the CVQ command.
     */
    mac_ptr = out->iov_base + cursor;
    mac_ptr->entries = cpu_to_le32(fake_uni_entries);
    cursor += sizeof(*mac_ptr) + fake_uni_entries * ETH_ALEN;

    /*
     * Pack the multicast MAC addresses part for fake CVQ command.
     *
     * According to virtio_net_handle_mac(), QEMU doesn't verify the MAC
     * addresses provided in CVQ command. Therefore, only the entries
     * field needs to be prepared in the CVQ command.
     */
    mac_ptr = out->iov_base + cursor;
    mac_ptr->entries = cpu_to_le32(fake_mul_entries);

    /*
     * Simulate QEMU polling a used buffer from the vdpa device for the
     * VIRTIO_NET_CTRL_MAC_TABLE_SET CVQ command.
     */
    return sizeof(*s->status);
}

/**
 * Validate and copy control virtqueue commands.
 *
 * Following QEMU guidelines, we offer a copy of the buffers to the device to
 * prevent TOCTOU bugs.
 */
static int vhost_vdpa_net_handle_ctrl_avail(VhostShadowVirtqueue *svq,
                                            VirtQueueElement *elem,
                                            void *opaque)
{
    VhostVDPAState *s = opaque;
    size_t in_len;
    const struct virtio_net_ctrl_hdr *ctrl;
    virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
    /* Out buffer sent to both the vdpa device and the device model */
    struct iovec out = {
        .iov_base = s->cvq_cmd_out_buffer,
    };
    /* in buffer used for device model */
    const struct iovec model_in = {
        .iov_base = &status,
        .iov_len = sizeof(status),
    };
    /* in buffer used for vdpa device */
    const struct iovec vdpa_in = {
        .iov_base = s->status,
        .iov_len = sizeof(*s->status),
    };
    ssize_t dev_written = -EINVAL;

    out.iov_len = iov_to_buf(elem->out_sg, elem->out_num, 0,
                             s->cvq_cmd_out_buffer,
                             vhost_vdpa_net_cvq_cmd_page_len());

    ctrl = s->cvq_cmd_out_buffer;
    if (ctrl->class == VIRTIO_NET_CTRL_ANNOUNCE) {
        /*
         * Guest announce capability is emulated by qemu, so don't forward to
         * the device.
         */
        dev_written = sizeof(status);
        *s->status = VIRTIO_NET_OK;
    } else if (unlikely(ctrl->class == VIRTIO_NET_CTRL_MAC &&
                        ctrl->cmd == VIRTIO_NET_CTRL_MAC_TABLE_SET &&
                        iov_size(elem->out_sg, elem->out_num) > out.iov_len)) {
        /*
         * Due to the size limitation of the out buffer sent to the vdpa device,
         * which is determined by vhost_vdpa_net_cvq_cmd_page_len(), excessive
         * MAC addresses set by the driver for the filter table can cause
         * truncation of the CVQ command in QEMU. As a result, the vdpa device
         * rejects the flawed CVQ command.
         *
         * Therefore, QEMU must handle this situation instead of sending
         * the CVQ command directly.
         */
        dev_written = vhost_vdpa_net_excessive_mac_filter_cvq_add(s, elem,
                                                                  &out, &vdpa_in);
        if (unlikely(dev_written < 0)) {
            goto out;
        }
    } else {
        ssize_t r;
        r = vhost_vdpa_net_cvq_add(s, &out, 1, &vdpa_in, 1);
        if (unlikely(r < 0)) {
            dev_written = r;
            goto out;
        }

        /*
         * We can poll here since we've had BQL from the time
         * we sent the descriptor.
         */
        dev_written = vhost_vdpa_net_svq_poll(s, 1);
    }

    if (unlikely(dev_written < sizeof(status))) {
        error_report("Insufficient written data (%zu)", dev_written);
        goto out;
    }

    if (*s->status != VIRTIO_NET_OK) {
        goto out;
    }

    status = VIRTIO_NET_ERR;
    virtio_net_handle_ctrl_iov(svq->vdev, &model_in, 1, &out, 1);
    if (status != VIRTIO_NET_OK) {
        error_report("Bad CVQ processing in model");
    }

out:
    in_len = iov_from_buf(elem->in_sg, elem->in_num, 0, &status,
                          sizeof(status));
    if (unlikely(in_len < sizeof(status))) {
        error_report("Bad device CVQ written length");
    }
    vhost_svq_push_elem(svq, elem, MIN(in_len, sizeof(status)));
    /*
     * `elem` belongs to vhost_vdpa_net_handle_ctrl_avail() only when
     * the function successfully forwards the CVQ command, indicated
     * by a non-negative value of `dev_written`. Otherwise, it still
     * belongs to SVQ.
     * This function should only free `elem` when it owns it.
     */
    if (dev_written >= 0) {
        g_free(elem);
    }
    return dev_written < 0 ? dev_written : 0;
}

static const VhostShadowVirtqueueOps vhost_vdpa_net_svq_ops = {
    .avail_handler = vhost_vdpa_net_handle_ctrl_avail,
};

/**
 * Probe if CVQ is isolated
 *
 * @device_fd   The vdpa device fd
 * @features    Features offered by the device.
 * @cvq_index   The control vq pair index
 *
 * Returns <0 in case of failure, 0 if false and 1 if true.
 */
static int vhost_vdpa_probe_cvq_isolation(int device_fd, uint64_t features,
                                          int cvq_index, Error **errp)
{
    uint64_t backend_features;
    int64_t cvq_group;
    uint8_t status = VIRTIO_CONFIG_S_ACKNOWLEDGE |
                     VIRTIO_CONFIG_S_DRIVER;
    int r;

    ERRP_GUARD();

    r = ioctl(device_fd, VHOST_GET_BACKEND_FEATURES, &backend_features);
    if (unlikely(r < 0)) {
        error_setg_errno(errp, errno, "Cannot get vdpa backend_features");
        return r;
    }

    if (!(backend_features & BIT_ULL(VHOST_BACKEND_F_IOTLB_ASID))) {
        return 0;
    }

    r = ioctl(device_fd, VHOST_VDPA_SET_STATUS, &status);
    if (unlikely(r)) {
        error_setg_errno(errp, -r, "Cannot set device status");
        goto out;
    }

    r = ioctl(device_fd, VHOST_SET_FEATURES, &features);
    if (unlikely(r)) {
        error_setg_errno(errp, -r, "Cannot set features");
        goto out;
    }

    status |= VIRTIO_CONFIG_S_FEATURES_OK;
    r = ioctl(device_fd, VHOST_VDPA_SET_STATUS, &status);
    if (unlikely(r)) {
        error_setg_errno(errp, -r, "Cannot set device status");
        goto out;
    }

    cvq_group = vhost_vdpa_get_vring_group(device_fd, cvq_index, errp);
    if (unlikely(cvq_group < 0)) {
        if (cvq_group != -ENOTSUP) {
            r = cvq_group;
            goto out;
        }

        /*
         * The kernel reports VHOST_BACKEND_F_IOTLB_ASID if the vdpa frontend
         * supports ASID even if the parent driver does not. The CVQ cannot be
         * isolated in this case.
         */
        error_free(*errp);
        *errp = NULL;
        r = 0;
        goto out;
    }

    for (int i = 0; i < cvq_index; ++i) {
        int64_t group = vhost_vdpa_get_vring_group(device_fd, i, errp);
        if (unlikely(group < 0)) {
            r = group;
            goto out;
        }

        if (group == (int64_t)cvq_group) {
            r = 0;
            goto out;
        }
    }

    r = 1;

out:
    status = 0;
    ioctl(device_fd, VHOST_VDPA_SET_STATUS, &status);
    return r;
}

static NetClientState *net_vhost_vdpa_init(NetClientState *peer,
                                           const char *device,
                                           const char *name,
                                           int vdpa_device_fd,
                                           int queue_pair_index,
                                           int nvqs,
                                           bool is_datapath,
                                           bool svq,
                                           struct vhost_vdpa_iova_range iova_range,
                                           uint64_t features,
                                           VhostVDPAShared *shared,
                                           Error **errp)
{
    NetClientState *nc = NULL;
    VhostVDPAState *s;
    int ret = 0;
    assert(name);
    int cvq_isolated = 0;

    if (is_datapath) {
        nc = qemu_new_net_client(&net_vhost_vdpa_info, peer, device,
                                 name);
    } else {
        cvq_isolated = vhost_vdpa_probe_cvq_isolation(vdpa_device_fd, features,
                                                      queue_pair_index * 2,
                                                      errp);
        if (unlikely(cvq_isolated < 0)) {
            return NULL;
        }

        nc = qemu_new_net_control_client(&net_vhost_vdpa_cvq_info, peer,
                                         device, name);
    }
    qemu_set_info_str(nc, TYPE_VHOST_VDPA);
    s = DO_UPCAST(VhostVDPAState, nc, nc);

    s->vhost_vdpa.index = queue_pair_index;
    s->always_svq = svq;
    s->migration_state.notify = NULL;
    s->vhost_vdpa.shadow_vqs_enabled = svq;
    if (queue_pair_index == 0) {
        vhost_vdpa_net_valid_svq_features(features,
                                          &s->vhost_vdpa.migration_blocker);
        s->vhost_vdpa.shared = g_new0(VhostVDPAShared, 1);
        s->vhost_vdpa.shared->device_fd = vdpa_device_fd;
        s->vhost_vdpa.shared->iova_range = iova_range;
        s->vhost_vdpa.shared->shadow_data = svq;
    } else if (!is_datapath) {
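        /*
         * Back the CVQ shadow buffers with dedicated, page-sized anonymous
         * mappings so that only these pages are DMA-mapped into the device,
         * not adjacent QEMU memory.
         */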
        s->cvq_cmd_out_buffer = mmap(NULL, vhost_vdpa_net_cvq_cmd_page_len(),
                                     PROT_READ | PROT_WRITE,
                                     MAP_SHARED | MAP_ANONYMOUS, -1, 0);
        s->status = mmap(NULL, vhost_vdpa_net_cvq_cmd_page_len(),
                         PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS,
                         -1, 0);

        s->vhost_vdpa.shadow_vq_ops = &vhost_vdpa_net_svq_ops;
        s->vhost_vdpa.shadow_vq_ops_opaque = s;
        s->cvq_isolated = cvq_isolated;
    }
    if (queue_pair_index != 0) {
        s->vhost_vdpa.shared = shared;
    }

    ret = vhost_vdpa_add(nc, (void *)&s->vhost_vdpa, queue_pair_index, nvqs);
    if (ret) {
        qemu_del_net_client(nc);
        return NULL;
    }

    return nc;
}

static int vhost_vdpa_get_features(int fd, uint64_t *features, Error **errp)
{
    int ret = ioctl(fd, VHOST_GET_FEATURES, features);
    if (unlikely(ret < 0)) {
        error_setg_errno(errp, errno,
                         "Fail to query features from vhost-vDPA device");
    }
    return ret;
}

static int vhost_vdpa_get_max_queue_pairs(int fd, uint64_t features,
                                          int *has_cvq, Error **errp)
{
    unsigned long config_size = offsetof(struct vhost_vdpa_config, buf);
    g_autofree struct vhost_vdpa_config *config = NULL;
    __virtio16 *max_queue_pairs;
    int ret;

    if (features & (1 << VIRTIO_NET_F_CTRL_VQ)) {
        *has_cvq = 1;
    } else {
        *has_cvq = 0;
    }

    if (features & (1 << VIRTIO_NET_F_MQ)) {
        config = g_malloc0(config_size + sizeof(*max_queue_pairs));
        config->off = offsetof(struct virtio_net_config, max_virtqueue_pairs);
        config->len = sizeof(*max_queue_pairs);

        ret = ioctl(fd, VHOST_VDPA_GET_CONFIG, config);
        if (ret) {
            error_setg(errp, "Fail to get config from vhost-vDPA device");
            return -ret;
        }

        max_queue_pairs = (__virtio16 *)&config->buf;

        return lduw_le_p(max_queue_pairs);
    }

    return 1;
}

int net_init_vhost_vdpa(const Netdev *netdev, const char *name,
                        NetClientState *peer, Error **errp)
{
    const NetdevVhostVDPAOptions *opts;
    uint64_t features;
    int vdpa_device_fd;
    g_autofree NetClientState **ncs = NULL;
    struct vhost_vdpa_iova_range iova_range;
    NetClientState *nc;
    int queue_pairs, r, i = 0, has_cvq = 0;

    assert(netdev->type == NET_CLIENT_DRIVER_VHOST_VDPA);
    opts = &netdev->u.vhost_vdpa;
    if (!opts->vhostdev && !opts->vhostfd) {
        error_setg(errp,
                   "vhost-vdpa: neither vhostdev= nor vhostfd= was specified");
        return -1;
    }

    if (opts->vhostdev && opts->vhostfd) {
        error_setg(errp,
                   "vhost-vdpa: vhostdev= and vhostfd= are mutually exclusive");
        return -1;
    }

    if (opts->vhostdev) {
        vdpa_device_fd = qemu_open(opts->vhostdev, O_RDWR, errp);
        if (vdpa_device_fd == -1) {
            return -errno;
        }
    } else {
        /* has_vhostfd */
        vdpa_device_fd = monitor_fd_param(monitor_cur(), opts->vhostfd, errp);
        if (vdpa_device_fd == -1) {
            error_prepend(errp, "vhost-vdpa: unable to parse vhostfd: ");
            return -1;
        }
    }

    r = vhost_vdpa_get_features(vdpa_device_fd, &features, errp);
    if (unlikely(r < 0)) {
        goto err;
    }

    queue_pairs = vhost_vdpa_get_max_queue_pairs(vdpa_device_fd, features,
                                                 &has_cvq, errp);
    if (queue_pairs < 0) {
        qemu_close(vdpa_device_fd);
        return queue_pairs;
    }

    r = vhost_vdpa_get_iova_range(vdpa_device_fd, &iova_range);
    if (unlikely(r < 0)) {
        error_setg(errp, "vhost-vdpa: get iova range failed: %s",
range failed: %s", 1804 strerror(-r)); 1805 goto err; 1806 } 1807 1808 if (opts->x_svq && !vhost_vdpa_net_valid_svq_features(features, errp)) { 1809 goto err; 1810 } 1811 1812 ncs = g_malloc0(sizeof(*ncs) * queue_pairs); 1813 1814 for (i = 0; i < queue_pairs; i++) { 1815 VhostVDPAShared *shared = NULL; 1816 1817 if (i) { 1818 shared = DO_UPCAST(VhostVDPAState, nc, ncs[0])->vhost_vdpa.shared; 1819 } 1820 ncs[i] = net_vhost_vdpa_init(peer, TYPE_VHOST_VDPA, name, 1821 vdpa_device_fd, i, 2, true, opts->x_svq, 1822 iova_range, features, shared, errp); 1823 if (!ncs[i]) 1824 goto err; 1825 } 1826 1827 if (has_cvq) { 1828 VhostVDPAState *s0 = DO_UPCAST(VhostVDPAState, nc, ncs[0]); 1829 VhostVDPAShared *shared = s0->vhost_vdpa.shared; 1830 1831 nc = net_vhost_vdpa_init(peer, TYPE_VHOST_VDPA, name, 1832 vdpa_device_fd, i, 1, false, 1833 opts->x_svq, iova_range, features, shared, 1834 errp); 1835 if (!nc) 1836 goto err; 1837 } 1838 1839 return 0; 1840 1841 err: 1842 if (i) { 1843 for (i--; i >= 0; i--) { 1844 qemu_del_net_client(ncs[i]); 1845 } 1846 } 1847 1848 qemu_close(vdpa_device_fd); 1849 1850 return -1; 1851 } 1852