/*
 * vhost-vdpa.c
 *
 * Copyright(c) 2017-2018 Intel Corporation.
 * Copyright(c) 2020 Red Hat, Inc.
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include "qemu/osdep.h"
#include "clients.h"
#include "hw/virtio/virtio-net.h"
#include "net/vhost_net.h"
#include "net/vhost-vdpa.h"
#include "hw/virtio/vhost-vdpa.h"
#include "qemu/config-file.h"
#include "qemu/error-report.h"
#include "qemu/log.h"
#include "qemu/memalign.h"
#include "qemu/option.h"
#include "qapi/error.h"
#include <linux/vhost.h>
#include <sys/ioctl.h>
#include <err.h>
#include "standard-headers/linux/virtio_net.h"
#include "monitor/monitor.h"
#include "migration/migration.h"
#include "migration/misc.h"
#include "hw/virtio/vhost.h"

/* TODO: need to add multiqueue support here */
typedef struct VhostVDPAState {
    NetClientState nc;
    struct vhost_vdpa vhost_vdpa;
    Notifier migration_state;
    VHostNetState *vhost_net;

    /* Control commands shadow buffers */
    void *cvq_cmd_out_buffer;
    virtio_net_ctrl_ack *status;

    /* The device always has SVQ enabled */
    bool always_svq;
    bool started;
} VhostVDPAState;

const int vdpa_feature_bits[] = {
    VIRTIO_F_NOTIFY_ON_EMPTY,
    VIRTIO_RING_F_INDIRECT_DESC,
    VIRTIO_RING_F_EVENT_IDX,
    VIRTIO_F_ANY_LAYOUT,
    VIRTIO_F_VERSION_1,
    VIRTIO_NET_F_CSUM,
    VIRTIO_NET_F_GUEST_CSUM,
    VIRTIO_NET_F_GSO,
    VIRTIO_NET_F_GUEST_TSO4,
    VIRTIO_NET_F_GUEST_TSO6,
    VIRTIO_NET_F_GUEST_ECN,
    VIRTIO_NET_F_GUEST_UFO,
    VIRTIO_NET_F_HOST_TSO4,
    VIRTIO_NET_F_HOST_TSO6,
    VIRTIO_NET_F_HOST_ECN,
    VIRTIO_NET_F_HOST_UFO,
    VIRTIO_NET_F_MRG_RXBUF,
    VIRTIO_NET_F_MTU,
    VIRTIO_NET_F_CTRL_RX,
    VIRTIO_NET_F_CTRL_RX_EXTRA,
    VIRTIO_NET_F_CTRL_VLAN,
    VIRTIO_NET_F_CTRL_MAC_ADDR,
    VIRTIO_NET_F_RSS,
    VIRTIO_NET_F_MQ,
    VIRTIO_NET_F_CTRL_VQ,
    VIRTIO_F_IOMMU_PLATFORM,
    VIRTIO_F_RING_PACKED,
    VIRTIO_F_RING_RESET,
    VIRTIO_NET_F_HASH_REPORT,
    VIRTIO_NET_F_STATUS,
    VHOST_INVALID_FEATURE_BIT
};

/** Supported device specific feature bits with SVQ */
static const uint64_t vdpa_svq_device_features =
    BIT_ULL(VIRTIO_NET_F_CSUM) |
    BIT_ULL(VIRTIO_NET_F_GUEST_CSUM) |
    BIT_ULL(VIRTIO_NET_F_MTU) |
    BIT_ULL(VIRTIO_NET_F_MAC) |
    BIT_ULL(VIRTIO_NET_F_GUEST_TSO4) |
    BIT_ULL(VIRTIO_NET_F_GUEST_TSO6) |
    BIT_ULL(VIRTIO_NET_F_GUEST_ECN) |
    BIT_ULL(VIRTIO_NET_F_GUEST_UFO) |
    BIT_ULL(VIRTIO_NET_F_HOST_TSO4) |
    BIT_ULL(VIRTIO_NET_F_HOST_TSO6) |
    BIT_ULL(VIRTIO_NET_F_HOST_ECN) |
    BIT_ULL(VIRTIO_NET_F_HOST_UFO) |
    BIT_ULL(VIRTIO_NET_F_MRG_RXBUF) |
    BIT_ULL(VIRTIO_NET_F_STATUS) |
    BIT_ULL(VIRTIO_NET_F_CTRL_VQ) |
    BIT_ULL(VIRTIO_NET_F_MQ) |
    BIT_ULL(VIRTIO_F_ANY_LAYOUT) |
    BIT_ULL(VIRTIO_NET_F_CTRL_MAC_ADDR) |
    /* VHOST_F_LOG_ALL is exposed by SVQ */
    BIT_ULL(VHOST_F_LOG_ALL) |
    BIT_ULL(VIRTIO_NET_F_RSC_EXT) |
    BIT_ULL(VIRTIO_NET_F_STANDBY) |
    BIT_ULL(VIRTIO_NET_F_SPEED_DUPLEX);

#define VHOST_VDPA_NET_CVQ_ASID 1

VHostNetState *vhost_vdpa_get_vhost_net(NetClientState *nc)
{
    VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
    return s->vhost_net;
}
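
/*
 * Check that a feature set can be emulated with SVQ: transport feature bits
 * are always acceptable (SVQ emulates them in QEMU), while device-specific
 * bits must be covered by vdpa_svq_device_features.  The generic SVQ feature
 * check is then applied on top.
 */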
static bool vhost_vdpa_net_valid_svq_features(uint64_t features, Error **errp)
{
    uint64_t invalid_dev_features =
        features & ~vdpa_svq_device_features &
        /* Transport features are all accepted at this point */
        ~MAKE_64BIT_MASK(VIRTIO_TRANSPORT_F_START,
                         VIRTIO_TRANSPORT_F_END - VIRTIO_TRANSPORT_F_START);

    if (invalid_dev_features) {
        error_setg(errp, "vdpa svq does not work with features 0x%" PRIx64,
                   invalid_dev_features);
        return false;
    }

    return vhost_svq_valid_features(features, errp);
}

static int vhost_vdpa_net_check_device_id(struct vhost_net *net)
{
    uint32_t device_id;
    int ret;
    struct vhost_dev *hdev;

    hdev = (struct vhost_dev *)&net->dev;
    ret = hdev->vhost_ops->vhost_get_device_id(hdev, &device_id);
    if (device_id != VIRTIO_ID_NET) {
        return -ENOTSUP;
    }
    return ret;
}

static int vhost_vdpa_add(NetClientState *ncs, void *be,
                          int queue_pair_index, int nvqs)
{
    VhostNetOptions options;
    struct vhost_net *net = NULL;
    VhostVDPAState *s;
    int ret;

    options.backend_type = VHOST_BACKEND_TYPE_VDPA;
    assert(ncs->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
    s = DO_UPCAST(VhostVDPAState, nc, ncs);
    options.net_backend = ncs;
    options.opaque = be;
    options.busyloop_timeout = 0;
    options.nvqs = nvqs;

    net = vhost_net_init(&options);
    if (!net) {
        error_report("failed to init vhost_net for queue");
        goto err_init;
    }
    s->vhost_net = net;
    ret = vhost_vdpa_net_check_device_id(net);
    if (ret) {
        goto err_check;
    }
    return 0;
err_check:
    vhost_net_cleanup(net);
    g_free(net);
err_init:
    return -1;
}

static void vhost_vdpa_cleanup(NetClientState *nc)
{
    VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);

    qemu_vfree(s->cvq_cmd_out_buffer);
    qemu_vfree(s->status);
    if (s->vhost_net) {
        vhost_net_cleanup(s->vhost_net);
        g_free(s->vhost_net);
        s->vhost_net = NULL;
    }
    if (s->vhost_vdpa.device_fd >= 0) {
        qemu_close(s->vhost_vdpa.device_fd);
        s->vhost_vdpa.device_fd = -1;
    }
}

static bool vhost_vdpa_has_vnet_hdr(NetClientState *nc)
{
    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);

    return true;
}

static bool vhost_vdpa_has_ufo(NetClientState *nc)
{
    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
    VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
    uint64_t features = 0;
    features |= (1ULL << VIRTIO_NET_F_HOST_UFO);
    features = vhost_net_get_features(s->vhost_net, features);
    return !!(features & (1ULL << VIRTIO_NET_F_HOST_UFO));
}

static bool vhost_vdpa_check_peer_type(NetClientState *nc, ObjectClass *oc,
                                       Error **errp)
{
    const char *driver = object_class_get_name(oc);

    if (!g_str_has_prefix(driver, "virtio-net-")) {
        error_setg(errp, "vhost-vdpa requires frontend driver virtio-net-*");
        return false;
    }

    return true;
}

/** Dummy receive in case qemu falls back to userland tap networking */
static ssize_t vhost_vdpa_receive(NetClientState *nc, const uint8_t *buf,
                                  size_t size)
{
    return size;
}

/** From any vdpa net client, get the netclient of the first queue pair */
static VhostVDPAState *vhost_vdpa_net_first_nc_vdpa(VhostVDPAState *s)
{
    NICState *nic = qemu_get_nic(s->nc.peer);
    NetClientState *nc0 = qemu_get_peer(nic->ncs, 0);

    return DO_UPCAST(VhostVDPAState, nc, nc0);
}
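
/*
 * Toggle SVQ mode for the data virtqueues when the migration state changes.
 * Dirty page tracking is emulated through shadow virtqueues (which expose
 * VHOST_F_LOG_ALL on behalf of the device), so the device is stopped and
 * restarted here to let vhost_net_start() pick up the new
 * shadow_vqs_enabled value.
 */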
static void vhost_vdpa_net_log_global_enable(VhostVDPAState *s, bool enable)
{
    struct vhost_vdpa *v = &s->vhost_vdpa;
    VirtIONet *n;
    VirtIODevice *vdev;
    int data_queue_pairs, cvq, r;

    /* We are only called on the first data vq and only if x-svq is not set */
    if (s->vhost_vdpa.shadow_vqs_enabled == enable) {
        return;
    }

    vdev = v->dev->vdev;
    n = VIRTIO_NET(vdev);
    if (!n->vhost_started) {
        return;
    }

    data_queue_pairs = n->multiqueue ? n->max_queue_pairs : 1;
    cvq = virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ) ?
                                  n->max_ncs - n->max_queue_pairs : 0;
    /*
     * TODO: vhost_net_stop does suspend, get_base and reset. We can be smarter
     * in the future and resume the device if read-only operations between
     * suspend and reset go wrong.
     */
    vhost_net_stop(vdev, n->nic->ncs, data_queue_pairs, cvq);

    /* Start will check migration setup_or_active to enable or not SVQ */
    r = vhost_net_start(vdev, n->nic->ncs, data_queue_pairs, cvq);
    if (unlikely(r < 0)) {
        error_report("unable to start vhost net: %s(%d)", g_strerror(-r), -r);
    }
}

static void vdpa_net_migration_state_notifier(Notifier *notifier, void *data)
{
    MigrationState *migration = data;
    VhostVDPAState *s = container_of(notifier, VhostVDPAState,
                                     migration_state);

    if (migration_in_setup(migration)) {
        vhost_vdpa_net_log_global_enable(s, true);
    } else if (migration_has_failed(migration)) {
        vhost_vdpa_net_log_global_enable(s, false);
    }
}

static void vhost_vdpa_net_data_start_first(VhostVDPAState *s)
{
    struct vhost_vdpa *v = &s->vhost_vdpa;

    add_migration_state_change_notifier(&s->migration_state);
    if (v->shadow_vqs_enabled) {
        v->iova_tree = vhost_iova_tree_new(v->iova_range.first,
                                           v->iova_range.last);
    }
}

static int vhost_vdpa_net_data_start(NetClientState *nc)
{
    VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
    struct vhost_vdpa *v = &s->vhost_vdpa;

    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);

    if (s->always_svq ||
        migration_is_setup_or_active(migrate_get_current()->state)) {
        v->shadow_vqs_enabled = true;
        v->shadow_data = true;
    } else {
        v->shadow_vqs_enabled = false;
        v->shadow_data = false;
    }

    if (v->index == 0) {
        vhost_vdpa_net_data_start_first(s);
        return 0;
    }

    if (v->shadow_vqs_enabled) {
        VhostVDPAState *s0 = vhost_vdpa_net_first_nc_vdpa(s);
        v->iova_tree = s0->vhost_vdpa.iova_tree;
    }

    return 0;
}

static void vhost_vdpa_net_client_stop(NetClientState *nc)
{
    VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
    struct vhost_dev *dev;

    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);

    if (s->vhost_vdpa.index == 0) {
        remove_migration_state_change_notifier(&s->migration_state);
    }

    dev = s->vhost_vdpa.dev;
    if (dev->vq_index + dev->nvqs == dev->vq_index_end) {
        g_clear_pointer(&s->vhost_vdpa.iova_tree, vhost_iova_tree_delete);
    }
}

static NetClientInfo net_vhost_vdpa_info = {
    .type = NET_CLIENT_DRIVER_VHOST_VDPA,
    .size = sizeof(VhostVDPAState),
    .receive = vhost_vdpa_receive,
    .start = vhost_vdpa_net_data_start,
    .stop = vhost_vdpa_net_client_stop,
    .cleanup = vhost_vdpa_cleanup,
    .has_vnet_hdr = vhost_vdpa_has_vnet_hdr,
    .has_ufo = vhost_vdpa_has_ufo,
    .check_peer_type = vhost_vdpa_check_peer_type,
};
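
/*
 * The vdpa backend groups virtqueues, and each group can be bound to an
 * independent address space (ASID).  The helpers below query the group of a
 * virtqueue and assign a group to an ASID; vhost_vdpa_net_cvq_start() uses
 * them to move the control virtqueue to VHOST_VDPA_NET_CVQ_ASID, so CVQ
 * buffers can be shadowed without touching the guest's data-path mappings.
 */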
/**
 * Get vring virtqueue group
 *
 * @device_fd  vdpa device fd
 * @vq_index   Virtqueue index
 *
 * Return -errno in case of error, or vq group if success.
 */
static int64_t vhost_vdpa_get_vring_group(int device_fd, unsigned vq_index)
{
    struct vhost_vring_state state = {
        .index = vq_index,
    };
    int r = ioctl(device_fd, VHOST_VDPA_GET_VRING_GROUP, &state);

    if (unlikely(r < 0)) {
        r = -errno;
        error_report("Cannot get VQ %u group: %s", vq_index,
                     g_strerror(errno));
        return r;
    }

    return state.num;
}

static int vhost_vdpa_set_address_space_id(struct vhost_vdpa *v,
                                           unsigned vq_group,
                                           unsigned asid_num)
{
    struct vhost_vring_state asid = {
        .index = vq_group,
        .num = asid_num,
    };
    int r;

    r = ioctl(v->device_fd, VHOST_VDPA_SET_GROUP_ASID, &asid);
    if (unlikely(r < 0)) {
        error_report("Can't set vq group %u asid %u, errno=%d (%s)",
                     asid.index, asid.num, errno, g_strerror(errno));
    }
    return r;
}

static void vhost_vdpa_cvq_unmap_buf(struct vhost_vdpa *v, void *addr)
{
    VhostIOVATree *tree = v->iova_tree;
    DMAMap needle = {
        /*
         * No need to specify size or to look for more translations since
         * this contiguous chunk was allocated by us.
         */
        .translated_addr = (hwaddr)(uintptr_t)addr,
    };
    const DMAMap *map = vhost_iova_tree_find_iova(tree, &needle);
    int r;

    if (unlikely(!map)) {
        error_report("Cannot locate expected map");
        return;
    }

    r = vhost_vdpa_dma_unmap(v, v->address_space_id, map->iova, map->size + 1);
    if (unlikely(r != 0)) {
        error_report("Device cannot unmap: %s(%d)", g_strerror(r), r);
    }

    vhost_iova_tree_remove(tree, *map);
}

static size_t vhost_vdpa_net_cvq_cmd_len(void)
{
    /*
     * MAC_TABLE_SET is the ctrl command that produces the longest out buffer.
     * In buffer is always 1 byte, so it should fit here.
     */
    return sizeof(struct virtio_net_ctrl_hdr) +
           2 * sizeof(struct virtio_net_ctrl_mac) +
           MAC_TABLE_ENTRIES * ETH_ALEN;
}

static size_t vhost_vdpa_net_cvq_cmd_page_len(void)
{
    return ROUND_UP(vhost_vdpa_net_cvq_cmd_len(), qemu_real_host_page_size());
}
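
/*
 * CVQ command buffers are bounced through QEMU memory: the mapping helper
 * below first reserves an IOVA range in the IOVA tree and then DMA-maps the
 * host buffer into the CVQ address space, rolling the reservation back on
 * failure.  vhost_vdpa_cvq_unmap_buf() above undoes both steps.
 */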
/** Map CVQ buffer. */
static int vhost_vdpa_cvq_map_buf(struct vhost_vdpa *v, void *buf, size_t size,
                                  bool write)
{
    DMAMap map = {};
    int r;

    map.translated_addr = (hwaddr)(uintptr_t)buf;
    map.size = size - 1;
    map.perm = write ? IOMMU_RW : IOMMU_RO;
    r = vhost_iova_tree_map_alloc(v->iova_tree, &map);
    if (unlikely(r != IOVA_OK)) {
        error_report("Cannot map injected element");
        return r;
    }

    r = vhost_vdpa_dma_map(v, v->address_space_id, map.iova,
                           vhost_vdpa_net_cvq_cmd_page_len(), buf, !write);
    if (unlikely(r < 0)) {
        goto dma_map_err;
    }

    return 0;

dma_map_err:
    vhost_iova_tree_remove(v->iova_tree, map);
    return r;
}
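
/*
 * Decide at start time whether the control virtqueue can be shadowed.  SVQ
 * for CVQ is only enabled when the backend supports per-group ASIDs
 * (VHOST_BACKEND_F_IOTLB_ASID), the device features are valid for SVQ, and
 * the CVQ sits in a vq group of its own, so it can be moved to
 * VHOST_VDPA_NET_CVQ_ASID without affecting the data virtqueues.
 */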
static int vhost_vdpa_net_cvq_start(NetClientState *nc)
{
    VhostVDPAState *s, *s0;
    struct vhost_vdpa *v;
    uint64_t backend_features;
    int64_t cvq_group;
    int cvq_index, r;

    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);

    s = DO_UPCAST(VhostVDPAState, nc, nc);
    v = &s->vhost_vdpa;

    s0 = vhost_vdpa_net_first_nc_vdpa(s);
    v->shadow_data = s0->vhost_vdpa.shadow_vqs_enabled;
    v->shadow_vqs_enabled = s->always_svq;
    s->vhost_vdpa.address_space_id = VHOST_VDPA_GUEST_PA_ASID;

    if (s->vhost_vdpa.shadow_data) {
        /* SVQ is already configured for all virtqueues */
        goto out;
    }

    /*
     * If we early return in these cases, SVQ will not be enabled. The
     * migration will be blocked as long as vhost-vdpa backends do not offer
     * _F_LOG.
     *
     * Call VHOST_GET_BACKEND_FEATURES directly, as the backend features are
     * not available in v->dev yet.
     */
    r = ioctl(v->device_fd, VHOST_GET_BACKEND_FEATURES, &backend_features);
    if (unlikely(r < 0)) {
        error_report("Cannot get vdpa backend_features: %s(%d)",
                     g_strerror(errno), errno);
        return -1;
    }
    if (!(backend_features & BIT_ULL(VHOST_BACKEND_F_IOTLB_ASID)) ||
        !vhost_vdpa_net_valid_svq_features(v->dev->features, NULL)) {
        return 0;
    }

    /*
     * Check if all the data virtqueues of the virtio device are in a
     * different vq group than the last one (the CVQ). The CVQ's group is
     * stored in cvq_group.
     */
    cvq_index = v->dev->vq_index_end - 1;
    cvq_group = vhost_vdpa_get_vring_group(v->device_fd, cvq_index);
    if (unlikely(cvq_group < 0)) {
        return cvq_group;
    }
    for (int i = 0; i < cvq_index; ++i) {
        int64_t group = vhost_vdpa_get_vring_group(v->device_fd, i);

        if (unlikely(group < 0)) {
            return group;
        }

        if (group == cvq_group) {
            return 0;
        }
    }

    r = vhost_vdpa_set_address_space_id(v, cvq_group, VHOST_VDPA_NET_CVQ_ASID);
    if (unlikely(r < 0)) {
        return r;
    }

    v->shadow_vqs_enabled = true;
    s->vhost_vdpa.address_space_id = VHOST_VDPA_NET_CVQ_ASID;

out:
    if (!s->vhost_vdpa.shadow_vqs_enabled) {
        return 0;
    }

    if (s0->vhost_vdpa.iova_tree) {
        /*
         * SVQ is already configured for all virtqueues. Reuse the IOVA tree
         * for simplicity, whether CVQ shares ASID with the guest or not,
         * because:
         * - The memory listener needs access to guest's memory addresses
         *   allocated in the IOVA tree.
         * - There should be plenty of IOVA address space for both ASIDs not
         *   to worry about collisions between them. Guest's translations are
         *   still validated with virtio virtqueue_pop so there is no risk for
         *   the guest to access memory that it shouldn't.
         *
         * To allocate an IOVA tree per ASID is doable but it complicates the
         * code and it is not worth it for the moment.
         */
        v->iova_tree = s0->vhost_vdpa.iova_tree;
    } else {
        v->iova_tree = vhost_iova_tree_new(v->iova_range.first,
                                           v->iova_range.last);
    }

    r = vhost_vdpa_cvq_map_buf(&s->vhost_vdpa, s->cvq_cmd_out_buffer,
                               vhost_vdpa_net_cvq_cmd_page_len(), false);
    if (unlikely(r < 0)) {
        return r;
    }

    r = vhost_vdpa_cvq_map_buf(&s->vhost_vdpa, s->status,
                               vhost_vdpa_net_cvq_cmd_page_len(), true);
    if (unlikely(r < 0)) {
        vhost_vdpa_cvq_unmap_buf(&s->vhost_vdpa, s->cvq_cmd_out_buffer);
    }

    return r;
}

static void vhost_vdpa_net_cvq_stop(NetClientState *nc)
{
    VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);

    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);

    if (s->vhost_vdpa.shadow_vqs_enabled) {
        vhost_vdpa_cvq_unmap_buf(&s->vhost_vdpa, s->cvq_cmd_out_buffer);
        vhost_vdpa_cvq_unmap_buf(&s->vhost_vdpa, s->status);
    }

    vhost_vdpa_net_client_stop(nc);
}

static ssize_t vhost_vdpa_net_cvq_add(VhostVDPAState *s, size_t out_len,
                                      size_t in_len)
{
    /* Buffers for the device */
    const struct iovec out = {
        .iov_base = s->cvq_cmd_out_buffer,
        .iov_len = out_len,
    };
    const struct iovec in = {
        .iov_base = s->status,
        .iov_len = sizeof(virtio_net_ctrl_ack),
    };
    VhostShadowVirtqueue *svq = g_ptr_array_index(s->vhost_vdpa.shadow_vqs, 0);
    int r;

    r = vhost_svq_add(svq, &out, 1, &in, 1, NULL);
    if (unlikely(r != 0)) {
        if (unlikely(r == -ENOSPC)) {
            qemu_log_mask(LOG_GUEST_ERROR, "%s: No space on device queue\n",
                          __func__);
        }
        return r;
    }

    /*
     * We can poll here since we've had BQL from the time we sent the
     * descriptor. Also, we need to take the answer before SVQ pulls by
     * itself, when BQL is released.
     */
    return vhost_svq_poll(svq);
}

static ssize_t vhost_vdpa_net_load_cmd(VhostVDPAState *s, uint8_t class,
                                       uint8_t cmd, const void *data,
                                       size_t data_size)
{
    const struct virtio_net_ctrl_hdr ctrl = {
        .class = class,
        .cmd = cmd,
    };

    assert(data_size < vhost_vdpa_net_cvq_cmd_page_len() - sizeof(ctrl));

    memcpy(s->cvq_cmd_out_buffer, &ctrl, sizeof(ctrl));
    memcpy(s->cvq_cmd_out_buffer + sizeof(ctrl), data, data_size);

    return vhost_vdpa_net_cvq_add(s, sizeof(ctrl) + data_size,
                                  sizeof(virtio_net_ctrl_ack));
}

static int vhost_vdpa_net_load_mac(VhostVDPAState *s, const VirtIONet *n)
{
    uint64_t features = n->parent_obj.guest_features;
    if (features & BIT_ULL(VIRTIO_NET_F_CTRL_MAC_ADDR)) {
        ssize_t dev_written = vhost_vdpa_net_load_cmd(s, VIRTIO_NET_CTRL_MAC,
                                                  VIRTIO_NET_CTRL_MAC_ADDR_SET,
                                                  n->mac, sizeof(n->mac));
        if (unlikely(dev_written < 0)) {
            return dev_written;
        }

        return *s->status != VIRTIO_NET_OK;
    }

    return 0;
}

static int vhost_vdpa_net_load_mq(VhostVDPAState *s,
                                  const VirtIONet *n)
{
    struct virtio_net_ctrl_mq mq;
    uint64_t features = n->parent_obj.guest_features;
    ssize_t dev_written;

    if (!(features & BIT_ULL(VIRTIO_NET_F_MQ))) {
        return 0;
    }

    mq.virtqueue_pairs = cpu_to_le16(n->curr_queue_pairs);
    dev_written = vhost_vdpa_net_load_cmd(s, VIRTIO_NET_CTRL_MQ,
                                          VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET, &mq,
                                          sizeof(mq));
    if (unlikely(dev_written < 0)) {
        return dev_written;
    }

    return *s->status != VIRTIO_NET_OK;
}
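
/*
 * .load callback of the CVQ netclient, invoked when the vhost device is
 * (re)started.  If CVQ runs in SVQ mode, replay the guest-visible
 * control-plane state (MAC address, number of queue pairs) through the
 * shadow CVQ so the device matches the virtio-net device model.
 */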
static int vhost_vdpa_net_load(NetClientState *nc)
{
    VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
    struct vhost_vdpa *v = &s->vhost_vdpa;
    const VirtIONet *n;
    int r;

    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);

    if (!v->shadow_vqs_enabled) {
        return 0;
    }

    n = VIRTIO_NET(v->dev->vdev);
    r = vhost_vdpa_net_load_mac(s, n);
    if (unlikely(r < 0)) {
        return r;
    }
    r = vhost_vdpa_net_load_mq(s, n);
    if (unlikely(r)) {
        return r;
    }

    return 0;
}

static NetClientInfo net_vhost_vdpa_cvq_info = {
    .type = NET_CLIENT_DRIVER_VHOST_VDPA,
    .size = sizeof(VhostVDPAState),
    .receive = vhost_vdpa_receive,
    .start = vhost_vdpa_net_cvq_start,
    .load = vhost_vdpa_net_load,
    .stop = vhost_vdpa_net_cvq_stop,
    .cleanup = vhost_vdpa_cleanup,
    .has_vnet_hdr = vhost_vdpa_has_vnet_hdr,
    .has_ufo = vhost_vdpa_has_ufo,
    .check_peer_type = vhost_vdpa_check_peer_type,
};
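
/*
 * When the CVQ is shadowed, every control command the guest issues is
 * intercepted by the handler below: it is copied into the bounce buffer,
 * forwarded to the vdpa device (except VIRTIO_NET_CTRL_ANNOUNCE, which QEMU
 * emulates), and then replayed into the virtio-net device model so both
 * stay in sync.
 */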
/**
 * Validate and copy control virtqueue commands.
 *
 * Following QEMU guidelines, we offer a copy of the buffers to the device to
 * prevent TOCTOU bugs.
 */
static int vhost_vdpa_net_handle_ctrl_avail(VhostShadowVirtqueue *svq,
                                            VirtQueueElement *elem,
                                            void *opaque)
{
    VhostVDPAState *s = opaque;
    size_t in_len;
    virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
    /* Out buffer sent to both the vdpa device and the device model */
    struct iovec out = {
        .iov_base = s->cvq_cmd_out_buffer,
    };
    /* in buffer used for device model */
    const struct iovec in = {
        .iov_base = &status,
        .iov_len = sizeof(status),
    };
    ssize_t dev_written = -EINVAL;

    out.iov_len = iov_to_buf(elem->out_sg, elem->out_num, 0,
                             s->cvq_cmd_out_buffer,
                             vhost_vdpa_net_cvq_cmd_len());
    if (*(uint8_t *)s->cvq_cmd_out_buffer == VIRTIO_NET_CTRL_ANNOUNCE) {
        /*
         * Guest announce capability is emulated by qemu, so don't forward to
         * the device.
         */
        dev_written = sizeof(status);
        *s->status = VIRTIO_NET_OK;
    } else {
        dev_written = vhost_vdpa_net_cvq_add(s, out.iov_len, sizeof(status));
        if (unlikely(dev_written < 0)) {
            goto out;
        }
    }

    if (unlikely(dev_written < sizeof(status))) {
        error_report("Insufficient written data (%zu)", dev_written);
        goto out;
    }

    if (*s->status != VIRTIO_NET_OK) {
        return VIRTIO_NET_ERR;
    }

    status = VIRTIO_NET_ERR;
    virtio_net_handle_ctrl_iov(svq->vdev, &in, 1, &out, 1);
    if (status != VIRTIO_NET_OK) {
        error_report("Bad CVQ processing in model");
    }

out:
    in_len = iov_from_buf(elem->in_sg, elem->in_num, 0, &status,
                          sizeof(status));
    if (unlikely(in_len < sizeof(status))) {
        error_report("Bad device CVQ written length");
    }
    vhost_svq_push_elem(svq, elem, MIN(in_len, sizeof(status)));
    g_free(elem);
    return dev_written < 0 ? dev_written : 0;
}

static const VhostShadowVirtqueueOps vhost_vdpa_net_svq_ops = {
    .avail_handler = vhost_vdpa_net_handle_ctrl_avail,
};
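
/*
 * Create a single vhost-vdpa netclient, either for a data queue pair
 * (is_datapath) or for the control virtqueue.  The CVQ client allocates
 * page-aligned bounce buffers for shadowed control commands and installs a
 * migration blocker, since CVQ state cannot yet be restored on the
 * destination.
 */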
static NetClientState *net_vhost_vdpa_init(NetClientState *peer,
                                           const char *device,
                                           const char *name,
                                           int vdpa_device_fd,
                                           int queue_pair_index,
                                           int nvqs,
                                           bool is_datapath,
                                           bool svq,
                                           struct vhost_vdpa_iova_range iova_range,
                                           uint64_t features)
{
    NetClientState *nc = NULL;
    VhostVDPAState *s;
    int ret = 0;
    assert(name);
    if (is_datapath) {
        nc = qemu_new_net_client(&net_vhost_vdpa_info, peer, device,
                                 name);
    } else {
        nc = qemu_new_net_control_client(&net_vhost_vdpa_cvq_info, peer,
                                         device, name);
    }
    qemu_set_info_str(nc, TYPE_VHOST_VDPA);
    s = DO_UPCAST(VhostVDPAState, nc, nc);

    s->vhost_vdpa.device_fd = vdpa_device_fd;
    s->vhost_vdpa.index = queue_pair_index;
    s->always_svq = svq;
    s->migration_state.notify = vdpa_net_migration_state_notifier;
    s->vhost_vdpa.shadow_vqs_enabled = svq;
    s->vhost_vdpa.iova_range = iova_range;
    s->vhost_vdpa.shadow_data = svq;
    if (queue_pair_index == 0) {
        vhost_vdpa_net_valid_svq_features(features,
                                          &s->vhost_vdpa.migration_blocker);
    } else if (!is_datapath) {
        s->cvq_cmd_out_buffer = qemu_memalign(qemu_real_host_page_size(),
                                              vhost_vdpa_net_cvq_cmd_page_len());
        memset(s->cvq_cmd_out_buffer, 0, vhost_vdpa_net_cvq_cmd_page_len());
        s->status = qemu_memalign(qemu_real_host_page_size(),
                                  vhost_vdpa_net_cvq_cmd_page_len());
        memset(s->status, 0, vhost_vdpa_net_cvq_cmd_page_len());

        s->vhost_vdpa.shadow_vq_ops = &vhost_vdpa_net_svq_ops;
        s->vhost_vdpa.shadow_vq_ops_opaque = s;

        /*
         * TODO: We cannot migrate devices with CVQ as there is no way to set
         * the device state (MAC, MQ, etc) before starting the datapath.
         *
         * Migration blocker ownership now belongs to s->vhost_vdpa.
         */
        error_setg(&s->vhost_vdpa.migration_blocker,
                   "net vdpa cannot migrate with CVQ feature");
    }
    ret = vhost_vdpa_add(nc, (void *)&s->vhost_vdpa, queue_pair_index, nvqs);
    if (ret) {
        qemu_del_net_client(nc);
        return NULL;
    }
    return nc;
}

static int vhost_vdpa_get_features(int fd, uint64_t *features, Error **errp)
{
    int ret = ioctl(fd, VHOST_GET_FEATURES, features);
    if (unlikely(ret < 0)) {
        error_setg_errno(errp, errno,
                         "Fail to query features from vhost-vDPA device");
    }
    return ret;
}

static int vhost_vdpa_get_max_queue_pairs(int fd, uint64_t features,
                                          int *has_cvq, Error **errp)
{
    unsigned long config_size = offsetof(struct vhost_vdpa_config, buf);
    g_autofree struct vhost_vdpa_config *config = NULL;
    __virtio16 *max_queue_pairs;
    int ret;

    if (features & (1 << VIRTIO_NET_F_CTRL_VQ)) {
        *has_cvq = 1;
    } else {
        *has_cvq = 0;
    }

    if (features & (1 << VIRTIO_NET_F_MQ)) {
        config = g_malloc0(config_size + sizeof(*max_queue_pairs));
        config->off = offsetof(struct virtio_net_config, max_virtqueue_pairs);
        config->len = sizeof(*max_queue_pairs);

        ret = ioctl(fd, VHOST_VDPA_GET_CONFIG, config);
        if (ret) {
            error_setg(errp, "Fail to get config from vhost-vDPA device");
            return -ret;
        }

        max_queue_pairs = (__virtio16 *)&config->buf;

        return lduw_le_p(max_queue_pairs);
    }

    return 1;
}
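
/*
 * Entry point for "-netdev vhost-vdpa".  Either vhostdev= (a vhost-vdpa
 * character device path) or vhostfd= must be given, but not both.  An
 * illustrative command line (device path and id are placeholders):
 *
 *   -netdev type=vhost-vdpa,vhostdev=/dev/vhost-vdpa-0,id=vdpa0 \
 *   -device virtio-net-pci,netdev=vdpa0
 */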
int net_init_vhost_vdpa(const Netdev *netdev, const char *name,
                        NetClientState *peer, Error **errp)
{
    const NetdevVhostVDPAOptions *opts;
    uint64_t features;
    int vdpa_device_fd;
    g_autofree NetClientState **ncs = NULL;
    struct vhost_vdpa_iova_range iova_range;
    NetClientState *nc;
    int queue_pairs, r, i = 0, has_cvq = 0;

    assert(netdev->type == NET_CLIENT_DRIVER_VHOST_VDPA);
    opts = &netdev->u.vhost_vdpa;
    if (!opts->vhostdev && !opts->vhostfd) {
        error_setg(errp,
                   "vhost-vdpa: neither vhostdev= nor vhostfd= was specified");
        return -1;
    }

    if (opts->vhostdev && opts->vhostfd) {
        error_setg(errp,
                   "vhost-vdpa: vhostdev= and vhostfd= are mutually exclusive");
        return -1;
    }

    if (opts->vhostdev) {
        vdpa_device_fd = qemu_open(opts->vhostdev, O_RDWR, errp);
        if (vdpa_device_fd == -1) {
            return -errno;
        }
    } else {
        /* has_vhostfd */
        vdpa_device_fd = monitor_fd_param(monitor_cur(), opts->vhostfd, errp);
        if (vdpa_device_fd == -1) {
            error_prepend(errp, "vhost-vdpa: unable to parse vhostfd: ");
            return -1;
        }
    }

    r = vhost_vdpa_get_features(vdpa_device_fd, &features, errp);
    if (unlikely(r < 0)) {
        goto err;
    }

    queue_pairs = vhost_vdpa_get_max_queue_pairs(vdpa_device_fd, features,
                                                 &has_cvq, errp);
    if (queue_pairs < 0) {
        qemu_close(vdpa_device_fd);
        return queue_pairs;
    }

    r = vhost_vdpa_get_iova_range(vdpa_device_fd, &iova_range);
    if (unlikely(r < 0)) {
        error_setg(errp, "vhost-vdpa: get iova range failed: %s",
                   strerror(-r));
        goto err;
    }

    if (opts->x_svq && !vhost_vdpa_net_valid_svq_features(features, errp)) {
        goto err;
    }

    ncs = g_malloc0(sizeof(*ncs) * queue_pairs);

    for (i = 0; i < queue_pairs; i++) {
        ncs[i] = net_vhost_vdpa_init(peer, TYPE_VHOST_VDPA, name,
                                     vdpa_device_fd, i, 2, true, opts->x_svq,
                                     iova_range, features);
        if (!ncs[i])
            goto err;
    }

    if (has_cvq) {
        nc = net_vhost_vdpa_init(peer, TYPE_VHOST_VDPA, name,
                                 vdpa_device_fd, i, 1, false,
                                 opts->x_svq, iova_range, features);
        if (!nc)
            goto err;
    }

    return 0;

err:
    if (i) {
        for (i--; i >= 0; i--) {
            qemu_del_net_client(ncs[i]);
        }
    }

    qemu_close(vdpa_device_fd);

    return -1;
}