/*
 * vhost-vdpa.c
 *
 * Copyright(c) 2017-2018 Intel Corporation.
 * Copyright(c) 2020 Red Hat, Inc.
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include "qemu/osdep.h"
#include "clients.h"
#include "hw/virtio/virtio-net.h"
#include "net/vhost_net.h"
#include "net/vhost-vdpa.h"
#include "hw/virtio/vhost-vdpa.h"
#include "qemu/config-file.h"
#include "qemu/error-report.h"
#include "qemu/log.h"
#include "qemu/memalign.h"
#include "qemu/option.h"
#include "qapi/error.h"
#include <linux/vhost.h>
#include <sys/ioctl.h>
#include <err.h>
#include "standard-headers/linux/virtio_net.h"
#include "monitor/monitor.h"
#include "migration/migration.h"
#include "migration/misc.h"
#include "hw/virtio/vhost.h"

/* TODO: need to add multiqueue support here */
typedef struct VhostVDPAState {
    NetClientState nc;
    struct vhost_vdpa vhost_vdpa;
    Notifier migration_state;
    VHostNetState *vhost_net;

    /* Control commands shadow buffers */
    void *cvq_cmd_out_buffer;
    virtio_net_ctrl_ack *status;

    /* The device always has SVQ enabled */
    bool always_svq;
    bool started;
} VhostVDPAState;

const int vdpa_feature_bits[] = {
    VIRTIO_F_NOTIFY_ON_EMPTY,
    VIRTIO_RING_F_INDIRECT_DESC,
    VIRTIO_RING_F_EVENT_IDX,
    VIRTIO_F_ANY_LAYOUT,
    VIRTIO_F_VERSION_1,
    VIRTIO_NET_F_CSUM,
    VIRTIO_NET_F_GUEST_CSUM,
    VIRTIO_NET_F_GSO,
    VIRTIO_NET_F_GUEST_TSO4,
    VIRTIO_NET_F_GUEST_TSO6,
    VIRTIO_NET_F_GUEST_ECN,
    VIRTIO_NET_F_GUEST_UFO,
    VIRTIO_NET_F_HOST_TSO4,
    VIRTIO_NET_F_HOST_TSO6,
    VIRTIO_NET_F_HOST_ECN,
    VIRTIO_NET_F_HOST_UFO,
    VIRTIO_NET_F_MRG_RXBUF,
    VIRTIO_NET_F_MTU,
    VIRTIO_NET_F_CTRL_RX,
    VIRTIO_NET_F_CTRL_RX_EXTRA,
    VIRTIO_NET_F_CTRL_VLAN,
    VIRTIO_NET_F_CTRL_MAC_ADDR,
    VIRTIO_NET_F_RSS,
    VIRTIO_NET_F_MQ,
    VIRTIO_NET_F_CTRL_VQ,
    VIRTIO_F_IOMMU_PLATFORM,
    VIRTIO_F_RING_PACKED,
    VIRTIO_F_RING_RESET,
    VIRTIO_NET_F_HASH_REPORT,
    VIRTIO_NET_F_STATUS,
    VHOST_INVALID_FEATURE_BIT
};

/** Supported device specific feature bits with SVQ */
static const uint64_t vdpa_svq_device_features =
    BIT_ULL(VIRTIO_NET_F_CSUM) |
    BIT_ULL(VIRTIO_NET_F_GUEST_CSUM) |
    BIT_ULL(VIRTIO_NET_F_MTU) |
    BIT_ULL(VIRTIO_NET_F_MAC) |
    BIT_ULL(VIRTIO_NET_F_GUEST_TSO4) |
    BIT_ULL(VIRTIO_NET_F_GUEST_TSO6) |
    BIT_ULL(VIRTIO_NET_F_GUEST_ECN) |
    BIT_ULL(VIRTIO_NET_F_GUEST_UFO) |
    BIT_ULL(VIRTIO_NET_F_HOST_TSO4) |
    BIT_ULL(VIRTIO_NET_F_HOST_TSO6) |
    BIT_ULL(VIRTIO_NET_F_HOST_ECN) |
    BIT_ULL(VIRTIO_NET_F_HOST_UFO) |
    BIT_ULL(VIRTIO_NET_F_MRG_RXBUF) |
    BIT_ULL(VIRTIO_NET_F_STATUS) |
    BIT_ULL(VIRTIO_NET_F_CTRL_VQ) |
    BIT_ULL(VIRTIO_NET_F_MQ) |
    BIT_ULL(VIRTIO_F_ANY_LAYOUT) |
    BIT_ULL(VIRTIO_NET_F_CTRL_MAC_ADDR) |
    /* VHOST_F_LOG_ALL is exposed by SVQ */
    BIT_ULL(VHOST_F_LOG_ALL) |
    BIT_ULL(VIRTIO_NET_F_RSC_EXT) |
    BIT_ULL(VIRTIO_NET_F_STANDBY) |
    BIT_ULL(VIRTIO_NET_F_SPEED_DUPLEX);

#define VHOST_VDPA_NET_CVQ_ASID 1

VHostNetState *vhost_vdpa_get_vhost_net(NetClientState *nc)
{
    VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
    return s->vhost_net;
}
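
/*
 * Check whether SVQ can shadow a device offering @features.  Transport
 * feature bits are always accepted here; device-specific bits must be part
 * of vdpa_svq_device_features.
 */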
static bool vhost_vdpa_net_valid_svq_features(uint64_t features, Error **errp)
{
    uint64_t invalid_dev_features =
        features & ~vdpa_svq_device_features &
        /* Transport features are all accepted at this point */
        ~MAKE_64BIT_MASK(VIRTIO_TRANSPORT_F_START,
                         VIRTIO_TRANSPORT_F_END - VIRTIO_TRANSPORT_F_START);

    if (invalid_dev_features) {
        error_setg(errp, "vdpa svq does not work with features 0x%" PRIx64,
                   invalid_dev_features);
        return false;
    }

    return vhost_svq_valid_features(features, errp);
}

static int vhost_vdpa_net_check_device_id(struct vhost_net *net)
{
    uint32_t device_id;
    int ret;
    struct vhost_dev *hdev;

    hdev = (struct vhost_dev *)&net->dev;
    ret = hdev->vhost_ops->vhost_get_device_id(hdev, &device_id);
    if (ret) {
        return ret;
    }
    if (device_id != VIRTIO_ID_NET) {
        return -ENOTSUP;
    }
    return 0;
}

static int vhost_vdpa_add(NetClientState *ncs, void *be,
                          int queue_pair_index, int nvqs)
{
    VhostNetOptions options;
    struct vhost_net *net = NULL;
    VhostVDPAState *s;
    int ret;

    options.backend_type = VHOST_BACKEND_TYPE_VDPA;
    assert(ncs->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
    s = DO_UPCAST(VhostVDPAState, nc, ncs);
    options.net_backend = ncs;
    options.opaque = be;
    options.busyloop_timeout = 0;
    options.nvqs = nvqs;

    net = vhost_net_init(&options);
    if (!net) {
        error_report("failed to init vhost_net for queue");
        goto err_init;
    }
    s->vhost_net = net;
    ret = vhost_vdpa_net_check_device_id(net);
    if (ret) {
        goto err_check;
    }
    return 0;
err_check:
    vhost_net_cleanup(net);
    g_free(net);
err_init:
    return -1;
}

static void vhost_vdpa_cleanup(NetClientState *nc)
{
    VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);

    qemu_vfree(s->cvq_cmd_out_buffer);
    qemu_vfree(s->status);
    if (s->vhost_net) {
        vhost_net_cleanup(s->vhost_net);
        g_free(s->vhost_net);
        s->vhost_net = NULL;
    }
    if (s->vhost_vdpa.device_fd >= 0) {
        qemu_close(s->vhost_vdpa.device_fd);
        s->vhost_vdpa.device_fd = -1;
    }
}

static bool vhost_vdpa_has_vnet_hdr(NetClientState *nc)
{
    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);

    return true;
}

static bool vhost_vdpa_has_ufo(NetClientState *nc)
{
    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
    VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
    uint64_t features = 0;
    features |= (1ULL << VIRTIO_NET_F_HOST_UFO);
    features = vhost_net_get_features(s->vhost_net, features);
    return !!(features & (1ULL << VIRTIO_NET_F_HOST_UFO));
}

static bool vhost_vdpa_check_peer_type(NetClientState *nc, ObjectClass *oc,
                                       Error **errp)
{
    const char *driver = object_class_get_name(oc);

    if (!g_str_has_prefix(driver, "virtio-net-")) {
        error_setg(errp, "vhost-vdpa requires frontend driver virtio-net-*");
        return false;
    }

    return true;
}

/** Dummy receive in case qemu falls back to userland tap networking */
static ssize_t vhost_vdpa_receive(NetClientState *nc, const uint8_t *buf,
                                  size_t size)
{
    return size;
}

/** From any vdpa net client, get the netclient of the first queue pair */
static VhostVDPAState *vhost_vdpa_net_first_nc_vdpa(VhostVDPAState *s)
{
    NICState *nic = qemu_get_nic(s->nc.peer);
    NetClientState *nc0 = qemu_get_peer(nic->ncs, 0);

    return DO_UPCAST(VhostVDPAState, nc, nc0);
}
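
/*
 * Toggle SVQ mode for the data virtqueues when migration starts or fails.
 * If the requested mode differs from the current one, the vhost-net backend
 * is stopped and restarted so the start path reconfigures SVQ (and with it
 * dirty page logging) according to the migration state.
 */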
static void vhost_vdpa_net_log_global_enable(VhostVDPAState *s, bool enable)
{
    struct vhost_vdpa *v = &s->vhost_vdpa;
    VirtIONet *n;
    VirtIODevice *vdev;
    int data_queue_pairs, cvq, r;

    /* We are only called on the first data vq and only if x-svq is not set */
    if (s->vhost_vdpa.shadow_vqs_enabled == enable) {
        return;
    }

    vdev = v->dev->vdev;
    n = VIRTIO_NET(vdev);
    if (!n->vhost_started) {
        return;
    }

    data_queue_pairs = n->multiqueue ? n->max_queue_pairs : 1;
    cvq = virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ) ?
                                  n->max_ncs - n->max_queue_pairs : 0;
    /*
     * TODO: vhost_net_stop does suspend, get_base and reset. We can be
     * smarter in the future and resume the device if read-only operations
     * between suspend and reset go wrong.
     */
    vhost_net_stop(vdev, n->nic->ncs, data_queue_pairs, cvq);

    /* Start will check migration setup_or_active to configure SVQ or not */
    r = vhost_net_start(vdev, n->nic->ncs, data_queue_pairs, cvq);
    if (unlikely(r < 0)) {
        error_report("unable to start vhost net: %s(%d)", g_strerror(-r), -r);
    }
}

static void vdpa_net_migration_state_notifier(Notifier *notifier, void *data)
{
    MigrationState *migration = data;
    VhostVDPAState *s = container_of(notifier, VhostVDPAState,
                                     migration_state);

    if (migration_in_setup(migration)) {
        vhost_vdpa_net_log_global_enable(s, true);
    } else if (migration_has_failed(migration)) {
        vhost_vdpa_net_log_global_enable(s, false);
    }
}

static void vhost_vdpa_net_data_start_first(VhostVDPAState *s)
{
    struct vhost_vdpa *v = &s->vhost_vdpa;

    add_migration_state_change_notifier(&s->migration_state);
    if (v->shadow_vqs_enabled) {
        v->iova_tree = vhost_iova_tree_new(v->iova_range.first,
                                           v->iova_range.last);
    }
}

static int vhost_vdpa_net_data_start(NetClientState *nc)
{
    VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
    struct vhost_vdpa *v = &s->vhost_vdpa;

    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);

    if (s->always_svq ||
        migration_is_setup_or_active(migrate_get_current()->state)) {
        v->shadow_vqs_enabled = true;
        v->shadow_data = true;
    } else {
        v->shadow_vqs_enabled = false;
        v->shadow_data = false;
    }

    if (v->index == 0) {
        vhost_vdpa_net_data_start_first(s);
        return 0;
    }

    if (v->shadow_vqs_enabled) {
        VhostVDPAState *s0 = vhost_vdpa_net_first_nc_vdpa(s);
        v->iova_tree = s0->vhost_vdpa.iova_tree;
    }

    return 0;
}

static void vhost_vdpa_net_client_stop(NetClientState *nc)
{
    VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
    struct vhost_dev *dev;

    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);

    if (s->vhost_vdpa.index == 0) {
        remove_migration_state_change_notifier(&s->migration_state);
    }

    dev = s->vhost_vdpa.dev;
    if (dev->vq_index + dev->nvqs == dev->vq_index_end) {
        g_clear_pointer(&s->vhost_vdpa.iova_tree, vhost_iova_tree_delete);
    }
}

static NetClientInfo net_vhost_vdpa_info = {
        .type = NET_CLIENT_DRIVER_VHOST_VDPA,
        .size = sizeof(VhostVDPAState),
        .receive = vhost_vdpa_receive,
        .start = vhost_vdpa_net_data_start,
        .stop = vhost_vdpa_net_client_stop,
        .cleanup = vhost_vdpa_cleanup,
        .has_vnet_hdr = vhost_vdpa_has_vnet_hdr,
        .has_ufo = vhost_vdpa_has_ufo,
        .check_peer_type = vhost_vdpa_check_peer_type,
};
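
/*
 * Return the virtqueue group that virtqueue @vq_index belongs to, or a
 * negative value if the VHOST_VDPA_GET_VRING_GROUP ioctl fails.
 */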
static int64_t vhost_vdpa_get_vring_group(int device_fd, unsigned vq_index)
{
    struct vhost_vring_state state = {
        .index = vq_index,
    };
    int r = ioctl(device_fd, VHOST_VDPA_GET_VRING_GROUP, &state);

    if (unlikely(r < 0)) {
        error_report("Cannot get VQ %u group: %s", vq_index,
                     g_strerror(errno));
        return r;
    }

    return state.num;
}

static int vhost_vdpa_set_address_space_id(struct vhost_vdpa *v,
                                           unsigned vq_group,
                                           unsigned asid_num)
{
    struct vhost_vring_state asid = {
        .index = vq_group,
        .num = asid_num,
    };
    int r;

    r = ioctl(v->device_fd, VHOST_VDPA_SET_GROUP_ASID, &asid);
    if (unlikely(r < 0)) {
        error_report("Can't set vq group %u asid %u, errno=%d (%s)",
                     asid.index, asid.num, errno, g_strerror(errno));
    }
    return r;
}

static void vhost_vdpa_cvq_unmap_buf(struct vhost_vdpa *v, void *addr)
{
    VhostIOVATree *tree = v->iova_tree;
    DMAMap needle = {
        /*
         * No need to specify size or to look for more translations since
         * this contiguous chunk was allocated by us.
         */
        .translated_addr = (hwaddr)(uintptr_t)addr,
    };
    const DMAMap *map = vhost_iova_tree_find_iova(tree, &needle);
    int r;

    if (unlikely(!map)) {
        error_report("Cannot locate expected map");
        return;
    }

    r = vhost_vdpa_dma_unmap(v, v->address_space_id, map->iova, map->size + 1);
    if (unlikely(r != 0)) {
        error_report("Device cannot unmap: %s(%d)", g_strerror(r), r);
    }

    vhost_iova_tree_remove(tree, *map);
}

static size_t vhost_vdpa_net_cvq_cmd_len(void)
{
    /*
     * MAC_TABLE_SET is the ctrl command that produces the longest out buffer.
     * The in buffer is always 1 byte, so it fits here as well.
     */
    return sizeof(struct virtio_net_ctrl_hdr) +
           2 * sizeof(struct virtio_net_ctrl_mac) +
           MAC_TABLE_ENTRIES * ETH_ALEN;
}

static size_t vhost_vdpa_net_cvq_cmd_page_len(void)
{
    return ROUND_UP(vhost_vdpa_net_cvq_cmd_len(), qemu_real_host_page_size());
}

/** Map CVQ buffer. */
static int vhost_vdpa_cvq_map_buf(struct vhost_vdpa *v, void *buf, size_t size,
                                  bool write)
{
    DMAMap map = {};
    int r;

    map.translated_addr = (hwaddr)(uintptr_t)buf;
    map.size = size - 1;
    map.perm = write ? IOMMU_RW : IOMMU_RO;
    r = vhost_iova_tree_map_alloc(v->iova_tree, &map);
    if (unlikely(r != IOVA_OK)) {
        error_report("Cannot map injected element");
        return r;
    }

    r = vhost_vdpa_dma_map(v, v->address_space_id, map.iova,
                           vhost_vdpa_net_cvq_cmd_page_len(), buf, !write);
    if (unlikely(r < 0)) {
        goto dma_map_err;
    }

    return 0;

dma_map_err:
    vhost_iova_tree_remove(v->iova_tree, map);
    return r;
}
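
/*
 * Start the control virtqueue client.  When the device places CVQ in a
 * different virtqueue group than the data virtqueues, CVQ is moved to its
 * own address space ID and shadowed by SVQ, and the shadow command/status
 * buffers are mapped into that ASID.
 */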
static int vhost_vdpa_net_cvq_start(NetClientState *nc)
{
    VhostVDPAState *s, *s0;
    struct vhost_vdpa *v;
    uint64_t backend_features;
    int64_t cvq_group;
    int cvq_index, r;

    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);

    s = DO_UPCAST(VhostVDPAState, nc, nc);
    v = &s->vhost_vdpa;

    s0 = vhost_vdpa_net_first_nc_vdpa(s);
    v->shadow_data = s0->vhost_vdpa.shadow_vqs_enabled;
    v->shadow_vqs_enabled = s->always_svq;
    s->vhost_vdpa.address_space_id = VHOST_VDPA_GUEST_PA_ASID;

    if (s->vhost_vdpa.shadow_data) {
        /* SVQ is already configured for all virtqueues */
        goto out;
    }

    /*
     * If we return early here, SVQ will not be enabled and migration will be
     * blocked as long as the vhost-vdpa backend does not offer _F_LOG.
     *
     * We call VHOST_GET_BACKEND_FEATURES because the backend features are
     * not available in v->dev yet.
     */
    r = ioctl(v->device_fd, VHOST_GET_BACKEND_FEATURES, &backend_features);
    if (unlikely(r < 0)) {
        error_report("Cannot get vdpa backend_features: %s(%d)",
                     g_strerror(errno), errno);
        return -1;
    }
    if (!(backend_features & BIT_ULL(VHOST_BACKEND_F_IOTLB_ASID)) ||
        !vhost_vdpa_net_valid_svq_features(v->dev->features, NULL)) {
        return 0;
    }

    /*
     * Check if all the data virtqueues are in a different VQ group than the
     * last one (CVQ), whose group is stored in cvq_group.
     */
    cvq_index = v->dev->vq_index_end - 1;
    cvq_group = vhost_vdpa_get_vring_group(v->device_fd, cvq_index);
    if (unlikely(cvq_group < 0)) {
        return cvq_group;
    }
    for (int i = 0; i < cvq_index; ++i) {
        int64_t group = vhost_vdpa_get_vring_group(v->device_fd, i);

        if (unlikely(group < 0)) {
            return group;
        }

        if (group == cvq_group) {
            return 0;
        }
    }

    r = vhost_vdpa_set_address_space_id(v, cvq_group, VHOST_VDPA_NET_CVQ_ASID);
    if (unlikely(r < 0)) {
        return r;
    }

    v->shadow_vqs_enabled = true;
    s->vhost_vdpa.address_space_id = VHOST_VDPA_NET_CVQ_ASID;

out:
    if (!s->vhost_vdpa.shadow_vqs_enabled) {
        return 0;
    }

    if (s0->vhost_vdpa.iova_tree) {
        /*
         * SVQ is already configured for all virtqueues. Reuse the IOVA tree
         * for simplicity, whether CVQ shares ASID with the guest or not,
         * because:
         * - The memory listener needs access to guest's memory addresses
         *   allocated in the IOVA tree.
         * - There should be plenty of IOVA address space for both ASIDs not
         *   to worry about collisions between them. Guest's translations are
         *   still validated with virtio virtqueue_pop so there is no risk for
         *   the guest to access memory that it shouldn't.
         *
         * Allocating an IOVA tree per ASID is doable but it complicates the
         * code and it is not worth it for the moment.
         */
        v->iova_tree = s0->vhost_vdpa.iova_tree;
    } else {
        v->iova_tree = vhost_iova_tree_new(v->iova_range.first,
                                           v->iova_range.last);
    }

    r = vhost_vdpa_cvq_map_buf(&s->vhost_vdpa, s->cvq_cmd_out_buffer,
                               vhost_vdpa_net_cvq_cmd_page_len(), false);
    if (unlikely(r < 0)) {
        return r;
    }

    r = vhost_vdpa_cvq_map_buf(&s->vhost_vdpa, s->status,
                               vhost_vdpa_net_cvq_cmd_page_len(), true);
    if (unlikely(r < 0)) {
        vhost_vdpa_cvq_unmap_buf(&s->vhost_vdpa, s->cvq_cmd_out_buffer);
    }

    return r;
}

static void vhost_vdpa_net_cvq_stop(NetClientState *nc)
{
    VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);

    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);

    if (s->vhost_vdpa.shadow_vqs_enabled) {
        vhost_vdpa_cvq_unmap_buf(&s->vhost_vdpa, s->cvq_cmd_out_buffer);
        vhost_vdpa_cvq_unmap_buf(&s->vhost_vdpa, s->status);
    }

    vhost_vdpa_net_client_stop(nc);
}
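
/*
 * Add a control command to the shadow CVQ and wait for the device to consume
 * it.  The command is read from s->cvq_cmd_out_buffer and the device's ack is
 * written to s->status.  Returns the length written by the device or a
 * negative error.
 */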
static ssize_t vhost_vdpa_net_cvq_add(VhostVDPAState *s, size_t out_len,
                                      size_t in_len)
{
    /* Buffers for the device */
    const struct iovec out = {
        .iov_base = s->cvq_cmd_out_buffer,
        .iov_len = out_len,
    };
    const struct iovec in = {
        .iov_base = s->status,
        .iov_len = sizeof(virtio_net_ctrl_ack),
    };
    VhostShadowVirtqueue *svq = g_ptr_array_index(s->vhost_vdpa.shadow_vqs, 0);
    int r;

    r = vhost_svq_add(svq, &out, 1, &in, 1, NULL);
    if (unlikely(r != 0)) {
        if (unlikely(r == -ENOSPC)) {
            qemu_log_mask(LOG_GUEST_ERROR, "%s: No space on device queue\n",
                          __func__);
        }
        return r;
    }

    /*
     * We can poll here since we've held the BQL from the time we sent the
     * descriptor. Also, we need to take the answer before SVQ pulls by
     * itself, when the BQL is released.
     */
    return vhost_svq_poll(svq);
}

static ssize_t vhost_vdpa_net_load_cmd(VhostVDPAState *s, uint8_t class,
                                       uint8_t cmd, const void *data,
                                       size_t data_size)
{
    const struct virtio_net_ctrl_hdr ctrl = {
        .class = class,
        .cmd = cmd,
    };

    assert(data_size < vhost_vdpa_net_cvq_cmd_page_len() - sizeof(ctrl));

    memcpy(s->cvq_cmd_out_buffer, &ctrl, sizeof(ctrl));
    memcpy(s->cvq_cmd_out_buffer + sizeof(ctrl), data, data_size);

    return vhost_vdpa_net_cvq_add(s, sizeof(ctrl) + data_size,
                                  sizeof(virtio_net_ctrl_ack));
}

static int vhost_vdpa_net_load_mac(VhostVDPAState *s, const VirtIONet *n)
{
    uint64_t features = n->parent_obj.guest_features;
    if (features & BIT_ULL(VIRTIO_NET_F_CTRL_MAC_ADDR)) {
        ssize_t dev_written = vhost_vdpa_net_load_cmd(s, VIRTIO_NET_CTRL_MAC,
                                                  VIRTIO_NET_CTRL_MAC_ADDR_SET,
                                                  n->mac, sizeof(n->mac));
        if (unlikely(dev_written < 0)) {
            return dev_written;
        }

        return *s->status != VIRTIO_NET_OK;
    }

    return 0;
}

static int vhost_vdpa_net_load_mq(VhostVDPAState *s,
                                  const VirtIONet *n)
{
    struct virtio_net_ctrl_mq mq;
    uint64_t features = n->parent_obj.guest_features;
    ssize_t dev_written;

    if (!(features & BIT_ULL(VIRTIO_NET_F_MQ))) {
        return 0;
    }

    mq.virtqueue_pairs = cpu_to_le16(n->curr_queue_pairs);
    dev_written = vhost_vdpa_net_load_cmd(s, VIRTIO_NET_CTRL_MQ,
                                          VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET, &mq,
                                          sizeof(mq));
    if (unlikely(dev_written < 0)) {
        return dev_written;
    }

    return *s->status != VIRTIO_NET_OK;
}
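
/*
 * Replay the virtio-net control state (MAC address, number of queue pairs)
 * into the device through the shadow CVQ when the vhost device is started,
 * so the device state matches the one in the virtio-net frontend.
 */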
static int vhost_vdpa_net_load(NetClientState *nc)
{
    VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
    struct vhost_vdpa *v = &s->vhost_vdpa;
    const VirtIONet *n;
    int r;

    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);

    if (!v->shadow_vqs_enabled) {
        return 0;
    }

    n = VIRTIO_NET(v->dev->vdev);
    r = vhost_vdpa_net_load_mac(s, n);
    if (unlikely(r < 0)) {
        return r;
    }
    r = vhost_vdpa_net_load_mq(s, n);
    if (unlikely(r)) {
        return r;
    }

    return 0;
}

static NetClientInfo net_vhost_vdpa_cvq_info = {
    .type = NET_CLIENT_DRIVER_VHOST_VDPA,
    .size = sizeof(VhostVDPAState),
    .receive = vhost_vdpa_receive,
    .start = vhost_vdpa_net_cvq_start,
    .load = vhost_vdpa_net_load,
    .stop = vhost_vdpa_net_cvq_stop,
    .cleanup = vhost_vdpa_cleanup,
    .has_vnet_hdr = vhost_vdpa_has_vnet_hdr,
    .has_ufo = vhost_vdpa_has_ufo,
    .check_peer_type = vhost_vdpa_check_peer_type,
};

/**
 * Validate and copy control virtqueue commands.
 *
 * Following QEMU guidelines, we offer a copy of the buffers to the device to
 * prevent TOCTOU bugs.
 */
static int vhost_vdpa_net_handle_ctrl_avail(VhostShadowVirtqueue *svq,
                                            VirtQueueElement *elem,
                                            void *opaque)
{
    VhostVDPAState *s = opaque;
    size_t in_len;
    virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
    /* Out buffer sent to both the vdpa device and the device model */
    struct iovec out = {
        .iov_base = s->cvq_cmd_out_buffer,
    };
    /* in buffer used for device model */
    const struct iovec in = {
        .iov_base = &status,
        .iov_len = sizeof(status),
    };
    ssize_t dev_written = -EINVAL;

    out.iov_len = iov_to_buf(elem->out_sg, elem->out_num, 0,
                             s->cvq_cmd_out_buffer,
                             vhost_vdpa_net_cvq_cmd_len());
    if (*(uint8_t *)s->cvq_cmd_out_buffer == VIRTIO_NET_CTRL_ANNOUNCE) {
        /*
         * Guest announce capability is emulated by qemu, so don't forward to
         * the device.
         */
        dev_written = sizeof(status);
        *s->status = VIRTIO_NET_OK;
    } else {
        dev_written = vhost_vdpa_net_cvq_add(s, out.iov_len, sizeof(status));
        if (unlikely(dev_written < 0)) {
            goto out;
        }
    }

    if (unlikely(dev_written < sizeof(status))) {
        error_report("Insufficient written data (%zu)", dev_written);
        goto out;
    }

    if (*s->status != VIRTIO_NET_OK) {
        return VIRTIO_NET_ERR;
    }

    status = VIRTIO_NET_ERR;
    virtio_net_handle_ctrl_iov(svq->vdev, &in, 1, &out, 1);
    if (status != VIRTIO_NET_OK) {
        error_report("Bad CVQ processing in model");
    }

out:
    in_len = iov_from_buf(elem->in_sg, elem->in_num, 0, &status,
                          sizeof(status));
    if (unlikely(in_len < sizeof(status))) {
        error_report("Bad device CVQ written length");
    }
    vhost_svq_push_elem(svq, elem, MIN(in_len, sizeof(status)));
    g_free(elem);
    return dev_written < 0 ? dev_written : 0;
}

static const VhostShadowVirtqueueOps vhost_vdpa_net_svq_ops = {
    .avail_handler = vhost_vdpa_net_handle_ctrl_avail,
};
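
/*
 * Create one vhost-vdpa net client.  Data path clients use
 * net_vhost_vdpa_info; the control virtqueue client uses
 * net_vhost_vdpa_cvq_info and gets page-aligned shadow buffers for the CVQ
 * commands and their status reply.
 */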
static NetClientState *net_vhost_vdpa_init(NetClientState *peer,
                                           const char *device,
                                           const char *name,
                                           int vdpa_device_fd,
                                           int queue_pair_index,
                                           int nvqs,
                                           bool is_datapath,
                                           bool svq,
                                           struct vhost_vdpa_iova_range iova_range,
                                           uint64_t features)
{
    NetClientState *nc = NULL;
    VhostVDPAState *s;
    int ret = 0;
    assert(name);
    if (is_datapath) {
        nc = qemu_new_net_client(&net_vhost_vdpa_info, peer, device,
                                 name);
    } else {
        nc = qemu_new_net_control_client(&net_vhost_vdpa_cvq_info, peer,
                                         device, name);
    }
    qemu_set_info_str(nc, TYPE_VHOST_VDPA);
    s = DO_UPCAST(VhostVDPAState, nc, nc);

    s->vhost_vdpa.device_fd = vdpa_device_fd;
    s->vhost_vdpa.index = queue_pair_index;
    s->always_svq = svq;
    s->migration_state.notify = vdpa_net_migration_state_notifier;
    s->vhost_vdpa.shadow_vqs_enabled = svq;
    s->vhost_vdpa.iova_range = iova_range;
    s->vhost_vdpa.shadow_data = svq;
    if (queue_pair_index == 0) {
        vhost_vdpa_net_valid_svq_features(features,
                                          &s->vhost_vdpa.migration_blocker);
    } else if (!is_datapath) {
        s->cvq_cmd_out_buffer = qemu_memalign(qemu_real_host_page_size(),
                                            vhost_vdpa_net_cvq_cmd_page_len());
        memset(s->cvq_cmd_out_buffer, 0, vhost_vdpa_net_cvq_cmd_page_len());
        s->status = qemu_memalign(qemu_real_host_page_size(),
                                  vhost_vdpa_net_cvq_cmd_page_len());
        memset(s->status, 0, vhost_vdpa_net_cvq_cmd_page_len());

        s->vhost_vdpa.shadow_vq_ops = &vhost_vdpa_net_svq_ops;
        s->vhost_vdpa.shadow_vq_ops_opaque = s;

        /*
         * TODO: We cannot migrate devices with CVQ as there is no way to set
         * the device state (MAC, MQ, etc) before starting the datapath.
         *
         * Migration blocker ownership now belongs to s->vhost_vdpa.
         */
        error_setg(&s->vhost_vdpa.migration_blocker,
                   "net vdpa cannot migrate with CVQ feature");
    }
    ret = vhost_vdpa_add(nc, (void *)&s->vhost_vdpa, queue_pair_index, nvqs);
    if (ret) {
        qemu_del_net_client(nc);
        return NULL;
    }
    return nc;
}

static int vhost_vdpa_get_features(int fd, uint64_t *features, Error **errp)
{
    int ret = ioctl(fd, VHOST_GET_FEATURES, features);
    if (unlikely(ret < 0)) {
        error_setg_errno(errp, errno,
                         "Fail to query features from vhost-vDPA device");
    }
    return ret;
}
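
/*
 * Read the maximum number of queue pairs from the device config space.
 * Sets @has_cvq from VIRTIO_NET_F_CTRL_VQ and returns 1 when the device
 * does not offer VIRTIO_NET_F_MQ.
 */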
static int vhost_vdpa_get_max_queue_pairs(int fd, uint64_t features,
                                          int *has_cvq, Error **errp)
{
    unsigned long config_size = offsetof(struct vhost_vdpa_config, buf);
    g_autofree struct vhost_vdpa_config *config = NULL;
    __virtio16 *max_queue_pairs;
    int ret;

    if (features & (1 << VIRTIO_NET_F_CTRL_VQ)) {
        *has_cvq = 1;
    } else {
        *has_cvq = 0;
    }

    if (features & (1 << VIRTIO_NET_F_MQ)) {
        config = g_malloc0(config_size + sizeof(*max_queue_pairs));
        config->off = offsetof(struct virtio_net_config, max_virtqueue_pairs);
        config->len = sizeof(*max_queue_pairs);

        ret = ioctl(fd, VHOST_VDPA_GET_CONFIG, config);
        if (ret) {
            error_setg(errp, "Fail to get config from vhost-vDPA device");
            return ret;
        }

        max_queue_pairs = (__virtio16 *)&config->buf;

        return lduw_le_p(max_queue_pairs);
    }

    return 1;
}
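
/*
 * net_init_vhost_vdpa() is the -netdev vhost-vdpa entry point: it opens the
 * vhost-vdpa character device (or reuses the given fd), queries features and
 * the IOVA range, and creates one net client per data queue pair plus an
 * optional one for the control virtqueue.
 */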
int net_init_vhost_vdpa(const Netdev *netdev, const char *name,
                        NetClientState *peer, Error **errp)
{
    const NetdevVhostVDPAOptions *opts;
    uint64_t features;
    int vdpa_device_fd;
    g_autofree NetClientState **ncs = NULL;
    struct vhost_vdpa_iova_range iova_range;
    NetClientState *nc;
    int queue_pairs, r, i = 0, has_cvq = 0;

    assert(netdev->type == NET_CLIENT_DRIVER_VHOST_VDPA);
    opts = &netdev->u.vhost_vdpa;
    if (!opts->vhostdev && !opts->vhostfd) {
        error_setg(errp,
                   "vhost-vdpa: neither vhostdev= nor vhostfd= was specified");
        return -1;
    }

    if (opts->vhostdev && opts->vhostfd) {
        error_setg(errp,
                   "vhost-vdpa: vhostdev= and vhostfd= are mutually exclusive");
        return -1;
    }

    if (opts->vhostdev) {
        vdpa_device_fd = qemu_open(opts->vhostdev, O_RDWR, errp);
        if (vdpa_device_fd == -1) {
            return -errno;
        }
    } else {
        /* has_vhostfd */
        vdpa_device_fd = monitor_fd_param(monitor_cur(), opts->vhostfd, errp);
        if (vdpa_device_fd == -1) {
            error_prepend(errp, "vhost-vdpa: unable to parse vhostfd: ");
            return -1;
        }
    }

    r = vhost_vdpa_get_features(vdpa_device_fd, &features, errp);
    if (unlikely(r < 0)) {
        goto err;
    }

    queue_pairs = vhost_vdpa_get_max_queue_pairs(vdpa_device_fd, features,
                                                 &has_cvq, errp);
    if (queue_pairs < 0) {
        qemu_close(vdpa_device_fd);
        return queue_pairs;
    }

    r = vhost_vdpa_get_iova_range(vdpa_device_fd, &iova_range);
    if (unlikely(r < 0)) {
        error_setg(errp, "vhost-vdpa: get iova range failed: %s",
                   strerror(-r));
        goto err;
    }

    if (opts->x_svq && !vhost_vdpa_net_valid_svq_features(features, errp)) {
        goto err;
    }

    ncs = g_malloc0(sizeof(*ncs) * queue_pairs);

    for (i = 0; i < queue_pairs; i++) {
        ncs[i] = net_vhost_vdpa_init(peer, TYPE_VHOST_VDPA, name,
                                     vdpa_device_fd, i, 2, true, opts->x_svq,
                                     iova_range, features);
        if (!ncs[i]) {
            goto err;
        }
    }

    if (has_cvq) {
        nc = net_vhost_vdpa_init(peer, TYPE_VHOST_VDPA, name,
                                 vdpa_device_fd, i, 1, false,
                                 opts->x_svq, iova_range, features);
        if (!nc) {
            goto err;
        }
    }

    return 0;

err:
    if (i) {
        for (i--; i >= 0; i--) {
            qemu_del_net_client(ncs[i]);
        }
    }

    qemu_close(vdpa_device_fd);

    return -1;
}