/*
 * vhost-vdpa
 *
 * Copyright(c) 2017-2018 Intel Corporation.
 * Copyright(c) 2020 Red Hat, Inc.
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include "qemu/osdep.h"
#include <linux/vhost.h>
#include <linux/vfio.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include "hw/virtio/vhost.h"
#include "hw/virtio/vhost-backend.h"
#include "hw/virtio/virtio-net.h"
#include "hw/virtio/vhost-shadow-virtqueue.h"
#include "hw/virtio/vhost-vdpa.h"
#include "exec/address-spaces.h"
#include "qemu/cutils.h"
#include "qemu/main-loop.h"
#include "cpu.h"
#include "trace.h"
#include "qapi/error.h"

/*
 * Return one past the end of the section. Be careful with uint64_t
 * conversions!
 */
static Int128 vhost_vdpa_section_end(const MemoryRegionSection *section)
{
    Int128 llend = int128_make64(section->offset_within_address_space);
    llend = int128_add(llend, section->size);
    llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK));

    return llend;
}

static bool vhost_vdpa_listener_skipped_section(MemoryRegionSection *section,
                                                uint64_t iova_min,
                                                uint64_t iova_max)
{
    Int128 llend;

    if ((!memory_region_is_ram(section->mr) &&
         !memory_region_is_iommu(section->mr)) ||
        memory_region_is_protected(section->mr) ||
        /* vhost-vDPA doesn't allow MMIO to be mapped */
        memory_region_is_ram_device(section->mr)) {
        return true;
    }

    if (section->offset_within_address_space < iova_min) {
        error_report("RAM section out of device range (min=0x%" PRIx64
                     ", addr=0x%" HWADDR_PRIx ")",
                     iova_min, section->offset_within_address_space);
        return true;
    }

    llend = vhost_vdpa_section_end(section);
    if (int128_gt(llend, int128_make64(iova_max))) {
        error_report("RAM section out of device range (max=0x%" PRIx64
                     ", end addr=0x%" PRIx64 ")",
                     iova_max, int128_get64(llend));
        return true;
    }

    return false;
}

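/*
 * Add a host virtual address range to the device IOTLB: build a
 * VHOST_IOTLB_UPDATE message and write it to the vhost-vdpa device fd.
 * Returns 0 on success, -EIO if the write fails.
 */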
static int vhost_vdpa_dma_map(struct vhost_vdpa *v, hwaddr iova, hwaddr size,
                              void *vaddr, bool readonly)
{
    struct vhost_msg_v2 msg = {};
    int fd = v->device_fd;
    int ret = 0;

    msg.type = v->msg_type;
    msg.iotlb.iova = iova;
    msg.iotlb.size = size;
    msg.iotlb.uaddr = (uint64_t)(uintptr_t)vaddr;
    msg.iotlb.perm = readonly ? VHOST_ACCESS_RO : VHOST_ACCESS_RW;
    msg.iotlb.type = VHOST_IOTLB_UPDATE;

    trace_vhost_vdpa_dma_map(v, fd, msg.type, msg.iotlb.iova, msg.iotlb.size,
                             msg.iotlb.uaddr, msg.iotlb.perm, msg.iotlb.type);

    if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
        error_report("failed to write, fd=%d, errno=%d (%s)",
                     fd, errno, strerror(errno));
        return -EIO;
    }

    return ret;
}

static int vhost_vdpa_dma_unmap(struct vhost_vdpa *v, hwaddr iova,
                                hwaddr size)
{
    struct vhost_msg_v2 msg = {};
    int fd = v->device_fd;
    int ret = 0;

    msg.type = v->msg_type;
    msg.iotlb.iova = iova;
    msg.iotlb.size = size;
    msg.iotlb.type = VHOST_IOTLB_INVALIDATE;

    trace_vhost_vdpa_dma_unmap(v, fd, msg.type, msg.iotlb.iova,
                               msg.iotlb.size, msg.iotlb.type);

    if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
        error_report("failed to write, fd=%d, errno=%d (%s)",
                     fd, errno, strerror(errno));
        return -EIO;
    }

    return ret;
}

static void vhost_vdpa_listener_begin_batch(struct vhost_vdpa *v)
{
    int fd = v->device_fd;
    struct vhost_msg_v2 msg = {
        .type = v->msg_type,
        .iotlb.type = VHOST_IOTLB_BATCH_BEGIN,
    };

    trace_vhost_vdpa_listener_begin_batch(v, fd, msg.type, msg.iotlb.type);
    if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
        error_report("failed to write, fd=%d, errno=%d (%s)",
                     fd, errno, strerror(errno));
    }
}

static void vhost_vdpa_iotlb_batch_begin_once(struct vhost_vdpa *v)
{
    if (v->dev->backend_cap & (0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH) &&
        !v->iotlb_batch_begin_sent) {
        vhost_vdpa_listener_begin_batch(v);
    }

    v->iotlb_batch_begin_sent = true;
}

static void vhost_vdpa_listener_commit(MemoryListener *listener)
{
    struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener);
    struct vhost_dev *dev = v->dev;
    struct vhost_msg_v2 msg = {};
    int fd = v->device_fd;

    if (!(dev->backend_cap & (0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH))) {
        return;
    }

    if (!v->iotlb_batch_begin_sent) {
        return;
    }

    msg.type = v->msg_type;
    msg.iotlb.type = VHOST_IOTLB_BATCH_END;

    trace_vhost_vdpa_listener_commit(v, fd, msg.type, msg.iotlb.type);
    if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
        error_report("failed to write, fd=%d, errno=%d (%s)",
                     fd, errno, strerror(errno));
    }

    v->iotlb_batch_begin_sent = false;
}

static void vhost_vdpa_listener_region_add(MemoryListener *listener,
                                           MemoryRegionSection *section)
{
    struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener);
    hwaddr iova;
    Int128 llend, llsize;
    void *vaddr;
    int ret;

    if (vhost_vdpa_listener_skipped_section(section, v->iova_range.first,
                                            v->iova_range.last)) {
        return;
    }

    if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
                 (section->offset_within_region & ~TARGET_PAGE_MASK))) {
        error_report("%s received unaligned region", __func__);
        return;
    }

    iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
    llend = vhost_vdpa_section_end(section);
    if (int128_ge(int128_make64(iova), llend)) {
        return;
    }

    memory_region_ref(section->mr);

    /* Here we assume that memory_region_is_ram(section->mr)==true */

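    /*
     * Compute the host virtual address backing the start of this section:
     * RAM block base plus the section's offset within its region, adjusted
     * for the page alignment applied to iova above.
     */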
    vaddr = memory_region_get_ram_ptr(section->mr) +
            section->offset_within_region +
            (iova - section->offset_within_address_space);

    trace_vhost_vdpa_listener_region_add(v, iova, int128_get64(llend),
                                         vaddr, section->readonly);

    llsize = int128_sub(llend, int128_make64(iova));
    if (v->shadow_vqs_enabled) {
        DMAMap mem_region = {
            .translated_addr = (hwaddr)(uintptr_t)vaddr,
            .size = int128_get64(llsize) - 1,
            .perm = IOMMU_ACCESS_FLAG(true, section->readonly),
        };

        int r = vhost_iova_tree_map_alloc(v->iova_tree, &mem_region);
        if (unlikely(r != IOVA_OK)) {
            error_report("Can't allocate a mapping (%d)", r);
            goto fail;
        }

        iova = mem_region.iova;
    }

    vhost_vdpa_iotlb_batch_begin_once(v);
    ret = vhost_vdpa_dma_map(v, iova, int128_get64(llsize),
                             vaddr, section->readonly);
    if (ret) {
        error_report("vhost vdpa map fail!");
        goto fail;
    }

    return;

fail:
    /*
     * On the initfn path, store the first error in the container so we
     * can gracefully fail. At runtime, there's not much we can do other
     * than throw a hardware error.
     */
    error_report("vhost-vdpa: DMA mapping failed, unable to continue");
    return;

}

static void vhost_vdpa_listener_region_del(MemoryListener *listener,
                                           MemoryRegionSection *section)
{
    struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener);
    hwaddr iova;
    Int128 llend, llsize;
    int ret;

    if (vhost_vdpa_listener_skipped_section(section, v->iova_range.first,
                                            v->iova_range.last)) {
        return;
    }

    if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
                 (section->offset_within_region & ~TARGET_PAGE_MASK))) {
        error_report("%s received unaligned region", __func__);
        return;
    }

    iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
    llend = vhost_vdpa_section_end(section);

    trace_vhost_vdpa_listener_region_del(v, iova, int128_get64(llend));

    if (int128_ge(int128_make64(iova), llend)) {
        return;
    }

    llsize = int128_sub(llend, int128_make64(iova));

    if (v->shadow_vqs_enabled) {
        const DMAMap *result;
        const void *vaddr = memory_region_get_ram_ptr(section->mr) +
            section->offset_within_region +
            (iova - section->offset_within_address_space);
        DMAMap mem_region = {
            .translated_addr = (hwaddr)(uintptr_t)vaddr,
            .size = int128_get64(llsize) - 1,
        };

        result = vhost_iova_tree_find_iova(v->iova_tree, &mem_region);
        iova = result->iova;
        vhost_iova_tree_remove(v->iova_tree, &mem_region);
    }
    vhost_vdpa_iotlb_batch_begin_once(v);
    ret = vhost_vdpa_dma_unmap(v, iova, int128_get64(llsize));
    if (ret) {
        error_report("vhost_vdpa dma unmap error!");
    }

    memory_region_unref(section->mr);
}

/*
 * The IOTLB API is used by vhost-vdpa, which requires incremental updating
 * of the mapping. So we cannot use the generic vhost memory listener, which
 * depends on addnop().
 */
static const MemoryListener vhost_vdpa_memory_listener = {
    .name = "vhost-vdpa",
    .commit = vhost_vdpa_listener_commit,
    .region_add = vhost_vdpa_listener_region_add,
    .region_del = vhost_vdpa_listener_region_del,
};

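/*
 * Issue an ioctl on the vhost-vdpa device fd, converting the usual
 * "-1 plus errno" convention of ioctl(2) into a negative errno return.
 */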
static int vhost_vdpa_call(struct vhost_dev *dev, unsigned long int request,
                           void *arg)
{
    struct vhost_vdpa *v = dev->opaque;
    int fd = v->device_fd;
    int ret;

    assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);

    ret = ioctl(fd, request, arg);
    return ret < 0 ? -errno : ret;
}

static int vhost_vdpa_add_status(struct vhost_dev *dev, uint8_t status)
{
    uint8_t s;
    int ret;

    trace_vhost_vdpa_add_status(dev, status);
    ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_STATUS, &s);
    if (ret < 0) {
        return ret;
    }

    s |= status;

    ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &s);
    if (ret < 0) {
        return ret;
    }

    ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_STATUS, &s);
    if (ret < 0) {
        return ret;
    }

    if (!(s & status)) {
        return -EIO;
    }

    return 0;
}

static void vhost_vdpa_get_iova_range(struct vhost_vdpa *v)
{
    int ret = vhost_vdpa_call(v->dev, VHOST_VDPA_GET_IOVA_RANGE,
                              &v->iova_range);
    if (ret != 0) {
        v->iova_range.first = 0;
        v->iova_range.last = UINT64_MAX;
    }

    trace_vhost_vdpa_get_iova_range(v->dev, v->iova_range.first,
                                    v->iova_range.last);
}

/*
 * This function is meant for requests that only need to be applied once.
 * Typically such a request occurs at the beginning of operation, before
 * the queues are set up. It should not be used for a request that keeps
 * operating until all queues are set, which would need to check
 * dev->vq_index_end instead.
 */
static bool vhost_vdpa_first_dev(struct vhost_dev *dev)
{
    struct vhost_vdpa *v = dev->opaque;

    return v->index == 0;
}

static int vhost_vdpa_get_dev_features(struct vhost_dev *dev,
                                       uint64_t *features)
{
    int ret;

    ret = vhost_vdpa_call(dev, VHOST_GET_FEATURES, features);
    trace_vhost_vdpa_get_features(dev, *features);
    return ret;
}

static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v,
                               Error **errp)
{
    g_autoptr(GPtrArray) shadow_vqs = NULL;
    uint64_t dev_features, svq_features;
    int r;
    bool ok;

    if (!v->shadow_vqs_enabled) {
        return 0;
    }

    r = vhost_vdpa_get_dev_features(hdev, &dev_features);
    if (r != 0) {
        error_setg_errno(errp, -r, "Can't get vdpa device features");
        return r;
    }

    svq_features = dev_features;
    ok = vhost_svq_valid_features(svq_features, errp);
    if (unlikely(!ok)) {
        return -1;
    }

    shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_svq_free);
    for (unsigned n = 0; n < hdev->nvqs; ++n) {
        g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new(v->iova_tree);

        if (unlikely(!svq)) {
            error_setg(errp, "Cannot create svq %u", n);
            return -1;
        }
        g_ptr_array_add(shadow_vqs, g_steal_pointer(&svq));
    }

    v->shadow_vqs = g_steal_pointer(&shadow_vqs);
    return 0;
}

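/*
 * Backend init, called once per vhost_vdpa instance. It disables RAM
 * discarding (guest memory may end up pinned, as with VFIO), prepares the
 * memory listener and the optional shadow virtqueues, and queries the
 * device's usable IOVA range. Only the first device of the group also
 * acknowledges the ACKNOWLEDGE and DRIVER status bits here.
 */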
static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque, Error **errp)
{
    struct vhost_vdpa *v;
    assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
    trace_vhost_vdpa_init(dev, opaque);
    int ret;

    /*
     * Similar to VFIO, we end up pinning all guest memory and have to
     * disable discarding of RAM.
     */
    ret = ram_block_discard_disable(true);
    if (ret) {
        error_report("Cannot set discarding of RAM broken");
        return ret;
    }

    v = opaque;
    v->dev = dev;
    dev->opaque = opaque;
    v->listener = vhost_vdpa_memory_listener;
    v->msg_type = VHOST_IOTLB_MSG_V2;
    ret = vhost_vdpa_init_svq(dev, v, errp);
    if (ret) {
        goto err;
    }

    vhost_vdpa_get_iova_range(v);

    if (!vhost_vdpa_first_dev(dev)) {
        return 0;
    }

    vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE |
                               VIRTIO_CONFIG_S_DRIVER);

    return 0;

err:
    ram_block_discard_disable(false);
    return ret;
}

static void vhost_vdpa_host_notifier_uninit(struct vhost_dev *dev,
                                            int queue_index)
{
    size_t page_size = qemu_real_host_page_size();
    struct vhost_vdpa *v = dev->opaque;
    VirtIODevice *vdev = dev->vdev;
    VhostVDPAHostNotifier *n;

    n = &v->notifier[queue_index];

    if (n->addr) {
        virtio_queue_set_host_notifier_mr(vdev, queue_index, &n->mr, false);
        object_unparent(OBJECT(&n->mr));
        munmap(n->addr, page_size);
        n->addr = NULL;
    }
}

static int vhost_vdpa_host_notifier_init(struct vhost_dev *dev, int queue_index)
{
    size_t page_size = qemu_real_host_page_size();
    struct vhost_vdpa *v = dev->opaque;
    VirtIODevice *vdev = dev->vdev;
    VhostVDPAHostNotifier *n;
    int fd = v->device_fd;
    void *addr;
    char *name;

    vhost_vdpa_host_notifier_uninit(dev, queue_index);

    n = &v->notifier[queue_index];

    addr = mmap(NULL, page_size, PROT_WRITE, MAP_SHARED, fd,
                queue_index * page_size);
    if (addr == MAP_FAILED) {
        goto err;
    }

    name = g_strdup_printf("vhost-vdpa/host-notifier@%p mmaps[%d]",
                           v, queue_index);
    memory_region_init_ram_device_ptr(&n->mr, OBJECT(vdev), name,
                                      page_size, addr);
    g_free(name);

    if (virtio_queue_set_host_notifier_mr(vdev, queue_index, &n->mr, true)) {
        object_unparent(OBJECT(&n->mr));
        munmap(addr, page_size);
        goto err;
    }
    n->addr = addr;

    return 0;

err:
    return -1;
}

static void vhost_vdpa_host_notifiers_uninit(struct vhost_dev *dev, int n)
{
    int i;

    for (i = dev->vq_index; i < dev->vq_index + n; i++) {
        vhost_vdpa_host_notifier_uninit(dev, i);
    }
}

static void vhost_vdpa_host_notifiers_init(struct vhost_dev *dev)
{
    struct vhost_vdpa *v = dev->opaque;
    int i;

    if (v->shadow_vqs_enabled) {
        /* FIXME SVQ is not compatible with host notifiers mr */
        return;
    }

    for (i = dev->vq_index; i < dev->vq_index + dev->nvqs; i++) {
        if (vhost_vdpa_host_notifier_init(dev, i)) {
            goto err;
        }
    }

    return;

err:
    vhost_vdpa_host_notifiers_uninit(dev, i - dev->vq_index);
    return;
}

static void vhost_vdpa_svq_cleanup(struct vhost_dev *dev)
{
    struct vhost_vdpa *v = dev->opaque;
    size_t idx;

    if (!v->shadow_vqs) {
        return;
    }

    for (idx = 0; idx < v->shadow_vqs->len; ++idx) {
        vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, idx));
    }
    g_ptr_array_free(v->shadow_vqs, true);
}

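/*
 * Backend cleanup, the reverse of vhost_vdpa_init: tear down the host
 * notifiers and the shadow virtqueues, unregister the memory listener and
 * re-enable discarding of RAM.
 */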
static int vhost_vdpa_cleanup(struct vhost_dev *dev)
{
    struct vhost_vdpa *v;
    assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
    v = dev->opaque;
    trace_vhost_vdpa_cleanup(dev, v);
    vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs);
    memory_listener_unregister(&v->listener);
    vhost_vdpa_svq_cleanup(dev);

    dev->opaque = NULL;
    ram_block_discard_disable(false);

    return 0;
}

static int vhost_vdpa_memslots_limit(struct vhost_dev *dev)
{
    trace_vhost_vdpa_memslots_limit(dev, INT_MAX);
    return INT_MAX;
}

static int vhost_vdpa_set_mem_table(struct vhost_dev *dev,
                                    struct vhost_memory *mem)
{
    if (!vhost_vdpa_first_dev(dev)) {
        return 0;
    }

    trace_vhost_vdpa_set_mem_table(dev, mem->nregions, mem->padding);
    if (trace_event_get_state_backends(TRACE_VHOST_VDPA_SET_MEM_TABLE) &&
        trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_REGIONS)) {
        int i;
        for (i = 0; i < mem->nregions; i++) {
            trace_vhost_vdpa_dump_regions(dev, i,
                                          mem->regions[i].guest_phys_addr,
                                          mem->regions[i].memory_size,
                                          mem->regions[i].userspace_addr,
                                          mem->regions[i].flags_padding);
        }
    }
    if (mem->padding) {
        return -EINVAL;
    }

    return 0;
}

static int vhost_vdpa_set_features(struct vhost_dev *dev,
                                   uint64_t features)
{
    struct vhost_vdpa *v = dev->opaque;
    int ret;

    if (!vhost_vdpa_first_dev(dev)) {
        return 0;
    }

    if (v->shadow_vqs_enabled) {
        if ((v->acked_features ^ features) == BIT_ULL(VHOST_F_LOG_ALL)) {
            /*
             * QEMU is just trying to enable or disable logging. SVQ handles
             * this separately, so no need to forward this.
             */
            v->acked_features = features;
            return 0;
        }

        v->acked_features = features;

        /* We must not ack _F_LOG if SVQ is enabled */
        features &= ~BIT_ULL(VHOST_F_LOG_ALL);
    }

    trace_vhost_vdpa_set_features(dev, features);
    ret = vhost_vdpa_call(dev, VHOST_SET_FEATURES, &features);
    if (ret) {
        return ret;
    }

    return vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_FEATURES_OK);
}

static int vhost_vdpa_set_backend_cap(struct vhost_dev *dev)
{
    uint64_t features;
    uint64_t f = 0x1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2 |
        0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH;
    int r;

    if (vhost_vdpa_call(dev, VHOST_GET_BACKEND_FEATURES, &features)) {
        return -EFAULT;
    }

    features &= f;

    if (vhost_vdpa_first_dev(dev)) {
        r = vhost_vdpa_call(dev, VHOST_SET_BACKEND_FEATURES, &features);
        if (r) {
            return -EFAULT;
        }
    }

    dev->backend_cap = features;

    return 0;
}

static int vhost_vdpa_get_device_id(struct vhost_dev *dev,
                                    uint32_t *device_id)
{
    int ret;
    ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_DEVICE_ID, device_id);
    trace_vhost_vdpa_get_device_id(dev, *device_id);
    return ret;
}

static void vhost_vdpa_reset_svq(struct vhost_vdpa *v)
{
    if (!v->shadow_vqs_enabled) {
        return;
    }

    for (unsigned i = 0; i < v->shadow_vqs->len; ++i) {
        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
        vhost_svq_stop(svq);
    }
}

static int vhost_vdpa_reset_device(struct vhost_dev *dev)
{
    struct vhost_vdpa *v = dev->opaque;
    int ret;
    uint8_t status = 0;

    vhost_vdpa_reset_svq(v);

    ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &status);
    trace_vhost_vdpa_reset_device(dev, status);
    return ret;
}

static int vhost_vdpa_get_vq_index(struct vhost_dev *dev, int idx)
{
    assert(idx >= dev->vq_index && idx < dev->vq_index + dev->nvqs);

    trace_vhost_vdpa_get_vq_index(dev, idx, idx);
    return idx;
}

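/*
 * Enable every vring of this vhost device through
 * VHOST_VDPA_SET_VRING_ENABLE. Failures on individual vrings are ignored.
 */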
static int vhost_vdpa_set_vring_ready(struct vhost_dev *dev)
{
    int i;
    trace_vhost_vdpa_set_vring_ready(dev);
    for (i = 0; i < dev->nvqs; ++i) {
        struct vhost_vring_state state = {
            .index = dev->vq_index + i,
            .num = 1,
        };
        vhost_vdpa_call(dev, VHOST_VDPA_SET_VRING_ENABLE, &state);
    }
    return 0;
}

static void vhost_vdpa_dump_config(struct vhost_dev *dev, const uint8_t *config,
                                   uint32_t config_len)
{
    int b, len;
    char line[QEMU_HEXDUMP_LINE_LEN];

    for (b = 0; b < config_len; b += 16) {
        len = config_len - b;
        qemu_hexdump_line(line, b, config, len, false);
        trace_vhost_vdpa_dump_config(dev, line);
    }
}

static int vhost_vdpa_set_config(struct vhost_dev *dev, const uint8_t *data,
                                 uint32_t offset, uint32_t size,
                                 uint32_t flags)
{
    struct vhost_vdpa_config *config;
    int ret;
    unsigned long config_size = offsetof(struct vhost_vdpa_config, buf);

    trace_vhost_vdpa_set_config(dev, offset, size, flags);
    config = g_malloc(size + config_size);
    config->off = offset;
    config->len = size;
    memcpy(config->buf, data, size);
    if (trace_event_get_state_backends(TRACE_VHOST_VDPA_SET_CONFIG) &&
        trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_CONFIG)) {
        vhost_vdpa_dump_config(dev, data, size);
    }
    ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_CONFIG, config);
    g_free(config);
    return ret;
}

static int vhost_vdpa_get_config(struct vhost_dev *dev, uint8_t *config,
                                 uint32_t config_len, Error **errp)
{
    struct vhost_vdpa_config *v_config;
    unsigned long config_size = offsetof(struct vhost_vdpa_config, buf);
    int ret;

    trace_vhost_vdpa_get_config(dev, config, config_len);
    v_config = g_malloc(config_len + config_size);
    v_config->len = config_len;
    v_config->off = 0;
    ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_CONFIG, v_config);
    memcpy(config, v_config->buf, config_len);
    g_free(v_config);
    if (trace_event_get_state_backends(TRACE_VHOST_VDPA_GET_CONFIG) &&
        trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_CONFIG)) {
        vhost_vdpa_dump_config(dev, config, config_len);
    }
    return ret;
}

static int vhost_vdpa_set_dev_vring_base(struct vhost_dev *dev,
                                         struct vhost_vring_state *ring)
{
    trace_vhost_vdpa_set_vring_base(dev, ring->index, ring->num);
    return vhost_vdpa_call(dev, VHOST_SET_VRING_BASE, ring);
}

static int vhost_vdpa_set_vring_dev_kick(struct vhost_dev *dev,
                                         struct vhost_vring_file *file)
{
    trace_vhost_vdpa_set_vring_kick(dev, file->index, file->fd);
    return vhost_vdpa_call(dev, VHOST_SET_VRING_KICK, file);
}

static int vhost_vdpa_set_vring_dev_call(struct vhost_dev *dev,
                                         struct vhost_vring_file *file)
{
    trace_vhost_vdpa_set_vring_call(dev, file->index, file->fd);
    return vhost_vdpa_call(dev, VHOST_SET_VRING_CALL, file);
}

static int vhost_vdpa_set_vring_dev_addr(struct vhost_dev *dev,
                                         struct vhost_vring_addr *addr)
{
    trace_vhost_vdpa_set_vring_addr(dev, addr->index, addr->flags,
                                    addr->desc_user_addr, addr->used_user_addr,
                                    addr->avail_user_addr,
                                    addr->log_guest_addr);

    return vhost_vdpa_call(dev, VHOST_SET_VRING_ADDR, addr);

}

/**
 * Set the shadow virtqueue descriptors to the device
 *
 * @dev: The vhost device model
 * @svq: The shadow virtqueue
 * @idx: The index of the virtqueue in the vhost device
 * @errp: Error
 *
 * Note that this function does not rewind the kick file descriptor if it
 * fails to set the call one.
 */
static int vhost_vdpa_svq_set_fds(struct vhost_dev *dev,
                                  VhostShadowVirtqueue *svq, unsigned idx,
                                  Error **errp)
{
    struct vhost_vring_file file = {
        .index = dev->vq_index + idx,
    };
    const EventNotifier *event_notifier = &svq->hdev_kick;
    int r;

    file.fd = event_notifier_get_fd(event_notifier);
    r = vhost_vdpa_set_vring_dev_kick(dev, &file);
    if (unlikely(r != 0)) {
        error_setg_errno(errp, -r, "Can't set device kick fd");
        return r;
    }

    event_notifier = &svq->hdev_call;
    file.fd = event_notifier_get_fd(event_notifier);
    r = vhost_vdpa_set_vring_dev_call(dev, &file);
    if (unlikely(r != 0)) {
        error_setg_errno(errp, -r, "Can't set device call fd");
    }

    return r;
}

/**
 * Unmap a SVQ area in the device
 */
static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v,
                                      const DMAMap *needle)
{
    const DMAMap *result = vhost_iova_tree_find_iova(v->iova_tree, needle);
    hwaddr size;
    int r;

    if (unlikely(!result)) {
        error_report("Unable to find SVQ address to unmap");
        return false;
    }

    size = ROUND_UP(result->size, qemu_real_host_page_size());
    r = vhost_vdpa_dma_unmap(v, result->iova, size);
    return r == 0;
}

static bool vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev,
                                       const VhostShadowVirtqueue *svq)
{
    DMAMap needle = {};
    struct vhost_vdpa *v = dev->opaque;
    struct vhost_vring_addr svq_addr;
    bool ok;

    vhost_svq_get_vring_addr(svq, &svq_addr);

    needle.translated_addr = svq_addr.desc_user_addr;
    ok = vhost_vdpa_svq_unmap_ring(v, &needle);
    if (unlikely(!ok)) {
        return false;
    }

    needle.translated_addr = svq_addr.used_user_addr;
    return vhost_vdpa_svq_unmap_ring(v, &needle);
}

/**
 * Map the SVQ area in the device
 *
 * @v: Vhost-vdpa device
 * @needle: The area to search iova
 * @errp: Error pointer
 */
static bool vhost_vdpa_svq_map_ring(struct vhost_vdpa *v, DMAMap *needle,
                                    Error **errp)
{
    int r;

    r = vhost_iova_tree_map_alloc(v->iova_tree, needle);
    if (unlikely(r != IOVA_OK)) {
        error_setg(errp, "Cannot allocate iova (%d)", r);
        return false;
    }

    r = vhost_vdpa_dma_map(v, needle->iova, needle->size + 1,
                           (void *)(uintptr_t)needle->translated_addr,
                           needle->perm == IOMMU_RO);
    if (unlikely(r != 0)) {
        error_setg_errno(errp, -r, "Cannot map region to device");
        vhost_iova_tree_remove(v->iova_tree, needle);
    }

    return r == 0;
}

/**
 * Map the shadow virtqueue rings in the device
 *
 * @dev: The vhost device
 * @svq: The shadow virtqueue
 * @addr: Assigned IOVA addresses
 * @errp: Error pointer
 */
static bool vhost_vdpa_svq_map_rings(struct vhost_dev *dev,
                                     const VhostShadowVirtqueue *svq,
                                     struct vhost_vring_addr *addr,
                                     Error **errp)
{
    DMAMap device_region, driver_region;
    struct vhost_vring_addr svq_addr;
    struct vhost_vdpa *v = dev->opaque;
    size_t device_size = vhost_svq_device_area_size(svq);
    size_t driver_size = vhost_svq_driver_area_size(svq);
    size_t avail_offset;
    bool ok;

    ERRP_GUARD();
    vhost_svq_get_vring_addr(svq, &svq_addr);

    driver_region = (DMAMap) {
        .translated_addr = svq_addr.desc_user_addr,
        .size = driver_size - 1,
        .perm = IOMMU_RO,
    };
    ok = vhost_vdpa_svq_map_ring(v, &driver_region, errp);
    if (unlikely(!ok)) {
        error_prepend(errp, "Cannot create vq driver region: ");
        return false;
    }
    addr->desc_user_addr = driver_region.iova;
    avail_offset = svq_addr.avail_user_addr - svq_addr.desc_user_addr;
    addr->avail_user_addr = driver_region.iova + avail_offset;

    device_region = (DMAMap) {
        .translated_addr = svq_addr.used_user_addr,
        .size = device_size - 1,
        .perm = IOMMU_RW,
    };
    ok = vhost_vdpa_svq_map_ring(v, &device_region, errp);
    if (unlikely(!ok)) {
        error_prepend(errp, "Cannot create vq device region: ");
        vhost_vdpa_svq_unmap_ring(v, &driver_region);
    }
    addr->used_user_addr = device_region.iova;

    return ok;
}

static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
                                 VhostShadowVirtqueue *svq, unsigned idx,
                                 Error **errp)
{
    uint16_t vq_index = dev->vq_index + idx;
    struct vhost_vring_state s = {
        .index = vq_index,
    };
    int r;

    r = vhost_vdpa_set_dev_vring_base(dev, &s);
    if (unlikely(r)) {
        error_setg_errno(errp, -r, "Cannot set vring base");
        return false;
    }

    r = vhost_vdpa_svq_set_fds(dev, svq, idx, errp);
    return r == 0;
}

static bool vhost_vdpa_svqs_start(struct vhost_dev *dev)
{
    struct vhost_vdpa *v = dev->opaque;
    Error *err = NULL;
    unsigned i;

    if (!v->shadow_vqs) {
        return true;
    }

    for (i = 0; i < v->shadow_vqs->len; ++i) {
        VirtQueue *vq = virtio_get_queue(dev->vdev, dev->vq_index + i);
        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
        struct vhost_vring_addr addr = {
            .index = dev->vq_index + i,
        };
        int r;
        bool ok = vhost_vdpa_svq_setup(dev, svq, i, &err);
        if (unlikely(!ok)) {
            goto err;
        }

        vhost_svq_start(svq, dev->vdev, vq);
        ok = vhost_vdpa_svq_map_rings(dev, svq, &addr, &err);
        if (unlikely(!ok)) {
            goto err_map;
        }

        /* Override vring GPA set by vhost subsystem */
        r = vhost_vdpa_set_vring_dev_addr(dev, &addr);
        if (unlikely(r != 0)) {
            error_setg_errno(&err, -r, "Cannot set device address");
            goto err_set_addr;
        }
    }

    return true;

err_set_addr:
    vhost_vdpa_svq_unmap_rings(dev, g_ptr_array_index(v->shadow_vqs, i));

err_map:
    vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, i));

err:
    error_reportf_err(err, "Cannot setup SVQ %u: ", i);
    for (unsigned j = 0; j < i; ++j) {
        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, j);
        vhost_vdpa_svq_unmap_rings(dev, svq);
        vhost_svq_stop(svq);
    }

    return false;
}

static bool vhost_vdpa_svqs_stop(struct vhost_dev *dev)
{
    struct vhost_vdpa *v = dev->opaque;

    if (!v->shadow_vqs) {
        return true;
    }

    for (unsigned i = 0; i < v->shadow_vqs->len; ++i) {
        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
        bool ok = vhost_vdpa_svq_unmap_rings(dev, svq);
        if (unlikely(!ok)) {
            return false;
        }
    }

    return true;
}

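/*
 * Start/stop hook: on start, set up host notifiers and shadow virtqueues
 * for this device and enable its vrings; only when the last device of the
 * group starts does it register the memory listener and set DRIVER_OK.
 * On stop, the process is reversed and the device is reset.
 */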
static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started)
{
    struct vhost_vdpa *v = dev->opaque;
    bool ok;
    trace_vhost_vdpa_dev_start(dev, started);

    if (started) {
        vhost_vdpa_host_notifiers_init(dev);
        ok = vhost_vdpa_svqs_start(dev);
        if (unlikely(!ok)) {
            return -1;
        }
        vhost_vdpa_set_vring_ready(dev);
    } else {
        ok = vhost_vdpa_svqs_stop(dev);
        if (unlikely(!ok)) {
            return -1;
        }
        vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs);
    }

    if (dev->vq_index + dev->nvqs != dev->vq_index_end) {
        return 0;
    }

    if (started) {
        memory_listener_register(&v->listener, &address_space_memory);
        return vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_DRIVER_OK);
    } else {
        vhost_vdpa_reset_device(dev);
        vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE |
                                   VIRTIO_CONFIG_S_DRIVER);
        memory_listener_unregister(&v->listener);

        return 0;
    }
}

static int vhost_vdpa_set_log_base(struct vhost_dev *dev, uint64_t base,
                                   struct vhost_log *log)
{
    struct vhost_vdpa *v = dev->opaque;
    if (v->shadow_vqs_enabled || !vhost_vdpa_first_dev(dev)) {
        return 0;
    }

    trace_vhost_vdpa_set_log_base(dev, base, log->size, log->refcnt, log->fd,
                                  log->log);
    return vhost_vdpa_call(dev, VHOST_SET_LOG_BASE, &base);
}

static int vhost_vdpa_set_vring_addr(struct vhost_dev *dev,
                                     struct vhost_vring_addr *addr)
{
    struct vhost_vdpa *v = dev->opaque;

    if (v->shadow_vqs_enabled) {
        /*
         * Device vring addr was set at device start. SVQ base is handled by
         * VirtQueue code.
         */
        return 0;
    }

    return vhost_vdpa_set_vring_dev_addr(dev, addr);
}

static int vhost_vdpa_set_vring_num(struct vhost_dev *dev,
                                    struct vhost_vring_state *ring)
{
    trace_vhost_vdpa_set_vring_num(dev, ring->index, ring->num);
    return vhost_vdpa_call(dev, VHOST_SET_VRING_NUM, ring);
}

static int vhost_vdpa_set_vring_base(struct vhost_dev *dev,
                                     struct vhost_vring_state *ring)
{
    struct vhost_vdpa *v = dev->opaque;

    if (v->shadow_vqs_enabled) {
        /*
         * Device vring base was set at device start. SVQ base is handled by
         * VirtQueue code.
         */
        return 0;
    }

    return vhost_vdpa_set_dev_vring_base(dev, ring);
}

static int vhost_vdpa_get_vring_base(struct vhost_dev *dev,
                                     struct vhost_vring_state *ring)
{
    struct vhost_vdpa *v = dev->opaque;
    int vdpa_idx = ring->index - dev->vq_index;
    int ret;

    if (v->shadow_vqs_enabled) {
        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx);

        /*
         * Setting base as last used idx, so the destination will see as
         * available all the entries that the device did not use, including
         * the in-flight ones.
         *
         * TODO: This is ok for networking, but other kinds of devices might
         * have problems with these retransmissions.
         */
        ring->num = svq->last_used_idx;
        return 0;
    }

    ret = vhost_vdpa_call(dev, VHOST_GET_VRING_BASE, ring);
    trace_vhost_vdpa_get_vring_base(dev, ring->index, ring->num);
    return ret;
}

static int vhost_vdpa_set_vring_kick(struct vhost_dev *dev,
                                     struct vhost_vring_file *file)
{
    struct vhost_vdpa *v = dev->opaque;
    int vdpa_idx = file->index - dev->vq_index;

    if (v->shadow_vqs_enabled) {
        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx);
        vhost_svq_set_svq_kick_fd(svq, file->fd);
        return 0;
    } else {
        return vhost_vdpa_set_vring_dev_kick(dev, file);
    }
}

static int vhost_vdpa_set_vring_call(struct vhost_dev *dev,
                                     struct vhost_vring_file *file)
{
    struct vhost_vdpa *v = dev->opaque;

    if (v->shadow_vqs_enabled) {
        int vdpa_idx = file->index - dev->vq_index;
        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx);

        vhost_svq_set_svq_call_fd(svq, file->fd);
        return 0;
    } else {
        return vhost_vdpa_set_vring_dev_call(dev, file);
    }
}

static int vhost_vdpa_get_features(struct vhost_dev *dev,
                                   uint64_t *features)
{
    struct vhost_vdpa *v = dev->opaque;
    int ret = vhost_vdpa_get_dev_features(dev, features);

    if (ret == 0 && v->shadow_vqs_enabled) {
        /* Add SVQ logging capabilities */
        *features |= BIT_ULL(VHOST_F_LOG_ALL);
    }

    return ret;
}

static int vhost_vdpa_set_owner(struct vhost_dev *dev)
{
    if (!vhost_vdpa_first_dev(dev)) {
        return 0;
    }

    trace_vhost_vdpa_set_owner(dev);
    return vhost_vdpa_call(dev, VHOST_SET_OWNER, NULL);
}

static int vhost_vdpa_vq_get_addr(struct vhost_dev *dev,
                                  struct vhost_vring_addr *addr,
                                  struct vhost_virtqueue *vq)
{
    assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
    addr->desc_user_addr = (uint64_t)(unsigned long)vq->desc_phys;
    addr->avail_user_addr = (uint64_t)(unsigned long)vq->avail_phys;
    addr->used_user_addr = (uint64_t)(unsigned long)vq->used_phys;
    trace_vhost_vdpa_vq_get_addr(dev, vq, addr->desc_user_addr,
                                 addr->avail_user_addr, addr->used_user_addr);
    return 0;
}

static bool vhost_vdpa_force_iommu(struct vhost_dev *dev)
{
    return true;
}

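/*
 * vhost backend ops for vhost-vdpa. Callbacks left as NULL are not needed
 * by this backend.
 */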
const VhostOps vdpa_ops = {
    .backend_type = VHOST_BACKEND_TYPE_VDPA,
    .vhost_backend_init = vhost_vdpa_init,
    .vhost_backend_cleanup = vhost_vdpa_cleanup,
    .vhost_set_log_base = vhost_vdpa_set_log_base,
    .vhost_set_vring_addr = vhost_vdpa_set_vring_addr,
    .vhost_set_vring_num = vhost_vdpa_set_vring_num,
    .vhost_set_vring_base = vhost_vdpa_set_vring_base,
    .vhost_get_vring_base = vhost_vdpa_get_vring_base,
    .vhost_set_vring_kick = vhost_vdpa_set_vring_kick,
    .vhost_set_vring_call = vhost_vdpa_set_vring_call,
    .vhost_get_features = vhost_vdpa_get_features,
    .vhost_set_backend_cap = vhost_vdpa_set_backend_cap,
    .vhost_set_owner = vhost_vdpa_set_owner,
    .vhost_set_vring_endian = NULL,
    .vhost_backend_memslots_limit = vhost_vdpa_memslots_limit,
    .vhost_set_mem_table = vhost_vdpa_set_mem_table,
    .vhost_set_features = vhost_vdpa_set_features,
    .vhost_reset_device = vhost_vdpa_reset_device,
    .vhost_get_vq_index = vhost_vdpa_get_vq_index,
    .vhost_get_config = vhost_vdpa_get_config,
    .vhost_set_config = vhost_vdpa_set_config,
    .vhost_requires_shm_log = NULL,
    .vhost_migration_done = NULL,
    .vhost_backend_can_merge = NULL,
    .vhost_net_set_mtu = NULL,
    .vhost_set_iotlb_callback = NULL,
    .vhost_send_device_iotlb_msg = NULL,
    .vhost_dev_start = vhost_vdpa_dev_start,
    .vhost_get_device_id = vhost_vdpa_get_device_id,
    .vhost_vq_get_addr = vhost_vdpa_vq_get_addr,
    .vhost_force_iommu = vhost_vdpa_force_iommu,
};