/*
 * vhost-vdpa
 *
 * Copyright(c) 2017-2018 Intel Corporation.
 * Copyright(c) 2020 Red Hat, Inc.
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include "qemu/osdep.h"
#include <linux/vhost.h>
#include <linux/vfio.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include "hw/virtio/vhost.h"
#include "hw/virtio/vhost-backend.h"
#include "hw/virtio/virtio-net.h"
#include "hw/virtio/vhost-shadow-virtqueue.h"
#include "hw/virtio/vhost-vdpa.h"
#include "exec/address-spaces.h"
#include "migration/blocker.h"
#include "qemu/cutils.h"
#include "qemu/main-loop.h"
#include "cpu.h"
#include "trace.h"
#include "qapi/error.h"

/*
 * Return one past the end of the section. Be careful with uint64_t
 * conversions!
 */
static Int128 vhost_vdpa_section_end(const MemoryRegionSection *section)
{
    Int128 llend = int128_make64(section->offset_within_address_space);
    llend = int128_add(llend, section->size);
    llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK));

    return llend;
}

static bool vhost_vdpa_listener_skipped_section(MemoryRegionSection *section,
                                                uint64_t iova_min,
                                                uint64_t iova_max)
{
    Int128 llend;

    if ((!memory_region_is_ram(section->mr) &&
         !memory_region_is_iommu(section->mr)) ||
        memory_region_is_protected(section->mr) ||
        /* vhost-vDPA doesn't allow MMIO to be mapped */
        memory_region_is_ram_device(section->mr)) {
        return true;
    }

    if (section->offset_within_address_space < iova_min) {
        error_report("RAM section out of device range (min=0x%" PRIx64
                     ", addr=0x%" HWADDR_PRIx ")",
                     iova_min, section->offset_within_address_space);
        return true;
    }

    llend = vhost_vdpa_section_end(section);
    if (int128_gt(llend, int128_make64(iova_max))) {
        error_report("RAM section out of device range (max=0x%" PRIx64
                     ", end addr=0x%" PRIx64 ")",
                     iova_max, int128_get64(llend));
        return true;
    }

    return false;
}
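
/*
 * Worked example (an illustration, assuming the common 4 KiB
 * TARGET_PAGE_SIZE): vhost_vdpa_section_end() truncates the raw end of a
 * section down to a page boundary, e.g.
 *
 *     offset_within_address_space = 0x1000
 *     section->size               = 0x1800
 *     raw end                     = 0x2800
 *     llend = 0x2800 & TARGET_PAGE_MASK = 0x2000
 *
 * so a section whose tail does not fill a whole page is clipped rather than
 * rounded up, and vhost_vdpa_listener_skipped_section() compares that
 * clipped end against the IOVA range reported by the device.
 */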
int vhost_vdpa_dma_map(struct vhost_vdpa *v, hwaddr iova, hwaddr size,
                       void *vaddr, bool readonly)
{
    struct vhost_msg_v2 msg = {};
    int fd = v->device_fd;
    int ret = 0;

    msg.type = v->msg_type;
    msg.iotlb.iova = iova;
    msg.iotlb.size = size;
    msg.iotlb.uaddr = (uint64_t)(uintptr_t)vaddr;
    msg.iotlb.perm = readonly ? VHOST_ACCESS_RO : VHOST_ACCESS_RW;
    msg.iotlb.type = VHOST_IOTLB_UPDATE;

    trace_vhost_vdpa_dma_map(v, fd, msg.type, msg.iotlb.iova, msg.iotlb.size,
                             msg.iotlb.uaddr, msg.iotlb.perm, msg.iotlb.type);

    if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
        error_report("failed to write, fd=%d, errno=%d (%s)",
                     fd, errno, strerror(errno));
        return -EIO;
    }

    return ret;
}

int vhost_vdpa_dma_unmap(struct vhost_vdpa *v, hwaddr iova, hwaddr size)
{
    struct vhost_msg_v2 msg = {};
    int fd = v->device_fd;
    int ret = 0;

    msg.type = v->msg_type;
    msg.iotlb.iova = iova;
    msg.iotlb.size = size;
    msg.iotlb.type = VHOST_IOTLB_INVALIDATE;

    trace_vhost_vdpa_dma_unmap(v, fd, msg.type, msg.iotlb.iova,
                               msg.iotlb.size, msg.iotlb.type);

    if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
        error_report("failed to write, fd=%d, errno=%d (%s)",
                     fd, errno, strerror(errno));
        return -EIO;
    }

    return ret;
}

static void vhost_vdpa_listener_begin_batch(struct vhost_vdpa *v)
{
    int fd = v->device_fd;
    struct vhost_msg_v2 msg = {
        .type = v->msg_type,
        .iotlb.type = VHOST_IOTLB_BATCH_BEGIN,
    };

    trace_vhost_vdpa_listener_begin_batch(v, fd, msg.type, msg.iotlb.type);
    if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
        error_report("failed to write, fd=%d, errno=%d (%s)",
                     fd, errno, strerror(errno));
    }
}

static void vhost_vdpa_iotlb_batch_begin_once(struct vhost_vdpa *v)
{
    if (v->dev->backend_cap & (0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH) &&
        !v->iotlb_batch_begin_sent) {
        vhost_vdpa_listener_begin_batch(v);
    }

    v->iotlb_batch_begin_sent = true;
}

static void vhost_vdpa_listener_commit(MemoryListener *listener)
{
    struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener);
    struct vhost_dev *dev = v->dev;
    struct vhost_msg_v2 msg = {};
    int fd = v->device_fd;

    if (!(dev->backend_cap & (0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH))) {
        return;
    }

    if (!v->iotlb_batch_begin_sent) {
        return;
    }

    msg.type = v->msg_type;
    msg.iotlb.type = VHOST_IOTLB_BATCH_END;

    trace_vhost_vdpa_listener_commit(v, fd, msg.type, msg.iotlb.type);
    if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
        error_report("failed to write, fd=%d, errno=%d (%s)",
                     fd, errno, strerror(errno));
    }

    v->iotlb_batch_begin_sent = false;
}
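
/*
 * Sketch of the message flow when IOTLB batching is negotiated (a summary of
 * the helpers above, not additional code): for one round of memory listener
 * updates the kernel sees, over v->device_fd,
 *
 *     VHOST_IOTLB_BATCH_BEGIN            (sent once, on the first map/unmap)
 *     VHOST_IOTLB_UPDATE / INVALIDATE    (one per region_add/region_del)
 *     VHOST_IOTLB_BATCH_END              (from vhost_vdpa_listener_commit)
 *
 * so it can commit all mappings in one go instead of per message.
 */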
static void vhost_vdpa_listener_region_add(MemoryListener *listener,
                                           MemoryRegionSection *section)
{
    struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener);
    hwaddr iova;
    Int128 llend, llsize;
    void *vaddr;
    int ret;

    if (vhost_vdpa_listener_skipped_section(section, v->iova_range.first,
                                            v->iova_range.last)) {
        return;
    }

    if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
                 (section->offset_within_region & ~TARGET_PAGE_MASK))) {
        error_report("%s received unaligned region", __func__);
        return;
    }

    iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
    llend = vhost_vdpa_section_end(section);
    if (int128_ge(int128_make64(iova), llend)) {
        return;
    }

    memory_region_ref(section->mr);

    /* Here we assume that memory_region_is_ram(section->mr) == true */

    vaddr = memory_region_get_ram_ptr(section->mr) +
            section->offset_within_region +
            (iova - section->offset_within_address_space);

    trace_vhost_vdpa_listener_region_add(v, iova, int128_get64(llend),
                                         vaddr, section->readonly);

    llsize = int128_sub(llend, int128_make64(iova));
    if (v->shadow_vqs_enabled) {
        DMAMap mem_region = {
            .translated_addr = (hwaddr)(uintptr_t)vaddr,
            .size = int128_get64(llsize) - 1,
            .perm = IOMMU_ACCESS_FLAG(true, section->readonly),
        };

        int r = vhost_iova_tree_map_alloc(v->iova_tree, &mem_region);
        if (unlikely(r != IOVA_OK)) {
            error_report("Can't allocate a mapping (%d)", r);
            goto fail;
        }

        iova = mem_region.iova;
    }

    vhost_vdpa_iotlb_batch_begin_once(v);
    ret = vhost_vdpa_dma_map(v, iova, int128_get64(llsize),
                             vaddr, section->readonly);
    if (ret) {
        error_report("vhost vdpa map fail!");
        goto fail;
    }

    return;

fail:
    /*
     * On the initfn path, store the first error in the container so we
     * can gracefully fail. At runtime, there's not much we can do other
     * than throw a hardware error.
     */
    error_report("vhost-vdpa: DMA mapping failed, unable to continue");
    return;

}

static void vhost_vdpa_listener_region_del(MemoryListener *listener,
                                           MemoryRegionSection *section)
{
    struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener);
    hwaddr iova;
    Int128 llend, llsize;
    int ret;

    if (vhost_vdpa_listener_skipped_section(section, v->iova_range.first,
                                            v->iova_range.last)) {
        return;
    }

    if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
                 (section->offset_within_region & ~TARGET_PAGE_MASK))) {
        error_report("%s received unaligned region", __func__);
        return;
    }

    iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
    llend = vhost_vdpa_section_end(section);

    trace_vhost_vdpa_listener_region_del(v, iova, int128_get64(llend));

    if (int128_ge(int128_make64(iova), llend)) {
        return;
    }

    llsize = int128_sub(llend, int128_make64(iova));

    if (v->shadow_vqs_enabled) {
        const DMAMap *result;
        const void *vaddr = memory_region_get_ram_ptr(section->mr) +
                            section->offset_within_region +
                            (iova - section->offset_within_address_space);
        DMAMap mem_region = {
            .translated_addr = (hwaddr)(uintptr_t)vaddr,
            .size = int128_get64(llsize) - 1,
        };

        result = vhost_iova_tree_find_iova(v->iova_tree, &mem_region);
        iova = result->iova;
        vhost_iova_tree_remove(v->iova_tree, result);
    }
    vhost_vdpa_iotlb_batch_begin_once(v);
    ret = vhost_vdpa_dma_unmap(v, iova, int128_get64(llsize));
    if (ret) {
        error_report("vhost_vdpa dma unmap error!");
    }

    memory_region_unref(section->mr);
}
/*
 * The IOTLB API is used by vhost-vdpa, which requires incremental updating
 * of the mapping. So we cannot use the generic vhost memory listener, which
 * depends on the addnop().
 */
static const MemoryListener vhost_vdpa_memory_listener = {
    .name = "vhost-vdpa",
    .commit = vhost_vdpa_listener_commit,
    .region_add = vhost_vdpa_listener_region_add,
    .region_del = vhost_vdpa_listener_region_del,
};
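
/*
 * Usage sketch (mirroring vhost_vdpa_dev_start() further below): the
 * listener is only attached while the last virtqueue group of the device is
 * started, so the mappings are set up once per device rather than once per
 * queue group:
 *
 *     memory_listener_register(&v->listener, &address_space_memory);
 *     ...
 *     memory_listener_unregister(&v->listener);
 *
 * region_add/region_del then stream the RAM sections to the kernel as
 * VHOST_IOTLB_UPDATE/INVALIDATE messages.
 */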
static int vhost_vdpa_call(struct vhost_dev *dev, unsigned long int request,
                           void *arg)
{
    struct vhost_vdpa *v = dev->opaque;
    int fd = v->device_fd;
    int ret;

    assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);

    ret = ioctl(fd, request, arg);
    return ret < 0 ? -errno : ret;
}

static int vhost_vdpa_add_status(struct vhost_dev *dev, uint8_t status)
{
    uint8_t s;
    int ret;

    trace_vhost_vdpa_add_status(dev, status);
    ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_STATUS, &s);
    if (ret < 0) {
        return ret;
    }

    s |= status;

    ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &s);
    if (ret < 0) {
        return ret;
    }

    ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_STATUS, &s);
    if (ret < 0) {
        return ret;
    }

    if (!(s & status)) {
        return -EIO;
    }

    return 0;
}

static void vhost_vdpa_get_iova_range(struct vhost_vdpa *v)
{
    int ret = vhost_vdpa_call(v->dev, VHOST_VDPA_GET_IOVA_RANGE,
                              &v->iova_range);
    if (ret != 0) {
        v->iova_range.first = 0;
        v->iova_range.last = UINT64_MAX;
    }

    trace_vhost_vdpa_get_iova_range(v->dev, v->iova_range.first,
                                    v->iova_range.last);
}

/*
 * This function is used for requests that only need to be applied once.
 * Typically such a request occurs at the beginning of operation, before
 * setting up the queues. It should not be used for requests that perform
 * operations until all queues are set, which would need to check
 * dev->vq_index_end instead.
 */
static bool vhost_vdpa_first_dev(struct vhost_dev *dev)
{
    struct vhost_vdpa *v = dev->opaque;

    return v->index == 0;
}

static int vhost_vdpa_get_dev_features(struct vhost_dev *dev,
                                       uint64_t *features)
{
    int ret;

    ret = vhost_vdpa_call(dev, VHOST_GET_FEATURES, features);
    trace_vhost_vdpa_get_features(dev, *features);
    return ret;
}

static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v,
                               Error **errp)
{
    g_autoptr(GPtrArray) shadow_vqs = NULL;
    uint64_t dev_features, svq_features;
    int r;
    bool ok;

    if (!v->shadow_vqs_enabled) {
        return 0;
    }

    r = vhost_vdpa_get_dev_features(hdev, &dev_features);
    if (r != 0) {
        error_setg_errno(errp, -r, "Can't get vdpa device features");
        return r;
    }

    svq_features = dev_features;
    ok = vhost_svq_valid_features(svq_features, errp);
    if (unlikely(!ok)) {
        return -1;
    }

    shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_svq_free);
    for (unsigned n = 0; n < hdev->nvqs; ++n) {
        g_autoptr(VhostShadowVirtqueue) svq;

        svq = vhost_svq_new(v->iova_tree, v->shadow_vq_ops,
                            v->shadow_vq_ops_opaque);
        if (unlikely(!svq)) {
            error_setg(errp, "Cannot create svq %u", n);
            return -1;
        }
        g_ptr_array_add(shadow_vqs, g_steal_pointer(&svq));
    }

    v->shadow_vqs = g_steal_pointer(&shadow_vqs);
    return 0;
}

static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque, Error **errp)
{
    struct vhost_vdpa *v;
    assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
    trace_vhost_vdpa_init(dev, opaque);
    int ret;

    /*
     * Similar to VFIO, we end up pinning all guest memory and have to
     * disable discarding of RAM.
     */
    ret = ram_block_discard_disable(true);
    if (ret) {
        error_report("Cannot set discarding of RAM broken");
        return ret;
    }

    v = opaque;
    v->dev = dev;
    dev->opaque = opaque;
    v->listener = vhost_vdpa_memory_listener;
    v->msg_type = VHOST_IOTLB_MSG_V2;
    ret = vhost_vdpa_init_svq(dev, v, errp);
    if (ret) {
        goto err;
    }

    vhost_vdpa_get_iova_range(v);

    if (!vhost_vdpa_first_dev(dev)) {
        return 0;
    }

    vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE |
                               VIRTIO_CONFIG_S_DRIVER);

    return 0;

err:
    ram_block_discard_disable(false);
    return ret;
}

static void vhost_vdpa_host_notifier_uninit(struct vhost_dev *dev,
                                            int queue_index)
{
    size_t page_size = qemu_real_host_page_size();
    struct vhost_vdpa *v = dev->opaque;
    VirtIODevice *vdev = dev->vdev;
    VhostVDPAHostNotifier *n;

    n = &v->notifier[queue_index];

    if (n->addr) {
        virtio_queue_set_host_notifier_mr(vdev, queue_index, &n->mr, false);
        object_unparent(OBJECT(&n->mr));
        munmap(n->addr, page_size);
        n->addr = NULL;
    }
}

static int vhost_vdpa_host_notifier_init(struct vhost_dev *dev, int queue_index)
{
    size_t page_size = qemu_real_host_page_size();
    struct vhost_vdpa *v = dev->opaque;
    VirtIODevice *vdev = dev->vdev;
    VhostVDPAHostNotifier *n;
    int fd = v->device_fd;
    void *addr;
    char *name;

    vhost_vdpa_host_notifier_uninit(dev, queue_index);

    n = &v->notifier[queue_index];

    addr = mmap(NULL, page_size, PROT_WRITE, MAP_SHARED, fd,
                queue_index * page_size);
    if (addr == MAP_FAILED) {
        goto err;
    }

    name = g_strdup_printf("vhost-vdpa/host-notifier@%p mmaps[%d]",
                           v, queue_index);
    memory_region_init_ram_device_ptr(&n->mr, OBJECT(vdev), name,
                                      page_size, addr);
    g_free(name);

    if (virtio_queue_set_host_notifier_mr(vdev, queue_index, &n->mr, true)) {
        object_unparent(OBJECT(&n->mr));
        munmap(addr, page_size);
        goto err;
    }
    n->addr = addr;

    return 0;

err:
    return -1;
}

static void vhost_vdpa_host_notifiers_uninit(struct vhost_dev *dev, int n)
{
    int i;

    for (i = dev->vq_index; i < dev->vq_index + n; i++) {
        vhost_vdpa_host_notifier_uninit(dev, i);
    }
}

static void vhost_vdpa_host_notifiers_init(struct vhost_dev *dev)
{
    struct vhost_vdpa *v = dev->opaque;
    int i;

    if (v->shadow_vqs_enabled) {
        /* FIXME SVQ is not compatible with host notifiers mr */
        return;
    }

    for (i = dev->vq_index; i < dev->vq_index + dev->nvqs; i++) {
        if (vhost_vdpa_host_notifier_init(dev, i)) {
            goto err;
        }
    }

    return;

err:
    vhost_vdpa_host_notifiers_uninit(dev, i - dev->vq_index);
    return;
}

static void vhost_vdpa_svq_cleanup(struct vhost_dev *dev)
{
    struct vhost_vdpa *v = dev->opaque;
    size_t idx;

    if (!v->shadow_vqs) {
        return;
    }

    for (idx = 0; idx < v->shadow_vqs->len; ++idx) {
        vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, idx));
    }
    g_ptr_array_free(v->shadow_vqs, true);
}
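
/*
 * Descriptive note for the teardown path below: vhost_vdpa_cleanup() undoes
 * vhost_vdpa_init() step by step, unmapping the host notifier doorbells,
 * detaching the memory listener, stopping and freeing any shadow virtqueues,
 * and finally re-enabling the RAM block discarding that init disabled.
 */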
static int vhost_vdpa_cleanup(struct vhost_dev *dev)
{
    struct vhost_vdpa *v;
    assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
    v = dev->opaque;
    trace_vhost_vdpa_cleanup(dev, v);
    vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs);
    memory_listener_unregister(&v->listener);
    vhost_vdpa_svq_cleanup(dev);

    dev->opaque = NULL;
    ram_block_discard_disable(false);

    return 0;
}

static int vhost_vdpa_memslots_limit(struct vhost_dev *dev)
{
    trace_vhost_vdpa_memslots_limit(dev, INT_MAX);
    return INT_MAX;
}

static int vhost_vdpa_set_mem_table(struct vhost_dev *dev,
                                    struct vhost_memory *mem)
{
    if (!vhost_vdpa_first_dev(dev)) {
        return 0;
    }

    trace_vhost_vdpa_set_mem_table(dev, mem->nregions, mem->padding);
    if (trace_event_get_state_backends(TRACE_VHOST_VDPA_SET_MEM_TABLE) &&
        trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_REGIONS)) {
        int i;
        for (i = 0; i < mem->nregions; i++) {
            trace_vhost_vdpa_dump_regions(dev, i,
                                          mem->regions[i].guest_phys_addr,
                                          mem->regions[i].memory_size,
                                          mem->regions[i].userspace_addr,
                                          mem->regions[i].flags_padding);
        }
    }
    if (mem->padding) {
        return -EINVAL;
    }

    return 0;
}

static int vhost_vdpa_set_features(struct vhost_dev *dev,
                                   uint64_t features)
{
    struct vhost_vdpa *v = dev->opaque;
    int ret;

    if (!vhost_vdpa_first_dev(dev)) {
        return 0;
    }

    if (v->shadow_vqs_enabled) {
        if ((v->acked_features ^ features) == BIT_ULL(VHOST_F_LOG_ALL)) {
            /*
             * QEMU is just trying to enable or disable logging. SVQ handles
             * this separately, so no need to forward this.
             */
            v->acked_features = features;
            return 0;
        }

        v->acked_features = features;

        /* We must not ack _F_LOG if SVQ is enabled */
        features &= ~BIT_ULL(VHOST_F_LOG_ALL);
    }

    trace_vhost_vdpa_set_features(dev, features);
    ret = vhost_vdpa_call(dev, VHOST_SET_FEATURES, &features);
    if (ret) {
        return ret;
    }

    return vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_FEATURES_OK);
}

static int vhost_vdpa_set_backend_cap(struct vhost_dev *dev)
{
    uint64_t features;
    uint64_t f = 0x1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2 |
        0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH;
    int r;

    if (vhost_vdpa_call(dev, VHOST_GET_BACKEND_FEATURES, &features)) {
        return -EFAULT;
    }

    features &= f;

    if (vhost_vdpa_first_dev(dev)) {
        r = vhost_vdpa_call(dev, VHOST_SET_BACKEND_FEATURES, &features);
        if (r) {
            return -EFAULT;
        }
    }

    dev->backend_cap = features;

    return 0;
}

static int vhost_vdpa_get_device_id(struct vhost_dev *dev,
                                    uint32_t *device_id)
{
    int ret;
    ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_DEVICE_ID, device_id);
    trace_vhost_vdpa_get_device_id(dev, *device_id);
    return ret;
}

static void vhost_vdpa_reset_svq(struct vhost_vdpa *v)
{
    if (!v->shadow_vqs_enabled) {
        return;
    }

    for (unsigned i = 0; i < v->shadow_vqs->len; ++i) {
        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
        vhost_svq_stop(svq);
    }
}

static int vhost_vdpa_reset_device(struct vhost_dev *dev)
{
    struct vhost_vdpa *v = dev->opaque;
    int ret;
    uint8_t status = 0;

    vhost_vdpa_reset_svq(v);

    ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &status);
    trace_vhost_vdpa_reset_device(dev, status);
    return ret;
}

static int vhost_vdpa_get_vq_index(struct vhost_dev *dev, int idx)
{
    assert(idx >= dev->vq_index && idx < dev->vq_index + dev->nvqs);

    trace_vhost_vdpa_get_vq_index(dev, idx, idx);
    return idx;
}
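
/*
 * Descriptive note for the helper below: vhost-vDPA exposes a per-virtqueue
 * enable knob, so "ready" is signalled by issuing VHOST_VDPA_SET_VRING_ENABLE
 * with num = 1 for every queue index owned by this vhost_dev.
 */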
static int vhost_vdpa_set_vring_ready(struct vhost_dev *dev)
{
    int i;
    trace_vhost_vdpa_set_vring_ready(dev);
    for (i = 0; i < dev->nvqs; ++i) {
        struct vhost_vring_state state = {
            .index = dev->vq_index + i,
            .num = 1,
        };
        vhost_vdpa_call(dev, VHOST_VDPA_SET_VRING_ENABLE, &state);
    }
    return 0;
}

static void vhost_vdpa_dump_config(struct vhost_dev *dev, const uint8_t *config,
                                   uint32_t config_len)
{
    int b, len;
    char line[QEMU_HEXDUMP_LINE_LEN];

    for (b = 0; b < config_len; b += 16) {
        len = config_len - b;
        qemu_hexdump_line(line, b, config, len, false);
        trace_vhost_vdpa_dump_config(dev, line);
    }
}

static int vhost_vdpa_set_config(struct vhost_dev *dev, const uint8_t *data,
                                 uint32_t offset, uint32_t size,
                                 uint32_t flags)
{
    struct vhost_vdpa_config *config;
    int ret;
    unsigned long config_size = offsetof(struct vhost_vdpa_config, buf);

    trace_vhost_vdpa_set_config(dev, offset, size, flags);
    config = g_malloc(size + config_size);
    config->off = offset;
    config->len = size;
    memcpy(config->buf, data, size);
    if (trace_event_get_state_backends(TRACE_VHOST_VDPA_SET_CONFIG) &&
        trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_CONFIG)) {
        vhost_vdpa_dump_config(dev, data, size);
    }
    ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_CONFIG, config);
    g_free(config);
    return ret;
}

static int vhost_vdpa_get_config(struct vhost_dev *dev, uint8_t *config,
                                 uint32_t config_len, Error **errp)
{
    struct vhost_vdpa_config *v_config;
    unsigned long config_size = offsetof(struct vhost_vdpa_config, buf);
    int ret;

    trace_vhost_vdpa_get_config(dev, config, config_len);
    v_config = g_malloc(config_len + config_size);
    v_config->len = config_len;
    v_config->off = 0;
    ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_CONFIG, v_config);
    memcpy(config, v_config->buf, config_len);
    g_free(v_config);
    if (trace_event_get_state_backends(TRACE_VHOST_VDPA_GET_CONFIG) &&
        trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_CONFIG)) {
        vhost_vdpa_dump_config(dev, config, config_len);
    }
    return ret;
}

static int vhost_vdpa_set_dev_vring_base(struct vhost_dev *dev,
                                         struct vhost_vring_state *ring)
{
    trace_vhost_vdpa_set_vring_base(dev, ring->index, ring->num);
    return vhost_vdpa_call(dev, VHOST_SET_VRING_BASE, ring);
}

static int vhost_vdpa_set_vring_dev_kick(struct vhost_dev *dev,
                                         struct vhost_vring_file *file)
{
    trace_vhost_vdpa_set_vring_kick(dev, file->index, file->fd);
    return vhost_vdpa_call(dev, VHOST_SET_VRING_KICK, file);
}

static int vhost_vdpa_set_vring_dev_call(struct vhost_dev *dev,
                                         struct vhost_vring_file *file)
{
    trace_vhost_vdpa_set_vring_call(dev, file->index, file->fd);
    return vhost_vdpa_call(dev, VHOST_SET_VRING_CALL, file);
}

static int vhost_vdpa_set_vring_dev_addr(struct vhost_dev *dev,
                                         struct vhost_vring_addr *addr)
{
    trace_vhost_vdpa_set_vring_addr(dev, addr->index, addr->flags,
                                    addr->desc_user_addr, addr->used_user_addr,
                                    addr->avail_user_addr,
                                    addr->log_guest_addr);

    return vhost_vdpa_call(dev, VHOST_SET_VRING_ADDR, addr);

}
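
/*
 * Note on naming in this file: the *_dev_* helpers above always talk to the
 * vhost-vdpa device directly, while vhost_vdpa_set_vring_kick/call/addr
 * further below are the VhostOps entry points. When shadow virtqueues are
 * enabled those wrappers redirect the guest-facing fds and addresses to the
 * SVQ, and only the SVQ's own resources reach the device helpers.
 */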
/**
 * Set the shadow virtqueue descriptors to the device
 *
 * @dev: The vhost device model
 * @svq: The shadow virtqueue
 * @idx: The index of the virtqueue in the vhost device
 * @errp: Error
 *
 * Note that this function does not rewind the kick file descriptor if it
 * cannot set the call one.
 */
static int vhost_vdpa_svq_set_fds(struct vhost_dev *dev,
                                  VhostShadowVirtqueue *svq, unsigned idx,
                                  Error **errp)
{
    struct vhost_vring_file file = {
        .index = dev->vq_index + idx,
    };
    const EventNotifier *event_notifier = &svq->hdev_kick;
    int r;

    file.fd = event_notifier_get_fd(event_notifier);
    r = vhost_vdpa_set_vring_dev_kick(dev, &file);
    if (unlikely(r != 0)) {
        error_setg_errno(errp, -r, "Can't set device kick fd");
        return r;
    }

    event_notifier = &svq->hdev_call;
    file.fd = event_notifier_get_fd(event_notifier);
    r = vhost_vdpa_set_vring_dev_call(dev, &file);
    if (unlikely(r != 0)) {
        error_setg_errno(errp, -r, "Can't set device call fd");
    }

    return r;
}

/**
 * Unmap a SVQ area in the device
 */
static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v,
                                      const DMAMap *needle)
{
    const DMAMap *result = vhost_iova_tree_find_iova(v->iova_tree, needle);
    hwaddr size;
    int r;

    if (unlikely(!result)) {
        error_report("Unable to find SVQ address to unmap");
        return false;
    }

    size = ROUND_UP(result->size, qemu_real_host_page_size());
    r = vhost_vdpa_dma_unmap(v, result->iova, size);
    return r == 0;
}

static bool vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev,
                                       const VhostShadowVirtqueue *svq)
{
    DMAMap needle = {};
    struct vhost_vdpa *v = dev->opaque;
    struct vhost_vring_addr svq_addr;
    bool ok;

    vhost_svq_get_vring_addr(svq, &svq_addr);

    needle.translated_addr = svq_addr.desc_user_addr;
    ok = vhost_vdpa_svq_unmap_ring(v, &needle);
    if (unlikely(!ok)) {
        return false;
    }

    needle.translated_addr = svq_addr.used_user_addr;
    return vhost_vdpa_svq_unmap_ring(v, &needle);
}

/**
 * Map the SVQ area in the device
 *
 * @v: Vhost-vdpa device
 * @needle: The area to search iova
 * @errp: Error pointer
 */
static bool vhost_vdpa_svq_map_ring(struct vhost_vdpa *v, DMAMap *needle,
                                    Error **errp)
{
    int r;

    r = vhost_iova_tree_map_alloc(v->iova_tree, needle);
    if (unlikely(r != IOVA_OK)) {
        error_setg(errp, "Cannot allocate iova (%d)", r);
        return false;
    }

    r = vhost_vdpa_dma_map(v, needle->iova, needle->size + 1,
                           (void *)(uintptr_t)needle->translated_addr,
                           needle->perm == IOMMU_RO);
    if (unlikely(r != 0)) {
        error_setg_errno(errp, -r, "Cannot map region to device");
        vhost_iova_tree_remove(v->iova_tree, needle);
    }

    return r == 0;
}
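
/*
 * Convention worth noting for the DMAMap users above and below: DMAMap.size
 * holds the offset of the last byte rather than a byte count, which is why
 * the memory listener stores int128_get64(llsize) - 1 and the mapping helper
 * passes needle->size + 1 back to vhost_vdpa_dma_map(). For example, a
 * 4096-byte ring is recorded with .size = 4095.
 */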
/**
 * Map the shadow virtqueue rings in the device
 *
 * @dev: The vhost device
 * @svq: The shadow virtqueue
 * @addr: Assigned IOVA addresses
 * @errp: Error pointer
 */
static bool vhost_vdpa_svq_map_rings(struct vhost_dev *dev,
                                     const VhostShadowVirtqueue *svq,
                                     struct vhost_vring_addr *addr,
                                     Error **errp)
{
    DMAMap device_region, driver_region;
    struct vhost_vring_addr svq_addr;
    struct vhost_vdpa *v = dev->opaque;
    size_t device_size = vhost_svq_device_area_size(svq);
    size_t driver_size = vhost_svq_driver_area_size(svq);
    size_t avail_offset;
    bool ok;

    ERRP_GUARD();
    vhost_svq_get_vring_addr(svq, &svq_addr);

    driver_region = (DMAMap) {
        .translated_addr = svq_addr.desc_user_addr,
        .size = driver_size - 1,
        .perm = IOMMU_RO,
    };
    ok = vhost_vdpa_svq_map_ring(v, &driver_region, errp);
    if (unlikely(!ok)) {
        error_prepend(errp, "Cannot create vq driver region: ");
        return false;
    }
    addr->desc_user_addr = driver_region.iova;
    avail_offset = svq_addr.avail_user_addr - svq_addr.desc_user_addr;
    addr->avail_user_addr = driver_region.iova + avail_offset;

    device_region = (DMAMap) {
        .translated_addr = svq_addr.used_user_addr,
        .size = device_size - 1,
        .perm = IOMMU_RW,
    };
    ok = vhost_vdpa_svq_map_ring(v, &device_region, errp);
    if (unlikely(!ok)) {
        error_prepend(errp, "Cannot create vq device region: ");
        vhost_vdpa_svq_unmap_ring(v, &driver_region);
    }
    addr->used_user_addr = device_region.iova;

    return ok;
}

static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
                                 VhostShadowVirtqueue *svq, unsigned idx,
                                 Error **errp)
{
    uint16_t vq_index = dev->vq_index + idx;
    struct vhost_vring_state s = {
        .index = vq_index,
    };
    int r;

    r = vhost_vdpa_set_dev_vring_base(dev, &s);
    if (unlikely(r)) {
        error_setg_errno(errp, -r, "Cannot set vring base");
        return false;
    }

    r = vhost_vdpa_svq_set_fds(dev, svq, idx, errp);
    return r == 0;
}

static bool vhost_vdpa_svqs_start(struct vhost_dev *dev)
{
    struct vhost_vdpa *v = dev->opaque;
    Error *err = NULL;
    unsigned i;

    if (!v->shadow_vqs) {
        return true;
    }

    if (v->migration_blocker) {
        int r = migrate_add_blocker(v->migration_blocker, &err);
        if (unlikely(r < 0)) {
            return false;
        }
    }

    for (i = 0; i < v->shadow_vqs->len; ++i) {
        VirtQueue *vq = virtio_get_queue(dev->vdev, dev->vq_index + i);
        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
        struct vhost_vring_addr addr = {
            .index = dev->vq_index + i,
        };
        int r;
        bool ok = vhost_vdpa_svq_setup(dev, svq, i, &err);
        if (unlikely(!ok)) {
            goto err;
        }

        vhost_svq_start(svq, dev->vdev, vq);
        ok = vhost_vdpa_svq_map_rings(dev, svq, &addr, &err);
        if (unlikely(!ok)) {
            goto err_map;
        }

        /* Override vring GPA set by vhost subsystem */
        r = vhost_vdpa_set_vring_dev_addr(dev, &addr);
        if (unlikely(r != 0)) {
            error_setg_errno(&err, -r, "Cannot set device address");
            goto err_set_addr;
        }
    }

    return true;

err_set_addr:
    vhost_vdpa_svq_unmap_rings(dev, g_ptr_array_index(v->shadow_vqs, i));

err_map:
    vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, i));

err:
    error_reportf_err(err, "Cannot setup SVQ %u: ", i);
    for (unsigned j = 0; j < i; ++j) {
        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, j);
        vhost_vdpa_svq_unmap_rings(dev, svq);
        vhost_svq_stop(svq);
    }

    if (v->migration_blocker) {
        migrate_del_blocker(v->migration_blocker);
    }

    return false;
}

static bool vhost_vdpa_svqs_stop(struct vhost_dev *dev)
{
    struct vhost_vdpa *v = dev->opaque;

    if (!v->shadow_vqs) {
        return true;
    }

    for (unsigned i = 0; i < v->shadow_vqs->len; ++i) {
        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
        bool ok = vhost_vdpa_svq_unmap_rings(dev, svq);
        if (unlikely(!ok)) {
            return false;
        }
    }

    if (v->migration_blocker) {
        migrate_del_blocker(v->migration_blocker);
    }
    return true;
}
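
/*
 * Descriptive note for vhost_vdpa_dev_start() below: on start it wires up
 * the host notifier regions, brings up the shadow virtqueues (if enabled)
 * and enables every vring; only when the last queue group of the device is
 * reached does it register the memory listener and raise DRIVER_OK. On stop
 * it walks the same steps in reverse and resets the device status.
 */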
static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started)
{
    struct vhost_vdpa *v = dev->opaque;
    bool ok;
    trace_vhost_vdpa_dev_start(dev, started);

    if (started) {
        vhost_vdpa_host_notifiers_init(dev);
        ok = vhost_vdpa_svqs_start(dev);
        if (unlikely(!ok)) {
            return -1;
        }
        vhost_vdpa_set_vring_ready(dev);
    } else {
        ok = vhost_vdpa_svqs_stop(dev);
        if (unlikely(!ok)) {
            return -1;
        }
        vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs);
    }

    if (dev->vq_index + dev->nvqs != dev->vq_index_end) {
        return 0;
    }

    if (started) {
        memory_listener_register(&v->listener, &address_space_memory);
        return vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_DRIVER_OK);
    } else {
        vhost_vdpa_reset_device(dev);
        vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE |
                                   VIRTIO_CONFIG_S_DRIVER);
        memory_listener_unregister(&v->listener);

        return 0;
    }
}

static int vhost_vdpa_set_log_base(struct vhost_dev *dev, uint64_t base,
                                   struct vhost_log *log)
{
    struct vhost_vdpa *v = dev->opaque;
    if (v->shadow_vqs_enabled || !vhost_vdpa_first_dev(dev)) {
        return 0;
    }

    trace_vhost_vdpa_set_log_base(dev, base, log->size, log->refcnt, log->fd,
                                  log->log);
    return vhost_vdpa_call(dev, VHOST_SET_LOG_BASE, &base);
}

static int vhost_vdpa_set_vring_addr(struct vhost_dev *dev,
                                     struct vhost_vring_addr *addr)
{
    struct vhost_vdpa *v = dev->opaque;

    if (v->shadow_vqs_enabled) {
        /*
         * Device vring addr was set at device start. SVQ base is handled by
         * VirtQueue code.
         */
        return 0;
    }

    return vhost_vdpa_set_vring_dev_addr(dev, addr);
}

static int vhost_vdpa_set_vring_num(struct vhost_dev *dev,
                                    struct vhost_vring_state *ring)
{
    trace_vhost_vdpa_set_vring_num(dev, ring->index, ring->num);
    return vhost_vdpa_call(dev, VHOST_SET_VRING_NUM, ring);
}

static int vhost_vdpa_set_vring_base(struct vhost_dev *dev,
                                     struct vhost_vring_state *ring)
{
    struct vhost_vdpa *v = dev->opaque;
    VirtQueue *vq = virtio_get_queue(dev->vdev, ring->index);

    /*
     * vhost-vdpa devices do not support in-flight requests. Set all of them
     * as available.
     *
     * TODO: This is ok for networking, but other kinds of devices might
     * have problems with these retransmissions.
     */
    while (virtqueue_rewind(vq, 1)) {
        continue;
    }
    if (v->shadow_vqs_enabled) {
        /*
         * Device vring base was set at device start. SVQ base is handled by
         * VirtQueue code.
         */
        return 0;
    }

    return vhost_vdpa_set_dev_vring_base(dev, ring);
}

static int vhost_vdpa_get_vring_base(struct vhost_dev *dev,
                                     struct vhost_vring_state *ring)
{
    struct vhost_vdpa *v = dev->opaque;
    int ret;

    if (v->shadow_vqs_enabled) {
        ring->num = virtio_queue_get_last_avail_idx(dev->vdev, ring->index);
        return 0;
    }

    ret = vhost_vdpa_call(dev, VHOST_GET_VRING_BASE, ring);
    trace_vhost_vdpa_get_vring_base(dev, ring->index, ring->num);
    return ret;
}

static int vhost_vdpa_set_vring_kick(struct vhost_dev *dev,
                                     struct vhost_vring_file *file)
{
    struct vhost_vdpa *v = dev->opaque;
    int vdpa_idx = file->index - dev->vq_index;

    if (v->shadow_vqs_enabled) {
        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx);
        vhost_svq_set_svq_kick_fd(svq, file->fd);
        return 0;
    } else {
        return vhost_vdpa_set_vring_dev_kick(dev, file);
    }
}

static int vhost_vdpa_set_vring_call(struct vhost_dev *dev,
                                     struct vhost_vring_file *file)
{
    struct vhost_vdpa *v = dev->opaque;

    if (v->shadow_vqs_enabled) {
        int vdpa_idx = file->index - dev->vq_index;
        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx);

        vhost_svq_set_svq_call_fd(svq, file->fd);
        return 0;
    } else {
        return vhost_vdpa_set_vring_dev_call(dev, file);
    }
}

static int vhost_vdpa_get_features(struct vhost_dev *dev,
                                   uint64_t *features)
{
    struct vhost_vdpa *v = dev->opaque;
    int ret = vhost_vdpa_get_dev_features(dev, features);

    if (ret == 0 && v->shadow_vqs_enabled) {
        /* Add SVQ logging capabilities */
        *features |= BIT_ULL(VHOST_F_LOG_ALL);
    }

    return ret;
}

static int vhost_vdpa_set_owner(struct vhost_dev *dev)
{
    if (!vhost_vdpa_first_dev(dev)) {
        return 0;
    }

    trace_vhost_vdpa_set_owner(dev);
    return vhost_vdpa_call(dev, VHOST_SET_OWNER, NULL);
}

static int vhost_vdpa_vq_get_addr(struct vhost_dev *dev,
                    struct vhost_vring_addr *addr, struct vhost_virtqueue *vq)
{
    assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
    addr->desc_user_addr = (uint64_t)(unsigned long)vq->desc_phys;
    addr->avail_user_addr = (uint64_t)(unsigned long)vq->avail_phys;
    addr->used_user_addr = (uint64_t)(unsigned long)vq->used_phys;
    trace_vhost_vdpa_vq_get_addr(dev, vq, addr->desc_user_addr,
                                 addr->avail_user_addr, addr->used_user_addr);
    return 0;
}

static bool vhost_vdpa_force_iommu(struct vhost_dev *dev)
{
    return true;
}
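
/*
 * The VhostOps table below is what the generic vhost layer dispatches
 * through once a device is created with the vhost-vdpa backend. Callbacks
 * left as NULL simply are not needed here, and .vhost_set_mem_table only
 * validates and traces the table, since the actual mappings are driven by
 * the MemoryListener above.
 */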
const VhostOps vdpa_ops = {
    .backend_type = VHOST_BACKEND_TYPE_VDPA,
    .vhost_backend_init = vhost_vdpa_init,
    .vhost_backend_cleanup = vhost_vdpa_cleanup,
    .vhost_set_log_base = vhost_vdpa_set_log_base,
    .vhost_set_vring_addr = vhost_vdpa_set_vring_addr,
    .vhost_set_vring_num = vhost_vdpa_set_vring_num,
    .vhost_set_vring_base = vhost_vdpa_set_vring_base,
    .vhost_get_vring_base = vhost_vdpa_get_vring_base,
    .vhost_set_vring_kick = vhost_vdpa_set_vring_kick,
    .vhost_set_vring_call = vhost_vdpa_set_vring_call,
    .vhost_get_features = vhost_vdpa_get_features,
    .vhost_set_backend_cap = vhost_vdpa_set_backend_cap,
    .vhost_set_owner = vhost_vdpa_set_owner,
    .vhost_set_vring_endian = NULL,
    .vhost_backend_memslots_limit = vhost_vdpa_memslots_limit,
    .vhost_set_mem_table = vhost_vdpa_set_mem_table,
    .vhost_set_features = vhost_vdpa_set_features,
    .vhost_reset_device = vhost_vdpa_reset_device,
    .vhost_get_vq_index = vhost_vdpa_get_vq_index,
    .vhost_get_config = vhost_vdpa_get_config,
    .vhost_set_config = vhost_vdpa_set_config,
    .vhost_requires_shm_log = NULL,
    .vhost_migration_done = NULL,
    .vhost_backend_can_merge = NULL,
    .vhost_net_set_mtu = NULL,
    .vhost_set_iotlb_callback = NULL,
    .vhost_send_device_iotlb_msg = NULL,
    .vhost_dev_start = vhost_vdpa_dev_start,
    .vhost_get_device_id = vhost_vdpa_get_device_id,
    .vhost_vq_get_addr = vhost_vdpa_vq_get_addr,
    .vhost_force_iommu = vhost_vdpa_force_iommu,
};