/*
 * vhost-vdpa
 *
 * Copyright(c) 2017-2018 Intel Corporation.
 * Copyright(c) 2020 Red Hat, Inc.
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include "qemu/osdep.h"
#include <linux/vhost.h>
#include <linux/vfio.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include "hw/virtio/vhost.h"
#include "hw/virtio/vhost-backend.h"
#include "hw/virtio/virtio-net.h"
#include "hw/virtio/vhost-shadow-virtqueue.h"
#include "hw/virtio/vhost-vdpa.h"
#include "exec/address-spaces.h"
#include "qemu/cutils.h"
#include "qemu/main-loop.h"
#include "cpu.h"
#include "trace.h"
#include "qapi/error.h"

/*
 * Return one past the end of the section. Be careful with uint64_t
 * conversions!
 */
static Int128 vhost_vdpa_section_end(const MemoryRegionSection *section)
{
    Int128 llend = int128_make64(section->offset_within_address_space);
    llend = int128_add(llend, section->size);
    llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK));

    return llend;
}

static bool vhost_vdpa_listener_skipped_section(MemoryRegionSection *section,
                                                uint64_t iova_min,
                                                uint64_t iova_max)
{
    Int128 llend;

    if ((!memory_region_is_ram(section->mr) &&
         !memory_region_is_iommu(section->mr)) ||
        memory_region_is_protected(section->mr) ||
        /* vhost-vDPA doesn't allow MMIO to be mapped */
        memory_region_is_ram_device(section->mr)) {
        return true;
    }

    if (section->offset_within_address_space < iova_min) {
        error_report("RAM section out of device range (min=0x%" PRIx64
                     ", addr=0x%" HWADDR_PRIx ")",
                     iova_min, section->offset_within_address_space);
        return true;
    }

    llend = vhost_vdpa_section_end(section);
    if (int128_gt(llend, int128_make64(iova_max))) {
        error_report("RAM section out of device range (max=0x%" PRIx64
                     ", end addr=0x%" PRIx64 ")",
                     iova_max, int128_get64(llend));
        return true;
    }

    return false;
}
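/*
 * Issue a VHOST_IOTLB_UPDATE message on the device fd, mapping the host
 * virtual address range [vaddr, vaddr + size) at the given device IOVA.
 */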
static int vhost_vdpa_dma_map(struct vhost_vdpa *v, hwaddr iova, hwaddr size,
                              void *vaddr, bool readonly)
{
    struct vhost_msg_v2 msg = {};
    int fd = v->device_fd;
    int ret = 0;

    msg.type = v->msg_type;
    msg.iotlb.iova = iova;
    msg.iotlb.size = size;
    msg.iotlb.uaddr = (uint64_t)(uintptr_t)vaddr;
    msg.iotlb.perm = readonly ? VHOST_ACCESS_RO : VHOST_ACCESS_RW;
    msg.iotlb.type = VHOST_IOTLB_UPDATE;

    trace_vhost_vdpa_dma_map(v, fd, msg.type, msg.iotlb.iova, msg.iotlb.size,
                             msg.iotlb.uaddr, msg.iotlb.perm, msg.iotlb.type);

    if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
        error_report("failed to write, fd=%d, errno=%d (%s)",
                     fd, errno, strerror(errno));
        return -EIO;
    }

    return ret;
}

static int vhost_vdpa_dma_unmap(struct vhost_vdpa *v, hwaddr iova,
                                hwaddr size)
{
    struct vhost_msg_v2 msg = {};
    int fd = v->device_fd;
    int ret = 0;

    msg.type = v->msg_type;
    msg.iotlb.iova = iova;
    msg.iotlb.size = size;
    msg.iotlb.type = VHOST_IOTLB_INVALIDATE;

    trace_vhost_vdpa_dma_unmap(v, fd, msg.type, msg.iotlb.iova,
                               msg.iotlb.size, msg.iotlb.type);

    if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
        error_report("failed to write, fd=%d, errno=%d (%s)",
                     fd, errno, strerror(errno));
        return -EIO;
    }

    return ret;
}

static void vhost_vdpa_listener_begin_batch(struct vhost_vdpa *v)
{
    int fd = v->device_fd;
    struct vhost_msg_v2 msg = {
        .type = v->msg_type,
        .iotlb.type = VHOST_IOTLB_BATCH_BEGIN,
    };

    trace_vhost_vdpa_listener_begin_batch(v, fd, msg.type, msg.iotlb.type);
    if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
        error_report("failed to write, fd=%d, errno=%d (%s)",
                     fd, errno, strerror(errno));
    }
}

static void vhost_vdpa_iotlb_batch_begin_once(struct vhost_vdpa *v)
{
    if (v->dev->backend_cap & (0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH) &&
        !v->iotlb_batch_begin_sent) {
        vhost_vdpa_listener_begin_batch(v);
    }

    v->iotlb_batch_begin_sent = true;
}

static void vhost_vdpa_listener_commit(MemoryListener *listener)
{
    struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener);
    struct vhost_dev *dev = v->dev;
    struct vhost_msg_v2 msg = {};
    int fd = v->device_fd;

    if (!(dev->backend_cap & (0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH))) {
        return;
    }

    if (!v->iotlb_batch_begin_sent) {
        return;
    }

    msg.type = v->msg_type;
    msg.iotlb.type = VHOST_IOTLB_BATCH_END;

    trace_vhost_vdpa_listener_commit(v, fd, msg.type, msg.iotlb.type);
    if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
        error_report("failed to write, fd=%d, errno=%d (%s)",
                     fd, errno, strerror(errno));
    }

    v->iotlb_batch_begin_sent = false;
}
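/*
 * MemoryListener region_add hook: map a newly added RAM section into the
 * device. With shadow virtqueues enabled, the IOVA is allocated from the
 * IOVA tree instead of reusing the guest physical address.
 */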
static void vhost_vdpa_listener_region_add(MemoryListener *listener,
                                           MemoryRegionSection *section)
{
    struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener);
    hwaddr iova;
    Int128 llend, llsize;
    void *vaddr;
    int ret;

    if (vhost_vdpa_listener_skipped_section(section, v->iova_range.first,
                                            v->iova_range.last)) {
        return;
    }

    if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
                 (section->offset_within_region & ~TARGET_PAGE_MASK))) {
        error_report("%s received unaligned region", __func__);
        return;
    }

    iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
    llend = vhost_vdpa_section_end(section);
    if (int128_ge(int128_make64(iova), llend)) {
        return;
    }

    memory_region_ref(section->mr);

    /* Here we assume that memory_region_is_ram(section->mr)==true */

    vaddr = memory_region_get_ram_ptr(section->mr) +
            section->offset_within_region +
            (iova - section->offset_within_address_space);

    trace_vhost_vdpa_listener_region_add(v, iova, int128_get64(llend),
                                         vaddr, section->readonly);

    llsize = int128_sub(llend, int128_make64(iova));
    if (v->shadow_vqs_enabled) {
        DMAMap mem_region = {
            .translated_addr = (hwaddr)(uintptr_t)vaddr,
            .size = int128_get64(llsize) - 1,
            .perm = IOMMU_ACCESS_FLAG(true, section->readonly),
        };

        int r = vhost_iova_tree_map_alloc(v->iova_tree, &mem_region);
        if (unlikely(r != IOVA_OK)) {
            error_report("Can't allocate a mapping (%d)", r);
            goto fail;
        }

        iova = mem_region.iova;
    }

    vhost_vdpa_iotlb_batch_begin_once(v);
    ret = vhost_vdpa_dma_map(v, iova, int128_get64(llsize),
                             vaddr, section->readonly);
    if (ret) {
        error_report("vhost vdpa map fail!");
        goto fail;
    }

    return;

fail:
    /*
     * On the initfn path, store the first error in the container so we
     * can gracefully fail. Runtime, there's not much we can do other
     * than throw a hardware error.
     */
    error_report("vhost-vdpa: DMA mapping failed, unable to continue");
    return;
}

static void vhost_vdpa_listener_region_del(MemoryListener *listener,
                                           MemoryRegionSection *section)
{
    struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener);
    hwaddr iova;
    Int128 llend, llsize;
    int ret;

    if (vhost_vdpa_listener_skipped_section(section, v->iova_range.first,
                                            v->iova_range.last)) {
        return;
    }

    if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
                 (section->offset_within_region & ~TARGET_PAGE_MASK))) {
        error_report("%s received unaligned region", __func__);
        return;
    }

    iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
    llend = vhost_vdpa_section_end(section);

    trace_vhost_vdpa_listener_region_del(v, iova, int128_get64(llend));

    if (int128_ge(int128_make64(iova), llend)) {
        return;
    }

    llsize = int128_sub(llend, int128_make64(iova));

    if (v->shadow_vqs_enabled) {
        const DMAMap *result;
        const void *vaddr = memory_region_get_ram_ptr(section->mr) +
                            section->offset_within_region +
                            (iova - section->offset_within_address_space);
        DMAMap mem_region = {
            .translated_addr = (hwaddr)(uintptr_t)vaddr,
            .size = int128_get64(llsize) - 1,
        };

        result = vhost_iova_tree_find_iova(v->iova_tree, &mem_region);
        iova = result->iova;
        vhost_iova_tree_remove(v->iova_tree, &mem_region);
    }
    vhost_vdpa_iotlb_batch_begin_once(v);
    ret = vhost_vdpa_dma_unmap(v, iova, int128_get64(llsize));
    if (ret) {
        error_report("vhost_vdpa dma unmap error!");
    }

    memory_region_unref(section->mr);
}
/*
 * IOTLB API is used by vhost-vdpa which requires incremental updating
 * of the mapping. So we can not use generic vhost memory listener which
 * depends on the addnop().
 */
static const MemoryListener vhost_vdpa_memory_listener = {
    .name = "vhost-vdpa",
    .commit = vhost_vdpa_listener_commit,
    .region_add = vhost_vdpa_listener_region_add,
    .region_del = vhost_vdpa_listener_region_del,
};
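/*
 * Forward a vhost ioctl to the vhost-vdpa device fd; returns the ioctl
 * result, or -errno on failure.
 */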
static int vhost_vdpa_call(struct vhost_dev *dev, unsigned long int request,
                           void *arg)
{
    struct vhost_vdpa *v = dev->opaque;
    int fd = v->device_fd;
    int ret;

    assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);

    ret = ioctl(fd, request, arg);
    return ret < 0 ? -errno : ret;
}

static int vhost_vdpa_add_status(struct vhost_dev *dev, uint8_t status)
{
    uint8_t s;
    int ret;

    trace_vhost_vdpa_add_status(dev, status);
    ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_STATUS, &s);
    if (ret < 0) {
        return ret;
    }

    s |= status;

    ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &s);
    if (ret < 0) {
        return ret;
    }

    ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_STATUS, &s);
    if (ret < 0) {
        return ret;
    }

    if (!(s & status)) {
        return -EIO;
    }

    return 0;
}

static void vhost_vdpa_get_iova_range(struct vhost_vdpa *v)
{
    int ret = vhost_vdpa_call(v->dev, VHOST_VDPA_GET_IOVA_RANGE,
                              &v->iova_range);
    if (ret != 0) {
        v->iova_range.first = 0;
        v->iova_range.last = UINT64_MAX;
    }

    trace_vhost_vdpa_get_iova_range(v->dev, v->iova_range.first,
                                    v->iova_range.last);
}

static bool vhost_vdpa_one_time_request(struct vhost_dev *dev)
{
    struct vhost_vdpa *v = dev->opaque;

    return v->index != 0;
}

static int vhost_vdpa_get_dev_features(struct vhost_dev *dev,
                                       uint64_t *features)
{
    int ret;

    ret = vhost_vdpa_call(dev, VHOST_GET_FEATURES, features);
    trace_vhost_vdpa_get_features(dev, *features);
    return ret;
}

static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v,
                               Error **errp)
{
    g_autoptr(GPtrArray) shadow_vqs = NULL;
    uint64_t dev_features, svq_features;
    int r;
    bool ok;

    if (!v->shadow_vqs_enabled) {
        return 0;
    }

    r = vhost_vdpa_get_dev_features(hdev, &dev_features);
    if (r != 0) {
        error_setg_errno(errp, -r, "Can't get vdpa device features");
        return r;
    }

    svq_features = dev_features;
    ok = vhost_svq_valid_features(svq_features, errp);
    if (unlikely(!ok)) {
        return -1;
    }

    shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_svq_free);
    for (unsigned n = 0; n < hdev->nvqs; ++n) {
        g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new(v->iova_tree);

        if (unlikely(!svq)) {
            error_setg(errp, "Cannot create svq %u", n);
            return -1;
        }
        g_ptr_array_add(shadow_vqs, g_steal_pointer(&svq));
    }

    v->shadow_vqs = g_steal_pointer(&shadow_vqs);
    return 0;
}
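/*
 * Backend init: as with VFIO, all guest memory ends up pinned, so RAM
 * discarding is disabled first; then the memory listener, IOTLB message
 * type, shadow virtqueues and IOVA range are set up, and the
 * ACKNOWLEDGE/DRIVER status bits are set (once per device).
 */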
static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque, Error **errp)
{
    struct vhost_vdpa *v;
    assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
    trace_vhost_vdpa_init(dev, opaque);
    int ret;

    /*
     * Similar to VFIO, we end up pinning all guest memory and have to
     * disable discarding of RAM.
     */
    ret = ram_block_discard_disable(true);
    if (ret) {
        error_report("Cannot set discarding of RAM broken");
        return ret;
    }

    v = opaque;
    v->dev = dev;
    dev->opaque = opaque;
    v->listener = vhost_vdpa_memory_listener;
    v->msg_type = VHOST_IOTLB_MSG_V2;
    ret = vhost_vdpa_init_svq(dev, v, errp);
    if (ret) {
        goto err;
    }

    vhost_vdpa_get_iova_range(v);

    if (vhost_vdpa_one_time_request(dev)) {
        return 0;
    }

    vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE |
                               VIRTIO_CONFIG_S_DRIVER);

    return 0;

err:
    ram_block_discard_disable(false);
    return ret;
}

static void vhost_vdpa_host_notifier_uninit(struct vhost_dev *dev,
                                            int queue_index)
{
    size_t page_size = qemu_real_host_page_size();
    struct vhost_vdpa *v = dev->opaque;
    VirtIODevice *vdev = dev->vdev;
    VhostVDPAHostNotifier *n;

    n = &v->notifier[queue_index];

    if (n->addr) {
        virtio_queue_set_host_notifier_mr(vdev, queue_index, &n->mr, false);
        object_unparent(OBJECT(&n->mr));
        munmap(n->addr, page_size);
        n->addr = NULL;
    }
}

static int vhost_vdpa_host_notifier_init(struct vhost_dev *dev, int queue_index)
{
    size_t page_size = qemu_real_host_page_size();
    struct vhost_vdpa *v = dev->opaque;
    VirtIODevice *vdev = dev->vdev;
    VhostVDPAHostNotifier *n;
    int fd = v->device_fd;
    void *addr;
    char *name;

    vhost_vdpa_host_notifier_uninit(dev, queue_index);

    n = &v->notifier[queue_index];

    addr = mmap(NULL, page_size, PROT_WRITE, MAP_SHARED, fd,
                queue_index * page_size);
    if (addr == MAP_FAILED) {
        goto err;
    }

    name = g_strdup_printf("vhost-vdpa/host-notifier@%p mmaps[%d]",
                           v, queue_index);
    memory_region_init_ram_device_ptr(&n->mr, OBJECT(vdev), name,
                                      page_size, addr);
    g_free(name);

    if (virtio_queue_set_host_notifier_mr(vdev, queue_index, &n->mr, true)) {
        object_unparent(OBJECT(&n->mr));
        munmap(addr, page_size);
        goto err;
    }
    n->addr = addr;

    return 0;

err:
    return -1;
}

static void vhost_vdpa_host_notifiers_uninit(struct vhost_dev *dev, int n)
{
    int i;

    for (i = dev->vq_index; i < dev->vq_index + n; i++) {
        vhost_vdpa_host_notifier_uninit(dev, i);
    }
}

static void vhost_vdpa_host_notifiers_init(struct vhost_dev *dev)
{
    struct vhost_vdpa *v = dev->opaque;
    int i;

    if (v->shadow_vqs_enabled) {
        /* FIXME SVQ is not compatible with host notifiers mr */
        return;
    }

    for (i = dev->vq_index; i < dev->vq_index + dev->nvqs; i++) {
        if (vhost_vdpa_host_notifier_init(dev, i)) {
            goto err;
        }
    }

    return;

err:
    vhost_vdpa_host_notifiers_uninit(dev, i - dev->vq_index);
    return;
}

static void vhost_vdpa_svq_cleanup(struct vhost_dev *dev)
{
    struct vhost_vdpa *v = dev->opaque;
    size_t idx;

    if (!v->shadow_vqs) {
        return;
    }

    for (idx = 0; idx < v->shadow_vqs->len; ++idx) {
        vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, idx));
    }
    g_ptr_array_free(v->shadow_vqs, true);
}
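/*
 * Backend cleanup: tear down host notifiers, the memory listener and the
 * shadow virtqueues, then re-enable RAM discarding.
 */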
static int vhost_vdpa_cleanup(struct vhost_dev *dev)
{
    struct vhost_vdpa *v;
    assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
    v = dev->opaque;
    trace_vhost_vdpa_cleanup(dev, v);
    vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs);
    memory_listener_unregister(&v->listener);
    vhost_vdpa_svq_cleanup(dev);

    dev->opaque = NULL;
    ram_block_discard_disable(false);

    return 0;
}

static int vhost_vdpa_memslots_limit(struct vhost_dev *dev)
{
    trace_vhost_vdpa_memslots_limit(dev, INT_MAX);
    return INT_MAX;
}

static int vhost_vdpa_set_mem_table(struct vhost_dev *dev,
                                    struct vhost_memory *mem)
{
    if (vhost_vdpa_one_time_request(dev)) {
        return 0;
    }

    trace_vhost_vdpa_set_mem_table(dev, mem->nregions, mem->padding);
    if (trace_event_get_state_backends(TRACE_VHOST_VDPA_SET_MEM_TABLE) &&
        trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_REGIONS)) {
        int i;
        for (i = 0; i < mem->nregions; i++) {
            trace_vhost_vdpa_dump_regions(dev, i,
                                          mem->regions[i].guest_phys_addr,
                                          mem->regions[i].memory_size,
                                          mem->regions[i].userspace_addr,
                                          mem->regions[i].flags_padding);
        }
    }
    if (mem->padding) {
        return -EINVAL;
    }

    return 0;
}

static int vhost_vdpa_set_features(struct vhost_dev *dev,
                                   uint64_t features)
{
    struct vhost_vdpa *v = dev->opaque;
    int ret;

    if (vhost_vdpa_one_time_request(dev)) {
        return 0;
    }

    if (v->shadow_vqs_enabled) {
        if ((v->acked_features ^ features) == BIT_ULL(VHOST_F_LOG_ALL)) {
            /*
             * QEMU is just trying to enable or disable logging. SVQ handles
             * this separately, so no need to forward this.
             */
            v->acked_features = features;
            return 0;
        }

        v->acked_features = features;

        /* We must not ack _F_LOG if SVQ is enabled */
        features &= ~BIT_ULL(VHOST_F_LOG_ALL);
    }

    trace_vhost_vdpa_set_features(dev, features);
    ret = vhost_vdpa_call(dev, VHOST_SET_FEATURES, &features);
    if (ret) {
        return ret;
    }

    return vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_FEATURES_OK);
}

static int vhost_vdpa_set_backend_cap(struct vhost_dev *dev)
{
    uint64_t features;
    uint64_t f = 0x1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2 |
        0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH;
    int r;

    if (vhost_vdpa_call(dev, VHOST_GET_BACKEND_FEATURES, &features)) {
        return -EFAULT;
    }

    features &= f;

    if (vhost_vdpa_one_time_request(dev)) {
        r = vhost_vdpa_call(dev, VHOST_SET_BACKEND_FEATURES, &features);
        if (r) {
            return -EFAULT;
        }
    }

    dev->backend_cap = features;

    return 0;
}

static int vhost_vdpa_get_device_id(struct vhost_dev *dev,
                                    uint32_t *device_id)
{
    int ret;
    ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_DEVICE_ID, device_id);
    trace_vhost_vdpa_get_device_id(dev, *device_id);
    return ret;
}

static void vhost_vdpa_reset_svq(struct vhost_vdpa *v)
{
    if (!v->shadow_vqs_enabled) {
        return;
    }

    for (unsigned i = 0; i < v->shadow_vqs->len; ++i) {
        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
        vhost_svq_stop(svq);
    }
}

static int vhost_vdpa_reset_device(struct vhost_dev *dev)
{
    struct vhost_vdpa *v = dev->opaque;
    int ret;
    uint8_t status = 0;

    vhost_vdpa_reset_svq(v);

    ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &status);
    trace_vhost_vdpa_reset_device(dev, status);
    return ret;
}

static int vhost_vdpa_get_vq_index(struct vhost_dev *dev, int idx)
{
    assert(idx >= dev->vq_index && idx < dev->vq_index + dev->nvqs);

    trace_vhost_vdpa_get_vq_index(dev, idx, idx);
    return idx;
}
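/* Enable all virtqueues of this vhost device via VHOST_VDPA_SET_VRING_ENABLE */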
static int vhost_vdpa_set_vring_ready(struct vhost_dev *dev)
{
    int i;
    trace_vhost_vdpa_set_vring_ready(dev);
    for (i = 0; i < dev->nvqs; ++i) {
        struct vhost_vring_state state = {
            .index = dev->vq_index + i,
            .num = 1,
        };
        vhost_vdpa_call(dev, VHOST_VDPA_SET_VRING_ENABLE, &state);
    }
    return 0;
}

static void vhost_vdpa_dump_config(struct vhost_dev *dev, const uint8_t *config,
                                   uint32_t config_len)
{
    int b, len;
    char line[QEMU_HEXDUMP_LINE_LEN];

    for (b = 0; b < config_len; b += 16) {
        len = config_len - b;
        qemu_hexdump_line(line, b, config, len, false);
        trace_vhost_vdpa_dump_config(dev, line);
    }
}

static int vhost_vdpa_set_config(struct vhost_dev *dev, const uint8_t *data,
                                 uint32_t offset, uint32_t size,
                                 uint32_t flags)
{
    struct vhost_vdpa_config *config;
    int ret;
    unsigned long config_size = offsetof(struct vhost_vdpa_config, buf);

    trace_vhost_vdpa_set_config(dev, offset, size, flags);
    config = g_malloc(size + config_size);
    config->off = offset;
    config->len = size;
    memcpy(config->buf, data, size);
    if (trace_event_get_state_backends(TRACE_VHOST_VDPA_SET_CONFIG) &&
        trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_CONFIG)) {
        vhost_vdpa_dump_config(dev, data, size);
    }
    ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_CONFIG, config);
    g_free(config);
    return ret;
}

static int vhost_vdpa_get_config(struct vhost_dev *dev, uint8_t *config,
                                 uint32_t config_len, Error **errp)
{
    struct vhost_vdpa_config *v_config;
    unsigned long config_size = offsetof(struct vhost_vdpa_config, buf);
    int ret;

    trace_vhost_vdpa_get_config(dev, config, config_len);
    v_config = g_malloc(config_len + config_size);
    v_config->len = config_len;
    v_config->off = 0;
    ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_CONFIG, v_config);
    memcpy(config, v_config->buf, config_len);
    g_free(v_config);
    if (trace_event_get_state_backends(TRACE_VHOST_VDPA_GET_CONFIG) &&
        trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_CONFIG)) {
        vhost_vdpa_dump_config(dev, config, config_len);
    }
    return ret;
}
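/*
 * The following helpers program the vdpa device directly. When shadow
 * virtqueues are enabled, the corresponding vhost_ops callbacks further
 * below redirect guest-visible vring state to the SVQ instead of
 * forwarding it here.
 */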
static int vhost_vdpa_set_dev_vring_base(struct vhost_dev *dev,
                                         struct vhost_vring_state *ring)
{
    trace_vhost_vdpa_set_vring_base(dev, ring->index, ring->num);
    return vhost_vdpa_call(dev, VHOST_SET_VRING_BASE, ring);
}

static int vhost_vdpa_set_vring_dev_kick(struct vhost_dev *dev,
                                         struct vhost_vring_file *file)
{
    trace_vhost_vdpa_set_vring_kick(dev, file->index, file->fd);
    return vhost_vdpa_call(dev, VHOST_SET_VRING_KICK, file);
}

static int vhost_vdpa_set_vring_dev_call(struct vhost_dev *dev,
                                         struct vhost_vring_file *file)
{
    trace_vhost_vdpa_set_vring_call(dev, file->index, file->fd);
    return vhost_vdpa_call(dev, VHOST_SET_VRING_CALL, file);
}

static int vhost_vdpa_set_vring_dev_addr(struct vhost_dev *dev,
                                         struct vhost_vring_addr *addr)
{
    trace_vhost_vdpa_set_vring_addr(dev, addr->index, addr->flags,
                                    addr->desc_user_addr, addr->used_user_addr,
                                    addr->avail_user_addr,
                                    addr->log_guest_addr);

    return vhost_vdpa_call(dev, VHOST_SET_VRING_ADDR, addr);
}

/**
 * Set the shadow virtqueue descriptors to the device
 *
 * @dev: The vhost device model
 * @svq: The shadow virtqueue
 * @idx: The index of the virtqueue in the vhost device
 * @errp: Error
 *
 * Note that this function does not rewind the kick file descriptor if it
 * cannot set the call one.
 */
static int vhost_vdpa_svq_set_fds(struct vhost_dev *dev,
                                  VhostShadowVirtqueue *svq, unsigned idx,
                                  Error **errp)
{
    struct vhost_vring_file file = {
        .index = dev->vq_index + idx,
    };
    const EventNotifier *event_notifier = &svq->hdev_kick;
    int r;

    file.fd = event_notifier_get_fd(event_notifier);
    r = vhost_vdpa_set_vring_dev_kick(dev, &file);
    if (unlikely(r != 0)) {
        error_setg_errno(errp, -r, "Can't set device kick fd");
        return r;
    }

    event_notifier = &svq->hdev_call;
    file.fd = event_notifier_get_fd(event_notifier);
    r = vhost_vdpa_set_vring_dev_call(dev, &file);
    if (unlikely(r != 0)) {
        error_setg_errno(errp, -r, "Can't set device call fd");
    }

    return r;
}

/**
 * Unmap a SVQ area in the device
 */
static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v,
                                      const DMAMap *needle)
{
    const DMAMap *result = vhost_iova_tree_find_iova(v->iova_tree, needle);
    hwaddr size;
    int r;

    if (unlikely(!result)) {
        error_report("Unable to find SVQ address to unmap");
        return false;
    }

    size = ROUND_UP(result->size, qemu_real_host_page_size());
    r = vhost_vdpa_dma_unmap(v, result->iova, size);
    return r == 0;
}
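/**
 * Unmap both the driver (descriptor/avail) and device (used) areas of a
 * shadow virtqueue from the device
 */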
static bool vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev,
                                       const VhostShadowVirtqueue *svq)
{
    DMAMap needle = {};
    struct vhost_vdpa *v = dev->opaque;
    struct vhost_vring_addr svq_addr;
    bool ok;

    vhost_svq_get_vring_addr(svq, &svq_addr);

    needle.translated_addr = svq_addr.desc_user_addr;
    ok = vhost_vdpa_svq_unmap_ring(v, &needle);
    if (unlikely(!ok)) {
        return false;
    }

    needle.translated_addr = svq_addr.used_user_addr;
    return vhost_vdpa_svq_unmap_ring(v, &needle);
}

/**
 * Map the SVQ area in the device
 *
 * @v: Vhost-vdpa device
 * @needle: The area to search iova
 * @errp: Error pointer
 */
static bool vhost_vdpa_svq_map_ring(struct vhost_vdpa *v, DMAMap *needle,
                                    Error **errp)
{
    int r;

    r = vhost_iova_tree_map_alloc(v->iova_tree, needle);
    if (unlikely(r != IOVA_OK)) {
        error_setg(errp, "Cannot allocate iova (%d)", r);
        return false;
    }

    r = vhost_vdpa_dma_map(v, needle->iova, needle->size + 1,
                           (void *)(uintptr_t)needle->translated_addr,
                           needle->perm == IOMMU_RO);
    if (unlikely(r != 0)) {
        error_setg_errno(errp, -r, "Cannot map region to device");
        vhost_iova_tree_remove(v->iova_tree, needle);
    }

    return r == 0;
}

/**
 * Map the shadow virtqueue rings in the device
 *
 * @dev: The vhost device
 * @svq: The shadow virtqueue
 * @addr: Assigned IOVA addresses
 * @errp: Error pointer
 */
static bool vhost_vdpa_svq_map_rings(struct vhost_dev *dev,
                                     const VhostShadowVirtqueue *svq,
                                     struct vhost_vring_addr *addr,
                                     Error **errp)
{
    DMAMap device_region, driver_region;
    struct vhost_vring_addr svq_addr;
    struct vhost_vdpa *v = dev->opaque;
    size_t device_size = vhost_svq_device_area_size(svq);
    size_t driver_size = vhost_svq_driver_area_size(svq);
    size_t avail_offset;
    bool ok;

    ERRP_GUARD();
    vhost_svq_get_vring_addr(svq, &svq_addr);

    driver_region = (DMAMap) {
        .translated_addr = svq_addr.desc_user_addr,
        .size = driver_size - 1,
        .perm = IOMMU_RO,
    };
    ok = vhost_vdpa_svq_map_ring(v, &driver_region, errp);
    if (unlikely(!ok)) {
        error_prepend(errp, "Cannot create vq driver region: ");
        return false;
    }
    addr->desc_user_addr = driver_region.iova;
    avail_offset = svq_addr.avail_user_addr - svq_addr.desc_user_addr;
    addr->avail_user_addr = driver_region.iova + avail_offset;

    device_region = (DMAMap) {
        .translated_addr = svq_addr.used_user_addr,
        .size = device_size - 1,
        .perm = IOMMU_RW,
    };
    ok = vhost_vdpa_svq_map_ring(v, &device_region, errp);
    if (unlikely(!ok)) {
        error_prepend(errp, "Cannot create vq device region: ");
        vhost_vdpa_svq_unmap_ring(v, &driver_region);
    }
    addr->used_user_addr = device_region.iova;

    return ok;
}

static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
                                 VhostShadowVirtqueue *svq, unsigned idx,
                                 Error **errp)
{
    uint16_t vq_index = dev->vq_index + idx;
    struct vhost_vring_state s = {
        .index = vq_index,
    };
    int r;

    r = vhost_vdpa_set_dev_vring_base(dev, &s);
    if (unlikely(r)) {
        error_setg_errno(errp, -r, "Cannot set vring base");
        return false;
    }

    r = vhost_vdpa_svq_set_fds(dev, svq, idx, errp);
    return r == 0;
}

static bool vhost_vdpa_svqs_start(struct vhost_dev *dev)
{
    struct vhost_vdpa *v = dev->opaque;
    Error *err = NULL;
    unsigned i;

    if (!v->shadow_vqs) {
        return true;
    }

    for (i = 0; i < v->shadow_vqs->len; ++i) {
        VirtQueue *vq = virtio_get_queue(dev->vdev, dev->vq_index + i);
        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
        struct vhost_vring_addr addr = {
            .index = i,
        };
        int r;
        bool ok = vhost_vdpa_svq_setup(dev, svq, i, &err);
        if (unlikely(!ok)) {
            goto err;
        }

        vhost_svq_start(svq, dev->vdev, vq);
        ok = vhost_vdpa_svq_map_rings(dev, svq, &addr, &err);
        if (unlikely(!ok)) {
            goto err_map;
        }

        /* Override vring GPA set by vhost subsystem */
        r = vhost_vdpa_set_vring_dev_addr(dev, &addr);
        if (unlikely(r != 0)) {
            error_setg_errno(&err, -r, "Cannot set device address");
            goto err_set_addr;
        }
    }

    return true;

err_set_addr:
    vhost_vdpa_svq_unmap_rings(dev, g_ptr_array_index(v->shadow_vqs, i));

err_map:
    vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, i));

err:
    error_reportf_err(err, "Cannot setup SVQ %u: ", i);
    for (unsigned j = 0; j < i; ++j) {
        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, j);
        vhost_vdpa_svq_unmap_rings(dev, svq);
        vhost_svq_stop(svq);
    }

    return false;
}

static bool vhost_vdpa_svqs_stop(struct vhost_dev *dev)
{
    struct vhost_vdpa *v = dev->opaque;

    if (!v->shadow_vqs) {
        return true;
    }

    for (unsigned i = 0; i < v->shadow_vqs->len; ++i) {
        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
        bool ok = vhost_vdpa_svq_unmap_rings(dev, svq);
        if (unlikely(!ok)) {
            return false;
        }
    }

    return true;
}
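/*
 * vhost_dev_start callback: on start, set up host notifiers and shadow
 * virtqueues and enable the vrings; once the last queue group of the
 * device is handled, register the memory listener and set DRIVER_OK
 * (or reset the device and unregister the listener on stop).
 */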
static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started)
{
    struct vhost_vdpa *v = dev->opaque;
    bool ok;
    trace_vhost_vdpa_dev_start(dev, started);

    if (started) {
        vhost_vdpa_host_notifiers_init(dev);
        ok = vhost_vdpa_svqs_start(dev);
        if (unlikely(!ok)) {
            return -1;
        }
        vhost_vdpa_set_vring_ready(dev);
    } else {
        ok = vhost_vdpa_svqs_stop(dev);
        if (unlikely(!ok)) {
            return -1;
        }
        vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs);
    }

    if (dev->vq_index + dev->nvqs != dev->vq_index_end) {
        return 0;
    }

    if (started) {
        memory_listener_register(&v->listener, &address_space_memory);
        return vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_DRIVER_OK);
    } else {
        vhost_vdpa_reset_device(dev);
        vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE |
                                   VIRTIO_CONFIG_S_DRIVER);
        memory_listener_unregister(&v->listener);

        return 0;
    }
}

static int vhost_vdpa_set_log_base(struct vhost_dev *dev, uint64_t base,
                                   struct vhost_log *log)
{
    struct vhost_vdpa *v = dev->opaque;
    if (v->shadow_vqs_enabled || vhost_vdpa_one_time_request(dev)) {
        return 0;
    }

    trace_vhost_vdpa_set_log_base(dev, base, log->size, log->refcnt, log->fd,
                                  log->log);
    return vhost_vdpa_call(dev, VHOST_SET_LOG_BASE, &base);
}

static int vhost_vdpa_set_vring_addr(struct vhost_dev *dev,
                                     struct vhost_vring_addr *addr)
{
    struct vhost_vdpa *v = dev->opaque;

    if (v->shadow_vqs_enabled) {
        /*
         * Device vring addr was set at device start. SVQ base is handled by
         * VirtQueue code.
         */
        return 0;
    }

    return vhost_vdpa_set_vring_dev_addr(dev, addr);
}

static int vhost_vdpa_set_vring_num(struct vhost_dev *dev,
                                    struct vhost_vring_state *ring)
{
    trace_vhost_vdpa_set_vring_num(dev, ring->index, ring->num);
    return vhost_vdpa_call(dev, VHOST_SET_VRING_NUM, ring);
}

static int vhost_vdpa_set_vring_base(struct vhost_dev *dev,
                                     struct vhost_vring_state *ring)
{
    struct vhost_vdpa *v = dev->opaque;

    if (v->shadow_vqs_enabled) {
        /*
         * Device vring base was set at device start. SVQ base is handled by
         * VirtQueue code.
         */
        return 0;
    }

    return vhost_vdpa_set_dev_vring_base(dev, ring);
}
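/*
 * Report the vring base. With shadow virtqueues enabled the device is not
 * queried; the SVQ last used index is reported instead.
 */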
static int vhost_vdpa_get_vring_base(struct vhost_dev *dev,
                                     struct vhost_vring_state *ring)
{
    struct vhost_vdpa *v = dev->opaque;
    int ret;

    if (v->shadow_vqs_enabled) {
        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs,
                                                      ring->index);

        /*
         * Setting base as last used idx, so destination will see as available
         * all the entries that the device did not use, including the
         * in-flight processing ones.
         *
         * TODO: This is ok for networking, but other kinds of devices might
         * have problems with these retransmissions.
         */
        ring->num = svq->last_used_idx;
        return 0;
    }

    ret = vhost_vdpa_call(dev, VHOST_GET_VRING_BASE, ring);
    trace_vhost_vdpa_get_vring_base(dev, ring->index, ring->num);
    return ret;
}

static int vhost_vdpa_set_vring_kick(struct vhost_dev *dev,
                                     struct vhost_vring_file *file)
{
    struct vhost_vdpa *v = dev->opaque;
    int vdpa_idx = file->index - dev->vq_index;

    if (v->shadow_vqs_enabled) {
        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx);
        vhost_svq_set_svq_kick_fd(svq, file->fd);
        return 0;
    } else {
        return vhost_vdpa_set_vring_dev_kick(dev, file);
    }
}

static int vhost_vdpa_set_vring_call(struct vhost_dev *dev,
                                     struct vhost_vring_file *file)
{
    struct vhost_vdpa *v = dev->opaque;

    if (v->shadow_vqs_enabled) {
        int vdpa_idx = file->index - dev->vq_index;
        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx);

        vhost_svq_set_svq_call_fd(svq, file->fd);
        return 0;
    } else {
        return vhost_vdpa_set_vring_dev_call(dev, file);
    }
}

static int vhost_vdpa_get_features(struct vhost_dev *dev,
                                   uint64_t *features)
{
    struct vhost_vdpa *v = dev->opaque;
    int ret = vhost_vdpa_get_dev_features(dev, features);

    if (ret == 0 && v->shadow_vqs_enabled) {
        /* Add SVQ logging capabilities */
        *features |= BIT_ULL(VHOST_F_LOG_ALL);
    }

    return ret;
}

static int vhost_vdpa_set_owner(struct vhost_dev *dev)
{
    if (vhost_vdpa_one_time_request(dev)) {
        return 0;
    }

    trace_vhost_vdpa_set_owner(dev);
    return vhost_vdpa_call(dev, VHOST_SET_OWNER, NULL);
}

static int vhost_vdpa_vq_get_addr(struct vhost_dev *dev,
                                  struct vhost_vring_addr *addr,
                                  struct vhost_virtqueue *vq)
{
    assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
    addr->desc_user_addr = (uint64_t)(unsigned long)vq->desc_phys;
    addr->avail_user_addr = (uint64_t)(unsigned long)vq->avail_phys;
    addr->used_user_addr = (uint64_t)(unsigned long)vq->used_phys;
    trace_vhost_vdpa_vq_get_addr(dev, vq, addr->desc_user_addr,
                                 addr->avail_user_addr, addr->used_user_addr);
    return 0;
}

static bool vhost_vdpa_force_iommu(struct vhost_dev *dev)
{
    return true;
}
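/* vhost backend ops for the VHOST_BACKEND_TYPE_VDPA backend */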
const VhostOps vdpa_ops = {
        .backend_type = VHOST_BACKEND_TYPE_VDPA,
        .vhost_backend_init = vhost_vdpa_init,
        .vhost_backend_cleanup = vhost_vdpa_cleanup,
        .vhost_set_log_base = vhost_vdpa_set_log_base,
        .vhost_set_vring_addr = vhost_vdpa_set_vring_addr,
        .vhost_set_vring_num = vhost_vdpa_set_vring_num,
        .vhost_set_vring_base = vhost_vdpa_set_vring_base,
        .vhost_get_vring_base = vhost_vdpa_get_vring_base,
        .vhost_set_vring_kick = vhost_vdpa_set_vring_kick,
        .vhost_set_vring_call = vhost_vdpa_set_vring_call,
        .vhost_get_features = vhost_vdpa_get_features,
        .vhost_set_backend_cap = vhost_vdpa_set_backend_cap,
        .vhost_set_owner = vhost_vdpa_set_owner,
        .vhost_set_vring_endian = NULL,
        .vhost_backend_memslots_limit = vhost_vdpa_memslots_limit,
        .vhost_set_mem_table = vhost_vdpa_set_mem_table,
        .vhost_set_features = vhost_vdpa_set_features,
        .vhost_reset_device = vhost_vdpa_reset_device,
        .vhost_get_vq_index = vhost_vdpa_get_vq_index,
        .vhost_get_config = vhost_vdpa_get_config,
        .vhost_set_config = vhost_vdpa_set_config,
        .vhost_requires_shm_log = NULL,
        .vhost_migration_done = NULL,
        .vhost_backend_can_merge = NULL,
        .vhost_net_set_mtu = NULL,
        .vhost_set_iotlb_callback = NULL,
        .vhost_send_device_iotlb_msg = NULL,
        .vhost_dev_start = vhost_vdpa_dev_start,
        .vhost_get_device_id = vhost_vdpa_get_device_id,
        .vhost_vq_get_addr = vhost_vdpa_vq_get_addr,
        .vhost_force_iommu = vhost_vdpa_force_iommu,
};