/*
 * vhost support
 *
 * Copyright Red Hat, Inc. 2010
 *
 * Authors:
 *  Michael S. Tsirkin <mst@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2. See
 * the COPYING file in the top-level directory.
 *
 * Contributions after 2012-01-13 are licensed under the terms of the
 * GNU GPL, version 2 or (at your option) any later version.
 */

#include "qemu/osdep.h"
#include "qapi/error.h"
#include "hw/virtio/vhost.h"
#include "qemu/atomic.h"
#include "qemu/range.h"
#include "qemu/error-report.h"
#include "qemu/memfd.h"
#include "qemu/log.h"
#include "standard-headers/linux/vhost_types.h"
#include "hw/virtio/virtio-bus.h"
#include "hw/mem/memory-device.h"
#include "migration/blocker.h"
#include "migration/qemu-file-types.h"
#include "system/dma.h"
#include "system/memory.h"
#include "trace.h"

/* enabled until disconnected backend stabilizes */
#define _VHOST_DEBUG 1

#ifdef _VHOST_DEBUG
#define VHOST_OPS_DEBUG(retval, fmt, ...) \
    do { \
        error_report(fmt ": %s (%d)", ## __VA_ARGS__, \
                     strerror(-retval), -retval); \
    } while (0)
#else
#define VHOST_OPS_DEBUG(retval, fmt, ...) \
    do { } while (0)
#endif

static struct vhost_log *vhost_log[VHOST_BACKEND_TYPE_MAX];
static struct vhost_log *vhost_log_shm[VHOST_BACKEND_TYPE_MAX];
static QLIST_HEAD(, vhost_dev) vhost_log_devs[VHOST_BACKEND_TYPE_MAX];

static QLIST_HEAD(, vhost_dev) vhost_devices =
    QLIST_HEAD_INITIALIZER(vhost_devices);

unsigned int vhost_get_max_memslots(void)
{
    unsigned int max = UINT_MAX;
    struct vhost_dev *hdev;

    QLIST_FOREACH(hdev, &vhost_devices, entry) {
        max = MIN(max, hdev->vhost_ops->vhost_backend_memslots_limit(hdev));
    }
    return max;
}

unsigned int vhost_get_free_memslots(void)
{
    unsigned int free = UINT_MAX;
    struct vhost_dev *hdev;

    QLIST_FOREACH(hdev, &vhost_devices, entry) {
        unsigned int r = hdev->vhost_ops->vhost_backend_memslots_limit(hdev);
        unsigned int cur_free = r - hdev->mem->nregions;

        if (unlikely(r < hdev->mem->nregions)) {
            warn_report_once("used (%u) vhost backend memory slots exceed"
                             " the device limit (%u).", hdev->mem->nregions, r);
            free = 0;
        } else {
            free = MIN(free, cur_free);
        }
    }
    return free;
}

static void vhost_dev_sync_region(struct vhost_dev *dev,
                                  MemoryRegionSection *section,
                                  uint64_t mfirst, uint64_t mlast,
                                  uint64_t rfirst, uint64_t rlast)
{
    vhost_log_chunk_t *dev_log = dev->log->log;

    uint64_t start = MAX(mfirst, rfirst);
    uint64_t end = MIN(mlast, rlast);
    vhost_log_chunk_t *from = dev_log + start / VHOST_LOG_CHUNK;
    vhost_log_chunk_t *to = dev_log + end / VHOST_LOG_CHUNK + 1;
    uint64_t addr = QEMU_ALIGN_DOWN(start, VHOST_LOG_CHUNK);

    if (end < start) {
        return;
    }
    assert(end / VHOST_LOG_CHUNK < dev->log_size);
    assert(start / VHOST_LOG_CHUNK < dev->log_size);

    for (; from < to; ++from) {
        vhost_log_chunk_t log;
        /* We first check with non-atomic: much cheaper,
         * and we expect non-dirty to be the common case. */
        if (!*from) {
            addr += VHOST_LOG_CHUNK;
            continue;
        }
        /* Data must be read atomically. We don't really need barrier semantics
         * but it's easier to use atomic_* than roll our own.
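         *
         * (Illustrative note, not authoritative: assuming VHOST_LOG_PAGE is
         * the 4 KiB dirty-log granularity and vhost_log_chunk_t is a 64-bit
         * word, bit b set in the chunk that starts at guest address 'addr'
         * marks the page at addr + b * VHOST_LOG_PAGE as written by the
         * backend; the loop below forwards each such page to
         * memory_region_set_dirty().)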
*/ 114 log = qatomic_xchg(from, 0); 115 while (log) { 116 int bit = ctzl(log); 117 hwaddr page_addr; 118 hwaddr section_offset; 119 hwaddr mr_offset; 120 page_addr = addr + bit * VHOST_LOG_PAGE; 121 section_offset = page_addr - section->offset_within_address_space; 122 mr_offset = section_offset + section->offset_within_region; 123 memory_region_set_dirty(section->mr, mr_offset, VHOST_LOG_PAGE); 124 log &= ~(0x1ull << bit); 125 } 126 addr += VHOST_LOG_CHUNK; 127 } 128 } 129 130 bool vhost_dev_has_iommu(struct vhost_dev *dev) 131 { 132 VirtIODevice *vdev = dev->vdev; 133 134 /* 135 * For vhost, VIRTIO_F_IOMMU_PLATFORM means the backend support 136 * incremental memory mapping API via IOTLB API. For platform that 137 * does not have IOMMU, there's no need to enable this feature 138 * which may cause unnecessary IOTLB miss/update transactions. 139 */ 140 if (vdev) { 141 return virtio_bus_device_iommu_enabled(vdev) && 142 virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM); 143 } else { 144 return false; 145 } 146 } 147 148 static inline bool vhost_dev_should_log(struct vhost_dev *dev) 149 { 150 assert(dev->vhost_ops); 151 assert(dev->vhost_ops->backend_type > VHOST_BACKEND_TYPE_NONE); 152 assert(dev->vhost_ops->backend_type < VHOST_BACKEND_TYPE_MAX); 153 154 return dev == QLIST_FIRST(&vhost_log_devs[dev->vhost_ops->backend_type]); 155 } 156 157 static inline void vhost_dev_elect_mem_logger(struct vhost_dev *hdev, bool add) 158 { 159 VhostBackendType backend_type; 160 161 assert(hdev->vhost_ops); 162 163 backend_type = hdev->vhost_ops->backend_type; 164 assert(backend_type > VHOST_BACKEND_TYPE_NONE); 165 assert(backend_type < VHOST_BACKEND_TYPE_MAX); 166 167 if (add && !QLIST_IS_INSERTED(hdev, logdev_entry)) { 168 if (QLIST_EMPTY(&vhost_log_devs[backend_type])) { 169 QLIST_INSERT_HEAD(&vhost_log_devs[backend_type], 170 hdev, logdev_entry); 171 } else { 172 /* 173 * The first vhost_device in the list is selected as the shared 174 * logger to scan memory sections. Put new entry next to the head 175 * to avoid inadvertent change to the underlying logger device. 176 * This is done in order to get better cache locality and to avoid 177 * performance churn on the hot path for log scanning. Even when 178 * new devices come and go quickly, it wouldn't end up changing 179 * the active leading logger device at all. 
180 */ 181 QLIST_INSERT_AFTER(QLIST_FIRST(&vhost_log_devs[backend_type]), 182 hdev, logdev_entry); 183 } 184 } else if (!add && QLIST_IS_INSERTED(hdev, logdev_entry)) { 185 QLIST_REMOVE(hdev, logdev_entry); 186 } 187 } 188 189 static int vhost_sync_dirty_bitmap(struct vhost_dev *dev, 190 MemoryRegionSection *section, 191 hwaddr first, 192 hwaddr last) 193 { 194 int i; 195 hwaddr start_addr; 196 hwaddr end_addr; 197 198 if (!dev->log_enabled || !dev->started) { 199 return 0; 200 } 201 start_addr = section->offset_within_address_space; 202 end_addr = range_get_last(start_addr, int128_get64(section->size)); 203 start_addr = MAX(first, start_addr); 204 end_addr = MIN(last, end_addr); 205 206 if (vhost_dev_should_log(dev)) { 207 for (i = 0; i < dev->mem->nregions; ++i) { 208 struct vhost_memory_region *reg = dev->mem->regions + i; 209 vhost_dev_sync_region(dev, section, start_addr, end_addr, 210 reg->guest_phys_addr, 211 range_get_last(reg->guest_phys_addr, 212 reg->memory_size)); 213 } 214 } 215 for (i = 0; i < dev->nvqs; ++i) { 216 struct vhost_virtqueue *vq = dev->vqs + i; 217 218 if (!vq->used_phys && !vq->used_size) { 219 continue; 220 } 221 222 if (vhost_dev_has_iommu(dev)) { 223 IOMMUTLBEntry iotlb; 224 hwaddr used_phys = vq->used_phys, used_size = vq->used_size; 225 hwaddr phys, s, offset; 226 227 while (used_size) { 228 rcu_read_lock(); 229 iotlb = address_space_get_iotlb_entry(dev->vdev->dma_as, 230 used_phys, 231 true, 232 MEMTXATTRS_UNSPECIFIED); 233 rcu_read_unlock(); 234 235 if (!iotlb.target_as) { 236 qemu_log_mask(LOG_GUEST_ERROR, "translation " 237 "failure for used_iova %"PRIx64"\n", 238 used_phys); 239 return -EINVAL; 240 } 241 242 offset = used_phys & iotlb.addr_mask; 243 phys = iotlb.translated_addr + offset; 244 245 /* 246 * Distance from start of used ring until last byte of 247 * IOMMU page. 248 */ 249 s = iotlb.addr_mask - offset; 250 /* 251 * Size of used ring, or of the part of it until end 252 * of IOMMU page. To avoid zero result, do the adding 253 * outside of MIN(). 
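             * (Worked example, illustrative only: with 4 KiB IOMMU pages
             * (addr_mask == 0xfff), used_phys == 0x1010 and
             * used_size == 0x2000, offset == 0x10 and
             * s == MIN(0xfef, 0x1fff) + 1 == 0xff0, so the first iteration
             * syncs exactly up to the end of that IOMMU page and the next
             * iteration continues at used_phys + 0xff0.)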
254 */ 255 s = MIN(s, used_size - 1) + 1; 256 257 vhost_dev_sync_region(dev, section, start_addr, end_addr, phys, 258 range_get_last(phys, s)); 259 used_size -= s; 260 used_phys += s; 261 } 262 } else { 263 vhost_dev_sync_region(dev, section, start_addr, 264 end_addr, vq->used_phys, 265 range_get_last(vq->used_phys, vq->used_size)); 266 } 267 } 268 return 0; 269 } 270 271 static void vhost_log_sync(MemoryListener *listener, 272 MemoryRegionSection *section) 273 { 274 struct vhost_dev *dev = container_of(listener, struct vhost_dev, 275 memory_listener); 276 vhost_sync_dirty_bitmap(dev, section, 0x0, ~0x0ULL); 277 } 278 279 static void vhost_log_sync_range(struct vhost_dev *dev, 280 hwaddr first, hwaddr last) 281 { 282 int i; 283 /* FIXME: this is N^2 in number of sections */ 284 for (i = 0; i < dev->n_mem_sections; ++i) { 285 MemoryRegionSection *section = &dev->mem_sections[i]; 286 vhost_sync_dirty_bitmap(dev, section, first, last); 287 } 288 } 289 290 static uint64_t vhost_get_log_size(struct vhost_dev *dev) 291 { 292 uint64_t log_size = 0; 293 int i; 294 for (i = 0; i < dev->mem->nregions; ++i) { 295 struct vhost_memory_region *reg = dev->mem->regions + i; 296 uint64_t last = range_get_last(reg->guest_phys_addr, 297 reg->memory_size); 298 log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1); 299 } 300 return log_size; 301 } 302 303 static int vhost_set_backend_type(struct vhost_dev *dev, 304 VhostBackendType backend_type) 305 { 306 int r = 0; 307 308 switch (backend_type) { 309 #ifdef CONFIG_VHOST_KERNEL 310 case VHOST_BACKEND_TYPE_KERNEL: 311 dev->vhost_ops = &kernel_ops; 312 break; 313 #endif 314 #ifdef CONFIG_VHOST_USER 315 case VHOST_BACKEND_TYPE_USER: 316 dev->vhost_ops = &user_ops; 317 break; 318 #endif 319 #ifdef CONFIG_VHOST_VDPA 320 case VHOST_BACKEND_TYPE_VDPA: 321 dev->vhost_ops = &vdpa_ops; 322 break; 323 #endif 324 default: 325 error_report("Unknown vhost backend type"); 326 r = -1; 327 } 328 329 if (r == 0) { 330 assert(dev->vhost_ops->backend_type == backend_type); 331 } 332 333 return r; 334 } 335 336 static struct vhost_log *vhost_log_alloc(uint64_t size, bool share) 337 { 338 Error *err = NULL; 339 struct vhost_log *log; 340 uint64_t logsize = size * sizeof(*(log->log)); 341 int fd = -1; 342 343 log = g_new0(struct vhost_log, 1); 344 if (share) { 345 log->log = qemu_memfd_alloc("vhost-log", logsize, 346 F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL, 347 &fd, &err); 348 if (err) { 349 error_report_err(err); 350 g_free(log); 351 return NULL; 352 } 353 memset(log->log, 0, logsize); 354 } else { 355 log->log = g_malloc0(logsize); 356 } 357 358 log->size = size; 359 log->refcnt = 1; 360 log->fd = fd; 361 362 return log; 363 } 364 365 static struct vhost_log *vhost_log_get(VhostBackendType backend_type, 366 uint64_t size, bool share) 367 { 368 struct vhost_log *log; 369 370 assert(backend_type > VHOST_BACKEND_TYPE_NONE); 371 assert(backend_type < VHOST_BACKEND_TYPE_MAX); 372 373 log = share ? 
vhost_log_shm[backend_type] : vhost_log[backend_type]; 374 375 if (!log || log->size != size) { 376 log = vhost_log_alloc(size, share); 377 if (share) { 378 vhost_log_shm[backend_type] = log; 379 } else { 380 vhost_log[backend_type] = log; 381 } 382 } else { 383 ++log->refcnt; 384 } 385 386 return log; 387 } 388 389 static void vhost_log_put(struct vhost_dev *dev, bool sync) 390 { 391 struct vhost_log *log = dev->log; 392 VhostBackendType backend_type; 393 394 if (!log) { 395 return; 396 } 397 398 assert(dev->vhost_ops); 399 backend_type = dev->vhost_ops->backend_type; 400 401 if (backend_type == VHOST_BACKEND_TYPE_NONE || 402 backend_type >= VHOST_BACKEND_TYPE_MAX) { 403 return; 404 } 405 406 --log->refcnt; 407 if (log->refcnt == 0) { 408 /* Sync only the range covered by the old log */ 409 if (dev->log_size && sync) { 410 vhost_log_sync_range(dev, 0, dev->log_size * VHOST_LOG_CHUNK - 1); 411 } 412 413 if (vhost_log[backend_type] == log) { 414 g_free(log->log); 415 vhost_log[backend_type] = NULL; 416 } else if (vhost_log_shm[backend_type] == log) { 417 qemu_memfd_free(log->log, log->size * sizeof(*(log->log)), 418 log->fd); 419 vhost_log_shm[backend_type] = NULL; 420 } 421 422 g_free(log); 423 } 424 425 vhost_dev_elect_mem_logger(dev, false); 426 dev->log = NULL; 427 dev->log_size = 0; 428 } 429 430 static bool vhost_dev_log_is_shared(struct vhost_dev *dev) 431 { 432 return dev->vhost_ops->vhost_requires_shm_log && 433 dev->vhost_ops->vhost_requires_shm_log(dev); 434 } 435 436 static inline void vhost_dev_log_resize(struct vhost_dev *dev, uint64_t size) 437 { 438 struct vhost_log *log = vhost_log_get(dev->vhost_ops->backend_type, 439 size, vhost_dev_log_is_shared(dev)); 440 uint64_t log_base = (uintptr_t)log->log; 441 int r; 442 443 /* inform backend of log switching, this must be done before 444 releasing the current log, to ensure no logging is lost */ 445 r = dev->vhost_ops->vhost_set_log_base(dev, log_base, log); 446 if (r < 0) { 447 VHOST_OPS_DEBUG(r, "vhost_set_log_base failed"); 448 } 449 450 vhost_log_put(dev, true); 451 dev->log = log; 452 dev->log_size = size; 453 } 454 455 static void *vhost_memory_map(struct vhost_dev *dev, hwaddr addr, 456 hwaddr *plen, bool is_write) 457 { 458 if (!vhost_dev_has_iommu(dev)) { 459 return address_space_map(dev->vdev->dma_as, addr, plen, is_write, 460 MEMTXATTRS_UNSPECIFIED); 461 } else { 462 return (void *)(uintptr_t)addr; 463 } 464 } 465 466 static void vhost_memory_unmap(struct vhost_dev *dev, void *buffer, 467 hwaddr len, int is_write, 468 hwaddr access_len) 469 { 470 if (!vhost_dev_has_iommu(dev)) { 471 address_space_unmap(dev->vdev->dma_as, buffer, len, is_write, 472 access_len); 473 } 474 } 475 476 static int vhost_verify_ring_part_mapping(void *ring_hva, 477 uint64_t ring_gpa, 478 uint64_t ring_size, 479 void *reg_hva, 480 uint64_t reg_gpa, 481 uint64_t reg_size) 482 { 483 uint64_t hva_ring_offset; 484 uint64_t ring_last = range_get_last(ring_gpa, ring_size); 485 uint64_t reg_last = range_get_last(reg_gpa, reg_size); 486 487 if (ring_last < reg_gpa || ring_gpa > reg_last) { 488 return 0; 489 } 490 /* check that whole ring's is mapped */ 491 if (ring_last > reg_last) { 492 return -ENOMEM; 493 } 494 /* check that ring's MemoryRegion wasn't replaced */ 495 hva_ring_offset = ring_gpa - reg_gpa; 496 if (ring_hva != reg_hva + hva_ring_offset) { 497 return -EBUSY; 498 } 499 500 return 0; 501 } 502 503 static int vhost_verify_ring_mappings(struct vhost_dev *dev, 504 void *reg_hva, 505 uint64_t reg_gpa, 506 uint64_t reg_size) 507 { 508 int i, j; 
509 int r = 0; 510 const char *part_name[] = { 511 "descriptor table", 512 "available ring", 513 "used ring" 514 }; 515 516 if (vhost_dev_has_iommu(dev)) { 517 return 0; 518 } 519 520 for (i = 0; i < dev->nvqs; ++i) { 521 struct vhost_virtqueue *vq = dev->vqs + i; 522 523 if (vq->desc_phys == 0) { 524 continue; 525 } 526 527 j = 0; 528 r = vhost_verify_ring_part_mapping( 529 vq->desc, vq->desc_phys, vq->desc_size, 530 reg_hva, reg_gpa, reg_size); 531 if (r) { 532 break; 533 } 534 535 j++; 536 r = vhost_verify_ring_part_mapping( 537 vq->avail, vq->avail_phys, vq->avail_size, 538 reg_hva, reg_gpa, reg_size); 539 if (r) { 540 break; 541 } 542 543 j++; 544 r = vhost_verify_ring_part_mapping( 545 vq->used, vq->used_phys, vq->used_size, 546 reg_hva, reg_gpa, reg_size); 547 if (r) { 548 break; 549 } 550 } 551 552 if (r == -ENOMEM) { 553 error_report("Unable to map %s for ring %d", part_name[j], i); 554 } else if (r == -EBUSY) { 555 error_report("%s relocated for ring %d", part_name[j], i); 556 } 557 return r; 558 } 559 560 /* 561 * vhost_section: identify sections needed for vhost access 562 * 563 * We only care about RAM sections here (where virtqueue and guest 564 * internals accessed by virtio might live). 565 */ 566 static bool vhost_section(struct vhost_dev *dev, MemoryRegionSection *section) 567 { 568 MemoryRegion *mr = section->mr; 569 570 if (memory_region_is_ram(mr) && !memory_region_is_rom(mr)) { 571 uint8_t dirty_mask = memory_region_get_dirty_log_mask(mr); 572 uint8_t handled_dirty; 573 574 /* 575 * Kernel based vhost doesn't handle any block which is doing 576 * dirty-tracking other than migration for which it has 577 * specific logging support. However for TCG the kernel never 578 * gets involved anyway so we can also ignore it's 579 * self-modiying code detection flags. However a vhost-user 580 * client could still confuse a TCG guest if it re-writes 581 * executable memory that has already been translated. 582 */ 583 handled_dirty = (1 << DIRTY_MEMORY_MIGRATION) | 584 (1 << DIRTY_MEMORY_CODE); 585 586 if (dirty_mask & ~handled_dirty) { 587 trace_vhost_reject_section(mr->name, 1); 588 return false; 589 } 590 591 /* 592 * Some backends (like vhost-user) can only handle memory regions 593 * that have an fd (can be mapped into a different process). Filter 594 * the ones without an fd out, if requested. 595 * 596 * TODO: we might have to limit to MAP_SHARED as well. 597 */ 598 if (memory_region_get_fd(section->mr) < 0 && 599 dev->vhost_ops->vhost_backend_no_private_memslots && 600 dev->vhost_ops->vhost_backend_no_private_memslots(dev)) { 601 trace_vhost_reject_section(mr->name, 2); 602 return false; 603 } 604 605 trace_vhost_section(mr->name); 606 return true; 607 } else { 608 trace_vhost_reject_section(mr->name, 3); 609 return false; 610 } 611 } 612 613 static void vhost_begin(MemoryListener *listener) 614 { 615 struct vhost_dev *dev = container_of(listener, struct vhost_dev, 616 memory_listener); 617 dev->tmp_sections = NULL; 618 dev->n_tmp_sections = 0; 619 } 620 621 static void vhost_commit(MemoryListener *listener) 622 { 623 struct vhost_dev *dev = container_of(listener, struct vhost_dev, 624 memory_listener); 625 MemoryRegionSection *old_sections; 626 int n_old_sections; 627 uint64_t log_size; 628 size_t regions_size; 629 int r; 630 int i; 631 bool changed = false; 632 633 /* Note we can be called before the device is started, but then 634 * starting the device calls set_mem_table, so we need to have 635 * built the data structures. 
 */
    old_sections = dev->mem_sections;
    n_old_sections = dev->n_mem_sections;
    dev->mem_sections = dev->tmp_sections;
    dev->n_mem_sections = dev->n_tmp_sections;

    if (dev->n_mem_sections != n_old_sections) {
        changed = true;
    } else {
        /* Same size, let's check the contents */
        for (i = 0; i < n_old_sections; i++) {
            if (!MemoryRegionSection_eq(&old_sections[i],
                                        &dev->mem_sections[i])) {
                changed = true;
                break;
            }
        }
    }

    trace_vhost_commit(dev->started, changed);
    if (!changed) {
        goto out;
    }

    /* Rebuild the regions list from the new sections list */
    regions_size = offsetof(struct vhost_memory, regions) +
                       dev->n_mem_sections * sizeof dev->mem->regions[0];
    dev->mem = g_realloc(dev->mem, regions_size);
    dev->mem->nregions = dev->n_mem_sections;

    for (i = 0; i < dev->n_mem_sections; i++) {
        struct vhost_memory_region *cur_vmr = dev->mem->regions + i;
        struct MemoryRegionSection *mrs = dev->mem_sections + i;

        cur_vmr->guest_phys_addr = mrs->offset_within_address_space;
        cur_vmr->memory_size = int128_get64(mrs->size);
        cur_vmr->userspace_addr =
            (uintptr_t)memory_region_get_ram_ptr(mrs->mr) +
            mrs->offset_within_region;
        cur_vmr->flags_padding = 0;
    }

    if (!dev->started) {
        goto out;
    }

    for (i = 0; i < dev->mem->nregions; i++) {
        if (vhost_verify_ring_mappings(dev,
                       (void *)(uintptr_t)dev->mem->regions[i].userspace_addr,
                       dev->mem->regions[i].guest_phys_addr,
                       dev->mem->regions[i].memory_size)) {
            error_report("Verify ring failure on region %d", i);
            abort();
        }
    }

    if (!dev->log_enabled) {
        r = dev->vhost_ops->vhost_set_mem_table(dev, dev->mem);
        if (r < 0) {
            VHOST_OPS_DEBUG(r, "vhost_set_mem_table failed");
        }
        goto out;
    }
    log_size = vhost_get_log_size(dev);
    /* We allocate an extra 4K bytes to log,
     * to reduce the number of reallocations. */
#define VHOST_LOG_BUFFER (0x1000 / sizeof *dev->log)
    /* To log more, must increase log size before table update. */
    if (dev->log_size < log_size) {
        vhost_dev_log_resize(dev, log_size + VHOST_LOG_BUFFER);
    }
    r = dev->vhost_ops->vhost_set_mem_table(dev, dev->mem);
    if (r < 0) {
        VHOST_OPS_DEBUG(r, "vhost_set_mem_table failed");
    }
    /* To log less, can only decrease log size after table update. */
    if (dev->log_size > log_size + VHOST_LOG_BUFFER) {
        vhost_dev_log_resize(dev, log_size);
    }

out:
    /* Deref the old list of sections, this must happen _after_ the
     * vhost_set_mem_table to ensure the client isn't still using the
     * section we're about to unref.
     */
    while (n_old_sections--) {
        memory_region_unref(old_sections[n_old_sections].mr);
    }
    g_free(old_sections);
}

/* Adds the section data to the tmp_section structure.
 * It relies on the listener calling us in memory address order
 * and for each region (via the _add and _nop methods) to
 * join neighbours.
 */
static void vhost_region_add_section(struct vhost_dev *dev,
                                     MemoryRegionSection *section)
{
    bool need_add = true;
    uint64_t mrs_size = int128_get64(section->size);
    uint64_t mrs_gpa = section->offset_within_address_space;
    uintptr_t mrs_host = (uintptr_t)memory_region_get_ram_ptr(section->mr) +
                         section->offset_within_region;
    RAMBlock *mrs_rb = section->mr->ram_block;

    trace_vhost_region_add_section(section->mr->name, mrs_gpa, mrs_size,
                                   mrs_host);

    if (dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER) {
        /* Round the section to its page size */
        /* First align the start down to a page boundary */
        size_t mrs_page = qemu_ram_pagesize(mrs_rb);
        uint64_t alignage = mrs_host & (mrs_page - 1);
        if (alignage) {
            mrs_host -= alignage;
            mrs_size += alignage;
            mrs_gpa -= alignage;
        }
        /* Now align the size up to a page boundary */
        alignage = mrs_size & (mrs_page - 1);
        if (alignage) {
            mrs_size += mrs_page - alignage;
        }
        trace_vhost_region_add_section_aligned(section->mr->name, mrs_gpa,
                                               mrs_size, mrs_host);
    }

    if (dev->n_tmp_sections && !section->unmergeable) {
        /* Since we already have at least one section, let's see if
         * this extends it; since we're scanning in order, we only
         * have to look at the last one, and the FlatView that calls
         * us shouldn't have overlaps.
         */
        MemoryRegionSection *prev_sec = dev->tmp_sections +
            (dev->n_tmp_sections - 1);
        uint64_t prev_gpa_start = prev_sec->offset_within_address_space;
        uint64_t prev_size = int128_get64(prev_sec->size);
        uint64_t prev_gpa_end = range_get_last(prev_gpa_start, prev_size);
        uint64_t prev_host_start =
            (uintptr_t)memory_region_get_ram_ptr(prev_sec->mr) +
            prev_sec->offset_within_region;
        uint64_t prev_host_end = range_get_last(prev_host_start, prev_size);

        if (mrs_gpa <= (prev_gpa_end + 1)) {
            /* OK, looks like overlapping/intersecting - it's possible that
             * the rounding to page sizes has made them overlap, but they
             * should match up in the same RAMBlock if they do.
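             *
             * (Illustrative example: if the previous section covered GPA
             * 0x0..0xfff at host address H and the new one covers GPA
             * 0x1000..0x1fff at H + 0x1000 within the same MemoryRegion,
             * the two are folded into a single 0x2000-byte section below.)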
784 */ 785 if (mrs_gpa < prev_gpa_start) { 786 error_report("%s:Section '%s' rounded to %"PRIx64 787 " prior to previous '%s' %"PRIx64, 788 __func__, section->mr->name, mrs_gpa, 789 prev_sec->mr->name, prev_gpa_start); 790 /* A way to cleanly fail here would be better */ 791 return; 792 } 793 /* Offset from the start of the previous GPA to this GPA */ 794 size_t offset = mrs_gpa - prev_gpa_start; 795 796 if (prev_host_start + offset == mrs_host && 797 section->mr == prev_sec->mr && !prev_sec->unmergeable) { 798 uint64_t max_end = MAX(prev_host_end, mrs_host + mrs_size); 799 need_add = false; 800 prev_sec->offset_within_address_space = 801 MIN(prev_gpa_start, mrs_gpa); 802 prev_sec->offset_within_region = 803 MIN(prev_host_start, mrs_host) - 804 (uintptr_t)memory_region_get_ram_ptr(prev_sec->mr); 805 prev_sec->size = int128_make64(max_end - MIN(prev_host_start, 806 mrs_host)); 807 trace_vhost_region_add_section_merge(section->mr->name, 808 int128_get64(prev_sec->size), 809 prev_sec->offset_within_address_space, 810 prev_sec->offset_within_region); 811 } else { 812 /* adjoining regions are fine, but overlapping ones with 813 * different blocks/offsets shouldn't happen 814 */ 815 if (mrs_gpa != prev_gpa_end + 1) { 816 error_report("%s: Overlapping but not coherent sections " 817 "at %"PRIx64, 818 __func__, mrs_gpa); 819 return; 820 } 821 } 822 } 823 } 824 825 if (need_add) { 826 ++dev->n_tmp_sections; 827 dev->tmp_sections = g_renew(MemoryRegionSection, dev->tmp_sections, 828 dev->n_tmp_sections); 829 dev->tmp_sections[dev->n_tmp_sections - 1] = *section; 830 /* The flatview isn't stable and we don't use it, making it NULL 831 * means we can memcmp the list. 832 */ 833 dev->tmp_sections[dev->n_tmp_sections - 1].fv = NULL; 834 memory_region_ref(section->mr); 835 } 836 } 837 838 /* Used for both add and nop callbacks */ 839 static void vhost_region_addnop(MemoryListener *listener, 840 MemoryRegionSection *section) 841 { 842 struct vhost_dev *dev = container_of(listener, struct vhost_dev, 843 memory_listener); 844 845 if (!vhost_section(dev, section)) { 846 return; 847 } 848 vhost_region_add_section(dev, section); 849 } 850 851 static void vhost_iommu_unmap_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) 852 { 853 struct vhost_iommu *iommu = container_of(n, struct vhost_iommu, n); 854 struct vhost_dev *hdev = iommu->hdev; 855 hwaddr iova = iotlb->iova + iommu->iommu_offset; 856 857 if (vhost_backend_invalidate_device_iotlb(hdev, iova, 858 iotlb->addr_mask + 1)) { 859 error_report("Fail to invalidate device iotlb"); 860 } 861 } 862 863 static void vhost_iommu_region_add(MemoryListener *listener, 864 MemoryRegionSection *section) 865 { 866 struct vhost_dev *dev = container_of(listener, struct vhost_dev, 867 iommu_listener); 868 struct vhost_iommu *iommu; 869 Int128 end; 870 int iommu_idx; 871 IOMMUMemoryRegion *iommu_mr; 872 873 if (!memory_region_is_iommu(section->mr)) { 874 return; 875 } 876 877 iommu_mr = IOMMU_MEMORY_REGION(section->mr); 878 879 iommu = g_malloc0(sizeof(*iommu)); 880 end = int128_add(int128_make64(section->offset_within_region), 881 section->size); 882 end = int128_sub(end, int128_one()); 883 iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr, 884 MEMTXATTRS_UNSPECIFIED); 885 iommu_notifier_init(&iommu->n, vhost_iommu_unmap_notify, 886 dev->vdev->device_iotlb_enabled ? 
887 IOMMU_NOTIFIER_DEVIOTLB_UNMAP : 888 IOMMU_NOTIFIER_UNMAP, 889 section->offset_within_region, 890 int128_get64(end), 891 iommu_idx); 892 iommu->mr = section->mr; 893 iommu->iommu_offset = section->offset_within_address_space - 894 section->offset_within_region; 895 iommu->hdev = dev; 896 memory_region_register_iommu_notifier(section->mr, &iommu->n, 897 &error_fatal); 898 QLIST_INSERT_HEAD(&dev->iommu_list, iommu, iommu_next); 899 /* TODO: can replay help performance here? */ 900 } 901 902 static void vhost_iommu_region_del(MemoryListener *listener, 903 MemoryRegionSection *section) 904 { 905 struct vhost_dev *dev = container_of(listener, struct vhost_dev, 906 iommu_listener); 907 struct vhost_iommu *iommu; 908 909 if (!memory_region_is_iommu(section->mr)) { 910 return; 911 } 912 913 QLIST_FOREACH(iommu, &dev->iommu_list, iommu_next) { 914 if (iommu->mr == section->mr && 915 iommu->n.start == section->offset_within_region) { 916 memory_region_unregister_iommu_notifier(iommu->mr, 917 &iommu->n); 918 QLIST_REMOVE(iommu, iommu_next); 919 g_free(iommu); 920 break; 921 } 922 } 923 } 924 925 void vhost_toggle_device_iotlb(VirtIODevice *vdev) 926 { 927 VirtioDeviceClass *vdc = VIRTIO_DEVICE_GET_CLASS(vdev); 928 struct vhost_dev *dev; 929 struct vhost_iommu *iommu; 930 931 if (vdev->vhost_started) { 932 dev = vdc->get_vhost(vdev); 933 } else { 934 return; 935 } 936 937 QLIST_FOREACH(iommu, &dev->iommu_list, iommu_next) { 938 memory_region_unregister_iommu_notifier(iommu->mr, &iommu->n); 939 iommu->n.notifier_flags = vdev->device_iotlb_enabled ? 940 IOMMU_NOTIFIER_DEVIOTLB_UNMAP : IOMMU_NOTIFIER_UNMAP; 941 memory_region_register_iommu_notifier(iommu->mr, &iommu->n, 942 &error_fatal); 943 } 944 } 945 946 static int vhost_virtqueue_set_addr(struct vhost_dev *dev, 947 struct vhost_virtqueue *vq, 948 unsigned idx, bool enable_log) 949 { 950 struct vhost_vring_addr addr; 951 int r; 952 memset(&addr, 0, sizeof(struct vhost_vring_addr)); 953 954 if (dev->vhost_ops->vhost_vq_get_addr) { 955 r = dev->vhost_ops->vhost_vq_get_addr(dev, &addr, vq); 956 if (r < 0) { 957 VHOST_OPS_DEBUG(r, "vhost_vq_get_addr failed"); 958 return r; 959 } 960 } else { 961 addr.desc_user_addr = (uint64_t)(unsigned long)vq->desc; 962 addr.avail_user_addr = (uint64_t)(unsigned long)vq->avail; 963 addr.used_user_addr = (uint64_t)(unsigned long)vq->used; 964 } 965 addr.index = idx; 966 addr.log_guest_addr = vq->used_phys; 967 addr.flags = enable_log ? 
(1 << VHOST_VRING_F_LOG) : 0; 968 r = dev->vhost_ops->vhost_set_vring_addr(dev, &addr); 969 if (r < 0) { 970 VHOST_OPS_DEBUG(r, "vhost_set_vring_addr failed"); 971 } 972 return r; 973 } 974 975 static int vhost_dev_set_features(struct vhost_dev *dev, 976 bool enable_log) 977 { 978 uint64_t features[VIRTIO_FEATURES_NU64S]; 979 int r; 980 981 virtio_features_copy(features, dev->acked_features_ex); 982 if (enable_log) { 983 virtio_add_feature_ex(features, VHOST_F_LOG_ALL); 984 } 985 if (!vhost_dev_has_iommu(dev)) { 986 virtio_clear_feature_ex(features, VIRTIO_F_IOMMU_PLATFORM); 987 } 988 if (dev->vhost_ops->vhost_force_iommu) { 989 if (dev->vhost_ops->vhost_force_iommu(dev) == true) { 990 virtio_add_feature_ex(features, VIRTIO_F_IOMMU_PLATFORM); 991 } 992 } 993 994 if (virtio_features_use_ex(features) && 995 !dev->vhost_ops->vhost_set_features_ex) { 996 r = -EINVAL; 997 VHOST_OPS_DEBUG(r, "extended features without device support"); 998 goto out; 999 } 1000 1001 if (dev->vhost_ops->vhost_set_features_ex) { 1002 r = dev->vhost_ops->vhost_set_features_ex(dev, features); 1003 } else { 1004 r = dev->vhost_ops->vhost_set_features(dev, features[0]); 1005 } 1006 if (r < 0) { 1007 VHOST_OPS_DEBUG(r, "vhost_set_features failed"); 1008 goto out; 1009 } 1010 if (dev->vhost_ops->vhost_set_backend_cap) { 1011 r = dev->vhost_ops->vhost_set_backend_cap(dev); 1012 if (r < 0) { 1013 VHOST_OPS_DEBUG(r, "vhost_set_backend_cap failed"); 1014 goto out; 1015 } 1016 } 1017 1018 out: 1019 return r; 1020 } 1021 1022 static int vhost_dev_set_log(struct vhost_dev *dev, bool enable_log) 1023 { 1024 int r, i, idx; 1025 hwaddr addr; 1026 1027 r = vhost_dev_set_features(dev, enable_log); 1028 if (r < 0) { 1029 goto err_features; 1030 } 1031 for (i = 0; i < dev->nvqs; ++i) { 1032 idx = dev->vhost_ops->vhost_get_vq_index(dev, dev->vq_index + i); 1033 addr = virtio_queue_get_desc_addr(dev->vdev, idx); 1034 if (!addr) { 1035 /* 1036 * The queue might not be ready for start. If this 1037 * is the case there is no reason to continue the process. 1038 * The similar logic is used by the vhost_virtqueue_start() 1039 * routine. 1040 */ 1041 continue; 1042 } 1043 r = vhost_virtqueue_set_addr(dev, dev->vqs + i, idx, 1044 enable_log); 1045 if (r < 0) { 1046 goto err_vq; 1047 } 1048 } 1049 1050 /* 1051 * At log start we select our vhost_device logger that will scan the 1052 * memory sections and skip for the others. This is possible because 1053 * the log is shared amongst all vhost devices for a given type of 1054 * backend. 
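     *
     * (Illustrative: with two vhost devices of the same backend type,
     * only the elected list head walks the memory sections in
     * vhost_log_sync(); the other device still syncs its own used rings.)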
1055 */ 1056 vhost_dev_elect_mem_logger(dev, enable_log); 1057 1058 return 0; 1059 err_vq: 1060 for (; i >= 0; --i) { 1061 idx = dev->vhost_ops->vhost_get_vq_index(dev, dev->vq_index + i); 1062 addr = virtio_queue_get_desc_addr(dev->vdev, idx); 1063 if (!addr) { 1064 continue; 1065 } 1066 vhost_virtqueue_set_addr(dev, dev->vqs + i, idx, 1067 dev->log_enabled); 1068 } 1069 vhost_dev_set_features(dev, dev->log_enabled); 1070 err_features: 1071 return r; 1072 } 1073 1074 static int vhost_migration_log(MemoryListener *listener, bool enable) 1075 { 1076 struct vhost_dev *dev = container_of(listener, struct vhost_dev, 1077 memory_listener); 1078 int r; 1079 if (enable == dev->log_enabled) { 1080 return 0; 1081 } 1082 if (!dev->started) { 1083 dev->log_enabled = enable; 1084 return 0; 1085 } 1086 1087 r = 0; 1088 if (!enable) { 1089 r = vhost_dev_set_log(dev, false); 1090 if (r < 0) { 1091 goto check_dev_state; 1092 } 1093 vhost_log_put(dev, false); 1094 } else { 1095 vhost_dev_log_resize(dev, vhost_get_log_size(dev)); 1096 r = vhost_dev_set_log(dev, true); 1097 if (r < 0) { 1098 goto check_dev_state; 1099 } 1100 } 1101 1102 check_dev_state: 1103 dev->log_enabled = enable; 1104 /* 1105 * vhost-user-* devices could change their state during log 1106 * initialization due to disconnect. So check dev state after 1107 * vhost communication. 1108 */ 1109 if (!dev->started) { 1110 /* 1111 * Since device is in the stopped state, it is okay for 1112 * migration. Return success. 1113 */ 1114 r = 0; 1115 } 1116 if (r) { 1117 /* An error occurred. */ 1118 dev->log_enabled = false; 1119 } 1120 1121 return r; 1122 } 1123 1124 static bool vhost_log_global_start(MemoryListener *listener, Error **errp) 1125 { 1126 int r; 1127 1128 r = vhost_migration_log(listener, true); 1129 if (r < 0) { 1130 error_setg_errno(errp, -r, "vhost: Failed to start logging"); 1131 return false; 1132 } 1133 return true; 1134 } 1135 1136 static void vhost_log_global_stop(MemoryListener *listener) 1137 { 1138 int r; 1139 1140 r = vhost_migration_log(listener, false); 1141 if (r < 0) { 1142 /* Not fatal, so report it, but take no further action */ 1143 warn_report("vhost: Failed to stop logging"); 1144 } 1145 } 1146 1147 static void vhost_log_start(MemoryListener *listener, 1148 MemoryRegionSection *section, 1149 int old, int new) 1150 { 1151 /* FIXME: implement */ 1152 } 1153 1154 static void vhost_log_stop(MemoryListener *listener, 1155 MemoryRegionSection *section, 1156 int old, int new) 1157 { 1158 /* FIXME: implement */ 1159 } 1160 1161 /* The vhost driver natively knows how to handle the vrings of non 1162 * cross-endian legacy devices and modern devices. Only legacy devices 1163 * exposed to a bi-endian guest may require the vhost driver to use a 1164 * specific endianness. 
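 * (For example, a legacy pre-VIRTIO_F_VERSION_1 device driven by a
 * big-endian guest on a little-endian host.)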
1165 */ 1166 static inline bool vhost_needs_vring_endian(VirtIODevice *vdev) 1167 { 1168 if (virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) { 1169 return false; 1170 } 1171 #if HOST_BIG_ENDIAN 1172 return vdev->device_endian == VIRTIO_DEVICE_ENDIAN_LITTLE; 1173 #else 1174 return vdev->device_endian == VIRTIO_DEVICE_ENDIAN_BIG; 1175 #endif 1176 } 1177 1178 static int vhost_virtqueue_set_vring_endian_legacy(struct vhost_dev *dev, 1179 bool is_big_endian, 1180 int vhost_vq_index) 1181 { 1182 int r; 1183 struct vhost_vring_state s = { 1184 .index = vhost_vq_index, 1185 .num = is_big_endian 1186 }; 1187 1188 r = dev->vhost_ops->vhost_set_vring_endian(dev, &s); 1189 if (r < 0) { 1190 VHOST_OPS_DEBUG(r, "vhost_set_vring_endian failed"); 1191 } 1192 return r; 1193 } 1194 1195 static int vhost_memory_region_lookup(struct vhost_dev *hdev, 1196 uint64_t gpa, uint64_t *uaddr, 1197 uint64_t *len) 1198 { 1199 int i; 1200 1201 for (i = 0; i < hdev->mem->nregions; i++) { 1202 struct vhost_memory_region *reg = hdev->mem->regions + i; 1203 1204 if (gpa >= reg->guest_phys_addr && 1205 reg->guest_phys_addr + reg->memory_size > gpa) { 1206 *uaddr = reg->userspace_addr + gpa - reg->guest_phys_addr; 1207 *len = reg->guest_phys_addr + reg->memory_size - gpa; 1208 return 0; 1209 } 1210 } 1211 1212 return -EFAULT; 1213 } 1214 1215 int vhost_device_iotlb_miss(struct vhost_dev *dev, uint64_t iova, int write) 1216 { 1217 IOMMUTLBEntry iotlb; 1218 uint64_t uaddr, len; 1219 int ret = -EFAULT; 1220 1221 RCU_READ_LOCK_GUARD(); 1222 1223 trace_vhost_iotlb_miss(dev, 1); 1224 1225 iotlb = address_space_get_iotlb_entry(dev->vdev->dma_as, 1226 iova, write, 1227 MEMTXATTRS_UNSPECIFIED); 1228 if (iotlb.target_as != NULL) { 1229 ret = vhost_memory_region_lookup(dev, iotlb.translated_addr, 1230 &uaddr, &len); 1231 if (ret) { 1232 trace_vhost_iotlb_miss(dev, 3); 1233 error_report("Fail to lookup the translated address " 1234 "%"PRIx64, iotlb.translated_addr); 1235 goto out; 1236 } 1237 1238 len = MIN(iotlb.addr_mask + 1, len); 1239 iova = iova & ~iotlb.addr_mask; 1240 1241 ret = vhost_backend_update_device_iotlb(dev, iova, uaddr, 1242 len, iotlb.perm); 1243 if (ret) { 1244 trace_vhost_iotlb_miss(dev, 4); 1245 error_report("Fail to update device iotlb"); 1246 goto out; 1247 } 1248 } 1249 1250 trace_vhost_iotlb_miss(dev, 2); 1251 1252 out: 1253 return ret; 1254 } 1255 1256 int vhost_virtqueue_start(struct vhost_dev *dev, 1257 struct VirtIODevice *vdev, 1258 struct vhost_virtqueue *vq, 1259 unsigned idx) 1260 { 1261 BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev))); 1262 VirtioBusState *vbus = VIRTIO_BUS(qbus); 1263 VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(vbus); 1264 hwaddr l; 1265 int r; 1266 int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, idx); 1267 struct vhost_vring_file file = { 1268 .index = vhost_vq_index 1269 }; 1270 struct vhost_vring_state state = { 1271 .index = vhost_vq_index 1272 }; 1273 struct VirtQueue *vvq = virtio_get_queue(vdev, idx); 1274 1275 vq->desc_size = virtio_queue_get_desc_size(vdev, idx); 1276 vq->desc_phys = virtio_queue_get_desc_addr(vdev, idx); 1277 vq->desc = NULL; 1278 vq->avail_size = virtio_queue_get_avail_size(vdev, idx); 1279 vq->avail_phys = virtio_queue_get_avail_addr(vdev, idx); 1280 vq->avail = NULL; 1281 vq->used_size = virtio_queue_get_used_size(vdev, idx); 1282 vq->used_phys = virtio_queue_get_used_addr(vdev, idx); 1283 vq->used = NULL; 1284 1285 if (vq->desc_phys == 0) { 1286 /* Queue might not be ready for start */ 1287 return 0; 1288 } 1289 1290 vq->num = state.num 
= virtio_queue_get_num(vdev, idx); 1291 r = dev->vhost_ops->vhost_set_vring_num(dev, &state); 1292 if (r) { 1293 VHOST_OPS_DEBUG(r, "vhost_set_vring_num failed"); 1294 return r; 1295 } 1296 1297 state.num = virtio_queue_get_last_avail_idx(vdev, idx); 1298 r = dev->vhost_ops->vhost_set_vring_base(dev, &state); 1299 if (r) { 1300 VHOST_OPS_DEBUG(r, "vhost_set_vring_base failed"); 1301 return r; 1302 } 1303 1304 if (vhost_needs_vring_endian(vdev)) { 1305 r = vhost_virtqueue_set_vring_endian_legacy(dev, 1306 virtio_is_big_endian(vdev), 1307 vhost_vq_index); 1308 if (r) { 1309 return r; 1310 } 1311 } 1312 1313 l = vq->desc_size; 1314 vq->desc = vhost_memory_map(dev, vq->desc_phys, &l, false); 1315 if (!vq->desc || l != vq->desc_size) { 1316 r = -ENOMEM; 1317 goto fail_alloc_desc; 1318 } 1319 1320 l = vq->avail_size; 1321 vq->avail = vhost_memory_map(dev, vq->avail_phys, &l, false); 1322 if (!vq->avail || l != vq->avail_size) { 1323 r = -ENOMEM; 1324 goto fail_alloc_avail; 1325 } 1326 1327 l = vq->used_size; 1328 vq->used = vhost_memory_map(dev, vq->used_phys, &l, true); 1329 if (!vq->used || l != vq->used_size) { 1330 r = -ENOMEM; 1331 goto fail_alloc_used; 1332 } 1333 1334 r = vhost_virtqueue_set_addr(dev, vq, vhost_vq_index, dev->log_enabled); 1335 if (r < 0) { 1336 goto fail_alloc; 1337 } 1338 1339 file.fd = event_notifier_get_fd(virtio_queue_get_host_notifier(vvq)); 1340 r = dev->vhost_ops->vhost_set_vring_kick(dev, &file); 1341 if (r) { 1342 VHOST_OPS_DEBUG(r, "vhost_set_vring_kick failed"); 1343 goto fail_kick; 1344 } 1345 1346 /* Clear and discard previous events if any. */ 1347 event_notifier_test_and_clear(&vq->masked_notifier); 1348 1349 /* Init vring in unmasked state, unless guest_notifier_mask 1350 * will do it later. 1351 */ 1352 if (!vdev->use_guest_notifier_mask) { 1353 /* TODO: check and handle errors. */ 1354 vhost_virtqueue_mask(dev, vdev, idx, false); 1355 } 1356 1357 if (k->query_guest_notifiers && 1358 k->query_guest_notifiers(qbus->parent) && 1359 virtio_queue_vector(vdev, idx) == VIRTIO_NO_VECTOR) { 1360 file.fd = -1; 1361 r = dev->vhost_ops->vhost_set_vring_call(dev, &file); 1362 if (r) { 1363 goto fail_vector; 1364 } 1365 } 1366 1367 return 0; 1368 1369 fail_vector: 1370 fail_kick: 1371 fail_alloc: 1372 vhost_memory_unmap(dev, vq->used, virtio_queue_get_used_size(vdev, idx), 1373 0, 0); 1374 fail_alloc_used: 1375 vhost_memory_unmap(dev, vq->avail, virtio_queue_get_avail_size(vdev, idx), 1376 0, 0); 1377 fail_alloc_avail: 1378 vhost_memory_unmap(dev, vq->desc, virtio_queue_get_desc_size(vdev, idx), 1379 0, 0); 1380 fail_alloc_desc: 1381 return r; 1382 } 1383 1384 static int do_vhost_virtqueue_stop(struct vhost_dev *dev, 1385 struct VirtIODevice *vdev, 1386 struct vhost_virtqueue *vq, 1387 unsigned idx, bool force) 1388 { 1389 int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, idx); 1390 struct vhost_vring_state state = { 1391 .index = vhost_vq_index, 1392 }; 1393 int r = 0; 1394 1395 if (virtio_queue_get_desc_addr(vdev, idx) == 0) { 1396 /* Don't stop the virtqueue which might have not been started */ 1397 return 0; 1398 } 1399 1400 if (!force) { 1401 r = dev->vhost_ops->vhost_get_vring_base(dev, &state); 1402 if (r < 0) { 1403 VHOST_OPS_DEBUG(r, "vhost VQ %u ring restore failed: %d", idx, r); 1404 } 1405 } 1406 1407 if (r < 0 || force) { 1408 /* Connection to the backend is broken, so let's sync internal 1409 * last avail idx to the device used idx. 
1410 */ 1411 virtio_queue_restore_last_avail_idx(vdev, idx); 1412 } else { 1413 virtio_queue_set_last_avail_idx(vdev, idx, state.num); 1414 } 1415 virtio_queue_invalidate_signalled_used(vdev, idx); 1416 virtio_queue_update_used_idx(vdev, idx); 1417 1418 /* In the cross-endian case, we need to reset the vring endianness to 1419 * native as legacy devices expect so by default. 1420 */ 1421 if (vhost_needs_vring_endian(vdev)) { 1422 vhost_virtqueue_set_vring_endian_legacy(dev, 1423 !virtio_is_big_endian(vdev), 1424 vhost_vq_index); 1425 } 1426 1427 vhost_memory_unmap(dev, vq->used, virtio_queue_get_used_size(vdev, idx), 1428 1, virtio_queue_get_used_size(vdev, idx)); 1429 vhost_memory_unmap(dev, vq->avail, virtio_queue_get_avail_size(vdev, idx), 1430 0, virtio_queue_get_avail_size(vdev, idx)); 1431 vhost_memory_unmap(dev, vq->desc, virtio_queue_get_desc_size(vdev, idx), 1432 0, virtio_queue_get_desc_size(vdev, idx)); 1433 return r; 1434 } 1435 1436 int vhost_virtqueue_stop(struct vhost_dev *dev, 1437 struct VirtIODevice *vdev, 1438 struct vhost_virtqueue *vq, 1439 unsigned idx) 1440 { 1441 return do_vhost_virtqueue_stop(dev, vdev, vq, idx, false); 1442 } 1443 1444 static int vhost_virtqueue_set_busyloop_timeout(struct vhost_dev *dev, 1445 int n, uint32_t timeout) 1446 { 1447 int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, n); 1448 struct vhost_vring_state state = { 1449 .index = vhost_vq_index, 1450 .num = timeout, 1451 }; 1452 int r; 1453 1454 if (!dev->vhost_ops->vhost_set_vring_busyloop_timeout) { 1455 return -EINVAL; 1456 } 1457 1458 r = dev->vhost_ops->vhost_set_vring_busyloop_timeout(dev, &state); 1459 if (r) { 1460 VHOST_OPS_DEBUG(r, "vhost_set_vring_busyloop_timeout failed"); 1461 return r; 1462 } 1463 1464 return 0; 1465 } 1466 1467 static void vhost_virtqueue_error_notifier(EventNotifier *n) 1468 { 1469 struct vhost_virtqueue *vq = container_of(n, struct vhost_virtqueue, 1470 error_notifier); 1471 struct vhost_dev *dev = vq->dev; 1472 int index = vq - dev->vqs; 1473 1474 if (event_notifier_test_and_clear(n) && dev->vdev) { 1475 VHOST_OPS_DEBUG(-EINVAL, "vhost vring error in virtqueue %d", 1476 dev->vq_index + index); 1477 } 1478 } 1479 1480 static int vhost_virtqueue_init(struct vhost_dev *dev, 1481 struct vhost_virtqueue *vq, int n) 1482 { 1483 int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, n); 1484 struct vhost_vring_file file = { 1485 .index = vhost_vq_index, 1486 }; 1487 int r = event_notifier_init(&vq->masked_notifier, 0); 1488 if (r < 0) { 1489 return r; 1490 } 1491 1492 file.fd = event_notifier_get_wfd(&vq->masked_notifier); 1493 r = dev->vhost_ops->vhost_set_vring_call(dev, &file); 1494 if (r) { 1495 VHOST_OPS_DEBUG(r, "vhost_set_vring_call failed"); 1496 goto fail_call; 1497 } 1498 1499 vq->dev = dev; 1500 1501 if (dev->vhost_ops->vhost_set_vring_err) { 1502 r = event_notifier_init(&vq->error_notifier, 0); 1503 if (r < 0) { 1504 goto fail_call; 1505 } 1506 1507 file.fd = event_notifier_get_fd(&vq->error_notifier); 1508 r = dev->vhost_ops->vhost_set_vring_err(dev, &file); 1509 if (r) { 1510 VHOST_OPS_DEBUG(r, "vhost_set_vring_err failed"); 1511 goto fail_err; 1512 } 1513 1514 event_notifier_set_handler(&vq->error_notifier, 1515 vhost_virtqueue_error_notifier); 1516 } 1517 1518 return 0; 1519 1520 fail_err: 1521 event_notifier_cleanup(&vq->error_notifier); 1522 fail_call: 1523 event_notifier_cleanup(&vq->masked_notifier); 1524 return r; 1525 } 1526 1527 static void vhost_virtqueue_cleanup(struct vhost_virtqueue *vq) 1528 { 1529 
event_notifier_cleanup(&vq->masked_notifier); 1530 if (vq->dev->vhost_ops->vhost_set_vring_err) { 1531 event_notifier_set_handler(&vq->error_notifier, NULL); 1532 event_notifier_cleanup(&vq->error_notifier); 1533 } 1534 } 1535 1536 static int vhost_dev_get_features(struct vhost_dev *hdev, 1537 uint64_t *features) 1538 { 1539 uint64_t features64; 1540 int r; 1541 1542 if (hdev->vhost_ops->vhost_get_features_ex) { 1543 return hdev->vhost_ops->vhost_get_features_ex(hdev, features); 1544 } 1545 1546 r = hdev->vhost_ops->vhost_get_features(hdev, &features64); 1547 virtio_features_from_u64(features, features64); 1548 return r; 1549 } 1550 1551 int vhost_dev_init(struct vhost_dev *hdev, void *opaque, 1552 VhostBackendType backend_type, uint32_t busyloop_timeout, 1553 Error **errp) 1554 { 1555 uint64_t features[VIRTIO_FEATURES_NU64S]; 1556 unsigned int used, reserved, limit; 1557 int i, r, n_initialized_vqs = 0; 1558 1559 hdev->vdev = NULL; 1560 hdev->migration_blocker = NULL; 1561 1562 r = vhost_set_backend_type(hdev, backend_type); 1563 assert(r >= 0); 1564 1565 r = hdev->vhost_ops->vhost_backend_init(hdev, opaque, errp); 1566 if (r < 0) { 1567 goto fail; 1568 } 1569 1570 r = hdev->vhost_ops->vhost_set_owner(hdev); 1571 if (r < 0) { 1572 error_setg_errno(errp, -r, "vhost_set_owner failed"); 1573 goto fail; 1574 } 1575 1576 r = vhost_dev_get_features(hdev, features); 1577 if (r < 0) { 1578 error_setg_errno(errp, -r, "vhost_get_features failed"); 1579 goto fail; 1580 } 1581 1582 limit = hdev->vhost_ops->vhost_backend_memslots_limit(hdev); 1583 if (limit < MEMORY_DEVICES_SAFE_MAX_MEMSLOTS && 1584 memory_devices_memslot_auto_decision_active()) { 1585 error_setg(errp, "some memory device (like virtio-mem)" 1586 " decided how many memory slots to use based on the overall" 1587 " number of memory slots; this vhost backend would further" 1588 " restricts the overall number of memory slots"); 1589 error_append_hint(errp, "Try plugging this vhost backend before" 1590 " plugging such memory devices.\n"); 1591 r = -EINVAL; 1592 goto fail; 1593 } 1594 1595 for (i = 0; i < hdev->nvqs; ++i, ++n_initialized_vqs) { 1596 r = vhost_virtqueue_init(hdev, hdev->vqs + i, hdev->vq_index + i); 1597 if (r < 0) { 1598 error_setg_errno(errp, -r, "Failed to initialize virtqueue %d", i); 1599 goto fail; 1600 } 1601 } 1602 1603 if (busyloop_timeout) { 1604 for (i = 0; i < hdev->nvqs; ++i) { 1605 r = vhost_virtqueue_set_busyloop_timeout(hdev, hdev->vq_index + i, 1606 busyloop_timeout); 1607 if (r < 0) { 1608 error_setg_errno(errp, -r, "Failed to set busyloop timeout"); 1609 goto fail_busyloop; 1610 } 1611 } 1612 } 1613 1614 virtio_features_copy(hdev->features_ex, features); 1615 1616 hdev->memory_listener = (MemoryListener) { 1617 .name = "vhost", 1618 .begin = vhost_begin, 1619 .commit = vhost_commit, 1620 .region_add = vhost_region_addnop, 1621 .region_nop = vhost_region_addnop, 1622 .log_start = vhost_log_start, 1623 .log_stop = vhost_log_stop, 1624 .log_sync = vhost_log_sync, 1625 .log_global_start = vhost_log_global_start, 1626 .log_global_stop = vhost_log_global_stop, 1627 .priority = MEMORY_LISTENER_PRIORITY_DEV_BACKEND 1628 }; 1629 1630 hdev->iommu_listener = (MemoryListener) { 1631 .name = "vhost-iommu", 1632 .region_add = vhost_iommu_region_add, 1633 .region_del = vhost_iommu_region_del, 1634 }; 1635 1636 if (hdev->migration_blocker == NULL) { 1637 if (!virtio_has_feature_ex(hdev->features_ex, VHOST_F_LOG_ALL)) { 1638 error_setg(&hdev->migration_blocker, 1639 "Migration disabled: vhost lacks VHOST_F_LOG_ALL 
feature."); 1640 } else if (vhost_dev_log_is_shared(hdev) && !qemu_memfd_alloc_check()) { 1641 error_setg(&hdev->migration_blocker, 1642 "Migration disabled: failed to allocate shared memory"); 1643 } 1644 } 1645 1646 if (hdev->migration_blocker != NULL) { 1647 r = migrate_add_blocker_normal(&hdev->migration_blocker, errp); 1648 if (r < 0) { 1649 goto fail_busyloop; 1650 } 1651 } 1652 1653 hdev->mem = g_malloc0(offsetof(struct vhost_memory, regions)); 1654 hdev->n_mem_sections = 0; 1655 hdev->mem_sections = NULL; 1656 hdev->log = NULL; 1657 hdev->log_size = 0; 1658 hdev->log_enabled = false; 1659 hdev->started = false; 1660 memory_listener_register(&hdev->memory_listener, &address_space_memory); 1661 QLIST_INSERT_HEAD(&vhost_devices, hdev, entry); 1662 1663 /* 1664 * The listener we registered properly setup the number of required 1665 * memslots in vhost_commit(). 1666 */ 1667 used = hdev->mem->nregions; 1668 1669 /* 1670 * We assume that all reserved memslots actually require a real memslot 1671 * in our vhost backend. This might not be true, for example, if the 1672 * memslot would be ROM. If ever relevant, we can optimize for that -- 1673 * but we'll need additional information about the reservations. 1674 */ 1675 reserved = memory_devices_get_reserved_memslots(); 1676 if (used + reserved > limit) { 1677 error_setg(errp, "vhost backend memory slots limit (%d) is less" 1678 " than current number of used (%d) and reserved (%d)" 1679 " memory slots for memory devices.", limit, used, reserved); 1680 r = -EINVAL; 1681 goto fail_busyloop; 1682 } 1683 1684 return 0; 1685 1686 fail_busyloop: 1687 if (busyloop_timeout) { 1688 while (--i >= 0) { 1689 vhost_virtqueue_set_busyloop_timeout(hdev, hdev->vq_index + i, 0); 1690 } 1691 } 1692 fail: 1693 hdev->nvqs = n_initialized_vqs; 1694 vhost_dev_cleanup(hdev); 1695 return r; 1696 } 1697 1698 void vhost_dev_cleanup(struct vhost_dev *hdev) 1699 { 1700 int i; 1701 1702 trace_vhost_dev_cleanup(hdev); 1703 1704 for (i = 0; i < hdev->nvqs; ++i) { 1705 vhost_virtqueue_cleanup(hdev->vqs + i); 1706 } 1707 if (hdev->mem) { 1708 /* those are only safe after successful init */ 1709 memory_listener_unregister(&hdev->memory_listener); 1710 QLIST_REMOVE(hdev, entry); 1711 } 1712 migrate_del_blocker(&hdev->migration_blocker); 1713 g_free(hdev->mem); 1714 g_free(hdev->mem_sections); 1715 if (hdev->vhost_ops) { 1716 hdev->vhost_ops->vhost_backend_cleanup(hdev); 1717 } 1718 assert(!hdev->log); 1719 1720 memset(hdev, 0, sizeof(struct vhost_dev)); 1721 } 1722 1723 void vhost_dev_disable_notifiers_nvqs(struct vhost_dev *hdev, 1724 VirtIODevice *vdev, 1725 unsigned int nvqs) 1726 { 1727 BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev))); 1728 int i, r; 1729 1730 /* 1731 * Batch all the host notifiers in a single transaction to avoid 1732 * quadratic time complexity in address_space_update_ioeventfds(). 1733 */ 1734 memory_region_transaction_begin(); 1735 1736 for (i = 0; i < nvqs; ++i) { 1737 r = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i, 1738 false); 1739 if (r < 0) { 1740 error_report("vhost VQ %d notifier cleanup failed: %d", i, -r); 1741 } 1742 assert(r >= 0); 1743 } 1744 1745 /* 1746 * The transaction expects the ioeventfds to be open when it 1747 * commits. Do it now, before the cleanup loop. 
1748 */ 1749 memory_region_transaction_commit(); 1750 1751 for (i = 0; i < nvqs; ++i) { 1752 virtio_bus_cleanup_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i); 1753 } 1754 virtio_device_release_ioeventfd(vdev); 1755 } 1756 1757 /* Stop processing guest IO notifications in qemu. 1758 * Start processing them in vhost in kernel. 1759 */ 1760 int vhost_dev_enable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev) 1761 { 1762 BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev))); 1763 int i, r; 1764 1765 /* We will pass the notifiers to the kernel, make sure that QEMU 1766 * doesn't interfere. 1767 */ 1768 r = virtio_device_grab_ioeventfd(vdev); 1769 if (r < 0) { 1770 error_report("binding does not support host notifiers"); 1771 return r; 1772 } 1773 1774 /* 1775 * Batch all the host notifiers in a single transaction to avoid 1776 * quadratic time complexity in address_space_update_ioeventfds(). 1777 */ 1778 memory_region_transaction_begin(); 1779 1780 for (i = 0; i < hdev->nvqs; ++i) { 1781 r = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i, 1782 true); 1783 if (r < 0) { 1784 error_report("vhost VQ %d notifier binding failed: %d", i, -r); 1785 memory_region_transaction_commit(); 1786 vhost_dev_disable_notifiers_nvqs(hdev, vdev, i); 1787 return r; 1788 } 1789 } 1790 1791 memory_region_transaction_commit(); 1792 1793 return 0; 1794 } 1795 1796 /* Stop processing guest IO notifications in vhost. 1797 * Start processing them in qemu. 1798 * This might actually run the qemu handlers right away, 1799 * so virtio in qemu must be completely setup when this is called. 1800 */ 1801 void vhost_dev_disable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev) 1802 { 1803 vhost_dev_disable_notifiers_nvqs(hdev, vdev, hdev->nvqs); 1804 } 1805 1806 /* Test and clear event pending status. 1807 * Should be called after unmask to avoid losing events. 1808 */ 1809 bool vhost_virtqueue_pending(struct vhost_dev *hdev, int n) 1810 { 1811 struct vhost_virtqueue *vq = hdev->vqs + n - hdev->vq_index; 1812 assert(n >= hdev->vq_index && n < hdev->vq_index + hdev->nvqs); 1813 return event_notifier_test_and_clear(&vq->masked_notifier); 1814 } 1815 1816 /* Mask/unmask events from this vq. 
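 *
 * (Illustrative: while masked, the backend's call eventfd is pointed at
 * vq->masked_notifier, so any interrupts raised in the meantime are
 * latched there and later observed via vhost_virtqueue_pending().)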
*/ 1817 void vhost_virtqueue_mask(struct vhost_dev *hdev, VirtIODevice *vdev, int n, 1818 bool mask) 1819 { 1820 struct VirtQueue *vvq = virtio_get_queue(vdev, n); 1821 int r, index = n - hdev->vq_index; 1822 struct vhost_vring_file file; 1823 1824 /* should only be called after backend is connected */ 1825 assert(hdev->vhost_ops); 1826 1827 if (mask) { 1828 assert(vdev->use_guest_notifier_mask); 1829 file.fd = event_notifier_get_wfd(&hdev->vqs[index].masked_notifier); 1830 } else { 1831 file.fd = event_notifier_get_wfd(virtio_queue_get_guest_notifier(vvq)); 1832 } 1833 1834 file.index = hdev->vhost_ops->vhost_get_vq_index(hdev, n); 1835 r = hdev->vhost_ops->vhost_set_vring_call(hdev, &file); 1836 if (r < 0) { 1837 error_report("vhost_set_vring_call failed %d", -r); 1838 } 1839 } 1840 1841 bool vhost_config_pending(struct vhost_dev *hdev) 1842 { 1843 assert(hdev->vhost_ops); 1844 if ((hdev->started == false) || 1845 (hdev->vhost_ops->vhost_set_config_call == NULL)) { 1846 return false; 1847 } 1848 1849 EventNotifier *notifier = 1850 &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier; 1851 return event_notifier_test_and_clear(notifier); 1852 } 1853 1854 void vhost_config_mask(struct vhost_dev *hdev, VirtIODevice *vdev, bool mask) 1855 { 1856 int fd; 1857 int r; 1858 EventNotifier *notifier = 1859 &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier; 1860 EventNotifier *config_notifier = virtio_config_get_guest_notifier(vdev); 1861 assert(hdev->vhost_ops); 1862 1863 if ((hdev->started == false) || 1864 (hdev->vhost_ops->vhost_set_config_call == NULL)) { 1865 return; 1866 } 1867 if (mask) { 1868 assert(vdev->use_guest_notifier_mask); 1869 fd = event_notifier_get_fd(notifier); 1870 } else { 1871 fd = event_notifier_get_fd(config_notifier); 1872 } 1873 r = hdev->vhost_ops->vhost_set_config_call(hdev, fd); 1874 if (r < 0) { 1875 error_report("vhost_set_config_call failed %d", -r); 1876 } 1877 } 1878 1879 static void vhost_stop_config_intr(struct vhost_dev *dev) 1880 { 1881 int fd = -1; 1882 assert(dev->vhost_ops); 1883 if (dev->vhost_ops->vhost_set_config_call) { 1884 dev->vhost_ops->vhost_set_config_call(dev, fd); 1885 } 1886 } 1887 1888 static void vhost_start_config_intr(struct vhost_dev *dev) 1889 { 1890 int r; 1891 EventNotifier *config_notifier = 1892 virtio_config_get_guest_notifier(dev->vdev); 1893 1894 assert(dev->vhost_ops); 1895 int fd = event_notifier_get_fd(config_notifier); 1896 if (dev->vhost_ops->vhost_set_config_call) { 1897 r = dev->vhost_ops->vhost_set_config_call(dev, fd); 1898 if (!r) { 1899 event_notifier_set(config_notifier); 1900 } 1901 } 1902 } 1903 1904 void vhost_get_features_ex(struct vhost_dev *hdev, 1905 const int *feature_bits, 1906 uint64_t *features) 1907 { 1908 const int *bit = feature_bits; 1909 1910 while (*bit != VHOST_INVALID_FEATURE_BIT) { 1911 if (!virtio_has_feature_ex(hdev->features_ex, *bit)) { 1912 virtio_clear_feature_ex(features, *bit); 1913 } 1914 bit++; 1915 } 1916 } 1917 1918 void vhost_ack_features_ex(struct vhost_dev *hdev, const int *feature_bits, 1919 const uint64_t *features) 1920 { 1921 const int *bit = feature_bits; 1922 while (*bit != VHOST_INVALID_FEATURE_BIT) { 1923 if (virtio_has_feature_ex(features, *bit)) { 1924 virtio_add_feature_ex(hdev->acked_features_ex, *bit); 1925 } 1926 bit++; 1927 } 1928 } 1929 1930 int vhost_dev_get_config(struct vhost_dev *hdev, uint8_t *config, 1931 uint32_t config_len, Error **errp) 1932 { 1933 assert(hdev->vhost_ops); 1934 1935 if (hdev->vhost_ops->vhost_get_config) { 1936 return 
int vhost_dev_get_config(struct vhost_dev *hdev, uint8_t *config,
                         uint32_t config_len, Error **errp)
{
    assert(hdev->vhost_ops);

    if (hdev->vhost_ops->vhost_get_config) {
        return hdev->vhost_ops->vhost_get_config(hdev, config, config_len,
                                                 errp);
    }

    error_setg(errp, "vhost_get_config not implemented");
    return -ENOSYS;
}

int vhost_dev_set_config(struct vhost_dev *hdev, const uint8_t *data,
                         uint32_t offset, uint32_t size, uint32_t flags)
{
    assert(hdev->vhost_ops);

    if (hdev->vhost_ops->vhost_set_config) {
        return hdev->vhost_ops->vhost_set_config(hdev, data, offset,
                                                 size, flags);
    }

    return -ENOSYS;
}

void vhost_dev_set_config_notifier(struct vhost_dev *hdev,
                                   const VhostDevConfigOps *ops)
{
    hdev->config_ops = ops;
}

void vhost_dev_free_inflight(struct vhost_inflight *inflight)
{
    if (inflight && inflight->addr) {
        qemu_memfd_free(inflight->addr, inflight->size, inflight->fd);
        inflight->addr = NULL;
        inflight->fd = -1;
    }
}

int vhost_dev_prepare_inflight(struct vhost_dev *hdev, VirtIODevice *vdev)
{
    int r;

    if (hdev->vhost_ops->vhost_get_inflight_fd == NULL ||
        hdev->vhost_ops->vhost_set_inflight_fd == NULL) {
        return 0;
    }

    hdev->vdev = vdev;

    r = vhost_dev_set_features(hdev, hdev->log_enabled);
    if (r < 0) {
        VHOST_OPS_DEBUG(r, "vhost_dev_prepare_inflight failed");
        return r;
    }

    return 0;
}

int vhost_dev_set_inflight(struct vhost_dev *dev,
                           struct vhost_inflight *inflight)
{
    int r;

    if (dev->vhost_ops->vhost_set_inflight_fd && inflight->addr) {
        r = dev->vhost_ops->vhost_set_inflight_fd(dev, inflight);
        if (r) {
            VHOST_OPS_DEBUG(r, "vhost_set_inflight_fd failed");
            return r;
        }
    }

    return 0;
}

int vhost_dev_get_inflight(struct vhost_dev *dev, uint16_t queue_size,
                           struct vhost_inflight *inflight)
{
    int r;

    if (dev->vhost_ops->vhost_get_inflight_fd) {
        r = dev->vhost_ops->vhost_get_inflight_fd(dev, queue_size, inflight);
        if (r) {
            VHOST_OPS_DEBUG(r, "vhost_get_inflight_fd failed");
            return r;
        }
    }

    return 0;
}

static int vhost_dev_set_vring_enable(struct vhost_dev *hdev, int enable)
{
    if (!hdev->vhost_ops->vhost_set_vring_enable) {
        return 0;
    }

    /*
     * For vhost-user devices, if VHOST_USER_F_PROTOCOL_FEATURES has not
     * been negotiated, the rings start directly in the enabled state, and
     * the .vhost_set_vring_enable callback will fail since
     * VHOST_USER_SET_VRING_ENABLE is not supported.
     */
    if (hdev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER &&
        !virtio_has_feature(hdev->backend_features,
                            VHOST_USER_F_PROTOCOL_FEATURES)) {
        return 0;
    }

    return hdev->vhost_ops->vhost_set_vring_enable(hdev, enable);
}

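/*
 * Illustrative sketch, not part of this file: the usual calling order for a
 * vhost-backed device is to hand the host notifiers to the backend first and
 * only then start the device (see the requirement stated below); teardown
 * runs in reverse. Error handling is trimmed and the 'vrings' argument
 * choice is device-specific.
 *
 *     r = vhost_dev_enable_notifiers(hdev, vdev);
 *     if (r < 0) {
 *         return r;
 *     }
 *     r = vhost_dev_start(hdev, vdev, true);
 *     if (r < 0) {
 *         vhost_dev_disable_notifiers(hdev, vdev);
 *         return r;
 *     }
 *     ...device runs...
 *     vhost_dev_stop(hdev, vdev, true);
 *     vhost_dev_disable_notifiers(hdev, vdev);
 */
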
/*
 * Host notifiers must be enabled at this point.
 *
 * If @vrings is true, this function will enable all vrings before starting the
 * device. If it is false, the vring initialization is left to be done by the
 * caller.
 */
int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings)
{
    int i, r;

    /* should only be called after backend is connected */
    assert(hdev->vhost_ops);

    trace_vhost_dev_start(hdev, vdev->name, vrings);

    vdev->vhost_started = true;
    hdev->started = true;
    hdev->vdev = vdev;

    r = vhost_dev_set_features(hdev, hdev->log_enabled);
    if (r < 0) {
        goto fail_features;
    }

    if (vhost_dev_has_iommu(hdev)) {
        memory_listener_register(&hdev->iommu_listener, vdev->dma_as);
    }

    r = hdev->vhost_ops->vhost_set_mem_table(hdev, hdev->mem);
    if (r < 0) {
        VHOST_OPS_DEBUG(r, "vhost_set_mem_table failed");
        goto fail_mem;
    }
    for (i = 0; i < hdev->nvqs; ++i) {
        r = vhost_virtqueue_start(hdev,
                                  vdev,
                                  hdev->vqs + i,
                                  hdev->vq_index + i);
        if (r < 0) {
            goto fail_vq;
        }
    }

    r = event_notifier_init(
        &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier, 0);
    if (r < 0) {
        VHOST_OPS_DEBUG(r, "event_notifier_init failed");
        goto fail_vq;
    }
    event_notifier_test_and_clear(
        &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier);
    if (!vdev->use_guest_notifier_mask) {
        vhost_config_mask(hdev, vdev, true);
    }
    if (hdev->log_enabled) {
        uint64_t log_base;

        hdev->log_size = vhost_get_log_size(hdev);
        hdev->log = vhost_log_get(hdev->vhost_ops->backend_type,
                                  hdev->log_size,
                                  vhost_dev_log_is_shared(hdev));
        log_base = (uintptr_t)hdev->log->log;
        r = hdev->vhost_ops->vhost_set_log_base(hdev,
                                                hdev->log_size ? log_base : 0,
                                                hdev->log);
        if (r < 0) {
            VHOST_OPS_DEBUG(r, "vhost_set_log_base failed");
            goto fail_log;
        }
        vhost_dev_elect_mem_logger(hdev, true);
    }
    if (vrings) {
        r = vhost_dev_set_vring_enable(hdev, true);
        if (r) {
            goto fail_log;
        }
    }
    if (hdev->vhost_ops->vhost_dev_start) {
        r = hdev->vhost_ops->vhost_dev_start(hdev, true);
        if (r) {
            goto fail_start;
        }
    }
    if (vhost_dev_has_iommu(hdev) &&
        hdev->vhost_ops->vhost_set_iotlb_callback) {
        hdev->vhost_ops->vhost_set_iotlb_callback(hdev, true);

        /* Update used ring information for IOTLB to work correctly;
         * the vhost-kernel code requires this. */
        for (i = 0; i < hdev->nvqs; ++i) {
            struct vhost_virtqueue *vq = hdev->vqs + i;
            r = vhost_device_iotlb_miss(hdev, vq->used_phys, true);
            if (r) {
                goto fail_iotlb;
            }
        }
    }
    vhost_start_config_intr(hdev);
    return 0;
fail_iotlb:
    if (vhost_dev_has_iommu(hdev) &&
        hdev->vhost_ops->vhost_set_iotlb_callback) {
        hdev->vhost_ops->vhost_set_iotlb_callback(hdev, false);
    }
    if (hdev->vhost_ops->vhost_dev_start) {
        hdev->vhost_ops->vhost_dev_start(hdev, false);
    }
fail_start:
    if (vrings) {
        vhost_dev_set_vring_enable(hdev, false);
    }
fail_log:
    vhost_log_put(hdev, false);
fail_vq:
    while (--i >= 0) {
        vhost_virtqueue_stop(hdev,
                             vdev,
                             hdev->vqs + i,
                             hdev->vq_index + i);
    }

fail_mem:
    if (vhost_dev_has_iommu(hdev)) {
        memory_listener_unregister(&hdev->iommu_listener);
    }
fail_features:
    vdev->vhost_started = false;
    hdev->started = false;
    return r;
}

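/*
 * Note on the error path above: the fail_* labels in vhost_dev_start()
 * unwind the setup steps in reverse order: disable the IOTLB callback and
 * the backend again, disable the vrings if they were enabled, drop the
 * dirty log, stop the virtqueues that were already started, unregister the
 * IOMMU listener, and finally clear the started flags. The stop path below
 * performs the same steps for a device that was fully started.
 */
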
/* Host notifiers must be enabled at this point. */
static int do_vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev,
                             bool vrings, bool force)
{
    int i;
    int rc = 0;
    EventNotifier *config_notifier = virtio_config_get_guest_notifier(vdev);

    /* should only be called after backend is connected */
    assert(hdev->vhost_ops);
    event_notifier_test_and_clear(
        &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier);
    event_notifier_test_and_clear(config_notifier);
    event_notifier_cleanup(
        &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier);

    trace_vhost_dev_stop(hdev, vdev->name, vrings);

    if (hdev->vhost_ops->vhost_dev_start) {
        hdev->vhost_ops->vhost_dev_start(hdev, false);
    }
    if (vrings) {
        vhost_dev_set_vring_enable(hdev, false);
    }
    for (i = 0; i < hdev->nvqs; ++i) {
        rc |= do_vhost_virtqueue_stop(hdev,
                                      vdev,
                                      hdev->vqs + i,
                                      hdev->vq_index + i,
                                      force);
    }
    if (hdev->vhost_ops->vhost_reset_status) {
        hdev->vhost_ops->vhost_reset_status(hdev);
    }

    if (vhost_dev_has_iommu(hdev)) {
        if (hdev->vhost_ops->vhost_set_iotlb_callback) {
            hdev->vhost_ops->vhost_set_iotlb_callback(hdev, false);
        }
        memory_listener_unregister(&hdev->iommu_listener);
    }
    vhost_stop_config_intr(hdev);
    vhost_log_put(hdev, true);
    hdev->started = false;
    vdev->vhost_started = false;
    hdev->vdev = NULL;
    return rc;
}

int vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings)
{
    return do_vhost_dev_stop(hdev, vdev, vrings, false);
}

int vhost_dev_force_stop(struct vhost_dev *hdev, VirtIODevice *vdev,
                         bool vrings)
{
    return do_vhost_dev_stop(hdev, vdev, vrings, true);
}

int vhost_net_set_backend(struct vhost_dev *hdev,
                          struct vhost_vring_file *file)
{
    if (hdev->vhost_ops->vhost_net_set_backend) {
        return hdev->vhost_ops->vhost_net_set_backend(hdev, file);
    }

    return -ENOSYS;
}

int vhost_reset_device(struct vhost_dev *hdev)
{
    if (hdev->vhost_ops->vhost_reset_device) {
        return hdev->vhost_ops->vhost_reset_device(hdev);
    }

    return -ENOSYS;
}

bool vhost_supports_device_state(struct vhost_dev *dev)
{
    if (dev->vhost_ops->vhost_supports_device_state) {
        return dev->vhost_ops->vhost_supports_device_state(dev);
    }

    return false;
}

int vhost_set_device_state_fd(struct vhost_dev *dev,
                              VhostDeviceStateDirection direction,
                              VhostDeviceStatePhase phase,
                              int fd,
                              int *reply_fd,
                              Error **errp)
{
    if (dev->vhost_ops->vhost_set_device_state_fd) {
        return dev->vhost_ops->vhost_set_device_state_fd(dev, direction, phase,
                                                         fd, reply_fd, errp);
    }

    error_setg(errp,
               "vhost transport does not support migration state transfer");
    return -ENOSYS;
}

int vhost_check_device_state(struct vhost_dev *dev, Error **errp)
{
    if (dev->vhost_ops->vhost_check_device_state) {
        return dev->vhost_ops->vhost_check_device_state(dev, errp);
    }

    error_setg(errp,
               "vhost transport does not support migration state transfer");
    return -ENOSYS;
}

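/*
 * Illustrative sketch, not part of this file: callers are expected to probe
 * for back-end state transfer support before using the save/load helpers
 * below; 'f' stands for the migration stream and error handling is trimmed.
 *
 *     if (!vhost_supports_device_state(hdev)) {
 *         ...fall back to another migration strategy...
 *     } else if (vhost_save_backend_state(hdev, f, errp) < 0) {
 *         ...report failure...
 *     }
 */
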
int vhost_save_backend_state(struct vhost_dev *dev, QEMUFile *f, Error **errp)
{
    ERRP_GUARD();
    /* Maximum chunk size in which to transfer the state */
    const size_t chunk_size = 1 * 1024 * 1024;
    g_autofree void *transfer_buf = NULL;
    g_autoptr(GError) g_err = NULL;
    int pipe_fds[2], read_fd = -1, write_fd = -1, reply_fd = -1;
    int ret;

    /* [0] for reading (our end), [1] for writing (back-end's end) */
    if (!g_unix_open_pipe(pipe_fds, FD_CLOEXEC, &g_err)) {
        error_setg(errp, "Failed to set up state transfer pipe: %s",
                   g_err->message);
        ret = -EINVAL;
        goto fail;
    }

    read_fd = pipe_fds[0];
    write_fd = pipe_fds[1];

    /*
     * VHOST_TRANSFER_STATE_PHASE_STOPPED means the device must be stopped.
     * Ideally, it is suspended, but SUSPEND/RESUME currently do not exist for
     * vhost-user, so just check that it is stopped at all.
     */
    assert(!dev->started);

    /* Transfer ownership of write_fd to the back-end */
    ret = vhost_set_device_state_fd(dev,
                                    VHOST_TRANSFER_STATE_DIRECTION_SAVE,
                                    VHOST_TRANSFER_STATE_PHASE_STOPPED,
                                    write_fd,
                                    &reply_fd,
                                    errp);
    if (ret < 0) {
        error_prepend(errp, "Failed to initiate state transfer: ");
        goto fail;
    }

    /* If the back-end wishes to use a different pipe, switch over */
    if (reply_fd >= 0) {
        close(read_fd);
        read_fd = reply_fd;
    }

    transfer_buf = g_malloc(chunk_size);

    while (true) {
        ssize_t read_ret;

        read_ret = RETRY_ON_EINTR(read(read_fd, transfer_buf, chunk_size));
        if (read_ret < 0) {
            ret = -errno;
            error_setg_errno(errp, -ret, "Failed to receive state");
            goto fail;
        }

        assert(read_ret <= chunk_size);
        qemu_put_be32(f, read_ret);

        if (read_ret == 0) {
            /* EOF */
            break;
        }

        qemu_put_buffer(f, transfer_buf, read_ret);
    }

    /*
     * The back-end will not really care, but be clean and close our end of
     * the pipe before asking the back-end whether the transfer was successful.
     */
    close(read_fd);
    read_fd = -1;

    /* Also, verify that the device is still stopped */
    assert(!dev->started);

    ret = vhost_check_device_state(dev, errp);
    if (ret < 0) {
        goto fail;
    }

    ret = 0;
fail:
    if (read_fd >= 0) {
        close(read_fd);
    }

    return ret;
}

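/*
 * Stream framing note (derived from vhost_save_backend_state() above and
 * relied upon by vhost_load_backend_state() below): the back-end state is
 * stored in the migration stream as a sequence of chunks,
 *
 *     be32 length | 'length' bytes of payload | ... | be32 0 (end of state)
 *
 * so the loader keeps reading chunks until it sees a zero length.
 */
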
int vhost_load_backend_state(struct vhost_dev *dev, QEMUFile *f, Error **errp)
{
    ERRP_GUARD();
    size_t transfer_buf_size = 0;
    g_autofree void *transfer_buf = NULL;
    g_autoptr(GError) g_err = NULL;
    int pipe_fds[2], read_fd = -1, write_fd = -1, reply_fd = -1;
    int ret;

    /* [0] for reading (back-end's end), [1] for writing (our end) */
    if (!g_unix_open_pipe(pipe_fds, FD_CLOEXEC, &g_err)) {
        error_setg(errp, "Failed to set up state transfer pipe: %s",
                   g_err->message);
        ret = -EINVAL;
        goto fail;
    }

    read_fd = pipe_fds[0];
    write_fd = pipe_fds[1];

    /*
     * VHOST_TRANSFER_STATE_PHASE_STOPPED means the device must be stopped.
     * Ideally, it is suspended, but SUSPEND/RESUME currently do not exist for
     * vhost-user, so just check that it is stopped at all.
     */
    assert(!dev->started);

    /* Transfer ownership of read_fd to the back-end */
    ret = vhost_set_device_state_fd(dev,
                                    VHOST_TRANSFER_STATE_DIRECTION_LOAD,
                                    VHOST_TRANSFER_STATE_PHASE_STOPPED,
                                    read_fd,
                                    &reply_fd,
                                    errp);
    if (ret < 0) {
        error_prepend(errp, "Failed to initiate state transfer: ");
        goto fail;
    }

    /* If the back-end wishes to use a different pipe, switch over */
    if (reply_fd >= 0) {
        close(write_fd);
        write_fd = reply_fd;
    }

    while (true) {
        size_t this_chunk_size = qemu_get_be32(f);
        ssize_t write_ret;
        const uint8_t *transfer_pointer;

        if (this_chunk_size == 0) {
            /* End of state */
            break;
        }

        if (transfer_buf_size < this_chunk_size) {
            transfer_buf = g_realloc(transfer_buf, this_chunk_size);
            transfer_buf_size = this_chunk_size;
        }

        if (qemu_get_buffer(f, transfer_buf, this_chunk_size) <
            this_chunk_size)
        {
            error_setg(errp, "Failed to read state");
            ret = -EINVAL;
            goto fail;
        }

        transfer_pointer = transfer_buf;
        while (this_chunk_size > 0) {
            write_ret = RETRY_ON_EINTR(
                write(write_fd, transfer_pointer, this_chunk_size)
            );
            if (write_ret < 0) {
                ret = -errno;
                error_setg_errno(errp, -ret, "Failed to send state");
                goto fail;
            } else if (write_ret == 0) {
                error_setg(errp, "Failed to send state: Connection is closed");
                ret = -ECONNRESET;
                goto fail;
            }

            assert(write_ret <= this_chunk_size);
            this_chunk_size -= write_ret;
            transfer_pointer += write_ret;
        }
    }

    /*
     * Close our end, thus ending the transfer, before asking the back-end
     * whether the transfer was successful.
     */
    close(write_fd);
    write_fd = -1;

    /* Also, verify that the device is still stopped */
    assert(!dev->started);

    ret = vhost_check_device_state(dev, errp);
    if (ret < 0) {
        goto fail;
    }

    ret = 0;
fail:
    if (write_fd >= 0) {
        close(write_fd);
    }

    return ret;
}