/*
 * vhost support
 *
 * Copyright Red Hat, Inc. 2010
 *
 * Authors:
 *  Michael S. Tsirkin <mst@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 * Contributions after 2012-01-13 are licensed under the terms of the
 * GNU GPL, version 2 or (at your option) any later version.
 */

#include "qemu/osdep.h"
#include "qapi/error.h"
#include "hw/virtio/vhost.h"
#include "qemu/atomic.h"
#include "qemu/range.h"
#include "qemu/error-report.h"
#include "qemu/memfd.h"
#include "qemu/log.h"
#include "standard-headers/linux/vhost_types.h"
#include "hw/virtio/virtio-bus.h"
#include "hw/mem/memory-device.h"
#include "migration/blocker.h"
#include "migration/qemu-file-types.h"
#include "system/dma.h"
#include "trace.h"

/* enabled until disconnected backend stabilizes */
#define _VHOST_DEBUG 1

#ifdef _VHOST_DEBUG
#define VHOST_OPS_DEBUG(retval, fmt, ...) \
    do { \
        error_report(fmt ": %s (%d)", ## __VA_ARGS__, \
                     strerror(-retval), -retval); \
    } while (0)
#else
#define VHOST_OPS_DEBUG(retval, fmt, ...) \
    do { } while (0)
#endif

static struct vhost_log *vhost_log[VHOST_BACKEND_TYPE_MAX];
static struct vhost_log *vhost_log_shm[VHOST_BACKEND_TYPE_MAX];
static QLIST_HEAD(, vhost_dev) vhost_log_devs[VHOST_BACKEND_TYPE_MAX];

static QLIST_HEAD(, vhost_dev) vhost_devices =
    QLIST_HEAD_INITIALIZER(vhost_devices);

unsigned int vhost_get_max_memslots(void)
{
    unsigned int max = UINT_MAX;
    struct vhost_dev *hdev;

    QLIST_FOREACH(hdev, &vhost_devices, entry) {
        max = MIN(max, hdev->vhost_ops->vhost_backend_memslots_limit(hdev));
    }
    return max;
}

unsigned int vhost_get_free_memslots(void)
{
    unsigned int free = UINT_MAX;
    struct vhost_dev *hdev;

    QLIST_FOREACH(hdev, &vhost_devices, entry) {
        unsigned int r = hdev->vhost_ops->vhost_backend_memslots_limit(hdev);
        unsigned int cur_free = r - hdev->mem->nregions;

        if (unlikely(r < hdev->mem->nregions)) {
            warn_report_once("used (%u) vhost backend memory slots exceed"
                             " the device limit (%u).", hdev->mem->nregions, r);
            free = 0;
        } else {
            free = MIN(free, cur_free);
        }
    }
    return free;
}

static void vhost_dev_sync_region(struct vhost_dev *dev,
                                  MemoryRegionSection *section,
                                  uint64_t mfirst, uint64_t mlast,
                                  uint64_t rfirst, uint64_t rlast)
{
    vhost_log_chunk_t *dev_log = dev->log->log;

    uint64_t start = MAX(mfirst, rfirst);
    uint64_t end = MIN(mlast, rlast);
    vhost_log_chunk_t *from = dev_log + start / VHOST_LOG_CHUNK;
    vhost_log_chunk_t *to = dev_log + end / VHOST_LOG_CHUNK + 1;
    uint64_t addr = QEMU_ALIGN_DOWN(start, VHOST_LOG_CHUNK);

    if (end < start) {
        return;
    }
    assert(end / VHOST_LOG_CHUNK < dev->log_size);
    assert(start / VHOST_LOG_CHUNK < dev->log_size);

    for (; from < to; ++from) {
        vhost_log_chunk_t log;
        /* We first check with non-atomic: much cheaper,
         * and we expect non-dirty to be the common case. */
        if (!*from) {
            addr += VHOST_LOG_CHUNK;
            continue;
        }
        /* Data must be read atomically. We don't really need barrier semantics
         * but it's easier to use atomic_* than roll our own.
         */
        log = qatomic_xchg(from, 0);
        while (log) {
            int bit = ctzl(log);
            hwaddr page_addr;
            hwaddr section_offset;
            hwaddr mr_offset;
            page_addr = addr + bit * VHOST_LOG_PAGE;
            section_offset = page_addr - section->offset_within_address_space;
            mr_offset = section_offset + section->offset_within_region;
            memory_region_set_dirty(section->mr, mr_offset, VHOST_LOG_PAGE);
            log &= ~(0x1ull << bit);
        }
        addr += VHOST_LOG_CHUNK;
    }
}

bool vhost_dev_has_iommu(struct vhost_dev *dev)
{
    VirtIODevice *vdev = dev->vdev;

    /*
     * For vhost, VIRTIO_F_IOMMU_PLATFORM means the backend supports the
     * incremental memory mapping API via the IOTLB API. For platforms that
     * do not have an IOMMU, there is no need to enable this feature, which
     * may cause unnecessary IOTLB miss/update transactions.
     */
    if (vdev) {
        return virtio_bus_device_iommu_enabled(vdev) &&
            virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM);
    } else {
        return false;
    }
}

static inline bool vhost_dev_should_log(struct vhost_dev *dev)
{
    assert(dev->vhost_ops);
    assert(dev->vhost_ops->backend_type > VHOST_BACKEND_TYPE_NONE);
    assert(dev->vhost_ops->backend_type < VHOST_BACKEND_TYPE_MAX);

    return dev == QLIST_FIRST(&vhost_log_devs[dev->vhost_ops->backend_type]);
}

static inline void vhost_dev_elect_mem_logger(struct vhost_dev *hdev, bool add)
{
    VhostBackendType backend_type;

    assert(hdev->vhost_ops);

    backend_type = hdev->vhost_ops->backend_type;
    assert(backend_type > VHOST_BACKEND_TYPE_NONE);
    assert(backend_type < VHOST_BACKEND_TYPE_MAX);

    if (add && !QLIST_IS_INSERTED(hdev, logdev_entry)) {
        if (QLIST_EMPTY(&vhost_log_devs[backend_type])) {
            QLIST_INSERT_HEAD(&vhost_log_devs[backend_type],
                              hdev, logdev_entry);
        } else {
            /*
             * The first vhost_device in the list is selected as the shared
             * logger to scan memory sections. Put new entries next to the
             * head to avoid inadvertently changing the underlying logger
             * device. This is done in order to get better cache locality and
             * to avoid performance churn on the hot path for log scanning.
             * Even when new devices come and go quickly, it won't end up
             * changing the active leading logger device at all.
             */
            QLIST_INSERT_AFTER(QLIST_FIRST(&vhost_log_devs[backend_type]),
                               hdev, logdev_entry);
        }
    } else if (!add && QLIST_IS_INSERTED(hdev, logdev_entry)) {
        QLIST_REMOVE(hdev, logdev_entry);
    }
}

static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
                                   MemoryRegionSection *section,
                                   hwaddr first,
                                   hwaddr last)
{
    int i;
    hwaddr start_addr;
    hwaddr end_addr;

    if (!dev->log_enabled || !dev->started) {
        return 0;
    }
    start_addr = section->offset_within_address_space;
    end_addr = range_get_last(start_addr, int128_get64(section->size));
    start_addr = MAX(first, start_addr);
    end_addr = MIN(last, end_addr);

    if (vhost_dev_should_log(dev)) {
        for (i = 0; i < dev->mem->nregions; ++i) {
            struct vhost_memory_region *reg = dev->mem->regions + i;
            vhost_dev_sync_region(dev, section, start_addr, end_addr,
                                  reg->guest_phys_addr,
                                  range_get_last(reg->guest_phys_addr,
                                                 reg->memory_size));
        }
    }
    for (i = 0; i < dev->nvqs; ++i) {
        struct vhost_virtqueue *vq = dev->vqs + i;

        if (!vq->used_phys && !vq->used_size) {
            continue;
        }

        if (vhost_dev_has_iommu(dev)) {
            IOMMUTLBEntry iotlb;
            hwaddr used_phys = vq->used_phys, used_size = vq->used_size;
            hwaddr phys, s, offset;

            while (used_size) {
                rcu_read_lock();
                iotlb = address_space_get_iotlb_entry(dev->vdev->dma_as,
                                                      used_phys,
                                                      true,
                                                      MEMTXATTRS_UNSPECIFIED);
                rcu_read_unlock();

                if (!iotlb.target_as) {
                    qemu_log_mask(LOG_GUEST_ERROR, "translation "
                                  "failure for used_iova %"PRIx64"\n",
                                  used_phys);
                    return -EINVAL;
                }

                offset = used_phys & iotlb.addr_mask;
                phys = iotlb.translated_addr + offset;

                /*
                 * Distance from start of used ring until last byte of
                 * IOMMU page.
                 */
                s = iotlb.addr_mask - offset;
                /*
                 * Size of used ring, or of the part of it until end
                 * of IOMMU page. To avoid zero result, do the adding
                 * outside of MIN().
                 */
                s = MIN(s, used_size - 1) + 1;

                vhost_dev_sync_region(dev, section, start_addr, end_addr, phys,
                                      range_get_last(phys, s));
                used_size -= s;
                used_phys += s;
            }
        } else {
            vhost_dev_sync_region(dev, section, start_addr,
                                  end_addr, vq->used_phys,
                                  range_get_last(vq->used_phys, vq->used_size));
        }
    }
    return 0;
}

static void vhost_log_sync(MemoryListener *listener,
                           MemoryRegionSection *section)
{
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
                                         memory_listener);
    vhost_sync_dirty_bitmap(dev, section, 0x0, ~0x0ULL);
}

static void vhost_log_sync_range(struct vhost_dev *dev,
                                 hwaddr first, hwaddr last)
{
    int i;
    /* FIXME: this is N^2 in number of sections */
    for (i = 0; i < dev->n_mem_sections; ++i) {
        MemoryRegionSection *section = &dev->mem_sections[i];
        vhost_sync_dirty_bitmap(dev, section, first, last);
    }
}

static uint64_t vhost_get_log_size(struct vhost_dev *dev)
{
    uint64_t log_size = 0;
    int i;
    for (i = 0; i < dev->mem->nregions; ++i) {
        struct vhost_memory_region *reg = dev->mem->regions + i;
        uint64_t last = range_get_last(reg->guest_phys_addr,
                                       reg->memory_size);
        log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1);
    }
    return log_size;
}

static int vhost_set_backend_type(struct vhost_dev *dev,
                                  VhostBackendType backend_type)
{
    int r = 0;

    switch (backend_type) {
#ifdef CONFIG_VHOST_KERNEL
    case VHOST_BACKEND_TYPE_KERNEL:
        dev->vhost_ops = &kernel_ops;
        break;
#endif
#ifdef CONFIG_VHOST_USER
    case VHOST_BACKEND_TYPE_USER:
        dev->vhost_ops = &user_ops;
        break;
#endif
#ifdef CONFIG_VHOST_VDPA
    case VHOST_BACKEND_TYPE_VDPA:
        dev->vhost_ops = &vdpa_ops;
        break;
#endif
    default:
        error_report("Unknown vhost backend type");
        r = -1;
    }

    if (r == 0) {
        assert(dev->vhost_ops->backend_type == backend_type);
    }

    return r;
}

static struct vhost_log *vhost_log_alloc(uint64_t size, bool share)
{
    Error *err = NULL;
    struct vhost_log *log;
    uint64_t logsize = size * sizeof(*(log->log));
    int fd = -1;

    log = g_new0(struct vhost_log, 1);
    if (share) {
        log->log = qemu_memfd_alloc("vhost-log", logsize,
                                    F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL,
                                    &fd, &err);
        if (err) {
            error_report_err(err);
            g_free(log);
            return NULL;
        }
        memset(log->log, 0, logsize);
    } else {
        log->log = g_malloc0(logsize);
    }

    log->size = size;
    log->refcnt = 1;
    log->fd = fd;

    return log;
}

static struct vhost_log *vhost_log_get(VhostBackendType backend_type,
                                       uint64_t size, bool share)
{
    struct vhost_log *log;

    assert(backend_type > VHOST_BACKEND_TYPE_NONE);
    assert(backend_type < VHOST_BACKEND_TYPE_MAX);

    log = share ? vhost_log_shm[backend_type] : vhost_log[backend_type];

    if (!log || log->size != size) {
        log = vhost_log_alloc(size, share);
        if (share) {
            vhost_log_shm[backend_type] = log;
        } else {
            vhost_log[backend_type] = log;
        }
    } else {
        ++log->refcnt;
    }

    return log;
}

static void vhost_log_put(struct vhost_dev *dev, bool sync)
{
    struct vhost_log *log = dev->log;
    VhostBackendType backend_type;

    if (!log) {
        return;
    }

    assert(dev->vhost_ops);
    backend_type = dev->vhost_ops->backend_type;

    if (backend_type == VHOST_BACKEND_TYPE_NONE ||
        backend_type >= VHOST_BACKEND_TYPE_MAX) {
        return;
    }

    --log->refcnt;
    if (log->refcnt == 0) {
        /* Sync only the range covered by the old log */
        if (dev->log_size && sync) {
            vhost_log_sync_range(dev, 0, dev->log_size * VHOST_LOG_CHUNK - 1);
        }

        if (vhost_log[backend_type] == log) {
            g_free(log->log);
            vhost_log[backend_type] = NULL;
        } else if (vhost_log_shm[backend_type] == log) {
            qemu_memfd_free(log->log, log->size * sizeof(*(log->log)),
                            log->fd);
            vhost_log_shm[backend_type] = NULL;
        }

        g_free(log);
    }

    vhost_dev_elect_mem_logger(dev, false);
    dev->log = NULL;
    dev->log_size = 0;
}

static bool vhost_dev_log_is_shared(struct vhost_dev *dev)
{
    return dev->vhost_ops->vhost_requires_shm_log &&
           dev->vhost_ops->vhost_requires_shm_log(dev);
}

static inline void vhost_dev_log_resize(struct vhost_dev *dev, uint64_t size)
{
    struct vhost_log *log = vhost_log_get(dev->vhost_ops->backend_type,
                                          size, vhost_dev_log_is_shared(dev));
    uint64_t log_base = (uintptr_t)log->log;
    int r;

    /* Inform the backend of the log switch. This must be done before
     * releasing the current log, to ensure no logging is lost. */
    r = dev->vhost_ops->vhost_set_log_base(dev, log_base, log);
    if (r < 0) {
        VHOST_OPS_DEBUG(r, "vhost_set_log_base failed");
    }

    vhost_log_put(dev, true);
    dev->log = log;
    dev->log_size = size;
}

static void *vhost_memory_map(struct vhost_dev *dev, hwaddr addr,
                              hwaddr *plen, bool is_write)
{
    if (!vhost_dev_has_iommu(dev)) {
        return cpu_physical_memory_map(addr, plen, is_write);
    } else {
        return (void *)(uintptr_t)addr;
    }
}

static void vhost_memory_unmap(struct vhost_dev *dev, void *buffer,
                               hwaddr len, int is_write,
                               hwaddr access_len)
{
    if (!vhost_dev_has_iommu(dev)) {
        cpu_physical_memory_unmap(buffer, len, is_write, access_len);
    }
}

static int vhost_verify_ring_part_mapping(void *ring_hva,
                                          uint64_t ring_gpa,
                                          uint64_t ring_size,
                                          void *reg_hva,
                                          uint64_t reg_gpa,
                                          uint64_t reg_size)
{
    uint64_t hva_ring_offset;
    uint64_t ring_last = range_get_last(ring_gpa, ring_size);
    uint64_t reg_last = range_get_last(reg_gpa, reg_size);

    if (ring_last < reg_gpa || ring_gpa > reg_last) {
        return 0;
    }
    /* check that the whole ring is mapped */
    if (ring_last > reg_last) {
        return -ENOMEM;
    }
    /* check that the ring's MemoryRegion wasn't replaced */
    hva_ring_offset = ring_gpa - reg_gpa;
    if (ring_hva != reg_hva + hva_ring_offset) {
        return -EBUSY;
    }

    return 0;
}

static int vhost_verify_ring_mappings(struct vhost_dev *dev,
                                      void *reg_hva,
                                      uint64_t reg_gpa,
                                      uint64_t reg_size)
{
    int i, j;
    int r = 0;
    const char *part_name[] = {
"descriptor table", 509 "available ring", 510 "used ring" 511 }; 512 513 if (vhost_dev_has_iommu(dev)) { 514 return 0; 515 } 516 517 for (i = 0; i < dev->nvqs; ++i) { 518 struct vhost_virtqueue *vq = dev->vqs + i; 519 520 if (vq->desc_phys == 0) { 521 continue; 522 } 523 524 j = 0; 525 r = vhost_verify_ring_part_mapping( 526 vq->desc, vq->desc_phys, vq->desc_size, 527 reg_hva, reg_gpa, reg_size); 528 if (r) { 529 break; 530 } 531 532 j++; 533 r = vhost_verify_ring_part_mapping( 534 vq->avail, vq->avail_phys, vq->avail_size, 535 reg_hva, reg_gpa, reg_size); 536 if (r) { 537 break; 538 } 539 540 j++; 541 r = vhost_verify_ring_part_mapping( 542 vq->used, vq->used_phys, vq->used_size, 543 reg_hva, reg_gpa, reg_size); 544 if (r) { 545 break; 546 } 547 } 548 549 if (r == -ENOMEM) { 550 error_report("Unable to map %s for ring %d", part_name[j], i); 551 } else if (r == -EBUSY) { 552 error_report("%s relocated for ring %d", part_name[j], i); 553 } 554 return r; 555 } 556 557 /* 558 * vhost_section: identify sections needed for vhost access 559 * 560 * We only care about RAM sections here (where virtqueue and guest 561 * internals accessed by virtio might live). 562 */ 563 static bool vhost_section(struct vhost_dev *dev, MemoryRegionSection *section) 564 { 565 MemoryRegion *mr = section->mr; 566 567 if (memory_region_is_ram(mr) && !memory_region_is_rom(mr)) { 568 uint8_t dirty_mask = memory_region_get_dirty_log_mask(mr); 569 uint8_t handled_dirty; 570 571 /* 572 * Kernel based vhost doesn't handle any block which is doing 573 * dirty-tracking other than migration for which it has 574 * specific logging support. However for TCG the kernel never 575 * gets involved anyway so we can also ignore it's 576 * self-modiying code detection flags. However a vhost-user 577 * client could still confuse a TCG guest if it re-writes 578 * executable memory that has already been translated. 579 */ 580 handled_dirty = (1 << DIRTY_MEMORY_MIGRATION) | 581 (1 << DIRTY_MEMORY_CODE); 582 583 if (dirty_mask & ~handled_dirty) { 584 trace_vhost_reject_section(mr->name, 1); 585 return false; 586 } 587 588 /* 589 * Some backends (like vhost-user) can only handle memory regions 590 * that have an fd (can be mapped into a different process). Filter 591 * the ones without an fd out, if requested. 592 * 593 * TODO: we might have to limit to MAP_SHARED as well. 594 */ 595 if (memory_region_get_fd(section->mr) < 0 && 596 dev->vhost_ops->vhost_backend_no_private_memslots && 597 dev->vhost_ops->vhost_backend_no_private_memslots(dev)) { 598 trace_vhost_reject_section(mr->name, 2); 599 return false; 600 } 601 602 trace_vhost_section(mr->name); 603 return true; 604 } else { 605 trace_vhost_reject_section(mr->name, 3); 606 return false; 607 } 608 } 609 610 static void vhost_begin(MemoryListener *listener) 611 { 612 struct vhost_dev *dev = container_of(listener, struct vhost_dev, 613 memory_listener); 614 dev->tmp_sections = NULL; 615 dev->n_tmp_sections = 0; 616 } 617 618 static void vhost_commit(MemoryListener *listener) 619 { 620 struct vhost_dev *dev = container_of(listener, struct vhost_dev, 621 memory_listener); 622 MemoryRegionSection *old_sections; 623 int n_old_sections; 624 uint64_t log_size; 625 size_t regions_size; 626 int r; 627 int i; 628 bool changed = false; 629 630 /* Note we can be called before the device is started, but then 631 * starting the device calls set_mem_table, so we need to have 632 * built the data structures. 
     */
    old_sections = dev->mem_sections;
    n_old_sections = dev->n_mem_sections;
    dev->mem_sections = dev->tmp_sections;
    dev->n_mem_sections = dev->n_tmp_sections;

    if (dev->n_mem_sections != n_old_sections) {
        changed = true;
    } else {
        /* Same size, let's check the contents */
        for (i = 0; i < n_old_sections; i++) {
            if (!MemoryRegionSection_eq(&old_sections[i],
                                        &dev->mem_sections[i])) {
                changed = true;
                break;
            }
        }
    }

    trace_vhost_commit(dev->started, changed);
    if (!changed) {
        goto out;
    }

    /* Rebuild the regions list from the new sections list */
    regions_size = offsetof(struct vhost_memory, regions) +
                       dev->n_mem_sections * sizeof dev->mem->regions[0];
    dev->mem = g_realloc(dev->mem, regions_size);
    dev->mem->nregions = dev->n_mem_sections;

    for (i = 0; i < dev->n_mem_sections; i++) {
        struct vhost_memory_region *cur_vmr = dev->mem->regions + i;
        struct MemoryRegionSection *mrs = dev->mem_sections + i;

        cur_vmr->guest_phys_addr = mrs->offset_within_address_space;
        cur_vmr->memory_size = int128_get64(mrs->size);
        cur_vmr->userspace_addr =
            (uintptr_t)memory_region_get_ram_ptr(mrs->mr) +
            mrs->offset_within_region;
        cur_vmr->flags_padding = 0;
    }

    if (!dev->started) {
        goto out;
    }

    for (i = 0; i < dev->mem->nregions; i++) {
        if (vhost_verify_ring_mappings(dev,
                       (void *)(uintptr_t)dev->mem->regions[i].userspace_addr,
                       dev->mem->regions[i].guest_phys_addr,
                       dev->mem->regions[i].memory_size)) {
            error_report("Verify ring failure on region %d", i);
            abort();
        }
    }

    if (!dev->log_enabled) {
        r = dev->vhost_ops->vhost_set_mem_table(dev, dev->mem);
        if (r < 0) {
            VHOST_OPS_DEBUG(r, "vhost_set_mem_table failed");
        }
        goto out;
    }
    log_size = vhost_get_log_size(dev);
    /* We allocate an extra 4K bytes of log,
     * to reduce the number of reallocations. */
#define VHOST_LOG_BUFFER (0x1000 / sizeof *dev->log)
    /* To log more, must increase log size before table update. */
    if (dev->log_size < log_size) {
        vhost_dev_log_resize(dev, log_size + VHOST_LOG_BUFFER);
    }
    r = dev->vhost_ops->vhost_set_mem_table(dev, dev->mem);
    if (r < 0) {
        VHOST_OPS_DEBUG(r, "vhost_set_mem_table failed");
    }
    /* To log less, can only decrease log size after table update. */
    if (dev->log_size > log_size + VHOST_LOG_BUFFER) {
        vhost_dev_log_resize(dev, log_size);
    }

out:
    /* Deref the old list of sections, this must happen _after_ the
     * vhost_set_mem_table to ensure the client isn't still using the
     * section we're about to unref.
     */
    while (n_old_sections--) {
        memory_region_unref(old_sections[n_old_sections].mr);
    }
    g_free(old_sections);
}

/* Adds the section data to the tmp_section structure.
 * It relies on the listener calling us in memory address order
 * and for each region (via the _add and _nop methods) to
 * join neighbours.
 */
static void vhost_region_add_section(struct vhost_dev *dev,
                                     MemoryRegionSection *section)
{
    bool need_add = true;
    uint64_t mrs_size = int128_get64(section->size);
    uint64_t mrs_gpa = section->offset_within_address_space;
    uintptr_t mrs_host = (uintptr_t)memory_region_get_ram_ptr(section->mr) +
                         section->offset_within_region;
    RAMBlock *mrs_rb = section->mr->ram_block;

    trace_vhost_region_add_section(section->mr->name, mrs_gpa, mrs_size,
                                   mrs_host);

    if (dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER) {
        /* Round the section to its page size */
        /* First align the start down to a page boundary */
        size_t mrs_page = qemu_ram_pagesize(mrs_rb);
        uint64_t alignage = mrs_host & (mrs_page - 1);
        if (alignage) {
            mrs_host -= alignage;
            mrs_size += alignage;
            mrs_gpa  -= alignage;
        }
        /* Now align the size up to a page boundary */
        alignage = mrs_size & (mrs_page - 1);
        if (alignage) {
            mrs_size += mrs_page - alignage;
        }
        trace_vhost_region_add_section_aligned(section->mr->name, mrs_gpa,
                                               mrs_size, mrs_host);
    }

    if (dev->n_tmp_sections && !section->unmergeable) {
        /* Since we already have at least one section, let's see if
         * this extends it; since we're scanning in order, we only
         * have to look at the last one, and the FlatView that calls
         * us shouldn't have overlaps.
         */
        MemoryRegionSection *prev_sec = dev->tmp_sections +
            (dev->n_tmp_sections - 1);
        uint64_t prev_gpa_start = prev_sec->offset_within_address_space;
        uint64_t prev_size = int128_get64(prev_sec->size);
        uint64_t prev_gpa_end   = range_get_last(prev_gpa_start, prev_size);
        uint64_t prev_host_start =
            (uintptr_t)memory_region_get_ram_ptr(prev_sec->mr) +
            prev_sec->offset_within_region;
        uint64_t prev_host_end   = range_get_last(prev_host_start, prev_size);

        if (mrs_gpa <= (prev_gpa_end + 1)) {
            /* OK, looks like overlapping/intersecting - it's possible that
             * the rounding to page sizes has made them overlap, but they
             * should match up in the same RAMBlock if they do.
             */
            if (mrs_gpa < prev_gpa_start) {
                error_report("%s:Section '%s' rounded to %"PRIx64
                             " prior to previous '%s' %"PRIx64,
                             __func__, section->mr->name, mrs_gpa,
                             prev_sec->mr->name, prev_gpa_start);
                /* A way to cleanly fail here would be better */
                return;
            }
            /* Offset from the start of the previous GPA to this GPA */
            size_t offset = mrs_gpa - prev_gpa_start;

            if (prev_host_start + offset == mrs_host &&
                section->mr == prev_sec->mr && !prev_sec->unmergeable) {
                uint64_t max_end = MAX(prev_host_end, mrs_host + mrs_size);
                need_add = false;
                prev_sec->offset_within_address_space =
                    MIN(prev_gpa_start, mrs_gpa);
                prev_sec->offset_within_region =
                    MIN(prev_host_start, mrs_host) -
                    (uintptr_t)memory_region_get_ram_ptr(prev_sec->mr);
                prev_sec->size = int128_make64(max_end - MIN(prev_host_start,
                                                             mrs_host));
                trace_vhost_region_add_section_merge(section->mr->name,
                                        int128_get64(prev_sec->size),
                                        prev_sec->offset_within_address_space,
                                        prev_sec->offset_within_region);
            } else {
                /* adjoining regions are fine, but overlapping ones with
                 * different blocks/offsets shouldn't happen
                 */
                if (mrs_gpa != prev_gpa_end + 1) {
                    error_report("%s: Overlapping but not coherent sections "
                                 "at %"PRIx64,
                                 __func__, mrs_gpa);
                    return;
                }
            }
        }
    }

    if (need_add) {
        ++dev->n_tmp_sections;
        dev->tmp_sections = g_renew(MemoryRegionSection, dev->tmp_sections,
                                    dev->n_tmp_sections);
        dev->tmp_sections[dev->n_tmp_sections - 1] = *section;
        /* The flatview isn't stable and we don't use it, making it NULL
         * means we can memcmp the list.
         */
        dev->tmp_sections[dev->n_tmp_sections - 1].fv = NULL;
        memory_region_ref(section->mr);
    }
}

/* Used for both add and nop callbacks */
static void vhost_region_addnop(MemoryListener *listener,
                                MemoryRegionSection *section)
{
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
                                         memory_listener);

    if (!vhost_section(dev, section)) {
        return;
    }
    vhost_region_add_section(dev, section);
}

static void vhost_iommu_unmap_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
{
    struct vhost_iommu *iommu = container_of(n, struct vhost_iommu, n);
    struct vhost_dev *hdev = iommu->hdev;
    hwaddr iova = iotlb->iova + iommu->iommu_offset;

    if (vhost_backend_invalidate_device_iotlb(hdev, iova,
                                              iotlb->addr_mask + 1)) {
        error_report("Fail to invalidate device iotlb");
    }
}

static void vhost_iommu_region_add(MemoryListener *listener,
                                   MemoryRegionSection *section)
{
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
                                         iommu_listener);
    struct vhost_iommu *iommu;
    Int128 end;
    int iommu_idx;
    IOMMUMemoryRegion *iommu_mr;

    if (!memory_region_is_iommu(section->mr)) {
        return;
    }

    iommu_mr = IOMMU_MEMORY_REGION(section->mr);

    iommu = g_malloc0(sizeof(*iommu));
    end = int128_add(int128_make64(section->offset_within_region),
                     section->size);
    end = int128_sub(end, int128_one());
    iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr,
                                                   MEMTXATTRS_UNSPECIFIED);
    iommu_notifier_init(&iommu->n, vhost_iommu_unmap_notify,
                        dev->vdev->device_iotlb_enabled ?
                            IOMMU_NOTIFIER_DEVIOTLB_UNMAP :
                            IOMMU_NOTIFIER_UNMAP,
                        section->offset_within_region,
                        int128_get64(end),
                        iommu_idx);
    iommu->mr = section->mr;
    iommu->iommu_offset = section->offset_within_address_space -
                          section->offset_within_region;
    iommu->hdev = dev;
    memory_region_register_iommu_notifier(section->mr, &iommu->n,
                                          &error_fatal);
    QLIST_INSERT_HEAD(&dev->iommu_list, iommu, iommu_next);
    /* TODO: can replay help performance here? */
}

static void vhost_iommu_region_del(MemoryListener *listener,
                                   MemoryRegionSection *section)
{
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
                                         iommu_listener);
    struct vhost_iommu *iommu;

    if (!memory_region_is_iommu(section->mr)) {
        return;
    }

    QLIST_FOREACH(iommu, &dev->iommu_list, iommu_next) {
        if (iommu->mr == section->mr &&
            iommu->n.start == section->offset_within_region) {
            memory_region_unregister_iommu_notifier(iommu->mr,
                                                    &iommu->n);
            QLIST_REMOVE(iommu, iommu_next);
            g_free(iommu);
            break;
        }
    }
}

void vhost_toggle_device_iotlb(VirtIODevice *vdev)
{
    VirtioDeviceClass *vdc = VIRTIO_DEVICE_GET_CLASS(vdev);
    struct vhost_dev *dev;
    struct vhost_iommu *iommu;

    if (vdev->vhost_started) {
        dev = vdc->get_vhost(vdev);
    } else {
        return;
    }

    QLIST_FOREACH(iommu, &dev->iommu_list, iommu_next) {
        memory_region_unregister_iommu_notifier(iommu->mr, &iommu->n);
        iommu->n.notifier_flags = vdev->device_iotlb_enabled ?
                IOMMU_NOTIFIER_DEVIOTLB_UNMAP : IOMMU_NOTIFIER_UNMAP;
        memory_region_register_iommu_notifier(iommu->mr, &iommu->n,
                                              &error_fatal);
    }
}

static int vhost_virtqueue_set_addr(struct vhost_dev *dev,
                                    struct vhost_virtqueue *vq,
                                    unsigned idx, bool enable_log)
{
    struct vhost_vring_addr addr;
    int r;
    memset(&addr, 0, sizeof(struct vhost_vring_addr));

    if (dev->vhost_ops->vhost_vq_get_addr) {
        r = dev->vhost_ops->vhost_vq_get_addr(dev, &addr, vq);
        if (r < 0) {
            VHOST_OPS_DEBUG(r, "vhost_vq_get_addr failed");
            return r;
        }
    } else {
        addr.desc_user_addr = (uint64_t)(unsigned long)vq->desc;
        addr.avail_user_addr = (uint64_t)(unsigned long)vq->avail;
        addr.used_user_addr = (uint64_t)(unsigned long)vq->used;
    }
    addr.index = idx;
    addr.log_guest_addr = vq->used_phys;
    addr.flags = enable_log ? (1 << VHOST_VRING_F_LOG) : 0;
    r = dev->vhost_ops->vhost_set_vring_addr(dev, &addr);
    if (r < 0) {
        VHOST_OPS_DEBUG(r, "vhost_set_vring_addr failed");
    }
    return r;
}

static int vhost_dev_set_features(struct vhost_dev *dev,
                                  bool enable_log)
{
    uint64_t features = dev->acked_features;
    int r;
    if (enable_log) {
        features |= 0x1ULL << VHOST_F_LOG_ALL;
    }
    if (!vhost_dev_has_iommu(dev)) {
        features &= ~(0x1ULL << VIRTIO_F_IOMMU_PLATFORM);
    }
    if (dev->vhost_ops->vhost_force_iommu) {
        if (dev->vhost_ops->vhost_force_iommu(dev) == true) {
            features |= 0x1ULL << VIRTIO_F_IOMMU_PLATFORM;
        }
    }
    r = dev->vhost_ops->vhost_set_features(dev, features);
    if (r < 0) {
        VHOST_OPS_DEBUG(r, "vhost_set_features failed");
        goto out;
    }
    if (dev->vhost_ops->vhost_set_backend_cap) {
        r = dev->vhost_ops->vhost_set_backend_cap(dev);
        if (r < 0) {
            VHOST_OPS_DEBUG(r, "vhost_set_backend_cap failed");
            goto out;
        }
    }

out:
    return r;
}

static int vhost_dev_set_log(struct vhost_dev *dev, bool enable_log)
{
    int r, i, idx;
    hwaddr addr;

    r = vhost_dev_set_features(dev, enable_log);
    if (r < 0) {
        goto err_features;
    }
    for (i = 0; i < dev->nvqs; ++i) {
        idx = dev->vhost_ops->vhost_get_vq_index(dev, dev->vq_index + i);
        addr = virtio_queue_get_desc_addr(dev->vdev, idx);
        if (!addr) {
            /*
             * The queue might not be ready for start. If this
             * is the case there is no reason to continue the process.
             * Similar logic is used by the vhost_virtqueue_start()
             * routine.
             */
            continue;
        }
        r = vhost_virtqueue_set_addr(dev, dev->vqs + i, idx,
                                     enable_log);
        if (r < 0) {
            goto err_vq;
        }
    }

    /*
     * At log start we select our vhost_device logger that will scan the
     * memory sections, and skip it for the others. This is possible because
     * the log is shared amongst all vhost devices for a given type of
     * backend.
     */
    vhost_dev_elect_mem_logger(dev, enable_log);

    return 0;
err_vq:
    for (; i >= 0; --i) {
        idx = dev->vhost_ops->vhost_get_vq_index(dev, dev->vq_index + i);
        addr = virtio_queue_get_desc_addr(dev->vdev, idx);
        if (!addr) {
            continue;
        }
        vhost_virtqueue_set_addr(dev, dev->vqs + i, idx,
                                 dev->log_enabled);
    }
    vhost_dev_set_features(dev, dev->log_enabled);
err_features:
    return r;
}

static int vhost_migration_log(MemoryListener *listener, bool enable)
{
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
                                         memory_listener);
    int r;
    if (enable == dev->log_enabled) {
        return 0;
    }
    if (!dev->started) {
        dev->log_enabled = enable;
        return 0;
    }

    r = 0;
    if (!enable) {
        r = vhost_dev_set_log(dev, false);
        if (r < 0) {
            goto check_dev_state;
        }
        vhost_log_put(dev, false);
    } else {
        vhost_dev_log_resize(dev, vhost_get_log_size(dev));
        r = vhost_dev_set_log(dev, true);
        if (r < 0) {
            goto check_dev_state;
        }
    }

check_dev_state:
    dev->log_enabled = enable;
    /*
     * vhost-user-* devices could change their state during log
     * initialization due to disconnect. So check dev state after
     * vhost communication.
     */
    if (!dev->started) {
        /*
         * Since the device is in the stopped state, it is okay for
         * migration. Return success.
         */
        r = 0;
    }
    if (r) {
        /* An error occurred. */
        dev->log_enabled = false;
    }

    return r;
}

static bool vhost_log_global_start(MemoryListener *listener, Error **errp)
{
    int r;

    r = vhost_migration_log(listener, true);
    if (r < 0) {
        error_setg_errno(errp, -r, "vhost: Failed to start logging");
        return false;
    }
    return true;
}

static void vhost_log_global_stop(MemoryListener *listener)
{
    int r;

    r = vhost_migration_log(listener, false);
    if (r < 0) {
        /* Not fatal, so report it, but take no further action */
        warn_report("vhost: Failed to stop logging");
    }
}

static void vhost_log_start(MemoryListener *listener,
                            MemoryRegionSection *section,
                            int old, int new)
{
    /* FIXME: implement */
}

static void vhost_log_stop(MemoryListener *listener,
                           MemoryRegionSection *section,
                           int old, int new)
{
    /* FIXME: implement */
}

/* The vhost driver natively knows how to handle the vrings of non
 * cross-endian legacy devices and modern devices. Only legacy devices
 * exposed to a bi-endian guest may require the vhost driver to use a
 * specific endianness.
 */
static inline bool vhost_needs_vring_endian(VirtIODevice *vdev)
{
    if (virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) {
        return false;
    }
#if HOST_BIG_ENDIAN
    return vdev->device_endian == VIRTIO_DEVICE_ENDIAN_LITTLE;
#else
    return vdev->device_endian == VIRTIO_DEVICE_ENDIAN_BIG;
#endif
}

static int vhost_virtqueue_set_vring_endian_legacy(struct vhost_dev *dev,
                                                   bool is_big_endian,
                                                   int vhost_vq_index)
{
    int r;
    struct vhost_vring_state s = {
        .index = vhost_vq_index,
        .num = is_big_endian
    };

    r = dev->vhost_ops->vhost_set_vring_endian(dev, &s);
    if (r < 0) {
        VHOST_OPS_DEBUG(r, "vhost_set_vring_endian failed");
    }
    return r;
}

static int vhost_memory_region_lookup(struct vhost_dev *hdev,
                                      uint64_t gpa, uint64_t *uaddr,
                                      uint64_t *len)
{
    int i;

    for (i = 0; i < hdev->mem->nregions; i++) {
        struct vhost_memory_region *reg = hdev->mem->regions + i;

        if (gpa >= reg->guest_phys_addr &&
            reg->guest_phys_addr + reg->memory_size > gpa) {
            *uaddr = reg->userspace_addr + gpa - reg->guest_phys_addr;
            *len = reg->guest_phys_addr + reg->memory_size - gpa;
            return 0;
        }
    }

    return -EFAULT;
}

int vhost_device_iotlb_miss(struct vhost_dev *dev, uint64_t iova, int write)
{
    IOMMUTLBEntry iotlb;
    uint64_t uaddr, len;
    int ret = -EFAULT;

    RCU_READ_LOCK_GUARD();

    trace_vhost_iotlb_miss(dev, 1);

    iotlb = address_space_get_iotlb_entry(dev->vdev->dma_as,
                                          iova, write,
                                          MEMTXATTRS_UNSPECIFIED);
    if (iotlb.target_as != NULL) {
        ret = vhost_memory_region_lookup(dev, iotlb.translated_addr,
                                         &uaddr, &len);
        if (ret) {
            trace_vhost_iotlb_miss(dev, 3);
            error_report("Fail to lookup the translated address "
                         "%"PRIx64, iotlb.translated_addr);
            goto out;
        }

        len = MIN(iotlb.addr_mask + 1, len);
        iova = iova & ~iotlb.addr_mask;

        ret = vhost_backend_update_device_iotlb(dev, iova, uaddr,
                                                len, iotlb.perm);
        if (ret) {
            trace_vhost_iotlb_miss(dev, 4);
            error_report("Fail to update device iotlb");
device iotlb"); 1229 goto out; 1230 } 1231 } 1232 1233 trace_vhost_iotlb_miss(dev, 2); 1234 1235 out: 1236 return ret; 1237 } 1238 1239 int vhost_virtqueue_start(struct vhost_dev *dev, 1240 struct VirtIODevice *vdev, 1241 struct vhost_virtqueue *vq, 1242 unsigned idx) 1243 { 1244 BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev))); 1245 VirtioBusState *vbus = VIRTIO_BUS(qbus); 1246 VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(vbus); 1247 hwaddr s, l, a; 1248 int r; 1249 int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, idx); 1250 struct vhost_vring_file file = { 1251 .index = vhost_vq_index 1252 }; 1253 struct vhost_vring_state state = { 1254 .index = vhost_vq_index 1255 }; 1256 struct VirtQueue *vvq = virtio_get_queue(vdev, idx); 1257 1258 a = virtio_queue_get_desc_addr(vdev, idx); 1259 if (a == 0) { 1260 /* Queue might not be ready for start */ 1261 return 0; 1262 } 1263 1264 vq->num = state.num = virtio_queue_get_num(vdev, idx); 1265 r = dev->vhost_ops->vhost_set_vring_num(dev, &state); 1266 if (r) { 1267 VHOST_OPS_DEBUG(r, "vhost_set_vring_num failed"); 1268 return r; 1269 } 1270 1271 state.num = virtio_queue_get_last_avail_idx(vdev, idx); 1272 r = dev->vhost_ops->vhost_set_vring_base(dev, &state); 1273 if (r) { 1274 VHOST_OPS_DEBUG(r, "vhost_set_vring_base failed"); 1275 return r; 1276 } 1277 1278 if (vhost_needs_vring_endian(vdev)) { 1279 r = vhost_virtqueue_set_vring_endian_legacy(dev, 1280 virtio_is_big_endian(vdev), 1281 vhost_vq_index); 1282 if (r) { 1283 return r; 1284 } 1285 } 1286 1287 vq->desc_size = s = l = virtio_queue_get_desc_size(vdev, idx); 1288 vq->desc_phys = a; 1289 vq->desc = vhost_memory_map(dev, a, &l, false); 1290 if (!vq->desc || l != s) { 1291 r = -ENOMEM; 1292 goto fail_alloc_desc; 1293 } 1294 vq->avail_size = s = l = virtio_queue_get_avail_size(vdev, idx); 1295 vq->avail_phys = a = virtio_queue_get_avail_addr(vdev, idx); 1296 vq->avail = vhost_memory_map(dev, a, &l, false); 1297 if (!vq->avail || l != s) { 1298 r = -ENOMEM; 1299 goto fail_alloc_avail; 1300 } 1301 vq->used_size = s = l = virtio_queue_get_used_size(vdev, idx); 1302 vq->used_phys = a = virtio_queue_get_used_addr(vdev, idx); 1303 vq->used = vhost_memory_map(dev, a, &l, true); 1304 if (!vq->used || l != s) { 1305 r = -ENOMEM; 1306 goto fail_alloc_used; 1307 } 1308 1309 r = vhost_virtqueue_set_addr(dev, vq, vhost_vq_index, dev->log_enabled); 1310 if (r < 0) { 1311 goto fail_alloc; 1312 } 1313 1314 file.fd = event_notifier_get_fd(virtio_queue_get_host_notifier(vvq)); 1315 r = dev->vhost_ops->vhost_set_vring_kick(dev, &file); 1316 if (r) { 1317 VHOST_OPS_DEBUG(r, "vhost_set_vring_kick failed"); 1318 goto fail_kick; 1319 } 1320 1321 /* Clear and discard previous events if any. */ 1322 event_notifier_test_and_clear(&vq->masked_notifier); 1323 1324 /* Init vring in unmasked state, unless guest_notifier_mask 1325 * will do it later. 1326 */ 1327 if (!vdev->use_guest_notifier_mask) { 1328 /* TODO: check and handle errors. 
         */
        vhost_virtqueue_mask(dev, vdev, idx, false);
    }

    if (k->query_guest_notifiers &&
        k->query_guest_notifiers(qbus->parent) &&
        virtio_queue_vector(vdev, idx) == VIRTIO_NO_VECTOR) {
        file.fd = -1;
        r = dev->vhost_ops->vhost_set_vring_call(dev, &file);
        if (r) {
            goto fail_vector;
        }
    }

    return 0;

fail_vector:
fail_kick:
fail_alloc:
    vhost_memory_unmap(dev, vq->used, virtio_queue_get_used_size(vdev, idx),
                       0, 0);
fail_alloc_used:
    vhost_memory_unmap(dev, vq->avail, virtio_queue_get_avail_size(vdev, idx),
                       0, 0);
fail_alloc_avail:
    vhost_memory_unmap(dev, vq->desc, virtio_queue_get_desc_size(vdev, idx),
                       0, 0);
fail_alloc_desc:
    return r;
}

static int do_vhost_virtqueue_stop(struct vhost_dev *dev,
                                   struct VirtIODevice *vdev,
                                   struct vhost_virtqueue *vq,
                                   unsigned idx, bool force)
{
    int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, idx);
    struct vhost_vring_state state = {
        .index = vhost_vq_index,
    };
    int r = 0;

    if (virtio_queue_get_desc_addr(vdev, idx) == 0) {
        /* Don't stop the virtqueue which might have not been started */
        return 0;
    }

    if (!force) {
        r = dev->vhost_ops->vhost_get_vring_base(dev, &state);
        if (r < 0) {
            VHOST_OPS_DEBUG(r, "vhost VQ %u ring restore failed: %d", idx, r);
        }
    }

    if (r < 0 || force) {
        /* Connection to the backend is broken, so let's sync internal
         * last avail idx to the device used idx.
         */
        virtio_queue_restore_last_avail_idx(vdev, idx);
    } else {
        virtio_queue_set_last_avail_idx(vdev, idx, state.num);
    }
    virtio_queue_invalidate_signalled_used(vdev, idx);
    virtio_queue_update_used_idx(vdev, idx);

    /* In the cross-endian case, we need to reset the vring endianness to
     * native, as legacy devices expect by default.
     */
    if (vhost_needs_vring_endian(vdev)) {
        vhost_virtqueue_set_vring_endian_legacy(dev,
                                                !virtio_is_big_endian(vdev),
                                                vhost_vq_index);
    }

    vhost_memory_unmap(dev, vq->used, virtio_queue_get_used_size(vdev, idx),
                       1, virtio_queue_get_used_size(vdev, idx));
    vhost_memory_unmap(dev, vq->avail, virtio_queue_get_avail_size(vdev, idx),
                       0, virtio_queue_get_avail_size(vdev, idx));
    vhost_memory_unmap(dev, vq->desc, virtio_queue_get_desc_size(vdev, idx),
                       0, virtio_queue_get_desc_size(vdev, idx));
    return r;
}

int vhost_virtqueue_stop(struct vhost_dev *dev,
                         struct VirtIODevice *vdev,
                         struct vhost_virtqueue *vq,
                         unsigned idx)
{
    return do_vhost_virtqueue_stop(dev, vdev, vq, idx, false);
}

static int vhost_virtqueue_set_busyloop_timeout(struct vhost_dev *dev,
                                                int n, uint32_t timeout)
{
    int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, n);
    struct vhost_vring_state state = {
        .index = vhost_vq_index,
        .num = timeout,
    };
    int r;

    if (!dev->vhost_ops->vhost_set_vring_busyloop_timeout) {
        return -EINVAL;
    }

    r = dev->vhost_ops->vhost_set_vring_busyloop_timeout(dev, &state);
    if (r) {
        VHOST_OPS_DEBUG(r, "vhost_set_vring_busyloop_timeout failed");
        return r;
    }

    return 0;
}

static void vhost_virtqueue_error_notifier(EventNotifier *n)
{
    struct vhost_virtqueue *vq = container_of(n, struct vhost_virtqueue,
                                              error_notifier);
    struct vhost_dev *dev = vq->dev;
    int index = vq - dev->vqs;

    if (event_notifier_test_and_clear(n) && dev->vdev) {
        VHOST_OPS_DEBUG(-EINVAL, "vhost vring error in virtqueue %d",
                        dev->vq_index + index);
    }
}

static int vhost_virtqueue_init(struct vhost_dev *dev,
                                struct vhost_virtqueue *vq, int n)
{
    int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, n);
    struct vhost_vring_file file = {
        .index = vhost_vq_index,
    };
    int r = event_notifier_init(&vq->masked_notifier, 0);
    if (r < 0) {
        return r;
    }

    file.fd = event_notifier_get_wfd(&vq->masked_notifier);
    r = dev->vhost_ops->vhost_set_vring_call(dev, &file);
    if (r) {
        VHOST_OPS_DEBUG(r, "vhost_set_vring_call failed");
        goto fail_call;
    }

    vq->dev = dev;

    if (dev->vhost_ops->vhost_set_vring_err) {
        r = event_notifier_init(&vq->error_notifier, 0);
        if (r < 0) {
            goto fail_call;
        }

        file.fd = event_notifier_get_fd(&vq->error_notifier);
        r = dev->vhost_ops->vhost_set_vring_err(dev, &file);
        if (r) {
            VHOST_OPS_DEBUG(r, "vhost_set_vring_err failed");
            goto fail_err;
        }

        event_notifier_set_handler(&vq->error_notifier,
                                   vhost_virtqueue_error_notifier);
    }

    return 0;

fail_err:
    event_notifier_cleanup(&vq->error_notifier);
fail_call:
    event_notifier_cleanup(&vq->masked_notifier);
    return r;
}

static void vhost_virtqueue_cleanup(struct vhost_virtqueue *vq)
{
    event_notifier_cleanup(&vq->masked_notifier);
    if (vq->dev->vhost_ops->vhost_set_vring_err) {
        event_notifier_set_handler(&vq->error_notifier, NULL);
        event_notifier_cleanup(&vq->error_notifier);
    }
}

int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
                   VhostBackendType backend_type, uint32_t busyloop_timeout,
                   Error **errp)
{
    unsigned int used, reserved, limit;
    uint64_t features;
    int i, r, n_initialized_vqs = 0;

    hdev->vdev = NULL;
    hdev->migration_blocker = NULL;

    r = vhost_set_backend_type(hdev, backend_type);
    assert(r >= 0);

    r = hdev->vhost_ops->vhost_backend_init(hdev, opaque, errp);
    if (r < 0) {
        goto fail;
    }

    r = hdev->vhost_ops->vhost_set_owner(hdev);
    if (r < 0) {
        error_setg_errno(errp, -r, "vhost_set_owner failed");
        goto fail;
    }

    r = hdev->vhost_ops->vhost_get_features(hdev, &features);
    if (r < 0) {
        error_setg_errno(errp, -r, "vhost_get_features failed");
        goto fail;
    }

    limit = hdev->vhost_ops->vhost_backend_memslots_limit(hdev);
    if (limit < MEMORY_DEVICES_SAFE_MAX_MEMSLOTS &&
        memory_devices_memslot_auto_decision_active()) {
        error_setg(errp, "some memory device (like virtio-mem)"
                   " decided how many memory slots to use based on the overall"
                   " number of memory slots; this vhost backend would further"
                   " restrict the overall number of memory slots");
        error_append_hint(errp, "Try plugging this vhost backend before"
                          " plugging such memory devices.\n");
        r = -EINVAL;
        goto fail;
    }

    for (i = 0; i < hdev->nvqs; ++i, ++n_initialized_vqs) {
        r = vhost_virtqueue_init(hdev, hdev->vqs + i, hdev->vq_index + i);
        if (r < 0) {
            error_setg_errno(errp, -r, "Failed to initialize virtqueue %d", i);
            goto fail;
        }
    }

    if (busyloop_timeout) {
        for (i = 0; i < hdev->nvqs; ++i) {
            r = vhost_virtqueue_set_busyloop_timeout(hdev, hdev->vq_index + i,
                                                     busyloop_timeout);
            if (r < 0) {
                error_setg_errno(errp, -r, "Failed to set busyloop timeout");
                goto fail_busyloop;
            }
        }
    }

    hdev->features = features;

    hdev->memory_listener = (MemoryListener) {
        .name = "vhost",
        .begin = vhost_begin,
        .commit = vhost_commit,
        .region_add = vhost_region_addnop,
        .region_nop = vhost_region_addnop,
        .log_start = vhost_log_start,
        .log_stop = vhost_log_stop,
        .log_sync = vhost_log_sync,
        .log_global_start = vhost_log_global_start,
        .log_global_stop = vhost_log_global_stop,
        .priority = MEMORY_LISTENER_PRIORITY_DEV_BACKEND
    };

    hdev->iommu_listener = (MemoryListener) {
        .name = "vhost-iommu",
        .region_add = vhost_iommu_region_add,
        .region_del = vhost_iommu_region_del,
    };

    if (hdev->migration_blocker == NULL) {
        if (!(hdev->features & (0x1ULL << VHOST_F_LOG_ALL))) {
            error_setg(&hdev->migration_blocker,
                       "Migration disabled: vhost lacks VHOST_F_LOG_ALL feature.");
        } else if (vhost_dev_log_is_shared(hdev) && !qemu_memfd_alloc_check()) {
            error_setg(&hdev->migration_blocker,
                       "Migration disabled: failed to allocate shared memory");
        }
    }

    if (hdev->migration_blocker != NULL) {
        r = migrate_add_blocker_normal(&hdev->migration_blocker, errp);
        if (r < 0) {
            goto fail_busyloop;
        }
    }

    hdev->mem = g_malloc0(offsetof(struct vhost_memory, regions));
    hdev->n_mem_sections = 0;
    hdev->mem_sections = NULL;
    hdev->log = NULL;
    hdev->log_size = 0;
    hdev->log_enabled = false;
    hdev->started = false;
    memory_listener_register(&hdev->memory_listener, &address_space_memory);
    QLIST_INSERT_HEAD(&vhost_devices, hdev, entry);

    /*
     * The listener we registered properly set up the number of required
     * memslots in vhost_commit().
     */
    used = hdev->mem->nregions;

    /*
     * We assume that all reserved memslots actually require a real memslot
     * in our vhost backend. This might not be true, for example, if the
     * memslot would be ROM. If ever relevant, we can optimize for that --
     * but we'll need additional information about the reservations.
     */
    reserved = memory_devices_get_reserved_memslots();
    if (used + reserved > limit) {
        error_setg(errp, "vhost backend memory slots limit (%d) is less"
                   " than current number of used (%d) and reserved (%d)"
                   " memory slots for memory devices.", limit, used, reserved);
        r = -EINVAL;
        goto fail_busyloop;
    }

    return 0;

fail_busyloop:
    if (busyloop_timeout) {
        while (--i >= 0) {
            vhost_virtqueue_set_busyloop_timeout(hdev, hdev->vq_index + i, 0);
        }
    }
fail:
    hdev->nvqs = n_initialized_vqs;
    vhost_dev_cleanup(hdev);
    return r;
}

void vhost_dev_cleanup(struct vhost_dev *hdev)
{
    int i;

    trace_vhost_dev_cleanup(hdev);

    for (i = 0; i < hdev->nvqs; ++i) {
        vhost_virtqueue_cleanup(hdev->vqs + i);
    }
    if (hdev->mem) {
        /* those are only safe after successful init */
        memory_listener_unregister(&hdev->memory_listener);
        QLIST_REMOVE(hdev, entry);
    }
    migrate_del_blocker(&hdev->migration_blocker);
    g_free(hdev->mem);
    g_free(hdev->mem_sections);
    if (hdev->vhost_ops) {
        hdev->vhost_ops->vhost_backend_cleanup(hdev);
    }
    assert(!hdev->log);

    memset(hdev, 0, sizeof(struct vhost_dev));
}

void vhost_dev_disable_notifiers_nvqs(struct vhost_dev *hdev,
                                      VirtIODevice *vdev,
                                      unsigned int nvqs)
{
    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
    int i, r;

    /*
     * Batch all the host notifiers in a single transaction to avoid
     * quadratic time complexity in address_space_update_ioeventfds().
     */
    memory_region_transaction_begin();

    for (i = 0; i < nvqs; ++i) {
        r = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i,
                                         false);
        if (r < 0) {
            error_report("vhost VQ %d notifier cleanup failed: %d", i, -r);
        }
        assert(r >= 0);
    }

    /*
     * The transaction expects the ioeventfds to be open when it
     * commits. Do it now, before the cleanup loop.
     */
    memory_region_transaction_commit();

    for (i = 0; i < nvqs; ++i) {
        virtio_bus_cleanup_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i);
    }
    virtio_device_release_ioeventfd(vdev);
}

/* Stop processing guest IO notifications in qemu.
 * Start processing them in vhost in kernel.
 */
int vhost_dev_enable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev)
{
    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
    int i, r;

    /* We will pass the notifiers to the kernel, make sure that QEMU
     * doesn't interfere.
     */
    r = virtio_device_grab_ioeventfd(vdev);
    if (r < 0) {
        error_report("binding does not support host notifiers");
        return r;
    }

    /*
     * Batch all the host notifiers in a single transaction to avoid
     * quadratic time complexity in address_space_update_ioeventfds().
     */
    memory_region_transaction_begin();

    for (i = 0; i < hdev->nvqs; ++i) {
        r = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i,
                                         true);
        if (r < 0) {
            error_report("vhost VQ %d notifier binding failed: %d", i, -r);
            memory_region_transaction_commit();
            vhost_dev_disable_notifiers_nvqs(hdev, vdev, i);
            return r;
        }
    }

    memory_region_transaction_commit();

    return 0;
}

/* Stop processing guest IO notifications in vhost.
 * Start processing them in qemu.
 * This might actually run the qemu handlers right away,
 * so virtio in qemu must be completely setup when this is called.
 */
void vhost_dev_disable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev)
{
    vhost_dev_disable_notifiers_nvqs(hdev, vdev, hdev->nvqs);
}

/* Test and clear event pending status.
 * Should be called after unmask to avoid losing events.
 */
bool vhost_virtqueue_pending(struct vhost_dev *hdev, int n)
{
    struct vhost_virtqueue *vq = hdev->vqs + n - hdev->vq_index;
    assert(n >= hdev->vq_index && n < hdev->vq_index + hdev->nvqs);
    return event_notifier_test_and_clear(&vq->masked_notifier);
}

/* Mask/unmask events from this vq. */
void vhost_virtqueue_mask(struct vhost_dev *hdev, VirtIODevice *vdev, int n,
                          bool mask)
{
    struct VirtQueue *vvq = virtio_get_queue(vdev, n);
    int r, index = n - hdev->vq_index;
    struct vhost_vring_file file;

    /* should only be called after backend is connected */
    assert(hdev->vhost_ops);

    if (mask) {
        assert(vdev->use_guest_notifier_mask);
        file.fd = event_notifier_get_wfd(&hdev->vqs[index].masked_notifier);
    } else {
        file.fd = event_notifier_get_wfd(virtio_queue_get_guest_notifier(vvq));
    }

    file.index = hdev->vhost_ops->vhost_get_vq_index(hdev, n);
    r = hdev->vhost_ops->vhost_set_vring_call(hdev, &file);
    if (r < 0) {
        error_report("vhost_set_vring_call failed %d", -r);
    }
}

bool vhost_config_pending(struct vhost_dev *hdev)
{
    assert(hdev->vhost_ops);
    if ((hdev->started == false) ||
        (hdev->vhost_ops->vhost_set_config_call == NULL)) {
        return false;
    }

    EventNotifier *notifier =
        &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier;
    return event_notifier_test_and_clear(notifier);
}

void vhost_config_mask(struct vhost_dev *hdev, VirtIODevice *vdev, bool mask)
{
    int fd;
    int r;
    EventNotifier *notifier =
        &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier;
    EventNotifier *config_notifier = &vdev->config_notifier;
    assert(hdev->vhost_ops);

    if ((hdev->started == false) ||
        (hdev->vhost_ops->vhost_set_config_call == NULL)) {
        return;
    }
    if (mask) {
        assert(vdev->use_guest_notifier_mask);
        fd = event_notifier_get_fd(notifier);
    } else {
        fd = event_notifier_get_fd(config_notifier);
    }
    r = hdev->vhost_ops->vhost_set_config_call(hdev, fd);
    if (r < 0) {
        error_report("vhost_set_config_call failed %d", -r);
    }
}

static void vhost_stop_config_intr(struct vhost_dev *dev)
{
    int fd = -1;
    assert(dev->vhost_ops);
    if (dev->vhost_ops->vhost_set_config_call) {
        dev->vhost_ops->vhost_set_config_call(dev, fd);
    }
}

static void vhost_start_config_intr(struct vhost_dev *dev)
{
    int r;

    assert(dev->vhost_ops);
    int fd = event_notifier_get_fd(&dev->vdev->config_notifier);
    if (dev->vhost_ops->vhost_set_config_call) {
        r = dev->vhost_ops->vhost_set_config_call(dev, fd);
        if (!r) {
            event_notifier_set(&dev->vdev->config_notifier);
        }
    }
}

uint64_t vhost_get_features(struct vhost_dev *hdev, const int *feature_bits,
                            uint64_t features)
{
    const int *bit = feature_bits;
    while (*bit != VHOST_INVALID_FEATURE_BIT) {
        uint64_t bit_mask = (1ULL << *bit);
        if (!(hdev->features & bit_mask)) {
            features &= ~bit_mask;
        }
        bit++;
    }
    return features;
}

void vhost_ack_features(struct vhost_dev *hdev, const int *feature_bits,
                        uint64_t features)
{
    const int *bit = feature_bits;
    while (*bit != VHOST_INVALID_FEATURE_BIT) {
        uint64_t bit_mask = (1ULL << *bit);
        if (features & bit_mask) {
            hdev->acked_features |= bit_mask;
        }
        bit++;
    }
}

int vhost_dev_get_config(struct vhost_dev *hdev, uint8_t *config,
                         uint32_t config_len, Error **errp)
{
    assert(hdev->vhost_ops);

    if (hdev->vhost_ops->vhost_get_config) {
        return hdev->vhost_ops->vhost_get_config(hdev, config, config_len,
                                                 errp);
    }

    error_setg(errp, "vhost_get_config not implemented");
    return -ENOSYS;
}

int vhost_dev_set_config(struct vhost_dev *hdev, const uint8_t *data,
                         uint32_t offset, uint32_t size, uint32_t flags)
{
    assert(hdev->vhost_ops);

    if (hdev->vhost_ops->vhost_set_config) {
        return hdev->vhost_ops->vhost_set_config(hdev, data, offset,
                                                 size, flags);
    }

    return -ENOSYS;
}

void vhost_dev_set_config_notifier(struct vhost_dev *hdev,
                                   const VhostDevConfigOps *ops)
{
    hdev->config_ops = ops;
}

void vhost_dev_free_inflight(struct vhost_inflight *inflight)
{
    if (inflight && inflight->addr) {
        qemu_memfd_free(inflight->addr, inflight->size, inflight->fd);
        inflight->addr = NULL;
        inflight->fd = -1;
    }
}

int vhost_dev_prepare_inflight(struct vhost_dev *hdev, VirtIODevice *vdev)
{
    int r;

    if (hdev->vhost_ops->vhost_get_inflight_fd == NULL ||
        hdev->vhost_ops->vhost_set_inflight_fd == NULL) {
        return 0;
    }

    hdev->vdev = vdev;

    r = vhost_dev_set_features(hdev, hdev->log_enabled);
    if (r < 0) {
        VHOST_OPS_DEBUG(r, "vhost_dev_prepare_inflight failed");
        return r;
    }

    return 0;
}

int vhost_dev_set_inflight(struct vhost_dev *dev,
                           struct vhost_inflight *inflight)
{
    int r;

    if (dev->vhost_ops->vhost_set_inflight_fd && inflight->addr) {
        r = dev->vhost_ops->vhost_set_inflight_fd(dev, inflight);
        if (r) {
            VHOST_OPS_DEBUG(r, "vhost_set_inflight_fd failed");
            return r;
        }
    }

    return 0;
}

int vhost_dev_get_inflight(struct vhost_dev *dev, uint16_t queue_size,
                           struct vhost_inflight *inflight)
{
    int r;

    if (dev->vhost_ops->vhost_get_inflight_fd) {
        r = dev->vhost_ops->vhost_get_inflight_fd(dev, queue_size, inflight);
        if (r) {
            VHOST_OPS_DEBUG(r, "vhost_get_inflight_fd failed");
            return r;
        }
    }

    return 0;
}

static int vhost_dev_set_vring_enable(struct vhost_dev *hdev, int enable)
{
    if (!hdev->vhost_ops->vhost_set_vring_enable) {
        return 0;
    }
static int vhost_dev_set_vring_enable(struct vhost_dev *hdev, int enable)
{
    if (!hdev->vhost_ops->vhost_set_vring_enable) {
        return 0;
    }

    /*
     * For vhost-user devices, if VHOST_USER_F_PROTOCOL_FEATURES has not
     * been negotiated, the rings start directly in the enabled state, and
     * the .vhost_set_vring_enable callback will fail since
     * VHOST_USER_SET_VRING_ENABLE is not supported.
     */
    if (hdev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER &&
        !virtio_has_feature(hdev->backend_features,
                            VHOST_USER_F_PROTOCOL_FEATURES)) {
        return 0;
    }

    return hdev->vhost_ops->vhost_set_vring_enable(hdev, enable);
}
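
/*
 * Illustrative sketch (not built): the same protocol-feature gating expressed
 * as a standalone predicate.  example_backend_can_toggle_vrings() is
 * hypothetical; it simply mirrors the check that vhost_dev_set_vring_enable()
 * above performs before issuing the call.
 */
#if 0
static bool example_backend_can_toggle_vrings(struct vhost_dev *hdev)
{
    if (!hdev->vhost_ops->vhost_set_vring_enable) {
        return false;
    }

    /*
     * vhost-user needs VHOST_USER_SET_VRING_ENABLE, which is only available
     * once VHOST_USER_F_PROTOCOL_FEATURES has been negotiated.
     */
    return hdev->vhost_ops->backend_type != VHOST_BACKEND_TYPE_USER ||
           virtio_has_feature(hdev->backend_features,
                              VHOST_USER_F_PROTOCOL_FEATURES);
}
#endif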
/*
 * Host notifiers must be enabled at this point.
 *
 * If @vrings is true, this function will enable all vrings before starting the
 * device. If it is false, the vring initialization is left to be done by the
 * caller.
 */
int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings)
{
    int i, r;

    /* should only be called after backend is connected */
    assert(hdev->vhost_ops);

    trace_vhost_dev_start(hdev, vdev->name, vrings);

    vdev->vhost_started = true;
    hdev->started = true;
    hdev->vdev = vdev;

    r = vhost_dev_set_features(hdev, hdev->log_enabled);
    if (r < 0) {
        goto fail_features;
    }

    if (vhost_dev_has_iommu(hdev)) {
        memory_listener_register(&hdev->iommu_listener, vdev->dma_as);
    }

    r = hdev->vhost_ops->vhost_set_mem_table(hdev, hdev->mem);
    if (r < 0) {
        VHOST_OPS_DEBUG(r, "vhost_set_mem_table failed");
        goto fail_mem;
    }
    for (i = 0; i < hdev->nvqs; ++i) {
        r = vhost_virtqueue_start(hdev,
                                  vdev,
                                  hdev->vqs + i,
                                  hdev->vq_index + i);
        if (r < 0) {
            goto fail_vq;
        }
    }

    r = event_notifier_init(
        &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier, 0);
    if (r < 0) {
        VHOST_OPS_DEBUG(r, "event_notifier_init failed");
        goto fail_vq;
    }
    event_notifier_test_and_clear(
        &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier);
    if (!vdev->use_guest_notifier_mask) {
        vhost_config_mask(hdev, vdev, true);
    }
    if (hdev->log_enabled) {
        uint64_t log_base;

        hdev->log_size = vhost_get_log_size(hdev);
        hdev->log = vhost_log_get(hdev->vhost_ops->backend_type,
                                  hdev->log_size,
                                  vhost_dev_log_is_shared(hdev));
        log_base = (uintptr_t)hdev->log->log;
        r = hdev->vhost_ops->vhost_set_log_base(hdev,
                                                hdev->log_size ? log_base : 0,
                                                hdev->log);
        if (r < 0) {
            VHOST_OPS_DEBUG(r, "vhost_set_log_base failed");
            goto fail_log;
        }
        vhost_dev_elect_mem_logger(hdev, true);
    }
    if (vrings) {
        r = vhost_dev_set_vring_enable(hdev, true);
        if (r) {
            goto fail_log;
        }
    }
    if (hdev->vhost_ops->vhost_dev_start) {
        r = hdev->vhost_ops->vhost_dev_start(hdev, true);
        if (r) {
            goto fail_start;
        }
    }
    if (vhost_dev_has_iommu(hdev) &&
        hdev->vhost_ops->vhost_set_iotlb_callback) {
        hdev->vhost_ops->vhost_set_iotlb_callback(hdev, true);

        /*
         * Update used ring information for the IOTLB to work correctly;
         * the vhost-kernel code requires this.
         */
        for (i = 0; i < hdev->nvqs; ++i) {
            struct vhost_virtqueue *vq = hdev->vqs + i;
            r = vhost_device_iotlb_miss(hdev, vq->used_phys, true);
            if (r) {
                goto fail_iotlb;
            }
        }
    }
    vhost_start_config_intr(hdev);
    return 0;
fail_iotlb:
    if (vhost_dev_has_iommu(hdev) &&
        hdev->vhost_ops->vhost_set_iotlb_callback) {
        hdev->vhost_ops->vhost_set_iotlb_callback(hdev, false);
    }
    if (hdev->vhost_ops->vhost_dev_start) {
        hdev->vhost_ops->vhost_dev_start(hdev, false);
    }
fail_start:
    if (vrings) {
        vhost_dev_set_vring_enable(hdev, false);
    }
fail_log:
    vhost_log_put(hdev, false);
fail_vq:
    while (--i >= 0) {
        vhost_virtqueue_stop(hdev,
                             vdev,
                             hdev->vqs + i,
                             hdev->vq_index + i);
    }

fail_mem:
    if (vhost_dev_has_iommu(hdev)) {
        memory_listener_unregister(&hdev->iommu_listener);
    }
fail_features:
    vdev->vhost_started = false;
    hdev->started = false;
    return r;
}
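
/*
 * Illustrative sketch (not built): the ordering a virtio front-end typically
 * follows around vhost_dev_start().  The example_device_start() wrapper is
 * hypothetical; real callers (e.g. vhost_net_start_one()) additionally handle
 * guest notifiers through the virtio bus before starting the backend.
 */
#if 0
static int example_device_start(struct vhost_dev *hdev, VirtIODevice *vdev)
{
    int r;

    /* Host notifiers must be wired up before the backend is started */
    r = vhost_dev_enable_notifiers(hdev, vdev);
    if (r < 0) {
        return r;
    }

    /* Let vhost_dev_start() enable the vrings for us */
    r = vhost_dev_start(hdev, vdev, true);
    if (r < 0) {
        vhost_dev_disable_notifiers(hdev, vdev);
        return r;
    }

    return 0;
}
#endif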
/* Host notifiers must be enabled at this point. */
static int do_vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev,
                             bool vrings, bool force)
{
    int i;
    int rc = 0;

    /* should only be called after backend is connected */
    assert(hdev->vhost_ops);
    event_notifier_test_and_clear(
        &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier);
    event_notifier_test_and_clear(&vdev->config_notifier);
    event_notifier_cleanup(
        &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier);

    trace_vhost_dev_stop(hdev, vdev->name, vrings);

    if (hdev->vhost_ops->vhost_dev_start) {
        hdev->vhost_ops->vhost_dev_start(hdev, false);
    }
    if (vrings) {
        vhost_dev_set_vring_enable(hdev, false);
    }
    for (i = 0; i < hdev->nvqs; ++i) {
        rc |= do_vhost_virtqueue_stop(hdev,
                                      vdev,
                                      hdev->vqs + i,
                                      hdev->vq_index + i,
                                      force);
    }
    if (hdev->vhost_ops->vhost_reset_status) {
        hdev->vhost_ops->vhost_reset_status(hdev);
    }

    if (vhost_dev_has_iommu(hdev)) {
        if (hdev->vhost_ops->vhost_set_iotlb_callback) {
            hdev->vhost_ops->vhost_set_iotlb_callback(hdev, false);
        }
        memory_listener_unregister(&hdev->iommu_listener);
    }
    vhost_stop_config_intr(hdev);
    vhost_log_put(hdev, true);
    hdev->started = false;
    vdev->vhost_started = false;
    hdev->vdev = NULL;
    return rc;
}

int vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings)
{
    return do_vhost_dev_stop(hdev, vdev, vrings, false);
}

int vhost_dev_force_stop(struct vhost_dev *hdev, VirtIODevice *vdev,
                         bool vrings)
{
    return do_vhost_dev_stop(hdev, vdev, vrings, true);
}

int vhost_net_set_backend(struct vhost_dev *hdev,
                          struct vhost_vring_file *file)
{
    if (hdev->vhost_ops->vhost_net_set_backend) {
        return hdev->vhost_ops->vhost_net_set_backend(hdev, file);
    }

    return -ENOSYS;
}

int vhost_reset_device(struct vhost_dev *hdev)
{
    if (hdev->vhost_ops->vhost_reset_device) {
        return hdev->vhost_ops->vhost_reset_device(hdev);
    }

    return -ENOSYS;
}

bool vhost_supports_device_state(struct vhost_dev *dev)
{
    if (dev->vhost_ops->vhost_supports_device_state) {
        return dev->vhost_ops->vhost_supports_device_state(dev);
    }

    return false;
}

int vhost_set_device_state_fd(struct vhost_dev *dev,
                              VhostDeviceStateDirection direction,
                              VhostDeviceStatePhase phase,
                              int fd,
                              int *reply_fd,
                              Error **errp)
{
    if (dev->vhost_ops->vhost_set_device_state_fd) {
        return dev->vhost_ops->vhost_set_device_state_fd(dev, direction, phase,
                                                         fd, reply_fd, errp);
    }

    error_setg(errp,
               "vhost transport does not support migration state transfer");
    return -ENOSYS;
}

int vhost_check_device_state(struct vhost_dev *dev, Error **errp)
{
    if (dev->vhost_ops->vhost_check_device_state) {
        return dev->vhost_ops->vhost_check_device_state(dev, errp);
    }

    error_setg(errp,
               "vhost transport does not support migration state transfer");
    return -ENOSYS;
}
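
/*
 * Illustrative sketch (not built): how a device's migration code might gate
 * the state-transfer path on backend support.  example_save_state() is
 * hypothetical; the real entry points are vhost_save_backend_state() and
 * vhost_load_backend_state() below, called from the owning device's
 * migration handlers.
 */
#if 0
static int example_save_state(struct vhost_dev *dev, QEMUFile *f, Error **errp)
{
    if (!vhost_supports_device_state(dev)) {
        /* Fall back to whatever device-specific migration the front-end has */
        error_setg(errp, "backend does not support device state transfer");
        return -ENOTSUP;
    }

    return vhost_save_backend_state(dev, f, errp);
}
#endif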
int vhost_save_backend_state(struct vhost_dev *dev, QEMUFile *f, Error **errp)
{
    ERRP_GUARD();
    /* Maximum chunk size in which to transfer the state */
    const size_t chunk_size = 1 * 1024 * 1024;
    g_autofree void *transfer_buf = NULL;
    g_autoptr(GError) g_err = NULL;
    int pipe_fds[2], read_fd = -1, write_fd = -1, reply_fd = -1;
    int ret;

    /* [0] for reading (our end), [1] for writing (back-end's end) */
    if (!g_unix_open_pipe(pipe_fds, FD_CLOEXEC, &g_err)) {
        error_setg(errp, "Failed to set up state transfer pipe: %s",
                   g_err->message);
        ret = -EINVAL;
        goto fail;
    }

    read_fd = pipe_fds[0];
    write_fd = pipe_fds[1];

    /*
     * VHOST_TRANSFER_STATE_PHASE_STOPPED means the device must be stopped.
     * Ideally, it is suspended, but SUSPEND/RESUME currently do not exist for
     * vhost-user, so just check that it is stopped at all.
     */
    assert(!dev->started);

    /* Transfer ownership of write_fd to the back-end */
    ret = vhost_set_device_state_fd(dev,
                                    VHOST_TRANSFER_STATE_DIRECTION_SAVE,
                                    VHOST_TRANSFER_STATE_PHASE_STOPPED,
                                    write_fd,
                                    &reply_fd,
                                    errp);
    if (ret < 0) {
        error_prepend(errp, "Failed to initiate state transfer: ");
        goto fail;
    }

    /* If the back-end wishes to use a different pipe, switch over */
    if (reply_fd >= 0) {
        close(read_fd);
        read_fd = reply_fd;
    }

    transfer_buf = g_malloc(chunk_size);

    while (true) {
        ssize_t read_ret;

        read_ret = RETRY_ON_EINTR(read(read_fd, transfer_buf, chunk_size));
        if (read_ret < 0) {
            ret = -errno;
            error_setg_errno(errp, -ret, "Failed to receive state");
            goto fail;
        }

        assert(read_ret <= chunk_size);
        qemu_put_be32(f, read_ret);

        if (read_ret == 0) {
            /* EOF */
            break;
        }

        qemu_put_buffer(f, transfer_buf, read_ret);
    }

    /*
     * The back-end will not really care, but be clean and close our end of
     * the pipe before inquiring the back-end about whether the transfer was
     * successful.
     */
    close(read_fd);
    read_fd = -1;

    /* Also, verify that the device is still stopped */
    assert(!dev->started);

    ret = vhost_check_device_state(dev, errp);
    if (ret < 0) {
        goto fail;
    }

    ret = 0;
fail:
    if (read_fd >= 0) {
        close(read_fd);
    }

    return ret;
}
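
/*
 * Illustrative sketch (not built): the framing used on the migration stream
 * by vhost_save_backend_state() above and vhost_load_backend_state() below:
 * each chunk is a big-endian 32-bit length followed by that many bytes of
 * payload, and a zero length terminates the state.  example_put_state_blob()
 * is hypothetical and shows the same framing for an in-memory buffer.
 */
#if 0
static void example_put_state_blob(QEMUFile *f, const uint8_t *buf, size_t len)
{
    const size_t max_chunk = 1 * 1024 * 1024;

    while (len > 0) {
        size_t chunk = MIN(len, max_chunk);

        qemu_put_be32(f, chunk);          /* chunk length */
        qemu_put_buffer(f, buf, chunk);   /* chunk payload */
        buf += chunk;
        len -= chunk;
    }

    qemu_put_be32(f, 0);                  /* end-of-state marker */
}
#endif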
int vhost_load_backend_state(struct vhost_dev *dev, QEMUFile *f, Error **errp)
{
    ERRP_GUARD();
    size_t transfer_buf_size = 0;
    g_autofree void *transfer_buf = NULL;
    g_autoptr(GError) g_err = NULL;
    int pipe_fds[2], read_fd = -1, write_fd = -1, reply_fd = -1;
    int ret;

    /* [0] for reading (back-end's end), [1] for writing (our end) */
    if (!g_unix_open_pipe(pipe_fds, FD_CLOEXEC, &g_err)) {
        error_setg(errp, "Failed to set up state transfer pipe: %s",
                   g_err->message);
        ret = -EINVAL;
        goto fail;
    }

    read_fd = pipe_fds[0];
    write_fd = pipe_fds[1];

    /*
     * VHOST_TRANSFER_STATE_PHASE_STOPPED means the device must be stopped.
     * Ideally, it is suspended, but SUSPEND/RESUME currently do not exist for
     * vhost-user, so just check that it is stopped at all.
     */
    assert(!dev->started);

    /* Transfer ownership of read_fd to the back-end */
    ret = vhost_set_device_state_fd(dev,
                                    VHOST_TRANSFER_STATE_DIRECTION_LOAD,
                                    VHOST_TRANSFER_STATE_PHASE_STOPPED,
                                    read_fd,
                                    &reply_fd,
                                    errp);
    if (ret < 0) {
        error_prepend(errp, "Failed to initiate state transfer: ");
        goto fail;
    }

    /* If the back-end wishes to use a different pipe, switch over */
    if (reply_fd >= 0) {
        close(write_fd);
        write_fd = reply_fd;
    }

    while (true) {
        size_t this_chunk_size = qemu_get_be32(f);
        ssize_t write_ret;
        const uint8_t *transfer_pointer;

        if (this_chunk_size == 0) {
            /* End of state */
            break;
        }

        if (transfer_buf_size < this_chunk_size) {
            transfer_buf = g_realloc(transfer_buf, this_chunk_size);
            transfer_buf_size = this_chunk_size;
        }

        if (qemu_get_buffer(f, transfer_buf, this_chunk_size) <
            this_chunk_size)
        {
            error_setg(errp, "Failed to read state");
            ret = -EINVAL;
            goto fail;
        }

        transfer_pointer = transfer_buf;
        while (this_chunk_size > 0) {
            write_ret = RETRY_ON_EINTR(
                write(write_fd, transfer_pointer, this_chunk_size)
            );
            if (write_ret < 0) {
                ret = -errno;
                error_setg_errno(errp, -ret, "Failed to send state");
                goto fail;
            } else if (write_ret == 0) {
                error_setg(errp, "Failed to send state: Connection is closed");
                ret = -ECONNRESET;
                goto fail;
            }

            assert(write_ret <= this_chunk_size);
            this_chunk_size -= write_ret;
            transfer_pointer += write_ret;
        }
    }

    /*
     * Close our end, thus ending the transfer, before inquiring the back-end
     * about whether the transfer was successful.
     */
    close(write_fd);
    write_fd = -1;

    /* Also, verify that the device is still stopped */
    assert(!dev->started);

    ret = vhost_check_device_state(dev, errp);
    if (ret < 0) {
        goto fail;
    }

    ret = 0;
fail:
    if (write_fd >= 0) {
        close(write_fd);
    }

    return ret;
}
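
/*
 * Illustrative sketch (not built): the load-side counterpart to the save
 * example further above.  example_load_state() is hypothetical; a real
 * front-end would call vhost_load_backend_state() from its incoming
 * migration handler, with the device still stopped.
 */
#if 0
static int example_load_state(struct vhost_dev *dev, QEMUFile *f, Error **errp)
{
    if (!vhost_supports_device_state(dev)) {
        error_setg(errp, "backend does not support device state transfer");
        return -ENOTSUP;
    }

    /* The back-end must not be running while its state is replaced */
    assert(!dev->started);

    return vhost_load_backend_state(dev, f, errp);
}
#endif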