/*
 * vhost support
 *
 * Copyright Red Hat, Inc. 2010
 *
 * Authors:
 *  Michael S. Tsirkin <mst@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 * Contributions after 2012-01-13 are licensed under the terms of the
 * GNU GPL, version 2 or (at your option) any later version.
 */

#include "qemu/osdep.h"
#include "qapi/error.h"
#include "hw/virtio/vhost.h"
#include "qemu/atomic.h"
#include "qemu/range.h"
#include "qemu/error-report.h"
#include "qemu/memfd.h"
#include "qemu/log.h"
#include "standard-headers/linux/vhost_types.h"
#include "hw/virtio/virtio-bus.h"
#include "hw/mem/memory-device.h"
#include "migration/blocker.h"
#include "migration/qemu-file-types.h"
#include "system/dma.h"
#include "trace.h"

/* enabled until disconnected backend stabilizes */
#define _VHOST_DEBUG 1

#ifdef _VHOST_DEBUG
#define VHOST_OPS_DEBUG(retval, fmt, ...) \
    do { \
        error_report(fmt ": %s (%d)", ## __VA_ARGS__, \
                     strerror(-retval), -retval); \
    } while (0)
#else
#define VHOST_OPS_DEBUG(retval, fmt, ...) \
    do { } while (0)
#endif

static struct vhost_log *vhost_log[VHOST_BACKEND_TYPE_MAX];
static struct vhost_log *vhost_log_shm[VHOST_BACKEND_TYPE_MAX];
static QLIST_HEAD(, vhost_dev) vhost_log_devs[VHOST_BACKEND_TYPE_MAX];

static QLIST_HEAD(, vhost_dev) vhost_devices =
    QLIST_HEAD_INITIALIZER(vhost_devices);

unsigned int vhost_get_max_memslots(void)
{
    unsigned int max = UINT_MAX;
    struct vhost_dev *hdev;

    QLIST_FOREACH(hdev, &vhost_devices, entry) {
        max = MIN(max, hdev->vhost_ops->vhost_backend_memslots_limit(hdev));
    }
    return max;
}

unsigned int vhost_get_free_memslots(void)
{
    unsigned int free = UINT_MAX;
    struct vhost_dev *hdev;

    QLIST_FOREACH(hdev, &vhost_devices, entry) {
        unsigned int r = hdev->vhost_ops->vhost_backend_memslots_limit(hdev);
        unsigned int cur_free = r - hdev->mem->nregions;

        if (unlikely(r < hdev->mem->nregions)) {
            warn_report_once("used (%u) vhost backend memory slots exceed"
                             " the device limit (%u).", hdev->mem->nregions, r);
            free = 0;
        } else {
            free = MIN(free, cur_free);
        }
    }
    return free;
}

static void vhost_dev_sync_region(struct vhost_dev *dev,
                                  MemoryRegionSection *section,
                                  uint64_t mfirst, uint64_t mlast,
                                  uint64_t rfirst, uint64_t rlast)
{
    vhost_log_chunk_t *dev_log = dev->log->log;

    uint64_t start = MAX(mfirst, rfirst);
    uint64_t end = MIN(mlast, rlast);
    vhost_log_chunk_t *from = dev_log + start / VHOST_LOG_CHUNK;
    vhost_log_chunk_t *to = dev_log + end / VHOST_LOG_CHUNK + 1;
    uint64_t addr = QEMU_ALIGN_DOWN(start, VHOST_LOG_CHUNK);

    if (end < start) {
        return;
    }
    assert(end / VHOST_LOG_CHUNK < dev->log_size);
    assert(start / VHOST_LOG_CHUNK < dev->log_size);

    for (; from < to; ++from) {
        vhost_log_chunk_t log;
        /* We first check with non-atomic: much cheaper,
         * and we expect non-dirty to be the common case. */
        if (!*from) {
            addr += VHOST_LOG_CHUNK;
            continue;
        }
        /* Data must be read atomically. We don't really need barrier semantics
         * but it's easier to use atomic_* than roll our own. */
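        /*
         * Each vhost_log_chunk_t is a bitmap word covering VHOST_LOG_CHUNK
         * bytes of guest memory, one bit per VHOST_LOG_PAGE, so the ctzl()
         * loop below only visits pages the backend actually marked dirty.
         */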
        log = qatomic_xchg(from, 0);
        while (log) {
            int bit = ctzl(log);
            hwaddr page_addr;
            hwaddr section_offset;
            hwaddr mr_offset;
            page_addr = addr + bit * VHOST_LOG_PAGE;
            section_offset = page_addr - section->offset_within_address_space;
            mr_offset = section_offset + section->offset_within_region;
            memory_region_set_dirty(section->mr, mr_offset, VHOST_LOG_PAGE);
            log &= ~(0x1ull << bit);
        }
        addr += VHOST_LOG_CHUNK;
    }
}

bool vhost_dev_has_iommu(struct vhost_dev *dev)
{
    VirtIODevice *vdev = dev->vdev;

    /*
     * For vhost, VIRTIO_F_IOMMU_PLATFORM means the backend supports the
     * incremental memory mapping API via the IOTLB API. For platforms that
     * do not have an IOMMU, there is no need to enable this feature, which
     * would only cause unnecessary IOTLB miss/update transactions.
     */
    if (vdev) {
        return virtio_bus_device_iommu_enabled(vdev) &&
            virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM);
    } else {
        return false;
    }
}

static inline bool vhost_dev_should_log(struct vhost_dev *dev)
{
    assert(dev->vhost_ops);
    assert(dev->vhost_ops->backend_type > VHOST_BACKEND_TYPE_NONE);
    assert(dev->vhost_ops->backend_type < VHOST_BACKEND_TYPE_MAX);

    return dev == QLIST_FIRST(&vhost_log_devs[dev->vhost_ops->backend_type]);
}

static inline void vhost_dev_elect_mem_logger(struct vhost_dev *hdev, bool add)
{
    VhostBackendType backend_type;

    assert(hdev->vhost_ops);

    backend_type = hdev->vhost_ops->backend_type;
    assert(backend_type > VHOST_BACKEND_TYPE_NONE);
    assert(backend_type < VHOST_BACKEND_TYPE_MAX);

    if (add && !QLIST_IS_INSERTED(hdev, logdev_entry)) {
        if (QLIST_EMPTY(&vhost_log_devs[backend_type])) {
            QLIST_INSERT_HEAD(&vhost_log_devs[backend_type],
                              hdev, logdev_entry);
        } else {
            /*
             * The first vhost_device in the list is selected as the shared
             * logger to scan memory sections. Put a new entry next to the
             * head to avoid inadvertently changing the underlying logger
             * device. This is done in order to get better cache locality and
             * to avoid performance churn on the hot path for log scanning.
             * Even when new devices come and go quickly, it wouldn't end up
             * changing the active leading logger device at all.
             */
            QLIST_INSERT_AFTER(QLIST_FIRST(&vhost_log_devs[backend_type]),
                               hdev, logdev_entry);
        }
    } else if (!add && QLIST_IS_INSERTED(hdev, logdev_entry)) {
        QLIST_REMOVE(hdev, logdev_entry);
    }
}

static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
                                   MemoryRegionSection *section,
                                   hwaddr first,
                                   hwaddr last)
{
    int i;
    hwaddr start_addr;
    hwaddr end_addr;

    if (!dev->log_enabled || !dev->started) {
        return 0;
    }
    start_addr = section->offset_within_address_space;
    end_addr = range_get_last(start_addr, int128_get64(section->size));
    start_addr = MAX(first, start_addr);
    end_addr = MIN(last, end_addr);

    if (vhost_dev_should_log(dev)) {
        for (i = 0; i < dev->mem->nregions; ++i) {
            struct vhost_memory_region *reg = dev->mem->regions + i;
            vhost_dev_sync_region(dev, section, start_addr, end_addr,
                                  reg->guest_phys_addr,
                                  range_get_last(reg->guest_phys_addr,
                                                 reg->memory_size));
        }
    }
    for (i = 0; i < dev->nvqs; ++i) {
        struct vhost_virtqueue *vq = dev->vqs + i;

        if (!vq->used_phys && !vq->used_size) {
            continue;
        }

        if (vhost_dev_has_iommu(dev)) {
            IOMMUTLBEntry iotlb;
            hwaddr used_phys = vq->used_phys, used_size = vq->used_size;
            hwaddr phys, s, offset;

            while (used_size) {
                rcu_read_lock();
                iotlb = address_space_get_iotlb_entry(dev->vdev->dma_as,
                                                      used_phys,
                                                      true,
                                                      MEMTXATTRS_UNSPECIFIED);
                rcu_read_unlock();

                if (!iotlb.target_as) {
                    qemu_log_mask(LOG_GUEST_ERROR, "translation "
                                  "failure for used_iova %"PRIx64"\n",
                                  used_phys);
                    return -EINVAL;
                }

                offset = used_phys & iotlb.addr_mask;
                phys = iotlb.translated_addr + offset;

                /*
                 * Distance from start of used ring until last byte of
                 * IOMMU page.
                 */
                s = iotlb.addr_mask - offset;
                /*
                 * Size of used ring, or of the part of it until end
                 * of IOMMU page. To avoid a zero result, do the adding
                 * outside of MIN(); e.g. with s == 0 and used_size == 1
                 * this still makes one byte of progress.
                 */
                s = MIN(s, used_size - 1) + 1;

                vhost_dev_sync_region(dev, section, start_addr, end_addr, phys,
                                      range_get_last(phys, s));
                used_size -= s;
                used_phys += s;
            }
        } else {
            vhost_dev_sync_region(dev, section, start_addr,
                                  end_addr, vq->used_phys,
                                  range_get_last(vq->used_phys, vq->used_size));
        }
    }
    return 0;
}

static void vhost_log_sync(MemoryListener *listener,
                           MemoryRegionSection *section)
{
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
                                         memory_listener);
    vhost_sync_dirty_bitmap(dev, section, 0x0, ~0x0ULL);
}

static void vhost_log_sync_range(struct vhost_dev *dev,
                                 hwaddr first, hwaddr last)
{
    int i;
    /* FIXME: this is N^2 in number of sections */
    for (i = 0; i < dev->n_mem_sections; ++i) {
        MemoryRegionSection *section = &dev->mem_sections[i];
        vhost_sync_dirty_bitmap(dev, section, first, last);
    }
}

static uint64_t vhost_get_log_size(struct vhost_dev *dev)
{
    uint64_t log_size = 0;
    int i;
    for (i = 0; i < dev->mem->nregions; ++i) {
        struct vhost_memory_region *reg = dev->mem->regions + i;
        uint64_t last = range_get_last(reg->guest_phys_addr,
                                       reg->memory_size);
        log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1);
    }
    return log_size;
}

static int vhost_set_backend_type(struct vhost_dev *dev,
                                  VhostBackendType backend_type)
{
    int r = 0;

    switch (backend_type) {
#ifdef CONFIG_VHOST_KERNEL
    case VHOST_BACKEND_TYPE_KERNEL:
        dev->vhost_ops = &kernel_ops;
        break;
#endif
#ifdef CONFIG_VHOST_USER
    case VHOST_BACKEND_TYPE_USER:
        dev->vhost_ops = &user_ops;
        break;
#endif
#ifdef CONFIG_VHOST_VDPA
    case VHOST_BACKEND_TYPE_VDPA:
        dev->vhost_ops = &vdpa_ops;
        break;
#endif
    default:
        error_report("Unknown vhost backend type");
        r = -1;
    }

    if (r == 0) {
        assert(dev->vhost_ops->backend_type == backend_type);
    }

    return r;
}

static struct vhost_log *vhost_log_alloc(uint64_t size, bool share)
{
    Error *err = NULL;
    struct vhost_log *log;
    uint64_t logsize = size * sizeof(*(log->log));
    int fd = -1;

    log = g_new0(struct vhost_log, 1);
    if (share) {
        log->log = qemu_memfd_alloc("vhost-log", logsize,
                                    F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL,
                                    &fd, &err);
        if (err) {
            error_report_err(err);
            g_free(log);
            return NULL;
        }
        memset(log->log, 0, logsize);
    } else {
        log->log = g_malloc0(logsize);
    }

    log->size = size;
    log->refcnt = 1;
    log->fd = fd;

    return log;
}

static struct vhost_log *vhost_log_get(VhostBackendType backend_type,
                                       uint64_t size, bool share)
{
    struct vhost_log *log;

    assert(backend_type > VHOST_BACKEND_TYPE_NONE);
    assert(backend_type < VHOST_BACKEND_TYPE_MAX);

    log = share ? vhost_log_shm[backend_type] : vhost_log[backend_type];

    if (!log || log->size != size) {
        log = vhost_log_alloc(size, share);
        if (share) {
            vhost_log_shm[backend_type] = log;
        } else {
            vhost_log[backend_type] = log;
        }
    } else {
        ++log->refcnt;
    }

    return log;
}

static void vhost_log_put(struct vhost_dev *dev, bool sync)
{
    struct vhost_log *log = dev->log;
    VhostBackendType backend_type;

    if (!log) {
        return;
    }

    assert(dev->vhost_ops);
    backend_type = dev->vhost_ops->backend_type;

    if (backend_type == VHOST_BACKEND_TYPE_NONE ||
        backend_type >= VHOST_BACKEND_TYPE_MAX) {
        return;
    }

    --log->refcnt;
    if (log->refcnt == 0) {
        /* Sync only the range covered by the old log */
        if (dev->log_size && sync) {
            vhost_log_sync_range(dev, 0, dev->log_size * VHOST_LOG_CHUNK - 1);
        }

        if (vhost_log[backend_type] == log) {
            g_free(log->log);
            vhost_log[backend_type] = NULL;
        } else if (vhost_log_shm[backend_type] == log) {
            qemu_memfd_free(log->log, log->size * sizeof(*(log->log)),
                            log->fd);
            vhost_log_shm[backend_type] = NULL;
        }

        g_free(log);
    }

    vhost_dev_elect_mem_logger(dev, false);
    dev->log = NULL;
    dev->log_size = 0;
}

static bool vhost_dev_log_is_shared(struct vhost_dev *dev)
{
    return dev->vhost_ops->vhost_requires_shm_log &&
           dev->vhost_ops->vhost_requires_shm_log(dev);
}

static inline void vhost_dev_log_resize(struct vhost_dev *dev, uint64_t size)
{
    struct vhost_log *log = vhost_log_get(dev->vhost_ops->backend_type,
                                          size, vhost_dev_log_is_shared(dev));
    uint64_t log_base = (uintptr_t)log->log;
    int r;

    /* inform backend of log switching, this must be done before
       releasing the current log, to ensure no logging is lost */
    r = dev->vhost_ops->vhost_set_log_base(dev, log_base, log);
    if (r < 0) {
        VHOST_OPS_DEBUG(r, "vhost_set_log_base failed");
    }

    vhost_log_put(dev, true);
    dev->log = log;
    dev->log_size = size;
}

static void *vhost_memory_map(struct vhost_dev *dev, hwaddr addr,
                              hwaddr *plen, bool is_write)
{
    if (!vhost_dev_has_iommu(dev)) {
        return cpu_physical_memory_map(addr, plen, is_write);
    } else {
        return (void *)(uintptr_t)addr;
    }
}

static void vhost_memory_unmap(struct vhost_dev *dev, void *buffer,
                               hwaddr len, int is_write,
                               hwaddr access_len)
{
    if (!vhost_dev_has_iommu(dev)) {
        cpu_physical_memory_unmap(buffer, len, is_write, access_len);
    }
}

static int vhost_verify_ring_part_mapping(void *ring_hva,
                                          uint64_t ring_gpa,
                                          uint64_t ring_size,
                                          void *reg_hva,
                                          uint64_t reg_gpa,
                                          uint64_t reg_size)
{
    uint64_t hva_ring_offset;
    uint64_t ring_last = range_get_last(ring_gpa, ring_size);
    uint64_t reg_last = range_get_last(reg_gpa, reg_size);

    if (ring_last < reg_gpa || ring_gpa > reg_last) {
        return 0;
    }
    /* check that the whole ring is mapped */
    if (ring_last > reg_last) {
        return -ENOMEM;
    }
    /* check that the ring's MemoryRegion wasn't replaced */
    hva_ring_offset = ring_gpa - reg_gpa;
    if (ring_hva != reg_hva + hva_ring_offset) {
        return -EBUSY;
    }

    return 0;
}

static int vhost_verify_ring_mappings(struct vhost_dev *dev,
                                      void *reg_hva,
                                      uint64_t reg_gpa,
                                      uint64_t reg_size)
{
    int i, j;
    int r = 0;
    const char *part_name[] = {
        "descriptor table",
        "available ring",
        "used ring"
    };

    if (vhost_dev_has_iommu(dev)) {
        return 0;
    }

    for (i = 0; i < dev->nvqs; ++i) {
        struct vhost_virtqueue *vq = dev->vqs + i;

        if (vq->desc_phys == 0) {
            continue;
        }

        j = 0;
        r = vhost_verify_ring_part_mapping(
                vq->desc, vq->desc_phys, vq->desc_size,
                reg_hva, reg_gpa, reg_size);
        if (r) {
            break;
        }

        j++;
        r = vhost_verify_ring_part_mapping(
                vq->avail, vq->avail_phys, vq->avail_size,
                reg_hva, reg_gpa, reg_size);
        if (r) {
            break;
        }

        j++;
        r = vhost_verify_ring_part_mapping(
                vq->used, vq->used_phys, vq->used_size,
                reg_hva, reg_gpa, reg_size);
        if (r) {
            break;
        }
    }

    if (r == -ENOMEM) {
        error_report("Unable to map %s for ring %d", part_name[j], i);
    } else if (r == -EBUSY) {
        error_report("%s relocated for ring %d", part_name[j], i);
    }
    return r;
}

/*
 * vhost_section: identify sections needed for vhost access
 *
 * We only care about RAM sections here (where virtqueue and guest
 * internals accessed by virtio might live).
 */
static bool vhost_section(struct vhost_dev *dev, MemoryRegionSection *section)
{
    MemoryRegion *mr = section->mr;

    if (memory_region_is_ram(mr) && !memory_region_is_rom(mr)) {
        uint8_t dirty_mask = memory_region_get_dirty_log_mask(mr);
        uint8_t handled_dirty;

        /*
         * Kernel based vhost doesn't handle any block which is doing
         * dirty-tracking other than migration for which it has
         * specific logging support. However, for TCG the kernel never
         * gets involved anyway, so we can also ignore its
         * self-modifying code detection flags. However, a vhost-user
         * client could still confuse a TCG guest if it re-writes
         * executable memory that has already been translated.
         */
        handled_dirty = (1 << DIRTY_MEMORY_MIGRATION) |
            (1 << DIRTY_MEMORY_CODE);

        if (dirty_mask & ~handled_dirty) {
            trace_vhost_reject_section(mr->name, 1);
            return false;
        }

        /*
         * Some backends (like vhost-user) can only handle memory regions
         * that have an fd (can be mapped into a different process). Filter
         * the ones without an fd out, if requested.
         *
         * TODO: we might have to limit to MAP_SHARED as well.
         */
        if (memory_region_get_fd(section->mr) < 0 &&
            dev->vhost_ops->vhost_backend_no_private_memslots &&
            dev->vhost_ops->vhost_backend_no_private_memslots(dev)) {
            trace_vhost_reject_section(mr->name, 2);
            return false;
        }

        trace_vhost_section(mr->name);
        return true;
    } else {
        trace_vhost_reject_section(mr->name, 3);
        return false;
    }
}

static void vhost_begin(MemoryListener *listener)
{
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
                                         memory_listener);
    dev->tmp_sections = NULL;
    dev->n_tmp_sections = 0;
}

static void vhost_commit(MemoryListener *listener)
{
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
                                         memory_listener);
    MemoryRegionSection *old_sections;
    int n_old_sections;
    uint64_t log_size;
    size_t regions_size;
    int r;
    int i;
    bool changed = false;

    /* Note we can be called before the device is started, but then
     * starting the device calls set_mem_table, so we need to have
     * built the data structures.
     */
    old_sections = dev->mem_sections;
    n_old_sections = dev->n_mem_sections;
    dev->mem_sections = dev->tmp_sections;
    dev->n_mem_sections = dev->n_tmp_sections;

    if (dev->n_mem_sections != n_old_sections) {
        changed = true;
    } else {
        /* Same size, let's check the contents */
        for (i = 0; i < n_old_sections; i++) {
            if (!MemoryRegionSection_eq(&old_sections[i],
                                        &dev->mem_sections[i])) {
                changed = true;
                break;
            }
        }
    }

    trace_vhost_commit(dev->started, changed);
    if (!changed) {
        goto out;
    }

    /* Rebuild the regions list from the new sections list */
    regions_size = offsetof(struct vhost_memory, regions) +
                       dev->n_mem_sections * sizeof dev->mem->regions[0];
    dev->mem = g_realloc(dev->mem, regions_size);
    dev->mem->nregions = dev->n_mem_sections;

    for (i = 0; i < dev->n_mem_sections; i++) {
        struct vhost_memory_region *cur_vmr = dev->mem->regions + i;
        struct MemoryRegionSection *mrs = dev->mem_sections + i;

        cur_vmr->guest_phys_addr = mrs->offset_within_address_space;
        cur_vmr->memory_size     = int128_get64(mrs->size);
        cur_vmr->userspace_addr  =
            (uintptr_t)memory_region_get_ram_ptr(mrs->mr) +
            mrs->offset_within_region;
        cur_vmr->flags_padding   = 0;
    }

    if (!dev->started) {
        goto out;
    }

    for (i = 0; i < dev->mem->nregions; i++) {
        if (vhost_verify_ring_mappings(dev,
                       (void *)(uintptr_t)dev->mem->regions[i].userspace_addr,
                       dev->mem->regions[i].guest_phys_addr,
                       dev->mem->regions[i].memory_size)) {
            error_report("Verify ring failure on region %d", i);
            abort();
        }
    }

    if (!dev->log_enabled) {
        r = dev->vhost_ops->vhost_set_mem_table(dev, dev->mem);
        if (r < 0) {
            VHOST_OPS_DEBUG(r, "vhost_set_mem_table failed");
        }
        goto out;
    }
    log_size = vhost_get_log_size(dev);
    /* We allocate an extra 4K bytes to log,
     * to reduce the number of reallocations. */
#define VHOST_LOG_BUFFER (0x1000 / sizeof *dev->log)
    /* To log more, must increase log size before table update. */
    if (dev->log_size < log_size) {
        vhost_dev_log_resize(dev, log_size + VHOST_LOG_BUFFER);
    }
    r = dev->vhost_ops->vhost_set_mem_table(dev, dev->mem);
    if (r < 0) {
        VHOST_OPS_DEBUG(r, "vhost_set_mem_table failed");
    }
    /* To log less, can only decrease log size after table update. */
    if (dev->log_size > log_size + VHOST_LOG_BUFFER) {
        vhost_dev_log_resize(dev, log_size);
    }

out:
    /* Deref the old list of sections, this must happen _after_ the
     * vhost_set_mem_table to ensure the client isn't still using the
     * section we're about to unref.
     */
    while (n_old_sections--) {
        memory_region_unref(old_sections[n_old_sections].mr);
    }
    g_free(old_sections);
}

/* Adds the section data to the tmp_section structure.
 * It relies on the listener calling us in memory address order
 * and for each region (via the _add and _nop methods) to
 * join neighbours.
 */
static void vhost_region_add_section(struct vhost_dev *dev,
                                     MemoryRegionSection *section)
{
    bool need_add = true;
    uint64_t mrs_size = int128_get64(section->size);
    uint64_t mrs_gpa = section->offset_within_address_space;
    uintptr_t mrs_host = (uintptr_t)memory_region_get_ram_ptr(section->mr) +
                         section->offset_within_region;
    RAMBlock *mrs_rb = section->mr->ram_block;

    trace_vhost_region_add_section(section->mr->name, mrs_gpa, mrs_size,
                                   mrs_host);

    if (dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER) {
        /* Round the section to its page size */
        /* First align the start down to a page boundary */
        size_t mrs_page = qemu_ram_pagesize(mrs_rb);
        uint64_t alignage = mrs_host & (mrs_page - 1);
        if (alignage) {
            mrs_host -= alignage;
            mrs_size += alignage;
            mrs_gpa  -= alignage;
        }
        /* Now align the size up to a page boundary */
        alignage = mrs_size & (mrs_page - 1);
        if (alignage) {
            mrs_size += mrs_page - alignage;
        }
        trace_vhost_region_add_section_aligned(section->mr->name, mrs_gpa,
                                               mrs_size, mrs_host);
    }

    if (dev->n_tmp_sections && !section->unmergeable) {
        /* Since we already have at least one section, let's see if
         * this extends it; since we're scanning in order, we only
         * have to look at the last one, and the FlatView that calls
         * us shouldn't have overlaps.
         */
        MemoryRegionSection *prev_sec = dev->tmp_sections +
                                               (dev->n_tmp_sections - 1);
        uint64_t prev_gpa_start = prev_sec->offset_within_address_space;
        uint64_t prev_size = int128_get64(prev_sec->size);
        uint64_t prev_gpa_end   = range_get_last(prev_gpa_start, prev_size);
        uint64_t prev_host_start =
            (uintptr_t)memory_region_get_ram_ptr(prev_sec->mr) +
            prev_sec->offset_within_region;
        uint64_t prev_host_end   = range_get_last(prev_host_start, prev_size);

        if (mrs_gpa <= (prev_gpa_end + 1)) {
            /* OK, looks like overlapping/intersecting - it's possible that
             * the rounding to page sizes has made them overlap, but they
             * should match up in the same RAMBlock if they do.
             */
            if (mrs_gpa < prev_gpa_start) {
                error_report("%s:Section '%s' rounded to %"PRIx64
                             " prior to previous '%s' %"PRIx64,
                             __func__, section->mr->name, mrs_gpa,
                             prev_sec->mr->name, prev_gpa_start);
                /* A way to cleanly fail here would be better */
                return;
            }
            /* Offset from the start of the previous GPA to this GPA */
            size_t offset = mrs_gpa - prev_gpa_start;

            if (prev_host_start + offset == mrs_host &&
                section->mr == prev_sec->mr && !prev_sec->unmergeable) {
                uint64_t max_end = MAX(prev_host_end, mrs_host + mrs_size);
                need_add = false;
                prev_sec->offset_within_address_space =
                    MIN(prev_gpa_start, mrs_gpa);
                prev_sec->offset_within_region =
                    MIN(prev_host_start, mrs_host) -
                    (uintptr_t)memory_region_get_ram_ptr(prev_sec->mr);
                prev_sec->size = int128_make64(max_end - MIN(prev_host_start,
                                               mrs_host));
                trace_vhost_region_add_section_merge(section->mr->name,
                                        int128_get64(prev_sec->size),
                                        prev_sec->offset_within_address_space,
                                        prev_sec->offset_within_region);
            } else {
                /* adjoining regions are fine, but overlapping ones with
                 * different blocks/offsets shouldn't happen
                 */
                if (mrs_gpa != prev_gpa_end + 1) {
                    error_report("%s: Overlapping but not coherent sections "
                                 "at %"PRIx64,
                                 __func__, mrs_gpa);
                    return;
                }
            }
        }
    }

    if (need_add) {
        ++dev->n_tmp_sections;
        dev->tmp_sections = g_renew(MemoryRegionSection, dev->tmp_sections,
                                    dev->n_tmp_sections);
        dev->tmp_sections[dev->n_tmp_sections - 1] = *section;
        /* The flatview isn't stable and we don't use it, making it NULL
         * means we can memcmp the list.
         */
        dev->tmp_sections[dev->n_tmp_sections - 1].fv = NULL;
        memory_region_ref(section->mr);
    }
}

/* Used for both add and nop callbacks */
static void vhost_region_addnop(MemoryListener *listener,
                                MemoryRegionSection *section)
{
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
                                         memory_listener);

    if (!vhost_section(dev, section)) {
        return;
    }
    vhost_region_add_section(dev, section);
}

static void vhost_iommu_unmap_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
{
    struct vhost_iommu *iommu = container_of(n, struct vhost_iommu, n);
    struct vhost_dev *hdev = iommu->hdev;
    hwaddr iova = iotlb->iova + iommu->iommu_offset;

    if (vhost_backend_invalidate_device_iotlb(hdev, iova,
                                              iotlb->addr_mask + 1)) {
        error_report("Fail to invalidate device iotlb");
    }
}

static void vhost_iommu_region_add(MemoryListener *listener,
                                   MemoryRegionSection *section)
{
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
                                         iommu_listener);
    struct vhost_iommu *iommu;
    Int128 end;
    int iommu_idx;
    IOMMUMemoryRegion *iommu_mr;

    if (!memory_region_is_iommu(section->mr)) {
        return;
    }

    iommu_mr = IOMMU_MEMORY_REGION(section->mr);

    iommu = g_malloc0(sizeof(*iommu));
    end = int128_add(int128_make64(section->offset_within_region),
                     section->size);
    end = int128_sub(end, int128_one());
    iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr,
                                                   MEMTXATTRS_UNSPECIFIED);
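    /*
     * When the guest has a device IOTLB (ATS), listen only for targeted
     * device-IOTLB invalidations; otherwise fall back to plain IOMMU unmap
     * notifications. Either way, vhost_iommu_unmap_notify() above flushes
     * the stale translation from the backend.
     */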
    iommu_notifier_init(&iommu->n, vhost_iommu_unmap_notify,
                        dev->vdev->device_iotlb_enabled ?
                            IOMMU_NOTIFIER_DEVIOTLB_UNMAP :
                            IOMMU_NOTIFIER_UNMAP,
                        section->offset_within_region,
                        int128_get64(end),
                        iommu_idx);
    iommu->mr = section->mr;
    iommu->iommu_offset = section->offset_within_address_space -
                          section->offset_within_region;
    iommu->hdev = dev;
    memory_region_register_iommu_notifier(section->mr, &iommu->n,
                                          &error_fatal);
    QLIST_INSERT_HEAD(&dev->iommu_list, iommu, iommu_next);
    /* TODO: can replay help performance here? */
}

static void vhost_iommu_region_del(MemoryListener *listener,
                                   MemoryRegionSection *section)
{
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
                                         iommu_listener);
    struct vhost_iommu *iommu;

    if (!memory_region_is_iommu(section->mr)) {
        return;
    }

    QLIST_FOREACH(iommu, &dev->iommu_list, iommu_next) {
        if (iommu->mr == section->mr &&
            iommu->n.start == section->offset_within_region) {
            memory_region_unregister_iommu_notifier(iommu->mr,
                                                    &iommu->n);
            QLIST_REMOVE(iommu, iommu_next);
            g_free(iommu);
            break;
        }
    }
}

void vhost_toggle_device_iotlb(VirtIODevice *vdev)
{
    VirtioDeviceClass *vdc = VIRTIO_DEVICE_GET_CLASS(vdev);
    struct vhost_dev *dev;
    struct vhost_iommu *iommu;

    if (vdev->vhost_started) {
        dev = vdc->get_vhost(vdev);
    } else {
        return;
    }

    QLIST_FOREACH(iommu, &dev->iommu_list, iommu_next) {
        memory_region_unregister_iommu_notifier(iommu->mr, &iommu->n);
        iommu->n.notifier_flags = vdev->device_iotlb_enabled ?
                IOMMU_NOTIFIER_DEVIOTLB_UNMAP : IOMMU_NOTIFIER_UNMAP;
        memory_region_register_iommu_notifier(iommu->mr, &iommu->n,
                                              &error_fatal);
    }
}

static int vhost_virtqueue_set_addr(struct vhost_dev *dev,
                                    struct vhost_virtqueue *vq,
                                    unsigned idx, bool enable_log)
{
    struct vhost_vring_addr addr;
    int r;
    memset(&addr, 0, sizeof(struct vhost_vring_addr));

    if (dev->vhost_ops->vhost_vq_get_addr) {
        r = dev->vhost_ops->vhost_vq_get_addr(dev, &addr, vq);
        if (r < 0) {
            VHOST_OPS_DEBUG(r, "vhost_vq_get_addr failed");
            return r;
        }
    } else {
        addr.desc_user_addr = (uint64_t)(unsigned long)vq->desc;
        addr.avail_user_addr = (uint64_t)(unsigned long)vq->avail;
        addr.used_user_addr = (uint64_t)(unsigned long)vq->used;
    }
    addr.index = idx;
    addr.log_guest_addr = vq->used_phys;
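    /*
     * Writes to the used ring are logged at this guest address when the
     * VHOST_VRING_F_LOG flag below is set.
     */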
    addr.flags = enable_log ? (1 << VHOST_VRING_F_LOG) : 0;
    r = dev->vhost_ops->vhost_set_vring_addr(dev, &addr);
    if (r < 0) {
        VHOST_OPS_DEBUG(r, "vhost_set_vring_addr failed");
    }
    return r;
}

static int vhost_dev_set_features(struct vhost_dev *dev,
                                  bool enable_log)
{
    uint64_t features = dev->acked_features;
    int r;
    if (enable_log) {
        features |= 0x1ULL << VHOST_F_LOG_ALL;
    }
    if (!vhost_dev_has_iommu(dev)) {
        features &= ~(0x1ULL << VIRTIO_F_IOMMU_PLATFORM);
    }
    if (dev->vhost_ops->vhost_force_iommu) {
        if (dev->vhost_ops->vhost_force_iommu(dev) == true) {
            features |= 0x1ULL << VIRTIO_F_IOMMU_PLATFORM;
        }
    }
    r = dev->vhost_ops->vhost_set_features(dev, features);
    if (r < 0) {
        VHOST_OPS_DEBUG(r, "vhost_set_features failed");
        goto out;
    }
    if (dev->vhost_ops->vhost_set_backend_cap) {
        r = dev->vhost_ops->vhost_set_backend_cap(dev);
        if (r < 0) {
            VHOST_OPS_DEBUG(r, "vhost_set_backend_cap failed");
            goto out;
        }
    }

out:
    return r;
}

static int vhost_dev_set_log(struct vhost_dev *dev, bool enable_log)
{
    int r, i, idx;
    hwaddr addr;

    r = vhost_dev_set_features(dev, enable_log);
    if (r < 0) {
        goto err_features;
    }
    for (i = 0; i < dev->nvqs; ++i) {
        idx = dev->vhost_ops->vhost_get_vq_index(dev, dev->vq_index + i);
        addr = virtio_queue_get_desc_addr(dev->vdev, idx);
        if (!addr) {
            /*
             * The queue might not be ready for start. If this
             * is the case there is no reason to continue the process.
             * Similar logic is used by the vhost_virtqueue_start()
             * routine.
             */
            continue;
        }
        r = vhost_virtqueue_set_addr(dev, dev->vqs + i, idx,
                                     enable_log);
        if (r < 0) {
            goto err_vq;
        }
    }

    /*
     * At log start we select one vhost device as the logger that will scan
     * the memory sections; the others skip the scan. This is possible
     * because the log is shared amongst all vhost devices for a given type
     * of backend.
     */
    vhost_dev_elect_mem_logger(dev, enable_log);

    return 0;
err_vq:
    for (; i >= 0; --i) {
        idx = dev->vhost_ops->vhost_get_vq_index(dev, dev->vq_index + i);
        addr = virtio_queue_get_desc_addr(dev->vdev, idx);
        if (!addr) {
            continue;
        }
        vhost_virtqueue_set_addr(dev, dev->vqs + i, idx,
                                 dev->log_enabled);
    }
    vhost_dev_set_features(dev, dev->log_enabled);
err_features:
    return r;
}

static int vhost_migration_log(MemoryListener *listener, bool enable)
{
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
                                         memory_listener);
    int r;
    if (enable == dev->log_enabled) {
        return 0;
    }
    if (!dev->started) {
        dev->log_enabled = enable;
        return 0;
    }

    r = 0;
    if (!enable) {
        r = vhost_dev_set_log(dev, false);
        if (r < 0) {
            goto check_dev_state;
        }
        vhost_log_put(dev, false);
    } else {
        vhost_dev_log_resize(dev, vhost_get_log_size(dev));
        r = vhost_dev_set_log(dev, true);
        if (r < 0) {
            goto check_dev_state;
        }
    }

check_dev_state:
    dev->log_enabled = enable;
    /*
     * vhost-user-* devices could change their state during log
     * initialization due to disconnect. So check dev state after
     * vhost communication.
     */
    if (!dev->started) {
        /*
         * Since the device is in the stopped state, it is okay for
         * migration. Return success.
         */
        r = 0;
    }
    if (r) {
        /* An error occurred. */
        dev->log_enabled = false;
    }

    return r;
}

static bool vhost_log_global_start(MemoryListener *listener, Error **errp)
{
    int r;

    r = vhost_migration_log(listener, true);
    if (r < 0) {
        abort();
    }
    return true;
}

static void vhost_log_global_stop(MemoryListener *listener)
{
    int r;

    r = vhost_migration_log(listener, false);
    if (r < 0) {
        abort();
    }
}

static void vhost_log_start(MemoryListener *listener,
                            MemoryRegionSection *section,
                            int old, int new)
{
    /* FIXME: implement */
}

static void vhost_log_stop(MemoryListener *listener,
                           MemoryRegionSection *section,
                           int old, int new)
{
    /* FIXME: implement */
}

/* The vhost driver natively knows how to handle the vrings of non
 * cross-endian legacy devices and modern devices. Only legacy devices
 * exposed to a bi-endian guest may require the vhost driver to use a
 * specific endianness.
 */
static inline bool vhost_needs_vring_endian(VirtIODevice *vdev)
{
    if (virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) {
        return false;
    }
#if HOST_BIG_ENDIAN
    return vdev->device_endian == VIRTIO_DEVICE_ENDIAN_LITTLE;
#else
    return vdev->device_endian == VIRTIO_DEVICE_ENDIAN_BIG;
#endif
}

static int vhost_virtqueue_set_vring_endian_legacy(struct vhost_dev *dev,
                                                   bool is_big_endian,
                                                   int vhost_vq_index)
{
    int r;
    struct vhost_vring_state s = {
        .index = vhost_vq_index,
        .num = is_big_endian
    };

    r = dev->vhost_ops->vhost_set_vring_endian(dev, &s);
    if (r < 0) {
        VHOST_OPS_DEBUG(r, "vhost_set_vring_endian failed");
    }
    return r;
}

static int vhost_memory_region_lookup(struct vhost_dev *hdev,
                                      uint64_t gpa, uint64_t *uaddr,
                                      uint64_t *len)
{
    int i;

    for (i = 0; i < hdev->mem->nregions; i++) {
        struct vhost_memory_region *reg = hdev->mem->regions + i;

        if (gpa >= reg->guest_phys_addr &&
            reg->guest_phys_addr + reg->memory_size > gpa) {
            *uaddr = reg->userspace_addr + gpa - reg->guest_phys_addr;
            *len = reg->guest_phys_addr + reg->memory_size - gpa;
            return 0;
        }
    }

    return -EFAULT;
}

int vhost_device_iotlb_miss(struct vhost_dev *dev, uint64_t iova, int write)
{
    IOMMUTLBEntry iotlb;
    uint64_t uaddr, len;
    int ret = -EFAULT;

    RCU_READ_LOCK_GUARD();

    trace_vhost_iotlb_miss(dev, 1);

    iotlb = address_space_get_iotlb_entry(dev->vdev->dma_as,
                                          iova, write,
                                          MEMTXATTRS_UNSPECIFIED);
    if (iotlb.target_as != NULL) {
        ret = vhost_memory_region_lookup(dev, iotlb.translated_addr,
                                         &uaddr, &len);
        if (ret) {
            trace_vhost_iotlb_miss(dev, 3);
            error_report("Fail to lookup the translated address "
                         "%"PRIx64, iotlb.translated_addr);
            goto out;
        }

        len = MIN(iotlb.addr_mask + 1, len);
        iova = iova & ~iotlb.addr_mask;

        ret = vhost_backend_update_device_iotlb(dev, iova, uaddr,
                                                len, iotlb.perm);
        if (ret) {
            trace_vhost_iotlb_miss(dev, 4);
            error_report("Fail to update device iotlb");
            goto out;
        }
    }

    trace_vhost_iotlb_miss(dev, 2);

out:
    return ret;
}

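/*
 * Map one virtqueue's rings, program the backend with their addresses and
 * sizes, and wire up the kick notifier. Called from vhost_dev_start() for
 * each queue; a queue whose descriptor address is still 0 has not been set
 * up by the guest yet and is simply skipped.
 */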
int vhost_virtqueue_start(struct vhost_dev *dev,
                          struct VirtIODevice *vdev,
                          struct vhost_virtqueue *vq,
                          unsigned idx)
{
    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
    VirtioBusState *vbus = VIRTIO_BUS(qbus);
    VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(vbus);
    hwaddr s, l, a;
    int r;
    int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, idx);
    struct vhost_vring_file file = {
        .index = vhost_vq_index
    };
    struct vhost_vring_state state = {
        .index = vhost_vq_index
    };
    struct VirtQueue *vvq = virtio_get_queue(vdev, idx);

    a = virtio_queue_get_desc_addr(vdev, idx);
    if (a == 0) {
        /* Queue might not be ready for start */
        return 0;
    }

    vq->num = state.num = virtio_queue_get_num(vdev, idx);
    r = dev->vhost_ops->vhost_set_vring_num(dev, &state);
    if (r) {
        VHOST_OPS_DEBUG(r, "vhost_set_vring_num failed");
        return r;
    }

    state.num = virtio_queue_get_last_avail_idx(vdev, idx);
    r = dev->vhost_ops->vhost_set_vring_base(dev, &state);
    if (r) {
        VHOST_OPS_DEBUG(r, "vhost_set_vring_base failed");
        return r;
    }

    if (vhost_needs_vring_endian(vdev)) {
        r = vhost_virtqueue_set_vring_endian_legacy(dev,
                                                    virtio_is_big_endian(vdev),
                                                    vhost_vq_index);
        if (r) {
            return r;
        }
    }

    vq->desc_size = s = l = virtio_queue_get_desc_size(vdev, idx);
    vq->desc_phys = a;
    vq->desc = vhost_memory_map(dev, a, &l, false);
    if (!vq->desc || l != s) {
        r = -ENOMEM;
        goto fail_alloc_desc;
    }
    vq->avail_size = s = l = virtio_queue_get_avail_size(vdev, idx);
    vq->avail_phys = a = virtio_queue_get_avail_addr(vdev, idx);
    vq->avail = vhost_memory_map(dev, a, &l, false);
    if (!vq->avail || l != s) {
        r = -ENOMEM;
        goto fail_alloc_avail;
    }
    vq->used_size = s = l = virtio_queue_get_used_size(vdev, idx);
    vq->used_phys = a = virtio_queue_get_used_addr(vdev, idx);
    vq->used = vhost_memory_map(dev, a, &l, true);
    if (!vq->used || l != s) {
        r = -ENOMEM;
        goto fail_alloc_used;
    }

    r = vhost_virtqueue_set_addr(dev, vq, vhost_vq_index, dev->log_enabled);
    if (r < 0) {
        goto fail_alloc;
    }

    file.fd = event_notifier_get_fd(virtio_queue_get_host_notifier(vvq));
    r = dev->vhost_ops->vhost_set_vring_kick(dev, &file);
    if (r) {
        VHOST_OPS_DEBUG(r, "vhost_set_vring_kick failed");
        goto fail_kick;
    }

    /* Clear and discard previous events if any. */
    event_notifier_test_and_clear(&vq->masked_notifier);

    /* Init vring in unmasked state, unless guest_notifier_mask
     * will do it later.
     */
    if (!vdev->use_guest_notifier_mask) {
        /* TODO: check and handle errors. */
        vhost_virtqueue_mask(dev, vdev, idx, false);
    }

    if (k->query_guest_notifiers &&
        k->query_guest_notifiers(qbus->parent) &&
        virtio_queue_vector(vdev, idx) == VIRTIO_NO_VECTOR) {
        file.fd = -1;
        r = dev->vhost_ops->vhost_set_vring_call(dev, &file);
        if (r) {
            goto fail_vector;
        }
    }

    return 0;

fail_vector:
fail_kick:
fail_alloc:
    vhost_memory_unmap(dev, vq->used, virtio_queue_get_used_size(vdev, idx),
                       0, 0);
fail_alloc_used:
    vhost_memory_unmap(dev, vq->avail, virtio_queue_get_avail_size(vdev, idx),
                       0, 0);
fail_alloc_avail:
    vhost_memory_unmap(dev, vq->desc, virtio_queue_get_desc_size(vdev, idx),
                       0, 0);
fail_alloc_desc:
    return r;
}

static int do_vhost_virtqueue_stop(struct vhost_dev *dev,
                                   struct VirtIODevice *vdev,
                                   struct vhost_virtqueue *vq,
                                   unsigned idx, bool force)
{
    int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, idx);
    struct vhost_vring_state state = {
        .index = vhost_vq_index,
    };
    int r = 0;

    if (virtio_queue_get_desc_addr(vdev, idx) == 0) {
        /* Don't stop the virtqueue which might have not been started */
        return 0;
    }

    if (!force) {
        r = dev->vhost_ops->vhost_get_vring_base(dev, &state);
        if (r < 0) {
            VHOST_OPS_DEBUG(r, "vhost VQ %u ring restore failed: %d", idx, r);
        }
    }

    if (r < 0 || force) {
        /* Connection to the backend is broken, so let's sync internal
         * last avail idx to the device used idx.
         */
        virtio_queue_restore_last_avail_idx(vdev, idx);
    } else {
        virtio_queue_set_last_avail_idx(vdev, idx, state.num);
    }
    virtio_queue_invalidate_signalled_used(vdev, idx);
    virtio_queue_update_used_idx(vdev, idx);

    /* In the cross-endian case, we need to reset the vring endianness to
     * native, as legacy devices expect it by default.
     */
    if (vhost_needs_vring_endian(vdev)) {
        vhost_virtqueue_set_vring_endian_legacy(dev,
                                                !virtio_is_big_endian(vdev),
                                                vhost_vq_index);
    }

    vhost_memory_unmap(dev, vq->used, virtio_queue_get_used_size(vdev, idx),
                       1, virtio_queue_get_used_size(vdev, idx));
    vhost_memory_unmap(dev, vq->avail, virtio_queue_get_avail_size(vdev, idx),
                       0, virtio_queue_get_avail_size(vdev, idx));
    vhost_memory_unmap(dev, vq->desc, virtio_queue_get_desc_size(vdev, idx),
                       0, virtio_queue_get_desc_size(vdev, idx));
    return r;
}

int vhost_virtqueue_stop(struct vhost_dev *dev,
                         struct VirtIODevice *vdev,
                         struct vhost_virtqueue *vq,
                         unsigned idx)
{
    return do_vhost_virtqueue_stop(dev, vdev, vq, idx, false);
}

static int vhost_virtqueue_set_busyloop_timeout(struct vhost_dev *dev,
                                                int n, uint32_t timeout)
{
    int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, n);
    struct vhost_vring_state state = {
        .index = vhost_vq_index,
        .num = timeout,
    };
    int r;

    if (!dev->vhost_ops->vhost_set_vring_busyloop_timeout) {
        return -EINVAL;
    }

    r = dev->vhost_ops->vhost_set_vring_busyloop_timeout(dev, &state);
    if (r) {
        VHOST_OPS_DEBUG(r, "vhost_set_vring_busyloop_timeout failed");
        return r;
    }

    return 0;
}

static void vhost_virtqueue_error_notifier(EventNotifier *n)
{
    struct vhost_virtqueue *vq = container_of(n, struct vhost_virtqueue,
                                              error_notifier);
    struct vhost_dev *dev = vq->dev;
    int index = vq - dev->vqs;

    if (event_notifier_test_and_clear(n) && dev->vdev) {
        VHOST_OPS_DEBUG(-EINVAL, "vhost vring error in virtqueue %d",
                        dev->vq_index + index);
    }
}

static int vhost_virtqueue_init(struct vhost_dev *dev,
                                struct vhost_virtqueue *vq, int n)
{
    int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, n);
    struct vhost_vring_file file = {
        .index = vhost_vq_index,
    };
    int r = event_notifier_init(&vq->masked_notifier, 0);
    if (r < 0) {
        return r;
    }

    file.fd = event_notifier_get_wfd(&vq->masked_notifier);
    r = dev->vhost_ops->vhost_set_vring_call(dev, &file);
    if (r) {
        VHOST_OPS_DEBUG(r, "vhost_set_vring_call failed");
        goto fail_call;
    }

    vq->dev = dev;

    if (dev->vhost_ops->vhost_set_vring_err) {
        r = event_notifier_init(&vq->error_notifier, 0);
        if (r < 0) {
            goto fail_call;
        }

        file.fd = event_notifier_get_fd(&vq->error_notifier);
        r = dev->vhost_ops->vhost_set_vring_err(dev, &file);
        if (r) {
            VHOST_OPS_DEBUG(r, "vhost_set_vring_err failed");
            goto fail_err;
        }

        event_notifier_set_handler(&vq->error_notifier,
                                   vhost_virtqueue_error_notifier);
    }

    return 0;

fail_err:
    event_notifier_cleanup(&vq->error_notifier);
fail_call:
    event_notifier_cleanup(&vq->masked_notifier);
    return r;
}

static void vhost_virtqueue_cleanup(struct vhost_virtqueue *vq)
{
    event_notifier_cleanup(&vq->masked_notifier);
    if (vq->dev->vhost_ops->vhost_set_vring_err) {
        event_notifier_set_handler(&vq->error_notifier, NULL);
        event_notifier_cleanup(&vq->error_notifier);
    }
}

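/*
 * One-time backend setup: pick the backend ops, negotiate features,
 * initialise the per-virtqueue notifiers and register the memory listener.
 * The device itself is not started here; that happens later in
 * vhost_dev_start().
 */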
int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
                   VhostBackendType backend_type, uint32_t busyloop_timeout,
                   Error **errp)
{
    unsigned int used, reserved, limit;
    uint64_t features;
    int i, r, n_initialized_vqs = 0;

    hdev->vdev = NULL;
    hdev->migration_blocker = NULL;

    r = vhost_set_backend_type(hdev, backend_type);
    assert(r >= 0);

    r = hdev->vhost_ops->vhost_backend_init(hdev, opaque, errp);
    if (r < 0) {
        goto fail;
    }

    r = hdev->vhost_ops->vhost_set_owner(hdev);
    if (r < 0) {
        error_setg_errno(errp, -r, "vhost_set_owner failed");
        goto fail;
    }

    r = hdev->vhost_ops->vhost_get_features(hdev, &features);
    if (r < 0) {
        error_setg_errno(errp, -r, "vhost_get_features failed");
        goto fail;
    }

    limit = hdev->vhost_ops->vhost_backend_memslots_limit(hdev);
    if (limit < MEMORY_DEVICES_SAFE_MAX_MEMSLOTS &&
        memory_devices_memslot_auto_decision_active()) {
        error_setg(errp, "some memory device (like virtio-mem)"
                   " decided how many memory slots to use based on the overall"
                   " number of memory slots; this vhost backend would further"
                   " restrict the overall number of memory slots");
        error_append_hint(errp, "Try plugging this vhost backend before"
                          " plugging such memory devices.\n");
        r = -EINVAL;
        goto fail;
    }

    for (i = 0; i < hdev->nvqs; ++i, ++n_initialized_vqs) {
        r = vhost_virtqueue_init(hdev, hdev->vqs + i, hdev->vq_index + i);
        if (r < 0) {
            error_setg_errno(errp, -r, "Failed to initialize virtqueue %d", i);
            goto fail;
        }
    }

    if (busyloop_timeout) {
        for (i = 0; i < hdev->nvqs; ++i) {
            r = vhost_virtqueue_set_busyloop_timeout(hdev, hdev->vq_index + i,
                                                     busyloop_timeout);
            if (r < 0) {
                error_setg_errno(errp, -r, "Failed to set busyloop timeout");
                goto fail_busyloop;
            }
        }
    }

    hdev->features = features;

    hdev->memory_listener = (MemoryListener) {
        .name = "vhost",
        .begin = vhost_begin,
        .commit = vhost_commit,
        .region_add = vhost_region_addnop,
        .region_nop = vhost_region_addnop,
        .log_start = vhost_log_start,
        .log_stop = vhost_log_stop,
        .log_sync = vhost_log_sync,
        .log_global_start = vhost_log_global_start,
        .log_global_stop = vhost_log_global_stop,
        .priority = MEMORY_LISTENER_PRIORITY_DEV_BACKEND
    };

    hdev->iommu_listener = (MemoryListener) {
        .name = "vhost-iommu",
        .region_add = vhost_iommu_region_add,
        .region_del = vhost_iommu_region_del,
    };

    if (hdev->migration_blocker == NULL) {
        if (!(hdev->features & (0x1ULL << VHOST_F_LOG_ALL))) {
            error_setg(&hdev->migration_blocker,
                       "Migration disabled: vhost lacks VHOST_F_LOG_ALL feature.");
        } else if (vhost_dev_log_is_shared(hdev) && !qemu_memfd_alloc_check()) {
            error_setg(&hdev->migration_blocker,
                       "Migration disabled: failed to allocate shared memory");
        }
    }

    if (hdev->migration_blocker != NULL) {
        r = migrate_add_blocker_normal(&hdev->migration_blocker, errp);
        if (r < 0) {
            goto fail_busyloop;
        }
    }

    hdev->mem = g_malloc0(offsetof(struct vhost_memory, regions));
    hdev->n_mem_sections = 0;
    hdev->mem_sections = NULL;
    hdev->log = NULL;
    hdev->log_size = 0;
    hdev->log_enabled = false;
    hdev->started = false;
    memory_listener_register(&hdev->memory_listener, &address_space_memory);
    QLIST_INSERT_HEAD(&vhost_devices, hdev, entry);

    /*
     * The listener we registered properly set up the number of required
     * memslots in vhost_commit().
     */
    used = hdev->mem->nregions;

    /*
     * We assume that all reserved memslots actually require a real memslot
     * in our vhost backend. This might not be true, for example, if the
     * memslot would be ROM. If ever relevant, we can optimize for that --
     * but we'll need additional information about the reservations.
     */
    reserved = memory_devices_get_reserved_memslots();
    if (used + reserved > limit) {
        error_setg(errp, "vhost backend memory slots limit (%d) is less"
                   " than current number of used (%d) and reserved (%d)"
                   " memory slots for memory devices.", limit, used, reserved);
        r = -EINVAL;
        goto fail_busyloop;
    }

    return 0;

fail_busyloop:
    if (busyloop_timeout) {
        while (--i >= 0) {
            vhost_virtqueue_set_busyloop_timeout(hdev, hdev->vq_index + i, 0);
        }
    }
fail:
    hdev->nvqs = n_initialized_vqs;
    vhost_dev_cleanup(hdev);
    return r;
}

void vhost_dev_cleanup(struct vhost_dev *hdev)
{
    int i;

    trace_vhost_dev_cleanup(hdev);

    for (i = 0; i < hdev->nvqs; ++i) {
        vhost_virtqueue_cleanup(hdev->vqs + i);
    }
    if (hdev->mem) {
        /* those are only safe after successful init */
        memory_listener_unregister(&hdev->memory_listener);
        QLIST_REMOVE(hdev, entry);
    }
    migrate_del_blocker(&hdev->migration_blocker);
    g_free(hdev->mem);
    g_free(hdev->mem_sections);
    if (hdev->vhost_ops) {
        hdev->vhost_ops->vhost_backend_cleanup(hdev);
    }
    assert(!hdev->log);

    memset(hdev, 0, sizeof(struct vhost_dev));
}

void vhost_dev_disable_notifiers_nvqs(struct vhost_dev *hdev,
                                      VirtIODevice *vdev,
                                      unsigned int nvqs)
{
    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
    int i, r;

    /*
     * Batch all the host notifiers in a single transaction to avoid
     * quadratic time complexity in address_space_update_ioeventfds().
     */
    memory_region_transaction_begin();

    for (i = 0; i < nvqs; ++i) {
        r = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i,
                                         false);
        if (r < 0) {
            error_report("vhost VQ %d notifier cleanup failed: %d", i, -r);
        }
        assert(r >= 0);
    }

    /*
     * The transaction expects the ioeventfds to be open when it
     * commits. Do it now, before the cleanup loop.
     */
    memory_region_transaction_commit();

    for (i = 0; i < nvqs; ++i) {
        virtio_bus_cleanup_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i);
    }
    virtio_device_release_ioeventfd(vdev);
}

/* Stop processing guest IO notifications in qemu.
 * Start processing them in vhost in kernel.
 */
int vhost_dev_enable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev)
{
    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
    int i, r;

    /* We will pass the notifiers to the kernel, make sure that QEMU
     * doesn't interfere.
     */
    r = virtio_device_grab_ioeventfd(vdev);
    if (r < 0) {
        error_report("binding does not support host notifiers");
        return r;
    }

    /*
     * Batch all the host notifiers in a single transaction to avoid
     * quadratic time complexity in address_space_update_ioeventfds().
     */
    memory_region_transaction_begin();

    for (i = 0; i < hdev->nvqs; ++i) {
        r = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i,
                                         true);
        if (r < 0) {
            error_report("vhost VQ %d notifier binding failed: %d", i, -r);
            memory_region_transaction_commit();
            vhost_dev_disable_notifiers_nvqs(hdev, vdev, i);
            return r;
        }
    }

    memory_region_transaction_commit();

    return 0;
}

/* Stop processing guest IO notifications in vhost.
 * Start processing them in qemu.
 * This might actually run the qemu handlers right away,
 * so virtio in qemu must be completely setup when this is called.
 */
void vhost_dev_disable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev)
{
    vhost_dev_disable_notifiers_nvqs(hdev, vdev, hdev->nvqs);
}

/* Test and clear event pending status.
 * Should be called after unmask to avoid losing events.
 */
bool vhost_virtqueue_pending(struct vhost_dev *hdev, int n)
{
    struct vhost_virtqueue *vq = hdev->vqs + n - hdev->vq_index;
    assert(n >= hdev->vq_index && n < hdev->vq_index + hdev->nvqs);
    return event_notifier_test_and_clear(&vq->masked_notifier);
}

/* Mask/unmask events from this vq. */
void vhost_virtqueue_mask(struct vhost_dev *hdev, VirtIODevice *vdev, int n,
                          bool mask)
{
    struct VirtQueue *vvq = virtio_get_queue(vdev, n);
    int r, index = n - hdev->vq_index;
    struct vhost_vring_file file;

    /* should only be called after backend is connected */
    assert(hdev->vhost_ops);

    if (mask) {
        assert(vdev->use_guest_notifier_mask);
        file.fd = event_notifier_get_wfd(&hdev->vqs[index].masked_notifier);
    } else {
        file.fd = event_notifier_get_wfd(virtio_queue_get_guest_notifier(vvq));
    }

    file.index = hdev->vhost_ops->vhost_get_vq_index(hdev, n);
    r = hdev->vhost_ops->vhost_set_vring_call(hdev, &file);
    if (r < 0) {
        error_report("vhost_set_vring_call failed %d", -r);
    }
}

bool vhost_config_pending(struct vhost_dev *hdev)
{
    assert(hdev->vhost_ops);
    if ((hdev->started == false) ||
        (hdev->vhost_ops->vhost_set_config_call == NULL)) {
        return false;
    }

    EventNotifier *notifier =
        &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier;
    return event_notifier_test_and_clear(notifier);
}

void vhost_config_mask(struct vhost_dev *hdev, VirtIODevice *vdev, bool mask)
{
    int fd;
    int r;
    EventNotifier *notifier =
        &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier;
    EventNotifier *config_notifier = &vdev->config_notifier;
    assert(hdev->vhost_ops);

    if ((hdev->started == false) ||
        (hdev->vhost_ops->vhost_set_config_call == NULL)) {
        return;
    }
    if (mask) {
        assert(vdev->use_guest_notifier_mask);
        fd = event_notifier_get_fd(notifier);
    } else {
        fd = event_notifier_get_fd(config_notifier);
    }
    r = hdev->vhost_ops->vhost_set_config_call(hdev, fd);
    if (r < 0) {
        error_report("vhost_set_config_call failed %d", -r);
    }
}

static void vhost_stop_config_intr(struct vhost_dev *dev)
{
    int fd = -1;
    assert(dev->vhost_ops);
    if (dev->vhost_ops->vhost_set_config_call) {
        dev->vhost_ops->vhost_set_config_call(dev, fd);
    }
}

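/*
 * Re-arm the config interrupt path: point the backend's config call fd at
 * the device's config_notifier again and kick the notifier once so that a
 * change which raced with the switch-over is not lost.
 */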
static void vhost_start_config_intr(struct vhost_dev *dev)
{
    int r;

    assert(dev->vhost_ops);
    int fd = event_notifier_get_fd(&dev->vdev->config_notifier);
    if (dev->vhost_ops->vhost_set_config_call) {
        r = dev->vhost_ops->vhost_set_config_call(dev, fd);
        if (!r) {
            event_notifier_set(&dev->vdev->config_notifier);
        }
    }
}

uint64_t vhost_get_features(struct vhost_dev *hdev, const int *feature_bits,
                            uint64_t features)
{
    const int *bit = feature_bits;
    while (*bit != VHOST_INVALID_FEATURE_BIT) {
        uint64_t bit_mask = (1ULL << *bit);
        if (!(hdev->features & bit_mask)) {
            features &= ~bit_mask;
        }
        bit++;
    }
    return features;
}

void vhost_ack_features(struct vhost_dev *hdev, const int *feature_bits,
                        uint64_t features)
{
    const int *bit = feature_bits;
    while (*bit != VHOST_INVALID_FEATURE_BIT) {
        uint64_t bit_mask = (1ULL << *bit);
        if (features & bit_mask) {
            hdev->acked_features |= bit_mask;
        }
        bit++;
    }
}

int vhost_dev_get_config(struct vhost_dev *hdev, uint8_t *config,
                         uint32_t config_len, Error **errp)
{
    assert(hdev->vhost_ops);

    if (hdev->vhost_ops->vhost_get_config) {
        return hdev->vhost_ops->vhost_get_config(hdev, config, config_len,
                                                 errp);
    }

    error_setg(errp, "vhost_get_config not implemented");
    return -ENOSYS;
}

int vhost_dev_set_config(struct vhost_dev *hdev, const uint8_t *data,
                         uint32_t offset, uint32_t size, uint32_t flags)
{
    assert(hdev->vhost_ops);

    if (hdev->vhost_ops->vhost_set_config) {
        return hdev->vhost_ops->vhost_set_config(hdev, data, offset,
                                                 size, flags);
    }

    return -ENOSYS;
}

void vhost_dev_set_config_notifier(struct vhost_dev *hdev,
                                   const VhostDevConfigOps *ops)
{
    hdev->config_ops = ops;
}

void vhost_dev_free_inflight(struct vhost_inflight *inflight)
{
    if (inflight && inflight->addr) {
        qemu_memfd_free(inflight->addr, inflight->size, inflight->fd);
        inflight->addr = NULL;
        inflight->fd = -1;
    }
}

int vhost_dev_prepare_inflight(struct vhost_dev *hdev, VirtIODevice *vdev)
{
    int r;

    if (hdev->vhost_ops->vhost_get_inflight_fd == NULL ||
        hdev->vhost_ops->vhost_set_inflight_fd == NULL) {
        return 0;
    }

    hdev->vdev = vdev;

    r = vhost_dev_set_features(hdev, hdev->log_enabled);
    if (r < 0) {
        VHOST_OPS_DEBUG(r, "vhost_dev_prepare_inflight failed");
        return r;
    }

    return 0;
}

int vhost_dev_set_inflight(struct vhost_dev *dev,
                           struct vhost_inflight *inflight)
{
    int r;

    if (dev->vhost_ops->vhost_set_inflight_fd && inflight->addr) {
        r = dev->vhost_ops->vhost_set_inflight_fd(dev, inflight);
        if (r) {
            VHOST_OPS_DEBUG(r, "vhost_set_inflight_fd failed");
            return r;
        }
    }

    return 0;
}

int vhost_dev_get_inflight(struct vhost_dev *dev, uint16_t queue_size,
                           struct vhost_inflight *inflight)
{
    int r;

    if (dev->vhost_ops->vhost_get_inflight_fd) {
        r = dev->vhost_ops->vhost_get_inflight_fd(dev, queue_size, inflight);
        if (r) {
            VHOST_OPS_DEBUG(r, "vhost_get_inflight_fd failed");
            return r;
        }
    }

    return 0;
}

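/*
 * Ask the backend to enable or disable all of the device's vrings at once;
 * backends that do not implement the op simply report success.
 */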

static int vhost_dev_set_vring_enable(struct vhost_dev *hdev, int enable)
{
    if (!hdev->vhost_ops->vhost_set_vring_enable) {
        return 0;
    }

    /*
     * For vhost-user devices, if VHOST_USER_F_PROTOCOL_FEATURES has not
     * been negotiated, the rings start directly in the enabled state, and
     * the .vhost_set_vring_enable callback will fail since
     * VHOST_USER_SET_VRING_ENABLE is not supported.
     */
    if (hdev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER &&
        !virtio_has_feature(hdev->backend_features,
                            VHOST_USER_F_PROTOCOL_FEATURES)) {
        return 0;
    }

    return hdev->vhost_ops->vhost_set_vring_enable(hdev, enable);
}
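
/*
 * Illustrative helper (hypothetical name, not part of the vhost API): the
 * guard above restated as a predicate.  A vhost-user backend can only toggle
 * individual rings once VHOST_USER_F_PROTOCOL_FEATURES has been negotiated;
 * without it the rings are live as soon as they are set up, so there is
 * nothing to enable or disable explicitly.
 */
static bool G_GNUC_UNUSED
vhost_example_backend_can_toggle_vrings(struct vhost_dev *hdev)
{
    return hdev->vhost_ops->vhost_set_vring_enable &&
           (hdev->vhost_ops->backend_type != VHOST_BACKEND_TYPE_USER ||
            virtio_has_feature(hdev->backend_features,
                               VHOST_USER_F_PROTOCOL_FEATURES));
}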

/*
 * Host notifiers must be enabled at this point.
 *
 * If @vrings is true, this function will enable all vrings before starting
 * the device. If it is false, the vring initialization is left to be done by
 * the caller.
 */
int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings)
{
    int i, r;

    /* should only be called after backend is connected */
    assert(hdev->vhost_ops);

    trace_vhost_dev_start(hdev, vdev->name, vrings);

    vdev->vhost_started = true;
    hdev->started = true;
    hdev->vdev = vdev;

    r = vhost_dev_set_features(hdev, hdev->log_enabled);
    if (r < 0) {
        goto fail_features;
    }

    if (vhost_dev_has_iommu(hdev)) {
        memory_listener_register(&hdev->iommu_listener, vdev->dma_as);
    }

    r = hdev->vhost_ops->vhost_set_mem_table(hdev, hdev->mem);
    if (r < 0) {
        VHOST_OPS_DEBUG(r, "vhost_set_mem_table failed");
        goto fail_mem;
    }
    for (i = 0; i < hdev->nvqs; ++i) {
        r = vhost_virtqueue_start(hdev,
                                  vdev,
                                  hdev->vqs + i,
                                  hdev->vq_index + i);
        if (r < 0) {
            goto fail_vq;
        }
    }

    r = event_notifier_init(
        &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier, 0);
    if (r < 0) {
        VHOST_OPS_DEBUG(r, "event_notifier_init failed");
        goto fail_vq;
    }
    event_notifier_test_and_clear(
        &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier);
    if (!vdev->use_guest_notifier_mask) {
        vhost_config_mask(hdev, vdev, true);
    }
    if (hdev->log_enabled) {
        uint64_t log_base;

        hdev->log_size = vhost_get_log_size(hdev);
        hdev->log = vhost_log_get(hdev->vhost_ops->backend_type,
                                  hdev->log_size,
                                  vhost_dev_log_is_shared(hdev));
        log_base = (uintptr_t)hdev->log->log;
        r = hdev->vhost_ops->vhost_set_log_base(hdev,
                                                hdev->log_size ? log_base : 0,
                                                hdev->log);
        if (r < 0) {
            VHOST_OPS_DEBUG(r, "vhost_set_log_base failed");
            goto fail_log;
        }
        vhost_dev_elect_mem_logger(hdev, true);
    }
    if (vrings) {
        r = vhost_dev_set_vring_enable(hdev, true);
        if (r) {
            goto fail_log;
        }
    }
    if (hdev->vhost_ops->vhost_dev_start) {
        r = hdev->vhost_ops->vhost_dev_start(hdev, true);
        if (r) {
            goto fail_start;
        }
    }
    if (vhost_dev_has_iommu(hdev) &&
        hdev->vhost_ops->vhost_set_iotlb_callback) {
        hdev->vhost_ops->vhost_set_iotlb_callback(hdev, true);

        /*
         * Update used ring information for IOTLB to work correctly;
         * the vhost-kernel code requires this.
         */
        for (i = 0; i < hdev->nvqs; ++i) {
            struct vhost_virtqueue *vq = hdev->vqs + i;
            r = vhost_device_iotlb_miss(hdev, vq->used_phys, true);
            if (r) {
                goto fail_iotlb;
            }
        }
    }
    vhost_start_config_intr(hdev);
    return 0;
fail_iotlb:
    if (vhost_dev_has_iommu(hdev) &&
        hdev->vhost_ops->vhost_set_iotlb_callback) {
        hdev->vhost_ops->vhost_set_iotlb_callback(hdev, false);
    }
    if (hdev->vhost_ops->vhost_dev_start) {
        hdev->vhost_ops->vhost_dev_start(hdev, false);
    }
fail_start:
    if (vrings) {
        vhost_dev_set_vring_enable(hdev, false);
    }
fail_log:
    vhost_log_put(hdev, false);
fail_vq:
    while (--i >= 0) {
        vhost_virtqueue_stop(hdev,
                             vdev,
                             hdev->vqs + i,
                             hdev->vq_index + i);
    }

fail_mem:
    if (vhost_dev_has_iommu(hdev)) {
        memory_listener_unregister(&hdev->iommu_listener);
    }
fail_features:
    vdev->vhost_started = false;
    hdev->started = false;
    return r;
}
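
/*
 * Illustrative sketch only (hypothetical name): the order of operations a
 * virtio front-end typically follows around vhost_dev_start().  Host
 * notifiers must already be wired up, hence vhost_dev_enable_notifiers()
 * first; the features the guest acked are mirrored into acked_features
 * before the start.  Guest-notifier setup through the virtio bus is omitted
 * for brevity, and the "enable all vrings" choice is an assumption of this
 * example.  The mirror-image teardown would call vhost_dev_stop() and then
 * vhost_dev_disable_notifiers().
 */
static int G_GNUC_UNUSED
vhost_example_start(struct vhost_dev *hdev, VirtIODevice *vdev)
{
    int r;

    r = vhost_dev_enable_notifiers(hdev, vdev);
    if (r < 0) {
        return r;
    }

    hdev->acked_features = vdev->guest_features;

    r = vhost_dev_start(hdev, vdev, true);
    if (r < 0) {
        vhost_dev_disable_notifiers(hdev, vdev);
        return r;
    }

    return 0;
}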

/* Host notifiers must be enabled at this point. */
static int do_vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev,
                             bool vrings, bool force)
{
    int i;
    int rc = 0;

    /* should only be called after backend is connected */
    assert(hdev->vhost_ops);
    event_notifier_test_and_clear(
        &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier);
    event_notifier_test_and_clear(&vdev->config_notifier);
    event_notifier_cleanup(
        &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier);

    trace_vhost_dev_stop(hdev, vdev->name, vrings);

    if (hdev->vhost_ops->vhost_dev_start) {
        hdev->vhost_ops->vhost_dev_start(hdev, false);
    }
    if (vrings) {
        vhost_dev_set_vring_enable(hdev, false);
    }
    for (i = 0; i < hdev->nvqs; ++i) {
        rc |= do_vhost_virtqueue_stop(hdev,
                                      vdev,
                                      hdev->vqs + i,
                                      hdev->vq_index + i,
                                      force);
    }
    if (hdev->vhost_ops->vhost_reset_status) {
        hdev->vhost_ops->vhost_reset_status(hdev);
    }

    if (vhost_dev_has_iommu(hdev)) {
        if (hdev->vhost_ops->vhost_set_iotlb_callback) {
            hdev->vhost_ops->vhost_set_iotlb_callback(hdev, false);
        }
        memory_listener_unregister(&hdev->iommu_listener);
    }
    vhost_stop_config_intr(hdev);
    vhost_log_put(hdev, true);
    hdev->started = false;
    vdev->vhost_started = false;
    hdev->vdev = NULL;
    return rc;
}

int vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings)
{
    return do_vhost_dev_stop(hdev, vdev, vrings, false);
}

int vhost_dev_force_stop(struct vhost_dev *hdev, VirtIODevice *vdev,
                         bool vrings)
{
    return do_vhost_dev_stop(hdev, vdev, vrings, true);
}

int vhost_net_set_backend(struct vhost_dev *hdev,
                          struct vhost_vring_file *file)
{
    if (hdev->vhost_ops->vhost_net_set_backend) {
        return hdev->vhost_ops->vhost_net_set_backend(hdev, file);
    }

    return -ENOSYS;
}

int vhost_reset_device(struct vhost_dev *hdev)
{
    if (hdev->vhost_ops->vhost_reset_device) {
        return hdev->vhost_ops->vhost_reset_device(hdev);
    }

    return -ENOSYS;
}

bool vhost_supports_device_state(struct vhost_dev *dev)
{
    if (dev->vhost_ops->vhost_supports_device_state) {
        return dev->vhost_ops->vhost_supports_device_state(dev);
    }

    return false;
}

int vhost_set_device_state_fd(struct vhost_dev *dev,
                              VhostDeviceStateDirection direction,
                              VhostDeviceStatePhase phase,
                              int fd,
                              int *reply_fd,
                              Error **errp)
{
    if (dev->vhost_ops->vhost_set_device_state_fd) {
        return dev->vhost_ops->vhost_set_device_state_fd(dev, direction, phase,
                                                         fd, reply_fd, errp);
    }

    error_setg(errp,
               "vhost transport does not support migration state transfer");
    return -ENOSYS;
}

int vhost_check_device_state(struct vhost_dev *dev, Error **errp)
{
    if (dev->vhost_ops->vhost_check_device_state) {
        return dev->vhost_ops->vhost_check_device_state(dev, errp);
    }

    error_setg(errp,
               "vhost transport does not support migration state transfer");
    return -ENOSYS;
}
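
/*
 * Illustrative sketch only (hypothetical helper): a front-end that wants to
 * use the backend state transfer helpers below (vhost_save_backend_state()
 * and vhost_load_backend_state()) would typically probe the capability once,
 * e.g. at realize time, and fall back to its existing migration strategy or
 * register a migration blocker when the transport cannot do it.
 */
static bool G_GNUC_UNUSED
vhost_example_can_transfer_state(struct vhost_dev *dev, Error **errp)
{
    if (!vhost_supports_device_state(dev)) {
        error_setg(errp,
                   "vhost backend does not support device state transfer");
        return false;
    }

    return true;
}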

int vhost_save_backend_state(struct vhost_dev *dev, QEMUFile *f, Error **errp)
{
    ERRP_GUARD();
    /* Maximum chunk size in which to transfer the state */
    const size_t chunk_size = 1 * 1024 * 1024;
    g_autofree void *transfer_buf = NULL;
    g_autoptr(GError) g_err = NULL;
    int pipe_fds[2], read_fd = -1, write_fd = -1, reply_fd = -1;
    int ret;

    /* [0] for reading (our end), [1] for writing (back-end's end) */
    if (!g_unix_open_pipe(pipe_fds, FD_CLOEXEC, &g_err)) {
        error_setg(errp, "Failed to set up state transfer pipe: %s",
                   g_err->message);
        ret = -EINVAL;
        goto fail;
    }

    read_fd = pipe_fds[0];
    write_fd = pipe_fds[1];

    /*
     * VHOST_TRANSFER_STATE_PHASE_STOPPED means the device must be stopped.
     * Ideally, it is suspended, but SUSPEND/RESUME currently do not exist for
     * vhost-user, so just check that it is at least stopped.
     */
    assert(!dev->started);

    /* Transfer ownership of write_fd to the back-end */
    ret = vhost_set_device_state_fd(dev,
                                    VHOST_TRANSFER_STATE_DIRECTION_SAVE,
                                    VHOST_TRANSFER_STATE_PHASE_STOPPED,
                                    write_fd,
                                    &reply_fd,
                                    errp);
    if (ret < 0) {
        error_prepend(errp, "Failed to initiate state transfer: ");
        goto fail;
    }

    /* If the back-end wishes to use a different pipe, switch over */
    if (reply_fd >= 0) {
        close(read_fd);
        read_fd = reply_fd;
    }

    transfer_buf = g_malloc(chunk_size);

    while (true) {
        ssize_t read_ret;

        read_ret = RETRY_ON_EINTR(read(read_fd, transfer_buf, chunk_size));
        if (read_ret < 0) {
            ret = -errno;
            error_setg_errno(errp, -ret, "Failed to receive state");
            goto fail;
        }

        assert(read_ret <= chunk_size);
        qemu_put_be32(f, read_ret);

        if (read_ret == 0) {
            /* EOF */
            break;
        }

        qemu_put_buffer(f, transfer_buf, read_ret);
    }

    /*
     * The back-end will not really care, but be clean and close our end of
     * the pipe before asking the back-end whether the transfer was successful.
     */
    close(read_fd);
    read_fd = -1;

    /* Also, verify that the device is still stopped */
    assert(!dev->started);

    ret = vhost_check_device_state(dev, errp);
    if (ret < 0) {
        goto fail;
    }

    ret = 0;
fail:
    if (read_fd >= 0) {
        close(read_fd);
    }

    return ret;
}
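
/*
 * Illustrative sketch only (hypothetical helper): the on-the-wire framing
 * produced by vhost_save_backend_state() above is a sequence of big-endian
 * 32-bit chunk lengths, each followed by that many raw state bytes, and
 * terminated by a zero length.  The helper below writes one already-buffered
 * blob in exactly that framing, which is what the loop above does chunk by
 * chunk as data trickles out of the back-end's pipe (chunks there are
 * bounded by chunk_size, so the 32-bit length never overflows).
 */
static void G_GNUC_UNUSED
vhost_example_put_state_blob(QEMUFile *f, const uint8_t *blob, uint32_t size)
{
    /* One length-prefixed chunk carrying the payload... */
    if (size > 0) {
        qemu_put_be32(f, size);
        qemu_put_buffer(f, blob, size);
    }
    /* ...and a zero-length chunk as the end-of-state marker. */
    qemu_put_be32(f, 0);
}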

int vhost_load_backend_state(struct vhost_dev *dev, QEMUFile *f, Error **errp)
{
    ERRP_GUARD();
    size_t transfer_buf_size = 0;
    g_autofree void *transfer_buf = NULL;
    g_autoptr(GError) g_err = NULL;
    int pipe_fds[2], read_fd = -1, write_fd = -1, reply_fd = -1;
    int ret;

    /* [0] for reading (back-end's end), [1] for writing (our end) */
    if (!g_unix_open_pipe(pipe_fds, FD_CLOEXEC, &g_err)) {
        error_setg(errp, "Failed to set up state transfer pipe: %s",
                   g_err->message);
        ret = -EINVAL;
        goto fail;
    }

    read_fd = pipe_fds[0];
    write_fd = pipe_fds[1];

    /*
     * VHOST_TRANSFER_STATE_PHASE_STOPPED means the device must be stopped.
     * Ideally, it is suspended, but SUSPEND/RESUME currently do not exist for
     * vhost-user, so just check that it is at least stopped.
     */
    assert(!dev->started);

    /* Transfer ownership of read_fd to the back-end */
    ret = vhost_set_device_state_fd(dev,
                                    VHOST_TRANSFER_STATE_DIRECTION_LOAD,
                                    VHOST_TRANSFER_STATE_PHASE_STOPPED,
                                    read_fd,
                                    &reply_fd,
                                    errp);
    if (ret < 0) {
        error_prepend(errp, "Failed to initiate state transfer: ");
        goto fail;
    }

    /* If the back-end wishes to use a different pipe, switch over */
    if (reply_fd >= 0) {
        close(write_fd);
        write_fd = reply_fd;
    }

    while (true) {
        size_t this_chunk_size = qemu_get_be32(f);
        ssize_t write_ret;
        const uint8_t *transfer_pointer;

        if (this_chunk_size == 0) {
            /* End of state */
            break;
        }

        if (transfer_buf_size < this_chunk_size) {
            transfer_buf = g_realloc(transfer_buf, this_chunk_size);
            transfer_buf_size = this_chunk_size;
        }

        if (qemu_get_buffer(f, transfer_buf, this_chunk_size) <
            this_chunk_size)
        {
            error_setg(errp, "Failed to read state");
            ret = -EINVAL;
            goto fail;
        }

        transfer_pointer = transfer_buf;
        while (this_chunk_size > 0) {
            write_ret = RETRY_ON_EINTR(
                write(write_fd, transfer_pointer, this_chunk_size)
            );
            if (write_ret < 0) {
                ret = -errno;
                error_setg_errno(errp, -ret, "Failed to send state");
                goto fail;
            } else if (write_ret == 0) {
                error_setg(errp, "Failed to send state: Connection is closed");
                ret = -ECONNRESET;
                goto fail;
            }

            assert(write_ret <= this_chunk_size);
            this_chunk_size -= write_ret;
            transfer_pointer += write_ret;
        }
    }

    /*
     * Close our end, thus ending the transfer, before asking the back-end
     * whether the transfer was successful.
     */
    close(write_fd);
    write_fd = -1;

    /* Also, verify that the device is still stopped */
    assert(!dev->started);

    ret = vhost_check_device_state(dev, errp);
    if (ret < 0) {
        goto fail;
    }

    ret = 0;
fail:
    if (write_fd >= 0) {
        close(write_fd);
    }

    return ret;
}
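
/*
 * Illustrative sketch only (hypothetical wrapper names, not part of the
 * vhost API): how a device's migration code might wrap the two helpers
 * above, converting the Error into a report so they can be driven from
 * callbacks that only return an int status.
 */
static int G_GNUC_UNUSED
vhost_example_save_state(struct vhost_dev *dev, QEMUFile *f)
{
    Error *local_err = NULL;
    int ret = vhost_save_backend_state(dev, f, &local_err);

    if (local_err) {
        error_report_err(local_err);
    }
    return ret;
}

static int G_GNUC_UNUSED
vhost_example_load_state(struct vhost_dev *dev, QEMUFile *f)
{
    Error *local_err = NULL;
    int ret = vhost_load_backend_state(dev, f, &local_err);

    if (local_err) {
        error_report_err(local_err);
    }
    return ret;
}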