1 /* 2 * vhost support 3 * 4 * Copyright Red Hat, Inc. 2010 5 * 6 * Authors: 7 * Michael S. Tsirkin <mst@redhat.com> 8 * 9 * This work is licensed under the terms of the GNU GPL, version 2. See 10 * the COPYING file in the top-level directory. 11 * 12 * Contributions after 2012-01-13 are licensed under the terms of the 13 * GNU GPL, version 2 or (at your option) any later version. 14 */ 15 16 #include "qemu/osdep.h" 17 #include "qapi/error.h" 18 #include "hw/virtio/vhost.h" 19 #include "qemu/atomic.h" 20 #include "qemu/range.h" 21 #include "qemu/error-report.h" 22 #include "qemu/memfd.h" 23 #include "qemu/log.h" 24 #include "standard-headers/linux/vhost_types.h" 25 #include "hw/virtio/virtio-bus.h" 26 #include "hw/mem/memory-device.h" 27 #include "migration/blocker.h" 28 #include "migration/qemu-file-types.h" 29 #include "sysemu/dma.h" 30 #include "trace.h" 31 32 /* enabled until disconnected backend stabilizes */ 33 #define _VHOST_DEBUG 1 34 35 #ifdef _VHOST_DEBUG 36 #define VHOST_OPS_DEBUG(retval, fmt, ...) \ 37 do { \ 38 error_report(fmt ": %s (%d)", ## __VA_ARGS__, \ 39 strerror(-retval), -retval); \ 40 } while (0) 41 #else 42 #define VHOST_OPS_DEBUG(retval, fmt, ...) \ 43 do { } while (0) 44 #endif 45 46 static struct vhost_log *vhost_log; 47 static struct vhost_log *vhost_log_shm; 48 49 /* Memslots used by backends that support private memslots (without an fd). */ 50 static unsigned int used_memslots; 51 52 /* Memslots used by backends that only support shared memslots (with an fd). */ 53 static unsigned int used_shared_memslots; 54 55 static QLIST_HEAD(, vhost_dev) vhost_devices = 56 QLIST_HEAD_INITIALIZER(vhost_devices); 57 58 unsigned int vhost_get_max_memslots(void) 59 { 60 unsigned int max = UINT_MAX; 61 struct vhost_dev *hdev; 62 63 QLIST_FOREACH(hdev, &vhost_devices, entry) { 64 max = MIN(max, hdev->vhost_ops->vhost_backend_memslots_limit(hdev)); 65 } 66 return max; 67 } 68 69 unsigned int vhost_get_free_memslots(void) 70 { 71 unsigned int free = UINT_MAX; 72 struct vhost_dev *hdev; 73 74 QLIST_FOREACH(hdev, &vhost_devices, entry) { 75 unsigned int r = hdev->vhost_ops->vhost_backend_memslots_limit(hdev); 76 unsigned int cur_free; 77 78 if (hdev->vhost_ops->vhost_backend_no_private_memslots && 79 hdev->vhost_ops->vhost_backend_no_private_memslots(hdev)) { 80 cur_free = r - used_shared_memslots; 81 } else { 82 cur_free = r - used_memslots; 83 } 84 free = MIN(free, cur_free); 85 } 86 return free; 87 } 88 89 static void vhost_dev_sync_region(struct vhost_dev *dev, 90 MemoryRegionSection *section, 91 uint64_t mfirst, uint64_t mlast, 92 uint64_t rfirst, uint64_t rlast) 93 { 94 vhost_log_chunk_t *dev_log = dev->log->log; 95 96 uint64_t start = MAX(mfirst, rfirst); 97 uint64_t end = MIN(mlast, rlast); 98 vhost_log_chunk_t *from = dev_log + start / VHOST_LOG_CHUNK; 99 vhost_log_chunk_t *to = dev_log + end / VHOST_LOG_CHUNK + 1; 100 uint64_t addr = QEMU_ALIGN_DOWN(start, VHOST_LOG_CHUNK); 101 102 if (end < start) { 103 return; 104 } 105 assert(end / VHOST_LOG_CHUNK < dev->log_size); 106 assert(start / VHOST_LOG_CHUNK < dev->log_size); 107 108 for (;from < to; ++from) { 109 vhost_log_chunk_t log; 110 /* We first check with non-atomic: much cheaper, 111 * and we expect non-dirty to be the common case. */ 112 if (!*from) { 113 addr += VHOST_LOG_CHUNK; 114 continue; 115 } 116 /* Data must be read atomically. We don't really need barrier semantics 117 * but it's easier to use atomic_* than roll our own. 
*/ 118 log = qatomic_xchg(from, 0); 119 while (log) { 120 int bit = ctzl(log); 121 hwaddr page_addr; 122 hwaddr section_offset; 123 hwaddr mr_offset; 124 page_addr = addr + bit * VHOST_LOG_PAGE; 125 section_offset = page_addr - section->offset_within_address_space; 126 mr_offset = section_offset + section->offset_within_region; 127 memory_region_set_dirty(section->mr, mr_offset, VHOST_LOG_PAGE); 128 log &= ~(0x1ull << bit); 129 } 130 addr += VHOST_LOG_CHUNK; 131 } 132 } 133 134 bool vhost_dev_has_iommu(struct vhost_dev *dev) 135 { 136 VirtIODevice *vdev = dev->vdev; 137 138 /* 139 * For vhost, VIRTIO_F_IOMMU_PLATFORM means the backend support 140 * incremental memory mapping API via IOTLB API. For platform that 141 * does not have IOMMU, there's no need to enable this feature 142 * which may cause unnecessary IOTLB miss/update transactions. 143 */ 144 if (vdev) { 145 return virtio_bus_device_iommu_enabled(vdev) && 146 virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM); 147 } else { 148 return false; 149 } 150 } 151 152 static int vhost_sync_dirty_bitmap(struct vhost_dev *dev, 153 MemoryRegionSection *section, 154 hwaddr first, 155 hwaddr last) 156 { 157 int i; 158 hwaddr start_addr; 159 hwaddr end_addr; 160 161 if (!dev->log_enabled || !dev->started) { 162 return 0; 163 } 164 start_addr = section->offset_within_address_space; 165 end_addr = range_get_last(start_addr, int128_get64(section->size)); 166 start_addr = MAX(first, start_addr); 167 end_addr = MIN(last, end_addr); 168 169 for (i = 0; i < dev->mem->nregions; ++i) { 170 struct vhost_memory_region *reg = dev->mem->regions + i; 171 vhost_dev_sync_region(dev, section, start_addr, end_addr, 172 reg->guest_phys_addr, 173 range_get_last(reg->guest_phys_addr, 174 reg->memory_size)); 175 } 176 for (i = 0; i < dev->nvqs; ++i) { 177 struct vhost_virtqueue *vq = dev->vqs + i; 178 179 if (!vq->used_phys && !vq->used_size) { 180 continue; 181 } 182 183 if (vhost_dev_has_iommu(dev)) { 184 IOMMUTLBEntry iotlb; 185 hwaddr used_phys = vq->used_phys, used_size = vq->used_size; 186 hwaddr phys, s, offset; 187 188 while (used_size) { 189 rcu_read_lock(); 190 iotlb = address_space_get_iotlb_entry(dev->vdev->dma_as, 191 used_phys, 192 true, 193 MEMTXATTRS_UNSPECIFIED); 194 rcu_read_unlock(); 195 196 if (!iotlb.target_as) { 197 qemu_log_mask(LOG_GUEST_ERROR, "translation " 198 "failure for used_iova %"PRIx64"\n", 199 used_phys); 200 return -EINVAL; 201 } 202 203 offset = used_phys & iotlb.addr_mask; 204 phys = iotlb.translated_addr + offset; 205 206 /* 207 * Distance from start of used ring until last byte of 208 * IOMMU page. 209 */ 210 s = iotlb.addr_mask - offset; 211 /* 212 * Size of used ring, or of the part of it until end 213 * of IOMMU page. To avoid zero result, do the adding 214 * outside of MIN(). 
215 */ 216 s = MIN(s, used_size - 1) + 1; 217 218 vhost_dev_sync_region(dev, section, start_addr, end_addr, phys, 219 range_get_last(phys, s)); 220 used_size -= s; 221 used_phys += s; 222 } 223 } else { 224 vhost_dev_sync_region(dev, section, start_addr, 225 end_addr, vq->used_phys, 226 range_get_last(vq->used_phys, vq->used_size)); 227 } 228 } 229 return 0; 230 } 231 232 static void vhost_log_sync(MemoryListener *listener, 233 MemoryRegionSection *section) 234 { 235 struct vhost_dev *dev = container_of(listener, struct vhost_dev, 236 memory_listener); 237 vhost_sync_dirty_bitmap(dev, section, 0x0, ~0x0ULL); 238 } 239 240 static void vhost_log_sync_range(struct vhost_dev *dev, 241 hwaddr first, hwaddr last) 242 { 243 int i; 244 /* FIXME: this is N^2 in number of sections */ 245 for (i = 0; i < dev->n_mem_sections; ++i) { 246 MemoryRegionSection *section = &dev->mem_sections[i]; 247 vhost_sync_dirty_bitmap(dev, section, first, last); 248 } 249 } 250 251 static uint64_t vhost_get_log_size(struct vhost_dev *dev) 252 { 253 uint64_t log_size = 0; 254 int i; 255 for (i = 0; i < dev->mem->nregions; ++i) { 256 struct vhost_memory_region *reg = dev->mem->regions + i; 257 uint64_t last = range_get_last(reg->guest_phys_addr, 258 reg->memory_size); 259 log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1); 260 } 261 return log_size; 262 } 263 264 static int vhost_set_backend_type(struct vhost_dev *dev, 265 VhostBackendType backend_type) 266 { 267 int r = 0; 268 269 switch (backend_type) { 270 #ifdef CONFIG_VHOST_KERNEL 271 case VHOST_BACKEND_TYPE_KERNEL: 272 dev->vhost_ops = &kernel_ops; 273 break; 274 #endif 275 #ifdef CONFIG_VHOST_USER 276 case VHOST_BACKEND_TYPE_USER: 277 dev->vhost_ops = &user_ops; 278 break; 279 #endif 280 #ifdef CONFIG_VHOST_VDPA 281 case VHOST_BACKEND_TYPE_VDPA: 282 dev->vhost_ops = &vdpa_ops; 283 break; 284 #endif 285 default: 286 error_report("Unknown vhost backend type"); 287 r = -1; 288 } 289 290 return r; 291 } 292 293 static struct vhost_log *vhost_log_alloc(uint64_t size, bool share) 294 { 295 Error *err = NULL; 296 struct vhost_log *log; 297 uint64_t logsize = size * sizeof(*(log->log)); 298 int fd = -1; 299 300 log = g_new0(struct vhost_log, 1); 301 if (share) { 302 log->log = qemu_memfd_alloc("vhost-log", logsize, 303 F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL, 304 &fd, &err); 305 if (err) { 306 error_report_err(err); 307 g_free(log); 308 return NULL; 309 } 310 memset(log->log, 0, logsize); 311 } else { 312 log->log = g_malloc0(logsize); 313 } 314 315 log->size = size; 316 log->refcnt = 1; 317 log->fd = fd; 318 319 return log; 320 } 321 322 static struct vhost_log *vhost_log_get(uint64_t size, bool share) 323 { 324 struct vhost_log *log = share ? 
vhost_log_shm : vhost_log; 325 326 if (!log || log->size != size) { 327 log = vhost_log_alloc(size, share); 328 if (share) { 329 vhost_log_shm = log; 330 } else { 331 vhost_log = log; 332 } 333 } else { 334 ++log->refcnt; 335 } 336 337 return log; 338 } 339 340 static void vhost_log_put(struct vhost_dev *dev, bool sync) 341 { 342 struct vhost_log *log = dev->log; 343 344 if (!log) { 345 return; 346 } 347 348 --log->refcnt; 349 if (log->refcnt == 0) { 350 /* Sync only the range covered by the old log */ 351 if (dev->log_size && sync) { 352 vhost_log_sync_range(dev, 0, dev->log_size * VHOST_LOG_CHUNK - 1); 353 } 354 355 if (vhost_log == log) { 356 g_free(log->log); 357 vhost_log = NULL; 358 } else if (vhost_log_shm == log) { 359 qemu_memfd_free(log->log, log->size * sizeof(*(log->log)), 360 log->fd); 361 vhost_log_shm = NULL; 362 } 363 364 g_free(log); 365 } 366 367 dev->log = NULL; 368 dev->log_size = 0; 369 } 370 371 static bool vhost_dev_log_is_shared(struct vhost_dev *dev) 372 { 373 return dev->vhost_ops->vhost_requires_shm_log && 374 dev->vhost_ops->vhost_requires_shm_log(dev); 375 } 376 377 static inline void vhost_dev_log_resize(struct vhost_dev *dev, uint64_t size) 378 { 379 struct vhost_log *log = vhost_log_get(size, vhost_dev_log_is_shared(dev)); 380 uint64_t log_base = (uintptr_t)log->log; 381 int r; 382 383 /* inform backend of log switching, this must be done before 384 releasing the current log, to ensure no logging is lost */ 385 r = dev->vhost_ops->vhost_set_log_base(dev, log_base, log); 386 if (r < 0) { 387 VHOST_OPS_DEBUG(r, "vhost_set_log_base failed"); 388 } 389 390 vhost_log_put(dev, true); 391 dev->log = log; 392 dev->log_size = size; 393 } 394 395 static void *vhost_memory_map(struct vhost_dev *dev, hwaddr addr, 396 hwaddr *plen, bool is_write) 397 { 398 if (!vhost_dev_has_iommu(dev)) { 399 return cpu_physical_memory_map(addr, plen, is_write); 400 } else { 401 return (void *)(uintptr_t)addr; 402 } 403 } 404 405 static void vhost_memory_unmap(struct vhost_dev *dev, void *buffer, 406 hwaddr len, int is_write, 407 hwaddr access_len) 408 { 409 if (!vhost_dev_has_iommu(dev)) { 410 cpu_physical_memory_unmap(buffer, len, is_write, access_len); 411 } 412 } 413 414 static int vhost_verify_ring_part_mapping(void *ring_hva, 415 uint64_t ring_gpa, 416 uint64_t ring_size, 417 void *reg_hva, 418 uint64_t reg_gpa, 419 uint64_t reg_size) 420 { 421 uint64_t hva_ring_offset; 422 uint64_t ring_last = range_get_last(ring_gpa, ring_size); 423 uint64_t reg_last = range_get_last(reg_gpa, reg_size); 424 425 if (ring_last < reg_gpa || ring_gpa > reg_last) { 426 return 0; 427 } 428 /* check that whole ring's is mapped */ 429 if (ring_last > reg_last) { 430 return -ENOMEM; 431 } 432 /* check that ring's MemoryRegion wasn't replaced */ 433 hva_ring_offset = ring_gpa - reg_gpa; 434 if (ring_hva != reg_hva + hva_ring_offset) { 435 return -EBUSY; 436 } 437 438 return 0; 439 } 440 441 static int vhost_verify_ring_mappings(struct vhost_dev *dev, 442 void *reg_hva, 443 uint64_t reg_gpa, 444 uint64_t reg_size) 445 { 446 int i, j; 447 int r = 0; 448 const char *part_name[] = { 449 "descriptor table", 450 "available ring", 451 "used ring" 452 }; 453 454 if (vhost_dev_has_iommu(dev)) { 455 return 0; 456 } 457 458 for (i = 0; i < dev->nvqs; ++i) { 459 struct vhost_virtqueue *vq = dev->vqs + i; 460 461 if (vq->desc_phys == 0) { 462 continue; 463 } 464 465 j = 0; 466 r = vhost_verify_ring_part_mapping( 467 vq->desc, vq->desc_phys, vq->desc_size, 468 reg_hva, reg_gpa, reg_size); 469 if (r) { 470 break; 471 } 
472 473 j++; 474 r = vhost_verify_ring_part_mapping( 475 vq->avail, vq->avail_phys, vq->avail_size, 476 reg_hva, reg_gpa, reg_size); 477 if (r) { 478 break; 479 } 480 481 j++; 482 r = vhost_verify_ring_part_mapping( 483 vq->used, vq->used_phys, vq->used_size, 484 reg_hva, reg_gpa, reg_size); 485 if (r) { 486 break; 487 } 488 } 489 490 if (r == -ENOMEM) { 491 error_report("Unable to map %s for ring %d", part_name[j], i); 492 } else if (r == -EBUSY) { 493 error_report("%s relocated for ring %d", part_name[j], i); 494 } 495 return r; 496 } 497 498 /* 499 * vhost_section: identify sections needed for vhost access 500 * 501 * We only care about RAM sections here (where virtqueue and guest 502 * internals accessed by virtio might live). 503 */ 504 static bool vhost_section(struct vhost_dev *dev, MemoryRegionSection *section) 505 { 506 MemoryRegion *mr = section->mr; 507 508 if (memory_region_is_ram(mr) && !memory_region_is_rom(mr)) { 509 uint8_t dirty_mask = memory_region_get_dirty_log_mask(mr); 510 uint8_t handled_dirty; 511 512 /* 513 * Kernel based vhost doesn't handle any block which is doing 514 * dirty-tracking other than migration for which it has 515 * specific logging support. However for TCG the kernel never 516 * gets involved anyway so we can also ignore it's 517 * self-modiying code detection flags. However a vhost-user 518 * client could still confuse a TCG guest if it re-writes 519 * executable memory that has already been translated. 520 */ 521 handled_dirty = (1 << DIRTY_MEMORY_MIGRATION) | 522 (1 << DIRTY_MEMORY_CODE); 523 524 if (dirty_mask & ~handled_dirty) { 525 trace_vhost_reject_section(mr->name, 1); 526 return false; 527 } 528 529 /* 530 * Some backends (like vhost-user) can only handle memory regions 531 * that have an fd (can be mapped into a different process). Filter 532 * the ones without an fd out, if requested. 533 * 534 * TODO: we might have to limit to MAP_SHARED as well. 535 */ 536 if (memory_region_get_fd(section->mr) < 0 && 537 dev->vhost_ops->vhost_backend_no_private_memslots && 538 dev->vhost_ops->vhost_backend_no_private_memslots(dev)) { 539 trace_vhost_reject_section(mr->name, 2); 540 return false; 541 } 542 543 trace_vhost_section(mr->name); 544 return true; 545 } else { 546 trace_vhost_reject_section(mr->name, 3); 547 return false; 548 } 549 } 550 551 static void vhost_begin(MemoryListener *listener) 552 { 553 struct vhost_dev *dev = container_of(listener, struct vhost_dev, 554 memory_listener); 555 dev->tmp_sections = NULL; 556 dev->n_tmp_sections = 0; 557 } 558 559 static void vhost_commit(MemoryListener *listener) 560 { 561 struct vhost_dev *dev = container_of(listener, struct vhost_dev, 562 memory_listener); 563 MemoryRegionSection *old_sections; 564 int n_old_sections; 565 uint64_t log_size; 566 size_t regions_size; 567 int r; 568 int i; 569 bool changed = false; 570 571 /* Note we can be called before the device is started, but then 572 * starting the device calls set_mem_table, so we need to have 573 * built the data structures. 
574 */ 575 old_sections = dev->mem_sections; 576 n_old_sections = dev->n_mem_sections; 577 dev->mem_sections = dev->tmp_sections; 578 dev->n_mem_sections = dev->n_tmp_sections; 579 580 if (dev->n_mem_sections != n_old_sections) { 581 changed = true; 582 } else { 583 /* Same size, lets check the contents */ 584 for (i = 0; i < n_old_sections; i++) { 585 if (!MemoryRegionSection_eq(&old_sections[i], 586 &dev->mem_sections[i])) { 587 changed = true; 588 break; 589 } 590 } 591 } 592 593 trace_vhost_commit(dev->started, changed); 594 if (!changed) { 595 goto out; 596 } 597 598 /* Rebuild the regions list from the new sections list */ 599 regions_size = offsetof(struct vhost_memory, regions) + 600 dev->n_mem_sections * sizeof dev->mem->regions[0]; 601 dev->mem = g_realloc(dev->mem, regions_size); 602 dev->mem->nregions = dev->n_mem_sections; 603 604 if (dev->vhost_ops->vhost_backend_no_private_memslots && 605 dev->vhost_ops->vhost_backend_no_private_memslots(dev)) { 606 used_shared_memslots = dev->mem->nregions; 607 } else { 608 used_memslots = dev->mem->nregions; 609 } 610 611 for (i = 0; i < dev->n_mem_sections; i++) { 612 struct vhost_memory_region *cur_vmr = dev->mem->regions + i; 613 struct MemoryRegionSection *mrs = dev->mem_sections + i; 614 615 cur_vmr->guest_phys_addr = mrs->offset_within_address_space; 616 cur_vmr->memory_size = int128_get64(mrs->size); 617 cur_vmr->userspace_addr = 618 (uintptr_t)memory_region_get_ram_ptr(mrs->mr) + 619 mrs->offset_within_region; 620 cur_vmr->flags_padding = 0; 621 } 622 623 if (!dev->started) { 624 goto out; 625 } 626 627 for (i = 0; i < dev->mem->nregions; i++) { 628 if (vhost_verify_ring_mappings(dev, 629 (void *)(uintptr_t)dev->mem->regions[i].userspace_addr, 630 dev->mem->regions[i].guest_phys_addr, 631 dev->mem->regions[i].memory_size)) { 632 error_report("Verify ring failure on region %d", i); 633 abort(); 634 } 635 } 636 637 if (!dev->log_enabled) { 638 r = dev->vhost_ops->vhost_set_mem_table(dev, dev->mem); 639 if (r < 0) { 640 VHOST_OPS_DEBUG(r, "vhost_set_mem_table failed"); 641 } 642 goto out; 643 } 644 log_size = vhost_get_log_size(dev); 645 /* We allocate an extra 4K bytes to log, 646 * to reduce the * number of reallocations. */ 647 #define VHOST_LOG_BUFFER (0x1000 / sizeof *dev->log) 648 /* To log more, must increase log size before table update. */ 649 if (dev->log_size < log_size) { 650 vhost_dev_log_resize(dev, log_size + VHOST_LOG_BUFFER); 651 } 652 r = dev->vhost_ops->vhost_set_mem_table(dev, dev->mem); 653 if (r < 0) { 654 VHOST_OPS_DEBUG(r, "vhost_set_mem_table failed"); 655 } 656 /* To log less, can only decrease log size after table update. */ 657 if (dev->log_size > log_size + VHOST_LOG_BUFFER) { 658 vhost_dev_log_resize(dev, log_size); 659 } 660 661 out: 662 /* Deref the old list of sections, this must happen _after_ the 663 * vhost_set_mem_table to ensure the client isn't still using the 664 * section we're about to unref. 665 */ 666 while (n_old_sections--) { 667 memory_region_unref(old_sections[n_old_sections].mr); 668 } 669 g_free(old_sections); 670 return; 671 } 672 673 /* Adds the section data to the tmp_section structure. 674 * It relies on the listener calling us in memory address order 675 * and for each region (via the _add and _nop methods) to 676 * join neighbours. 
677 */ 678 static void vhost_region_add_section(struct vhost_dev *dev, 679 MemoryRegionSection *section) 680 { 681 bool need_add = true; 682 uint64_t mrs_size = int128_get64(section->size); 683 uint64_t mrs_gpa = section->offset_within_address_space; 684 uintptr_t mrs_host = (uintptr_t)memory_region_get_ram_ptr(section->mr) + 685 section->offset_within_region; 686 RAMBlock *mrs_rb = section->mr->ram_block; 687 688 trace_vhost_region_add_section(section->mr->name, mrs_gpa, mrs_size, 689 mrs_host); 690 691 if (dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER) { 692 /* Round the section to it's page size */ 693 /* First align the start down to a page boundary */ 694 size_t mrs_page = qemu_ram_pagesize(mrs_rb); 695 uint64_t alignage = mrs_host & (mrs_page - 1); 696 if (alignage) { 697 mrs_host -= alignage; 698 mrs_size += alignage; 699 mrs_gpa -= alignage; 700 } 701 /* Now align the size up to a page boundary */ 702 alignage = mrs_size & (mrs_page - 1); 703 if (alignage) { 704 mrs_size += mrs_page - alignage; 705 } 706 trace_vhost_region_add_section_aligned(section->mr->name, mrs_gpa, 707 mrs_size, mrs_host); 708 } 709 710 if (dev->n_tmp_sections && !section->unmergeable) { 711 /* Since we already have at least one section, lets see if 712 * this extends it; since we're scanning in order, we only 713 * have to look at the last one, and the FlatView that calls 714 * us shouldn't have overlaps. 715 */ 716 MemoryRegionSection *prev_sec = dev->tmp_sections + 717 (dev->n_tmp_sections - 1); 718 uint64_t prev_gpa_start = prev_sec->offset_within_address_space; 719 uint64_t prev_size = int128_get64(prev_sec->size); 720 uint64_t prev_gpa_end = range_get_last(prev_gpa_start, prev_size); 721 uint64_t prev_host_start = 722 (uintptr_t)memory_region_get_ram_ptr(prev_sec->mr) + 723 prev_sec->offset_within_region; 724 uint64_t prev_host_end = range_get_last(prev_host_start, prev_size); 725 726 if (mrs_gpa <= (prev_gpa_end + 1)) { 727 /* OK, looks like overlapping/intersecting - it's possible that 728 * the rounding to page sizes has made them overlap, but they should 729 * match up in the same RAMBlock if they do. 
730 */ 731 if (mrs_gpa < prev_gpa_start) { 732 error_report("%s:Section '%s' rounded to %"PRIx64 733 " prior to previous '%s' %"PRIx64, 734 __func__, section->mr->name, mrs_gpa, 735 prev_sec->mr->name, prev_gpa_start); 736 /* A way to cleanly fail here would be better */ 737 return; 738 } 739 /* Offset from the start of the previous GPA to this GPA */ 740 size_t offset = mrs_gpa - prev_gpa_start; 741 742 if (prev_host_start + offset == mrs_host && 743 section->mr == prev_sec->mr && !prev_sec->unmergeable) { 744 uint64_t max_end = MAX(prev_host_end, mrs_host + mrs_size); 745 need_add = false; 746 prev_sec->offset_within_address_space = 747 MIN(prev_gpa_start, mrs_gpa); 748 prev_sec->offset_within_region = 749 MIN(prev_host_start, mrs_host) - 750 (uintptr_t)memory_region_get_ram_ptr(prev_sec->mr); 751 prev_sec->size = int128_make64(max_end - MIN(prev_host_start, 752 mrs_host)); 753 trace_vhost_region_add_section_merge(section->mr->name, 754 int128_get64(prev_sec->size), 755 prev_sec->offset_within_address_space, 756 prev_sec->offset_within_region); 757 } else { 758 /* adjoining regions are fine, but overlapping ones with 759 * different blocks/offsets shouldn't happen 760 */ 761 if (mrs_gpa != prev_gpa_end + 1) { 762 error_report("%s: Overlapping but not coherent sections " 763 "at %"PRIx64, 764 __func__, mrs_gpa); 765 return; 766 } 767 } 768 } 769 } 770 771 if (need_add) { 772 ++dev->n_tmp_sections; 773 dev->tmp_sections = g_renew(MemoryRegionSection, dev->tmp_sections, 774 dev->n_tmp_sections); 775 dev->tmp_sections[dev->n_tmp_sections - 1] = *section; 776 /* The flatview isn't stable and we don't use it, making it NULL 777 * means we can memcmp the list. 778 */ 779 dev->tmp_sections[dev->n_tmp_sections - 1].fv = NULL; 780 memory_region_ref(section->mr); 781 } 782 } 783 784 /* Used for both add and nop callbacks */ 785 static void vhost_region_addnop(MemoryListener *listener, 786 MemoryRegionSection *section) 787 { 788 struct vhost_dev *dev = container_of(listener, struct vhost_dev, 789 memory_listener); 790 791 if (!vhost_section(dev, section)) { 792 return; 793 } 794 vhost_region_add_section(dev, section); 795 } 796 797 static void vhost_iommu_unmap_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) 798 { 799 struct vhost_iommu *iommu = container_of(n, struct vhost_iommu, n); 800 struct vhost_dev *hdev = iommu->hdev; 801 hwaddr iova = iotlb->iova + iommu->iommu_offset; 802 803 if (vhost_backend_invalidate_device_iotlb(hdev, iova, 804 iotlb->addr_mask + 1)) { 805 error_report("Fail to invalidate device iotlb"); 806 } 807 } 808 809 static void vhost_iommu_region_add(MemoryListener *listener, 810 MemoryRegionSection *section) 811 { 812 struct vhost_dev *dev = container_of(listener, struct vhost_dev, 813 iommu_listener); 814 struct vhost_iommu *iommu; 815 Int128 end; 816 int iommu_idx; 817 IOMMUMemoryRegion *iommu_mr; 818 819 if (!memory_region_is_iommu(section->mr)) { 820 return; 821 } 822 823 iommu_mr = IOMMU_MEMORY_REGION(section->mr); 824 825 iommu = g_malloc0(sizeof(*iommu)); 826 end = int128_add(int128_make64(section->offset_within_region), 827 section->size); 828 end = int128_sub(end, int128_one()); 829 iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr, 830 MEMTXATTRS_UNSPECIFIED); 831 iommu_notifier_init(&iommu->n, vhost_iommu_unmap_notify, 832 dev->vdev->device_iotlb_enabled ? 
833 IOMMU_NOTIFIER_DEVIOTLB_UNMAP : 834 IOMMU_NOTIFIER_UNMAP, 835 section->offset_within_region, 836 int128_get64(end), 837 iommu_idx); 838 iommu->mr = section->mr; 839 iommu->iommu_offset = section->offset_within_address_space - 840 section->offset_within_region; 841 iommu->hdev = dev; 842 memory_region_register_iommu_notifier(section->mr, &iommu->n, 843 &error_fatal); 844 QLIST_INSERT_HEAD(&dev->iommu_list, iommu, iommu_next); 845 /* TODO: can replay help performance here? */ 846 } 847 848 static void vhost_iommu_region_del(MemoryListener *listener, 849 MemoryRegionSection *section) 850 { 851 struct vhost_dev *dev = container_of(listener, struct vhost_dev, 852 iommu_listener); 853 struct vhost_iommu *iommu; 854 855 if (!memory_region_is_iommu(section->mr)) { 856 return; 857 } 858 859 QLIST_FOREACH(iommu, &dev->iommu_list, iommu_next) { 860 if (iommu->mr == section->mr && 861 iommu->n.start == section->offset_within_region) { 862 memory_region_unregister_iommu_notifier(iommu->mr, 863 &iommu->n); 864 QLIST_REMOVE(iommu, iommu_next); 865 g_free(iommu); 866 break; 867 } 868 } 869 } 870 871 void vhost_toggle_device_iotlb(VirtIODevice *vdev) 872 { 873 VirtioDeviceClass *vdc = VIRTIO_DEVICE_GET_CLASS(vdev); 874 struct vhost_dev *dev; 875 struct vhost_iommu *iommu; 876 877 if (vdev->vhost_started) { 878 dev = vdc->get_vhost(vdev); 879 } else { 880 return; 881 } 882 883 QLIST_FOREACH(iommu, &dev->iommu_list, iommu_next) { 884 memory_region_unregister_iommu_notifier(iommu->mr, &iommu->n); 885 iommu->n.notifier_flags = vdev->device_iotlb_enabled ? 886 IOMMU_NOTIFIER_DEVIOTLB_UNMAP : IOMMU_NOTIFIER_UNMAP; 887 memory_region_register_iommu_notifier(iommu->mr, &iommu->n, 888 &error_fatal); 889 } 890 } 891 892 static int vhost_virtqueue_set_addr(struct vhost_dev *dev, 893 struct vhost_virtqueue *vq, 894 unsigned idx, bool enable_log) 895 { 896 struct vhost_vring_addr addr; 897 int r; 898 memset(&addr, 0, sizeof(struct vhost_vring_addr)); 899 900 if (dev->vhost_ops->vhost_vq_get_addr) { 901 r = dev->vhost_ops->vhost_vq_get_addr(dev, &addr, vq); 902 if (r < 0) { 903 VHOST_OPS_DEBUG(r, "vhost_vq_get_addr failed"); 904 return r; 905 } 906 } else { 907 addr.desc_user_addr = (uint64_t)(unsigned long)vq->desc; 908 addr.avail_user_addr = (uint64_t)(unsigned long)vq->avail; 909 addr.used_user_addr = (uint64_t)(unsigned long)vq->used; 910 } 911 addr.index = idx; 912 addr.log_guest_addr = vq->used_phys; 913 addr.flags = enable_log ? 
(1 << VHOST_VRING_F_LOG) : 0; 914 r = dev->vhost_ops->vhost_set_vring_addr(dev, &addr); 915 if (r < 0) { 916 VHOST_OPS_DEBUG(r, "vhost_set_vring_addr failed"); 917 } 918 return r; 919 } 920 921 static int vhost_dev_set_features(struct vhost_dev *dev, 922 bool enable_log) 923 { 924 uint64_t features = dev->acked_features; 925 int r; 926 if (enable_log) { 927 features |= 0x1ULL << VHOST_F_LOG_ALL; 928 } 929 if (!vhost_dev_has_iommu(dev)) { 930 features &= ~(0x1ULL << VIRTIO_F_IOMMU_PLATFORM); 931 } 932 if (dev->vhost_ops->vhost_force_iommu) { 933 if (dev->vhost_ops->vhost_force_iommu(dev) == true) { 934 features |= 0x1ULL << VIRTIO_F_IOMMU_PLATFORM; 935 } 936 } 937 r = dev->vhost_ops->vhost_set_features(dev, features); 938 if (r < 0) { 939 VHOST_OPS_DEBUG(r, "vhost_set_features failed"); 940 goto out; 941 } 942 if (dev->vhost_ops->vhost_set_backend_cap) { 943 r = dev->vhost_ops->vhost_set_backend_cap(dev); 944 if (r < 0) { 945 VHOST_OPS_DEBUG(r, "vhost_set_backend_cap failed"); 946 goto out; 947 } 948 } 949 950 out: 951 return r; 952 } 953 954 static int vhost_dev_set_log(struct vhost_dev *dev, bool enable_log) 955 { 956 int r, i, idx; 957 hwaddr addr; 958 959 r = vhost_dev_set_features(dev, enable_log); 960 if (r < 0) { 961 goto err_features; 962 } 963 for (i = 0; i < dev->nvqs; ++i) { 964 idx = dev->vhost_ops->vhost_get_vq_index(dev, dev->vq_index + i); 965 addr = virtio_queue_get_desc_addr(dev->vdev, idx); 966 if (!addr) { 967 /* 968 * The queue might not be ready for start. If this 969 * is the case there is no reason to continue the process. 970 * The similar logic is used by the vhost_virtqueue_start() 971 * routine. 972 */ 973 continue; 974 } 975 r = vhost_virtqueue_set_addr(dev, dev->vqs + i, idx, 976 enable_log); 977 if (r < 0) { 978 goto err_vq; 979 } 980 } 981 return 0; 982 err_vq: 983 for (; i >= 0; --i) { 984 idx = dev->vhost_ops->vhost_get_vq_index(dev, dev->vq_index + i); 985 addr = virtio_queue_get_desc_addr(dev->vdev, idx); 986 if (!addr) { 987 continue; 988 } 989 vhost_virtqueue_set_addr(dev, dev->vqs + i, idx, 990 dev->log_enabled); 991 } 992 vhost_dev_set_features(dev, dev->log_enabled); 993 err_features: 994 return r; 995 } 996 997 static int vhost_migration_log(MemoryListener *listener, bool enable) 998 { 999 struct vhost_dev *dev = container_of(listener, struct vhost_dev, 1000 memory_listener); 1001 int r; 1002 if (enable == dev->log_enabled) { 1003 return 0; 1004 } 1005 if (!dev->started) { 1006 dev->log_enabled = enable; 1007 return 0; 1008 } 1009 1010 r = 0; 1011 if (!enable) { 1012 r = vhost_dev_set_log(dev, false); 1013 if (r < 0) { 1014 goto check_dev_state; 1015 } 1016 vhost_log_put(dev, false); 1017 } else { 1018 vhost_dev_log_resize(dev, vhost_get_log_size(dev)); 1019 r = vhost_dev_set_log(dev, true); 1020 if (r < 0) { 1021 goto check_dev_state; 1022 } 1023 } 1024 1025 check_dev_state: 1026 dev->log_enabled = enable; 1027 /* 1028 * vhost-user-* devices could change their state during log 1029 * initialization due to disconnect. So check dev state after 1030 * vhost communication. 1031 */ 1032 if (!dev->started) { 1033 /* 1034 * Since device is in the stopped state, it is okay for 1035 * migration. Return success. 1036 */ 1037 r = 0; 1038 } 1039 if (r) { 1040 /* An error occurred. 
*/ 1041 dev->log_enabled = false; 1042 } 1043 1044 return r; 1045 } 1046 1047 static bool vhost_log_global_start(MemoryListener *listener, Error **errp) 1048 { 1049 int r; 1050 1051 r = vhost_migration_log(listener, true); 1052 if (r < 0) { 1053 abort(); 1054 } 1055 return true; 1056 } 1057 1058 static void vhost_log_global_stop(MemoryListener *listener) 1059 { 1060 int r; 1061 1062 r = vhost_migration_log(listener, false); 1063 if (r < 0) { 1064 abort(); 1065 } 1066 } 1067 1068 static void vhost_log_start(MemoryListener *listener, 1069 MemoryRegionSection *section, 1070 int old, int new) 1071 { 1072 /* FIXME: implement */ 1073 } 1074 1075 static void vhost_log_stop(MemoryListener *listener, 1076 MemoryRegionSection *section, 1077 int old, int new) 1078 { 1079 /* FIXME: implement */ 1080 } 1081 1082 /* The vhost driver natively knows how to handle the vrings of non 1083 * cross-endian legacy devices and modern devices. Only legacy devices 1084 * exposed to a bi-endian guest may require the vhost driver to use a 1085 * specific endianness. 1086 */ 1087 static inline bool vhost_needs_vring_endian(VirtIODevice *vdev) 1088 { 1089 if (virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) { 1090 return false; 1091 } 1092 #if HOST_BIG_ENDIAN 1093 return vdev->device_endian == VIRTIO_DEVICE_ENDIAN_LITTLE; 1094 #else 1095 return vdev->device_endian == VIRTIO_DEVICE_ENDIAN_BIG; 1096 #endif 1097 } 1098 1099 static int vhost_virtqueue_set_vring_endian_legacy(struct vhost_dev *dev, 1100 bool is_big_endian, 1101 int vhost_vq_index) 1102 { 1103 int r; 1104 struct vhost_vring_state s = { 1105 .index = vhost_vq_index, 1106 .num = is_big_endian 1107 }; 1108 1109 r = dev->vhost_ops->vhost_set_vring_endian(dev, &s); 1110 if (r < 0) { 1111 VHOST_OPS_DEBUG(r, "vhost_set_vring_endian failed"); 1112 } 1113 return r; 1114 } 1115 1116 static int vhost_memory_region_lookup(struct vhost_dev *hdev, 1117 uint64_t gpa, uint64_t *uaddr, 1118 uint64_t *len) 1119 { 1120 int i; 1121 1122 for (i = 0; i < hdev->mem->nregions; i++) { 1123 struct vhost_memory_region *reg = hdev->mem->regions + i; 1124 1125 if (gpa >= reg->guest_phys_addr && 1126 reg->guest_phys_addr + reg->memory_size > gpa) { 1127 *uaddr = reg->userspace_addr + gpa - reg->guest_phys_addr; 1128 *len = reg->guest_phys_addr + reg->memory_size - gpa; 1129 return 0; 1130 } 1131 } 1132 1133 return -EFAULT; 1134 } 1135 1136 int vhost_device_iotlb_miss(struct vhost_dev *dev, uint64_t iova, int write) 1137 { 1138 IOMMUTLBEntry iotlb; 1139 uint64_t uaddr, len; 1140 int ret = -EFAULT; 1141 1142 RCU_READ_LOCK_GUARD(); 1143 1144 trace_vhost_iotlb_miss(dev, 1); 1145 1146 iotlb = address_space_get_iotlb_entry(dev->vdev->dma_as, 1147 iova, write, 1148 MEMTXATTRS_UNSPECIFIED); 1149 if (iotlb.target_as != NULL) { 1150 ret = vhost_memory_region_lookup(dev, iotlb.translated_addr, 1151 &uaddr, &len); 1152 if (ret) { 1153 trace_vhost_iotlb_miss(dev, 3); 1154 error_report("Fail to lookup the translated address " 1155 "%"PRIx64, iotlb.translated_addr); 1156 goto out; 1157 } 1158 1159 len = MIN(iotlb.addr_mask + 1, len); 1160 iova = iova & ~iotlb.addr_mask; 1161 1162 ret = vhost_backend_update_device_iotlb(dev, iova, uaddr, 1163 len, iotlb.perm); 1164 if (ret) { 1165 trace_vhost_iotlb_miss(dev, 4); 1166 error_report("Fail to update device iotlb"); 1167 goto out; 1168 } 1169 } 1170 1171 trace_vhost_iotlb_miss(dev, 2); 1172 1173 out: 1174 return ret; 1175 } 1176 1177 int vhost_virtqueue_start(struct vhost_dev *dev, 1178 struct VirtIODevice *vdev, 1179 struct vhost_virtqueue *vq, 1180 
unsigned idx) 1181 { 1182 BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev))); 1183 VirtioBusState *vbus = VIRTIO_BUS(qbus); 1184 VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(vbus); 1185 hwaddr s, l, a; 1186 int r; 1187 int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, idx); 1188 struct vhost_vring_file file = { 1189 .index = vhost_vq_index 1190 }; 1191 struct vhost_vring_state state = { 1192 .index = vhost_vq_index 1193 }; 1194 struct VirtQueue *vvq = virtio_get_queue(vdev, idx); 1195 1196 a = virtio_queue_get_desc_addr(vdev, idx); 1197 if (a == 0) { 1198 /* Queue might not be ready for start */ 1199 return 0; 1200 } 1201 1202 vq->num = state.num = virtio_queue_get_num(vdev, idx); 1203 r = dev->vhost_ops->vhost_set_vring_num(dev, &state); 1204 if (r) { 1205 VHOST_OPS_DEBUG(r, "vhost_set_vring_num failed"); 1206 return r; 1207 } 1208 1209 state.num = virtio_queue_get_last_avail_idx(vdev, idx); 1210 r = dev->vhost_ops->vhost_set_vring_base(dev, &state); 1211 if (r) { 1212 VHOST_OPS_DEBUG(r, "vhost_set_vring_base failed"); 1213 return r; 1214 } 1215 1216 if (vhost_needs_vring_endian(vdev)) { 1217 r = vhost_virtqueue_set_vring_endian_legacy(dev, 1218 virtio_is_big_endian(vdev), 1219 vhost_vq_index); 1220 if (r) { 1221 return r; 1222 } 1223 } 1224 1225 vq->desc_size = s = l = virtio_queue_get_desc_size(vdev, idx); 1226 vq->desc_phys = a; 1227 vq->desc = vhost_memory_map(dev, a, &l, false); 1228 if (!vq->desc || l != s) { 1229 r = -ENOMEM; 1230 goto fail_alloc_desc; 1231 } 1232 vq->avail_size = s = l = virtio_queue_get_avail_size(vdev, idx); 1233 vq->avail_phys = a = virtio_queue_get_avail_addr(vdev, idx); 1234 vq->avail = vhost_memory_map(dev, a, &l, false); 1235 if (!vq->avail || l != s) { 1236 r = -ENOMEM; 1237 goto fail_alloc_avail; 1238 } 1239 vq->used_size = s = l = virtio_queue_get_used_size(vdev, idx); 1240 vq->used_phys = a = virtio_queue_get_used_addr(vdev, idx); 1241 vq->used = vhost_memory_map(dev, a, &l, true); 1242 if (!vq->used || l != s) { 1243 r = -ENOMEM; 1244 goto fail_alloc_used; 1245 } 1246 1247 r = vhost_virtqueue_set_addr(dev, vq, vhost_vq_index, dev->log_enabled); 1248 if (r < 0) { 1249 goto fail_alloc; 1250 } 1251 1252 file.fd = event_notifier_get_fd(virtio_queue_get_host_notifier(vvq)); 1253 r = dev->vhost_ops->vhost_set_vring_kick(dev, &file); 1254 if (r) { 1255 VHOST_OPS_DEBUG(r, "vhost_set_vring_kick failed"); 1256 goto fail_kick; 1257 } 1258 1259 /* Clear and discard previous events if any. */ 1260 event_notifier_test_and_clear(&vq->masked_notifier); 1261 1262 /* Init vring in unmasked state, unless guest_notifier_mask 1263 * will do it later. 1264 */ 1265 if (!vdev->use_guest_notifier_mask) { 1266 /* TODO: check and handle errors. 
*/ 1267 vhost_virtqueue_mask(dev, vdev, idx, false); 1268 } 1269 1270 if (k->query_guest_notifiers && 1271 k->query_guest_notifiers(qbus->parent) && 1272 virtio_queue_vector(vdev, idx) == VIRTIO_NO_VECTOR) { 1273 file.fd = -1; 1274 r = dev->vhost_ops->vhost_set_vring_call(dev, &file); 1275 if (r) { 1276 goto fail_vector; 1277 } 1278 } 1279 1280 return 0; 1281 1282 fail_vector: 1283 fail_kick: 1284 fail_alloc: 1285 vhost_memory_unmap(dev, vq->used, virtio_queue_get_used_size(vdev, idx), 1286 0, 0); 1287 fail_alloc_used: 1288 vhost_memory_unmap(dev, vq->avail, virtio_queue_get_avail_size(vdev, idx), 1289 0, 0); 1290 fail_alloc_avail: 1291 vhost_memory_unmap(dev, vq->desc, virtio_queue_get_desc_size(vdev, idx), 1292 0, 0); 1293 fail_alloc_desc: 1294 return r; 1295 } 1296 1297 void vhost_virtqueue_stop(struct vhost_dev *dev, 1298 struct VirtIODevice *vdev, 1299 struct vhost_virtqueue *vq, 1300 unsigned idx) 1301 { 1302 int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, idx); 1303 struct vhost_vring_state state = { 1304 .index = vhost_vq_index, 1305 }; 1306 int r; 1307 1308 if (virtio_queue_get_desc_addr(vdev, idx) == 0) { 1309 /* Don't stop the virtqueue which might have not been started */ 1310 return; 1311 } 1312 1313 r = dev->vhost_ops->vhost_get_vring_base(dev, &state); 1314 if (r < 0) { 1315 VHOST_OPS_DEBUG(r, "vhost VQ %u ring restore failed: %d", idx, r); 1316 /* Connection to the backend is broken, so let's sync internal 1317 * last avail idx to the device used idx. 1318 */ 1319 virtio_queue_restore_last_avail_idx(vdev, idx); 1320 } else { 1321 virtio_queue_set_last_avail_idx(vdev, idx, state.num); 1322 } 1323 virtio_queue_invalidate_signalled_used(vdev, idx); 1324 virtio_queue_update_used_idx(vdev, idx); 1325 1326 /* In the cross-endian case, we need to reset the vring endianness to 1327 * native as legacy devices expect so by default. 
1328 */ 1329 if (vhost_needs_vring_endian(vdev)) { 1330 vhost_virtqueue_set_vring_endian_legacy(dev, 1331 !virtio_is_big_endian(vdev), 1332 vhost_vq_index); 1333 } 1334 1335 vhost_memory_unmap(dev, vq->used, virtio_queue_get_used_size(vdev, idx), 1336 1, virtio_queue_get_used_size(vdev, idx)); 1337 vhost_memory_unmap(dev, vq->avail, virtio_queue_get_avail_size(vdev, idx), 1338 0, virtio_queue_get_avail_size(vdev, idx)); 1339 vhost_memory_unmap(dev, vq->desc, virtio_queue_get_desc_size(vdev, idx), 1340 0, virtio_queue_get_desc_size(vdev, idx)); 1341 } 1342 1343 static int vhost_virtqueue_set_busyloop_timeout(struct vhost_dev *dev, 1344 int n, uint32_t timeout) 1345 { 1346 int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, n); 1347 struct vhost_vring_state state = { 1348 .index = vhost_vq_index, 1349 .num = timeout, 1350 }; 1351 int r; 1352 1353 if (!dev->vhost_ops->vhost_set_vring_busyloop_timeout) { 1354 return -EINVAL; 1355 } 1356 1357 r = dev->vhost_ops->vhost_set_vring_busyloop_timeout(dev, &state); 1358 if (r) { 1359 VHOST_OPS_DEBUG(r, "vhost_set_vring_busyloop_timeout failed"); 1360 return r; 1361 } 1362 1363 return 0; 1364 } 1365 1366 static void vhost_virtqueue_error_notifier(EventNotifier *n) 1367 { 1368 struct vhost_virtqueue *vq = container_of(n, struct vhost_virtqueue, 1369 error_notifier); 1370 struct vhost_dev *dev = vq->dev; 1371 int index = vq - dev->vqs; 1372 1373 if (event_notifier_test_and_clear(n) && dev->vdev) { 1374 VHOST_OPS_DEBUG(-EINVAL, "vhost vring error in virtqueue %d", 1375 dev->vq_index + index); 1376 } 1377 } 1378 1379 static int vhost_virtqueue_init(struct vhost_dev *dev, 1380 struct vhost_virtqueue *vq, int n) 1381 { 1382 int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, n); 1383 struct vhost_vring_file file = { 1384 .index = vhost_vq_index, 1385 }; 1386 int r = event_notifier_init(&vq->masked_notifier, 0); 1387 if (r < 0) { 1388 return r; 1389 } 1390 1391 file.fd = event_notifier_get_wfd(&vq->masked_notifier); 1392 r = dev->vhost_ops->vhost_set_vring_call(dev, &file); 1393 if (r) { 1394 VHOST_OPS_DEBUG(r, "vhost_set_vring_call failed"); 1395 goto fail_call; 1396 } 1397 1398 vq->dev = dev; 1399 1400 if (dev->vhost_ops->vhost_set_vring_err) { 1401 r = event_notifier_init(&vq->error_notifier, 0); 1402 if (r < 0) { 1403 goto fail_call; 1404 } 1405 1406 file.fd = event_notifier_get_fd(&vq->error_notifier); 1407 r = dev->vhost_ops->vhost_set_vring_err(dev, &file); 1408 if (r) { 1409 VHOST_OPS_DEBUG(r, "vhost_set_vring_err failed"); 1410 goto fail_err; 1411 } 1412 1413 event_notifier_set_handler(&vq->error_notifier, 1414 vhost_virtqueue_error_notifier); 1415 } 1416 1417 return 0; 1418 1419 fail_err: 1420 event_notifier_cleanup(&vq->error_notifier); 1421 fail_call: 1422 event_notifier_cleanup(&vq->masked_notifier); 1423 return r; 1424 } 1425 1426 static void vhost_virtqueue_cleanup(struct vhost_virtqueue *vq) 1427 { 1428 event_notifier_cleanup(&vq->masked_notifier); 1429 if (vq->dev->vhost_ops->vhost_set_vring_err) { 1430 event_notifier_set_handler(&vq->error_notifier, NULL); 1431 event_notifier_cleanup(&vq->error_notifier); 1432 } 1433 } 1434 1435 int vhost_dev_init(struct vhost_dev *hdev, void *opaque, 1436 VhostBackendType backend_type, uint32_t busyloop_timeout, 1437 Error **errp) 1438 { 1439 unsigned int used, reserved, limit; 1440 uint64_t features; 1441 int i, r, n_initialized_vqs = 0; 1442 1443 hdev->vdev = NULL; 1444 hdev->migration_blocker = NULL; 1445 1446 r = vhost_set_backend_type(hdev, backend_type); 1447 assert(r >= 0); 1448 
1449 r = hdev->vhost_ops->vhost_backend_init(hdev, opaque, errp); 1450 if (r < 0) { 1451 goto fail; 1452 } 1453 1454 r = hdev->vhost_ops->vhost_set_owner(hdev); 1455 if (r < 0) { 1456 error_setg_errno(errp, -r, "vhost_set_owner failed"); 1457 goto fail; 1458 } 1459 1460 r = hdev->vhost_ops->vhost_get_features(hdev, &features); 1461 if (r < 0) { 1462 error_setg_errno(errp, -r, "vhost_get_features failed"); 1463 goto fail; 1464 } 1465 1466 limit = hdev->vhost_ops->vhost_backend_memslots_limit(hdev); 1467 if (limit < MEMORY_DEVICES_SAFE_MAX_MEMSLOTS && 1468 memory_devices_memslot_auto_decision_active()) { 1469 error_setg(errp, "some memory device (like virtio-mem)" 1470 " decided how many memory slots to use based on the overall" 1471 " number of memory slots; this vhost backend would further" 1472 " restricts the overall number of memory slots"); 1473 error_append_hint(errp, "Try plugging this vhost backend before" 1474 " plugging such memory devices.\n"); 1475 r = -EINVAL; 1476 goto fail; 1477 } 1478 1479 for (i = 0; i < hdev->nvqs; ++i, ++n_initialized_vqs) { 1480 r = vhost_virtqueue_init(hdev, hdev->vqs + i, hdev->vq_index + i); 1481 if (r < 0) { 1482 error_setg_errno(errp, -r, "Failed to initialize virtqueue %d", i); 1483 goto fail; 1484 } 1485 } 1486 1487 if (busyloop_timeout) { 1488 for (i = 0; i < hdev->nvqs; ++i) { 1489 r = vhost_virtqueue_set_busyloop_timeout(hdev, hdev->vq_index + i, 1490 busyloop_timeout); 1491 if (r < 0) { 1492 error_setg_errno(errp, -r, "Failed to set busyloop timeout"); 1493 goto fail_busyloop; 1494 } 1495 } 1496 } 1497 1498 hdev->features = features; 1499 1500 hdev->memory_listener = (MemoryListener) { 1501 .name = "vhost", 1502 .begin = vhost_begin, 1503 .commit = vhost_commit, 1504 .region_add = vhost_region_addnop, 1505 .region_nop = vhost_region_addnop, 1506 .log_start = vhost_log_start, 1507 .log_stop = vhost_log_stop, 1508 .log_sync = vhost_log_sync, 1509 .log_global_start = vhost_log_global_start, 1510 .log_global_stop = vhost_log_global_stop, 1511 .priority = MEMORY_LISTENER_PRIORITY_DEV_BACKEND 1512 }; 1513 1514 hdev->iommu_listener = (MemoryListener) { 1515 .name = "vhost-iommu", 1516 .region_add = vhost_iommu_region_add, 1517 .region_del = vhost_iommu_region_del, 1518 }; 1519 1520 if (hdev->migration_blocker == NULL) { 1521 if (!(hdev->features & (0x1ULL << VHOST_F_LOG_ALL))) { 1522 error_setg(&hdev->migration_blocker, 1523 "Migration disabled: vhost lacks VHOST_F_LOG_ALL feature."); 1524 } else if (vhost_dev_log_is_shared(hdev) && !qemu_memfd_alloc_check()) { 1525 error_setg(&hdev->migration_blocker, 1526 "Migration disabled: failed to allocate shared memory"); 1527 } 1528 } 1529 1530 if (hdev->migration_blocker != NULL) { 1531 r = migrate_add_blocker_normal(&hdev->migration_blocker, errp); 1532 if (r < 0) { 1533 goto fail_busyloop; 1534 } 1535 } 1536 1537 hdev->mem = g_malloc0(offsetof(struct vhost_memory, regions)); 1538 hdev->n_mem_sections = 0; 1539 hdev->mem_sections = NULL; 1540 hdev->log = NULL; 1541 hdev->log_size = 0; 1542 hdev->log_enabled = false; 1543 hdev->started = false; 1544 memory_listener_register(&hdev->memory_listener, &address_space_memory); 1545 QLIST_INSERT_HEAD(&vhost_devices, hdev, entry); 1546 1547 /* 1548 * The listener we registered properly updated the corresponding counter. 1549 * So we can trust that these values are accurate. 
1550 */ 1551 if (hdev->vhost_ops->vhost_backend_no_private_memslots && 1552 hdev->vhost_ops->vhost_backend_no_private_memslots(hdev)) { 1553 used = used_shared_memslots; 1554 } else { 1555 used = used_memslots; 1556 } 1557 /* 1558 * We assume that all reserved memslots actually require a real memslot 1559 * in our vhost backend. This might not be true, for example, if the 1560 * memslot would be ROM. If ever relevant, we can optimize for that -- 1561 * but we'll need additional information about the reservations. 1562 */ 1563 reserved = memory_devices_get_reserved_memslots(); 1564 if (used + reserved > limit) { 1565 error_setg(errp, "vhost backend memory slots limit (%d) is less" 1566 " than current number of used (%d) and reserved (%d)" 1567 " memory slots for memory devices.", limit, used, reserved); 1568 r = -EINVAL; 1569 goto fail_busyloop; 1570 } 1571 1572 return 0; 1573 1574 fail_busyloop: 1575 if (busyloop_timeout) { 1576 while (--i >= 0) { 1577 vhost_virtqueue_set_busyloop_timeout(hdev, hdev->vq_index + i, 0); 1578 } 1579 } 1580 fail: 1581 hdev->nvqs = n_initialized_vqs; 1582 vhost_dev_cleanup(hdev); 1583 return r; 1584 } 1585 1586 void vhost_dev_cleanup(struct vhost_dev *hdev) 1587 { 1588 int i; 1589 1590 trace_vhost_dev_cleanup(hdev); 1591 1592 for (i = 0; i < hdev->nvqs; ++i) { 1593 vhost_virtqueue_cleanup(hdev->vqs + i); 1594 } 1595 if (hdev->mem) { 1596 /* those are only safe after successful init */ 1597 memory_listener_unregister(&hdev->memory_listener); 1598 QLIST_REMOVE(hdev, entry); 1599 } 1600 migrate_del_blocker(&hdev->migration_blocker); 1601 g_free(hdev->mem); 1602 g_free(hdev->mem_sections); 1603 if (hdev->vhost_ops) { 1604 hdev->vhost_ops->vhost_backend_cleanup(hdev); 1605 } 1606 assert(!hdev->log); 1607 1608 memset(hdev, 0, sizeof(struct vhost_dev)); 1609 } 1610 1611 static void vhost_dev_disable_notifiers_nvqs(struct vhost_dev *hdev, 1612 VirtIODevice *vdev, 1613 unsigned int nvqs) 1614 { 1615 BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev))); 1616 int i, r; 1617 1618 /* 1619 * Batch all the host notifiers in a single transaction to avoid 1620 * quadratic time complexity in address_space_update_ioeventfds(). 1621 */ 1622 memory_region_transaction_begin(); 1623 1624 for (i = 0; i < nvqs; ++i) { 1625 r = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i, 1626 false); 1627 if (r < 0) { 1628 error_report("vhost VQ %d notifier cleanup failed: %d", i, -r); 1629 } 1630 assert(r >= 0); 1631 } 1632 1633 /* 1634 * The transaction expects the ioeventfds to be open when it 1635 * commits. Do it now, before the cleanup loop. 1636 */ 1637 memory_region_transaction_commit(); 1638 1639 for (i = 0; i < nvqs; ++i) { 1640 virtio_bus_cleanup_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i); 1641 } 1642 virtio_device_release_ioeventfd(vdev); 1643 } 1644 1645 /* Stop processing guest IO notifications in qemu. 1646 * Start processing them in vhost in kernel. 1647 */ 1648 int vhost_dev_enable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev) 1649 { 1650 BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev))); 1651 int i, r; 1652 1653 /* We will pass the notifiers to the kernel, make sure that QEMU 1654 * doesn't interfere. 1655 */ 1656 r = virtio_device_grab_ioeventfd(vdev); 1657 if (r < 0) { 1658 error_report("binding does not support host notifiers"); 1659 return r; 1660 } 1661 1662 /* 1663 * Batch all the host notifiers in a single transaction to avoid 1664 * quadratic time complexity in address_space_update_ioeventfds(). 
1665 */ 1666 memory_region_transaction_begin(); 1667 1668 for (i = 0; i < hdev->nvqs; ++i) { 1669 r = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i, 1670 true); 1671 if (r < 0) { 1672 error_report("vhost VQ %d notifier binding failed: %d", i, -r); 1673 memory_region_transaction_commit(); 1674 vhost_dev_disable_notifiers_nvqs(hdev, vdev, i); 1675 return r; 1676 } 1677 } 1678 1679 memory_region_transaction_commit(); 1680 1681 return 0; 1682 } 1683 1684 /* Stop processing guest IO notifications in vhost. 1685 * Start processing them in qemu. 1686 * This might actually run the qemu handlers right away, 1687 * so virtio in qemu must be completely setup when this is called. 1688 */ 1689 void vhost_dev_disable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev) 1690 { 1691 vhost_dev_disable_notifiers_nvqs(hdev, vdev, hdev->nvqs); 1692 } 1693 1694 /* Test and clear event pending status. 1695 * Should be called after unmask to avoid losing events. 1696 */ 1697 bool vhost_virtqueue_pending(struct vhost_dev *hdev, int n) 1698 { 1699 struct vhost_virtqueue *vq = hdev->vqs + n - hdev->vq_index; 1700 assert(n >= hdev->vq_index && n < hdev->vq_index + hdev->nvqs); 1701 return event_notifier_test_and_clear(&vq->masked_notifier); 1702 } 1703 1704 /* Mask/unmask events from this vq. */ 1705 void vhost_virtqueue_mask(struct vhost_dev *hdev, VirtIODevice *vdev, int n, 1706 bool mask) 1707 { 1708 struct VirtQueue *vvq = virtio_get_queue(vdev, n); 1709 int r, index = n - hdev->vq_index; 1710 struct vhost_vring_file file; 1711 1712 /* should only be called after backend is connected */ 1713 assert(hdev->vhost_ops); 1714 1715 if (mask) { 1716 assert(vdev->use_guest_notifier_mask); 1717 file.fd = event_notifier_get_wfd(&hdev->vqs[index].masked_notifier); 1718 } else { 1719 file.fd = event_notifier_get_wfd(virtio_queue_get_guest_notifier(vvq)); 1720 } 1721 1722 file.index = hdev->vhost_ops->vhost_get_vq_index(hdev, n); 1723 r = hdev->vhost_ops->vhost_set_vring_call(hdev, &file); 1724 if (r < 0) { 1725 error_report("vhost_set_vring_call failed %d", -r); 1726 } 1727 } 1728 1729 bool vhost_config_pending(struct vhost_dev *hdev) 1730 { 1731 assert(hdev->vhost_ops); 1732 if ((hdev->started == false) || 1733 (hdev->vhost_ops->vhost_set_config_call == NULL)) { 1734 return false; 1735 } 1736 1737 EventNotifier *notifier = 1738 &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier; 1739 return event_notifier_test_and_clear(notifier); 1740 } 1741 1742 void vhost_config_mask(struct vhost_dev *hdev, VirtIODevice *vdev, bool mask) 1743 { 1744 int fd; 1745 int r; 1746 EventNotifier *notifier = 1747 &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier; 1748 EventNotifier *config_notifier = &vdev->config_notifier; 1749 assert(hdev->vhost_ops); 1750 1751 if ((hdev->started == false) || 1752 (hdev->vhost_ops->vhost_set_config_call == NULL)) { 1753 return; 1754 } 1755 if (mask) { 1756 assert(vdev->use_guest_notifier_mask); 1757 fd = event_notifier_get_fd(notifier); 1758 } else { 1759 fd = event_notifier_get_fd(config_notifier); 1760 } 1761 r = hdev->vhost_ops->vhost_set_config_call(hdev, fd); 1762 if (r < 0) { 1763 error_report("vhost_set_config_call failed %d", -r); 1764 } 1765 } 1766 1767 static void vhost_stop_config_intr(struct vhost_dev *dev) 1768 { 1769 int fd = -1; 1770 assert(dev->vhost_ops); 1771 if (dev->vhost_ops->vhost_set_config_call) { 1772 dev->vhost_ops->vhost_set_config_call(dev, fd); 1773 } 1774 } 1775 1776 static void vhost_start_config_intr(struct vhost_dev *dev) 1777 { 1778 
int r; 1779 1780 assert(dev->vhost_ops); 1781 int fd = event_notifier_get_fd(&dev->vdev->config_notifier); 1782 if (dev->vhost_ops->vhost_set_config_call) { 1783 r = dev->vhost_ops->vhost_set_config_call(dev, fd); 1784 if (!r) { 1785 event_notifier_set(&dev->vdev->config_notifier); 1786 } 1787 } 1788 } 1789 1790 uint64_t vhost_get_features(struct vhost_dev *hdev, const int *feature_bits, 1791 uint64_t features) 1792 { 1793 const int *bit = feature_bits; 1794 while (*bit != VHOST_INVALID_FEATURE_BIT) { 1795 uint64_t bit_mask = (1ULL << *bit); 1796 if (!(hdev->features & bit_mask)) { 1797 features &= ~bit_mask; 1798 } 1799 bit++; 1800 } 1801 return features; 1802 } 1803 1804 void vhost_ack_features(struct vhost_dev *hdev, const int *feature_bits, 1805 uint64_t features) 1806 { 1807 const int *bit = feature_bits; 1808 while (*bit != VHOST_INVALID_FEATURE_BIT) { 1809 uint64_t bit_mask = (1ULL << *bit); 1810 if (features & bit_mask) { 1811 hdev->acked_features |= bit_mask; 1812 } 1813 bit++; 1814 } 1815 } 1816 1817 int vhost_dev_get_config(struct vhost_dev *hdev, uint8_t *config, 1818 uint32_t config_len, Error **errp) 1819 { 1820 assert(hdev->vhost_ops); 1821 1822 if (hdev->vhost_ops->vhost_get_config) { 1823 return hdev->vhost_ops->vhost_get_config(hdev, config, config_len, 1824 errp); 1825 } 1826 1827 error_setg(errp, "vhost_get_config not implemented"); 1828 return -ENOSYS; 1829 } 1830 1831 int vhost_dev_set_config(struct vhost_dev *hdev, const uint8_t *data, 1832 uint32_t offset, uint32_t size, uint32_t flags) 1833 { 1834 assert(hdev->vhost_ops); 1835 1836 if (hdev->vhost_ops->vhost_set_config) { 1837 return hdev->vhost_ops->vhost_set_config(hdev, data, offset, 1838 size, flags); 1839 } 1840 1841 return -ENOSYS; 1842 } 1843 1844 void vhost_dev_set_config_notifier(struct vhost_dev *hdev, 1845 const VhostDevConfigOps *ops) 1846 { 1847 hdev->config_ops = ops; 1848 } 1849 1850 void vhost_dev_free_inflight(struct vhost_inflight *inflight) 1851 { 1852 if (inflight && inflight->addr) { 1853 qemu_memfd_free(inflight->addr, inflight->size, inflight->fd); 1854 inflight->addr = NULL; 1855 inflight->fd = -1; 1856 } 1857 } 1858 1859 static int vhost_dev_resize_inflight(struct vhost_inflight *inflight, 1860 uint64_t new_size) 1861 { 1862 Error *err = NULL; 1863 int fd = -1; 1864 void *addr = qemu_memfd_alloc("vhost-inflight", new_size, 1865 F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL, 1866 &fd, &err); 1867 1868 if (err) { 1869 error_report_err(err); 1870 return -ENOMEM; 1871 } 1872 1873 vhost_dev_free_inflight(inflight); 1874 inflight->offset = 0; 1875 inflight->addr = addr; 1876 inflight->fd = fd; 1877 inflight->size = new_size; 1878 1879 return 0; 1880 } 1881 1882 void vhost_dev_save_inflight(struct vhost_inflight *inflight, QEMUFile *f) 1883 { 1884 if (inflight->addr) { 1885 qemu_put_be64(f, inflight->size); 1886 qemu_put_be16(f, inflight->queue_size); 1887 qemu_put_buffer(f, inflight->addr, inflight->size); 1888 } else { 1889 qemu_put_be64(f, 0); 1890 } 1891 } 1892 1893 int vhost_dev_load_inflight(struct vhost_inflight *inflight, QEMUFile *f) 1894 { 1895 uint64_t size; 1896 1897 size = qemu_get_be64(f); 1898 if (!size) { 1899 return 0; 1900 } 1901 1902 if (inflight->size != size) { 1903 int ret = vhost_dev_resize_inflight(inflight, size); 1904 if (ret < 0) { 1905 return ret; 1906 } 1907 } 1908 inflight->queue_size = qemu_get_be16(f); 1909 1910 qemu_get_buffer(f, inflight->addr, size); 1911 1912 return 0; 1913 } 1914 1915 int vhost_dev_prepare_inflight(struct vhost_dev *hdev, VirtIODevice *vdev) 1916 { 
1917 int r; 1918 1919 if (hdev->vhost_ops->vhost_get_inflight_fd == NULL || 1920 hdev->vhost_ops->vhost_set_inflight_fd == NULL) { 1921 return 0; 1922 } 1923 1924 hdev->vdev = vdev; 1925 1926 r = vhost_dev_set_features(hdev, hdev->log_enabled); 1927 if (r < 0) { 1928 VHOST_OPS_DEBUG(r, "vhost_dev_prepare_inflight failed"); 1929 return r; 1930 } 1931 1932 return 0; 1933 } 1934 1935 int vhost_dev_set_inflight(struct vhost_dev *dev, 1936 struct vhost_inflight *inflight) 1937 { 1938 int r; 1939 1940 if (dev->vhost_ops->vhost_set_inflight_fd && inflight->addr) { 1941 r = dev->vhost_ops->vhost_set_inflight_fd(dev, inflight); 1942 if (r) { 1943 VHOST_OPS_DEBUG(r, "vhost_set_inflight_fd failed"); 1944 return r; 1945 } 1946 } 1947 1948 return 0; 1949 } 1950 1951 int vhost_dev_get_inflight(struct vhost_dev *dev, uint16_t queue_size, 1952 struct vhost_inflight *inflight) 1953 { 1954 int r; 1955 1956 if (dev->vhost_ops->vhost_get_inflight_fd) { 1957 r = dev->vhost_ops->vhost_get_inflight_fd(dev, queue_size, inflight); 1958 if (r) { 1959 VHOST_OPS_DEBUG(r, "vhost_get_inflight_fd failed"); 1960 return r; 1961 } 1962 } 1963 1964 return 0; 1965 } 1966 1967 static int vhost_dev_set_vring_enable(struct vhost_dev *hdev, int enable) 1968 { 1969 if (!hdev->vhost_ops->vhost_set_vring_enable) { 1970 return 0; 1971 } 1972 1973 /* 1974 * For vhost-user devices, if VHOST_USER_F_PROTOCOL_FEATURES has not 1975 * been negotiated, the rings start directly in the enabled state, and 1976 * .vhost_set_vring_enable callback will fail since 1977 * VHOST_USER_SET_VRING_ENABLE is not supported. 1978 */ 1979 if (hdev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER && 1980 !virtio_has_feature(hdev->backend_features, 1981 VHOST_USER_F_PROTOCOL_FEATURES)) { 1982 return 0; 1983 } 1984 1985 return hdev->vhost_ops->vhost_set_vring_enable(hdev, enable); 1986 } 1987 1988 /* 1989 * Host notifiers must be enabled at this point. 1990 * 1991 * If @vrings is true, this function will enable all vrings before starting the 1992 * device. If it is false, the vring initialization is left to be done by the 1993 * caller. 
/*
 * Host notifiers must be enabled at this point.
 *
 * If @vrings is true, this function will enable all vrings before starting the
 * device. If it is false, the vring initialization is left to be done by the
 * caller.
 */
int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings)
{
    int i, r;

    /* should only be called after backend is connected */
    assert(hdev->vhost_ops);

    trace_vhost_dev_start(hdev, vdev->name, vrings);

    vdev->vhost_started = true;
    hdev->started = true;
    hdev->vdev = vdev;

    r = vhost_dev_set_features(hdev, hdev->log_enabled);
    if (r < 0) {
        goto fail_features;
    }

    if (vhost_dev_has_iommu(hdev)) {
        memory_listener_register(&hdev->iommu_listener, vdev->dma_as);
    }

    r = hdev->vhost_ops->vhost_set_mem_table(hdev, hdev->mem);
    if (r < 0) {
        VHOST_OPS_DEBUG(r, "vhost_set_mem_table failed");
        goto fail_mem;
    }
    for (i = 0; i < hdev->nvqs; ++i) {
        r = vhost_virtqueue_start(hdev,
                                  vdev,
                                  hdev->vqs + i,
                                  hdev->vq_index + i);
        if (r < 0) {
            goto fail_vq;
        }
    }

    r = event_notifier_init(
        &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier, 0);
    if (r < 0) {
        VHOST_OPS_DEBUG(r, "event_notifier_init failed");
        goto fail_vq;
    }
    event_notifier_test_and_clear(
        &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier);
    if (!vdev->use_guest_notifier_mask) {
        vhost_config_mask(hdev, vdev, true);
    }
    if (hdev->log_enabled) {
        uint64_t log_base;

        hdev->log_size = vhost_get_log_size(hdev);
        hdev->log = vhost_log_get(hdev->log_size,
                                  vhost_dev_log_is_shared(hdev));
        log_base = (uintptr_t)hdev->log->log;
        r = hdev->vhost_ops->vhost_set_log_base(hdev,
                                                hdev->log_size ? log_base : 0,
                                                hdev->log);
        if (r < 0) {
            VHOST_OPS_DEBUG(r, "vhost_set_log_base failed");
            goto fail_log;
        }
    }
    if (vrings) {
        r = vhost_dev_set_vring_enable(hdev, true);
        if (r) {
            goto fail_log;
        }
    }
    if (hdev->vhost_ops->vhost_dev_start) {
        r = hdev->vhost_ops->vhost_dev_start(hdev, true);
        if (r) {
            goto fail_start;
        }
    }
    if (vhost_dev_has_iommu(hdev) &&
        hdev->vhost_ops->vhost_set_iotlb_callback) {
        hdev->vhost_ops->vhost_set_iotlb_callback(hdev, true);

        /*
         * Update used ring information for IOTLB to work correctly;
         * the vhost-kernel code requires this.
         */
        for (i = 0; i < hdev->nvqs; ++i) {
            struct vhost_virtqueue *vq = hdev->vqs + i;
            vhost_device_iotlb_miss(hdev, vq->used_phys, true);
        }
    }
    vhost_start_config_intr(hdev);
    return 0;
fail_start:
    if (vrings) {
        vhost_dev_set_vring_enable(hdev, false);
    }
fail_log:
    vhost_log_put(hdev, false);
fail_vq:
    while (--i >= 0) {
        vhost_virtqueue_stop(hdev,
                             vdev,
                             hdev->vqs + i,
                             hdev->vq_index + i);
    }

fail_mem:
    if (vhost_dev_has_iommu(hdev)) {
        memory_listener_unregister(&hdev->iommu_listener);
    }
fail_features:
    vdev->vhost_started = false;
    hdev->started = false;
    return r;
}

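/*
 * Minimal start/stop sketch for a hypothetical caller that owns "hdev" and
 * "vdev". Host notifiers must already be enabled when vhost_dev_start() is
 * called, which is usually done with vhost_dev_enable_notifiers() and undone
 * with vhost_dev_disable_notifiers():
 *
 *     r = vhost_dev_enable_notifiers(hdev, vdev);
 *     if (r < 0) {
 *         return r;
 *     }
 *     r = vhost_dev_start(hdev, vdev, true);
 *     if (r < 0) {
 *         vhost_dev_disable_notifiers(hdev, vdev);
 *         return r;
 *     }
 *     ...
 *     vhost_dev_stop(hdev, vdev, true);
 *     vhost_dev_disable_notifiers(hdev, vdev);
 */
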
/* Host notifiers must be enabled at this point. */
void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings)
{
    int i;

    /* should only be called after backend is connected */
    assert(hdev->vhost_ops);
    event_notifier_test_and_clear(
        &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier);
    event_notifier_test_and_clear(&vdev->config_notifier);
    event_notifier_cleanup(
        &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier);

    trace_vhost_dev_stop(hdev, vdev->name, vrings);

    if (hdev->vhost_ops->vhost_dev_start) {
        hdev->vhost_ops->vhost_dev_start(hdev, false);
    }
    if (vrings) {
        vhost_dev_set_vring_enable(hdev, false);
    }
    for (i = 0; i < hdev->nvqs; ++i) {
        vhost_virtqueue_stop(hdev,
                             vdev,
                             hdev->vqs + i,
                             hdev->vq_index + i);
    }
    if (hdev->vhost_ops->vhost_reset_status) {
        hdev->vhost_ops->vhost_reset_status(hdev);
    }

    if (vhost_dev_has_iommu(hdev)) {
        if (hdev->vhost_ops->vhost_set_iotlb_callback) {
            hdev->vhost_ops->vhost_set_iotlb_callback(hdev, false);
        }
        memory_listener_unregister(&hdev->iommu_listener);
    }
    vhost_stop_config_intr(hdev);
    vhost_log_put(hdev, true);
    hdev->started = false;
    vdev->vhost_started = false;
    hdev->vdev = NULL;
}

int vhost_net_set_backend(struct vhost_dev *hdev,
                          struct vhost_vring_file *file)
{
    if (hdev->vhost_ops->vhost_net_set_backend) {
        return hdev->vhost_ops->vhost_net_set_backend(hdev, file);
    }

    return -ENOSYS;
}

int vhost_reset_device(struct vhost_dev *hdev)
{
    if (hdev->vhost_ops->vhost_reset_device) {
        return hdev->vhost_ops->vhost_reset_device(hdev);
    }

    return -ENOSYS;
}

bool vhost_supports_device_state(struct vhost_dev *dev)
{
    if (dev->vhost_ops->vhost_supports_device_state) {
        return dev->vhost_ops->vhost_supports_device_state(dev);
    }

    return false;
}

int vhost_set_device_state_fd(struct vhost_dev *dev,
                              VhostDeviceStateDirection direction,
                              VhostDeviceStatePhase phase,
                              int fd,
                              int *reply_fd,
                              Error **errp)
{
    if (dev->vhost_ops->vhost_set_device_state_fd) {
        return dev->vhost_ops->vhost_set_device_state_fd(dev, direction, phase,
                                                         fd, reply_fd, errp);
    }

    error_setg(errp,
               "vhost transport does not support migration state transfer");
    return -ENOSYS;
}

int vhost_check_device_state(struct vhost_dev *dev, Error **errp)
{
    if (dev->vhost_ops->vhost_check_device_state) {
        return dev->vhost_ops->vhost_check_device_state(dev, errp);
    }

    error_setg(errp,
               "vhost transport does not support migration state transfer");
    return -ENOSYS;
}

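/*
 * Sketch of how a device's migration code might combine
 * vhost_supports_device_state() above with vhost_save_backend_state() below;
 * "s" and the fallback helper are hypothetical names:
 *
 *     if (vhost_supports_device_state(&s->dev)) {
 *         ret = vhost_save_backend_state(&s->dev, f, errp);
 *     } else {
 *         ret = my_device_specific_save(s, f, errp);   // hypothetical fallback
 *     }
 */
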
int vhost_save_backend_state(struct vhost_dev *dev, QEMUFile *f, Error **errp)
{
    ERRP_GUARD();
    /* Maximum chunk size in which to transfer the state */
    const size_t chunk_size = 1 * 1024 * 1024;
    g_autofree void *transfer_buf = NULL;
    g_autoptr(GError) g_err = NULL;
    int pipe_fds[2], read_fd = -1, write_fd = -1, reply_fd = -1;
    int ret;

    /* [0] for reading (our end), [1] for writing (back-end's end) */
    if (!g_unix_open_pipe(pipe_fds, FD_CLOEXEC, &g_err)) {
        error_setg(errp, "Failed to set up state transfer pipe: %s",
                   g_err->message);
        ret = -EINVAL;
        goto fail;
    }

    read_fd = pipe_fds[0];
    write_fd = pipe_fds[1];

    /*
     * VHOST_TRANSFER_STATE_PHASE_STOPPED means the device must be stopped.
     * Ideally, it is suspended, but SUSPEND/RESUME currently do not exist for
     * vhost-user, so just check that it is stopped at all.
     */
    assert(!dev->started);

    /* Transfer ownership of write_fd to the back-end */
    ret = vhost_set_device_state_fd(dev,
                                    VHOST_TRANSFER_STATE_DIRECTION_SAVE,
                                    VHOST_TRANSFER_STATE_PHASE_STOPPED,
                                    write_fd,
                                    &reply_fd,
                                    errp);
    if (ret < 0) {
        error_prepend(errp, "Failed to initiate state transfer: ");
        goto fail;
    }

    /* If the back-end wishes to use a different pipe, switch over */
    if (reply_fd >= 0) {
        close(read_fd);
        read_fd = reply_fd;
    }

    transfer_buf = g_malloc(chunk_size);

    while (true) {
        ssize_t read_ret;

        read_ret = RETRY_ON_EINTR(read(read_fd, transfer_buf, chunk_size));
        if (read_ret < 0) {
            ret = -errno;
            error_setg_errno(errp, -ret, "Failed to receive state");
            goto fail;
        }

        assert(read_ret <= chunk_size);
        qemu_put_be32(f, read_ret);

        if (read_ret == 0) {
            /* EOF */
            break;
        }

        qemu_put_buffer(f, transfer_buf, read_ret);
    }

    /*
     * Back-end will not really care, but be clean and close our end of the pipe
     * before inquiring the back-end about whether transfer was successful
     */
    close(read_fd);
    read_fd = -1;

    /* Also, verify that the device is still stopped */
    assert(!dev->started);

    ret = vhost_check_device_state(dev, errp);
    if (ret < 0) {
        goto fail;
    }

    ret = 0;
fail:
    if (read_fd >= 0) {
        close(read_fd);
    }

    return ret;
}

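/*
 * Note on the stream layout written above and consumed by
 * vhost_load_backend_state() below: the back-end state is stored as a
 * sequence of chunks, each a big-endian 32-bit length followed by that many
 * bytes of payload, terminated by a zero-length chunk. For example, a 5-byte
 * state "hello" transferred in a single chunk appears in the migration
 * stream as:
 *
 *     00 00 00 05  68 65 6c 6c 6f  00 00 00 00
 */
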
int vhost_load_backend_state(struct vhost_dev *dev, QEMUFile *f, Error **errp)
{
    ERRP_GUARD();
    size_t transfer_buf_size = 0;
    g_autofree void *transfer_buf = NULL;
    g_autoptr(GError) g_err = NULL;
    int pipe_fds[2], read_fd = -1, write_fd = -1, reply_fd = -1;
    int ret;

    /* [0] for reading (back-end's end), [1] for writing (our end) */
    if (!g_unix_open_pipe(pipe_fds, FD_CLOEXEC, &g_err)) {
        error_setg(errp, "Failed to set up state transfer pipe: %s",
                   g_err->message);
        ret = -EINVAL;
        goto fail;
    }

    read_fd = pipe_fds[0];
    write_fd = pipe_fds[1];

    /*
     * VHOST_TRANSFER_STATE_PHASE_STOPPED means the device must be stopped.
     * Ideally, it is suspended, but SUSPEND/RESUME currently do not exist for
     * vhost-user, so just check that it is stopped at all.
     */
    assert(!dev->started);

    /* Transfer ownership of read_fd to the back-end */
    ret = vhost_set_device_state_fd(dev,
                                    VHOST_TRANSFER_STATE_DIRECTION_LOAD,
                                    VHOST_TRANSFER_STATE_PHASE_STOPPED,
                                    read_fd,
                                    &reply_fd,
                                    errp);
    if (ret < 0) {
        error_prepend(errp, "Failed to initiate state transfer: ");
        goto fail;
    }

    /* If the back-end wishes to use a different pipe, switch over */
    if (reply_fd >= 0) {
        close(write_fd);
        write_fd = reply_fd;
    }

    while (true) {
        size_t this_chunk_size = qemu_get_be32(f);
        ssize_t write_ret;
        const uint8_t *transfer_pointer;

        if (this_chunk_size == 0) {
            /* End of state */
            break;
        }

        if (transfer_buf_size < this_chunk_size) {
            transfer_buf = g_realloc(transfer_buf, this_chunk_size);
            transfer_buf_size = this_chunk_size;
        }

        if (qemu_get_buffer(f, transfer_buf, this_chunk_size) <
            this_chunk_size)
        {
            error_setg(errp, "Failed to read state");
            ret = -EINVAL;
            goto fail;
        }

        transfer_pointer = transfer_buf;
        while (this_chunk_size > 0) {
            write_ret = RETRY_ON_EINTR(
                write(write_fd, transfer_pointer, this_chunk_size)
            );
            if (write_ret < 0) {
                ret = -errno;
                error_setg_errno(errp, -ret, "Failed to send state");
                goto fail;
            } else if (write_ret == 0) {
                error_setg(errp, "Failed to send state: Connection is closed");
                ret = -ECONNRESET;
                goto fail;
            }

            assert(write_ret <= this_chunk_size);
            this_chunk_size -= write_ret;
            transfer_pointer += write_ret;
        }
    }

    /*
     * Close our end, thus ending transfer, before inquiring the back-end about
     * whether transfer was successful
     */
    close(write_fd);
    write_fd = -1;

    /* Also, verify that the device is still stopped */
    assert(!dev->started);

    ret = vhost_check_device_state(dev, errp);
    if (ret < 0) {
        goto fail;
    }

    ret = 0;
fail:
    if (write_fd >= 0) {
        close(write_fd);
    }

    return ret;
}

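/*
 * Load-side counterpart of the sketch before vhost_save_backend_state(),
 * again with a hypothetical device state "s"; a device would typically only
 * take this path when the back-end advertises support:
 *
 *     if (vhost_supports_device_state(&s->dev)) {
 *         ret = vhost_load_backend_state(&s->dev, f, errp);
 *         if (ret < 0) {
 *             return ret;
 *         }
 *     }
 */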