1 /* 2 * vhost support 3 * 4 * Copyright Red Hat, Inc. 2010 5 * 6 * Authors: 7 * Michael S. Tsirkin <mst@redhat.com> 8 * 9 * This work is licensed under the terms of the GNU GPL, version 2. See 10 * the COPYING file in the top-level directory. 11 * 12 * Contributions after 2012-01-13 are licensed under the terms of the 13 * GNU GPL, version 2 or (at your option) any later version. 14 */ 15 16 #include "qemu/osdep.h" 17 #include "qapi/error.h" 18 #include "hw/virtio/vhost.h" 19 #include "qemu/atomic.h" 20 #include "qemu/range.h" 21 #include "qemu/error-report.h" 22 #include "qemu/memfd.h" 23 #include "qemu/log.h" 24 #include "standard-headers/linux/vhost_types.h" 25 #include "hw/virtio/virtio-bus.h" 26 #include "hw/mem/memory-device.h" 27 #include "migration/blocker.h" 28 #include "migration/qemu-file-types.h" 29 #include "sysemu/dma.h" 30 #include "trace.h" 31 32 /* enabled until disconnected backend stabilizes */ 33 #define _VHOST_DEBUG 1 34 35 #ifdef _VHOST_DEBUG 36 #define VHOST_OPS_DEBUG(retval, fmt, ...) \ 37 do { \ 38 error_report(fmt ": %s (%d)", ## __VA_ARGS__, \ 39 strerror(-retval), -retval); \ 40 } while (0) 41 #else 42 #define VHOST_OPS_DEBUG(retval, fmt, ...) \ 43 do { } while (0) 44 #endif 45 46 static struct vhost_log *vhost_log; 47 static struct vhost_log *vhost_log_shm; 48 49 /* Memslots used by backends that support private memslots (without an fd). */ 50 static unsigned int used_memslots; 51 52 /* Memslots used by backends that only support shared memslots (with an fd). */ 53 static unsigned int used_shared_memslots; 54 55 static QLIST_HEAD(, vhost_dev) vhost_devices = 56 QLIST_HEAD_INITIALIZER(vhost_devices); 57 58 unsigned int vhost_get_max_memslots(void) 59 { 60 unsigned int max = UINT_MAX; 61 struct vhost_dev *hdev; 62 63 QLIST_FOREACH(hdev, &vhost_devices, entry) { 64 max = MIN(max, hdev->vhost_ops->vhost_backend_memslots_limit(hdev)); 65 } 66 return max; 67 } 68 69 unsigned int vhost_get_free_memslots(void) 70 { 71 unsigned int free = UINT_MAX; 72 struct vhost_dev *hdev; 73 74 QLIST_FOREACH(hdev, &vhost_devices, entry) { 75 unsigned int r = hdev->vhost_ops->vhost_backend_memslots_limit(hdev); 76 unsigned int cur_free; 77 78 if (hdev->vhost_ops->vhost_backend_no_private_memslots && 79 hdev->vhost_ops->vhost_backend_no_private_memslots(hdev)) { 80 cur_free = r - used_shared_memslots; 81 } else { 82 cur_free = r - used_memslots; 83 } 84 free = MIN(free, cur_free); 85 } 86 return free; 87 } 88 89 static void vhost_dev_sync_region(struct vhost_dev *dev, 90 MemoryRegionSection *section, 91 uint64_t mfirst, uint64_t mlast, 92 uint64_t rfirst, uint64_t rlast) 93 { 94 vhost_log_chunk_t *dev_log = dev->log->log; 95 96 uint64_t start = MAX(mfirst, rfirst); 97 uint64_t end = MIN(mlast, rlast); 98 vhost_log_chunk_t *from = dev_log + start / VHOST_LOG_CHUNK; 99 vhost_log_chunk_t *to = dev_log + end / VHOST_LOG_CHUNK + 1; 100 uint64_t addr = QEMU_ALIGN_DOWN(start, VHOST_LOG_CHUNK); 101 102 if (end < start) { 103 return; 104 } 105 assert(end / VHOST_LOG_CHUNK < dev->log_size); 106 assert(start / VHOST_LOG_CHUNK < dev->log_size); 107 108 for (;from < to; ++from) { 109 vhost_log_chunk_t log; 110 /* We first check with non-atomic: much cheaper, 111 * and we expect non-dirty to be the common case. */ 112 if (!*from) { 113 addr += VHOST_LOG_CHUNK; 114 continue; 115 } 116 /* Data must be read atomically. We don't really need barrier semantics 117 * but it's easier to use atomic_* than roll our own. */ 118 log = qatomic_xchg(from, 0); 119 while (log) { 120 int bit = ctzl(log); 121 hwaddr page_addr; 122 hwaddr section_offset; 123 hwaddr mr_offset; 124 page_addr = addr + bit * VHOST_LOG_PAGE; 125 section_offset = page_addr - section->offset_within_address_space; 126 mr_offset = section_offset + section->offset_within_region; 127 memory_region_set_dirty(section->mr, mr_offset, VHOST_LOG_PAGE); 128 log &= ~(0x1ull << bit); 129 } 130 addr += VHOST_LOG_CHUNK; 131 } 132 } 133 134 bool vhost_dev_has_iommu(struct vhost_dev *dev) 135 { 136 VirtIODevice *vdev = dev->vdev; 137 138 /* 139 * For vhost, VIRTIO_F_IOMMU_PLATFORM means the backend support 140 * incremental memory mapping API via IOTLB API. For platform that 141 * does not have IOMMU, there's no need to enable this feature 142 * which may cause unnecessary IOTLB miss/update transactions. 143 */ 144 if (vdev) { 145 return virtio_bus_device_iommu_enabled(vdev) && 146 virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM); 147 } else { 148 return false; 149 } 150 } 151 152 static int vhost_sync_dirty_bitmap(struct vhost_dev *dev, 153 MemoryRegionSection *section, 154 hwaddr first, 155 hwaddr last) 156 { 157 int i; 158 hwaddr start_addr; 159 hwaddr end_addr; 160 161 if (!dev->log_enabled || !dev->started) { 162 return 0; 163 } 164 start_addr = section->offset_within_address_space; 165 end_addr = range_get_last(start_addr, int128_get64(section->size)); 166 start_addr = MAX(first, start_addr); 167 end_addr = MIN(last, end_addr); 168 169 for (i = 0; i < dev->mem->nregions; ++i) { 170 struct vhost_memory_region *reg = dev->mem->regions + i; 171 vhost_dev_sync_region(dev, section, start_addr, end_addr, 172 reg->guest_phys_addr, 173 range_get_last(reg->guest_phys_addr, 174 reg->memory_size)); 175 } 176 for (i = 0; i < dev->nvqs; ++i) { 177 struct vhost_virtqueue *vq = dev->vqs + i; 178 179 if (!vq->used_phys && !vq->used_size) { 180 continue; 181 } 182 183 if (vhost_dev_has_iommu(dev)) { 184 IOMMUTLBEntry iotlb; 185 hwaddr used_phys = vq->used_phys, used_size = vq->used_size; 186 hwaddr phys, s, offset; 187 188 while (used_size) { 189 rcu_read_lock(); 190 iotlb = address_space_get_iotlb_entry(dev->vdev->dma_as, 191 used_phys, 192 true, 193 MEMTXATTRS_UNSPECIFIED); 194 rcu_read_unlock(); 195 196 if (!iotlb.target_as) { 197 qemu_log_mask(LOG_GUEST_ERROR, "translation " 198 "failure for used_iova %"PRIx64"\n", 199 used_phys); 200 return -EINVAL; 201 } 202 203 offset = used_phys & iotlb.addr_mask; 204 phys = iotlb.translated_addr + offset; 205 206 /* 207 * Distance from start of used ring until last byte of 208 * IOMMU page. 209 */ 210 s = iotlb.addr_mask - offset; 211 /* 212 * Size of used ring, or of the part of it until end 213 * of IOMMU page. To avoid zero result, do the adding 214 * outside of MIN(). 215 */ 216 s = MIN(s, used_size - 1) + 1; 217 218 vhost_dev_sync_region(dev, section, start_addr, end_addr, phys, 219 range_get_last(phys, s)); 220 used_size -= s; 221 used_phys += s; 222 } 223 } else { 224 vhost_dev_sync_region(dev, section, start_addr, 225 end_addr, vq->used_phys, 226 range_get_last(vq->used_phys, vq->used_size)); 227 } 228 } 229 return 0; 230 } 231 232 static void vhost_log_sync(MemoryListener *listener, 233 MemoryRegionSection *section) 234 { 235 struct vhost_dev *dev = container_of(listener, struct vhost_dev, 236 memory_listener); 237 vhost_sync_dirty_bitmap(dev, section, 0x0, ~0x0ULL); 238 } 239 240 static void vhost_log_sync_range(struct vhost_dev *dev, 241 hwaddr first, hwaddr last) 242 { 243 int i; 244 /* FIXME: this is N^2 in number of sections */ 245 for (i = 0; i < dev->n_mem_sections; ++i) { 246 MemoryRegionSection *section = &dev->mem_sections[i]; 247 vhost_sync_dirty_bitmap(dev, section, first, last); 248 } 249 } 250 251 static uint64_t vhost_get_log_size(struct vhost_dev *dev) 252 { 253 uint64_t log_size = 0; 254 int i; 255 for (i = 0; i < dev->mem->nregions; ++i) { 256 struct vhost_memory_region *reg = dev->mem->regions + i; 257 uint64_t last = range_get_last(reg->guest_phys_addr, 258 reg->memory_size); 259 log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1); 260 } 261 return log_size; 262 } 263 264 static int vhost_set_backend_type(struct vhost_dev *dev, 265 VhostBackendType backend_type) 266 { 267 int r = 0; 268 269 switch (backend_type) { 270 #ifdef CONFIG_VHOST_KERNEL 271 case VHOST_BACKEND_TYPE_KERNEL: 272 dev->vhost_ops = &kernel_ops; 273 break; 274 #endif 275 #ifdef CONFIG_VHOST_USER 276 case VHOST_BACKEND_TYPE_USER: 277 dev->vhost_ops = &user_ops; 278 break; 279 #endif 280 #ifdef CONFIG_VHOST_VDPA 281 case VHOST_BACKEND_TYPE_VDPA: 282 dev->vhost_ops = &vdpa_ops; 283 break; 284 #endif 285 default: 286 error_report("Unknown vhost backend type"); 287 r = -1; 288 } 289 290 return r; 291 } 292 293 static struct vhost_log *vhost_log_alloc(uint64_t size, bool share) 294 { 295 Error *err = NULL; 296 struct vhost_log *log; 297 uint64_t logsize = size * sizeof(*(log->log)); 298 int fd = -1; 299 300 log = g_new0(struct vhost_log, 1); 301 if (share) { 302 log->log = qemu_memfd_alloc("vhost-log", logsize, 303 F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL, 304 &fd, &err); 305 if (err) { 306 error_report_err(err); 307 g_free(log); 308 return NULL; 309 } 310 memset(log->log, 0, logsize); 311 } else { 312 log->log = g_malloc0(logsize); 313 } 314 315 log->size = size; 316 log->refcnt = 1; 317 log->fd = fd; 318 319 return log; 320 } 321 322 static struct vhost_log *vhost_log_get(uint64_t size, bool share) 323 { 324 struct vhost_log *log = share ? vhost_log_shm : vhost_log; 325 326 if (!log || log->size != size) { 327 log = vhost_log_alloc(size, share); 328 if (share) { 329 vhost_log_shm = log; 330 } else { 331 vhost_log = log; 332 } 333 } else { 334 ++log->refcnt; 335 } 336 337 return log; 338 } 339 340 static void vhost_log_put(struct vhost_dev *dev, bool sync) 341 { 342 struct vhost_log *log = dev->log; 343 344 if (!log) { 345 return; 346 } 347 348 --log->refcnt; 349 if (log->refcnt == 0) { 350 /* Sync only the range covered by the old log */ 351 if (dev->log_size && sync) { 352 vhost_log_sync_range(dev, 0, dev->log_size * VHOST_LOG_CHUNK - 1); 353 } 354 355 if (vhost_log == log) { 356 g_free(log->log); 357 vhost_log = NULL; 358 } else if (vhost_log_shm == log) { 359 qemu_memfd_free(log->log, log->size * sizeof(*(log->log)), 360 log->fd); 361 vhost_log_shm = NULL; 362 } 363 364 g_free(log); 365 } 366 367 dev->log = NULL; 368 dev->log_size = 0; 369 } 370 371 static bool vhost_dev_log_is_shared(struct vhost_dev *dev) 372 { 373 return dev->vhost_ops->vhost_requires_shm_log && 374 dev->vhost_ops->vhost_requires_shm_log(dev); 375 } 376 377 static inline void vhost_dev_log_resize(struct vhost_dev *dev, uint64_t size) 378 { 379 struct vhost_log *log = vhost_log_get(size, vhost_dev_log_is_shared(dev)); 380 uint64_t log_base = (uintptr_t)log->log; 381 int r; 382 383 /* inform backend of log switching, this must be done before 384 releasing the current log, to ensure no logging is lost */ 385 r = dev->vhost_ops->vhost_set_log_base(dev, log_base, log); 386 if (r < 0) { 387 VHOST_OPS_DEBUG(r, "vhost_set_log_base failed"); 388 } 389 390 vhost_log_put(dev, true); 391 dev->log = log; 392 dev->log_size = size; 393 } 394 395 static void *vhost_memory_map(struct vhost_dev *dev, hwaddr addr, 396 hwaddr *plen, bool is_write) 397 { 398 if (!vhost_dev_has_iommu(dev)) { 399 return cpu_physical_memory_map(addr, plen, is_write); 400 } else { 401 return (void *)(uintptr_t)addr; 402 } 403 } 404 405 static void vhost_memory_unmap(struct vhost_dev *dev, void *buffer, 406 hwaddr len, int is_write, 407 hwaddr access_len) 408 { 409 if (!vhost_dev_has_iommu(dev)) { 410 cpu_physical_memory_unmap(buffer, len, is_write, access_len); 411 } 412 } 413 414 static int vhost_verify_ring_part_mapping(void *ring_hva, 415 uint64_t ring_gpa, 416 uint64_t ring_size, 417 void *reg_hva, 418 uint64_t reg_gpa, 419 uint64_t reg_size) 420 { 421 uint64_t hva_ring_offset; 422 uint64_t ring_last = range_get_last(ring_gpa, ring_size); 423 uint64_t reg_last = range_get_last(reg_gpa, reg_size); 424 425 if (ring_last < reg_gpa || ring_gpa > reg_last) { 426 return 0; 427 } 428 /* check that whole ring's is mapped */ 429 if (ring_last > reg_last) { 430 return -ENOMEM; 431 } 432 /* check that ring's MemoryRegion wasn't replaced */ 433 hva_ring_offset = ring_gpa - reg_gpa; 434 if (ring_hva != reg_hva + hva_ring_offset) { 435 return -EBUSY; 436 } 437 438 return 0; 439 } 440 441 static int vhost_verify_ring_mappings(struct vhost_dev *dev, 442 void *reg_hva, 443 uint64_t reg_gpa, 444 uint64_t reg_size) 445 { 446 int i, j; 447 int r = 0; 448 const char *part_name[] = { 449 "descriptor table", 450 "available ring", 451 "used ring" 452 }; 453 454 if (vhost_dev_has_iommu(dev)) { 455 return 0; 456 } 457 458 for (i = 0; i < dev->nvqs; ++i) { 459 struct vhost_virtqueue *vq = dev->vqs + i; 460 461 if (vq->desc_phys == 0) { 462 continue; 463 } 464 465 j = 0; 466 r = vhost_verify_ring_part_mapping( 467 vq->desc, vq->desc_phys, vq->desc_size, 468 reg_hva, reg_gpa, reg_size); 469 if (r) { 470 break; 471 } 472 473 j++; 474 r = vhost_verify_ring_part_mapping( 475 vq->avail, vq->avail_phys, vq->avail_size, 476 reg_hva, reg_gpa, reg_size); 477 if (r) { 478 break; 479 } 480 481 j++; 482 r = vhost_verify_ring_part_mapping( 483 vq->used, vq->used_phys, vq->used_size, 484 reg_hva, reg_gpa, reg_size); 485 if (r) { 486 break; 487 } 488 } 489 490 if (r == -ENOMEM) { 491 error_report("Unable to map %s for ring %d", part_name[j], i); 492 } else if (r == -EBUSY) { 493 error_report("%s relocated for ring %d", part_name[j], i); 494 } 495 return r; 496 } 497 498 /* 499 * vhost_section: identify sections needed for vhost access 500 * 501 * We only care about RAM sections here (where virtqueue and guest 502 * internals accessed by virtio might live). 503 */ 504 static bool vhost_section(struct vhost_dev *dev, MemoryRegionSection *section) 505 { 506 MemoryRegion *mr = section->mr; 507 508 if (memory_region_is_ram(mr) && !memory_region_is_rom(mr)) { 509 uint8_t dirty_mask = memory_region_get_dirty_log_mask(mr); 510 uint8_t handled_dirty; 511 512 /* 513 * Kernel based vhost doesn't handle any block which is doing 514 * dirty-tracking other than migration for which it has 515 * specific logging support. However for TCG the kernel never 516 * gets involved anyway so we can also ignore it's 517 * self-modiying code detection flags. However a vhost-user 518 * client could still confuse a TCG guest if it re-writes 519 * executable memory that has already been translated. 520 */ 521 handled_dirty = (1 << DIRTY_MEMORY_MIGRATION) | 522 (1 << DIRTY_MEMORY_CODE); 523 524 if (dirty_mask & ~handled_dirty) { 525 trace_vhost_reject_section(mr->name, 1); 526 return false; 527 } 528 529 /* 530 * Some backends (like vhost-user) can only handle memory regions 531 * that have an fd (can be mapped into a different process). Filter 532 * the ones without an fd out, if requested. 533 * 534 * TODO: we might have to limit to MAP_SHARED as well. 535 */ 536 if (memory_region_get_fd(section->mr) < 0 && 537 dev->vhost_ops->vhost_backend_no_private_memslots && 538 dev->vhost_ops->vhost_backend_no_private_memslots(dev)) { 539 trace_vhost_reject_section(mr->name, 2); 540 return false; 541 } 542 543 trace_vhost_section(mr->name); 544 return true; 545 } else { 546 trace_vhost_reject_section(mr->name, 3); 547 return false; 548 } 549 } 550 551 static void vhost_begin(MemoryListener *listener) 552 { 553 struct vhost_dev *dev = container_of(listener, struct vhost_dev, 554 memory_listener); 555 dev->tmp_sections = NULL; 556 dev->n_tmp_sections = 0; 557 } 558 559 static void vhost_commit(MemoryListener *listener) 560 { 561 struct vhost_dev *dev = container_of(listener, struct vhost_dev, 562 memory_listener); 563 MemoryRegionSection *old_sections; 564 int n_old_sections; 565 uint64_t log_size; 566 size_t regions_size; 567 int r; 568 int i; 569 bool changed = false; 570 571 /* Note we can be called before the device is started, but then 572 * starting the device calls set_mem_table, so we need to have 573 * built the data structures. 574 */ 575 old_sections = dev->mem_sections; 576 n_old_sections = dev->n_mem_sections; 577 dev->mem_sections = dev->tmp_sections; 578 dev->n_mem_sections = dev->n_tmp_sections; 579 580 if (dev->n_mem_sections != n_old_sections) { 581 changed = true; 582 } else { 583 /* Same size, lets check the contents */ 584 for (i = 0; i < n_old_sections; i++) { 585 if (!MemoryRegionSection_eq(&old_sections[i], 586 &dev->mem_sections[i])) { 587 changed = true; 588 break; 589 } 590 } 591 } 592 593 trace_vhost_commit(dev->started, changed); 594 if (!changed) { 595 goto out; 596 } 597 598 /* Rebuild the regions list from the new sections list */ 599 regions_size = offsetof(struct vhost_memory, regions) + 600 dev->n_mem_sections * sizeof dev->mem->regions[0]; 601 dev->mem = g_realloc(dev->mem, regions_size); 602 dev->mem->nregions = dev->n_mem_sections; 603 604 if (dev->vhost_ops->vhost_backend_no_private_memslots && 605 dev->vhost_ops->vhost_backend_no_private_memslots(dev)) { 606 used_shared_memslots = dev->mem->nregions; 607 } else { 608 used_memslots = dev->mem->nregions; 609 } 610 611 for (i = 0; i < dev->n_mem_sections; i++) { 612 struct vhost_memory_region *cur_vmr = dev->mem->regions + i; 613 struct MemoryRegionSection *mrs = dev->mem_sections + i; 614 615 cur_vmr->guest_phys_addr = mrs->offset_within_address_space; 616 cur_vmr->memory_size = int128_get64(mrs->size); 617 cur_vmr->userspace_addr = 618 (uintptr_t)memory_region_get_ram_ptr(mrs->mr) + 619 mrs->offset_within_region; 620 cur_vmr->flags_padding = 0; 621 } 622 623 if (!dev->started) { 624 goto out; 625 } 626 627 for (i = 0; i < dev->mem->nregions; i++) { 628 if (vhost_verify_ring_mappings(dev, 629 (void *)(uintptr_t)dev->mem->regions[i].userspace_addr, 630 dev->mem->regions[i].guest_phys_addr, 631 dev->mem->regions[i].memory_size)) { 632 error_report("Verify ring failure on region %d", i); 633 abort(); 634 } 635 } 636 637 if (!dev->log_enabled) { 638 r = dev->vhost_ops->vhost_set_mem_table(dev, dev->mem); 639 if (r < 0) { 640 VHOST_OPS_DEBUG(r, "vhost_set_mem_table failed"); 641 } 642 goto out; 643 } 644 log_size = vhost_get_log_size(dev); 645 /* We allocate an extra 4K bytes to log, 646 * to reduce the * number of reallocations. */ 647 #define VHOST_LOG_BUFFER (0x1000 / sizeof *dev->log) 648 /* To log more, must increase log size before table update. */ 649 if (dev->log_size < log_size) { 650 vhost_dev_log_resize(dev, log_size + VHOST_LOG_BUFFER); 651 } 652 r = dev->vhost_ops->vhost_set_mem_table(dev, dev->mem); 653 if (r < 0) { 654 VHOST_OPS_DEBUG(r, "vhost_set_mem_table failed"); 655 } 656 /* To log less, can only decrease log size after table update. */ 657 if (dev->log_size > log_size + VHOST_LOG_BUFFER) { 658 vhost_dev_log_resize(dev, log_size); 659 } 660 661 out: 662 /* Deref the old list of sections, this must happen _after_ the 663 * vhost_set_mem_table to ensure the client isn't still using the 664 * section we're about to unref. 665 */ 666 while (n_old_sections--) { 667 memory_region_unref(old_sections[n_old_sections].mr); 668 } 669 g_free(old_sections); 670 return; 671 } 672 673 /* Adds the section data to the tmp_section structure. 674 * It relies on the listener calling us in memory address order 675 * and for each region (via the _add and _nop methods) to 676 * join neighbours. 677 */ 678 static void vhost_region_add_section(struct vhost_dev *dev, 679 MemoryRegionSection *section) 680 { 681 bool need_add = true; 682 uint64_t mrs_size = int128_get64(section->size); 683 uint64_t mrs_gpa = section->offset_within_address_space; 684 uintptr_t mrs_host = (uintptr_t)memory_region_get_ram_ptr(section->mr) + 685 section->offset_within_region; 686 RAMBlock *mrs_rb = section->mr->ram_block; 687 688 trace_vhost_region_add_section(section->mr->name, mrs_gpa, mrs_size, 689 mrs_host); 690 691 if (dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER) { 692 /* Round the section to it's page size */ 693 /* First align the start down to a page boundary */ 694 size_t mrs_page = qemu_ram_pagesize(mrs_rb); 695 uint64_t alignage = mrs_host & (mrs_page - 1); 696 if (alignage) { 697 mrs_host -= alignage; 698 mrs_size += alignage; 699 mrs_gpa -= alignage; 700 } 701 /* Now align the size up to a page boundary */ 702 alignage = mrs_size & (mrs_page - 1); 703 if (alignage) { 704 mrs_size += mrs_page - alignage; 705 } 706 trace_vhost_region_add_section_aligned(section->mr->name, mrs_gpa, 707 mrs_size, mrs_host); 708 } 709 710 if (dev->n_tmp_sections && !section->unmergeable) { 711 /* Since we already have at least one section, lets see if 712 * this extends it; since we're scanning in order, we only 713 * have to look at the last one, and the FlatView that calls 714 * us shouldn't have overlaps. 715 */ 716 MemoryRegionSection *prev_sec = dev->tmp_sections + 717 (dev->n_tmp_sections - 1); 718 uint64_t prev_gpa_start = prev_sec->offset_within_address_space; 719 uint64_t prev_size = int128_get64(prev_sec->size); 720 uint64_t prev_gpa_end = range_get_last(prev_gpa_start, prev_size); 721 uint64_t prev_host_start = 722 (uintptr_t)memory_region_get_ram_ptr(prev_sec->mr) + 723 prev_sec->offset_within_region; 724 uint64_t prev_host_end = range_get_last(prev_host_start, prev_size); 725 726 if (mrs_gpa <= (prev_gpa_end + 1)) { 727 /* OK, looks like overlapping/intersecting - it's possible that 728 * the rounding to page sizes has made them overlap, but they should 729 * match up in the same RAMBlock if they do. 730 */ 731 if (mrs_gpa < prev_gpa_start) { 732 error_report("%s:Section '%s' rounded to %"PRIx64 733 " prior to previous '%s' %"PRIx64, 734 __func__, section->mr->name, mrs_gpa, 735 prev_sec->mr->name, prev_gpa_start); 736 /* A way to cleanly fail here would be better */ 737 return; 738 } 739 /* Offset from the start of the previous GPA to this GPA */ 740 size_t offset = mrs_gpa - prev_gpa_start; 741 742 if (prev_host_start + offset == mrs_host && 743 section->mr == prev_sec->mr && !prev_sec->unmergeable) { 744 uint64_t max_end = MAX(prev_host_end, mrs_host + mrs_size); 745 need_add = false; 746 prev_sec->offset_within_address_space = 747 MIN(prev_gpa_start, mrs_gpa); 748 prev_sec->offset_within_region = 749 MIN(prev_host_start, mrs_host) - 750 (uintptr_t)memory_region_get_ram_ptr(prev_sec->mr); 751 prev_sec->size = int128_make64(max_end - MIN(prev_host_start, 752 mrs_host)); 753 trace_vhost_region_add_section_merge(section->mr->name, 754 int128_get64(prev_sec->size), 755 prev_sec->offset_within_address_space, 756 prev_sec->offset_within_region); 757 } else { 758 /* adjoining regions are fine, but overlapping ones with 759 * different blocks/offsets shouldn't happen 760 */ 761 if (mrs_gpa != prev_gpa_end + 1) { 762 error_report("%s: Overlapping but not coherent sections " 763 "at %"PRIx64, 764 __func__, mrs_gpa); 765 return; 766 } 767 } 768 } 769 } 770 771 if (need_add) { 772 ++dev->n_tmp_sections; 773 dev->tmp_sections = g_renew(MemoryRegionSection, dev->tmp_sections, 774 dev->n_tmp_sections); 775 dev->tmp_sections[dev->n_tmp_sections - 1] = *section; 776 /* The flatview isn't stable and we don't use it, making it NULL 777 * means we can memcmp the list. 778 */ 779 dev->tmp_sections[dev->n_tmp_sections - 1].fv = NULL; 780 memory_region_ref(section->mr); 781 } 782 } 783 784 /* Used for both add and nop callbacks */ 785 static void vhost_region_addnop(MemoryListener *listener, 786 MemoryRegionSection *section) 787 { 788 struct vhost_dev *dev = container_of(listener, struct vhost_dev, 789 memory_listener); 790 791 if (!vhost_section(dev, section)) { 792 return; 793 } 794 vhost_region_add_section(dev, section); 795 } 796 797 static void vhost_iommu_unmap_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) 798 { 799 struct vhost_iommu *iommu = container_of(n, struct vhost_iommu, n); 800 struct vhost_dev *hdev = iommu->hdev; 801 hwaddr iova = iotlb->iova + iommu->iommu_offset; 802 803 if (vhost_backend_invalidate_device_iotlb(hdev, iova, 804 iotlb->addr_mask + 1)) { 805 error_report("Fail to invalidate device iotlb"); 806 } 807 } 808 809 static void vhost_iommu_region_add(MemoryListener *listener, 810 MemoryRegionSection *section) 811 { 812 struct vhost_dev *dev = container_of(listener, struct vhost_dev, 813 iommu_listener); 814 struct vhost_iommu *iommu; 815 Int128 end; 816 int iommu_idx; 817 IOMMUMemoryRegion *iommu_mr; 818 819 if (!memory_region_is_iommu(section->mr)) { 820 return; 821 } 822 823 iommu_mr = IOMMU_MEMORY_REGION(section->mr); 824 825 iommu = g_malloc0(sizeof(*iommu)); 826 end = int128_add(int128_make64(section->offset_within_region), 827 section->size); 828 end = int128_sub(end, int128_one()); 829 iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr, 830 MEMTXATTRS_UNSPECIFIED); 831 iommu_notifier_init(&iommu->n, vhost_iommu_unmap_notify, 832 dev->vdev->device_iotlb_enabled ? 833 IOMMU_NOTIFIER_DEVIOTLB_UNMAP : 834 IOMMU_NOTIFIER_UNMAP, 835 section->offset_within_region, 836 int128_get64(end), 837 iommu_idx); 838 iommu->mr = section->mr; 839 iommu->iommu_offset = section->offset_within_address_space - 840 section->offset_within_region; 841 iommu->hdev = dev; 842 memory_region_register_iommu_notifier(section->mr, &iommu->n, 843 &error_fatal); 844 QLIST_INSERT_HEAD(&dev->iommu_list, iommu, iommu_next); 845 /* TODO: can replay help performance here? */ 846 } 847 848 static void vhost_iommu_region_del(MemoryListener *listener, 849 MemoryRegionSection *section) 850 { 851 struct vhost_dev *dev = container_of(listener, struct vhost_dev, 852 iommu_listener); 853 struct vhost_iommu *iommu; 854 855 if (!memory_region_is_iommu(section->mr)) { 856 return; 857 } 858 859 QLIST_FOREACH(iommu, &dev->iommu_list, iommu_next) { 860 if (iommu->mr == section->mr && 861 iommu->n.start == section->offset_within_region) { 862 memory_region_unregister_iommu_notifier(iommu->mr, 863 &iommu->n); 864 QLIST_REMOVE(iommu, iommu_next); 865 g_free(iommu); 866 break; 867 } 868 } 869 } 870 871 void vhost_toggle_device_iotlb(VirtIODevice *vdev) 872 { 873 VirtioDeviceClass *vdc = VIRTIO_DEVICE_GET_CLASS(vdev); 874 struct vhost_dev *dev; 875 struct vhost_iommu *iommu; 876 877 if (vdev->vhost_started) { 878 dev = vdc->get_vhost(vdev); 879 } else { 880 return; 881 } 882 883 QLIST_FOREACH(iommu, &dev->iommu_list, iommu_next) { 884 memory_region_unregister_iommu_notifier(iommu->mr, &iommu->n); 885 iommu->n.notifier_flags = vdev->device_iotlb_enabled ? 886 IOMMU_NOTIFIER_DEVIOTLB_UNMAP : IOMMU_NOTIFIER_UNMAP; 887 memory_region_register_iommu_notifier(iommu->mr, &iommu->n, 888 &error_fatal); 889 } 890 } 891 892 static int vhost_virtqueue_set_addr(struct vhost_dev *dev, 893 struct vhost_virtqueue *vq, 894 unsigned idx, bool enable_log) 895 { 896 struct vhost_vring_addr addr; 897 int r; 898 memset(&addr, 0, sizeof(struct vhost_vring_addr)); 899 900 if (dev->vhost_ops->vhost_vq_get_addr) { 901 r = dev->vhost_ops->vhost_vq_get_addr(dev, &addr, vq); 902 if (r < 0) { 903 VHOST_OPS_DEBUG(r, "vhost_vq_get_addr failed"); 904 return r; 905 } 906 } else { 907 addr.desc_user_addr = (uint64_t)(unsigned long)vq->desc; 908 addr.avail_user_addr = (uint64_t)(unsigned long)vq->avail; 909 addr.used_user_addr = (uint64_t)(unsigned long)vq->used; 910 } 911 addr.index = idx; 912 addr.log_guest_addr = vq->used_phys; 913 addr.flags = enable_log ? (1 << VHOST_VRING_F_LOG) : 0; 914 r = dev->vhost_ops->vhost_set_vring_addr(dev, &addr); 915 if (r < 0) { 916 VHOST_OPS_DEBUG(r, "vhost_set_vring_addr failed"); 917 } 918 return r; 919 } 920 921 static int vhost_dev_set_features(struct vhost_dev *dev, 922 bool enable_log) 923 { 924 uint64_t features = dev->acked_features; 925 int r; 926 if (enable_log) { 927 features |= 0x1ULL << VHOST_F_LOG_ALL; 928 } 929 if (!vhost_dev_has_iommu(dev)) { 930 features &= ~(0x1ULL << VIRTIO_F_IOMMU_PLATFORM); 931 } 932 if (dev->vhost_ops->vhost_force_iommu) { 933 if (dev->vhost_ops->vhost_force_iommu(dev) == true) { 934 features |= 0x1ULL << VIRTIO_F_IOMMU_PLATFORM; 935 } 936 } 937 r = dev->vhost_ops->vhost_set_features(dev, features); 938 if (r < 0) { 939 VHOST_OPS_DEBUG(r, "vhost_set_features failed"); 940 goto out; 941 } 942 if (dev->vhost_ops->vhost_set_backend_cap) { 943 r = dev->vhost_ops->vhost_set_backend_cap(dev); 944 if (r < 0) { 945 VHOST_OPS_DEBUG(r, "vhost_set_backend_cap failed"); 946 goto out; 947 } 948 } 949 950 out: 951 return r; 952 } 953 954 static int vhost_dev_set_log(struct vhost_dev *dev, bool enable_log) 955 { 956 int r, i, idx; 957 hwaddr addr; 958 959 r = vhost_dev_set_features(dev, enable_log); 960 if (r < 0) { 961 goto err_features; 962 } 963 for (i = 0; i < dev->nvqs; ++i) { 964 idx = dev->vhost_ops->vhost_get_vq_index(dev, dev->vq_index + i); 965 addr = virtio_queue_get_desc_addr(dev->vdev, idx); 966 if (!addr) { 967 /* 968 * The queue might not be ready for start. If this 969 * is the case there is no reason to continue the process. 970 * The similar logic is used by the vhost_virtqueue_start() 971 * routine. 972 */ 973 continue; 974 } 975 r = vhost_virtqueue_set_addr(dev, dev->vqs + i, idx, 976 enable_log); 977 if (r < 0) { 978 goto err_vq; 979 } 980 } 981 return 0; 982 err_vq: 983 for (; i >= 0; --i) { 984 idx = dev->vhost_ops->vhost_get_vq_index(dev, dev->vq_index + i); 985 addr = virtio_queue_get_desc_addr(dev->vdev, idx); 986 if (!addr) { 987 continue; 988 } 989 vhost_virtqueue_set_addr(dev, dev->vqs + i, idx, 990 dev->log_enabled); 991 } 992 vhost_dev_set_features(dev, dev->log_enabled); 993 err_features: 994 return r; 995 } 996 997 static int vhost_migration_log(MemoryListener *listener, bool enable) 998 { 999 struct vhost_dev *dev = container_of(listener, struct vhost_dev, 1000 memory_listener); 1001 int r; 1002 if (enable == dev->log_enabled) { 1003 return 0; 1004 } 1005 if (!dev->started) { 1006 dev->log_enabled = enable; 1007 return 0; 1008 } 1009 1010 r = 0; 1011 if (!enable) { 1012 r = vhost_dev_set_log(dev, false); 1013 if (r < 0) { 1014 goto check_dev_state; 1015 } 1016 vhost_log_put(dev, false); 1017 } else { 1018 vhost_dev_log_resize(dev, vhost_get_log_size(dev)); 1019 r = vhost_dev_set_log(dev, true); 1020 if (r < 0) { 1021 goto check_dev_state; 1022 } 1023 } 1024 1025 check_dev_state: 1026 dev->log_enabled = enable; 1027 /* 1028 * vhost-user-* devices could change their state during log 1029 * initialization due to disconnect. So check dev state after 1030 * vhost communication. 1031 */ 1032 if (!dev->started) { 1033 /* 1034 * Since device is in the stopped state, it is okay for 1035 * migration. Return success. 1036 */ 1037 r = 0; 1038 } 1039 if (r) { 1040 /* An error occurred. */ 1041 dev->log_enabled = false; 1042 } 1043 1044 return r; 1045 } 1046 1047 static void vhost_log_global_start(MemoryListener *listener) 1048 { 1049 int r; 1050 1051 r = vhost_migration_log(listener, true); 1052 if (r < 0) { 1053 abort(); 1054 } 1055 } 1056 1057 static void vhost_log_global_stop(MemoryListener *listener) 1058 { 1059 int r; 1060 1061 r = vhost_migration_log(listener, false); 1062 if (r < 0) { 1063 abort(); 1064 } 1065 } 1066 1067 static void vhost_log_start(MemoryListener *listener, 1068 MemoryRegionSection *section, 1069 int old, int new) 1070 { 1071 /* FIXME: implement */ 1072 } 1073 1074 static void vhost_log_stop(MemoryListener *listener, 1075 MemoryRegionSection *section, 1076 int old, int new) 1077 { 1078 /* FIXME: implement */ 1079 } 1080 1081 /* The vhost driver natively knows how to handle the vrings of non 1082 * cross-endian legacy devices and modern devices. Only legacy devices 1083 * exposed to a bi-endian guest may require the vhost driver to use a 1084 * specific endianness. 1085 */ 1086 static inline bool vhost_needs_vring_endian(VirtIODevice *vdev) 1087 { 1088 if (virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) { 1089 return false; 1090 } 1091 #if HOST_BIG_ENDIAN 1092 return vdev->device_endian == VIRTIO_DEVICE_ENDIAN_LITTLE; 1093 #else 1094 return vdev->device_endian == VIRTIO_DEVICE_ENDIAN_BIG; 1095 #endif 1096 } 1097 1098 static int vhost_virtqueue_set_vring_endian_legacy(struct vhost_dev *dev, 1099 bool is_big_endian, 1100 int vhost_vq_index) 1101 { 1102 int r; 1103 struct vhost_vring_state s = { 1104 .index = vhost_vq_index, 1105 .num = is_big_endian 1106 }; 1107 1108 r = dev->vhost_ops->vhost_set_vring_endian(dev, &s); 1109 if (r < 0) { 1110 VHOST_OPS_DEBUG(r, "vhost_set_vring_endian failed"); 1111 } 1112 return r; 1113 } 1114 1115 static int vhost_memory_region_lookup(struct vhost_dev *hdev, 1116 uint64_t gpa, uint64_t *uaddr, 1117 uint64_t *len) 1118 { 1119 int i; 1120 1121 for (i = 0; i < hdev->mem->nregions; i++) { 1122 struct vhost_memory_region *reg = hdev->mem->regions + i; 1123 1124 if (gpa >= reg->guest_phys_addr && 1125 reg->guest_phys_addr + reg->memory_size > gpa) { 1126 *uaddr = reg->userspace_addr + gpa - reg->guest_phys_addr; 1127 *len = reg->guest_phys_addr + reg->memory_size - gpa; 1128 return 0; 1129 } 1130 } 1131 1132 return -EFAULT; 1133 } 1134 1135 int vhost_device_iotlb_miss(struct vhost_dev *dev, uint64_t iova, int write) 1136 { 1137 IOMMUTLBEntry iotlb; 1138 uint64_t uaddr, len; 1139 int ret = -EFAULT; 1140 1141 RCU_READ_LOCK_GUARD(); 1142 1143 trace_vhost_iotlb_miss(dev, 1); 1144 1145 iotlb = address_space_get_iotlb_entry(dev->vdev->dma_as, 1146 iova, write, 1147 MEMTXATTRS_UNSPECIFIED); 1148 if (iotlb.target_as != NULL) { 1149 ret = vhost_memory_region_lookup(dev, iotlb.translated_addr, 1150 &uaddr, &len); 1151 if (ret) { 1152 trace_vhost_iotlb_miss(dev, 3); 1153 error_report("Fail to lookup the translated address " 1154 "%"PRIx64, iotlb.translated_addr); 1155 goto out; 1156 } 1157 1158 len = MIN(iotlb.addr_mask + 1, len); 1159 iova = iova & ~iotlb.addr_mask; 1160 1161 ret = vhost_backend_update_device_iotlb(dev, iova, uaddr, 1162 len, iotlb.perm); 1163 if (ret) { 1164 trace_vhost_iotlb_miss(dev, 4); 1165 error_report("Fail to update device iotlb"); 1166 goto out; 1167 } 1168 } 1169 1170 trace_vhost_iotlb_miss(dev, 2); 1171 1172 out: 1173 return ret; 1174 } 1175 1176 int vhost_virtqueue_start(struct vhost_dev *dev, 1177 struct VirtIODevice *vdev, 1178 struct vhost_virtqueue *vq, 1179 unsigned idx) 1180 { 1181 BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev))); 1182 VirtioBusState *vbus = VIRTIO_BUS(qbus); 1183 VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(vbus); 1184 hwaddr s, l, a; 1185 int r; 1186 int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, idx); 1187 struct vhost_vring_file file = { 1188 .index = vhost_vq_index 1189 }; 1190 struct vhost_vring_state state = { 1191 .index = vhost_vq_index 1192 }; 1193 struct VirtQueue *vvq = virtio_get_queue(vdev, idx); 1194 1195 a = virtio_queue_get_desc_addr(vdev, idx); 1196 if (a == 0) { 1197 /* Queue might not be ready for start */ 1198 return 0; 1199 } 1200 1201 vq->num = state.num = virtio_queue_get_num(vdev, idx); 1202 r = dev->vhost_ops->vhost_set_vring_num(dev, &state); 1203 if (r) { 1204 VHOST_OPS_DEBUG(r, "vhost_set_vring_num failed"); 1205 return r; 1206 } 1207 1208 state.num = virtio_queue_get_last_avail_idx(vdev, idx); 1209 r = dev->vhost_ops->vhost_set_vring_base(dev, &state); 1210 if (r) { 1211 VHOST_OPS_DEBUG(r, "vhost_set_vring_base failed"); 1212 return r; 1213 } 1214 1215 if (vhost_needs_vring_endian(vdev)) { 1216 r = vhost_virtqueue_set_vring_endian_legacy(dev, 1217 virtio_is_big_endian(vdev), 1218 vhost_vq_index); 1219 if (r) { 1220 return r; 1221 } 1222 } 1223 1224 vq->desc_size = s = l = virtio_queue_get_desc_size(vdev, idx); 1225 vq->desc_phys = a; 1226 vq->desc = vhost_memory_map(dev, a, &l, false); 1227 if (!vq->desc || l != s) { 1228 r = -ENOMEM; 1229 goto fail_alloc_desc; 1230 } 1231 vq->avail_size = s = l = virtio_queue_get_avail_size(vdev, idx); 1232 vq->avail_phys = a = virtio_queue_get_avail_addr(vdev, idx); 1233 vq->avail = vhost_memory_map(dev, a, &l, false); 1234 if (!vq->avail || l != s) { 1235 r = -ENOMEM; 1236 goto fail_alloc_avail; 1237 } 1238 vq->used_size = s = l = virtio_queue_get_used_size(vdev, idx); 1239 vq->used_phys = a = virtio_queue_get_used_addr(vdev, idx); 1240 vq->used = vhost_memory_map(dev, a, &l, true); 1241 if (!vq->used || l != s) { 1242 r = -ENOMEM; 1243 goto fail_alloc_used; 1244 } 1245 1246 r = vhost_virtqueue_set_addr(dev, vq, vhost_vq_index, dev->log_enabled); 1247 if (r < 0) { 1248 goto fail_alloc; 1249 } 1250 1251 file.fd = event_notifier_get_fd(virtio_queue_get_host_notifier(vvq)); 1252 r = dev->vhost_ops->vhost_set_vring_kick(dev, &file); 1253 if (r) { 1254 VHOST_OPS_DEBUG(r, "vhost_set_vring_kick failed"); 1255 goto fail_kick; 1256 } 1257 1258 /* Clear and discard previous events if any. */ 1259 event_notifier_test_and_clear(&vq->masked_notifier); 1260 1261 /* Init vring in unmasked state, unless guest_notifier_mask 1262 * will do it later. 1263 */ 1264 if (!vdev->use_guest_notifier_mask) { 1265 /* TODO: check and handle errors. */ 1266 vhost_virtqueue_mask(dev, vdev, idx, false); 1267 } 1268 1269 if (k->query_guest_notifiers && 1270 k->query_guest_notifiers(qbus->parent) && 1271 virtio_queue_vector(vdev, idx) == VIRTIO_NO_VECTOR) { 1272 file.fd = -1; 1273 r = dev->vhost_ops->vhost_set_vring_call(dev, &file); 1274 if (r) { 1275 goto fail_vector; 1276 } 1277 } 1278 1279 return 0; 1280 1281 fail_vector: 1282 fail_kick: 1283 fail_alloc: 1284 vhost_memory_unmap(dev, vq->used, virtio_queue_get_used_size(vdev, idx), 1285 0, 0); 1286 fail_alloc_used: 1287 vhost_memory_unmap(dev, vq->avail, virtio_queue_get_avail_size(vdev, idx), 1288 0, 0); 1289 fail_alloc_avail: 1290 vhost_memory_unmap(dev, vq->desc, virtio_queue_get_desc_size(vdev, idx), 1291 0, 0); 1292 fail_alloc_desc: 1293 return r; 1294 } 1295 1296 void vhost_virtqueue_stop(struct vhost_dev *dev, 1297 struct VirtIODevice *vdev, 1298 struct vhost_virtqueue *vq, 1299 unsigned idx) 1300 { 1301 int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, idx); 1302 struct vhost_vring_state state = { 1303 .index = vhost_vq_index, 1304 }; 1305 int r; 1306 1307 if (virtio_queue_get_desc_addr(vdev, idx) == 0) { 1308 /* Don't stop the virtqueue which might have not been started */ 1309 return; 1310 } 1311 1312 r = dev->vhost_ops->vhost_get_vring_base(dev, &state); 1313 if (r < 0) { 1314 VHOST_OPS_DEBUG(r, "vhost VQ %u ring restore failed: %d", idx, r); 1315 /* Connection to the backend is broken, so let's sync internal 1316 * last avail idx to the device used idx. 1317 */ 1318 virtio_queue_restore_last_avail_idx(vdev, idx); 1319 } else { 1320 virtio_queue_set_last_avail_idx(vdev, idx, state.num); 1321 } 1322 virtio_queue_invalidate_signalled_used(vdev, idx); 1323 virtio_queue_update_used_idx(vdev, idx); 1324 1325 /* In the cross-endian case, we need to reset the vring endianness to 1326 * native as legacy devices expect so by default. 1327 */ 1328 if (vhost_needs_vring_endian(vdev)) { 1329 vhost_virtqueue_set_vring_endian_legacy(dev, 1330 !virtio_is_big_endian(vdev), 1331 vhost_vq_index); 1332 } 1333 1334 vhost_memory_unmap(dev, vq->used, virtio_queue_get_used_size(vdev, idx), 1335 1, virtio_queue_get_used_size(vdev, idx)); 1336 vhost_memory_unmap(dev, vq->avail, virtio_queue_get_avail_size(vdev, idx), 1337 0, virtio_queue_get_avail_size(vdev, idx)); 1338 vhost_memory_unmap(dev, vq->desc, virtio_queue_get_desc_size(vdev, idx), 1339 0, virtio_queue_get_desc_size(vdev, idx)); 1340 } 1341 1342 static int vhost_virtqueue_set_busyloop_timeout(struct vhost_dev *dev, 1343 int n, uint32_t timeout) 1344 { 1345 int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, n); 1346 struct vhost_vring_state state = { 1347 .index = vhost_vq_index, 1348 .num = timeout, 1349 }; 1350 int r; 1351 1352 if (!dev->vhost_ops->vhost_set_vring_busyloop_timeout) { 1353 return -EINVAL; 1354 } 1355 1356 r = dev->vhost_ops->vhost_set_vring_busyloop_timeout(dev, &state); 1357 if (r) { 1358 VHOST_OPS_DEBUG(r, "vhost_set_vring_busyloop_timeout failed"); 1359 return r; 1360 } 1361 1362 return 0; 1363 } 1364 1365 static void vhost_virtqueue_error_notifier(EventNotifier *n) 1366 { 1367 struct vhost_virtqueue *vq = container_of(n, struct vhost_virtqueue, 1368 error_notifier); 1369 struct vhost_dev *dev = vq->dev; 1370 int index = vq - dev->vqs; 1371 1372 if (event_notifier_test_and_clear(n) && dev->vdev) { 1373 VHOST_OPS_DEBUG(-EINVAL, "vhost vring error in virtqueue %d", 1374 dev->vq_index + index); 1375 } 1376 } 1377 1378 static int vhost_virtqueue_init(struct vhost_dev *dev, 1379 struct vhost_virtqueue *vq, int n) 1380 { 1381 int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, n); 1382 struct vhost_vring_file file = { 1383 .index = vhost_vq_index, 1384 }; 1385 int r = event_notifier_init(&vq->masked_notifier, 0); 1386 if (r < 0) { 1387 return r; 1388 } 1389 1390 file.fd = event_notifier_get_wfd(&vq->masked_notifier); 1391 r = dev->vhost_ops->vhost_set_vring_call(dev, &file); 1392 if (r) { 1393 VHOST_OPS_DEBUG(r, "vhost_set_vring_call failed"); 1394 goto fail_call; 1395 } 1396 1397 vq->dev = dev; 1398 1399 if (dev->vhost_ops->vhost_set_vring_err) { 1400 r = event_notifier_init(&vq->error_notifier, 0); 1401 if (r < 0) { 1402 goto fail_call; 1403 } 1404 1405 file.fd = event_notifier_get_fd(&vq->error_notifier); 1406 r = dev->vhost_ops->vhost_set_vring_err(dev, &file); 1407 if (r) { 1408 VHOST_OPS_DEBUG(r, "vhost_set_vring_err failed"); 1409 goto fail_err; 1410 } 1411 1412 event_notifier_set_handler(&vq->error_notifier, 1413 vhost_virtqueue_error_notifier); 1414 } 1415 1416 return 0; 1417 1418 fail_err: 1419 event_notifier_cleanup(&vq->error_notifier); 1420 fail_call: 1421 event_notifier_cleanup(&vq->masked_notifier); 1422 return r; 1423 } 1424 1425 static void vhost_virtqueue_cleanup(struct vhost_virtqueue *vq) 1426 { 1427 event_notifier_cleanup(&vq->masked_notifier); 1428 if (vq->dev->vhost_ops->vhost_set_vring_err) { 1429 event_notifier_set_handler(&vq->error_notifier, NULL); 1430 event_notifier_cleanup(&vq->error_notifier); 1431 } 1432 } 1433 1434 int vhost_dev_init(struct vhost_dev *hdev, void *opaque, 1435 VhostBackendType backend_type, uint32_t busyloop_timeout, 1436 Error **errp) 1437 { 1438 unsigned int used, reserved, limit; 1439 uint64_t features; 1440 int i, r, n_initialized_vqs = 0; 1441 1442 hdev->vdev = NULL; 1443 hdev->migration_blocker = NULL; 1444 1445 r = vhost_set_backend_type(hdev, backend_type); 1446 assert(r >= 0); 1447 1448 r = hdev->vhost_ops->vhost_backend_init(hdev, opaque, errp); 1449 if (r < 0) { 1450 goto fail; 1451 } 1452 1453 r = hdev->vhost_ops->vhost_set_owner(hdev); 1454 if (r < 0) { 1455 error_setg_errno(errp, -r, "vhost_set_owner failed"); 1456 goto fail; 1457 } 1458 1459 r = hdev->vhost_ops->vhost_get_features(hdev, &features); 1460 if (r < 0) { 1461 error_setg_errno(errp, -r, "vhost_get_features failed"); 1462 goto fail; 1463 } 1464 1465 limit = hdev->vhost_ops->vhost_backend_memslots_limit(hdev); 1466 if (limit < MEMORY_DEVICES_SAFE_MAX_MEMSLOTS && 1467 memory_devices_memslot_auto_decision_active()) { 1468 error_setg(errp, "some memory device (like virtio-mem)" 1469 " decided how many memory slots to use based on the overall" 1470 " number of memory slots; this vhost backend would further" 1471 " restricts the overall number of memory slots"); 1472 error_append_hint(errp, "Try plugging this vhost backend before" 1473 " plugging such memory devices.\n"); 1474 r = -EINVAL; 1475 goto fail; 1476 } 1477 1478 for (i = 0; i < hdev->nvqs; ++i, ++n_initialized_vqs) { 1479 r = vhost_virtqueue_init(hdev, hdev->vqs + i, hdev->vq_index + i); 1480 if (r < 0) { 1481 error_setg_errno(errp, -r, "Failed to initialize virtqueue %d", i); 1482 goto fail; 1483 } 1484 } 1485 1486 if (busyloop_timeout) { 1487 for (i = 0; i < hdev->nvqs; ++i) { 1488 r = vhost_virtqueue_set_busyloop_timeout(hdev, hdev->vq_index + i, 1489 busyloop_timeout); 1490 if (r < 0) { 1491 error_setg_errno(errp, -r, "Failed to set busyloop timeout"); 1492 goto fail_busyloop; 1493 } 1494 } 1495 } 1496 1497 hdev->features = features; 1498 1499 hdev->memory_listener = (MemoryListener) { 1500 .name = "vhost", 1501 .begin = vhost_begin, 1502 .commit = vhost_commit, 1503 .region_add = vhost_region_addnop, 1504 .region_nop = vhost_region_addnop, 1505 .log_start = vhost_log_start, 1506 .log_stop = vhost_log_stop, 1507 .log_sync = vhost_log_sync, 1508 .log_global_start = vhost_log_global_start, 1509 .log_global_stop = vhost_log_global_stop, 1510 .priority = MEMORY_LISTENER_PRIORITY_DEV_BACKEND 1511 }; 1512 1513 hdev->iommu_listener = (MemoryListener) { 1514 .name = "vhost-iommu", 1515 .region_add = vhost_iommu_region_add, 1516 .region_del = vhost_iommu_region_del, 1517 }; 1518 1519 if (hdev->migration_blocker == NULL) { 1520 if (!(hdev->features & (0x1ULL << VHOST_F_LOG_ALL))) { 1521 error_setg(&hdev->migration_blocker, 1522 "Migration disabled: vhost lacks VHOST_F_LOG_ALL feature."); 1523 } else if (vhost_dev_log_is_shared(hdev) && !qemu_memfd_alloc_check()) { 1524 error_setg(&hdev->migration_blocker, 1525 "Migration disabled: failed to allocate shared memory"); 1526 } 1527 } 1528 1529 if (hdev->migration_blocker != NULL) { 1530 r = migrate_add_blocker_normal(&hdev->migration_blocker, errp); 1531 if (r < 0) { 1532 goto fail_busyloop; 1533 } 1534 } 1535 1536 hdev->mem = g_malloc0(offsetof(struct vhost_memory, regions)); 1537 hdev->n_mem_sections = 0; 1538 hdev->mem_sections = NULL; 1539 hdev->log = NULL; 1540 hdev->log_size = 0; 1541 hdev->log_enabled = false; 1542 hdev->started = false; 1543 memory_listener_register(&hdev->memory_listener, &address_space_memory); 1544 QLIST_INSERT_HEAD(&vhost_devices, hdev, entry); 1545 1546 /* 1547 * The listener we registered properly updated the corresponding counter. 1548 * So we can trust that these values are accurate. 1549 */ 1550 if (hdev->vhost_ops->vhost_backend_no_private_memslots && 1551 hdev->vhost_ops->vhost_backend_no_private_memslots(hdev)) { 1552 used = used_shared_memslots; 1553 } else { 1554 used = used_memslots; 1555 } 1556 /* 1557 * We assume that all reserved memslots actually require a real memslot 1558 * in our vhost backend. This might not be true, for example, if the 1559 * memslot would be ROM. If ever relevant, we can optimize for that -- 1560 * but we'll need additional information about the reservations. 1561 */ 1562 reserved = memory_devices_get_reserved_memslots(); 1563 if (used + reserved > limit) { 1564 error_setg(errp, "vhost backend memory slots limit (%d) is less" 1565 " than current number of used (%d) and reserved (%d)" 1566 " memory slots for memory devices.", limit, used, reserved); 1567 r = -EINVAL; 1568 goto fail_busyloop; 1569 } 1570 1571 return 0; 1572 1573 fail_busyloop: 1574 if (busyloop_timeout) { 1575 while (--i >= 0) { 1576 vhost_virtqueue_set_busyloop_timeout(hdev, hdev->vq_index + i, 0); 1577 } 1578 } 1579 fail: 1580 hdev->nvqs = n_initialized_vqs; 1581 vhost_dev_cleanup(hdev); 1582 return r; 1583 } 1584 1585 void vhost_dev_cleanup(struct vhost_dev *hdev) 1586 { 1587 int i; 1588 1589 trace_vhost_dev_cleanup(hdev); 1590 1591 for (i = 0; i < hdev->nvqs; ++i) { 1592 vhost_virtqueue_cleanup(hdev->vqs + i); 1593 } 1594 if (hdev->mem) { 1595 /* those are only safe after successful init */ 1596 memory_listener_unregister(&hdev->memory_listener); 1597 QLIST_REMOVE(hdev, entry); 1598 } 1599 migrate_del_blocker(&hdev->migration_blocker); 1600 g_free(hdev->mem); 1601 g_free(hdev->mem_sections); 1602 if (hdev->vhost_ops) { 1603 hdev->vhost_ops->vhost_backend_cleanup(hdev); 1604 } 1605 assert(!hdev->log); 1606 1607 memset(hdev, 0, sizeof(struct vhost_dev)); 1608 } 1609 1610 static void vhost_dev_disable_notifiers_nvqs(struct vhost_dev *hdev, 1611 VirtIODevice *vdev, 1612 unsigned int nvqs) 1613 { 1614 BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev))); 1615 int i, r; 1616 1617 /* 1618 * Batch all the host notifiers in a single transaction to avoid 1619 * quadratic time complexity in address_space_update_ioeventfds(). 1620 */ 1621 memory_region_transaction_begin(); 1622 1623 for (i = 0; i < nvqs; ++i) { 1624 r = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i, 1625 false); 1626 if (r < 0) { 1627 error_report("vhost VQ %d notifier cleanup failed: %d", i, -r); 1628 } 1629 assert(r >= 0); 1630 } 1631 1632 /* 1633 * The transaction expects the ioeventfds to be open when it 1634 * commits. Do it now, before the cleanup loop. 1635 */ 1636 memory_region_transaction_commit(); 1637 1638 for (i = 0; i < nvqs; ++i) { 1639 virtio_bus_cleanup_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i); 1640 } 1641 virtio_device_release_ioeventfd(vdev); 1642 } 1643 1644 /* Stop processing guest IO notifications in qemu. 1645 * Start processing them in vhost in kernel. 1646 */ 1647 int vhost_dev_enable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev) 1648 { 1649 BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev))); 1650 int i, r; 1651 1652 /* We will pass the notifiers to the kernel, make sure that QEMU 1653 * doesn't interfere. 1654 */ 1655 r = virtio_device_grab_ioeventfd(vdev); 1656 if (r < 0) { 1657 error_report("binding does not support host notifiers"); 1658 return r; 1659 } 1660 1661 /* 1662 * Batch all the host notifiers in a single transaction to avoid 1663 * quadratic time complexity in address_space_update_ioeventfds(). 1664 */ 1665 memory_region_transaction_begin(); 1666 1667 for (i = 0; i < hdev->nvqs; ++i) { 1668 r = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i, 1669 true); 1670 if (r < 0) { 1671 error_report("vhost VQ %d notifier binding failed: %d", i, -r); 1672 memory_region_transaction_commit(); 1673 vhost_dev_disable_notifiers_nvqs(hdev, vdev, i); 1674 return r; 1675 } 1676 } 1677 1678 memory_region_transaction_commit(); 1679 1680 return 0; 1681 } 1682 1683 /* Stop processing guest IO notifications in vhost. 1684 * Start processing them in qemu. 1685 * This might actually run the qemu handlers right away, 1686 * so virtio in qemu must be completely setup when this is called. 1687 */ 1688 void vhost_dev_disable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev) 1689 { 1690 vhost_dev_disable_notifiers_nvqs(hdev, vdev, hdev->nvqs); 1691 } 1692 1693 /* Test and clear event pending status. 1694 * Should be called after unmask to avoid losing events. 1695 */ 1696 bool vhost_virtqueue_pending(struct vhost_dev *hdev, int n) 1697 { 1698 struct vhost_virtqueue *vq = hdev->vqs + n - hdev->vq_index; 1699 assert(n >= hdev->vq_index && n < hdev->vq_index + hdev->nvqs); 1700 return event_notifier_test_and_clear(&vq->masked_notifier); 1701 } 1702 1703 /* Mask/unmask events from this vq. */ 1704 void vhost_virtqueue_mask(struct vhost_dev *hdev, VirtIODevice *vdev, int n, 1705 bool mask) 1706 { 1707 struct VirtQueue *vvq = virtio_get_queue(vdev, n); 1708 int r, index = n - hdev->vq_index; 1709 struct vhost_vring_file file; 1710 1711 /* should only be called after backend is connected */ 1712 assert(hdev->vhost_ops); 1713 1714 if (mask) { 1715 assert(vdev->use_guest_notifier_mask); 1716 file.fd = event_notifier_get_wfd(&hdev->vqs[index].masked_notifier); 1717 } else { 1718 file.fd = event_notifier_get_wfd(virtio_queue_get_guest_notifier(vvq)); 1719 } 1720 1721 file.index = hdev->vhost_ops->vhost_get_vq_index(hdev, n); 1722 r = hdev->vhost_ops->vhost_set_vring_call(hdev, &file); 1723 if (r < 0) { 1724 error_report("vhost_set_vring_call failed %d", -r); 1725 } 1726 } 1727 1728 bool vhost_config_pending(struct vhost_dev *hdev) 1729 { 1730 assert(hdev->vhost_ops); 1731 if ((hdev->started == false) || 1732 (hdev->vhost_ops->vhost_set_config_call == NULL)) { 1733 return false; 1734 } 1735 1736 EventNotifier *notifier = 1737 &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier; 1738 return event_notifier_test_and_clear(notifier); 1739 } 1740 1741 void vhost_config_mask(struct vhost_dev *hdev, VirtIODevice *vdev, bool mask) 1742 { 1743 int fd; 1744 int r; 1745 EventNotifier *notifier = 1746 &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier; 1747 EventNotifier *config_notifier = &vdev->config_notifier; 1748 assert(hdev->vhost_ops); 1749 1750 if ((hdev->started == false) || 1751 (hdev->vhost_ops->vhost_set_config_call == NULL)) { 1752 return; 1753 } 1754 if (mask) { 1755 assert(vdev->use_guest_notifier_mask); 1756 fd = event_notifier_get_fd(notifier); 1757 } else { 1758 fd = event_notifier_get_fd(config_notifier); 1759 } 1760 r = hdev->vhost_ops->vhost_set_config_call(hdev, fd); 1761 if (r < 0) { 1762 error_report("vhost_set_config_call failed %d", -r); 1763 } 1764 } 1765 1766 static void vhost_stop_config_intr(struct vhost_dev *dev) 1767 { 1768 int fd = -1; 1769 assert(dev->vhost_ops); 1770 if (dev->vhost_ops->vhost_set_config_call) { 1771 dev->vhost_ops->vhost_set_config_call(dev, fd); 1772 } 1773 } 1774 1775 static void vhost_start_config_intr(struct vhost_dev *dev) 1776 { 1777 int r; 1778 1779 assert(dev->vhost_ops); 1780 int fd = event_notifier_get_fd(&dev->vdev->config_notifier); 1781 if (dev->vhost_ops->vhost_set_config_call) { 1782 r = dev->vhost_ops->vhost_set_config_call(dev, fd); 1783 if (!r) { 1784 event_notifier_set(&dev->vdev->config_notifier); 1785 } 1786 } 1787 } 1788 1789 uint64_t vhost_get_features(struct vhost_dev *hdev, const int *feature_bits, 1790 uint64_t features) 1791 { 1792 const int *bit = feature_bits; 1793 while (*bit != VHOST_INVALID_FEATURE_BIT) { 1794 uint64_t bit_mask = (1ULL << *bit); 1795 if (!(hdev->features & bit_mask)) { 1796 features &= ~bit_mask; 1797 } 1798 bit++; 1799 } 1800 return features; 1801 } 1802 1803 void vhost_ack_features(struct vhost_dev *hdev, const int *feature_bits, 1804 uint64_t features) 1805 { 1806 const int *bit = feature_bits; 1807 while (*bit != VHOST_INVALID_FEATURE_BIT) { 1808 uint64_t bit_mask = (1ULL << *bit); 1809 if (features & bit_mask) { 1810 hdev->acked_features |= bit_mask; 1811 } 1812 bit++; 1813 } 1814 } 1815 1816 int vhost_dev_get_config(struct vhost_dev *hdev, uint8_t *config, 1817 uint32_t config_len, Error **errp) 1818 { 1819 assert(hdev->vhost_ops); 1820 1821 if (hdev->vhost_ops->vhost_get_config) { 1822 return hdev->vhost_ops->vhost_get_config(hdev, config, config_len, 1823 errp); 1824 } 1825 1826 error_setg(errp, "vhost_get_config not implemented"); 1827 return -ENOSYS; 1828 } 1829 1830 int vhost_dev_set_config(struct vhost_dev *hdev, const uint8_t *data, 1831 uint32_t offset, uint32_t size, uint32_t flags) 1832 { 1833 assert(hdev->vhost_ops); 1834 1835 if (hdev->vhost_ops->vhost_set_config) { 1836 return hdev->vhost_ops->vhost_set_config(hdev, data, offset, 1837 size, flags); 1838 } 1839 1840 return -ENOSYS; 1841 } 1842 1843 void vhost_dev_set_config_notifier(struct vhost_dev *hdev, 1844 const VhostDevConfigOps *ops) 1845 { 1846 hdev->config_ops = ops; 1847 } 1848 1849 void vhost_dev_free_inflight(struct vhost_inflight *inflight) 1850 { 1851 if (inflight && inflight->addr) { 1852 qemu_memfd_free(inflight->addr, inflight->size, inflight->fd); 1853 inflight->addr = NULL; 1854 inflight->fd = -1; 1855 } 1856 } 1857 1858 static int vhost_dev_resize_inflight(struct vhost_inflight *inflight, 1859 uint64_t new_size) 1860 { 1861 Error *err = NULL; 1862 int fd = -1; 1863 void *addr = qemu_memfd_alloc("vhost-inflight", new_size, 1864 F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL, 1865 &fd, &err); 1866 1867 if (err) { 1868 error_report_err(err); 1869 return -ENOMEM; 1870 } 1871 1872 vhost_dev_free_inflight(inflight); 1873 inflight->offset = 0; 1874 inflight->addr = addr; 1875 inflight->fd = fd; 1876 inflight->size = new_size; 1877 1878 return 0; 1879 } 1880 1881 void vhost_dev_save_inflight(struct vhost_inflight *inflight, QEMUFile *f) 1882 { 1883 if (inflight->addr) { 1884 qemu_put_be64(f, inflight->size); 1885 qemu_put_be16(f, inflight->queue_size); 1886 qemu_put_buffer(f, inflight->addr, inflight->size); 1887 } else { 1888 qemu_put_be64(f, 0); 1889 } 1890 } 1891 1892 int vhost_dev_load_inflight(struct vhost_inflight *inflight, QEMUFile *f) 1893 { 1894 uint64_t size; 1895 1896 size = qemu_get_be64(f); 1897 if (!size) { 1898 return 0; 1899 } 1900 1901 if (inflight->size != size) { 1902 int ret = vhost_dev_resize_inflight(inflight, size); 1903 if (ret < 0) { 1904 return ret; 1905 } 1906 } 1907 inflight->queue_size = qemu_get_be16(f); 1908 1909 qemu_get_buffer(f, inflight->addr, size); 1910 1911 return 0; 1912 } 1913 1914 int vhost_dev_prepare_inflight(struct vhost_dev *hdev, VirtIODevice *vdev) 1915 { 1916 int r; 1917 1918 if (hdev->vhost_ops->vhost_get_inflight_fd == NULL || 1919 hdev->vhost_ops->vhost_set_inflight_fd == NULL) { 1920 return 0; 1921 } 1922 1923 hdev->vdev = vdev; 1924 1925 r = vhost_dev_set_features(hdev, hdev->log_enabled); 1926 if (r < 0) { 1927 VHOST_OPS_DEBUG(r, "vhost_dev_prepare_inflight failed"); 1928 return r; 1929 } 1930 1931 return 0; 1932 } 1933 1934 int vhost_dev_set_inflight(struct vhost_dev *dev, 1935 struct vhost_inflight *inflight) 1936 { 1937 int r; 1938 1939 if (dev->vhost_ops->vhost_set_inflight_fd && inflight->addr) { 1940 r = dev->vhost_ops->vhost_set_inflight_fd(dev, inflight); 1941 if (r) { 1942 VHOST_OPS_DEBUG(r, "vhost_set_inflight_fd failed"); 1943 return r; 1944 } 1945 } 1946 1947 return 0; 1948 } 1949 1950 int vhost_dev_get_inflight(struct vhost_dev *dev, uint16_t queue_size, 1951 struct vhost_inflight *inflight) 1952 { 1953 int r; 1954 1955 if (dev->vhost_ops->vhost_get_inflight_fd) { 1956 r = dev->vhost_ops->vhost_get_inflight_fd(dev, queue_size, inflight); 1957 if (r) { 1958 VHOST_OPS_DEBUG(r, "vhost_get_inflight_fd failed"); 1959 return r; 1960 } 1961 } 1962 1963 return 0; 1964 } 1965 1966 static int vhost_dev_set_vring_enable(struct vhost_dev *hdev, int enable) 1967 { 1968 if (!hdev->vhost_ops->vhost_set_vring_enable) { 1969 return 0; 1970 } 1971 1972 /* 1973 * For vhost-user devices, if VHOST_USER_F_PROTOCOL_FEATURES has not 1974 * been negotiated, the rings start directly in the enabled state, and 1975 * .vhost_set_vring_enable callback will fail since 1976 * VHOST_USER_SET_VRING_ENABLE is not supported. 1977 */ 1978 if (hdev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER && 1979 !virtio_has_feature(hdev->backend_features, 1980 VHOST_USER_F_PROTOCOL_FEATURES)) { 1981 return 0; 1982 } 1983 1984 return hdev->vhost_ops->vhost_set_vring_enable(hdev, enable); 1985 } 1986 1987 /* Host notifiers must be enabled at this point. */ 1988 int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings) 1989 { 1990 int i, r; 1991 1992 /* should only be called after backend is connected */ 1993 assert(hdev->vhost_ops); 1994 1995 trace_vhost_dev_start(hdev, vdev->name, vrings); 1996 1997 vdev->vhost_started = true; 1998 hdev->started = true; 1999 hdev->vdev = vdev; 2000 2001 r = vhost_dev_set_features(hdev, hdev->log_enabled); 2002 if (r < 0) { 2003 goto fail_features; 2004 } 2005 2006 if (vhost_dev_has_iommu(hdev)) { 2007 memory_listener_register(&hdev->iommu_listener, vdev->dma_as); 2008 } 2009 2010 r = hdev->vhost_ops->vhost_set_mem_table(hdev, hdev->mem); 2011 if (r < 0) { 2012 VHOST_OPS_DEBUG(r, "vhost_set_mem_table failed"); 2013 goto fail_mem; 2014 } 2015 for (i = 0; i < hdev->nvqs; ++i) { 2016 r = vhost_virtqueue_start(hdev, 2017 vdev, 2018 hdev->vqs + i, 2019 hdev->vq_index + i); 2020 if (r < 0) { 2021 goto fail_vq; 2022 } 2023 } 2024 2025 r = event_notifier_init( 2026 &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier, 0); 2027 if (r < 0) { 2028 VHOST_OPS_DEBUG(r, "event_notifier_init failed"); 2029 goto fail_vq; 2030 } 2031 event_notifier_test_and_clear( 2032 &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier); 2033 if (!vdev->use_guest_notifier_mask) { 2034 vhost_config_mask(hdev, vdev, true); 2035 } 2036 if (hdev->log_enabled) { 2037 uint64_t log_base; 2038 2039 hdev->log_size = vhost_get_log_size(hdev); 2040 hdev->log = vhost_log_get(hdev->log_size, 2041 vhost_dev_log_is_shared(hdev)); 2042 log_base = (uintptr_t)hdev->log->log; 2043 r = hdev->vhost_ops->vhost_set_log_base(hdev, 2044 hdev->log_size ? log_base : 0, 2045 hdev->log); 2046 if (r < 0) { 2047 VHOST_OPS_DEBUG(r, "vhost_set_log_base failed"); 2048 goto fail_log; 2049 } 2050 } 2051 if (vrings) { 2052 r = vhost_dev_set_vring_enable(hdev, true); 2053 if (r) { 2054 goto fail_log; 2055 } 2056 } 2057 if (hdev->vhost_ops->vhost_dev_start) { 2058 r = hdev->vhost_ops->vhost_dev_start(hdev, true); 2059 if (r) { 2060 goto fail_start; 2061 } 2062 } 2063 if (vhost_dev_has_iommu(hdev) && 2064 hdev->vhost_ops->vhost_set_iotlb_callback) { 2065 hdev->vhost_ops->vhost_set_iotlb_callback(hdev, true); 2066 2067 /* Update used ring information for IOTLB to work correctly, 2068 * vhost-kernel code requires for this.*/ 2069 for (i = 0; i < hdev->nvqs; ++i) { 2070 struct vhost_virtqueue *vq = hdev->vqs + i; 2071 vhost_device_iotlb_miss(hdev, vq->used_phys, true); 2072 } 2073 } 2074 vhost_start_config_intr(hdev); 2075 return 0; 2076 fail_start: 2077 if (vrings) { 2078 vhost_dev_set_vring_enable(hdev, false); 2079 } 2080 fail_log: 2081 vhost_log_put(hdev, false); 2082 fail_vq: 2083 while (--i >= 0) { 2084 vhost_virtqueue_stop(hdev, 2085 vdev, 2086 hdev->vqs + i, 2087 hdev->vq_index + i); 2088 } 2089 2090 fail_mem: 2091 if (vhost_dev_has_iommu(hdev)) { 2092 memory_listener_unregister(&hdev->iommu_listener); 2093 } 2094 fail_features: 2095 vdev->vhost_started = false; 2096 hdev->started = false; 2097 return r; 2098 } 2099 2100 /* Host notifiers must be enabled at this point. */ 2101 void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings) 2102 { 2103 int i; 2104 2105 /* should only be called after backend is connected */ 2106 assert(hdev->vhost_ops); 2107 event_notifier_test_and_clear( 2108 &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier); 2109 event_notifier_test_and_clear(&vdev->config_notifier); 2110 event_notifier_cleanup( 2111 &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier); 2112 2113 trace_vhost_dev_stop(hdev, vdev->name, vrings); 2114 2115 if (hdev->vhost_ops->vhost_dev_start) { 2116 hdev->vhost_ops->vhost_dev_start(hdev, false); 2117 } 2118 if (vrings) { 2119 vhost_dev_set_vring_enable(hdev, false); 2120 } 2121 for (i = 0; i < hdev->nvqs; ++i) { 2122 vhost_virtqueue_stop(hdev, 2123 vdev, 2124 hdev->vqs + i, 2125 hdev->vq_index + i); 2126 } 2127 if (hdev->vhost_ops->vhost_reset_status) { 2128 hdev->vhost_ops->vhost_reset_status(hdev); 2129 } 2130 2131 if (vhost_dev_has_iommu(hdev)) { 2132 if (hdev->vhost_ops->vhost_set_iotlb_callback) { 2133 hdev->vhost_ops->vhost_set_iotlb_callback(hdev, false); 2134 } 2135 memory_listener_unregister(&hdev->iommu_listener); 2136 } 2137 vhost_stop_config_intr(hdev); 2138 vhost_log_put(hdev, true); 2139 hdev->started = false; 2140 vdev->vhost_started = false; 2141 hdev->vdev = NULL; 2142 } 2143 2144 int vhost_net_set_backend(struct vhost_dev *hdev, 2145 struct vhost_vring_file *file) 2146 { 2147 if (hdev->vhost_ops->vhost_net_set_backend) { 2148 return hdev->vhost_ops->vhost_net_set_backend(hdev, file); 2149 } 2150 2151 return -ENOSYS; 2152 } 2153 2154 int vhost_reset_device(struct vhost_dev *hdev) 2155 { 2156 if (hdev->vhost_ops->vhost_reset_device) { 2157 return hdev->vhost_ops->vhost_reset_device(hdev); 2158 } 2159 2160 return -ENOSYS; 2161 } 2162 2163 bool vhost_supports_device_state(struct vhost_dev *dev) 2164 { 2165 if (dev->vhost_ops->vhost_supports_device_state) { 2166 return dev->vhost_ops->vhost_supports_device_state(dev); 2167 } 2168 2169 return false; 2170 } 2171 2172 int vhost_set_device_state_fd(struct vhost_dev *dev, 2173 VhostDeviceStateDirection direction, 2174 VhostDeviceStatePhase phase, 2175 int fd, 2176 int *reply_fd, 2177 Error **errp) 2178 { 2179 if (dev->vhost_ops->vhost_set_device_state_fd) { 2180 return dev->vhost_ops->vhost_set_device_state_fd(dev, direction, phase, 2181 fd, reply_fd, errp); 2182 } 2183 2184 error_setg(errp, 2185 "vhost transport does not support migration state transfer"); 2186 return -ENOSYS; 2187 } 2188 2189 int vhost_check_device_state(struct vhost_dev *dev, Error **errp) 2190 { 2191 if (dev->vhost_ops->vhost_check_device_state) { 2192 return dev->vhost_ops->vhost_check_device_state(dev, errp); 2193 } 2194 2195 error_setg(errp, 2196 "vhost transport does not support migration state transfer"); 2197 return -ENOSYS; 2198 } 2199 2200 int vhost_save_backend_state(struct vhost_dev *dev, QEMUFile *f, Error **errp) 2201 { 2202 ERRP_GUARD(); 2203 /* Maximum chunk size in which to transfer the state */ 2204 const size_t chunk_size = 1 * 1024 * 1024; 2205 g_autofree void *transfer_buf = NULL; 2206 g_autoptr(GError) g_err = NULL; 2207 int pipe_fds[2], read_fd = -1, write_fd = -1, reply_fd = -1; 2208 int ret; 2209 2210 /* [0] for reading (our end), [1] for writing (back-end's end) */ 2211 if (!g_unix_open_pipe(pipe_fds, FD_CLOEXEC, &g_err)) { 2212 error_setg(errp, "Failed to set up state transfer pipe: %s", 2213 g_err->message); 2214 ret = -EINVAL; 2215 goto fail; 2216 } 2217 2218 read_fd = pipe_fds[0]; 2219 write_fd = pipe_fds[1]; 2220 2221 /* 2222 * VHOST_TRANSFER_STATE_PHASE_STOPPED means the device must be stopped. 2223 * Ideally, it is suspended, but SUSPEND/RESUME currently do not exist for 2224 * vhost-user, so just check that it is stopped at all. 2225 */ 2226 assert(!dev->started); 2227 2228 /* Transfer ownership of write_fd to the back-end */ 2229 ret = vhost_set_device_state_fd(dev, 2230 VHOST_TRANSFER_STATE_DIRECTION_SAVE, 2231 VHOST_TRANSFER_STATE_PHASE_STOPPED, 2232 write_fd, 2233 &reply_fd, 2234 errp); 2235 if (ret < 0) { 2236 error_prepend(errp, "Failed to initiate state transfer: "); 2237 goto fail; 2238 } 2239 2240 /* If the back-end wishes to use a different pipe, switch over */ 2241 if (reply_fd >= 0) { 2242 close(read_fd); 2243 read_fd = reply_fd; 2244 } 2245 2246 transfer_buf = g_malloc(chunk_size); 2247 2248 while (true) { 2249 ssize_t read_ret; 2250 2251 read_ret = RETRY_ON_EINTR(read(read_fd, transfer_buf, chunk_size)); 2252 if (read_ret < 0) { 2253 ret = -errno; 2254 error_setg_errno(errp, -ret, "Failed to receive state"); 2255 goto fail; 2256 } 2257 2258 assert(read_ret <= chunk_size); 2259 qemu_put_be32(f, read_ret); 2260 2261 if (read_ret == 0) { 2262 /* EOF */ 2263 break; 2264 } 2265 2266 qemu_put_buffer(f, transfer_buf, read_ret); 2267 } 2268 2269 /* 2270 * Back-end will not really care, but be clean and close our end of the pipe 2271 * before inquiring the back-end about whether transfer was successful 2272 */ 2273 close(read_fd); 2274 read_fd = -1; 2275 2276 /* Also, verify that the device is still stopped */ 2277 assert(!dev->started); 2278 2279 ret = vhost_check_device_state(dev, errp); 2280 if (ret < 0) { 2281 goto fail; 2282 } 2283 2284 ret = 0; 2285 fail: 2286 if (read_fd >= 0) { 2287 close(read_fd); 2288 } 2289 2290 return ret; 2291 } 2292 2293 int vhost_load_backend_state(struct vhost_dev *dev, QEMUFile *f, Error **errp) 2294 { 2295 ERRP_GUARD(); 2296 size_t transfer_buf_size = 0; 2297 g_autofree void *transfer_buf = NULL; 2298 g_autoptr(GError) g_err = NULL; 2299 int pipe_fds[2], read_fd = -1, write_fd = -1, reply_fd = -1; 2300 int ret; 2301 2302 /* [0] for reading (back-end's end), [1] for writing (our end) */ 2303 if (!g_unix_open_pipe(pipe_fds, FD_CLOEXEC, &g_err)) { 2304 error_setg(errp, "Failed to set up state transfer pipe: %s", 2305 g_err->message); 2306 ret = -EINVAL; 2307 goto fail; 2308 } 2309 2310 read_fd = pipe_fds[0]; 2311 write_fd = pipe_fds[1]; 2312 2313 /* 2314 * VHOST_TRANSFER_STATE_PHASE_STOPPED means the device must be stopped. 2315 * Ideally, it is suspended, but SUSPEND/RESUME currently do not exist for 2316 * vhost-user, so just check that it is stopped at all. 2317 */ 2318 assert(!dev->started); 2319 2320 /* Transfer ownership of read_fd to the back-end */ 2321 ret = vhost_set_device_state_fd(dev, 2322 VHOST_TRANSFER_STATE_DIRECTION_LOAD, 2323 VHOST_TRANSFER_STATE_PHASE_STOPPED, 2324 read_fd, 2325 &reply_fd, 2326 errp); 2327 if (ret < 0) { 2328 error_prepend(errp, "Failed to initiate state transfer: "); 2329 goto fail; 2330 } 2331 2332 /* If the back-end wishes to use a different pipe, switch over */ 2333 if (reply_fd >= 0) { 2334 close(write_fd); 2335 write_fd = reply_fd; 2336 } 2337 2338 while (true) { 2339 size_t this_chunk_size = qemu_get_be32(f); 2340 ssize_t write_ret; 2341 const uint8_t *transfer_pointer; 2342 2343 if (this_chunk_size == 0) { 2344 /* End of state */ 2345 break; 2346 } 2347 2348 if (transfer_buf_size < this_chunk_size) { 2349 transfer_buf = g_realloc(transfer_buf, this_chunk_size); 2350 transfer_buf_size = this_chunk_size; 2351 } 2352 2353 if (qemu_get_buffer(f, transfer_buf, this_chunk_size) < 2354 this_chunk_size) 2355 { 2356 error_setg(errp, "Failed to read state"); 2357 ret = -EINVAL; 2358 goto fail; 2359 } 2360 2361 transfer_pointer = transfer_buf; 2362 while (this_chunk_size > 0) { 2363 write_ret = RETRY_ON_EINTR( 2364 write(write_fd, transfer_pointer, this_chunk_size) 2365 ); 2366 if (write_ret < 0) { 2367 ret = -errno; 2368 error_setg_errno(errp, -ret, "Failed to send state"); 2369 goto fail; 2370 } else if (write_ret == 0) { 2371 error_setg(errp, "Failed to send state: Connection is closed"); 2372 ret = -ECONNRESET; 2373 goto fail; 2374 } 2375 2376 assert(write_ret <= this_chunk_size); 2377 this_chunk_size -= write_ret; 2378 transfer_pointer += write_ret; 2379 } 2380 } 2381 2382 /* 2383 * Close our end, thus ending transfer, before inquiring the back-end about 2384 * whether transfer was successful 2385 */ 2386 close(write_fd); 2387 write_fd = -1; 2388 2389 /* Also, verify that the device is still stopped */ 2390 assert(!dev->started); 2391 2392 ret = vhost_check_device_state(dev, errp); 2393 if (ret < 0) { 2394 goto fail; 2395 } 2396 2397 ret = 0; 2398 fail: 2399 if (write_fd >= 0) { 2400 close(write_fd); 2401 } 2402 2403 return ret; 2404 } 2405