/*
 * vhost support
 *
 * Copyright Red Hat, Inc. 2010
 *
 * Authors:
 *  Michael S. Tsirkin <mst@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 * Contributions after 2012-01-13 are licensed under the terms of the
 * GNU GPL, version 2 or (at your option) any later version.
 */

#include "qemu/osdep.h"
#include "qapi/error.h"
#include "hw/virtio/vhost.h"
#include "qemu/atomic.h"
#include "qemu/range.h"
#include "qemu/error-report.h"
#include "qemu/memfd.h"
#include "qemu/log.h"
#include "standard-headers/linux/vhost_types.h"
#include "hw/virtio/virtio-bus.h"
#include "migration/blocker.h"
#include "migration/qemu-file-types.h"
#include "sysemu/dma.h"
#include "trace.h"

/* enabled until disconnected backend stabilizes */
#define _VHOST_DEBUG 1

#ifdef _VHOST_DEBUG
#define VHOST_OPS_DEBUG(retval, fmt, ...) \
    do { \
        error_report(fmt ": %s (%d)", ## __VA_ARGS__, \
                     strerror(-retval), -retval); \
    } while (0)
#else
#define VHOST_OPS_DEBUG(retval, fmt, ...) \
    do { } while (0)
#endif

static struct vhost_log *vhost_log;
static struct vhost_log *vhost_log_shm;

/* Memslots used by backends that support private memslots (without an fd). */
static unsigned int used_memslots;

/* Memslots used by backends that only support shared memslots (with an fd). */
static unsigned int used_shared_memslots;

static QLIST_HEAD(, vhost_dev) vhost_devices =
    QLIST_HEAD_INITIALIZER(vhost_devices);

unsigned int vhost_get_free_memslots(void)
{
    unsigned int free = UINT_MAX;
    struct vhost_dev *hdev;

    QLIST_FOREACH(hdev, &vhost_devices, entry) {
        unsigned int r = hdev->vhost_ops->vhost_backend_memslots_limit(hdev);
        unsigned int cur_free;

        if (hdev->vhost_ops->vhost_backend_no_private_memslots &&
            hdev->vhost_ops->vhost_backend_no_private_memslots(hdev)) {
            cur_free = r - used_shared_memslots;
        } else {
            cur_free = r - used_memslots;
        }
        free = MIN(free, cur_free);
    }
    return free;
}

static void vhost_dev_sync_region(struct vhost_dev *dev,
                                  MemoryRegionSection *section,
                                  uint64_t mfirst, uint64_t mlast,
                                  uint64_t rfirst, uint64_t rlast)
{
    vhost_log_chunk_t *dev_log = dev->log->log;

    uint64_t start = MAX(mfirst, rfirst);
    uint64_t end = MIN(mlast, rlast);
    vhost_log_chunk_t *from = dev_log + start / VHOST_LOG_CHUNK;
    vhost_log_chunk_t *to = dev_log + end / VHOST_LOG_CHUNK + 1;
    uint64_t addr = QEMU_ALIGN_DOWN(start, VHOST_LOG_CHUNK);

    if (end < start) {
        return;
    }
    assert(end / VHOST_LOG_CHUNK < dev->log_size);
    assert(start / VHOST_LOG_CHUNK < dev->log_size);

    for (;from < to; ++from) {
        vhost_log_chunk_t log;
        /* We first check with non-atomic: much cheaper,
         * and we expect non-dirty to be the common case. */
        if (!*from) {
            addr += VHOST_LOG_CHUNK;
            continue;
        }
        /* Data must be read atomically. We don't really need barrier semantics
         * but it's easier to use atomic_* than roll our own. */
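        /*
         * Editor's note (not in the original source): each vhost_log_chunk_t
         * is one bitmap word in which bit b marks the page at
         * (addr + b * VHOST_LOG_PAGE) as dirty, so one chunk covers
         * VHOST_LOG_CHUNK bytes of guest memory (with a 4 KiB log page and
         * a 64-bit word, that is 256 KiB per chunk).
         */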
        log = qatomic_xchg(from, 0);
        while (log) {
            int bit = ctzl(log);
            hwaddr page_addr;
            hwaddr section_offset;
            hwaddr mr_offset;
            page_addr = addr + bit * VHOST_LOG_PAGE;
            section_offset = page_addr - section->offset_within_address_space;
            mr_offset = section_offset + section->offset_within_region;
            memory_region_set_dirty(section->mr, mr_offset, VHOST_LOG_PAGE);
            log &= ~(0x1ull << bit);
        }
        addr += VHOST_LOG_CHUNK;
    }
}

bool vhost_dev_has_iommu(struct vhost_dev *dev)
{
    VirtIODevice *vdev = dev->vdev;

    /*
     * For vhost, VIRTIO_F_IOMMU_PLATFORM means the backend supports
     * the incremental memory mapping API via the IOTLB API. For platforms
     * that do not have an IOMMU, there is no need to enable this feature,
     * which may cause unnecessary IOTLB miss/update transactions.
     */
    if (vdev) {
        return virtio_bus_device_iommu_enabled(vdev) &&
            virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM);
    } else {
        return false;
    }
}

static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
                                   MemoryRegionSection *section,
                                   hwaddr first,
                                   hwaddr last)
{
    int i;
    hwaddr start_addr;
    hwaddr end_addr;

    if (!dev->log_enabled || !dev->started) {
        return 0;
    }
    start_addr = section->offset_within_address_space;
    end_addr = range_get_last(start_addr, int128_get64(section->size));
    start_addr = MAX(first, start_addr);
    end_addr = MIN(last, end_addr);

    for (i = 0; i < dev->mem->nregions; ++i) {
        struct vhost_memory_region *reg = dev->mem->regions + i;
        vhost_dev_sync_region(dev, section, start_addr, end_addr,
                              reg->guest_phys_addr,
                              range_get_last(reg->guest_phys_addr,
                                             reg->memory_size));
    }
    for (i = 0; i < dev->nvqs; ++i) {
        struct vhost_virtqueue *vq = dev->vqs + i;

        if (!vq->used_phys && !vq->used_size) {
            continue;
        }

        if (vhost_dev_has_iommu(dev)) {
            IOMMUTLBEntry iotlb;
            hwaddr used_phys = vq->used_phys, used_size = vq->used_size;
            hwaddr phys, s, offset;

            while (used_size) {
                rcu_read_lock();
                iotlb = address_space_get_iotlb_entry(dev->vdev->dma_as,
                                                      used_phys,
                                                      true,
                                                      MEMTXATTRS_UNSPECIFIED);
                rcu_read_unlock();

                if (!iotlb.target_as) {
                    qemu_log_mask(LOG_GUEST_ERROR, "translation "
                                  "failure for used_iova %"PRIx64"\n",
                                  used_phys);
                    return -EINVAL;
                }

                offset = used_phys & iotlb.addr_mask;
                phys = iotlb.translated_addr + offset;

                /*
                 * Distance from start of used ring until last byte of
                 * IOMMU page.
                 */
                s = iotlb.addr_mask - offset;
                /*
                 * Size of used ring, or of the part of it until end
                 * of IOMMU page. To avoid a zero result, do the addition
                 * outside of MIN().
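                 * (Worked example, editor's note: assuming a 4 KiB IOMMU
                 * page, for used_phys == 0x1f00 the offset is 0xf00, so s
                 * starts as 0xff and MIN(s, used_size - 1) + 1 yields at
                 * most 0x100 bytes, i.e. exactly up to the end of the page.)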
                 */
                s = MIN(s, used_size - 1) + 1;

                vhost_dev_sync_region(dev, section, start_addr, end_addr, phys,
                                      range_get_last(phys, s));
                used_size -= s;
                used_phys += s;
            }
        } else {
            vhost_dev_sync_region(dev, section, start_addr,
                                  end_addr, vq->used_phys,
                                  range_get_last(vq->used_phys, vq->used_size));
        }
    }
    return 0;
}

static void vhost_log_sync(MemoryListener *listener,
                           MemoryRegionSection *section)
{
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
                                         memory_listener);
    vhost_sync_dirty_bitmap(dev, section, 0x0, ~0x0ULL);
}

static void vhost_log_sync_range(struct vhost_dev *dev,
                                 hwaddr first, hwaddr last)
{
    int i;
    /* FIXME: this is N^2 in number of sections */
    for (i = 0; i < dev->n_mem_sections; ++i) {
        MemoryRegionSection *section = &dev->mem_sections[i];
        vhost_sync_dirty_bitmap(dev, section, first, last);
    }
}

static uint64_t vhost_get_log_size(struct vhost_dev *dev)
{
    uint64_t log_size = 0;
    int i;
    for (i = 0; i < dev->mem->nregions; ++i) {
        struct vhost_memory_region *reg = dev->mem->regions + i;
        uint64_t last = range_get_last(reg->guest_phys_addr,
                                       reg->memory_size);
        log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1);
    }
    return log_size;
}

static int vhost_set_backend_type(struct vhost_dev *dev,
                                  VhostBackendType backend_type)
{
    int r = 0;

    switch (backend_type) {
#ifdef CONFIG_VHOST_KERNEL
    case VHOST_BACKEND_TYPE_KERNEL:
        dev->vhost_ops = &kernel_ops;
        break;
#endif
#ifdef CONFIG_VHOST_USER
    case VHOST_BACKEND_TYPE_USER:
        dev->vhost_ops = &user_ops;
        break;
#endif
#ifdef CONFIG_VHOST_VDPA
    case VHOST_BACKEND_TYPE_VDPA:
        dev->vhost_ops = &vdpa_ops;
        break;
#endif
    default:
        error_report("Unknown vhost backend type");
        r = -1;
    }

    return r;
}

static struct vhost_log *vhost_log_alloc(uint64_t size, bool share)
{
    Error *err = NULL;
    struct vhost_log *log;
    uint64_t logsize = size * sizeof(*(log->log));
    int fd = -1;

    log = g_new0(struct vhost_log, 1);
    if (share) {
        log->log = qemu_memfd_alloc("vhost-log", logsize,
                                    F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL,
                                    &fd, &err);
        if (err) {
            error_report_err(err);
            g_free(log);
            return NULL;
        }
        memset(log->log, 0, logsize);
    } else {
        log->log = g_malloc0(logsize);
    }

    log->size = size;
    log->refcnt = 1;
    log->fd = fd;

    return log;
}

static struct vhost_log *vhost_log_get(uint64_t size, bool share)
{
    struct vhost_log *log = share ? vhost_log_shm : vhost_log;

    if (!log || log->size != size) {
        log = vhost_log_alloc(size, share);
        if (share) {
            vhost_log_shm = log;
        } else {
            vhost_log = log;
        }
    } else {
        ++log->refcnt;
    }

    return log;
}

static void vhost_log_put(struct vhost_dev *dev, bool sync)
{
    struct vhost_log *log = dev->log;

    if (!log) {
        return;
    }

    --log->refcnt;
    if (log->refcnt == 0) {
        /* Sync only the range covered by the old log */
        if (dev->log_size && sync) {
            vhost_log_sync_range(dev, 0, dev->log_size * VHOST_LOG_CHUNK - 1);
        }

        if (vhost_log == log) {
            g_free(log->log);
            vhost_log = NULL;
        } else if (vhost_log_shm == log) {
            qemu_memfd_free(log->log, log->size * sizeof(*(log->log)),
                            log->fd);
            vhost_log_shm = NULL;
        }

        g_free(log);
    }

    dev->log = NULL;
    dev->log_size = 0;
}

static bool vhost_dev_log_is_shared(struct vhost_dev *dev)
{
    return dev->vhost_ops->vhost_requires_shm_log &&
           dev->vhost_ops->vhost_requires_shm_log(dev);
}

static inline void vhost_dev_log_resize(struct vhost_dev *dev, uint64_t size)
{
    struct vhost_log *log = vhost_log_get(size, vhost_dev_log_is_shared(dev));
    uint64_t log_base = (uintptr_t)log->log;
    int r;

    /* inform backend of log switching, this must be done before
       releasing the current log, to ensure no logging is lost */
    r = dev->vhost_ops->vhost_set_log_base(dev, log_base, log);
    if (r < 0) {
        VHOST_OPS_DEBUG(r, "vhost_set_log_base failed");
    }

    vhost_log_put(dev, true);
    dev->log = log;
    dev->log_size = size;
}

static void *vhost_memory_map(struct vhost_dev *dev, hwaddr addr,
                              hwaddr *plen, bool is_write)
{
    if (!vhost_dev_has_iommu(dev)) {
        return cpu_physical_memory_map(addr, plen, is_write);
    } else {
        return (void *)(uintptr_t)addr;
    }
}

static void vhost_memory_unmap(struct vhost_dev *dev, void *buffer,
                               hwaddr len, int is_write,
                               hwaddr access_len)
{
    if (!vhost_dev_has_iommu(dev)) {
        cpu_physical_memory_unmap(buffer, len, is_write, access_len);
    }
}

static int vhost_verify_ring_part_mapping(void *ring_hva,
                                          uint64_t ring_gpa,
                                          uint64_t ring_size,
                                          void *reg_hva,
                                          uint64_t reg_gpa,
                                          uint64_t reg_size)
{
    uint64_t hva_ring_offset;
    uint64_t ring_last = range_get_last(ring_gpa, ring_size);
    uint64_t reg_last = range_get_last(reg_gpa, reg_size);

    if (ring_last < reg_gpa || ring_gpa > reg_last) {
        return 0;
    }
    /* check that the whole ring is mapped */
    if (ring_last > reg_last) {
        return -ENOMEM;
    }
    /* check that ring's MemoryRegion wasn't replaced */
    hva_ring_offset = ring_gpa - reg_gpa;
    if (ring_hva != reg_hva + hva_ring_offset) {
        return -EBUSY;
    }

    return 0;
}

static int vhost_verify_ring_mappings(struct vhost_dev *dev,
                                      void *reg_hva,
                                      uint64_t reg_gpa,
                                      uint64_t reg_size)
{
    int i, j;
    int r = 0;
    const char *part_name[] = {
        "descriptor table",
        "available ring",
        "used ring"
    };

    if (vhost_dev_has_iommu(dev)) {
        return 0;
    }

    for (i = 0; i < dev->nvqs; ++i) {
        struct vhost_virtqueue *vq = dev->vqs + i;

        if (vq->desc_phys == 0) {
            continue;
        }

        j = 0;
        r = vhost_verify_ring_part_mapping(
                vq->desc, vq->desc_phys, vq->desc_size,
                reg_hva, reg_gpa, reg_size);
        if (r) {
            break;
        }

        j++;
        r = vhost_verify_ring_part_mapping(
                vq->avail, vq->avail_phys, vq->avail_size,
                reg_hva, reg_gpa, reg_size);
        if (r) {
            break;
        }

        j++;
        r = vhost_verify_ring_part_mapping(
                vq->used, vq->used_phys, vq->used_size,
                reg_hva, reg_gpa, reg_size);
        if (r) {
            break;
        }
    }

    if (r == -ENOMEM) {
        error_report("Unable to map %s for ring %d", part_name[j], i);
    } else if (r == -EBUSY) {
        error_report("%s relocated for ring %d", part_name[j], i);
    }
    return r;
}

/*
 * vhost_section: identify sections needed for vhost access
 *
 * We only care about RAM sections here (where virtqueue and guest
 * internals accessed by virtio might live).
 */
static bool vhost_section(struct vhost_dev *dev, MemoryRegionSection *section)
{
    MemoryRegion *mr = section->mr;

    if (memory_region_is_ram(mr) && !memory_region_is_rom(mr)) {
        uint8_t dirty_mask = memory_region_get_dirty_log_mask(mr);
        uint8_t handled_dirty;

        /*
         * Kernel-based vhost doesn't handle any block which is doing
         * dirty-tracking other than migration, for which it has
         * specific logging support. However, for TCG the kernel never
         * gets involved anyway, so we can also ignore its
         * self-modifying code detection flags. However, a vhost-user
         * client could still confuse a TCG guest if it re-writes
         * executable memory that has already been translated.
         */
        handled_dirty = (1 << DIRTY_MEMORY_MIGRATION) |
            (1 << DIRTY_MEMORY_CODE);

        if (dirty_mask & ~handled_dirty) {
            trace_vhost_reject_section(mr->name, 1);
            return false;
        }

        /*
         * Some backends (like vhost-user) can only handle memory regions
         * that have an fd (can be mapped into a different process). Filter
         * the ones without an fd out, if requested.
         *
         * TODO: we might have to limit to MAP_SHARED as well.
         */
        if (memory_region_get_fd(section->mr) < 0 &&
            dev->vhost_ops->vhost_backend_no_private_memslots &&
            dev->vhost_ops->vhost_backend_no_private_memslots(dev)) {
            trace_vhost_reject_section(mr->name, 2);
            return false;
        }

        trace_vhost_section(mr->name);
        return true;
    } else {
        trace_vhost_reject_section(mr->name, 3);
        return false;
    }
}

static void vhost_begin(MemoryListener *listener)
{
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
                                         memory_listener);
    dev->tmp_sections = NULL;
    dev->n_tmp_sections = 0;
}

static void vhost_commit(MemoryListener *listener)
{
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
                                         memory_listener);
    MemoryRegionSection *old_sections;
    int n_old_sections;
    uint64_t log_size;
    size_t regions_size;
    int r;
    int i;
    bool changed = false;

    /* Note we can be called before the device is started, but then
     * starting the device calls set_mem_table, so we need to have
     * built the data structures.
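     * (Editor's note: memory_listener_register() in vhost_dev_init()
     * replays the current memory map, so these lists already exist by
     * the time vhost_dev_start() pushes dev->mem to the backend.)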
562 */ 563 old_sections = dev->mem_sections; 564 n_old_sections = dev->n_mem_sections; 565 dev->mem_sections = dev->tmp_sections; 566 dev->n_mem_sections = dev->n_tmp_sections; 567 568 if (dev->n_mem_sections != n_old_sections) { 569 changed = true; 570 } else { 571 /* Same size, lets check the contents */ 572 for (i = 0; i < n_old_sections; i++) { 573 if (!MemoryRegionSection_eq(&old_sections[i], 574 &dev->mem_sections[i])) { 575 changed = true; 576 break; 577 } 578 } 579 } 580 581 trace_vhost_commit(dev->started, changed); 582 if (!changed) { 583 goto out; 584 } 585 586 /* Rebuild the regions list from the new sections list */ 587 regions_size = offsetof(struct vhost_memory, regions) + 588 dev->n_mem_sections * sizeof dev->mem->regions[0]; 589 dev->mem = g_realloc(dev->mem, regions_size); 590 dev->mem->nregions = dev->n_mem_sections; 591 592 if (dev->vhost_ops->vhost_backend_no_private_memslots && 593 dev->vhost_ops->vhost_backend_no_private_memslots(dev)) { 594 used_shared_memslots = dev->mem->nregions; 595 } else { 596 used_memslots = dev->mem->nregions; 597 } 598 599 for (i = 0; i < dev->n_mem_sections; i++) { 600 struct vhost_memory_region *cur_vmr = dev->mem->regions + i; 601 struct MemoryRegionSection *mrs = dev->mem_sections + i; 602 603 cur_vmr->guest_phys_addr = mrs->offset_within_address_space; 604 cur_vmr->memory_size = int128_get64(mrs->size); 605 cur_vmr->userspace_addr = 606 (uintptr_t)memory_region_get_ram_ptr(mrs->mr) + 607 mrs->offset_within_region; 608 cur_vmr->flags_padding = 0; 609 } 610 611 if (!dev->started) { 612 goto out; 613 } 614 615 for (i = 0; i < dev->mem->nregions; i++) { 616 if (vhost_verify_ring_mappings(dev, 617 (void *)(uintptr_t)dev->mem->regions[i].userspace_addr, 618 dev->mem->regions[i].guest_phys_addr, 619 dev->mem->regions[i].memory_size)) { 620 error_report("Verify ring failure on region %d", i); 621 abort(); 622 } 623 } 624 625 if (!dev->log_enabled) { 626 r = dev->vhost_ops->vhost_set_mem_table(dev, dev->mem); 627 if (r < 0) { 628 VHOST_OPS_DEBUG(r, "vhost_set_mem_table failed"); 629 } 630 goto out; 631 } 632 log_size = vhost_get_log_size(dev); 633 /* We allocate an extra 4K bytes to log, 634 * to reduce the * number of reallocations. */ 635 #define VHOST_LOG_BUFFER (0x1000 / sizeof *dev->log) 636 /* To log more, must increase log size before table update. */ 637 if (dev->log_size < log_size) { 638 vhost_dev_log_resize(dev, log_size + VHOST_LOG_BUFFER); 639 } 640 r = dev->vhost_ops->vhost_set_mem_table(dev, dev->mem); 641 if (r < 0) { 642 VHOST_OPS_DEBUG(r, "vhost_set_mem_table failed"); 643 } 644 /* To log less, can only decrease log size after table update. */ 645 if (dev->log_size > log_size + VHOST_LOG_BUFFER) { 646 vhost_dev_log_resize(dev, log_size); 647 } 648 649 out: 650 /* Deref the old list of sections, this must happen _after_ the 651 * vhost_set_mem_table to ensure the client isn't still using the 652 * section we're about to unref. 653 */ 654 while (n_old_sections--) { 655 memory_region_unref(old_sections[n_old_sections].mr); 656 } 657 g_free(old_sections); 658 return; 659 } 660 661 /* Adds the section data to the tmp_section structure. 662 * It relies on the listener calling us in memory address order 663 * and for each region (via the _add and _nop methods) to 664 * join neighbours. 
665 */ 666 static void vhost_region_add_section(struct vhost_dev *dev, 667 MemoryRegionSection *section) 668 { 669 bool need_add = true; 670 uint64_t mrs_size = int128_get64(section->size); 671 uint64_t mrs_gpa = section->offset_within_address_space; 672 uintptr_t mrs_host = (uintptr_t)memory_region_get_ram_ptr(section->mr) + 673 section->offset_within_region; 674 RAMBlock *mrs_rb = section->mr->ram_block; 675 676 trace_vhost_region_add_section(section->mr->name, mrs_gpa, mrs_size, 677 mrs_host); 678 679 if (dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER) { 680 /* Round the section to it's page size */ 681 /* First align the start down to a page boundary */ 682 size_t mrs_page = qemu_ram_pagesize(mrs_rb); 683 uint64_t alignage = mrs_host & (mrs_page - 1); 684 if (alignage) { 685 mrs_host -= alignage; 686 mrs_size += alignage; 687 mrs_gpa -= alignage; 688 } 689 /* Now align the size up to a page boundary */ 690 alignage = mrs_size & (mrs_page - 1); 691 if (alignage) { 692 mrs_size += mrs_page - alignage; 693 } 694 trace_vhost_region_add_section_aligned(section->mr->name, mrs_gpa, 695 mrs_size, mrs_host); 696 } 697 698 if (dev->n_tmp_sections) { 699 /* Since we already have at least one section, lets see if 700 * this extends it; since we're scanning in order, we only 701 * have to look at the last one, and the FlatView that calls 702 * us shouldn't have overlaps. 703 */ 704 MemoryRegionSection *prev_sec = dev->tmp_sections + 705 (dev->n_tmp_sections - 1); 706 uint64_t prev_gpa_start = prev_sec->offset_within_address_space; 707 uint64_t prev_size = int128_get64(prev_sec->size); 708 uint64_t prev_gpa_end = range_get_last(prev_gpa_start, prev_size); 709 uint64_t prev_host_start = 710 (uintptr_t)memory_region_get_ram_ptr(prev_sec->mr) + 711 prev_sec->offset_within_region; 712 uint64_t prev_host_end = range_get_last(prev_host_start, prev_size); 713 714 if (mrs_gpa <= (prev_gpa_end + 1)) { 715 /* OK, looks like overlapping/intersecting - it's possible that 716 * the rounding to page sizes has made them overlap, but they should 717 * match up in the same RAMBlock if they do. 
718 */ 719 if (mrs_gpa < prev_gpa_start) { 720 error_report("%s:Section '%s' rounded to %"PRIx64 721 " prior to previous '%s' %"PRIx64, 722 __func__, section->mr->name, mrs_gpa, 723 prev_sec->mr->name, prev_gpa_start); 724 /* A way to cleanly fail here would be better */ 725 return; 726 } 727 /* Offset from the start of the previous GPA to this GPA */ 728 size_t offset = mrs_gpa - prev_gpa_start; 729 730 if (prev_host_start + offset == mrs_host && 731 section->mr == prev_sec->mr) { 732 uint64_t max_end = MAX(prev_host_end, mrs_host + mrs_size); 733 need_add = false; 734 prev_sec->offset_within_address_space = 735 MIN(prev_gpa_start, mrs_gpa); 736 prev_sec->offset_within_region = 737 MIN(prev_host_start, mrs_host) - 738 (uintptr_t)memory_region_get_ram_ptr(prev_sec->mr); 739 prev_sec->size = int128_make64(max_end - MIN(prev_host_start, 740 mrs_host)); 741 trace_vhost_region_add_section_merge(section->mr->name, 742 int128_get64(prev_sec->size), 743 prev_sec->offset_within_address_space, 744 prev_sec->offset_within_region); 745 } else { 746 /* adjoining regions are fine, but overlapping ones with 747 * different blocks/offsets shouldn't happen 748 */ 749 if (mrs_gpa != prev_gpa_end + 1) { 750 error_report("%s: Overlapping but not coherent sections " 751 "at %"PRIx64, 752 __func__, mrs_gpa); 753 return; 754 } 755 } 756 } 757 } 758 759 if (need_add) { 760 ++dev->n_tmp_sections; 761 dev->tmp_sections = g_renew(MemoryRegionSection, dev->tmp_sections, 762 dev->n_tmp_sections); 763 dev->tmp_sections[dev->n_tmp_sections - 1] = *section; 764 /* The flatview isn't stable and we don't use it, making it NULL 765 * means we can memcmp the list. 766 */ 767 dev->tmp_sections[dev->n_tmp_sections - 1].fv = NULL; 768 memory_region_ref(section->mr); 769 } 770 } 771 772 /* Used for both add and nop callbacks */ 773 static void vhost_region_addnop(MemoryListener *listener, 774 MemoryRegionSection *section) 775 { 776 struct vhost_dev *dev = container_of(listener, struct vhost_dev, 777 memory_listener); 778 779 if (!vhost_section(dev, section)) { 780 return; 781 } 782 vhost_region_add_section(dev, section); 783 } 784 785 static void vhost_iommu_unmap_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) 786 { 787 struct vhost_iommu *iommu = container_of(n, struct vhost_iommu, n); 788 struct vhost_dev *hdev = iommu->hdev; 789 hwaddr iova = iotlb->iova + iommu->iommu_offset; 790 791 if (vhost_backend_invalidate_device_iotlb(hdev, iova, 792 iotlb->addr_mask + 1)) { 793 error_report("Fail to invalidate device iotlb"); 794 } 795 } 796 797 static void vhost_iommu_region_add(MemoryListener *listener, 798 MemoryRegionSection *section) 799 { 800 struct vhost_dev *dev = container_of(listener, struct vhost_dev, 801 iommu_listener); 802 struct vhost_iommu *iommu; 803 Int128 end; 804 int iommu_idx; 805 IOMMUMemoryRegion *iommu_mr; 806 807 if (!memory_region_is_iommu(section->mr)) { 808 return; 809 } 810 811 iommu_mr = IOMMU_MEMORY_REGION(section->mr); 812 813 iommu = g_malloc0(sizeof(*iommu)); 814 end = int128_add(int128_make64(section->offset_within_region), 815 section->size); 816 end = int128_sub(end, int128_one()); 817 iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr, 818 MEMTXATTRS_UNSPECIFIED); 819 iommu_notifier_init(&iommu->n, vhost_iommu_unmap_notify, 820 dev->vdev->device_iotlb_enabled ? 
                            IOMMU_NOTIFIER_DEVIOTLB_UNMAP :
                            IOMMU_NOTIFIER_UNMAP,
                        section->offset_within_region,
                        int128_get64(end),
                        iommu_idx);
    iommu->mr = section->mr;
    iommu->iommu_offset = section->offset_within_address_space -
                          section->offset_within_region;
    iommu->hdev = dev;
    memory_region_register_iommu_notifier(section->mr, &iommu->n,
                                          &error_fatal);
    QLIST_INSERT_HEAD(&dev->iommu_list, iommu, iommu_next);
    /* TODO: can replay help performance here? */
}

static void vhost_iommu_region_del(MemoryListener *listener,
                                   MemoryRegionSection *section)
{
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
                                         iommu_listener);
    struct vhost_iommu *iommu;

    if (!memory_region_is_iommu(section->mr)) {
        return;
    }

    QLIST_FOREACH(iommu, &dev->iommu_list, iommu_next) {
        if (iommu->mr == section->mr &&
            iommu->n.start == section->offset_within_region) {
            memory_region_unregister_iommu_notifier(iommu->mr,
                                                    &iommu->n);
            QLIST_REMOVE(iommu, iommu_next);
            g_free(iommu);
            break;
        }
    }
}

void vhost_toggle_device_iotlb(VirtIODevice *vdev)
{
    VirtioDeviceClass *vdc = VIRTIO_DEVICE_GET_CLASS(vdev);
    struct vhost_dev *dev;
    struct vhost_iommu *iommu;

    if (vdev->vhost_started) {
        dev = vdc->get_vhost(vdev);
    } else {
        return;
    }

    QLIST_FOREACH(iommu, &dev->iommu_list, iommu_next) {
        memory_region_unregister_iommu_notifier(iommu->mr, &iommu->n);
        iommu->n.notifier_flags = vdev->device_iotlb_enabled ?
            IOMMU_NOTIFIER_DEVIOTLB_UNMAP : IOMMU_NOTIFIER_UNMAP;
        memory_region_register_iommu_notifier(iommu->mr, &iommu->n,
                                              &error_fatal);
    }
}

static int vhost_virtqueue_set_addr(struct vhost_dev *dev,
                                    struct vhost_virtqueue *vq,
                                    unsigned idx, bool enable_log)
{
    struct vhost_vring_addr addr;
    int r;
    memset(&addr, 0, sizeof(struct vhost_vring_addr));

    if (dev->vhost_ops->vhost_vq_get_addr) {
        r = dev->vhost_ops->vhost_vq_get_addr(dev, &addr, vq);
        if (r < 0) {
            VHOST_OPS_DEBUG(r, "vhost_vq_get_addr failed");
            return r;
        }
    } else {
        addr.desc_user_addr = (uint64_t)(unsigned long)vq->desc;
        addr.avail_user_addr = (uint64_t)(unsigned long)vq->avail;
        addr.used_user_addr = (uint64_t)(unsigned long)vq->used;
    }
    addr.index = idx;
    addr.log_guest_addr = vq->used_phys;
    addr.flags = enable_log ? (1 << VHOST_VRING_F_LOG) : 0;
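    /*
     * Editor's note: VHOST_VRING_F_LOG asks the backend to dirty-log writes
     * to this ring's used area, with log_guest_addr (set above to the used
     * ring's guest physical address) as the address used for logging.
     */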
    r = dev->vhost_ops->vhost_set_vring_addr(dev, &addr);
    if (r < 0) {
        VHOST_OPS_DEBUG(r, "vhost_set_vring_addr failed");
    }
    return r;
}

static int vhost_dev_set_features(struct vhost_dev *dev,
                                  bool enable_log)
{
    uint64_t features = dev->acked_features;
    int r;
    if (enable_log) {
        features |= 0x1ULL << VHOST_F_LOG_ALL;
    }
    if (!vhost_dev_has_iommu(dev)) {
        features &= ~(0x1ULL << VIRTIO_F_IOMMU_PLATFORM);
    }
    if (dev->vhost_ops->vhost_force_iommu) {
        if (dev->vhost_ops->vhost_force_iommu(dev) == true) {
            features |= 0x1ULL << VIRTIO_F_IOMMU_PLATFORM;
        }
    }
    r = dev->vhost_ops->vhost_set_features(dev, features);
    if (r < 0) {
        VHOST_OPS_DEBUG(r, "vhost_set_features failed");
        goto out;
    }
    if (dev->vhost_ops->vhost_set_backend_cap) {
        r = dev->vhost_ops->vhost_set_backend_cap(dev);
        if (r < 0) {
            VHOST_OPS_DEBUG(r, "vhost_set_backend_cap failed");
            goto out;
        }
    }

out:
    return r;
}

static int vhost_dev_set_log(struct vhost_dev *dev, bool enable_log)
{
    int r, i, idx;
    hwaddr addr;

    r = vhost_dev_set_features(dev, enable_log);
    if (r < 0) {
        goto err_features;
    }
    for (i = 0; i < dev->nvqs; ++i) {
        idx = dev->vhost_ops->vhost_get_vq_index(dev, dev->vq_index + i);
        addr = virtio_queue_get_desc_addr(dev->vdev, idx);
        if (!addr) {
            /*
             * The queue might not be ready for start. If this
             * is the case there is no reason to continue the process.
             * Similar logic is used by the vhost_virtqueue_start()
             * routine.
             */
            continue;
        }
        r = vhost_virtqueue_set_addr(dev, dev->vqs + i, idx,
                                     enable_log);
        if (r < 0) {
            goto err_vq;
        }
    }
    return 0;
err_vq:
    for (; i >= 0; --i) {
        idx = dev->vhost_ops->vhost_get_vq_index(dev, dev->vq_index + i);
        addr = virtio_queue_get_desc_addr(dev->vdev, idx);
        if (!addr) {
            continue;
        }
        vhost_virtqueue_set_addr(dev, dev->vqs + i, idx,
                                 dev->log_enabled);
    }
    vhost_dev_set_features(dev, dev->log_enabled);
err_features:
    return r;
}

static int vhost_migration_log(MemoryListener *listener, bool enable)
{
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
                                         memory_listener);
    int r;
    if (enable == dev->log_enabled) {
        return 0;
    }
    if (!dev->started) {
        dev->log_enabled = enable;
        return 0;
    }

    r = 0;
    if (!enable) {
        r = vhost_dev_set_log(dev, false);
        if (r < 0) {
            goto check_dev_state;
        }
        vhost_log_put(dev, false);
    } else {
        vhost_dev_log_resize(dev, vhost_get_log_size(dev));
        r = vhost_dev_set_log(dev, true);
        if (r < 0) {
            goto check_dev_state;
        }
    }

check_dev_state:
    dev->log_enabled = enable;
    /*
     * vhost-user-* devices could change their state during log
     * initialization due to disconnect. So check dev state after
     * vhost communication.
     */
    if (!dev->started) {
        /*
         * Since the device is in the stopped state, it is okay for
         * migration. Return success.
         */
        r = 0;
    }
    if (r) {
        /* An error occurred. */
        dev->log_enabled = false;
    }

    return r;
}

static void vhost_log_global_start(MemoryListener *listener)
{
    int r;

    r = vhost_migration_log(listener, true);
    if (r < 0) {
        abort();
    }
}

static void vhost_log_global_stop(MemoryListener *listener)
{
    int r;

    r = vhost_migration_log(listener, false);
    if (r < 0) {
        abort();
    }
}

static void vhost_log_start(MemoryListener *listener,
                            MemoryRegionSection *section,
                            int old, int new)
{
    /* FIXME: implement */
}

static void vhost_log_stop(MemoryListener *listener,
                           MemoryRegionSection *section,
                           int old, int new)
{
    /* FIXME: implement */
}

/* The vhost driver natively knows how to handle the vrings of non
 * cross-endian legacy devices and modern devices. Only legacy devices
 * exposed to a bi-endian guest may require the vhost driver to use a
 * specific endianness.
 */
static inline bool vhost_needs_vring_endian(VirtIODevice *vdev)
{
    if (virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) {
        return false;
    }
#if HOST_BIG_ENDIAN
    return vdev->device_endian == VIRTIO_DEVICE_ENDIAN_LITTLE;
#else
    return vdev->device_endian == VIRTIO_DEVICE_ENDIAN_BIG;
#endif
}

static int vhost_virtqueue_set_vring_endian_legacy(struct vhost_dev *dev,
                                                   bool is_big_endian,
                                                   int vhost_vq_index)
{
    int r;
    struct vhost_vring_state s = {
        .index = vhost_vq_index,
        .num = is_big_endian
    };

    r = dev->vhost_ops->vhost_set_vring_endian(dev, &s);
    if (r < 0) {
        VHOST_OPS_DEBUG(r, "vhost_set_vring_endian failed");
    }
    return r;
}

static int vhost_memory_region_lookup(struct vhost_dev *hdev,
                                      uint64_t gpa, uint64_t *uaddr,
                                      uint64_t *len)
{
    int i;

    for (i = 0; i < hdev->mem->nregions; i++) {
        struct vhost_memory_region *reg = hdev->mem->regions + i;

        if (gpa >= reg->guest_phys_addr &&
            reg->guest_phys_addr + reg->memory_size > gpa) {
            *uaddr = reg->userspace_addr + gpa - reg->guest_phys_addr;
            *len = reg->guest_phys_addr + reg->memory_size - gpa;
            return 0;
        }
    }

    return -EFAULT;
}

int vhost_device_iotlb_miss(struct vhost_dev *dev, uint64_t iova, int write)
{
    IOMMUTLBEntry iotlb;
    uint64_t uaddr, len;
    int ret = -EFAULT;

    RCU_READ_LOCK_GUARD();

    trace_vhost_iotlb_miss(dev, 1);

    iotlb = address_space_get_iotlb_entry(dev->vdev->dma_as,
                                          iova, write,
                                          MEMTXATTRS_UNSPECIFIED);
    if (iotlb.target_as != NULL) {
        ret = vhost_memory_region_lookup(dev, iotlb.translated_addr,
                                         &uaddr, &len);
        if (ret) {
            trace_vhost_iotlb_miss(dev, 3);
            error_report("Fail to lookup the translated address "
                         "%"PRIx64, iotlb.translated_addr);
            goto out;
        }

        len = MIN(iotlb.addr_mask + 1, len);
        iova = iova & ~iotlb.addr_mask;

        ret = vhost_backend_update_device_iotlb(dev, iova, uaddr,
                                                len, iotlb.perm);
        if (ret) {
            trace_vhost_iotlb_miss(dev, 4);
            error_report("Fail to update device iotlb");
            goto out;
        }
    }

    trace_vhost_iotlb_miss(dev, 2);

out:
    return ret;
}

int vhost_virtqueue_start(struct vhost_dev *dev,
                          struct VirtIODevice *vdev,
                          struct vhost_virtqueue *vq,
                          unsigned idx)
{
    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
    VirtioBusState *vbus = VIRTIO_BUS(qbus);
    VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(vbus);
    hwaddr s, l, a;
    int r;
    int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, idx);
    struct vhost_vring_file file = {
        .index = vhost_vq_index
    };
    struct vhost_vring_state state = {
        .index = vhost_vq_index
    };
    struct VirtQueue *vvq = virtio_get_queue(vdev, idx);

    a = virtio_queue_get_desc_addr(vdev, idx);
    if (a == 0) {
        /* Queue might not be ready for start */
        return 0;
    }

    vq->num = state.num = virtio_queue_get_num(vdev, idx);
    r = dev->vhost_ops->vhost_set_vring_num(dev, &state);
    if (r) {
        VHOST_OPS_DEBUG(r, "vhost_set_vring_num failed");
        return r;
    }

    state.num = virtio_queue_get_last_avail_idx(vdev, idx);
    r = dev->vhost_ops->vhost_set_vring_base(dev, &state);
    if (r) {
        VHOST_OPS_DEBUG(r, "vhost_set_vring_base failed");
        return r;
    }

    if (vhost_needs_vring_endian(vdev)) {
        r = vhost_virtqueue_set_vring_endian_legacy(dev,
                                                    virtio_is_big_endian(vdev),
                                                    vhost_vq_index);
        if (r) {
            return r;
        }
    }

    vq->desc_size = s = l = virtio_queue_get_desc_size(vdev, idx);
    vq->desc_phys = a;
    vq->desc = vhost_memory_map(dev, a, &l, false);
    if (!vq->desc || l != s) {
        r = -ENOMEM;
        goto fail_alloc_desc;
    }
    vq->avail_size = s = l = virtio_queue_get_avail_size(vdev, idx);
    vq->avail_phys = a = virtio_queue_get_avail_addr(vdev, idx);
    vq->avail = vhost_memory_map(dev, a, &l, false);
    if (!vq->avail || l != s) {
        r = -ENOMEM;
        goto fail_alloc_avail;
    }
    vq->used_size = s = l = virtio_queue_get_used_size(vdev, idx);
    vq->used_phys = a = virtio_queue_get_used_addr(vdev, idx);
    vq->used = vhost_memory_map(dev, a, &l, true);
    if (!vq->used || l != s) {
        r = -ENOMEM;
        goto fail_alloc_used;
    }

    r = vhost_virtqueue_set_addr(dev, vq, vhost_vq_index, dev->log_enabled);
    if (r < 0) {
        goto fail_alloc;
    }

    file.fd = event_notifier_get_fd(virtio_queue_get_host_notifier(vvq));
    r = dev->vhost_ops->vhost_set_vring_kick(dev, &file);
    if (r) {
        VHOST_OPS_DEBUG(r, "vhost_set_vring_kick failed");
        goto fail_kick;
    }

    /* Clear and discard previous events if any. */
    event_notifier_test_and_clear(&vq->masked_notifier);

    /* Init vring in unmasked state, unless guest_notifier_mask
     * will do it later.
     */
    if (!vdev->use_guest_notifier_mask) {
        /* TODO: check and handle errors. */
        vhost_virtqueue_mask(dev, vdev, idx, false);
    }

    if (k->query_guest_notifiers &&
        k->query_guest_notifiers(qbus->parent) &&
        virtio_queue_vector(vdev, idx) == VIRTIO_NO_VECTOR) {
        file.fd = -1;
        r = dev->vhost_ops->vhost_set_vring_call(dev, &file);
        if (r) {
            goto fail_vector;
        }
    }

    return 0;

fail_vector:
fail_kick:
fail_alloc:
    vhost_memory_unmap(dev, vq->used, virtio_queue_get_used_size(vdev, idx),
                       0, 0);
fail_alloc_used:
    vhost_memory_unmap(dev, vq->avail, virtio_queue_get_avail_size(vdev, idx),
                       0, 0);
fail_alloc_avail:
    vhost_memory_unmap(dev, vq->desc, virtio_queue_get_desc_size(vdev, idx),
                       0, 0);
fail_alloc_desc:
    return r;
}

void vhost_virtqueue_stop(struct vhost_dev *dev,
                          struct VirtIODevice *vdev,
                          struct vhost_virtqueue *vq,
                          unsigned idx)
{
    int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, idx);
    struct vhost_vring_state state = {
        .index = vhost_vq_index,
    };
    int r;

    if (virtio_queue_get_desc_addr(vdev, idx) == 0) {
        /* Don't stop the virtqueue which might have not been started */
        return;
    }

    r = dev->vhost_ops->vhost_get_vring_base(dev, &state);
    if (r < 0) {
        VHOST_OPS_DEBUG(r, "vhost VQ %u ring restore failed: %d", idx, r);
        /* Connection to the backend is broken, so let's sync internal
         * last avail idx to the device used idx.
         */
        virtio_queue_restore_last_avail_idx(vdev, idx);
    } else {
        virtio_queue_set_last_avail_idx(vdev, idx, state.num);
    }
    virtio_queue_invalidate_signalled_used(vdev, idx);
    virtio_queue_update_used_idx(vdev, idx);

    /* In the cross-endian case, we need to reset the vring endianness to
     * native, as legacy devices expect by default.
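     * (Editor's example: a legacy cross-endian setup is, for instance, a
     * big-endian guest using a pre-1.0 virtio device on a little-endian
     * host.)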
     */
    if (vhost_needs_vring_endian(vdev)) {
        vhost_virtqueue_set_vring_endian_legacy(dev,
                                                !virtio_is_big_endian(vdev),
                                                vhost_vq_index);
    }

    vhost_memory_unmap(dev, vq->used, virtio_queue_get_used_size(vdev, idx),
                       1, virtio_queue_get_used_size(vdev, idx));
    vhost_memory_unmap(dev, vq->avail, virtio_queue_get_avail_size(vdev, idx),
                       0, virtio_queue_get_avail_size(vdev, idx));
    vhost_memory_unmap(dev, vq->desc, virtio_queue_get_desc_size(vdev, idx),
                       0, virtio_queue_get_desc_size(vdev, idx));
}

static int vhost_virtqueue_set_busyloop_timeout(struct vhost_dev *dev,
                                                int n, uint32_t timeout)
{
    int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, n);
    struct vhost_vring_state state = {
        .index = vhost_vq_index,
        .num = timeout,
    };
    int r;

    if (!dev->vhost_ops->vhost_set_vring_busyloop_timeout) {
        return -EINVAL;
    }

    r = dev->vhost_ops->vhost_set_vring_busyloop_timeout(dev, &state);
    if (r) {
        VHOST_OPS_DEBUG(r, "vhost_set_vring_busyloop_timeout failed");
        return r;
    }

    return 0;
}

static void vhost_virtqueue_error_notifier(EventNotifier *n)
{
    struct vhost_virtqueue *vq = container_of(n, struct vhost_virtqueue,
                                              error_notifier);
    struct vhost_dev *dev = vq->dev;
    int index = vq - dev->vqs;

    if (event_notifier_test_and_clear(n) && dev->vdev) {
        VHOST_OPS_DEBUG(-EINVAL, "vhost vring error in virtqueue %d",
                        dev->vq_index + index);
    }
}

static int vhost_virtqueue_init(struct vhost_dev *dev,
                                struct vhost_virtqueue *vq, int n)
{
    int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, n);
    struct vhost_vring_file file = {
        .index = vhost_vq_index,
    };
    int r = event_notifier_init(&vq->masked_notifier, 0);
    if (r < 0) {
        return r;
    }

    file.fd = event_notifier_get_wfd(&vq->masked_notifier);
    r = dev->vhost_ops->vhost_set_vring_call(dev, &file);
    if (r) {
        VHOST_OPS_DEBUG(r, "vhost_set_vring_call failed");
        goto fail_call;
    }

    vq->dev = dev;

    if (dev->vhost_ops->vhost_set_vring_err) {
        r = event_notifier_init(&vq->error_notifier, 0);
        if (r < 0) {
            goto fail_call;
        }

        file.fd = event_notifier_get_fd(&vq->error_notifier);
        r = dev->vhost_ops->vhost_set_vring_err(dev, &file);
        if (r) {
            VHOST_OPS_DEBUG(r, "vhost_set_vring_err failed");
            goto fail_err;
        }

        event_notifier_set_handler(&vq->error_notifier,
                                   vhost_virtqueue_error_notifier);
    }

    return 0;

fail_err:
    event_notifier_cleanup(&vq->error_notifier);
fail_call:
    event_notifier_cleanup(&vq->masked_notifier);
    return r;
}

static void vhost_virtqueue_cleanup(struct vhost_virtqueue *vq)
{
    event_notifier_cleanup(&vq->masked_notifier);
    if (vq->dev->vhost_ops->vhost_set_vring_err) {
        event_notifier_set_handler(&vq->error_notifier, NULL);
        event_notifier_cleanup(&vq->error_notifier);
    }
}

int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
                   VhostBackendType backend_type, uint32_t busyloop_timeout,
                   Error **errp)
{
    unsigned int used;
    uint64_t features;
    int i, r, n_initialized_vqs = 0;

    hdev->vdev = NULL;
    hdev->migration_blocker = NULL;

    r = vhost_set_backend_type(hdev, backend_type);
    assert(r >= 0);

    r = hdev->vhost_ops->vhost_backend_init(hdev, opaque, errp);
    if (r < 0) {
        goto fail;
    }

    r = hdev->vhost_ops->vhost_set_owner(hdev);
    if (r < 0) {
        error_setg_errno(errp, -r, "vhost_set_owner failed");
        goto fail;
    }

    r = hdev->vhost_ops->vhost_get_features(hdev, &features);
    if (r < 0) {
        error_setg_errno(errp, -r, "vhost_get_features failed");
        goto fail;
    }

    for (i = 0; i < hdev->nvqs; ++i, ++n_initialized_vqs) {
        r = vhost_virtqueue_init(hdev, hdev->vqs + i, hdev->vq_index + i);
        if (r < 0) {
            error_setg_errno(errp, -r, "Failed to initialize virtqueue %d", i);
            goto fail;
        }
    }

    if (busyloop_timeout) {
        for (i = 0; i < hdev->nvqs; ++i) {
            r = vhost_virtqueue_set_busyloop_timeout(hdev, hdev->vq_index + i,
                                                     busyloop_timeout);
            if (r < 0) {
                error_setg_errno(errp, -r, "Failed to set busyloop timeout");
                goto fail_busyloop;
            }
        }
    }

    hdev->features = features;

    hdev->memory_listener = (MemoryListener) {
        .name = "vhost",
        .begin = vhost_begin,
        .commit = vhost_commit,
        .region_add = vhost_region_addnop,
        .region_nop = vhost_region_addnop,
        .log_start = vhost_log_start,
        .log_stop = vhost_log_stop,
        .log_sync = vhost_log_sync,
        .log_global_start = vhost_log_global_start,
        .log_global_stop = vhost_log_global_stop,
        .priority = MEMORY_LISTENER_PRIORITY_DEV_BACKEND
    };

    hdev->iommu_listener = (MemoryListener) {
        .name = "vhost-iommu",
        .region_add = vhost_iommu_region_add,
        .region_del = vhost_iommu_region_del,
    };

    if (hdev->migration_blocker == NULL) {
        if (!(hdev->features & (0x1ULL << VHOST_F_LOG_ALL))) {
            error_setg(&hdev->migration_blocker,
                       "Migration disabled: vhost lacks VHOST_F_LOG_ALL feature.");
        } else if (vhost_dev_log_is_shared(hdev) && !qemu_memfd_alloc_check()) {
            error_setg(&hdev->migration_blocker,
                       "Migration disabled: failed to allocate shared memory");
        }
    }

    if (hdev->migration_blocker != NULL) {
        r = migrate_add_blocker(hdev->migration_blocker, errp);
        if (r < 0) {
            error_free(hdev->migration_blocker);
            goto fail_busyloop;
        }
    }

    hdev->mem = g_malloc0(offsetof(struct vhost_memory, regions));
    hdev->n_mem_sections = 0;
    hdev->mem_sections = NULL;
    hdev->log = NULL;
    hdev->log_size = 0;
    hdev->log_enabled = false;
    hdev->started = false;
    memory_listener_register(&hdev->memory_listener, &address_space_memory);
    QLIST_INSERT_HEAD(&vhost_devices, hdev, entry);

    /*
     * The listener we registered properly updated the corresponding counter.
     * So we can trust that these values are accurate.
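     * (Editor's note: registering the listener above replayed the current
     * memory map, and vhost_commit() updated used_memslots or
     * used_shared_memslots as part of that replay.)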
     */
    if (hdev->vhost_ops->vhost_backend_no_private_memslots &&
        hdev->vhost_ops->vhost_backend_no_private_memslots(hdev)) {
        used = used_shared_memslots;
    } else {
        used = used_memslots;
    }
    if (used > hdev->vhost_ops->vhost_backend_memslots_limit(hdev)) {
        error_setg(errp, "vhost backend memory slots limit is less"
                   " than current number of present memory slots");
        r = -EINVAL;
        goto fail_busyloop;
    }

    return 0;

fail_busyloop:
    if (busyloop_timeout) {
        while (--i >= 0) {
            vhost_virtqueue_set_busyloop_timeout(hdev, hdev->vq_index + i, 0);
        }
    }
fail:
    hdev->nvqs = n_initialized_vqs;
    vhost_dev_cleanup(hdev);
    return r;
}

void vhost_dev_cleanup(struct vhost_dev *hdev)
{
    int i;

    trace_vhost_dev_cleanup(hdev);

    for (i = 0; i < hdev->nvqs; ++i) {
        vhost_virtqueue_cleanup(hdev->vqs + i);
    }
    if (hdev->mem) {
        /* those are only safe after successful init */
        memory_listener_unregister(&hdev->memory_listener);
        QLIST_REMOVE(hdev, entry);
    }
    if (hdev->migration_blocker) {
        migrate_del_blocker(hdev->migration_blocker);
        error_free(hdev->migration_blocker);
    }
    g_free(hdev->mem);
    g_free(hdev->mem_sections);
    if (hdev->vhost_ops) {
        hdev->vhost_ops->vhost_backend_cleanup(hdev);
    }
    assert(!hdev->log);

    memset(hdev, 0, sizeof(struct vhost_dev));
}

static void vhost_dev_disable_notifiers_nvqs(struct vhost_dev *hdev,
                                             VirtIODevice *vdev,
                                             unsigned int nvqs)
{
    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
    int i, r;

    /*
     * Batch all the host notifiers in a single transaction to avoid
     * quadratic time complexity in address_space_update_ioeventfds().
     */
    memory_region_transaction_begin();

    for (i = 0; i < nvqs; ++i) {
        r = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i,
                                         false);
        if (r < 0) {
            error_report("vhost VQ %d notifier cleanup failed: %d", i, -r);
        }
        assert(r >= 0);
    }

    /*
     * The transaction expects the ioeventfds to be open when it
     * commits. Do it now, before the cleanup loop.
     */
    memory_region_transaction_commit();

    for (i = 0; i < nvqs; ++i) {
        virtio_bus_cleanup_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i);
    }
    virtio_device_release_ioeventfd(vdev);
}

/* Stop processing guest IO notifications in qemu.
 * Start processing them in vhost in kernel.
 */
int vhost_dev_enable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev)
{
    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
    int i, r;

    /* We will pass the notifiers to the kernel, make sure that QEMU
     * doesn't interfere.
     */
    r = virtio_device_grab_ioeventfd(vdev);
    if (r < 0) {
        error_report("binding does not support host notifiers");
        return r;
    }

    /*
     * Batch all the host notifiers in a single transaction to avoid
     * quadratic time complexity in address_space_update_ioeventfds().
     */
    memory_region_transaction_begin();

    for (i = 0; i < hdev->nvqs; ++i) {
        r = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i,
                                         true);
        if (r < 0) {
            error_report("vhost VQ %d notifier binding failed: %d", i, -r);
            memory_region_transaction_commit();
            vhost_dev_disable_notifiers_nvqs(hdev, vdev, i);
            return r;
        }
    }

    memory_region_transaction_commit();

    return 0;
}

/* Stop processing guest IO notifications in vhost.
 * Start processing them in qemu.
 * This might actually run the qemu handlers right away,
 * so virtio in qemu must be completely set up when this is called.
 */
void vhost_dev_disable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev)
{
    vhost_dev_disable_notifiers_nvqs(hdev, vdev, hdev->nvqs);
}

/* Test and clear event pending status.
 * Should be called after unmask to avoid losing events.
 */
bool vhost_virtqueue_pending(struct vhost_dev *hdev, int n)
{
    struct vhost_virtqueue *vq = hdev->vqs + n - hdev->vq_index;
    assert(n >= hdev->vq_index && n < hdev->vq_index + hdev->nvqs);
    return event_notifier_test_and_clear(&vq->masked_notifier);
}

/* Mask/unmask events from this vq. */
void vhost_virtqueue_mask(struct vhost_dev *hdev, VirtIODevice *vdev, int n,
                          bool mask)
{
    struct VirtQueue *vvq = virtio_get_queue(vdev, n);
    int r, index = n - hdev->vq_index;
    struct vhost_vring_file file;

    /* should only be called after backend is connected */
    assert(hdev->vhost_ops);

    if (mask) {
        assert(vdev->use_guest_notifier_mask);
        file.fd = event_notifier_get_wfd(&hdev->vqs[index].masked_notifier);
    } else {
        file.fd = event_notifier_get_wfd(virtio_queue_get_guest_notifier(vvq));
    }

    file.index = hdev->vhost_ops->vhost_get_vq_index(hdev, n);
    r = hdev->vhost_ops->vhost_set_vring_call(hdev, &file);
    if (r < 0) {
        error_report("vhost_set_vring_call failed %d", -r);
    }
}

bool vhost_config_pending(struct vhost_dev *hdev)
{
    assert(hdev->vhost_ops);
    if ((hdev->started == false) ||
        (hdev->vhost_ops->vhost_set_config_call == NULL)) {
        return false;
    }

    EventNotifier *notifier =
        &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier;
    return event_notifier_test_and_clear(notifier);
}

void vhost_config_mask(struct vhost_dev *hdev, VirtIODevice *vdev, bool mask)
{
    int fd;
    int r;
    EventNotifier *notifier =
        &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier;
    EventNotifier *config_notifier = &vdev->config_notifier;
    assert(hdev->vhost_ops);

    if ((hdev->started == false) ||
        (hdev->vhost_ops->vhost_set_config_call == NULL)) {
        return;
    }
    if (mask) {
        assert(vdev->use_guest_notifier_mask);
        fd = event_notifier_get_fd(notifier);
    } else {
        fd = event_notifier_get_fd(config_notifier);
    }
    r = hdev->vhost_ops->vhost_set_config_call(hdev, fd);
    if (r < 0) {
        error_report("vhost_set_config_call failed %d", -r);
    }
}

static void vhost_stop_config_intr(struct vhost_dev *dev)
{
    int fd = -1;
    assert(dev->vhost_ops);
    if (dev->vhost_ops->vhost_set_config_call) {
        dev->vhost_ops->vhost_set_config_call(dev, fd);
    }
}

static void vhost_start_config_intr(struct vhost_dev *dev)
{
    int r;

    assert(dev->vhost_ops);
    int fd = event_notifier_get_fd(&dev->vdev->config_notifier);
    if (dev->vhost_ops->vhost_set_config_call) {
        r = dev->vhost_ops->vhost_set_config_call(dev, fd);
        if (!r) {
            event_notifier_set(&dev->vdev->config_notifier);
        }
    }
}

uint64_t vhost_get_features(struct vhost_dev *hdev, const int *feature_bits,
                            uint64_t features)
{
    const int *bit = feature_bits;
    while (*bit != VHOST_INVALID_FEATURE_BIT) {
        uint64_t bit_mask = (1ULL << *bit);
        if (!(hdev->features & bit_mask)) {
            features &= ~bit_mask;
        }
        bit++;
    }
    return features;
}

void vhost_ack_features(struct vhost_dev *hdev, const int *feature_bits,
                        uint64_t features)
{
    const int *bit = feature_bits;
    while (*bit != VHOST_INVALID_FEATURE_BIT) {
        uint64_t bit_mask = (1ULL << *bit);
        if (features & bit_mask) {
            hdev->acked_features |= bit_mask;
        }
        bit++;
    }
}

int vhost_dev_get_config(struct vhost_dev *hdev, uint8_t *config,
                         uint32_t config_len, Error **errp)
{
    assert(hdev->vhost_ops);

    if (hdev->vhost_ops->vhost_get_config) {
        return hdev->vhost_ops->vhost_get_config(hdev, config, config_len,
                                                 errp);
    }

    error_setg(errp, "vhost_get_config not implemented");
    return -ENOSYS;
}

int vhost_dev_set_config(struct vhost_dev *hdev, const uint8_t *data,
                         uint32_t offset, uint32_t size, uint32_t flags)
{
    assert(hdev->vhost_ops);

    if (hdev->vhost_ops->vhost_set_config) {
        return hdev->vhost_ops->vhost_set_config(hdev, data, offset,
                                                 size, flags);
    }

    return -ENOSYS;
}

void vhost_dev_set_config_notifier(struct vhost_dev *hdev,
                                   const VhostDevConfigOps *ops)
{
    hdev->config_ops = ops;
}

void vhost_dev_free_inflight(struct vhost_inflight *inflight)
{
    if (inflight && inflight->addr) {
        qemu_memfd_free(inflight->addr, inflight->size, inflight->fd);
        inflight->addr = NULL;
        inflight->fd = -1;
    }
}

static int vhost_dev_resize_inflight(struct vhost_inflight *inflight,
                                     uint64_t new_size)
{
    Error *err = NULL;
    int fd = -1;
    void *addr = qemu_memfd_alloc("vhost-inflight", new_size,
                                  F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL,
                                  &fd, &err);

    if (err) {
        error_report_err(err);
        return -ENOMEM;
    }

    vhost_dev_free_inflight(inflight);
    inflight->offset = 0;
    inflight->addr = addr;
    inflight->fd = fd;
    inflight->size = new_size;

    return 0;
}

void vhost_dev_save_inflight(struct vhost_inflight *inflight, QEMUFile *f)
{
    if (inflight->addr) {
        qemu_put_be64(f, inflight->size);
        qemu_put_be16(f, inflight->queue_size);
        qemu_put_buffer(f, inflight->addr, inflight->size);
    } else {
        qemu_put_be64(f, 0);
    }
}

int vhost_dev_load_inflight(struct vhost_inflight *inflight, QEMUFile *f)
{
    uint64_t size;

    size = qemu_get_be64(f);
    if (!size) {
        return 0;
    }

    if (inflight->size != size) {
        int ret = vhost_dev_resize_inflight(inflight, size);
        if (ret < 0) {
            return ret;
        }
    }
    inflight->queue_size = qemu_get_be16(f);

    qemu_get_buffer(f, inflight->addr, size);

    return 0;
}

int vhost_dev_prepare_inflight(struct vhost_dev *hdev, VirtIODevice *vdev)
{
    int r;

    if (hdev->vhost_ops->vhost_get_inflight_fd == NULL ||
        hdev->vhost_ops->vhost_set_inflight_fd == NULL) {
        return 0;
    }

    hdev->vdev = vdev;

    r = vhost_dev_set_features(hdev, hdev->log_enabled);
    if (r < 0) {
        VHOST_OPS_DEBUG(r, "vhost_dev_prepare_inflight failed");
        return r;
    }

    return 0;
}

int vhost_dev_set_inflight(struct vhost_dev *dev,
                           struct vhost_inflight *inflight)
{
    int r;

    if (dev->vhost_ops->vhost_set_inflight_fd && inflight->addr) {
        r = dev->vhost_ops->vhost_set_inflight_fd(dev, inflight);
        if (r) {
            VHOST_OPS_DEBUG(r, "vhost_set_inflight_fd failed");
            return r;
        }
    }

    return 0;
}

int vhost_dev_get_inflight(struct vhost_dev *dev, uint16_t queue_size,
                           struct vhost_inflight *inflight)
{
    int r;

    if (dev->vhost_ops->vhost_get_inflight_fd) {
        r = dev->vhost_ops->vhost_get_inflight_fd(dev, queue_size, inflight);
        if (r) {
            VHOST_OPS_DEBUG(r, "vhost_get_inflight_fd failed");
            return r;
        }
    }

    return 0;
}

static int vhost_dev_set_vring_enable(struct vhost_dev *hdev, int enable)
{
    if (!hdev->vhost_ops->vhost_set_vring_enable) {
        return 0;
    }

    /*
     * For vhost-user devices, if VHOST_USER_F_PROTOCOL_FEATURES has not
     * been negotiated, the rings start directly in the enabled state, and
     * the .vhost_set_vring_enable callback will fail since
     * VHOST_USER_SET_VRING_ENABLE is not supported.
     */
    if (hdev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER &&
        !virtio_has_feature(hdev->backend_features,
                            VHOST_USER_F_PROTOCOL_FEATURES)) {
        return 0;
    }

    return hdev->vhost_ops->vhost_set_vring_enable(hdev, enable);
}

/* Host notifiers must be enabled at this point. */
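/*
 * Editor's summary (derived from the code below, not in the original
 * source): starting a vhost device negotiates features, pushes the memory
 * table, starts every virtqueue, optionally installs the dirty log and
 * enables the rings, and finally invokes the backend's vhost_dev_start and
 * IOTLB callback hooks.
 */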
int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings)
{
    int i, r;

    /* should only be called after backend is connected */
    assert(hdev->vhost_ops);

    trace_vhost_dev_start(hdev, vdev->name, vrings);

    vdev->vhost_started = true;
    hdev->started = true;
    hdev->vdev = vdev;

    r = vhost_dev_set_features(hdev, hdev->log_enabled);
    if (r < 0) {
        goto fail_features;
    }

    if (vhost_dev_has_iommu(hdev)) {
        memory_listener_register(&hdev->iommu_listener, vdev->dma_as);
    }

    r = hdev->vhost_ops->vhost_set_mem_table(hdev, hdev->mem);
    if (r < 0) {
        VHOST_OPS_DEBUG(r, "vhost_set_mem_table failed");
        goto fail_mem;
    }
    for (i = 0; i < hdev->nvqs; ++i) {
        r = vhost_virtqueue_start(hdev,
                                  vdev,
                                  hdev->vqs + i,
                                  hdev->vq_index + i);
        if (r < 0) {
            goto fail_vq;
        }
    }

    r = event_notifier_init(
        &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier, 0);
    if (r < 0) {
        VHOST_OPS_DEBUG(r, "event_notifier_init failed");
        goto fail_vq;
    }
    event_notifier_test_and_clear(
        &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier);
    if (!vdev->use_guest_notifier_mask) {
        vhost_config_mask(hdev, vdev, true);
    }
    if (hdev->log_enabled) {
        uint64_t log_base;

        hdev->log_size = vhost_get_log_size(hdev);
        hdev->log = vhost_log_get(hdev->log_size,
                                  vhost_dev_log_is_shared(hdev));
        log_base = (uintptr_t)hdev->log->log;
        r = hdev->vhost_ops->vhost_set_log_base(hdev,
                                                hdev->log_size ? log_base : 0,
                                                hdev->log);
        if (r < 0) {
            VHOST_OPS_DEBUG(r, "vhost_set_log_base failed");
            goto fail_log;
        }
    }
    if (vrings) {
        r = vhost_dev_set_vring_enable(hdev, true);
        if (r) {
            goto fail_log;
        }
    }
    if (hdev->vhost_ops->vhost_dev_start) {
        r = hdev->vhost_ops->vhost_dev_start(hdev, true);
        if (r) {
            goto fail_start;
        }
    }
    if (vhost_dev_has_iommu(hdev) &&
        hdev->vhost_ops->vhost_set_iotlb_callback) {
        hdev->vhost_ops->vhost_set_iotlb_callback(hdev, true);

        /* Update used ring information for IOTLB to work correctly;
         * the vhost-kernel code requires this. */
        for (i = 0; i < hdev->nvqs; ++i) {
            struct vhost_virtqueue *vq = hdev->vqs + i;
            vhost_device_iotlb_miss(hdev, vq->used_phys, true);
        }
    }
    vhost_start_config_intr(hdev);
    return 0;
fail_start:
    if (vrings) {
        vhost_dev_set_vring_enable(hdev, false);
    }
fail_log:
    vhost_log_put(hdev, false);
fail_vq:
    while (--i >= 0) {
        vhost_virtqueue_stop(hdev,
                             vdev,
                             hdev->vqs + i,
                             hdev->vq_index + i);
    }

fail_mem:
    if (vhost_dev_has_iommu(hdev)) {
        memory_listener_unregister(&hdev->iommu_listener);
    }
fail_features:
    vdev->vhost_started = false;
    hdev->started = false;
    return r;
}

/* Host notifiers must be enabled at this point. */
void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings)
{
    int i;

    /* should only be called after backend is connected */
    assert(hdev->vhost_ops);
    event_notifier_test_and_clear(
        &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier);
    event_notifier_test_and_clear(&vdev->config_notifier);
    event_notifier_cleanup(
        &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier);

    trace_vhost_dev_stop(hdev, vdev->name, vrings);

    if (hdev->vhost_ops->vhost_dev_start) {
        hdev->vhost_ops->vhost_dev_start(hdev, false);
    }
    if (vrings) {
        vhost_dev_set_vring_enable(hdev, false);
    }
    for (i = 0; i < hdev->nvqs; ++i) {
        vhost_virtqueue_stop(hdev,
                             vdev,
                             hdev->vqs + i,
                             hdev->vq_index + i);
    }
    if (hdev->vhost_ops->vhost_reset_status) {
        hdev->vhost_ops->vhost_reset_status(hdev);
    }

    if (vhost_dev_has_iommu(hdev)) {
        if (hdev->vhost_ops->vhost_set_iotlb_callback) {
            hdev->vhost_ops->vhost_set_iotlb_callback(hdev, false);
        }
        memory_listener_unregister(&hdev->iommu_listener);
    }
    vhost_stop_config_intr(hdev);
    vhost_log_put(hdev, true);
    hdev->started = false;
    vdev->vhost_started = false;
    hdev->vdev = NULL;
}

int vhost_net_set_backend(struct vhost_dev *hdev,
                          struct vhost_vring_file *file)
{
    if (hdev->vhost_ops->vhost_net_set_backend) {
        return hdev->vhost_ops->vhost_net_set_backend(hdev, file);
    }

    return -ENOSYS;
}
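
/*
 * Editor's usage sketch (not part of the original file): a typical caller,
 * such as a vhost-net style device, drives this API roughly as follows;
 * error handling is omitted and the setup of "vqs", "opaque" and "vdev" is
 * device specific and only assumed here.
 *
 *     struct vhost_dev dev = { .nvqs = 2, .vqs = vqs };
 *
 *     vhost_dev_init(&dev, opaque, VHOST_BACKEND_TYPE_KERNEL, 0, &errp);
 *     vhost_dev_enable_notifiers(&dev, vdev);
 *     vhost_dev_start(&dev, vdev, true);
 *     ...
 *     vhost_dev_stop(&dev, vdev, true);
 *     vhost_dev_disable_notifiers(&dev, vdev);
 *     vhost_dev_cleanup(&dev);
 */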