1 /* 2 * vhost support 3 * 4 * Copyright Red Hat, Inc. 2010 5 * 6 * Authors: 7 * Michael S. Tsirkin <mst@redhat.com> 8 * 9 * This work is licensed under the terms of the GNU GPL, version 2. See 10 * the COPYING file in the top-level directory. 11 * 12 * Contributions after 2012-01-13 are licensed under the terms of the 13 * GNU GPL, version 2 or (at your option) any later version. 14 */ 15 16 #include "qemu/osdep.h" 17 #include "qapi/error.h" 18 #include "hw/virtio/vhost.h" 19 #include "qemu/atomic.h" 20 #include "qemu/range.h" 21 #include "qemu/error-report.h" 22 #include "qemu/memfd.h" 23 #include "qemu/log.h" 24 #include "standard-headers/linux/vhost_types.h" 25 #include "hw/virtio/virtio-bus.h" 26 #include "hw/mem/memory-device.h" 27 #include "migration/blocker.h" 28 #include "migration/qemu-file-types.h" 29 #include "sysemu/dma.h" 30 #include "trace.h" 31 32 /* enabled until disconnected backend stabilizes */ 33 #define _VHOST_DEBUG 1 34 35 #ifdef _VHOST_DEBUG 36 #define VHOST_OPS_DEBUG(retval, fmt, ...) \ 37 do { \ 38 error_report(fmt ": %s (%d)", ## __VA_ARGS__, \ 39 strerror(-retval), -retval); \ 40 } while (0) 41 #else 42 #define VHOST_OPS_DEBUG(retval, fmt, ...) \ 43 do { } while (0) 44 #endif 45 46 static struct vhost_log *vhost_log; 47 static struct vhost_log *vhost_log_shm; 48 49 /* Memslots used by backends that support private memslots (without an fd). */ 50 static unsigned int used_memslots; 51 52 /* Memslots used by backends that only support shared memslots (with an fd). */ 53 static unsigned int used_shared_memslots; 54 55 static QLIST_HEAD(, vhost_dev) vhost_devices = 56 QLIST_HEAD_INITIALIZER(vhost_devices); 57 58 unsigned int vhost_get_max_memslots(void) 59 { 60 unsigned int max = UINT_MAX; 61 struct vhost_dev *hdev; 62 63 QLIST_FOREACH(hdev, &vhost_devices, entry) { 64 max = MIN(max, hdev->vhost_ops->vhost_backend_memslots_limit(hdev)); 65 } 66 return max; 67 } 68 69 unsigned int vhost_get_free_memslots(void) 70 { 71 unsigned int free = UINT_MAX; 72 struct vhost_dev *hdev; 73 74 QLIST_FOREACH(hdev, &vhost_devices, entry) { 75 unsigned int r = hdev->vhost_ops->vhost_backend_memslots_limit(hdev); 76 unsigned int cur_free; 77 78 if (hdev->vhost_ops->vhost_backend_no_private_memslots && 79 hdev->vhost_ops->vhost_backend_no_private_memslots(hdev)) { 80 cur_free = r - used_shared_memslots; 81 } else { 82 cur_free = r - used_memslots; 83 } 84 free = MIN(free, cur_free); 85 } 86 return free; 87 } 88 89 static void vhost_dev_sync_region(struct vhost_dev *dev, 90 MemoryRegionSection *section, 91 uint64_t mfirst, uint64_t mlast, 92 uint64_t rfirst, uint64_t rlast) 93 { 94 vhost_log_chunk_t *dev_log = dev->log->log; 95 96 uint64_t start = MAX(mfirst, rfirst); 97 uint64_t end = MIN(mlast, rlast); 98 vhost_log_chunk_t *from = dev_log + start / VHOST_LOG_CHUNK; 99 vhost_log_chunk_t *to = dev_log + end / VHOST_LOG_CHUNK + 1; 100 uint64_t addr = QEMU_ALIGN_DOWN(start, VHOST_LOG_CHUNK); 101 102 if (end < start) { 103 return; 104 } 105 assert(end / VHOST_LOG_CHUNK < dev->log_size); 106 assert(start / VHOST_LOG_CHUNK < dev->log_size); 107 108 for (;from < to; ++from) { 109 vhost_log_chunk_t log; 110 /* We first check with non-atomic: much cheaper, 111 * and we expect non-dirty to be the common case. */ 112 if (!*from) { 113 addr += VHOST_LOG_CHUNK; 114 continue; 115 } 116 /* Data must be read atomically. We don't really need barrier semantics 117 * but it's easier to use atomic_* than roll our own. */ 118 log = qatomic_xchg(from, 0); 119 while (log) { 120 int bit = ctzl(log); 121 hwaddr page_addr; 122 hwaddr section_offset; 123 hwaddr mr_offset; 124 page_addr = addr + bit * VHOST_LOG_PAGE; 125 section_offset = page_addr - section->offset_within_address_space; 126 mr_offset = section_offset + section->offset_within_region; 127 memory_region_set_dirty(section->mr, mr_offset, VHOST_LOG_PAGE); 128 log &= ~(0x1ull << bit); 129 } 130 addr += VHOST_LOG_CHUNK; 131 } 132 } 133 134 bool vhost_dev_has_iommu(struct vhost_dev *dev) 135 { 136 VirtIODevice *vdev = dev->vdev; 137 138 /* 139 * For vhost, VIRTIO_F_IOMMU_PLATFORM means the backend support 140 * incremental memory mapping API via IOTLB API. For platform that 141 * does not have IOMMU, there's no need to enable this feature 142 * which may cause unnecessary IOTLB miss/update transactions. 143 */ 144 if (vdev) { 145 return virtio_bus_device_iommu_enabled(vdev) && 146 virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM); 147 } else { 148 return false; 149 } 150 } 151 152 static int vhost_sync_dirty_bitmap(struct vhost_dev *dev, 153 MemoryRegionSection *section, 154 hwaddr first, 155 hwaddr last) 156 { 157 int i; 158 hwaddr start_addr; 159 hwaddr end_addr; 160 161 if (!dev->log_enabled || !dev->started) { 162 return 0; 163 } 164 start_addr = section->offset_within_address_space; 165 end_addr = range_get_last(start_addr, int128_get64(section->size)); 166 start_addr = MAX(first, start_addr); 167 end_addr = MIN(last, end_addr); 168 169 for (i = 0; i < dev->mem->nregions; ++i) { 170 struct vhost_memory_region *reg = dev->mem->regions + i; 171 vhost_dev_sync_region(dev, section, start_addr, end_addr, 172 reg->guest_phys_addr, 173 range_get_last(reg->guest_phys_addr, 174 reg->memory_size)); 175 } 176 for (i = 0; i < dev->nvqs; ++i) { 177 struct vhost_virtqueue *vq = dev->vqs + i; 178 179 if (!vq->used_phys && !vq->used_size) { 180 continue; 181 } 182 183 if (vhost_dev_has_iommu(dev)) { 184 IOMMUTLBEntry iotlb; 185 hwaddr used_phys = vq->used_phys, used_size = vq->used_size; 186 hwaddr phys, s, offset; 187 188 while (used_size) { 189 rcu_read_lock(); 190 iotlb = address_space_get_iotlb_entry(dev->vdev->dma_as, 191 used_phys, 192 true, 193 MEMTXATTRS_UNSPECIFIED); 194 rcu_read_unlock(); 195 196 if (!iotlb.target_as) { 197 qemu_log_mask(LOG_GUEST_ERROR, "translation " 198 "failure for used_iova %"PRIx64"\n", 199 used_phys); 200 return -EINVAL; 201 } 202 203 offset = used_phys & iotlb.addr_mask; 204 phys = iotlb.translated_addr + offset; 205 206 /* 207 * Distance from start of used ring until last byte of 208 * IOMMU page. 209 */ 210 s = iotlb.addr_mask - offset; 211 /* 212 * Size of used ring, or of the part of it until end 213 * of IOMMU page. To avoid zero result, do the adding 214 * outside of MIN(). 215 */ 216 s = MIN(s, used_size - 1) + 1; 217 218 vhost_dev_sync_region(dev, section, start_addr, end_addr, phys, 219 range_get_last(phys, s)); 220 used_size -= s; 221 used_phys += s; 222 } 223 } else { 224 vhost_dev_sync_region(dev, section, start_addr, 225 end_addr, vq->used_phys, 226 range_get_last(vq->used_phys, vq->used_size)); 227 } 228 } 229 return 0; 230 } 231 232 static void vhost_log_sync(MemoryListener *listener, 233 MemoryRegionSection *section) 234 { 235 struct vhost_dev *dev = container_of(listener, struct vhost_dev, 236 memory_listener); 237 vhost_sync_dirty_bitmap(dev, section, 0x0, ~0x0ULL); 238 } 239 240 static void vhost_log_sync_range(struct vhost_dev *dev, 241 hwaddr first, hwaddr last) 242 { 243 int i; 244 /* FIXME: this is N^2 in number of sections */ 245 for (i = 0; i < dev->n_mem_sections; ++i) { 246 MemoryRegionSection *section = &dev->mem_sections[i]; 247 vhost_sync_dirty_bitmap(dev, section, first, last); 248 } 249 } 250 251 static uint64_t vhost_get_log_size(struct vhost_dev *dev) 252 { 253 uint64_t log_size = 0; 254 int i; 255 for (i = 0; i < dev->mem->nregions; ++i) { 256 struct vhost_memory_region *reg = dev->mem->regions + i; 257 uint64_t last = range_get_last(reg->guest_phys_addr, 258 reg->memory_size); 259 log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1); 260 } 261 return log_size; 262 } 263 264 static int vhost_set_backend_type(struct vhost_dev *dev, 265 VhostBackendType backend_type) 266 { 267 int r = 0; 268 269 switch (backend_type) { 270 #ifdef CONFIG_VHOST_KERNEL 271 case VHOST_BACKEND_TYPE_KERNEL: 272 dev->vhost_ops = &kernel_ops; 273 break; 274 #endif 275 #ifdef CONFIG_VHOST_USER 276 case VHOST_BACKEND_TYPE_USER: 277 dev->vhost_ops = &user_ops; 278 break; 279 #endif 280 #ifdef CONFIG_VHOST_VDPA 281 case VHOST_BACKEND_TYPE_VDPA: 282 dev->vhost_ops = &vdpa_ops; 283 break; 284 #endif 285 default: 286 error_report("Unknown vhost backend type"); 287 r = -1; 288 } 289 290 return r; 291 } 292 293 static struct vhost_log *vhost_log_alloc(uint64_t size, bool share) 294 { 295 Error *err = NULL; 296 struct vhost_log *log; 297 uint64_t logsize = size * sizeof(*(log->log)); 298 int fd = -1; 299 300 log = g_new0(struct vhost_log, 1); 301 if (share) { 302 log->log = qemu_memfd_alloc("vhost-log", logsize, 303 F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL, 304 &fd, &err); 305 if (err) { 306 error_report_err(err); 307 g_free(log); 308 return NULL; 309 } 310 memset(log->log, 0, logsize); 311 } else { 312 log->log = g_malloc0(logsize); 313 } 314 315 log->size = size; 316 log->refcnt = 1; 317 log->fd = fd; 318 319 return log; 320 } 321 322 static struct vhost_log *vhost_log_get(uint64_t size, bool share) 323 { 324 struct vhost_log *log = share ? vhost_log_shm : vhost_log; 325 326 if (!log || log->size != size) { 327 log = vhost_log_alloc(size, share); 328 if (share) { 329 vhost_log_shm = log; 330 } else { 331 vhost_log = log; 332 } 333 } else { 334 ++log->refcnt; 335 } 336 337 return log; 338 } 339 340 static void vhost_log_put(struct vhost_dev *dev, bool sync) 341 { 342 struct vhost_log *log = dev->log; 343 344 if (!log) { 345 return; 346 } 347 348 --log->refcnt; 349 if (log->refcnt == 0) { 350 /* Sync only the range covered by the old log */ 351 if (dev->log_size && sync) { 352 vhost_log_sync_range(dev, 0, dev->log_size * VHOST_LOG_CHUNK - 1); 353 } 354 355 if (vhost_log == log) { 356 g_free(log->log); 357 vhost_log = NULL; 358 } else if (vhost_log_shm == log) { 359 qemu_memfd_free(log->log, log->size * sizeof(*(log->log)), 360 log->fd); 361 vhost_log_shm = NULL; 362 } 363 364 g_free(log); 365 } 366 367 dev->log = NULL; 368 dev->log_size = 0; 369 } 370 371 static bool vhost_dev_log_is_shared(struct vhost_dev *dev) 372 { 373 return dev->vhost_ops->vhost_requires_shm_log && 374 dev->vhost_ops->vhost_requires_shm_log(dev); 375 } 376 377 static inline void vhost_dev_log_resize(struct vhost_dev *dev, uint64_t size) 378 { 379 struct vhost_log *log = vhost_log_get(size, vhost_dev_log_is_shared(dev)); 380 uint64_t log_base = (uintptr_t)log->log; 381 int r; 382 383 /* inform backend of log switching, this must be done before 384 releasing the current log, to ensure no logging is lost */ 385 r = dev->vhost_ops->vhost_set_log_base(dev, log_base, log); 386 if (r < 0) { 387 VHOST_OPS_DEBUG(r, "vhost_set_log_base failed"); 388 } 389 390 vhost_log_put(dev, true); 391 dev->log = log; 392 dev->log_size = size; 393 } 394 395 static void *vhost_memory_map(struct vhost_dev *dev, hwaddr addr, 396 hwaddr *plen, bool is_write) 397 { 398 if (!vhost_dev_has_iommu(dev)) { 399 return cpu_physical_memory_map(addr, plen, is_write); 400 } else { 401 return (void *)(uintptr_t)addr; 402 } 403 } 404 405 static void vhost_memory_unmap(struct vhost_dev *dev, void *buffer, 406 hwaddr len, int is_write, 407 hwaddr access_len) 408 { 409 if (!vhost_dev_has_iommu(dev)) { 410 cpu_physical_memory_unmap(buffer, len, is_write, access_len); 411 } 412 } 413 414 static int vhost_verify_ring_part_mapping(void *ring_hva, 415 uint64_t ring_gpa, 416 uint64_t ring_size, 417 void *reg_hva, 418 uint64_t reg_gpa, 419 uint64_t reg_size) 420 { 421 uint64_t hva_ring_offset; 422 uint64_t ring_last = range_get_last(ring_gpa, ring_size); 423 uint64_t reg_last = range_get_last(reg_gpa, reg_size); 424 425 if (ring_last < reg_gpa || ring_gpa > reg_last) { 426 return 0; 427 } 428 /* check that whole ring's is mapped */ 429 if (ring_last > reg_last) { 430 return -ENOMEM; 431 } 432 /* check that ring's MemoryRegion wasn't replaced */ 433 hva_ring_offset = ring_gpa - reg_gpa; 434 if (ring_hva != reg_hva + hva_ring_offset) { 435 return -EBUSY; 436 } 437 438 return 0; 439 } 440 441 static int vhost_verify_ring_mappings(struct vhost_dev *dev, 442 void *reg_hva, 443 uint64_t reg_gpa, 444 uint64_t reg_size) 445 { 446 int i, j; 447 int r = 0; 448 const char *part_name[] = { 449 "descriptor table", 450 "available ring", 451 "used ring" 452 }; 453 454 if (vhost_dev_has_iommu(dev)) { 455 return 0; 456 } 457 458 for (i = 0; i < dev->nvqs; ++i) { 459 struct vhost_virtqueue *vq = dev->vqs + i; 460 461 if (vq->desc_phys == 0) { 462 continue; 463 } 464 465 j = 0; 466 r = vhost_verify_ring_part_mapping( 467 vq->desc, vq->desc_phys, vq->desc_size, 468 reg_hva, reg_gpa, reg_size); 469 if (r) { 470 break; 471 } 472 473 j++; 474 r = vhost_verify_ring_part_mapping( 475 vq->avail, vq->avail_phys, vq->avail_size, 476 reg_hva, reg_gpa, reg_size); 477 if (r) { 478 break; 479 } 480 481 j++; 482 r = vhost_verify_ring_part_mapping( 483 vq->used, vq->used_phys, vq->used_size, 484 reg_hva, reg_gpa, reg_size); 485 if (r) { 486 break; 487 } 488 } 489 490 if (r == -ENOMEM) { 491 error_report("Unable to map %s for ring %d", part_name[j], i); 492 } else if (r == -EBUSY) { 493 error_report("%s relocated for ring %d", part_name[j], i); 494 } 495 return r; 496 } 497 498 /* 499 * vhost_section: identify sections needed for vhost access 500 * 501 * We only care about RAM sections here (where virtqueue and guest 502 * internals accessed by virtio might live). 503 */ 504 static bool vhost_section(struct vhost_dev *dev, MemoryRegionSection *section) 505 { 506 MemoryRegion *mr = section->mr; 507 508 if (memory_region_is_ram(mr) && !memory_region_is_rom(mr)) { 509 uint8_t dirty_mask = memory_region_get_dirty_log_mask(mr); 510 uint8_t handled_dirty; 511 512 /* 513 * Kernel based vhost doesn't handle any block which is doing 514 * dirty-tracking other than migration for which it has 515 * specific logging support. However for TCG the kernel never 516 * gets involved anyway so we can also ignore it's 517 * self-modiying code detection flags. However a vhost-user 518 * client could still confuse a TCG guest if it re-writes 519 * executable memory that has already been translated. 520 */ 521 handled_dirty = (1 << DIRTY_MEMORY_MIGRATION) | 522 (1 << DIRTY_MEMORY_CODE); 523 524 if (dirty_mask & ~handled_dirty) { 525 trace_vhost_reject_section(mr->name, 1); 526 return false; 527 } 528 529 /* 530 * Some backends (like vhost-user) can only handle memory regions 531 * that have an fd (can be mapped into a different process). Filter 532 * the ones without an fd out, if requested. 533 * 534 * TODO: we might have to limit to MAP_SHARED as well. 535 */ 536 if (memory_region_get_fd(section->mr) < 0 && 537 dev->vhost_ops->vhost_backend_no_private_memslots && 538 dev->vhost_ops->vhost_backend_no_private_memslots(dev)) { 539 trace_vhost_reject_section(mr->name, 2); 540 return false; 541 } 542 543 trace_vhost_section(mr->name); 544 return true; 545 } else { 546 trace_vhost_reject_section(mr->name, 3); 547 return false; 548 } 549 } 550 551 static void vhost_begin(MemoryListener *listener) 552 { 553 struct vhost_dev *dev = container_of(listener, struct vhost_dev, 554 memory_listener); 555 dev->tmp_sections = NULL; 556 dev->n_tmp_sections = 0; 557 } 558 559 static void vhost_commit(MemoryListener *listener) 560 { 561 struct vhost_dev *dev = container_of(listener, struct vhost_dev, 562 memory_listener); 563 MemoryRegionSection *old_sections; 564 int n_old_sections; 565 uint64_t log_size; 566 size_t regions_size; 567 int r; 568 int i; 569 bool changed = false; 570 571 /* Note we can be called before the device is started, but then 572 * starting the device calls set_mem_table, so we need to have 573 * built the data structures. 574 */ 575 old_sections = dev->mem_sections; 576 n_old_sections = dev->n_mem_sections; 577 dev->mem_sections = dev->tmp_sections; 578 dev->n_mem_sections = dev->n_tmp_sections; 579 580 if (dev->n_mem_sections != n_old_sections) { 581 changed = true; 582 } else { 583 /* Same size, lets check the contents */ 584 for (i = 0; i < n_old_sections; i++) { 585 if (!MemoryRegionSection_eq(&old_sections[i], 586 &dev->mem_sections[i])) { 587 changed = true; 588 break; 589 } 590 } 591 } 592 593 trace_vhost_commit(dev->started, changed); 594 if (!changed) { 595 goto out; 596 } 597 598 /* Rebuild the regions list from the new sections list */ 599 regions_size = offsetof(struct vhost_memory, regions) + 600 dev->n_mem_sections * sizeof dev->mem->regions[0]; 601 dev->mem = g_realloc(dev->mem, regions_size); 602 dev->mem->nregions = dev->n_mem_sections; 603 604 if (dev->vhost_ops->vhost_backend_no_private_memslots && 605 dev->vhost_ops->vhost_backend_no_private_memslots(dev)) { 606 used_shared_memslots = dev->mem->nregions; 607 } else { 608 used_memslots = dev->mem->nregions; 609 } 610 611 for (i = 0; i < dev->n_mem_sections; i++) { 612 struct vhost_memory_region *cur_vmr = dev->mem->regions + i; 613 struct MemoryRegionSection *mrs = dev->mem_sections + i; 614 615 cur_vmr->guest_phys_addr = mrs->offset_within_address_space; 616 cur_vmr->memory_size = int128_get64(mrs->size); 617 cur_vmr->userspace_addr = 618 (uintptr_t)memory_region_get_ram_ptr(mrs->mr) + 619 mrs->offset_within_region; 620 cur_vmr->flags_padding = 0; 621 } 622 623 if (!dev->started) { 624 goto out; 625 } 626 627 for (i = 0; i < dev->mem->nregions; i++) { 628 if (vhost_verify_ring_mappings(dev, 629 (void *)(uintptr_t)dev->mem->regions[i].userspace_addr, 630 dev->mem->regions[i].guest_phys_addr, 631 dev->mem->regions[i].memory_size)) { 632 error_report("Verify ring failure on region %d", i); 633 abort(); 634 } 635 } 636 637 if (!dev->log_enabled) { 638 r = dev->vhost_ops->vhost_set_mem_table(dev, dev->mem); 639 if (r < 0) { 640 VHOST_OPS_DEBUG(r, "vhost_set_mem_table failed"); 641 } 642 goto out; 643 } 644 log_size = vhost_get_log_size(dev); 645 /* We allocate an extra 4K bytes to log, 646 * to reduce the * number of reallocations. */ 647 #define VHOST_LOG_BUFFER (0x1000 / sizeof *dev->log) 648 /* To log more, must increase log size before table update. */ 649 if (dev->log_size < log_size) { 650 vhost_dev_log_resize(dev, log_size + VHOST_LOG_BUFFER); 651 } 652 r = dev->vhost_ops->vhost_set_mem_table(dev, dev->mem); 653 if (r < 0) { 654 VHOST_OPS_DEBUG(r, "vhost_set_mem_table failed"); 655 } 656 /* To log less, can only decrease log size after table update. */ 657 if (dev->log_size > log_size + VHOST_LOG_BUFFER) { 658 vhost_dev_log_resize(dev, log_size); 659 } 660 661 out: 662 /* Deref the old list of sections, this must happen _after_ the 663 * vhost_set_mem_table to ensure the client isn't still using the 664 * section we're about to unref. 665 */ 666 while (n_old_sections--) { 667 memory_region_unref(old_sections[n_old_sections].mr); 668 } 669 g_free(old_sections); 670 return; 671 } 672 673 /* Adds the section data to the tmp_section structure. 674 * It relies on the listener calling us in memory address order 675 * and for each region (via the _add and _nop methods) to 676 * join neighbours. 677 */ 678 static void vhost_region_add_section(struct vhost_dev *dev, 679 MemoryRegionSection *section) 680 { 681 bool need_add = true; 682 uint64_t mrs_size = int128_get64(section->size); 683 uint64_t mrs_gpa = section->offset_within_address_space; 684 uintptr_t mrs_host = (uintptr_t)memory_region_get_ram_ptr(section->mr) + 685 section->offset_within_region; 686 RAMBlock *mrs_rb = section->mr->ram_block; 687 688 trace_vhost_region_add_section(section->mr->name, mrs_gpa, mrs_size, 689 mrs_host); 690 691 if (dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER) { 692 /* Round the section to it's page size */ 693 /* First align the start down to a page boundary */ 694 size_t mrs_page = qemu_ram_pagesize(mrs_rb); 695 uint64_t alignage = mrs_host & (mrs_page - 1); 696 if (alignage) { 697 mrs_host -= alignage; 698 mrs_size += alignage; 699 mrs_gpa -= alignage; 700 } 701 /* Now align the size up to a page boundary */ 702 alignage = mrs_size & (mrs_page - 1); 703 if (alignage) { 704 mrs_size += mrs_page - alignage; 705 } 706 trace_vhost_region_add_section_aligned(section->mr->name, mrs_gpa, 707 mrs_size, mrs_host); 708 } 709 710 if (dev->n_tmp_sections && !section->unmergeable) { 711 /* Since we already have at least one section, lets see if 712 * this extends it; since we're scanning in order, we only 713 * have to look at the last one, and the FlatView that calls 714 * us shouldn't have overlaps. 715 */ 716 MemoryRegionSection *prev_sec = dev->tmp_sections + 717 (dev->n_tmp_sections - 1); 718 uint64_t prev_gpa_start = prev_sec->offset_within_address_space; 719 uint64_t prev_size = int128_get64(prev_sec->size); 720 uint64_t prev_gpa_end = range_get_last(prev_gpa_start, prev_size); 721 uint64_t prev_host_start = 722 (uintptr_t)memory_region_get_ram_ptr(prev_sec->mr) + 723 prev_sec->offset_within_region; 724 uint64_t prev_host_end = range_get_last(prev_host_start, prev_size); 725 726 if (mrs_gpa <= (prev_gpa_end + 1)) { 727 /* OK, looks like overlapping/intersecting - it's possible that 728 * the rounding to page sizes has made them overlap, but they should 729 * match up in the same RAMBlock if they do. 730 */ 731 if (mrs_gpa < prev_gpa_start) { 732 error_report("%s:Section '%s' rounded to %"PRIx64 733 " prior to previous '%s' %"PRIx64, 734 __func__, section->mr->name, mrs_gpa, 735 prev_sec->mr->name, prev_gpa_start); 736 /* A way to cleanly fail here would be better */ 737 return; 738 } 739 /* Offset from the start of the previous GPA to this GPA */ 740 size_t offset = mrs_gpa - prev_gpa_start; 741 742 if (prev_host_start + offset == mrs_host && 743 section->mr == prev_sec->mr && !prev_sec->unmergeable) { 744 uint64_t max_end = MAX(prev_host_end, mrs_host + mrs_size); 745 need_add = false; 746 prev_sec->offset_within_address_space = 747 MIN(prev_gpa_start, mrs_gpa); 748 prev_sec->offset_within_region = 749 MIN(prev_host_start, mrs_host) - 750 (uintptr_t)memory_region_get_ram_ptr(prev_sec->mr); 751 prev_sec->size = int128_make64(max_end - MIN(prev_host_start, 752 mrs_host)); 753 trace_vhost_region_add_section_merge(section->mr->name, 754 int128_get64(prev_sec->size), 755 prev_sec->offset_within_address_space, 756 prev_sec->offset_within_region); 757 } else { 758 /* adjoining regions are fine, but overlapping ones with 759 * different blocks/offsets shouldn't happen 760 */ 761 if (mrs_gpa != prev_gpa_end + 1) { 762 error_report("%s: Overlapping but not coherent sections " 763 "at %"PRIx64, 764 __func__, mrs_gpa); 765 return; 766 } 767 } 768 } 769 } 770 771 if (need_add) { 772 ++dev->n_tmp_sections; 773 dev->tmp_sections = g_renew(MemoryRegionSection, dev->tmp_sections, 774 dev->n_tmp_sections); 775 dev->tmp_sections[dev->n_tmp_sections - 1] = *section; 776 /* The flatview isn't stable and we don't use it, making it NULL 777 * means we can memcmp the list. 778 */ 779 dev->tmp_sections[dev->n_tmp_sections - 1].fv = NULL; 780 memory_region_ref(section->mr); 781 } 782 } 783 784 /* Used for both add and nop callbacks */ 785 static void vhost_region_addnop(MemoryListener *listener, 786 MemoryRegionSection *section) 787 { 788 struct vhost_dev *dev = container_of(listener, struct vhost_dev, 789 memory_listener); 790 791 if (!vhost_section(dev, section)) { 792 return; 793 } 794 vhost_region_add_section(dev, section); 795 } 796 797 static void vhost_iommu_unmap_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) 798 { 799 struct vhost_iommu *iommu = container_of(n, struct vhost_iommu, n); 800 struct vhost_dev *hdev = iommu->hdev; 801 hwaddr iova = iotlb->iova + iommu->iommu_offset; 802 803 if (vhost_backend_invalidate_device_iotlb(hdev, iova, 804 iotlb->addr_mask + 1)) { 805 error_report("Fail to invalidate device iotlb"); 806 } 807 } 808 809 static void vhost_iommu_region_add(MemoryListener *listener, 810 MemoryRegionSection *section) 811 { 812 struct vhost_dev *dev = container_of(listener, struct vhost_dev, 813 iommu_listener); 814 struct vhost_iommu *iommu; 815 Int128 end; 816 int iommu_idx; 817 IOMMUMemoryRegion *iommu_mr; 818 819 if (!memory_region_is_iommu(section->mr)) { 820 return; 821 } 822 823 iommu_mr = IOMMU_MEMORY_REGION(section->mr); 824 825 iommu = g_malloc0(sizeof(*iommu)); 826 end = int128_add(int128_make64(section->offset_within_region), 827 section->size); 828 end = int128_sub(end, int128_one()); 829 iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr, 830 MEMTXATTRS_UNSPECIFIED); 831 iommu_notifier_init(&iommu->n, vhost_iommu_unmap_notify, 832 dev->vdev->device_iotlb_enabled ? 833 IOMMU_NOTIFIER_DEVIOTLB_UNMAP : 834 IOMMU_NOTIFIER_UNMAP, 835 section->offset_within_region, 836 int128_get64(end), 837 iommu_idx); 838 iommu->mr = section->mr; 839 iommu->iommu_offset = section->offset_within_address_space - 840 section->offset_within_region; 841 iommu->hdev = dev; 842 memory_region_register_iommu_notifier(section->mr, &iommu->n, 843 &error_fatal); 844 QLIST_INSERT_HEAD(&dev->iommu_list, iommu, iommu_next); 845 /* TODO: can replay help performance here? */ 846 } 847 848 static void vhost_iommu_region_del(MemoryListener *listener, 849 MemoryRegionSection *section) 850 { 851 struct vhost_dev *dev = container_of(listener, struct vhost_dev, 852 iommu_listener); 853 struct vhost_iommu *iommu; 854 855 if (!memory_region_is_iommu(section->mr)) { 856 return; 857 } 858 859 QLIST_FOREACH(iommu, &dev->iommu_list, iommu_next) { 860 if (iommu->mr == section->mr && 861 iommu->n.start == section->offset_within_region) { 862 memory_region_unregister_iommu_notifier(iommu->mr, 863 &iommu->n); 864 QLIST_REMOVE(iommu, iommu_next); 865 g_free(iommu); 866 break; 867 } 868 } 869 } 870 871 void vhost_toggle_device_iotlb(VirtIODevice *vdev) 872 { 873 VirtioDeviceClass *vdc = VIRTIO_DEVICE_GET_CLASS(vdev); 874 struct vhost_dev *dev; 875 struct vhost_iommu *iommu; 876 877 if (vdev->vhost_started) { 878 dev = vdc->get_vhost(vdev); 879 } else { 880 return; 881 } 882 883 QLIST_FOREACH(iommu, &dev->iommu_list, iommu_next) { 884 memory_region_unregister_iommu_notifier(iommu->mr, &iommu->n); 885 iommu->n.notifier_flags = vdev->device_iotlb_enabled ? 886 IOMMU_NOTIFIER_DEVIOTLB_UNMAP : IOMMU_NOTIFIER_UNMAP; 887 memory_region_register_iommu_notifier(iommu->mr, &iommu->n, 888 &error_fatal); 889 } 890 } 891 892 static int vhost_virtqueue_set_addr(struct vhost_dev *dev, 893 struct vhost_virtqueue *vq, 894 unsigned idx, bool enable_log) 895 { 896 struct vhost_vring_addr addr; 897 int r; 898 memset(&addr, 0, sizeof(struct vhost_vring_addr)); 899 900 if (dev->vhost_ops->vhost_vq_get_addr) { 901 r = dev->vhost_ops->vhost_vq_get_addr(dev, &addr, vq); 902 if (r < 0) { 903 VHOST_OPS_DEBUG(r, "vhost_vq_get_addr failed"); 904 return r; 905 } 906 } else { 907 addr.desc_user_addr = (uint64_t)(unsigned long)vq->desc; 908 addr.avail_user_addr = (uint64_t)(unsigned long)vq->avail; 909 addr.used_user_addr = (uint64_t)(unsigned long)vq->used; 910 } 911 addr.index = idx; 912 addr.log_guest_addr = vq->used_phys; 913 addr.flags = enable_log ? (1 << VHOST_VRING_F_LOG) : 0; 914 r = dev->vhost_ops->vhost_set_vring_addr(dev, &addr); 915 if (r < 0) { 916 VHOST_OPS_DEBUG(r, "vhost_set_vring_addr failed"); 917 } 918 return r; 919 } 920 921 static int vhost_dev_set_features(struct vhost_dev *dev, 922 bool enable_log) 923 { 924 uint64_t features = dev->acked_features; 925 int r; 926 if (enable_log) { 927 features |= 0x1ULL << VHOST_F_LOG_ALL; 928 } 929 if (!vhost_dev_has_iommu(dev)) { 930 features &= ~(0x1ULL << VIRTIO_F_IOMMU_PLATFORM); 931 } 932 if (dev->vhost_ops->vhost_force_iommu) { 933 if (dev->vhost_ops->vhost_force_iommu(dev) == true) { 934 features |= 0x1ULL << VIRTIO_F_IOMMU_PLATFORM; 935 } 936 } 937 r = dev->vhost_ops->vhost_set_features(dev, features); 938 if (r < 0) { 939 VHOST_OPS_DEBUG(r, "vhost_set_features failed"); 940 goto out; 941 } 942 if (dev->vhost_ops->vhost_set_backend_cap) { 943 r = dev->vhost_ops->vhost_set_backend_cap(dev); 944 if (r < 0) { 945 VHOST_OPS_DEBUG(r, "vhost_set_backend_cap failed"); 946 goto out; 947 } 948 } 949 950 out: 951 return r; 952 } 953 954 static int vhost_dev_set_log(struct vhost_dev *dev, bool enable_log) 955 { 956 int r, i, idx; 957 hwaddr addr; 958 959 r = vhost_dev_set_features(dev, enable_log); 960 if (r < 0) { 961 goto err_features; 962 } 963 for (i = 0; i < dev->nvqs; ++i) { 964 idx = dev->vhost_ops->vhost_get_vq_index(dev, dev->vq_index + i); 965 addr = virtio_queue_get_desc_addr(dev->vdev, idx); 966 if (!addr) { 967 /* 968 * The queue might not be ready for start. If this 969 * is the case there is no reason to continue the process. 970 * The similar logic is used by the vhost_virtqueue_start() 971 * routine. 972 */ 973 continue; 974 } 975 r = vhost_virtqueue_set_addr(dev, dev->vqs + i, idx, 976 enable_log); 977 if (r < 0) { 978 goto err_vq; 979 } 980 } 981 return 0; 982 err_vq: 983 for (; i >= 0; --i) { 984 idx = dev->vhost_ops->vhost_get_vq_index(dev, dev->vq_index + i); 985 addr = virtio_queue_get_desc_addr(dev->vdev, idx); 986 if (!addr) { 987 continue; 988 } 989 vhost_virtqueue_set_addr(dev, dev->vqs + i, idx, 990 dev->log_enabled); 991 } 992 vhost_dev_set_features(dev, dev->log_enabled); 993 err_features: 994 return r; 995 } 996 997 static int vhost_migration_log(MemoryListener *listener, bool enable) 998 { 999 struct vhost_dev *dev = container_of(listener, struct vhost_dev, 1000 memory_listener); 1001 int r; 1002 if (enable == dev->log_enabled) { 1003 return 0; 1004 } 1005 if (!dev->started) { 1006 dev->log_enabled = enable; 1007 return 0; 1008 } 1009 1010 r = 0; 1011 if (!enable) { 1012 r = vhost_dev_set_log(dev, false); 1013 if (r < 0) { 1014 goto check_dev_state; 1015 } 1016 vhost_log_put(dev, false); 1017 } else { 1018 vhost_dev_log_resize(dev, vhost_get_log_size(dev)); 1019 r = vhost_dev_set_log(dev, true); 1020 if (r < 0) { 1021 goto check_dev_state; 1022 } 1023 } 1024 1025 check_dev_state: 1026 dev->log_enabled = enable; 1027 /* 1028 * vhost-user-* devices could change their state during log 1029 * initialization due to disconnect. So check dev state after 1030 * vhost communication. 1031 */ 1032 if (!dev->started) { 1033 /* 1034 * Since device is in the stopped state, it is okay for 1035 * migration. Return success. 1036 */ 1037 r = 0; 1038 } 1039 if (r) { 1040 /* An error occurred. */ 1041 dev->log_enabled = false; 1042 } 1043 1044 return r; 1045 } 1046 1047 static void vhost_log_global_start(MemoryListener *listener) 1048 { 1049 int r; 1050 1051 r = vhost_migration_log(listener, true); 1052 if (r < 0) { 1053 abort(); 1054 } 1055 } 1056 1057 static void vhost_log_global_stop(MemoryListener *listener) 1058 { 1059 int r; 1060 1061 r = vhost_migration_log(listener, false); 1062 if (r < 0) { 1063 abort(); 1064 } 1065 } 1066 1067 static void vhost_log_start(MemoryListener *listener, 1068 MemoryRegionSection *section, 1069 int old, int new) 1070 { 1071 /* FIXME: implement */ 1072 } 1073 1074 static void vhost_log_stop(MemoryListener *listener, 1075 MemoryRegionSection *section, 1076 int old, int new) 1077 { 1078 /* FIXME: implement */ 1079 } 1080 1081 /* The vhost driver natively knows how to handle the vrings of non 1082 * cross-endian legacy devices and modern devices. Only legacy devices 1083 * exposed to a bi-endian guest may require the vhost driver to use a 1084 * specific endianness. 1085 */ 1086 static inline bool vhost_needs_vring_endian(VirtIODevice *vdev) 1087 { 1088 if (virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) { 1089 return false; 1090 } 1091 #if HOST_BIG_ENDIAN 1092 return vdev->device_endian == VIRTIO_DEVICE_ENDIAN_LITTLE; 1093 #else 1094 return vdev->device_endian == VIRTIO_DEVICE_ENDIAN_BIG; 1095 #endif 1096 } 1097 1098 static int vhost_virtqueue_set_vring_endian_legacy(struct vhost_dev *dev, 1099 bool is_big_endian, 1100 int vhost_vq_index) 1101 { 1102 int r; 1103 struct vhost_vring_state s = { 1104 .index = vhost_vq_index, 1105 .num = is_big_endian 1106 }; 1107 1108 r = dev->vhost_ops->vhost_set_vring_endian(dev, &s); 1109 if (r < 0) { 1110 VHOST_OPS_DEBUG(r, "vhost_set_vring_endian failed"); 1111 } 1112 return r; 1113 } 1114 1115 static int vhost_memory_region_lookup(struct vhost_dev *hdev, 1116 uint64_t gpa, uint64_t *uaddr, 1117 uint64_t *len) 1118 { 1119 int i; 1120 1121 for (i = 0; i < hdev->mem->nregions; i++) { 1122 struct vhost_memory_region *reg = hdev->mem->regions + i; 1123 1124 if (gpa >= reg->guest_phys_addr && 1125 reg->guest_phys_addr + reg->memory_size > gpa) { 1126 *uaddr = reg->userspace_addr + gpa - reg->guest_phys_addr; 1127 *len = reg->guest_phys_addr + reg->memory_size - gpa; 1128 return 0; 1129 } 1130 } 1131 1132 return -EFAULT; 1133 } 1134 1135 int vhost_device_iotlb_miss(struct vhost_dev *dev, uint64_t iova, int write) 1136 { 1137 IOMMUTLBEntry iotlb; 1138 uint64_t uaddr, len; 1139 int ret = -EFAULT; 1140 1141 RCU_READ_LOCK_GUARD(); 1142 1143 trace_vhost_iotlb_miss(dev, 1); 1144 1145 iotlb = address_space_get_iotlb_entry(dev->vdev->dma_as, 1146 iova, write, 1147 MEMTXATTRS_UNSPECIFIED); 1148 if (iotlb.target_as != NULL) { 1149 ret = vhost_memory_region_lookup(dev, iotlb.translated_addr, 1150 &uaddr, &len); 1151 if (ret) { 1152 trace_vhost_iotlb_miss(dev, 3); 1153 error_report("Fail to lookup the translated address " 1154 "%"PRIx64, iotlb.translated_addr); 1155 goto out; 1156 } 1157 1158 len = MIN(iotlb.addr_mask + 1, len); 1159 iova = iova & ~iotlb.addr_mask; 1160 1161 ret = vhost_backend_update_device_iotlb(dev, iova, uaddr, 1162 len, iotlb.perm); 1163 if (ret) { 1164 trace_vhost_iotlb_miss(dev, 4); 1165 error_report("Fail to update device iotlb"); 1166 goto out; 1167 } 1168 } 1169 1170 trace_vhost_iotlb_miss(dev, 2); 1171 1172 out: 1173 return ret; 1174 } 1175 1176 int vhost_virtqueue_start(struct vhost_dev *dev, 1177 struct VirtIODevice *vdev, 1178 struct vhost_virtqueue *vq, 1179 unsigned idx) 1180 { 1181 BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev))); 1182 VirtioBusState *vbus = VIRTIO_BUS(qbus); 1183 VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(vbus); 1184 hwaddr s, l, a; 1185 int r; 1186 int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, idx); 1187 struct vhost_vring_file file = { 1188 .index = vhost_vq_index 1189 }; 1190 struct vhost_vring_state state = { 1191 .index = vhost_vq_index 1192 }; 1193 struct VirtQueue *vvq = virtio_get_queue(vdev, idx); 1194 1195 a = virtio_queue_get_desc_addr(vdev, idx); 1196 if (a == 0) { 1197 /* Queue might not be ready for start */ 1198 return 0; 1199 } 1200 1201 vq->num = state.num = virtio_queue_get_num(vdev, idx); 1202 r = dev->vhost_ops->vhost_set_vring_num(dev, &state); 1203 if (r) { 1204 VHOST_OPS_DEBUG(r, "vhost_set_vring_num failed"); 1205 return r; 1206 } 1207 1208 state.num = virtio_queue_get_last_avail_idx(vdev, idx); 1209 r = dev->vhost_ops->vhost_set_vring_base(dev, &state); 1210 if (r) { 1211 VHOST_OPS_DEBUG(r, "vhost_set_vring_base failed"); 1212 return r; 1213 } 1214 1215 if (vhost_needs_vring_endian(vdev)) { 1216 r = vhost_virtqueue_set_vring_endian_legacy(dev, 1217 virtio_is_big_endian(vdev), 1218 vhost_vq_index); 1219 if (r) { 1220 return r; 1221 } 1222 } 1223 1224 vq->desc_size = s = l = virtio_queue_get_desc_size(vdev, idx); 1225 vq->desc_phys = a; 1226 vq->desc = vhost_memory_map(dev, a, &l, false); 1227 if (!vq->desc || l != s) { 1228 r = -ENOMEM; 1229 goto fail_alloc_desc; 1230 } 1231 vq->avail_size = s = l = virtio_queue_get_avail_size(vdev, idx); 1232 vq->avail_phys = a = virtio_queue_get_avail_addr(vdev, idx); 1233 vq->avail = vhost_memory_map(dev, a, &l, false); 1234 if (!vq->avail || l != s) { 1235 r = -ENOMEM; 1236 goto fail_alloc_avail; 1237 } 1238 vq->used_size = s = l = virtio_queue_get_used_size(vdev, idx); 1239 vq->used_phys = a = virtio_queue_get_used_addr(vdev, idx); 1240 vq->used = vhost_memory_map(dev, a, &l, true); 1241 if (!vq->used || l != s) { 1242 r = -ENOMEM; 1243 goto fail_alloc_used; 1244 } 1245 1246 r = vhost_virtqueue_set_addr(dev, vq, vhost_vq_index, dev->log_enabled); 1247 if (r < 0) { 1248 goto fail_alloc; 1249 } 1250 1251 file.fd = event_notifier_get_fd(virtio_queue_get_host_notifier(vvq)); 1252 r = dev->vhost_ops->vhost_set_vring_kick(dev, &file); 1253 if (r) { 1254 VHOST_OPS_DEBUG(r, "vhost_set_vring_kick failed"); 1255 goto fail_kick; 1256 } 1257 1258 /* Clear and discard previous events if any. */ 1259 event_notifier_test_and_clear(&vq->masked_notifier); 1260 1261 /* Init vring in unmasked state, unless guest_notifier_mask 1262 * will do it later. 1263 */ 1264 if (!vdev->use_guest_notifier_mask) { 1265 /* TODO: check and handle errors. */ 1266 vhost_virtqueue_mask(dev, vdev, idx, false); 1267 } 1268 1269 if (k->query_guest_notifiers && 1270 k->query_guest_notifiers(qbus->parent) && 1271 virtio_queue_vector(vdev, idx) == VIRTIO_NO_VECTOR) { 1272 file.fd = -1; 1273 r = dev->vhost_ops->vhost_set_vring_call(dev, &file); 1274 if (r) { 1275 goto fail_vector; 1276 } 1277 } 1278 1279 return 0; 1280 1281 fail_vector: 1282 fail_kick: 1283 fail_alloc: 1284 vhost_memory_unmap(dev, vq->used, virtio_queue_get_used_size(vdev, idx), 1285 0, 0); 1286 fail_alloc_used: 1287 vhost_memory_unmap(dev, vq->avail, virtio_queue_get_avail_size(vdev, idx), 1288 0, 0); 1289 fail_alloc_avail: 1290 vhost_memory_unmap(dev, vq->desc, virtio_queue_get_desc_size(vdev, idx), 1291 0, 0); 1292 fail_alloc_desc: 1293 return r; 1294 } 1295 1296 void vhost_virtqueue_stop(struct vhost_dev *dev, 1297 struct VirtIODevice *vdev, 1298 struct vhost_virtqueue *vq, 1299 unsigned idx) 1300 { 1301 int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, idx); 1302 struct vhost_vring_state state = { 1303 .index = vhost_vq_index, 1304 }; 1305 int r; 1306 1307 if (virtio_queue_get_desc_addr(vdev, idx) == 0) { 1308 /* Don't stop the virtqueue which might have not been started */ 1309 return; 1310 } 1311 1312 r = dev->vhost_ops->vhost_get_vring_base(dev, &state); 1313 if (r < 0) { 1314 VHOST_OPS_DEBUG(r, "vhost VQ %u ring restore failed: %d", idx, r); 1315 /* Connection to the backend is broken, so let's sync internal 1316 * last avail idx to the device used idx. 1317 */ 1318 virtio_queue_restore_last_avail_idx(vdev, idx); 1319 } else { 1320 virtio_queue_set_last_avail_idx(vdev, idx, state.num); 1321 } 1322 virtio_queue_invalidate_signalled_used(vdev, idx); 1323 virtio_queue_update_used_idx(vdev, idx); 1324 1325 /* In the cross-endian case, we need to reset the vring endianness to 1326 * native as legacy devices expect so by default. 1327 */ 1328 if (vhost_needs_vring_endian(vdev)) { 1329 vhost_virtqueue_set_vring_endian_legacy(dev, 1330 !virtio_is_big_endian(vdev), 1331 vhost_vq_index); 1332 } 1333 1334 vhost_memory_unmap(dev, vq->used, virtio_queue_get_used_size(vdev, idx), 1335 1, virtio_queue_get_used_size(vdev, idx)); 1336 vhost_memory_unmap(dev, vq->avail, virtio_queue_get_avail_size(vdev, idx), 1337 0, virtio_queue_get_avail_size(vdev, idx)); 1338 vhost_memory_unmap(dev, vq->desc, virtio_queue_get_desc_size(vdev, idx), 1339 0, virtio_queue_get_desc_size(vdev, idx)); 1340 } 1341 1342 static int vhost_virtqueue_set_busyloop_timeout(struct vhost_dev *dev, 1343 int n, uint32_t timeout) 1344 { 1345 int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, n); 1346 struct vhost_vring_state state = { 1347 .index = vhost_vq_index, 1348 .num = timeout, 1349 }; 1350 int r; 1351 1352 if (!dev->vhost_ops->vhost_set_vring_busyloop_timeout) { 1353 return -EINVAL; 1354 } 1355 1356 r = dev->vhost_ops->vhost_set_vring_busyloop_timeout(dev, &state); 1357 if (r) { 1358 VHOST_OPS_DEBUG(r, "vhost_set_vring_busyloop_timeout failed"); 1359 return r; 1360 } 1361 1362 return 0; 1363 } 1364 1365 static void vhost_virtqueue_error_notifier(EventNotifier *n) 1366 { 1367 struct vhost_virtqueue *vq = container_of(n, struct vhost_virtqueue, 1368 error_notifier); 1369 struct vhost_dev *dev = vq->dev; 1370 int index = vq - dev->vqs; 1371 1372 if (event_notifier_test_and_clear(n) && dev->vdev) { 1373 VHOST_OPS_DEBUG(-EINVAL, "vhost vring error in virtqueue %d", 1374 dev->vq_index + index); 1375 } 1376 } 1377 1378 static int vhost_virtqueue_init(struct vhost_dev *dev, 1379 struct vhost_virtqueue *vq, int n) 1380 { 1381 int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, n); 1382 struct vhost_vring_file file = { 1383 .index = vhost_vq_index, 1384 }; 1385 int r = event_notifier_init(&vq->masked_notifier, 0); 1386 if (r < 0) { 1387 return r; 1388 } 1389 1390 file.fd = event_notifier_get_wfd(&vq->masked_notifier); 1391 r = dev->vhost_ops->vhost_set_vring_call(dev, &file); 1392 if (r) { 1393 VHOST_OPS_DEBUG(r, "vhost_set_vring_call failed"); 1394 goto fail_call; 1395 } 1396 1397 vq->dev = dev; 1398 1399 if (dev->vhost_ops->vhost_set_vring_err) { 1400 r = event_notifier_init(&vq->error_notifier, 0); 1401 if (r < 0) { 1402 goto fail_call; 1403 } 1404 1405 file.fd = event_notifier_get_fd(&vq->error_notifier); 1406 r = dev->vhost_ops->vhost_set_vring_err(dev, &file); 1407 if (r) { 1408 VHOST_OPS_DEBUG(r, "vhost_set_vring_err failed"); 1409 goto fail_err; 1410 } 1411 1412 event_notifier_set_handler(&vq->error_notifier, 1413 vhost_virtqueue_error_notifier); 1414 } 1415 1416 return 0; 1417 1418 fail_err: 1419 event_notifier_cleanup(&vq->error_notifier); 1420 fail_call: 1421 event_notifier_cleanup(&vq->masked_notifier); 1422 return r; 1423 } 1424 1425 static void vhost_virtqueue_cleanup(struct vhost_virtqueue *vq) 1426 { 1427 event_notifier_cleanup(&vq->masked_notifier); 1428 if (vq->dev->vhost_ops->vhost_set_vring_err) { 1429 event_notifier_set_handler(&vq->error_notifier, NULL); 1430 event_notifier_cleanup(&vq->error_notifier); 1431 } 1432 } 1433 1434 int vhost_dev_init(struct vhost_dev *hdev, void *opaque, 1435 VhostBackendType backend_type, uint32_t busyloop_timeout, 1436 Error **errp) 1437 { 1438 unsigned int used, reserved, limit; 1439 uint64_t features; 1440 int i, r, n_initialized_vqs = 0; 1441 1442 hdev->vdev = NULL; 1443 hdev->migration_blocker = NULL; 1444 1445 r = vhost_set_backend_type(hdev, backend_type); 1446 assert(r >= 0); 1447 1448 r = hdev->vhost_ops->vhost_backend_init(hdev, opaque, errp); 1449 if (r < 0) { 1450 goto fail; 1451 } 1452 1453 r = hdev->vhost_ops->vhost_set_owner(hdev); 1454 if (r < 0) { 1455 error_setg_errno(errp, -r, "vhost_set_owner failed"); 1456 goto fail; 1457 } 1458 1459 r = hdev->vhost_ops->vhost_get_features(hdev, &features); 1460 if (r < 0) { 1461 error_setg_errno(errp, -r, "vhost_get_features failed"); 1462 goto fail; 1463 } 1464 1465 limit = hdev->vhost_ops->vhost_backend_memslots_limit(hdev); 1466 if (limit < MEMORY_DEVICES_SAFE_MAX_MEMSLOTS && 1467 memory_devices_memslot_auto_decision_active()) { 1468 error_setg(errp, "some memory device (like virtio-mem)" 1469 " decided how many memory slots to use based on the overall" 1470 " number of memory slots; this vhost backend would further" 1471 " restricts the overall number of memory slots"); 1472 error_append_hint(errp, "Try plugging this vhost backend before" 1473 " plugging such memory devices.\n"); 1474 r = -EINVAL; 1475 goto fail; 1476 } 1477 1478 for (i = 0; i < hdev->nvqs; ++i, ++n_initialized_vqs) { 1479 r = vhost_virtqueue_init(hdev, hdev->vqs + i, hdev->vq_index + i); 1480 if (r < 0) { 1481 error_setg_errno(errp, -r, "Failed to initialize virtqueue %d", i); 1482 goto fail; 1483 } 1484 } 1485 1486 if (busyloop_timeout) { 1487 for (i = 0; i < hdev->nvqs; ++i) { 1488 r = vhost_virtqueue_set_busyloop_timeout(hdev, hdev->vq_index + i, 1489 busyloop_timeout); 1490 if (r < 0) { 1491 error_setg_errno(errp, -r, "Failed to set busyloop timeout"); 1492 goto fail_busyloop; 1493 } 1494 } 1495 } 1496 1497 hdev->features = features; 1498 1499 hdev->memory_listener = (MemoryListener) { 1500 .name = "vhost", 1501 .begin = vhost_begin, 1502 .commit = vhost_commit, 1503 .region_add = vhost_region_addnop, 1504 .region_nop = vhost_region_addnop, 1505 .log_start = vhost_log_start, 1506 .log_stop = vhost_log_stop, 1507 .log_sync = vhost_log_sync, 1508 .log_global_start = vhost_log_global_start, 1509 .log_global_stop = vhost_log_global_stop, 1510 .priority = MEMORY_LISTENER_PRIORITY_DEV_BACKEND 1511 }; 1512 1513 hdev->iommu_listener = (MemoryListener) { 1514 .name = "vhost-iommu", 1515 .region_add = vhost_iommu_region_add, 1516 .region_del = vhost_iommu_region_del, 1517 }; 1518 1519 if (hdev->migration_blocker == NULL) { 1520 if (!(hdev->features & (0x1ULL << VHOST_F_LOG_ALL))) { 1521 error_setg(&hdev->migration_blocker, 1522 "Migration disabled: vhost lacks VHOST_F_LOG_ALL feature."); 1523 } else if (vhost_dev_log_is_shared(hdev) && !qemu_memfd_alloc_check()) { 1524 error_setg(&hdev->migration_blocker, 1525 "Migration disabled: failed to allocate shared memory"); 1526 } 1527 } 1528 1529 if (hdev->migration_blocker != NULL) { 1530 r = migrate_add_blocker(hdev->migration_blocker, errp); 1531 if (r < 0) { 1532 error_free(hdev->migration_blocker); 1533 goto fail_busyloop; 1534 } 1535 } 1536 1537 hdev->mem = g_malloc0(offsetof(struct vhost_memory, regions)); 1538 hdev->n_mem_sections = 0; 1539 hdev->mem_sections = NULL; 1540 hdev->log = NULL; 1541 hdev->log_size = 0; 1542 hdev->log_enabled = false; 1543 hdev->started = false; 1544 memory_listener_register(&hdev->memory_listener, &address_space_memory); 1545 QLIST_INSERT_HEAD(&vhost_devices, hdev, entry); 1546 1547 /* 1548 * The listener we registered properly updated the corresponding counter. 1549 * So we can trust that these values are accurate. 1550 */ 1551 if (hdev->vhost_ops->vhost_backend_no_private_memslots && 1552 hdev->vhost_ops->vhost_backend_no_private_memslots(hdev)) { 1553 used = used_shared_memslots; 1554 } else { 1555 used = used_memslots; 1556 } 1557 /* 1558 * We assume that all reserved memslots actually require a real memslot 1559 * in our vhost backend. This might not be true, for example, if the 1560 * memslot would be ROM. If ever relevant, we can optimize for that -- 1561 * but we'll need additional information about the reservations. 1562 */ 1563 reserved = memory_devices_get_reserved_memslots(); 1564 if (used + reserved > limit) { 1565 error_setg(errp, "vhost backend memory slots limit (%d) is less" 1566 " than current number of used (%d) and reserved (%d)" 1567 " memory slots for memory devices.", limit, used, reserved); 1568 r = -EINVAL; 1569 goto fail_busyloop; 1570 } 1571 1572 return 0; 1573 1574 fail_busyloop: 1575 if (busyloop_timeout) { 1576 while (--i >= 0) { 1577 vhost_virtqueue_set_busyloop_timeout(hdev, hdev->vq_index + i, 0); 1578 } 1579 } 1580 fail: 1581 hdev->nvqs = n_initialized_vqs; 1582 vhost_dev_cleanup(hdev); 1583 return r; 1584 } 1585 1586 void vhost_dev_cleanup(struct vhost_dev *hdev) 1587 { 1588 int i; 1589 1590 trace_vhost_dev_cleanup(hdev); 1591 1592 for (i = 0; i < hdev->nvqs; ++i) { 1593 vhost_virtqueue_cleanup(hdev->vqs + i); 1594 } 1595 if (hdev->mem) { 1596 /* those are only safe after successful init */ 1597 memory_listener_unregister(&hdev->memory_listener); 1598 QLIST_REMOVE(hdev, entry); 1599 } 1600 if (hdev->migration_blocker) { 1601 migrate_del_blocker(hdev->migration_blocker); 1602 error_free(hdev->migration_blocker); 1603 } 1604 g_free(hdev->mem); 1605 g_free(hdev->mem_sections); 1606 if (hdev->vhost_ops) { 1607 hdev->vhost_ops->vhost_backend_cleanup(hdev); 1608 } 1609 assert(!hdev->log); 1610 1611 memset(hdev, 0, sizeof(struct vhost_dev)); 1612 } 1613 1614 static void vhost_dev_disable_notifiers_nvqs(struct vhost_dev *hdev, 1615 VirtIODevice *vdev, 1616 unsigned int nvqs) 1617 { 1618 BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev))); 1619 int i, r; 1620 1621 /* 1622 * Batch all the host notifiers in a single transaction to avoid 1623 * quadratic time complexity in address_space_update_ioeventfds(). 1624 */ 1625 memory_region_transaction_begin(); 1626 1627 for (i = 0; i < nvqs; ++i) { 1628 r = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i, 1629 false); 1630 if (r < 0) { 1631 error_report("vhost VQ %d notifier cleanup failed: %d", i, -r); 1632 } 1633 assert(r >= 0); 1634 } 1635 1636 /* 1637 * The transaction expects the ioeventfds to be open when it 1638 * commits. Do it now, before the cleanup loop. 1639 */ 1640 memory_region_transaction_commit(); 1641 1642 for (i = 0; i < nvqs; ++i) { 1643 virtio_bus_cleanup_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i); 1644 } 1645 virtio_device_release_ioeventfd(vdev); 1646 } 1647 1648 /* Stop processing guest IO notifications in qemu. 1649 * Start processing them in vhost in kernel. 1650 */ 1651 int vhost_dev_enable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev) 1652 { 1653 BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev))); 1654 int i, r; 1655 1656 /* We will pass the notifiers to the kernel, make sure that QEMU 1657 * doesn't interfere. 1658 */ 1659 r = virtio_device_grab_ioeventfd(vdev); 1660 if (r < 0) { 1661 error_report("binding does not support host notifiers"); 1662 return r; 1663 } 1664 1665 /* 1666 * Batch all the host notifiers in a single transaction to avoid 1667 * quadratic time complexity in address_space_update_ioeventfds(). 1668 */ 1669 memory_region_transaction_begin(); 1670 1671 for (i = 0; i < hdev->nvqs; ++i) { 1672 r = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i, 1673 true); 1674 if (r < 0) { 1675 error_report("vhost VQ %d notifier binding failed: %d", i, -r); 1676 memory_region_transaction_commit(); 1677 vhost_dev_disable_notifiers_nvqs(hdev, vdev, i); 1678 return r; 1679 } 1680 } 1681 1682 memory_region_transaction_commit(); 1683 1684 return 0; 1685 } 1686 1687 /* Stop processing guest IO notifications in vhost. 1688 * Start processing them in qemu. 1689 * This might actually run the qemu handlers right away, 1690 * so virtio in qemu must be completely setup when this is called. 1691 */ 1692 void vhost_dev_disable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev) 1693 { 1694 vhost_dev_disable_notifiers_nvqs(hdev, vdev, hdev->nvqs); 1695 } 1696 1697 /* Test and clear event pending status. 1698 * Should be called after unmask to avoid losing events. 1699 */ 1700 bool vhost_virtqueue_pending(struct vhost_dev *hdev, int n) 1701 { 1702 struct vhost_virtqueue *vq = hdev->vqs + n - hdev->vq_index; 1703 assert(n >= hdev->vq_index && n < hdev->vq_index + hdev->nvqs); 1704 return event_notifier_test_and_clear(&vq->masked_notifier); 1705 } 1706 1707 /* Mask/unmask events from this vq. */ 1708 void vhost_virtqueue_mask(struct vhost_dev *hdev, VirtIODevice *vdev, int n, 1709 bool mask) 1710 { 1711 struct VirtQueue *vvq = virtio_get_queue(vdev, n); 1712 int r, index = n - hdev->vq_index; 1713 struct vhost_vring_file file; 1714 1715 /* should only be called after backend is connected */ 1716 assert(hdev->vhost_ops); 1717 1718 if (mask) { 1719 assert(vdev->use_guest_notifier_mask); 1720 file.fd = event_notifier_get_wfd(&hdev->vqs[index].masked_notifier); 1721 } else { 1722 file.fd = event_notifier_get_wfd(virtio_queue_get_guest_notifier(vvq)); 1723 } 1724 1725 file.index = hdev->vhost_ops->vhost_get_vq_index(hdev, n); 1726 r = hdev->vhost_ops->vhost_set_vring_call(hdev, &file); 1727 if (r < 0) { 1728 error_report("vhost_set_vring_call failed %d", -r); 1729 } 1730 } 1731 1732 bool vhost_config_pending(struct vhost_dev *hdev) 1733 { 1734 assert(hdev->vhost_ops); 1735 if ((hdev->started == false) || 1736 (hdev->vhost_ops->vhost_set_config_call == NULL)) { 1737 return false; 1738 } 1739 1740 EventNotifier *notifier = 1741 &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier; 1742 return event_notifier_test_and_clear(notifier); 1743 } 1744 1745 void vhost_config_mask(struct vhost_dev *hdev, VirtIODevice *vdev, bool mask) 1746 { 1747 int fd; 1748 int r; 1749 EventNotifier *notifier = 1750 &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier; 1751 EventNotifier *config_notifier = &vdev->config_notifier; 1752 assert(hdev->vhost_ops); 1753 1754 if ((hdev->started == false) || 1755 (hdev->vhost_ops->vhost_set_config_call == NULL)) { 1756 return; 1757 } 1758 if (mask) { 1759 assert(vdev->use_guest_notifier_mask); 1760 fd = event_notifier_get_fd(notifier); 1761 } else { 1762 fd = event_notifier_get_fd(config_notifier); 1763 } 1764 r = hdev->vhost_ops->vhost_set_config_call(hdev, fd); 1765 if (r < 0) { 1766 error_report("vhost_set_config_call failed %d", -r); 1767 } 1768 } 1769 1770 static void vhost_stop_config_intr(struct vhost_dev *dev) 1771 { 1772 int fd = -1; 1773 assert(dev->vhost_ops); 1774 if (dev->vhost_ops->vhost_set_config_call) { 1775 dev->vhost_ops->vhost_set_config_call(dev, fd); 1776 } 1777 } 1778 1779 static void vhost_start_config_intr(struct vhost_dev *dev) 1780 { 1781 int r; 1782 1783 assert(dev->vhost_ops); 1784 int fd = event_notifier_get_fd(&dev->vdev->config_notifier); 1785 if (dev->vhost_ops->vhost_set_config_call) { 1786 r = dev->vhost_ops->vhost_set_config_call(dev, fd); 1787 if (!r) { 1788 event_notifier_set(&dev->vdev->config_notifier); 1789 } 1790 } 1791 } 1792 1793 uint64_t vhost_get_features(struct vhost_dev *hdev, const int *feature_bits, 1794 uint64_t features) 1795 { 1796 const int *bit = feature_bits; 1797 while (*bit != VHOST_INVALID_FEATURE_BIT) { 1798 uint64_t bit_mask = (1ULL << *bit); 1799 if (!(hdev->features & bit_mask)) { 1800 features &= ~bit_mask; 1801 } 1802 bit++; 1803 } 1804 return features; 1805 } 1806 1807 void vhost_ack_features(struct vhost_dev *hdev, const int *feature_bits, 1808 uint64_t features) 1809 { 1810 const int *bit = feature_bits; 1811 while (*bit != VHOST_INVALID_FEATURE_BIT) { 1812 uint64_t bit_mask = (1ULL << *bit); 1813 if (features & bit_mask) { 1814 hdev->acked_features |= bit_mask; 1815 } 1816 bit++; 1817 } 1818 } 1819 1820 int vhost_dev_get_config(struct vhost_dev *hdev, uint8_t *config, 1821 uint32_t config_len, Error **errp) 1822 { 1823 assert(hdev->vhost_ops); 1824 1825 if (hdev->vhost_ops->vhost_get_config) { 1826 return hdev->vhost_ops->vhost_get_config(hdev, config, config_len, 1827 errp); 1828 } 1829 1830 error_setg(errp, "vhost_get_config not implemented"); 1831 return -ENOSYS; 1832 } 1833 1834 int vhost_dev_set_config(struct vhost_dev *hdev, const uint8_t *data, 1835 uint32_t offset, uint32_t size, uint32_t flags) 1836 { 1837 assert(hdev->vhost_ops); 1838 1839 if (hdev->vhost_ops->vhost_set_config) { 1840 return hdev->vhost_ops->vhost_set_config(hdev, data, offset, 1841 size, flags); 1842 } 1843 1844 return -ENOSYS; 1845 } 1846 1847 void vhost_dev_set_config_notifier(struct vhost_dev *hdev, 1848 const VhostDevConfigOps *ops) 1849 { 1850 hdev->config_ops = ops; 1851 } 1852 1853 void vhost_dev_free_inflight(struct vhost_inflight *inflight) 1854 { 1855 if (inflight && inflight->addr) { 1856 qemu_memfd_free(inflight->addr, inflight->size, inflight->fd); 1857 inflight->addr = NULL; 1858 inflight->fd = -1; 1859 } 1860 } 1861 1862 static int vhost_dev_resize_inflight(struct vhost_inflight *inflight, 1863 uint64_t new_size) 1864 { 1865 Error *err = NULL; 1866 int fd = -1; 1867 void *addr = qemu_memfd_alloc("vhost-inflight", new_size, 1868 F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL, 1869 &fd, &err); 1870 1871 if (err) { 1872 error_report_err(err); 1873 return -ENOMEM; 1874 } 1875 1876 vhost_dev_free_inflight(inflight); 1877 inflight->offset = 0; 1878 inflight->addr = addr; 1879 inflight->fd = fd; 1880 inflight->size = new_size; 1881 1882 return 0; 1883 } 1884 1885 void vhost_dev_save_inflight(struct vhost_inflight *inflight, QEMUFile *f) 1886 { 1887 if (inflight->addr) { 1888 qemu_put_be64(f, inflight->size); 1889 qemu_put_be16(f, inflight->queue_size); 1890 qemu_put_buffer(f, inflight->addr, inflight->size); 1891 } else { 1892 qemu_put_be64(f, 0); 1893 } 1894 } 1895 1896 int vhost_dev_load_inflight(struct vhost_inflight *inflight, QEMUFile *f) 1897 { 1898 uint64_t size; 1899 1900 size = qemu_get_be64(f); 1901 if (!size) { 1902 return 0; 1903 } 1904 1905 if (inflight->size != size) { 1906 int ret = vhost_dev_resize_inflight(inflight, size); 1907 if (ret < 0) { 1908 return ret; 1909 } 1910 } 1911 inflight->queue_size = qemu_get_be16(f); 1912 1913 qemu_get_buffer(f, inflight->addr, size); 1914 1915 return 0; 1916 } 1917 1918 int vhost_dev_prepare_inflight(struct vhost_dev *hdev, VirtIODevice *vdev) 1919 { 1920 int r; 1921 1922 if (hdev->vhost_ops->vhost_get_inflight_fd == NULL || 1923 hdev->vhost_ops->vhost_set_inflight_fd == NULL) { 1924 return 0; 1925 } 1926 1927 hdev->vdev = vdev; 1928 1929 r = vhost_dev_set_features(hdev, hdev->log_enabled); 1930 if (r < 0) { 1931 VHOST_OPS_DEBUG(r, "vhost_dev_prepare_inflight failed"); 1932 return r; 1933 } 1934 1935 return 0; 1936 } 1937 1938 int vhost_dev_set_inflight(struct vhost_dev *dev, 1939 struct vhost_inflight *inflight) 1940 { 1941 int r; 1942 1943 if (dev->vhost_ops->vhost_set_inflight_fd && inflight->addr) { 1944 r = dev->vhost_ops->vhost_set_inflight_fd(dev, inflight); 1945 if (r) { 1946 VHOST_OPS_DEBUG(r, "vhost_set_inflight_fd failed"); 1947 return r; 1948 } 1949 } 1950 1951 return 0; 1952 } 1953 1954 int vhost_dev_get_inflight(struct vhost_dev *dev, uint16_t queue_size, 1955 struct vhost_inflight *inflight) 1956 { 1957 int r; 1958 1959 if (dev->vhost_ops->vhost_get_inflight_fd) { 1960 r = dev->vhost_ops->vhost_get_inflight_fd(dev, queue_size, inflight); 1961 if (r) { 1962 VHOST_OPS_DEBUG(r, "vhost_get_inflight_fd failed"); 1963 return r; 1964 } 1965 } 1966 1967 return 0; 1968 } 1969 1970 static int vhost_dev_set_vring_enable(struct vhost_dev *hdev, int enable) 1971 { 1972 if (!hdev->vhost_ops->vhost_set_vring_enable) { 1973 return 0; 1974 } 1975 1976 /* 1977 * For vhost-user devices, if VHOST_USER_F_PROTOCOL_FEATURES has not 1978 * been negotiated, the rings start directly in the enabled state, and 1979 * .vhost_set_vring_enable callback will fail since 1980 * VHOST_USER_SET_VRING_ENABLE is not supported. 1981 */ 1982 if (hdev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER && 1983 !virtio_has_feature(hdev->backend_features, 1984 VHOST_USER_F_PROTOCOL_FEATURES)) { 1985 return 0; 1986 } 1987 1988 return hdev->vhost_ops->vhost_set_vring_enable(hdev, enable); 1989 } 1990 1991 /* Host notifiers must be enabled at this point. */ 1992 int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings) 1993 { 1994 int i, r; 1995 1996 /* should only be called after backend is connected */ 1997 assert(hdev->vhost_ops); 1998 1999 trace_vhost_dev_start(hdev, vdev->name, vrings); 2000 2001 vdev->vhost_started = true; 2002 hdev->started = true; 2003 hdev->vdev = vdev; 2004 2005 r = vhost_dev_set_features(hdev, hdev->log_enabled); 2006 if (r < 0) { 2007 goto fail_features; 2008 } 2009 2010 if (vhost_dev_has_iommu(hdev)) { 2011 memory_listener_register(&hdev->iommu_listener, vdev->dma_as); 2012 } 2013 2014 r = hdev->vhost_ops->vhost_set_mem_table(hdev, hdev->mem); 2015 if (r < 0) { 2016 VHOST_OPS_DEBUG(r, "vhost_set_mem_table failed"); 2017 goto fail_mem; 2018 } 2019 for (i = 0; i < hdev->nvqs; ++i) { 2020 r = vhost_virtqueue_start(hdev, 2021 vdev, 2022 hdev->vqs + i, 2023 hdev->vq_index + i); 2024 if (r < 0) { 2025 goto fail_vq; 2026 } 2027 } 2028 2029 r = event_notifier_init( 2030 &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier, 0); 2031 if (r < 0) { 2032 VHOST_OPS_DEBUG(r, "event_notifier_init failed"); 2033 goto fail_vq; 2034 } 2035 event_notifier_test_and_clear( 2036 &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier); 2037 if (!vdev->use_guest_notifier_mask) { 2038 vhost_config_mask(hdev, vdev, true); 2039 } 2040 if (hdev->log_enabled) { 2041 uint64_t log_base; 2042 2043 hdev->log_size = vhost_get_log_size(hdev); 2044 hdev->log = vhost_log_get(hdev->log_size, 2045 vhost_dev_log_is_shared(hdev)); 2046 log_base = (uintptr_t)hdev->log->log; 2047 r = hdev->vhost_ops->vhost_set_log_base(hdev, 2048 hdev->log_size ? log_base : 0, 2049 hdev->log); 2050 if (r < 0) { 2051 VHOST_OPS_DEBUG(r, "vhost_set_log_base failed"); 2052 goto fail_log; 2053 } 2054 } 2055 if (vrings) { 2056 r = vhost_dev_set_vring_enable(hdev, true); 2057 if (r) { 2058 goto fail_log; 2059 } 2060 } 2061 if (hdev->vhost_ops->vhost_dev_start) { 2062 r = hdev->vhost_ops->vhost_dev_start(hdev, true); 2063 if (r) { 2064 goto fail_start; 2065 } 2066 } 2067 if (vhost_dev_has_iommu(hdev) && 2068 hdev->vhost_ops->vhost_set_iotlb_callback) { 2069 hdev->vhost_ops->vhost_set_iotlb_callback(hdev, true); 2070 2071 /* Update used ring information for IOTLB to work correctly, 2072 * vhost-kernel code requires for this.*/ 2073 for (i = 0; i < hdev->nvqs; ++i) { 2074 struct vhost_virtqueue *vq = hdev->vqs + i; 2075 vhost_device_iotlb_miss(hdev, vq->used_phys, true); 2076 } 2077 } 2078 vhost_start_config_intr(hdev); 2079 return 0; 2080 fail_start: 2081 if (vrings) { 2082 vhost_dev_set_vring_enable(hdev, false); 2083 } 2084 fail_log: 2085 vhost_log_put(hdev, false); 2086 fail_vq: 2087 while (--i >= 0) { 2088 vhost_virtqueue_stop(hdev, 2089 vdev, 2090 hdev->vqs + i, 2091 hdev->vq_index + i); 2092 } 2093 2094 fail_mem: 2095 if (vhost_dev_has_iommu(hdev)) { 2096 memory_listener_unregister(&hdev->iommu_listener); 2097 } 2098 fail_features: 2099 vdev->vhost_started = false; 2100 hdev->started = false; 2101 return r; 2102 } 2103 2104 /* Host notifiers must be enabled at this point. */ 2105 void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings) 2106 { 2107 int i; 2108 2109 /* should only be called after backend is connected */ 2110 assert(hdev->vhost_ops); 2111 event_notifier_test_and_clear( 2112 &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier); 2113 event_notifier_test_and_clear(&vdev->config_notifier); 2114 event_notifier_cleanup( 2115 &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier); 2116 2117 trace_vhost_dev_stop(hdev, vdev->name, vrings); 2118 2119 if (hdev->vhost_ops->vhost_dev_start) { 2120 hdev->vhost_ops->vhost_dev_start(hdev, false); 2121 } 2122 if (vrings) { 2123 vhost_dev_set_vring_enable(hdev, false); 2124 } 2125 for (i = 0; i < hdev->nvqs; ++i) { 2126 vhost_virtqueue_stop(hdev, 2127 vdev, 2128 hdev->vqs + i, 2129 hdev->vq_index + i); 2130 } 2131 if (hdev->vhost_ops->vhost_reset_status) { 2132 hdev->vhost_ops->vhost_reset_status(hdev); 2133 } 2134 2135 if (vhost_dev_has_iommu(hdev)) { 2136 if (hdev->vhost_ops->vhost_set_iotlb_callback) { 2137 hdev->vhost_ops->vhost_set_iotlb_callback(hdev, false); 2138 } 2139 memory_listener_unregister(&hdev->iommu_listener); 2140 } 2141 vhost_stop_config_intr(hdev); 2142 vhost_log_put(hdev, true); 2143 hdev->started = false; 2144 vdev->vhost_started = false; 2145 hdev->vdev = NULL; 2146 } 2147 2148 int vhost_net_set_backend(struct vhost_dev *hdev, 2149 struct vhost_vring_file *file) 2150 { 2151 if (hdev->vhost_ops->vhost_net_set_backend) { 2152 return hdev->vhost_ops->vhost_net_set_backend(hdev, file); 2153 } 2154 2155 return -ENOSYS; 2156 } 2157