/*
 * vhost support
 *
 * Copyright Red Hat, Inc. 2010
 *
 * Authors:
 *  Michael S. Tsirkin <mst@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 * Contributions after 2012-01-13 are licensed under the terms of the
 * GNU GPL, version 2 or (at your option) any later version.
 */

#include "qemu/osdep.h"
#include "hw/virtio/vhost.h"
#include "hw/hw.h"
#include "qemu/atomic.h"
#include "qemu/range.h"
#include "qemu/error-report.h"
#include "qemu/memfd.h"
#include <linux/vhost.h>
#include "exec/address-spaces.h"
#include "hw/virtio/virtio-bus.h"
#include "hw/virtio/virtio-access.h"
#include "migration/migration.h"

static struct vhost_log *vhost_log;
static struct vhost_log *vhost_log_shm;

static unsigned int used_memslots;
static QLIST_HEAD(, vhost_dev) vhost_devices =
    QLIST_HEAD_INITIALIZER(vhost_devices);

bool vhost_has_free_slot(void)
{
    unsigned int slots_limit = ~0U;
    struct vhost_dev *hdev;

    QLIST_FOREACH(hdev, &vhost_devices, entry) {
        unsigned int r = hdev->vhost_ops->vhost_backend_memslots_limit(hdev);
        slots_limit = MIN(slots_limit, r);
    }
    return slots_limit > used_memslots;
}

static void vhost_dev_sync_region(struct vhost_dev *dev,
                                  MemoryRegionSection *section,
                                  uint64_t mfirst, uint64_t mlast,
                                  uint64_t rfirst, uint64_t rlast)
{
    vhost_log_chunk_t *log = dev->log->log;

    uint64_t start = MAX(mfirst, rfirst);
    uint64_t end = MIN(mlast, rlast);
    vhost_log_chunk_t *from = log + start / VHOST_LOG_CHUNK;
    vhost_log_chunk_t *to = log + end / VHOST_LOG_CHUNK + 1;
    uint64_t addr = (start / VHOST_LOG_CHUNK) * VHOST_LOG_CHUNK;

    if (end < start) {
        return;
    }
    assert(end / VHOST_LOG_CHUNK < dev->log_size);
    assert(start / VHOST_LOG_CHUNK < dev->log_size);

    for (; from < to; ++from) {
        vhost_log_chunk_t log;
        /* We first check with non-atomic: much cheaper,
         * and we expect non-dirty to be the common case. */
        if (!*from) {
            addr += VHOST_LOG_CHUNK;
            continue;
        }
        /* Data must be read atomically. We don't really need barrier semantics
         * but it's easier to use atomic_* than roll our own.
         */
        log = atomic_xchg(from, 0);
        while (log) {
            int bit = ctzl(log);
            hwaddr page_addr;
            hwaddr section_offset;
            hwaddr mr_offset;
            page_addr = addr + bit * VHOST_LOG_PAGE;
            section_offset = page_addr - section->offset_within_address_space;
            mr_offset = section_offset + section->offset_within_region;
            memory_region_set_dirty(section->mr, mr_offset, VHOST_LOG_PAGE);
            log &= ~(0x1ull << bit);
        }
        addr += VHOST_LOG_CHUNK;
    }
}
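/*
 * Illustrative sketch of the dirty-log arithmetic above (assuming the usual
 * definitions in hw/virtio/vhost.h: VHOST_LOG_PAGE == 0x1000 and a 64-bit
 * vhost_log_chunk_t, so one chunk covers 64 guest pages, i.e. 256KiB):
 *
 *     from = log + start / VHOST_LOG_CHUNK;
 *     bit b set in *from  =>  guest page at addr + b * VHOST_LOG_PAGE is dirty
 *
 * For example, bit 3 of the chunk covering [0x40000, 0x80000) marks guest
 * page 0x43000, which is then translated into an offset within the section's
 * MemoryRegion and passed to memory_region_set_dirty().
 */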
static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
                                   MemoryRegionSection *section,
                                   hwaddr first,
                                   hwaddr last)
{
    int i;
    hwaddr start_addr;
    hwaddr end_addr;

    if (!dev->log_enabled || !dev->started) {
        return 0;
    }
    start_addr = section->offset_within_address_space;
    end_addr = range_get_last(start_addr, int128_get64(section->size));
    start_addr = MAX(first, start_addr);
    end_addr = MIN(last, end_addr);

    for (i = 0; i < dev->mem->nregions; ++i) {
        struct vhost_memory_region *reg = dev->mem->regions + i;
        vhost_dev_sync_region(dev, section, start_addr, end_addr,
                              reg->guest_phys_addr,
                              range_get_last(reg->guest_phys_addr,
                                             reg->memory_size));
    }
    for (i = 0; i < dev->nvqs; ++i) {
        struct vhost_virtqueue *vq = dev->vqs + i;
        vhost_dev_sync_region(dev, section, start_addr, end_addr, vq->used_phys,
                              range_get_last(vq->used_phys, vq->used_size));
    }
    return 0;
}

static void vhost_log_sync(MemoryListener *listener,
                           MemoryRegionSection *section)
{
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
                                         memory_listener);
    vhost_sync_dirty_bitmap(dev, section, 0x0, ~0x0ULL);
}

static void vhost_log_sync_range(struct vhost_dev *dev,
                                 hwaddr first, hwaddr last)
{
    int i;
    /* FIXME: this is N^2 in number of sections */
    for (i = 0; i < dev->n_mem_sections; ++i) {
        MemoryRegionSection *section = &dev->mem_sections[i];
        vhost_sync_dirty_bitmap(dev, section, first, last);
    }
}

/* Assign/unassign. Keep an unsorted array of non-overlapping
 * memory regions in dev->mem. */
static void vhost_dev_unassign_memory(struct vhost_dev *dev,
                                      uint64_t start_addr,
                                      uint64_t size)
{
    int from, to, n = dev->mem->nregions;
    /* Track overlapping/split regions for sanity checking. */
    int overlap_start = 0, overlap_end = 0, overlap_middle = 0, split = 0;

    for (from = 0, to = 0; from < n; ++from, ++to) {
        struct vhost_memory_region *reg = dev->mem->regions + to;
        uint64_t reglast;
        uint64_t memlast;
        uint64_t change;

        /* clone old region */
        if (to != from) {
            memcpy(reg, dev->mem->regions + from, sizeof *reg);
        }

        /* No overlap is simple */
        if (!ranges_overlap(reg->guest_phys_addr, reg->memory_size,
                            start_addr, size)) {
            continue;
        }

        /* Split only happens if supplied region
         * is in the middle of an existing one. Thus it can not
         * overlap with any other existing region.
         */
        assert(!split);

        reglast = range_get_last(reg->guest_phys_addr, reg->memory_size);
        memlast = range_get_last(start_addr, size);

        /* Remove whole region */
        if (start_addr <= reg->guest_phys_addr && memlast >= reglast) {
            --dev->mem->nregions;
            --to;
            ++overlap_middle;
            continue;
        }

        /* Shrink region */
        if (memlast >= reglast) {
            reg->memory_size = start_addr - reg->guest_phys_addr;
            assert(reg->memory_size);
            assert(!overlap_end);
            ++overlap_end;
            continue;
        }

        /* Shift region */
        if (start_addr <= reg->guest_phys_addr) {
            change = memlast + 1 - reg->guest_phys_addr;
            reg->memory_size -= change;
            reg->guest_phys_addr += change;
            reg->userspace_addr += change;
            assert(reg->memory_size);
            assert(!overlap_start);
            ++overlap_start;
            continue;
        }

        /* This only happens if supplied region
         * is in the middle of an existing one. Thus it can not
         * overlap with any other existing region. */
        assert(!overlap_start);
        assert(!overlap_end);
        assert(!overlap_middle);
        /* Split region: shrink first part, shift second part. */
        memcpy(dev->mem->regions + n, reg, sizeof *reg);
        reg->memory_size = start_addr - reg->guest_phys_addr;
        assert(reg->memory_size);
        change = memlast + 1 - reg->guest_phys_addr;
        reg = dev->mem->regions + n;
        reg->memory_size -= change;
        assert(reg->memory_size);
        reg->guest_phys_addr += change;
        reg->userspace_addr += change;
        /* Never add more than 1 region */
        assert(dev->mem->nregions == n);
        ++dev->mem->nregions;
        ++split;
    }
}
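/*
 * Worked example for the unassign cases above (illustrative only, not part
 * of the original code): unassigning [0x4000, 0x8000) from a device whose
 * table holds a single region covering [0x0, 0x10000) hits the final
 * "split" case: the existing region is shrunk to [0x0, 0x4000) and a new
 * region [0x8000, 0x10000) is appended, with userspace_addr shifted by the
 * same 0x8000 delta.  A range that covers a region completely removes it,
 * and ranges overlapping only one end take the "shrink" or "shift" paths.
 */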
/* Called after unassign, so no regions overlap the given range. */
static void vhost_dev_assign_memory(struct vhost_dev *dev,
                                    uint64_t start_addr,
                                    uint64_t size,
                                    uint64_t uaddr)
{
    int from, to;
    struct vhost_memory_region *merged = NULL;
    for (from = 0, to = 0; from < dev->mem->nregions; ++from, ++to) {
        struct vhost_memory_region *reg = dev->mem->regions + to;
        uint64_t prlast, urlast;
        uint64_t pmlast, umlast;
        uint64_t s, e, u;

        /* clone old region */
        if (to != from) {
            memcpy(reg, dev->mem->regions + from, sizeof *reg);
        }
        prlast = range_get_last(reg->guest_phys_addr, reg->memory_size);
        pmlast = range_get_last(start_addr, size);
        urlast = range_get_last(reg->userspace_addr, reg->memory_size);
        umlast = range_get_last(uaddr, size);

        /* check for overlapping regions: should never happen. */
        assert(prlast < start_addr || pmlast < reg->guest_phys_addr);

        /* Not an adjacent or overlapping region - do not merge. */
        if ((prlast + 1 != start_addr || urlast + 1 != uaddr) &&
            (pmlast + 1 != reg->guest_phys_addr ||
             umlast + 1 != reg->userspace_addr)) {
            continue;
        }

        if (dev->vhost_ops->vhost_backend_can_merge &&
            !dev->vhost_ops->vhost_backend_can_merge(dev, uaddr, size,
                                                     reg->userspace_addr,
                                                     reg->memory_size)) {
            continue;
        }

        if (merged) {
            --to;
            assert(to >= 0);
        } else {
            merged = reg;
        }
        u = MIN(uaddr, reg->userspace_addr);
        s = MIN(start_addr, reg->guest_phys_addr);
        e = MAX(pmlast, prlast);
        uaddr = merged->userspace_addr = u;
        start_addr = merged->guest_phys_addr = s;
        size = merged->memory_size = e - s + 1;
        assert(merged->memory_size);
    }

    if (!merged) {
        struct vhost_memory_region *reg = dev->mem->regions + to;
        memset(reg, 0, sizeof *reg);
        reg->memory_size = size;
        assert(reg->memory_size);
        reg->guest_phys_addr = start_addr;
        reg->userspace_addr = uaddr;
        ++to;
    }
    assert(to <= dev->mem->nregions + 1);
    dev->mem->nregions = to;
}
static uint64_t vhost_get_log_size(struct vhost_dev *dev)
{
    uint64_t log_size = 0;
    int i;
    for (i = 0; i < dev->mem->nregions; ++i) {
        struct vhost_memory_region *reg = dev->mem->regions + i;
        uint64_t last = range_get_last(reg->guest_phys_addr,
                                       reg->memory_size);
        log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1);
    }
    for (i = 0; i < dev->nvqs; ++i) {
        struct vhost_virtqueue *vq = dev->vqs + i;
        uint64_t last = vq->used_phys + vq->used_size - 1;
        log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1);
    }
    return log_size;
}

static struct vhost_log *vhost_log_alloc(uint64_t size, bool share)
{
    struct vhost_log *log;
    uint64_t logsize = size * sizeof(*(log->log));
    int fd = -1;

    log = g_new0(struct vhost_log, 1);
    if (share) {
        log->log = qemu_memfd_alloc("vhost-log", logsize,
                                    F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL,
                                    &fd);
        memset(log->log, 0, logsize);
    } else {
        log->log = g_malloc0(logsize);
    }

    log->size = size;
    log->refcnt = 1;
    log->fd = fd;

    return log;
}

static struct vhost_log *vhost_log_get(uint64_t size, bool share)
{
    struct vhost_log *log = share ? vhost_log_shm : vhost_log;

    if (!log || log->size != size) {
        log = vhost_log_alloc(size, share);
        if (share) {
            vhost_log_shm = log;
        } else {
            vhost_log = log;
        }
    } else {
        ++log->refcnt;
    }

    return log;
}
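/*
 * Note on the log cache above (descriptive comment, no behavioural change):
 * one log buffer per backing type (heap vs. memfd) is cached in the globals
 * vhost_log / vhost_log_shm and shared between devices via a reference
 * count.  vhost_log_get() reuses the cached buffer when the requested size
 * matches and allocates a replacement otherwise; references are dropped
 * through vhost_log_put() below.
 */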
static void vhost_log_put(struct vhost_dev *dev, bool sync)
{
    struct vhost_log *log = dev->log;

    if (!log) {
        return;
    }

    --log->refcnt;
    if (log->refcnt == 0) {
        /* Sync only the range covered by the old log */
        if (dev->log_size && sync) {
            vhost_log_sync_range(dev, 0, dev->log_size * VHOST_LOG_CHUNK - 1);
        }

        if (vhost_log == log) {
            g_free(log->log);
            vhost_log = NULL;
        } else if (vhost_log_shm == log) {
            qemu_memfd_free(log->log, log->size * sizeof(*(log->log)),
                            log->fd);
            vhost_log_shm = NULL;
        }

        g_free(log);
    }
}

static bool vhost_dev_log_is_shared(struct vhost_dev *dev)
{
    return dev->vhost_ops->vhost_requires_shm_log &&
           dev->vhost_ops->vhost_requires_shm_log(dev);
}

static inline void vhost_dev_log_resize(struct vhost_dev *dev, uint64_t size)
{
    struct vhost_log *log = vhost_log_get(size, vhost_dev_log_is_shared(dev));
    uint64_t log_base = (uintptr_t)log->log;
    int r;

    /* inform backend of log switching, this must be done before
       releasing the current log, to ensure no logging is lost */
    r = dev->vhost_ops->vhost_set_log_base(dev, log_base, log);
    assert(r >= 0);
    vhost_log_put(dev, true);
    dev->log = log;
    dev->log_size = size;
}

static int vhost_verify_ring_mappings(struct vhost_dev *dev,
                                      uint64_t start_addr,
                                      uint64_t size)
{
    int i;
    int r = 0;

    for (i = 0; !r && i < dev->nvqs; ++i) {
        struct vhost_virtqueue *vq = dev->vqs + i;
        hwaddr l;
        void *p;

        if (!ranges_overlap(start_addr, size, vq->ring_phys, vq->ring_size)) {
            continue;
        }
        l = vq->ring_size;
        p = cpu_physical_memory_map(vq->ring_phys, &l, 1);
        if (!p || l != vq->ring_size) {
            fprintf(stderr, "Unable to map ring buffer for ring %d\n", i);
            r = -ENOMEM;
        }
        if (p != vq->ring) {
            fprintf(stderr, "Ring buffer relocated for ring %d\n", i);
            r = -EBUSY;
        }
        cpu_physical_memory_unmap(p, l, 0, 0);
    }
    return r;
}

static struct vhost_memory_region *vhost_dev_find_reg(struct vhost_dev *dev,
                                                      uint64_t start_addr,
                                                      uint64_t size)
{
    int i, n = dev->mem->nregions;
    for (i = 0; i < n; ++i) {
        struct vhost_memory_region *reg = dev->mem->regions + i;
        if (ranges_overlap(reg->guest_phys_addr, reg->memory_size,
                           start_addr, size)) {
            return reg;
        }
    }
    return NULL;
}

static bool vhost_dev_cmp_memory(struct vhost_dev *dev,
                                 uint64_t start_addr,
                                 uint64_t size,
                                 uint64_t uaddr)
{
    struct vhost_memory_region *reg = vhost_dev_find_reg(dev, start_addr, size);
    uint64_t reglast;
    uint64_t memlast;

    if (!reg) {
        return true;
    }

    reglast = range_get_last(reg->guest_phys_addr, reg->memory_size);
    memlast = range_get_last(start_addr, size);

    /* Need to extend region? */
    if (start_addr < reg->guest_phys_addr || memlast > reglast) {
        return true;
    }
    /* userspace_addr changed? */
    return uaddr != reg->userspace_addr + start_addr - reg->guest_phys_addr;
}
static void vhost_set_memory(MemoryListener *listener,
                             MemoryRegionSection *section,
                             bool add)
{
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
                                         memory_listener);
    hwaddr start_addr = section->offset_within_address_space;
    ram_addr_t size = int128_get64(section->size);
    bool log_dirty =
        memory_region_get_dirty_log_mask(section->mr) & ~(1 << DIRTY_MEMORY_MIGRATION);
    int s = offsetof(struct vhost_memory, regions) +
        (dev->mem->nregions + 1) * sizeof dev->mem->regions[0];
    void *ram;

    dev->mem = g_realloc(dev->mem, s);

    if (log_dirty) {
        add = false;
    }

    assert(size);

    /* Optimize no-change case. At least cirrus_vga does this a lot at this time. */
    ram = memory_region_get_ram_ptr(section->mr) + section->offset_within_region;
    if (add) {
        if (!vhost_dev_cmp_memory(dev, start_addr, size, (uintptr_t)ram)) {
            /* Region exists with same address. Nothing to do. */
            return;
        }
    } else {
        if (!vhost_dev_find_reg(dev, start_addr, size)) {
            /* Removing region that we don't access. Nothing to do. */
            return;
        }
    }

    vhost_dev_unassign_memory(dev, start_addr, size);
    if (add) {
        /* Add given mapping, merging adjacent regions if any */
        vhost_dev_assign_memory(dev, start_addr, size, (uintptr_t)ram);
    } else {
        /* Remove old mapping for this memory, if any. */
        vhost_dev_unassign_memory(dev, start_addr, size);
    }
    dev->mem_changed_start_addr = MIN(dev->mem_changed_start_addr, start_addr);
    dev->mem_changed_end_addr = MAX(dev->mem_changed_end_addr, start_addr + size - 1);
    dev->memory_changed = true;
    used_memslots = dev->mem->nregions;
}

static bool vhost_section(MemoryRegionSection *section)
{
    return memory_region_is_ram(section->mr);
}

static void vhost_begin(MemoryListener *listener)
{
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
                                         memory_listener);
    dev->mem_changed_end_addr = 0;
    dev->mem_changed_start_addr = -1;
}

static void vhost_commit(MemoryListener *listener)
{
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
                                         memory_listener);
    hwaddr start_addr = 0;
    ram_addr_t size = 0;
    uint64_t log_size;
    int r;

    if (!dev->memory_changed) {
        return;
    }
    if (!dev->started) {
        return;
    }
    if (dev->mem_changed_start_addr > dev->mem_changed_end_addr) {
        return;
    }

    if (dev->started) {
        start_addr = dev->mem_changed_start_addr;
        size = dev->mem_changed_end_addr - dev->mem_changed_start_addr + 1;

        r = vhost_verify_ring_mappings(dev, start_addr, size);
        assert(r >= 0);
    }

    if (!dev->log_enabled) {
        r = dev->vhost_ops->vhost_set_mem_table(dev, dev->mem);
        assert(r >= 0);
        dev->memory_changed = false;
        return;
    }
    log_size = vhost_get_log_size(dev);
    /* We allocate an extra 4K bytes to log,
     * to reduce the number of reallocations. */
#define VHOST_LOG_BUFFER (0x1000 / sizeof *dev->log)
    /* To log more, must increase log size before table update. */
    if (dev->log_size < log_size) {
        vhost_dev_log_resize(dev, log_size + VHOST_LOG_BUFFER);
    }
    r = dev->vhost_ops->vhost_set_mem_table(dev, dev->mem);
    assert(r >= 0);
    /* To log less, can only decrease log size after table update. */
    if (dev->log_size > log_size + VHOST_LOG_BUFFER) {
        vhost_dev_log_resize(dev, log_size);
    }
    dev->memory_changed = false;
}
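/*
 * Ordering note for vhost_commit() above (descriptive only): the backend may
 * start logging writes against the new memory layout as soon as the table is
 * pushed with vhost_set_mem_table(), so a larger log must already be in
 * place before the update; conversely, memory present only in the old table
 * may still be logged until the update completes, so the log can only be
 * shrunk afterwards.
 */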
static void vhost_region_add(MemoryListener *listener,
                             MemoryRegionSection *section)
{
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
                                         memory_listener);

    if (!vhost_section(section)) {
        return;
    }

    ++dev->n_mem_sections;
    dev->mem_sections = g_renew(MemoryRegionSection, dev->mem_sections,
                                dev->n_mem_sections);
    dev->mem_sections[dev->n_mem_sections - 1] = *section;
    memory_region_ref(section->mr);
    vhost_set_memory(listener, section, true);
}

static void vhost_region_del(MemoryListener *listener,
                             MemoryRegionSection *section)
{
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
                                         memory_listener);
    int i;

    if (!vhost_section(section)) {
        return;
    }

    vhost_set_memory(listener, section, false);
    memory_region_unref(section->mr);
    for (i = 0; i < dev->n_mem_sections; ++i) {
        if (dev->mem_sections[i].offset_within_address_space
            == section->offset_within_address_space) {
            --dev->n_mem_sections;
            memmove(&dev->mem_sections[i], &dev->mem_sections[i + 1],
                    (dev->n_mem_sections - i) * sizeof(*dev->mem_sections));
            break;
        }
    }
}

static void vhost_region_nop(MemoryListener *listener,
                             MemoryRegionSection *section)
{
}

static int vhost_virtqueue_set_addr(struct vhost_dev *dev,
                                    struct vhost_virtqueue *vq,
                                    unsigned idx, bool enable_log)
{
    struct vhost_vring_addr addr = {
        .index = idx,
        .desc_user_addr = (uint64_t)(unsigned long)vq->desc,
        .avail_user_addr = (uint64_t)(unsigned long)vq->avail,
        .used_user_addr = (uint64_t)(unsigned long)vq->used,
        .log_guest_addr = vq->used_phys,
        .flags = enable_log ? (1 << VHOST_VRING_F_LOG) : 0,
    };
    int r = dev->vhost_ops->vhost_set_vring_addr(dev, &addr);
    if (r < 0) {
        return -errno;
    }
    return 0;
}

static int vhost_dev_set_features(struct vhost_dev *dev, bool enable_log)
{
    uint64_t features = dev->acked_features;
    int r;
    if (enable_log) {
        features |= 0x1ULL << VHOST_F_LOG_ALL;
    }
    r = dev->vhost_ops->vhost_set_features(dev, features);
    return r < 0 ? -errno : 0;
}
static int vhost_dev_set_log(struct vhost_dev *dev, bool enable_log)
{
    int r, t, i, idx;
    r = vhost_dev_set_features(dev, enable_log);
    if (r < 0) {
        goto err_features;
    }
    for (i = 0; i < dev->nvqs; ++i) {
        idx = dev->vhost_ops->vhost_get_vq_index(dev, dev->vq_index + i);
        r = vhost_virtqueue_set_addr(dev, dev->vqs + i, idx,
                                     enable_log);
        if (r < 0) {
            goto err_vq;
        }
    }
    return 0;
err_vq:
    for (; i >= 0; --i) {
        idx = dev->vhost_ops->vhost_get_vq_index(dev, dev->vq_index + i);
        t = vhost_virtqueue_set_addr(dev, dev->vqs + i, idx,
                                     dev->log_enabled);
        assert(t >= 0);
    }
    t = vhost_dev_set_features(dev, dev->log_enabled);
    assert(t >= 0);
err_features:
    return r;
}

static int vhost_migration_log(MemoryListener *listener, int enable)
{
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
                                         memory_listener);
    int r;
    if (!!enable == dev->log_enabled) {
        return 0;
    }
    if (!dev->started) {
        dev->log_enabled = enable;
        return 0;
    }
    if (!enable) {
        r = vhost_dev_set_log(dev, false);
        if (r < 0) {
            return r;
        }
        vhost_log_put(dev, false);
        dev->log = NULL;
        dev->log_size = 0;
    } else {
        vhost_dev_log_resize(dev, vhost_get_log_size(dev));
        r = vhost_dev_set_log(dev, true);
        if (r < 0) {
            return r;
        }
    }
    dev->log_enabled = enable;
    return 0;
}

static void vhost_log_global_start(MemoryListener *listener)
{
    int r;

    r = vhost_migration_log(listener, true);
    if (r < 0) {
        abort();
    }
}

static void vhost_log_global_stop(MemoryListener *listener)
{
    int r;

    r = vhost_migration_log(listener, false);
    if (r < 0) {
        abort();
    }
}

static void vhost_log_start(MemoryListener *listener,
                            MemoryRegionSection *section,
                            int old, int new)
{
    /* FIXME: implement */
}

static void vhost_log_stop(MemoryListener *listener,
                           MemoryRegionSection *section,
                           int old, int new)
{
    /* FIXME: implement */
}
/* The vhost driver natively knows how to handle the vrings of non
 * cross-endian legacy devices and modern devices. Only legacy devices
 * exposed to a bi-endian guest may require the vhost driver to use a
 * specific endianness.
 */
static inline bool vhost_needs_vring_endian(VirtIODevice *vdev)
{
    if (virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) {
        return false;
    }
#ifdef TARGET_IS_BIENDIAN
#ifdef HOST_WORDS_BIGENDIAN
    return vdev->device_endian == VIRTIO_DEVICE_ENDIAN_LITTLE;
#else
    return vdev->device_endian == VIRTIO_DEVICE_ENDIAN_BIG;
#endif
#else
    return false;
#endif
}

static int vhost_virtqueue_set_vring_endian_legacy(struct vhost_dev *dev,
                                                   bool is_big_endian,
                                                   int vhost_vq_index)
{
    struct vhost_vring_state s = {
        .index = vhost_vq_index,
        .num = is_big_endian
    };

    if (!dev->vhost_ops->vhost_set_vring_endian(dev, &s)) {
        return 0;
    }

    if (errno == ENOTTY) {
        error_report("vhost does not support cross-endian");
        return -ENOSYS;
    }

    return -errno;
}

static int vhost_virtqueue_start(struct vhost_dev *dev,
                                 struct VirtIODevice *vdev,
                                 struct vhost_virtqueue *vq,
                                 unsigned idx)
{
    hwaddr s, l, a;
    int r;
    int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, idx);
    struct vhost_vring_file file = {
        .index = vhost_vq_index
    };
    struct vhost_vring_state state = {
        .index = vhost_vq_index
    };
    struct VirtQueue *vvq = virtio_get_queue(vdev, idx);

    vq->num = state.num = virtio_queue_get_num(vdev, idx);
    r = dev->vhost_ops->vhost_set_vring_num(dev, &state);
    if (r) {
        return -errno;
    }

    state.num = virtio_queue_get_last_avail_idx(vdev, idx);
    r = dev->vhost_ops->vhost_set_vring_base(dev, &state);
    if (r) {
        return -errno;
    }

    if (vhost_needs_vring_endian(vdev)) {
        r = vhost_virtqueue_set_vring_endian_legacy(dev,
                                                    virtio_is_big_endian(vdev),
                                                    vhost_vq_index);
        if (r) {
            return -errno;
        }
    }

    s = l = virtio_queue_get_desc_size(vdev, idx);
    a = virtio_queue_get_desc_addr(vdev, idx);
    vq->desc = cpu_physical_memory_map(a, &l, 0);
    if (!vq->desc || l != s) {
        r = -ENOMEM;
        goto fail_alloc_desc;
    }
    s = l = virtio_queue_get_avail_size(vdev, idx);
    a = virtio_queue_get_avail_addr(vdev, idx);
    vq->avail = cpu_physical_memory_map(a, &l, 0);
    if (!vq->avail || l != s) {
        r = -ENOMEM;
        goto fail_alloc_avail;
    }
    vq->used_size = s = l = virtio_queue_get_used_size(vdev, idx);
    vq->used_phys = a = virtio_queue_get_used_addr(vdev, idx);
    vq->used = cpu_physical_memory_map(a, &l, 1);
    if (!vq->used || l != s) {
        r = -ENOMEM;
        goto fail_alloc_used;
    }

    vq->ring_size = s = l = virtio_queue_get_ring_size(vdev, idx);
    vq->ring_phys = a = virtio_queue_get_ring_addr(vdev, idx);
    vq->ring = cpu_physical_memory_map(a, &l, 1);
    if (!vq->ring || l != s) {
        r = -ENOMEM;
        goto fail_alloc_ring;
    }

    r = vhost_virtqueue_set_addr(dev, vq, vhost_vq_index, dev->log_enabled);
    if (r < 0) {
        r = -errno;
        goto fail_alloc;
    }

    file.fd = event_notifier_get_fd(virtio_queue_get_host_notifier(vvq));
    r = dev->vhost_ops->vhost_set_vring_kick(dev, &file);
    if (r) {
        r = -errno;
        goto fail_kick;
    }

    /* Clear and discard previous events if any. */
    event_notifier_test_and_clear(&vq->masked_notifier);

    /* Init vring in unmasked state, unless guest_notifier_mask
     * will do it later.
     */
    if (!vdev->use_guest_notifier_mask) {
        /* TODO: check and handle errors. */
        vhost_virtqueue_mask(dev, vdev, idx, false);
    }

    return 0;

fail_kick:
fail_alloc:
    cpu_physical_memory_unmap(vq->ring, virtio_queue_get_ring_size(vdev, idx),
                              0, 0);
fail_alloc_ring:
    cpu_physical_memory_unmap(vq->used, virtio_queue_get_used_size(vdev, idx),
                              0, 0);
fail_alloc_used:
    cpu_physical_memory_unmap(vq->avail, virtio_queue_get_avail_size(vdev, idx),
                              0, 0);
fail_alloc_avail:
    cpu_physical_memory_unmap(vq->desc, virtio_queue_get_desc_size(vdev, idx),
                              0, 0);
fail_alloc_desc:
    return r;
}
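/*
 * Note on the mappings set up above (descriptive only): the descriptor and
 * available rings are mapped with is_write == 0 (QEMU only reads them),
 * while the used ring and the ring as a whole are mapped with is_write == 1.
 * The cached vq->ring pointer and vq->ring_size are what
 * vhost_verify_ring_mappings() later compares against when the guest memory
 * map changes.
 */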
static void vhost_virtqueue_stop(struct vhost_dev *dev,
                                 struct VirtIODevice *vdev,
                                 struct vhost_virtqueue *vq,
                                 unsigned idx)
{
    int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, idx);
    struct vhost_vring_state state = {
        .index = vhost_vq_index,
    };
    int r;

    r = dev->vhost_ops->vhost_get_vring_base(dev, &state);
    if (r < 0) {
        fprintf(stderr, "vhost VQ %d ring restore failed: %d\n", idx, r);
        fflush(stderr);
    }
    virtio_queue_set_last_avail_idx(vdev, idx, state.num);
    virtio_queue_invalidate_signalled_used(vdev, idx);

    /* In the cross-endian case, we need to reset the vring endianness to
     * native, as legacy devices expect by default.
     */
    if (vhost_needs_vring_endian(vdev)) {
        r = vhost_virtqueue_set_vring_endian_legacy(dev,
                                                    !virtio_is_big_endian(vdev),
                                                    vhost_vq_index);
        if (r < 0) {
            error_report("failed to reset vring endianness");
        }
    }

    assert(r >= 0);
    cpu_physical_memory_unmap(vq->ring, virtio_queue_get_ring_size(vdev, idx),
                              0, virtio_queue_get_ring_size(vdev, idx));
    cpu_physical_memory_unmap(vq->used, virtio_queue_get_used_size(vdev, idx),
                              1, virtio_queue_get_used_size(vdev, idx));
    cpu_physical_memory_unmap(vq->avail, virtio_queue_get_avail_size(vdev, idx),
                              0, virtio_queue_get_avail_size(vdev, idx));
    cpu_physical_memory_unmap(vq->desc, virtio_queue_get_desc_size(vdev, idx),
                              0, virtio_queue_get_desc_size(vdev, idx));
}

static void vhost_eventfd_add(MemoryListener *listener,
                              MemoryRegionSection *section,
                              bool match_data, uint64_t data, EventNotifier *e)
{
}

static void vhost_eventfd_del(MemoryListener *listener,
                              MemoryRegionSection *section,
                              bool match_data, uint64_t data, EventNotifier *e)
{
}

static int vhost_virtqueue_init(struct vhost_dev *dev,
                                struct vhost_virtqueue *vq, int n)
{
    int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, n);
    struct vhost_vring_file file = {
        .index = vhost_vq_index,
    };
    int r = event_notifier_init(&vq->masked_notifier, 0);
    if (r < 0) {
        return r;
    }

    file.fd = event_notifier_get_fd(&vq->masked_notifier);
    r = dev->vhost_ops->vhost_set_vring_call(dev, &file);
    if (r) {
        r = -errno;
        goto fail_call;
    }
    return 0;
fail_call:
    event_notifier_cleanup(&vq->masked_notifier);
    return r;
}

static void vhost_virtqueue_cleanup(struct vhost_virtqueue *vq)
{
    event_notifier_cleanup(&vq->masked_notifier);
}
int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
                   VhostBackendType backend_type)
{
    uint64_t features;
    int i, r;

    hdev->migration_blocker = NULL;

    if (vhost_set_backend_type(hdev, backend_type) < 0) {
        close((uintptr_t)opaque);
        return -1;
    }

    if (hdev->vhost_ops->vhost_backend_init(hdev, opaque) < 0) {
        close((uintptr_t)opaque);
        return -errno;
    }

    if (used_memslots > hdev->vhost_ops->vhost_backend_memslots_limit(hdev)) {
        fprintf(stderr, "vhost backend memory slots limit is less"
                " than current number of present memory slots\n");
        close((uintptr_t)opaque);
        return -1;
    }
    QLIST_INSERT_HEAD(&vhost_devices, hdev, entry);

    r = hdev->vhost_ops->vhost_set_owner(hdev);
    if (r < 0) {
        goto fail;
    }

    r = hdev->vhost_ops->vhost_get_features(hdev, &features);
    if (r < 0) {
        goto fail;
    }

    for (i = 0; i < hdev->nvqs; ++i) {
        r = vhost_virtqueue_init(hdev, hdev->vqs + i, hdev->vq_index + i);
        if (r < 0) {
            goto fail_vq;
        }
    }
    hdev->features = features;

    hdev->memory_listener = (MemoryListener) {
        .begin = vhost_begin,
        .commit = vhost_commit,
        .region_add = vhost_region_add,
        .region_del = vhost_region_del,
        .region_nop = vhost_region_nop,
        .log_start = vhost_log_start,
        .log_stop = vhost_log_stop,
        .log_sync = vhost_log_sync,
        .log_global_start = vhost_log_global_start,
        .log_global_stop = vhost_log_global_stop,
        .eventfd_add = vhost_eventfd_add,
        .eventfd_del = vhost_eventfd_del,
        .priority = 10
    };

    if (hdev->migration_blocker == NULL) {
        if (!(hdev->features & (0x1ULL << VHOST_F_LOG_ALL))) {
            error_setg(&hdev->migration_blocker,
                       "Migration disabled: vhost lacks VHOST_F_LOG_ALL feature.");
        } else if (!qemu_memfd_check()) {
            error_setg(&hdev->migration_blocker,
                       "Migration disabled: failed to allocate shared memory");
        }
    }

    if (hdev->migration_blocker != NULL) {
        migrate_add_blocker(hdev->migration_blocker);
    }

    hdev->mem = g_malloc0(offsetof(struct vhost_memory, regions));
    hdev->n_mem_sections = 0;
    hdev->mem_sections = NULL;
    hdev->log = NULL;
    hdev->log_size = 0;
    hdev->log_enabled = false;
    hdev->started = false;
    hdev->memory_changed = false;
    memory_listener_register(&hdev->memory_listener, &address_space_memory);
    return 0;
fail_vq:
    while (--i >= 0) {
        vhost_virtqueue_cleanup(hdev->vqs + i);
    }
fail:
    r = -errno;
    hdev->vhost_ops->vhost_backend_cleanup(hdev);
    QLIST_REMOVE(hdev, entry);
    return r;
}

void vhost_dev_cleanup(struct vhost_dev *hdev)
{
    int i;
    for (i = 0; i < hdev->nvqs; ++i) {
        vhost_virtqueue_cleanup(hdev->vqs + i);
    }
    memory_listener_unregister(&hdev->memory_listener);
    if (hdev->migration_blocker) {
        migrate_del_blocker(hdev->migration_blocker);
        error_free(hdev->migration_blocker);
    }
    g_free(hdev->mem);
    g_free(hdev->mem_sections);
    hdev->vhost_ops->vhost_backend_cleanup(hdev);
    QLIST_REMOVE(hdev, entry);
}
/* Stop processing guest IO notifications in qemu.
 * Start processing them in vhost in kernel.
 */
int vhost_dev_enable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev)
{
    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
    VirtioBusState *vbus = VIRTIO_BUS(qbus);
    VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(vbus);
    int i, r, e;
    if (!k->set_host_notifier) {
        fprintf(stderr, "binding does not support host notifiers\n");
        r = -ENOSYS;
        goto fail;
    }

    for (i = 0; i < hdev->nvqs; ++i) {
        r = k->set_host_notifier(qbus->parent, hdev->vq_index + i, true);
        if (r < 0) {
            fprintf(stderr, "vhost VQ %d notifier binding failed: %d\n", i, -r);
            goto fail_vq;
        }
    }

    return 0;
fail_vq:
    while (--i >= 0) {
        e = k->set_host_notifier(qbus->parent, hdev->vq_index + i, false);
        if (e < 0) {
            fprintf(stderr, "vhost VQ %d notifier cleanup error: %d\n", i, -e);
            fflush(stderr);
        }
        assert(e >= 0);
    }
fail:
    return r;
}

/* Stop processing guest IO notifications in vhost.
 * Start processing them in qemu.
 * This might actually run the qemu handlers right away,
 * so virtio in qemu must be completely set up when this is called.
 */
void vhost_dev_disable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev)
{
    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
    VirtioBusState *vbus = VIRTIO_BUS(qbus);
    VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(vbus);
    int i, r;

    for (i = 0; i < hdev->nvqs; ++i) {
        r = k->set_host_notifier(qbus->parent, hdev->vq_index + i, false);
        if (r < 0) {
            fprintf(stderr, "vhost VQ %d notifier cleanup failed: %d\n", i, -r);
            fflush(stderr);
        }
        assert(r >= 0);
    }
}

/* Test and clear event pending status.
 * Should be called after unmask to avoid losing events.
 */
bool vhost_virtqueue_pending(struct vhost_dev *hdev, int n)
{
    struct vhost_virtqueue *vq = hdev->vqs + n - hdev->vq_index;
    assert(n >= hdev->vq_index && n < hdev->vq_index + hdev->nvqs);
    return event_notifier_test_and_clear(&vq->masked_notifier);
}
/* Mask/unmask events from this vq. */
void vhost_virtqueue_mask(struct vhost_dev *hdev, VirtIODevice *vdev, int n,
                          bool mask)
{
    struct VirtQueue *vvq = virtio_get_queue(vdev, n);
    int r, index = n - hdev->vq_index;
    struct vhost_vring_file file;

    if (mask) {
        assert(vdev->use_guest_notifier_mask);
        file.fd = event_notifier_get_fd(&hdev->vqs[index].masked_notifier);
    } else {
        file.fd = event_notifier_get_fd(virtio_queue_get_guest_notifier(vvq));
    }

    file.index = hdev->vhost_ops->vhost_get_vq_index(hdev, n);
    r = hdev->vhost_ops->vhost_set_vring_call(hdev, &file);
    assert(r >= 0);
}

uint64_t vhost_get_features(struct vhost_dev *hdev, const int *feature_bits,
                            uint64_t features)
{
    const int *bit = feature_bits;
    while (*bit != VHOST_INVALID_FEATURE_BIT) {
        uint64_t bit_mask = (1ULL << *bit);
        if (!(hdev->features & bit_mask)) {
            features &= ~bit_mask;
        }
        bit++;
    }
    return features;
}

void vhost_ack_features(struct vhost_dev *hdev, const int *feature_bits,
                        uint64_t features)
{
    const int *bit = feature_bits;
    while (*bit != VHOST_INVALID_FEATURE_BIT) {
        uint64_t bit_mask = (1ULL << *bit);
        if (features & bit_mask) {
            hdev->acked_features |= bit_mask;
        }
        bit++;
    }
}

/* Host notifiers must be enabled at this point. */
int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev)
{
    int i, r;

    hdev->started = true;

    r = vhost_dev_set_features(hdev, hdev->log_enabled);
    if (r < 0) {
        goto fail_features;
    }
    r = hdev->vhost_ops->vhost_set_mem_table(hdev, hdev->mem);
    if (r < 0) {
        r = -errno;
        goto fail_mem;
    }
    for (i = 0; i < hdev->nvqs; ++i) {
        r = vhost_virtqueue_start(hdev,
                                  vdev,
                                  hdev->vqs + i,
                                  hdev->vq_index + i);
        if (r < 0) {
            goto fail_vq;
        }
    }

    if (hdev->log_enabled) {
        uint64_t log_base;

        hdev->log_size = vhost_get_log_size(hdev);
        hdev->log = vhost_log_get(hdev->log_size,
                                  vhost_dev_log_is_shared(hdev));
        log_base = (uintptr_t)hdev->log->log;
        r = hdev->vhost_ops->vhost_set_log_base(hdev,
                                                hdev->log_size ? log_base : 0,
                                                hdev->log);
        if (r < 0) {
            r = -errno;
            goto fail_log;
        }
    }

    return 0;
fail_log:
    vhost_log_put(hdev, false);
fail_vq:
    while (--i >= 0) {
        vhost_virtqueue_stop(hdev,
                             vdev,
                             hdev->vqs + i,
                             hdev->vq_index + i);
    }
    i = hdev->nvqs;
fail_mem:
fail_features:

    hdev->started = false;
    return r;
}

/* Host notifiers must be enabled at this point. */
void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev)
{
    int i;

    for (i = 0; i < hdev->nvqs; ++i) {
        vhost_virtqueue_stop(hdev,
                             vdev,
                             hdev->vqs + i,
                             hdev->vq_index + i);
    }

    vhost_log_put(hdev, true);
    hdev->started = false;
    hdev->log = NULL;
    hdev->log_size = 0;
}
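/*
 * Illustrative call sequence for a vhost client (a sketch only, not taken
 * from a real backend; actual users such as vhost_net add their own setup
 * and error handling, and the backend type / opaque argument shown here are
 * assumptions taken from vhost-backend.h):
 *
 *     struct vhost_dev dev = { .nvqs = nvqs, .vqs = vqs, .vq_index = 0 };
 *
 *     vhost_dev_init(&dev, opaque, VHOST_BACKEND_TYPE_KERNEL);
 *     vhost_ack_features(&dev, feature_bits, guest_features);
 *     vhost_dev_enable_notifiers(&dev, vdev);
 *     vhost_dev_start(&dev, vdev);
 *     ...
 *     vhost_dev_stop(&dev, vdev);
 *     vhost_dev_disable_notifiers(&dev, vdev);
 *     vhost_dev_cleanup(&dev);
 *
 * feature_bits is a VHOST_INVALID_FEATURE_BIT-terminated array, as consumed
 * by vhost_get_features()/vhost_ack_features() above.
 */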