1 /* 2 * Vhost User library 3 * 4 * Copyright IBM, Corp. 2007 5 * Copyright (c) 2016 Red Hat, Inc. 6 * 7 * Authors: 8 * Anthony Liguori <aliguori@us.ibm.com> 9 * Marc-André Lureau <mlureau@redhat.com> 10 * Victor Kaplansky <victork@redhat.com> 11 * 12 * This work is licensed under the terms of the GNU GPL, version 2 or 13 * later. See the COPYING file in the top-level directory. 14 */ 15 16 /* this code avoids GLib dependency */ 17 #include <stdlib.h> 18 #include <stdio.h> 19 #include <unistd.h> 20 #include <stdarg.h> 21 #include <errno.h> 22 #include <string.h> 23 #include <assert.h> 24 #include <inttypes.h> 25 #include <sys/types.h> 26 #include <sys/socket.h> 27 #include <sys/eventfd.h> 28 #include <sys/mman.h> 29 #include <endian.h> 30 31 #if defined(__linux__) 32 #include <sys/syscall.h> 33 #include <fcntl.h> 34 #include <sys/ioctl.h> 35 #include <linux/vhost.h> 36 37 #ifdef __NR_userfaultfd 38 #include <linux/userfaultfd.h> 39 #endif 40 41 #endif 42 43 #include "include/atomic.h" 44 45 #include "libvhost-user.h" 46 47 /* usually provided by GLib */ 48 #ifndef MIN 49 #define MIN(x, y) ({ \ 50 typeof(x) _min1 = (x); \ 51 typeof(y) _min2 = (y); \ 52 (void) (&_min1 == &_min2); \ 53 _min1 < _min2 ? _min1 : _min2; }) 54 #endif 55 56 /* Round number down to multiple */ 57 #define ALIGN_DOWN(n, m) ((n) / (m) * (m)) 58 59 /* Round number up to multiple */ 60 #define ALIGN_UP(n, m) ALIGN_DOWN((n) + (m) - 1, (m)) 61 62 #ifndef unlikely 63 #define unlikely(x) __builtin_expect(!!(x), 0) 64 #endif 65 66 /* Align each region to cache line size in inflight buffer */ 67 #define INFLIGHT_ALIGNMENT 64 68 69 /* The version of inflight buffer */ 70 #define INFLIGHT_VERSION 1 71 72 /* The version of the protocol we support */ 73 #define VHOST_USER_VERSION 1 74 #define LIBVHOST_USER_DEBUG 0 75 76 #define DPRINT(...) 
\ 77 do { \ 78 if (LIBVHOST_USER_DEBUG) { \ 79 fprintf(stderr, __VA_ARGS__); \ 80 } \ 81 } while (0) 82 83 static inline 84 bool has_feature(uint64_t features, unsigned int fbit) 85 { 86 assert(fbit < 64); 87 return !!(features & (1ULL << fbit)); 88 } 89 90 static inline 91 bool vu_has_feature(VuDev *dev, 92 unsigned int fbit) 93 { 94 return has_feature(dev->features, fbit); 95 } 96 97 static inline bool vu_has_protocol_feature(VuDev *dev, unsigned int fbit) 98 { 99 return has_feature(dev->protocol_features, fbit); 100 } 101 102 const char * 103 vu_request_to_string(unsigned int req) 104 { 105 #define REQ(req) [req] = #req 106 static const char *vu_request_str[] = { 107 REQ(VHOST_USER_NONE), 108 REQ(VHOST_USER_GET_FEATURES), 109 REQ(VHOST_USER_SET_FEATURES), 110 REQ(VHOST_USER_SET_OWNER), 111 REQ(VHOST_USER_RESET_OWNER), 112 REQ(VHOST_USER_SET_MEM_TABLE), 113 REQ(VHOST_USER_SET_LOG_BASE), 114 REQ(VHOST_USER_SET_LOG_FD), 115 REQ(VHOST_USER_SET_VRING_NUM), 116 REQ(VHOST_USER_SET_VRING_ADDR), 117 REQ(VHOST_USER_SET_VRING_BASE), 118 REQ(VHOST_USER_GET_VRING_BASE), 119 REQ(VHOST_USER_SET_VRING_KICK), 120 REQ(VHOST_USER_SET_VRING_CALL), 121 REQ(VHOST_USER_SET_VRING_ERR), 122 REQ(VHOST_USER_GET_PROTOCOL_FEATURES), 123 REQ(VHOST_USER_SET_PROTOCOL_FEATURES), 124 REQ(VHOST_USER_GET_QUEUE_NUM), 125 REQ(VHOST_USER_SET_VRING_ENABLE), 126 REQ(VHOST_USER_SEND_RARP), 127 REQ(VHOST_USER_NET_SET_MTU), 128 REQ(VHOST_USER_SET_SLAVE_REQ_FD), 129 REQ(VHOST_USER_IOTLB_MSG), 130 REQ(VHOST_USER_SET_VRING_ENDIAN), 131 REQ(VHOST_USER_GET_CONFIG), 132 REQ(VHOST_USER_SET_CONFIG), 133 REQ(VHOST_USER_POSTCOPY_ADVISE), 134 REQ(VHOST_USER_POSTCOPY_LISTEN), 135 REQ(VHOST_USER_POSTCOPY_END), 136 REQ(VHOST_USER_GET_INFLIGHT_FD), 137 REQ(VHOST_USER_SET_INFLIGHT_FD), 138 REQ(VHOST_USER_GPU_SET_SOCKET), 139 REQ(VHOST_USER_VRING_KICK), 140 REQ(VHOST_USER_GET_MAX_MEM_SLOTS), 141 REQ(VHOST_USER_ADD_MEM_REG), 142 REQ(VHOST_USER_REM_MEM_REG), 143 REQ(VHOST_USER_MAX), 144 }; 145 #undef REQ 146 147 if (req < VHOST_USER_MAX) { 148 return vu_request_str[req]; 149 } else { 150 return "unknown"; 151 } 152 } 153 154 static void 155 vu_panic(VuDev *dev, const char *msg, ...) 156 { 157 char *buf = NULL; 158 va_list ap; 159 160 va_start(ap, msg); 161 if (vasprintf(&buf, msg, ap) < 0) { 162 buf = NULL; 163 } 164 va_end(ap); 165 166 dev->broken = true; 167 dev->panic(dev, buf); 168 free(buf); 169 170 /* 171 * FIXME: 172 * find a way to call virtio_error, or perhaps close the connection? 173 */ 174 } 175 176 /* Translate guest physical address to our virtual address. */ 177 void * 178 vu_gpa_to_va(VuDev *dev, uint64_t *plen, uint64_t guest_addr) 179 { 180 int i; 181 182 if (*plen == 0) { 183 return NULL; 184 } 185 186 /* Find matching memory region. */ 187 for (i = 0; i < dev->nregions; i++) { 188 VuDevRegion *r = &dev->regions[i]; 189 190 if ((guest_addr >= r->gpa) && (guest_addr < (r->gpa + r->size))) { 191 if ((guest_addr + *plen) > (r->gpa + r->size)) { 192 *plen = r->gpa + r->size - guest_addr; 193 } 194 return (void *)(uintptr_t) 195 guest_addr - r->gpa + r->mmap_addr + r->mmap_offset; 196 } 197 } 198 199 return NULL; 200 } 201 202 /* Translate qemu virtual address to our virtual address. */ 203 static void * 204 qva_to_va(VuDev *dev, uint64_t qemu_addr) 205 { 206 int i; 207 208 /* Find matching memory region. 
*/ 209 for (i = 0; i < dev->nregions; i++) { 210 VuDevRegion *r = &dev->regions[i]; 211 212 if ((qemu_addr >= r->qva) && (qemu_addr < (r->qva + r->size))) { 213 return (void *)(uintptr_t) 214 qemu_addr - r->qva + r->mmap_addr + r->mmap_offset; 215 } 216 } 217 218 return NULL; 219 } 220 221 static void 222 vmsg_close_fds(VhostUserMsg *vmsg) 223 { 224 int i; 225 226 for (i = 0; i < vmsg->fd_num; i++) { 227 close(vmsg->fds[i]); 228 } 229 } 230 231 /* Set reply payload.u64 and clear request flags and fd_num */ 232 static void vmsg_set_reply_u64(VhostUserMsg *vmsg, uint64_t val) 233 { 234 vmsg->flags = 0; /* defaults will be set by vu_send_reply() */ 235 vmsg->size = sizeof(vmsg->payload.u64); 236 vmsg->payload.u64 = val; 237 vmsg->fd_num = 0; 238 } 239 240 /* A test to see if we have userfault available */ 241 static bool 242 have_userfault(void) 243 { 244 #if defined(__linux__) && defined(__NR_userfaultfd) &&\ 245 defined(UFFD_FEATURE_MISSING_SHMEM) &&\ 246 defined(UFFD_FEATURE_MISSING_HUGETLBFS) 247 /* Now test the kernel we're running on really has the features */ 248 int ufd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); 249 struct uffdio_api api_struct; 250 if (ufd < 0) { 251 return false; 252 } 253 254 api_struct.api = UFFD_API; 255 api_struct.features = UFFD_FEATURE_MISSING_SHMEM | 256 UFFD_FEATURE_MISSING_HUGETLBFS; 257 if (ioctl(ufd, UFFDIO_API, &api_struct)) { 258 close(ufd); 259 return false; 260 } 261 close(ufd); 262 return true; 263 264 #else 265 return false; 266 #endif 267 } 268 269 static bool 270 vu_message_read_default(VuDev *dev, int conn_fd, VhostUserMsg *vmsg) 271 { 272 char control[CMSG_SPACE(VHOST_MEMORY_BASELINE_NREGIONS * sizeof(int))] = {}; 273 struct iovec iov = { 274 .iov_base = (char *)vmsg, 275 .iov_len = VHOST_USER_HDR_SIZE, 276 }; 277 struct msghdr msg = { 278 .msg_iov = &iov, 279 .msg_iovlen = 1, 280 .msg_control = control, 281 .msg_controllen = sizeof(control), 282 }; 283 size_t fd_size; 284 struct cmsghdr *cmsg; 285 int rc; 286 287 do { 288 rc = recvmsg(conn_fd, &msg, 0); 289 } while (rc < 0 && (errno == EINTR || errno == EAGAIN)); 290 291 if (rc < 0) { 292 vu_panic(dev, "Error while recvmsg: %s", strerror(errno)); 293 return false; 294 } 295 296 vmsg->fd_num = 0; 297 for (cmsg = CMSG_FIRSTHDR(&msg); 298 cmsg != NULL; 299 cmsg = CMSG_NXTHDR(&msg, cmsg)) 300 { 301 if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) { 302 fd_size = cmsg->cmsg_len - CMSG_LEN(0); 303 vmsg->fd_num = fd_size / sizeof(int); 304 memcpy(vmsg->fds, CMSG_DATA(cmsg), fd_size); 305 break; 306 } 307 } 308 309 if (vmsg->size > sizeof(vmsg->payload)) { 310 vu_panic(dev, 311 "Error: too big message request: %d, size: vmsg->size: %u, " 312 "while sizeof(vmsg->payload) = %zu\n", 313 vmsg->request, vmsg->size, sizeof(vmsg->payload)); 314 goto fail; 315 } 316 317 if (vmsg->size) { 318 do { 319 rc = read(conn_fd, &vmsg->payload, vmsg->size); 320 } while (rc < 0 && (errno == EINTR || errno == EAGAIN)); 321 322 if (rc <= 0) { 323 vu_panic(dev, "Error while reading: %s", strerror(errno)); 324 goto fail; 325 } 326 327 assert(rc == vmsg->size); 328 } 329 330 return true; 331 332 fail: 333 vmsg_close_fds(vmsg); 334 335 return false; 336 } 337 338 static bool 339 vu_message_write(VuDev *dev, int conn_fd, VhostUserMsg *vmsg) 340 { 341 int rc; 342 uint8_t *p = (uint8_t *)vmsg; 343 char control[CMSG_SPACE(VHOST_MEMORY_BASELINE_NREGIONS * sizeof(int))] = {}; 344 struct iovec iov = { 345 .iov_base = (char *)vmsg, 346 .iov_len = VHOST_USER_HDR_SIZE, 347 }; 348 struct msghdr msg = { 349 
.msg_iov = &iov, 350 .msg_iovlen = 1, 351 .msg_control = control, 352 }; 353 struct cmsghdr *cmsg; 354 355 memset(control, 0, sizeof(control)); 356 assert(vmsg->fd_num <= VHOST_MEMORY_BASELINE_NREGIONS); 357 if (vmsg->fd_num > 0) { 358 size_t fdsize = vmsg->fd_num * sizeof(int); 359 msg.msg_controllen = CMSG_SPACE(fdsize); 360 cmsg = CMSG_FIRSTHDR(&msg); 361 cmsg->cmsg_len = CMSG_LEN(fdsize); 362 cmsg->cmsg_level = SOL_SOCKET; 363 cmsg->cmsg_type = SCM_RIGHTS; 364 memcpy(CMSG_DATA(cmsg), vmsg->fds, fdsize); 365 } else { 366 msg.msg_controllen = 0; 367 } 368 369 do { 370 rc = sendmsg(conn_fd, &msg, 0); 371 } while (rc < 0 && (errno == EINTR || errno == EAGAIN)); 372 373 if (vmsg->size) { 374 do { 375 if (vmsg->data) { 376 rc = write(conn_fd, vmsg->data, vmsg->size); 377 } else { 378 rc = write(conn_fd, p + VHOST_USER_HDR_SIZE, vmsg->size); 379 } 380 } while (rc < 0 && (errno == EINTR || errno == EAGAIN)); 381 } 382 383 if (rc <= 0) { 384 vu_panic(dev, "Error while writing: %s", strerror(errno)); 385 return false; 386 } 387 388 return true; 389 } 390 391 static bool 392 vu_send_reply(VuDev *dev, int conn_fd, VhostUserMsg *vmsg) 393 { 394 /* Set the version in the flags when sending the reply */ 395 vmsg->flags &= ~VHOST_USER_VERSION_MASK; 396 vmsg->flags |= VHOST_USER_VERSION; 397 vmsg->flags |= VHOST_USER_REPLY_MASK; 398 399 return vu_message_write(dev, conn_fd, vmsg); 400 } 401 402 /* 403 * Processes a reply on the slave channel. 404 * Entered with slave_mutex held and releases it before exit. 405 * Returns true on success. 406 */ 407 static bool 408 vu_process_message_reply(VuDev *dev, const VhostUserMsg *vmsg) 409 { 410 VhostUserMsg msg_reply; 411 bool result = false; 412 413 if ((vmsg->flags & VHOST_USER_NEED_REPLY_MASK) == 0) { 414 result = true; 415 goto out; 416 } 417 418 if (!vu_message_read_default(dev, dev->slave_fd, &msg_reply)) { 419 goto out; 420 } 421 422 if (msg_reply.request != vmsg->request) { 423 DPRINT("Received unexpected msg type. Expected %d received %d", 424 vmsg->request, msg_reply.request); 425 goto out; 426 } 427 428 result = msg_reply.payload.u64 == 0; 429 430 out: 431 pthread_mutex_unlock(&dev->slave_mutex); 432 return result; 433 } 434 435 /* Kick the log_call_fd if required. 
*/ 436 static void 437 vu_log_kick(VuDev *dev) 438 { 439 if (dev->log_call_fd != -1) { 440 DPRINT("Kicking the QEMU's log...\n"); 441 if (eventfd_write(dev->log_call_fd, 1) < 0) { 442 vu_panic(dev, "Error writing eventfd: %s", strerror(errno)); 443 } 444 } 445 } 446 447 static void 448 vu_log_page(uint8_t *log_table, uint64_t page) 449 { 450 DPRINT("Logged dirty guest page: %"PRId64"\n", page); 451 qatomic_or(&log_table[page / 8], 1 << (page % 8)); 452 } 453 454 static void 455 vu_log_write(VuDev *dev, uint64_t address, uint64_t length) 456 { 457 uint64_t page; 458 459 if (!(dev->features & (1ULL << VHOST_F_LOG_ALL)) || 460 !dev->log_table || !length) { 461 return; 462 } 463 464 assert(dev->log_size > ((address + length - 1) / VHOST_LOG_PAGE / 8)); 465 466 page = address / VHOST_LOG_PAGE; 467 while (page * VHOST_LOG_PAGE < address + length) { 468 vu_log_page(dev->log_table, page); 469 page += 1; 470 } 471 472 vu_log_kick(dev); 473 } 474 475 static void 476 vu_kick_cb(VuDev *dev, int condition, void *data) 477 { 478 int index = (intptr_t)data; 479 VuVirtq *vq = &dev->vq[index]; 480 int sock = vq->kick_fd; 481 eventfd_t kick_data; 482 ssize_t rc; 483 484 rc = eventfd_read(sock, &kick_data); 485 if (rc == -1) { 486 vu_panic(dev, "kick eventfd_read(): %s", strerror(errno)); 487 dev->remove_watch(dev, dev->vq[index].kick_fd); 488 } else { 489 DPRINT("Got kick_data: %016"PRIx64" handler:%p idx:%d\n", 490 kick_data, vq->handler, index); 491 if (vq->handler) { 492 vq->handler(dev, index); 493 } 494 } 495 } 496 497 static bool 498 vu_get_features_exec(VuDev *dev, VhostUserMsg *vmsg) 499 { 500 vmsg->payload.u64 = 501 /* 502 * The following VIRTIO feature bits are supported by our virtqueue 503 * implementation: 504 */ 505 1ULL << VIRTIO_F_NOTIFY_ON_EMPTY | 506 1ULL << VIRTIO_RING_F_INDIRECT_DESC | 507 1ULL << VIRTIO_RING_F_EVENT_IDX | 508 1ULL << VIRTIO_F_VERSION_1 | 509 510 /* vhost-user feature bits */ 511 1ULL << VHOST_F_LOG_ALL | 512 1ULL << VHOST_USER_F_PROTOCOL_FEATURES; 513 514 if (dev->iface->get_features) { 515 vmsg->payload.u64 |= dev->iface->get_features(dev); 516 } 517 518 vmsg->size = sizeof(vmsg->payload.u64); 519 vmsg->fd_num = 0; 520 521 DPRINT("Sending back to guest u64: 0x%016"PRIx64"\n", vmsg->payload.u64); 522 523 return true; 524 } 525 526 static void 527 vu_set_enable_all_rings(VuDev *dev, bool enabled) 528 { 529 uint16_t i; 530 531 for (i = 0; i < dev->max_queues; i++) { 532 dev->vq[i].enable = enabled; 533 } 534 } 535 536 static bool 537 vu_set_features_exec(VuDev *dev, VhostUserMsg *vmsg) 538 { 539 DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64); 540 541 dev->features = vmsg->payload.u64; 542 if (!vu_has_feature(dev, VIRTIO_F_VERSION_1)) { 543 /* 544 * We only support devices conforming to VIRTIO 1.0 or 545 * later 546 */ 547 vu_panic(dev, "virtio legacy devices aren't supported by libvhost-user"); 548 return false; 549 } 550 551 if (!(dev->features & VHOST_USER_F_PROTOCOL_FEATURES)) { 552 vu_set_enable_all_rings(dev, true); 553 } 554 555 if (dev->iface->set_features) { 556 dev->iface->set_features(dev, dev->features); 557 } 558 559 return false; 560 } 561 562 static bool 563 vu_set_owner_exec(VuDev *dev, VhostUserMsg *vmsg) 564 { 565 return false; 566 } 567 568 static void 569 vu_close_log(VuDev *dev) 570 { 571 if (dev->log_table) { 572 if (munmap(dev->log_table, dev->log_size) != 0) { 573 perror("close log munmap() error"); 574 } 575 576 dev->log_table = NULL; 577 } 578 if (dev->log_call_fd != -1) { 579 close(dev->log_call_fd); 580 dev->log_call_fd = -1; 581 } 582 } 583 
584 static bool 585 vu_reset_device_exec(VuDev *dev, VhostUserMsg *vmsg) 586 { 587 vu_set_enable_all_rings(dev, false); 588 589 return false; 590 } 591 592 static bool 593 map_ring(VuDev *dev, VuVirtq *vq) 594 { 595 vq->vring.desc = qva_to_va(dev, vq->vra.desc_user_addr); 596 vq->vring.used = qva_to_va(dev, vq->vra.used_user_addr); 597 vq->vring.avail = qva_to_va(dev, vq->vra.avail_user_addr); 598 599 DPRINT("Setting virtq addresses:\n"); 600 DPRINT(" vring_desc at %p\n", vq->vring.desc); 601 DPRINT(" vring_used at %p\n", vq->vring.used); 602 DPRINT(" vring_avail at %p\n", vq->vring.avail); 603 604 return !(vq->vring.desc && vq->vring.used && vq->vring.avail); 605 } 606 607 static bool 608 generate_faults(VuDev *dev) { 609 int i; 610 for (i = 0; i < dev->nregions; i++) { 611 VuDevRegion *dev_region = &dev->regions[i]; 612 int ret; 613 #ifdef UFFDIO_REGISTER 614 /* 615 * We should already have an open ufd. Mark each memory 616 * range as ufd. 617 * Discard any mapping we have here; note I can't use MADV_REMOVE 618 * or fallocate to make the hole since I don't want to lose 619 * data that's already arrived in the shared process. 620 * TODO: How to do hugepage 621 */ 622 ret = madvise((void *)(uintptr_t)dev_region->mmap_addr, 623 dev_region->size + dev_region->mmap_offset, 624 MADV_DONTNEED); 625 if (ret) { 626 fprintf(stderr, 627 "%s: Failed to madvise(DONTNEED) region %d: %s\n", 628 __func__, i, strerror(errno)); 629 } 630 /* 631 * Turn off transparent hugepages so we dont get lose wakeups 632 * in neighbouring pages. 633 * TODO: Turn this backon later. 634 */ 635 ret = madvise((void *)(uintptr_t)dev_region->mmap_addr, 636 dev_region->size + dev_region->mmap_offset, 637 MADV_NOHUGEPAGE); 638 if (ret) { 639 /* 640 * Note: This can happen legally on kernels that are configured 641 * without madvise'able hugepages 642 */ 643 fprintf(stderr, 644 "%s: Failed to madvise(NOHUGEPAGE) region %d: %s\n", 645 __func__, i, strerror(errno)); 646 } 647 struct uffdio_register reg_struct; 648 reg_struct.range.start = (uintptr_t)dev_region->mmap_addr; 649 reg_struct.range.len = dev_region->size + dev_region->mmap_offset; 650 reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING; 651 652 if (ioctl(dev->postcopy_ufd, UFFDIO_REGISTER, ®_struct)) { 653 vu_panic(dev, "%s: Failed to userfault region %d " 654 "@%" PRIx64 " + size:%" PRIx64 " offset: %" PRIx64 655 ": (ufd=%d)%s\n", 656 __func__, i, 657 dev_region->mmap_addr, 658 dev_region->size, dev_region->mmap_offset, 659 dev->postcopy_ufd, strerror(errno)); 660 return false; 661 } 662 if (!(reg_struct.ioctls & ((__u64)1 << _UFFDIO_COPY))) { 663 vu_panic(dev, "%s Region (%d) doesn't support COPY", 664 __func__, i); 665 return false; 666 } 667 DPRINT("%s: region %d: Registered userfault for %" 668 PRIx64 " + %" PRIx64 "\n", __func__, i, 669 (uint64_t)reg_struct.range.start, 670 (uint64_t)reg_struct.range.len); 671 /* Now it's registered we can let the client at it */ 672 if (mprotect((void *)(uintptr_t)dev_region->mmap_addr, 673 dev_region->size + dev_region->mmap_offset, 674 PROT_READ | PROT_WRITE)) { 675 vu_panic(dev, "failed to mprotect region %d for postcopy (%s)", 676 i, strerror(errno)); 677 return false; 678 } 679 /* TODO: Stash 'zero' support flags somewhere */ 680 #endif 681 } 682 683 return true; 684 } 685 686 static bool 687 vu_add_mem_reg(VuDev *dev, VhostUserMsg *vmsg) { 688 int i; 689 bool track_ramblocks = dev->postcopy_listening; 690 VhostUserMemoryRegion m = vmsg->payload.memreg.region, *msg_region = &m; 691 VuDevRegion *dev_region = 
&dev->regions[dev->nregions]; 692 void *mmap_addr; 693 694 if (vmsg->fd_num != 1) { 695 vmsg_close_fds(vmsg); 696 vu_panic(dev, "VHOST_USER_ADD_MEM_REG received %d fds - only 1 fd " 697 "should be sent for this message type", vmsg->fd_num); 698 return false; 699 } 700 701 if (vmsg->size < VHOST_USER_MEM_REG_SIZE) { 702 close(vmsg->fds[0]); 703 vu_panic(dev, "VHOST_USER_ADD_MEM_REG requires a message size of at " 704 "least %zu bytes and only %d bytes were received", 705 VHOST_USER_MEM_REG_SIZE, vmsg->size); 706 return false; 707 } 708 709 if (dev->nregions == VHOST_USER_MAX_RAM_SLOTS) { 710 close(vmsg->fds[0]); 711 vu_panic(dev, "failing attempt to hot add memory via " 712 "VHOST_USER_ADD_MEM_REG message because the backend has " 713 "no free ram slots available"); 714 return false; 715 } 716 717 /* 718 * If we are in postcopy mode and we receive a u64 payload with a 0 value 719 * we know all the postcopy client bases have been received, and we 720 * should start generating faults. 721 */ 722 if (track_ramblocks && 723 vmsg->size == sizeof(vmsg->payload.u64) && 724 vmsg->payload.u64 == 0) { 725 (void)generate_faults(dev); 726 return false; 727 } 728 729 DPRINT("Adding region: %u\n", dev->nregions); 730 DPRINT(" guest_phys_addr: 0x%016"PRIx64"\n", 731 msg_region->guest_phys_addr); 732 DPRINT(" memory_size: 0x%016"PRIx64"\n", 733 msg_region->memory_size); 734 DPRINT(" userspace_addr 0x%016"PRIx64"\n", 735 msg_region->userspace_addr); 736 DPRINT(" mmap_offset 0x%016"PRIx64"\n", 737 msg_region->mmap_offset); 738 739 dev_region->gpa = msg_region->guest_phys_addr; 740 dev_region->size = msg_region->memory_size; 741 dev_region->qva = msg_region->userspace_addr; 742 dev_region->mmap_offset = msg_region->mmap_offset; 743 744 /* 745 * We don't use offset argument of mmap() since the 746 * mapped address has to be page aligned, and we use huge 747 * pages. 748 */ 749 if (track_ramblocks) { 750 /* 751 * In postcopy we're using PROT_NONE here to catch anyone 752 * accessing it before we userfault. 753 */ 754 mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset, 755 PROT_NONE, MAP_SHARED | MAP_NORESERVE, 756 vmsg->fds[0], 0); 757 } else { 758 mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset, 759 PROT_READ | PROT_WRITE, MAP_SHARED | MAP_NORESERVE, 760 vmsg->fds[0], 0); 761 } 762 763 if (mmap_addr == MAP_FAILED) { 764 vu_panic(dev, "region mmap error: %s", strerror(errno)); 765 } else { 766 dev_region->mmap_addr = (uint64_t)(uintptr_t)mmap_addr; 767 DPRINT(" mmap_addr: 0x%016"PRIx64"\n", 768 dev_region->mmap_addr); 769 } 770 771 close(vmsg->fds[0]); 772 773 if (track_ramblocks) { 774 /* 775 * Return the address to QEMU so that it can translate the ufd 776 * fault addresses back. 777 */ 778 msg_region->userspace_addr = (uintptr_t)(mmap_addr + 779 dev_region->mmap_offset); 780 781 /* Send the message back to qemu with the addresses filled in. 
*/ 782 vmsg->fd_num = 0; 783 DPRINT("Successfully added new region in postcopy\n"); 784 dev->nregions++; 785 return true; 786 } else { 787 for (i = 0; i < dev->max_queues; i++) { 788 if (dev->vq[i].vring.desc) { 789 if (map_ring(dev, &dev->vq[i])) { 790 vu_panic(dev, "remapping queue %d for new memory region", 791 i); 792 } 793 } 794 } 795 796 DPRINT("Successfully added new region\n"); 797 dev->nregions++; 798 return false; 799 } 800 } 801 802 static inline bool reg_equal(VuDevRegion *vudev_reg, 803 VhostUserMemoryRegion *msg_reg) 804 { 805 if (vudev_reg->gpa == msg_reg->guest_phys_addr && 806 vudev_reg->qva == msg_reg->userspace_addr && 807 vudev_reg->size == msg_reg->memory_size) { 808 return true; 809 } 810 811 return false; 812 } 813 814 static bool 815 vu_rem_mem_reg(VuDev *dev, VhostUserMsg *vmsg) { 816 VhostUserMemoryRegion m = vmsg->payload.memreg.region, *msg_region = &m; 817 int i; 818 bool found = false; 819 820 if (vmsg->fd_num > 1) { 821 vmsg_close_fds(vmsg); 822 vu_panic(dev, "VHOST_USER_REM_MEM_REG received %d fds - at most 1 fd " 823 "should be sent for this message type", vmsg->fd_num); 824 return false; 825 } 826 827 if (vmsg->size < VHOST_USER_MEM_REG_SIZE) { 828 vmsg_close_fds(vmsg); 829 vu_panic(dev, "VHOST_USER_REM_MEM_REG requires a message size of at " 830 "least %zu bytes and only %d bytes were received", 831 VHOST_USER_MEM_REG_SIZE, vmsg->size); 832 return false; 833 } 834 835 DPRINT("Removing region:\n"); 836 DPRINT(" guest_phys_addr: 0x%016"PRIx64"\n", 837 msg_region->guest_phys_addr); 838 DPRINT(" memory_size: 0x%016"PRIx64"\n", 839 msg_region->memory_size); 840 DPRINT(" userspace_addr 0x%016"PRIx64"\n", 841 msg_region->userspace_addr); 842 DPRINT(" mmap_offset 0x%016"PRIx64"\n", 843 msg_region->mmap_offset); 844 845 for (i = 0; i < dev->nregions; i++) { 846 if (reg_equal(&dev->regions[i], msg_region)) { 847 VuDevRegion *r = &dev->regions[i]; 848 void *m = (void *) (uintptr_t) r->mmap_addr; 849 850 if (m) { 851 munmap(m, r->size + r->mmap_offset); 852 } 853 854 /* 855 * Shift all affected entries by 1 to close the hole at index i and 856 * zero out the last entry. 857 */ 858 memmove(dev->regions + i, dev->regions + i + 1, 859 sizeof(VuDevRegion) * (dev->nregions - i - 1)); 860 memset(dev->regions + dev->nregions - 1, 0, sizeof(VuDevRegion)); 861 DPRINT("Successfully removed a region\n"); 862 dev->nregions--; 863 i--; 864 865 found = true; 866 867 /* Continue the search for eventual duplicates. 
*/ 868 } 869 } 870 871 if (!found) { 872 vu_panic(dev, "Specified region not found\n"); 873 } 874 875 vmsg_close_fds(vmsg); 876 877 return false; 878 } 879 880 static bool 881 vu_set_mem_table_exec_postcopy(VuDev *dev, VhostUserMsg *vmsg) 882 { 883 int i; 884 VhostUserMemory m = vmsg->payload.memory, *memory = &m; 885 dev->nregions = memory->nregions; 886 887 DPRINT("Nregions: %u\n", memory->nregions); 888 for (i = 0; i < dev->nregions; i++) { 889 void *mmap_addr; 890 VhostUserMemoryRegion *msg_region = &memory->regions[i]; 891 VuDevRegion *dev_region = &dev->regions[i]; 892 893 DPRINT("Region %d\n", i); 894 DPRINT(" guest_phys_addr: 0x%016"PRIx64"\n", 895 msg_region->guest_phys_addr); 896 DPRINT(" memory_size: 0x%016"PRIx64"\n", 897 msg_region->memory_size); 898 DPRINT(" userspace_addr 0x%016"PRIx64"\n", 899 msg_region->userspace_addr); 900 DPRINT(" mmap_offset 0x%016"PRIx64"\n", 901 msg_region->mmap_offset); 902 903 dev_region->gpa = msg_region->guest_phys_addr; 904 dev_region->size = msg_region->memory_size; 905 dev_region->qva = msg_region->userspace_addr; 906 dev_region->mmap_offset = msg_region->mmap_offset; 907 908 /* We don't use offset argument of mmap() since the 909 * mapped address has to be page aligned, and we use huge 910 * pages. 911 * In postcopy we're using PROT_NONE here to catch anyone 912 * accessing it before we userfault 913 */ 914 mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset, 915 PROT_NONE, MAP_SHARED | MAP_NORESERVE, 916 vmsg->fds[i], 0); 917 918 if (mmap_addr == MAP_FAILED) { 919 vu_panic(dev, "region mmap error: %s", strerror(errno)); 920 } else { 921 dev_region->mmap_addr = (uint64_t)(uintptr_t)mmap_addr; 922 DPRINT(" mmap_addr: 0x%016"PRIx64"\n", 923 dev_region->mmap_addr); 924 } 925 926 /* Return the address to QEMU so that it can translate the ufd 927 * fault addresses back. 928 */ 929 msg_region->userspace_addr = (uintptr_t)(mmap_addr + 930 dev_region->mmap_offset); 931 close(vmsg->fds[i]); 932 } 933 934 /* Send the message back to qemu with the addresses filled in */ 935 vmsg->fd_num = 0; 936 if (!vu_send_reply(dev, dev->sock, vmsg)) { 937 vu_panic(dev, "failed to respond to set-mem-table for postcopy"); 938 return false; 939 } 940 941 /* Wait for QEMU to confirm that it's registered the handler for the 942 * faults. 
943 */ 944 if (!dev->read_msg(dev, dev->sock, vmsg) || 945 vmsg->size != sizeof(vmsg->payload.u64) || 946 vmsg->payload.u64 != 0) { 947 vu_panic(dev, "failed to receive valid ack for postcopy set-mem-table"); 948 return false; 949 } 950 951 /* OK, now we can go and register the memory and generate faults */ 952 (void)generate_faults(dev); 953 954 return false; 955 } 956 957 static bool 958 vu_set_mem_table_exec(VuDev *dev, VhostUserMsg *vmsg) 959 { 960 int i; 961 VhostUserMemory m = vmsg->payload.memory, *memory = &m; 962 963 for (i = 0; i < dev->nregions; i++) { 964 VuDevRegion *r = &dev->regions[i]; 965 void *m = (void *) (uintptr_t) r->mmap_addr; 966 967 if (m) { 968 munmap(m, r->size + r->mmap_offset); 969 } 970 } 971 dev->nregions = memory->nregions; 972 973 if (dev->postcopy_listening) { 974 return vu_set_mem_table_exec_postcopy(dev, vmsg); 975 } 976 977 DPRINT("Nregions: %u\n", memory->nregions); 978 for (i = 0; i < dev->nregions; i++) { 979 void *mmap_addr; 980 VhostUserMemoryRegion *msg_region = &memory->regions[i]; 981 VuDevRegion *dev_region = &dev->regions[i]; 982 983 DPRINT("Region %d\n", i); 984 DPRINT(" guest_phys_addr: 0x%016"PRIx64"\n", 985 msg_region->guest_phys_addr); 986 DPRINT(" memory_size: 0x%016"PRIx64"\n", 987 msg_region->memory_size); 988 DPRINT(" userspace_addr 0x%016"PRIx64"\n", 989 msg_region->userspace_addr); 990 DPRINT(" mmap_offset 0x%016"PRIx64"\n", 991 msg_region->mmap_offset); 992 993 dev_region->gpa = msg_region->guest_phys_addr; 994 dev_region->size = msg_region->memory_size; 995 dev_region->qva = msg_region->userspace_addr; 996 dev_region->mmap_offset = msg_region->mmap_offset; 997 998 /* We don't use offset argument of mmap() since the 999 * mapped address has to be page aligned, and we use huge 1000 * pages. */ 1001 mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset, 1002 PROT_READ | PROT_WRITE, MAP_SHARED | MAP_NORESERVE, 1003 vmsg->fds[i], 0); 1004 1005 if (mmap_addr == MAP_FAILED) { 1006 vu_panic(dev, "region mmap error: %s", strerror(errno)); 1007 } else { 1008 dev_region->mmap_addr = (uint64_t)(uintptr_t)mmap_addr; 1009 DPRINT(" mmap_addr: 0x%016"PRIx64"\n", 1010 dev_region->mmap_addr); 1011 } 1012 1013 close(vmsg->fds[i]); 1014 } 1015 1016 for (i = 0; i < dev->max_queues; i++) { 1017 if (dev->vq[i].vring.desc) { 1018 if (map_ring(dev, &dev->vq[i])) { 1019 vu_panic(dev, "remapping queue %d during setmemtable", i); 1020 } 1021 } 1022 } 1023 1024 return false; 1025 } 1026 1027 static bool 1028 vu_set_log_base_exec(VuDev *dev, VhostUserMsg *vmsg) 1029 { 1030 int fd; 1031 uint64_t log_mmap_size, log_mmap_offset; 1032 void *rc; 1033 1034 if (vmsg->fd_num != 1 || 1035 vmsg->size != sizeof(vmsg->payload.log)) { 1036 vu_panic(dev, "Invalid log_base message"); 1037 return true; 1038 } 1039 1040 fd = vmsg->fds[0]; 1041 log_mmap_offset = vmsg->payload.log.mmap_offset; 1042 log_mmap_size = vmsg->payload.log.mmap_size; 1043 DPRINT("Log mmap_offset: %"PRId64"\n", log_mmap_offset); 1044 DPRINT("Log mmap_size: %"PRId64"\n", log_mmap_size); 1045 1046 rc = mmap(0, log_mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 1047 log_mmap_offset); 1048 close(fd); 1049 if (rc == MAP_FAILED) { 1050 perror("log mmap error"); 1051 } 1052 1053 if (dev->log_table) { 1054 munmap(dev->log_table, dev->log_size); 1055 } 1056 dev->log_table = rc; 1057 dev->log_size = log_mmap_size; 1058 1059 vmsg->size = sizeof(vmsg->payload.u64); 1060 vmsg->fd_num = 0; 1061 1062 return true; 1063 } 1064 1065 static bool 1066 vu_set_log_fd_exec(VuDev *dev, VhostUserMsg *vmsg) 1067 { 
1068 if (vmsg->fd_num != 1) { 1069 vu_panic(dev, "Invalid log_fd message"); 1070 return false; 1071 } 1072 1073 if (dev->log_call_fd != -1) { 1074 close(dev->log_call_fd); 1075 } 1076 dev->log_call_fd = vmsg->fds[0]; 1077 DPRINT("Got log_call_fd: %d\n", vmsg->fds[0]); 1078 1079 return false; 1080 } 1081 1082 static bool 1083 vu_set_vring_num_exec(VuDev *dev, VhostUserMsg *vmsg) 1084 { 1085 unsigned int index = vmsg->payload.state.index; 1086 unsigned int num = vmsg->payload.state.num; 1087 1088 DPRINT("State.index: %u\n", index); 1089 DPRINT("State.num: %u\n", num); 1090 dev->vq[index].vring.num = num; 1091 1092 return false; 1093 } 1094 1095 static bool 1096 vu_set_vring_addr_exec(VuDev *dev, VhostUserMsg *vmsg) 1097 { 1098 struct vhost_vring_addr addr = vmsg->payload.addr, *vra = &addr; 1099 unsigned int index = vra->index; 1100 VuVirtq *vq = &dev->vq[index]; 1101 1102 DPRINT("vhost_vring_addr:\n"); 1103 DPRINT(" index: %d\n", vra->index); 1104 DPRINT(" flags: %d\n", vra->flags); 1105 DPRINT(" desc_user_addr: 0x%016" PRIx64 "\n", (uint64_t)vra->desc_user_addr); 1106 DPRINT(" used_user_addr: 0x%016" PRIx64 "\n", (uint64_t)vra->used_user_addr); 1107 DPRINT(" avail_user_addr: 0x%016" PRIx64 "\n", (uint64_t)vra->avail_user_addr); 1108 DPRINT(" log_guest_addr: 0x%016" PRIx64 "\n", (uint64_t)vra->log_guest_addr); 1109 1110 vq->vra = *vra; 1111 vq->vring.flags = vra->flags; 1112 vq->vring.log_guest_addr = vra->log_guest_addr; 1113 1114 1115 if (map_ring(dev, vq)) { 1116 vu_panic(dev, "Invalid vring_addr message"); 1117 return false; 1118 } 1119 1120 vq->used_idx = le16toh(vq->vring.used->idx); 1121 1122 if (vq->last_avail_idx != vq->used_idx) { 1123 bool resume = dev->iface->queue_is_processed_in_order && 1124 dev->iface->queue_is_processed_in_order(dev, index); 1125 1126 DPRINT("Last avail index != used index: %u != %u%s\n", 1127 vq->last_avail_idx, vq->used_idx, 1128 resume ? 
", resuming" : ""); 1129 1130 if (resume) { 1131 vq->shadow_avail_idx = vq->last_avail_idx = vq->used_idx; 1132 } 1133 } 1134 1135 return false; 1136 } 1137 1138 static bool 1139 vu_set_vring_base_exec(VuDev *dev, VhostUserMsg *vmsg) 1140 { 1141 unsigned int index = vmsg->payload.state.index; 1142 unsigned int num = vmsg->payload.state.num; 1143 1144 DPRINT("State.index: %u\n", index); 1145 DPRINT("State.num: %u\n", num); 1146 dev->vq[index].shadow_avail_idx = dev->vq[index].last_avail_idx = num; 1147 1148 return false; 1149 } 1150 1151 static bool 1152 vu_get_vring_base_exec(VuDev *dev, VhostUserMsg *vmsg) 1153 { 1154 unsigned int index = vmsg->payload.state.index; 1155 1156 DPRINT("State.index: %u\n", index); 1157 vmsg->payload.state.num = dev->vq[index].last_avail_idx; 1158 vmsg->size = sizeof(vmsg->payload.state); 1159 1160 dev->vq[index].started = false; 1161 if (dev->iface->queue_set_started) { 1162 dev->iface->queue_set_started(dev, index, false); 1163 } 1164 1165 if (dev->vq[index].call_fd != -1) { 1166 close(dev->vq[index].call_fd); 1167 dev->vq[index].call_fd = -1; 1168 } 1169 if (dev->vq[index].kick_fd != -1) { 1170 dev->remove_watch(dev, dev->vq[index].kick_fd); 1171 close(dev->vq[index].kick_fd); 1172 dev->vq[index].kick_fd = -1; 1173 } 1174 1175 return true; 1176 } 1177 1178 static bool 1179 vu_check_queue_msg_file(VuDev *dev, VhostUserMsg *vmsg) 1180 { 1181 int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; 1182 bool nofd = vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK; 1183 1184 if (index >= dev->max_queues) { 1185 vmsg_close_fds(vmsg); 1186 vu_panic(dev, "Invalid queue index: %u", index); 1187 return false; 1188 } 1189 1190 if (nofd) { 1191 vmsg_close_fds(vmsg); 1192 return true; 1193 } 1194 1195 if (vmsg->fd_num != 1) { 1196 vmsg_close_fds(vmsg); 1197 vu_panic(dev, "Invalid fds in request: %d", vmsg->request); 1198 return false; 1199 } 1200 1201 return true; 1202 } 1203 1204 static int 1205 inflight_desc_compare(const void *a, const void *b) 1206 { 1207 VuVirtqInflightDesc *desc0 = (VuVirtqInflightDesc *)a, 1208 *desc1 = (VuVirtqInflightDesc *)b; 1209 1210 if (desc1->counter > desc0->counter && 1211 (desc1->counter - desc0->counter) < VIRTQUEUE_MAX_SIZE * 2) { 1212 return 1; 1213 } 1214 1215 return -1; 1216 } 1217 1218 static int 1219 vu_check_queue_inflights(VuDev *dev, VuVirtq *vq) 1220 { 1221 int i = 0; 1222 1223 if (!vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) { 1224 return 0; 1225 } 1226 1227 if (unlikely(!vq->inflight)) { 1228 return -1; 1229 } 1230 1231 if (unlikely(!vq->inflight->version)) { 1232 /* initialize the buffer */ 1233 vq->inflight->version = INFLIGHT_VERSION; 1234 return 0; 1235 } 1236 1237 vq->used_idx = le16toh(vq->vring.used->idx); 1238 vq->resubmit_num = 0; 1239 vq->resubmit_list = NULL; 1240 vq->counter = 0; 1241 1242 if (unlikely(vq->inflight->used_idx != vq->used_idx)) { 1243 vq->inflight->desc[vq->inflight->last_batch_head].inflight = 0; 1244 1245 barrier(); 1246 1247 vq->inflight->used_idx = vq->used_idx; 1248 } 1249 1250 for (i = 0; i < vq->inflight->desc_num; i++) { 1251 if (vq->inflight->desc[i].inflight == 1) { 1252 vq->inuse++; 1253 } 1254 } 1255 1256 vq->shadow_avail_idx = vq->last_avail_idx = vq->inuse + vq->used_idx; 1257 1258 if (vq->inuse) { 1259 vq->resubmit_list = calloc(vq->inuse, sizeof(VuVirtqInflightDesc)); 1260 if (!vq->resubmit_list) { 1261 return -1; 1262 } 1263 1264 for (i = 0; i < vq->inflight->desc_num; i++) { 1265 if (vq->inflight->desc[i].inflight) { 1266 
vq->resubmit_list[vq->resubmit_num].index = i; 1267 vq->resubmit_list[vq->resubmit_num].counter = 1268 vq->inflight->desc[i].counter; 1269 vq->resubmit_num++; 1270 } 1271 } 1272 1273 if (vq->resubmit_num > 1) { 1274 qsort(vq->resubmit_list, vq->resubmit_num, 1275 sizeof(VuVirtqInflightDesc), inflight_desc_compare); 1276 } 1277 vq->counter = vq->resubmit_list[0].counter + 1; 1278 } 1279 1280 /* in case of I/O hang after reconnecting */ 1281 if (eventfd_write(vq->kick_fd, 1)) { 1282 return -1; 1283 } 1284 1285 return 0; 1286 } 1287 1288 static bool 1289 vu_set_vring_kick_exec(VuDev *dev, VhostUserMsg *vmsg) 1290 { 1291 int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; 1292 bool nofd = vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK; 1293 1294 DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64); 1295 1296 if (!vu_check_queue_msg_file(dev, vmsg)) { 1297 return false; 1298 } 1299 1300 if (dev->vq[index].kick_fd != -1) { 1301 dev->remove_watch(dev, dev->vq[index].kick_fd); 1302 close(dev->vq[index].kick_fd); 1303 dev->vq[index].kick_fd = -1; 1304 } 1305 1306 dev->vq[index].kick_fd = nofd ? -1 : vmsg->fds[0]; 1307 DPRINT("Got kick_fd: %d for vq: %d\n", dev->vq[index].kick_fd, index); 1308 1309 dev->vq[index].started = true; 1310 if (dev->iface->queue_set_started) { 1311 dev->iface->queue_set_started(dev, index, true); 1312 } 1313 1314 if (dev->vq[index].kick_fd != -1 && dev->vq[index].handler) { 1315 dev->set_watch(dev, dev->vq[index].kick_fd, VU_WATCH_IN, 1316 vu_kick_cb, (void *)(long)index); 1317 1318 DPRINT("Waiting for kicks on fd: %d for vq: %d\n", 1319 dev->vq[index].kick_fd, index); 1320 } 1321 1322 if (vu_check_queue_inflights(dev, &dev->vq[index])) { 1323 vu_panic(dev, "Failed to check inflights for vq: %d\n", index); 1324 } 1325 1326 return false; 1327 } 1328 1329 void vu_set_queue_handler(VuDev *dev, VuVirtq *vq, 1330 vu_queue_handler_cb handler) 1331 { 1332 int qidx = vq - dev->vq; 1333 1334 vq->handler = handler; 1335 if (vq->kick_fd >= 0) { 1336 if (handler) { 1337 dev->set_watch(dev, vq->kick_fd, VU_WATCH_IN, 1338 vu_kick_cb, (void *)(long)qidx); 1339 } else { 1340 dev->remove_watch(dev, vq->kick_fd); 1341 } 1342 } 1343 } 1344 1345 bool vu_set_queue_host_notifier(VuDev *dev, VuVirtq *vq, int fd, 1346 int size, int offset) 1347 { 1348 int qidx = vq - dev->vq; 1349 int fd_num = 0; 1350 VhostUserMsg vmsg = { 1351 .request = VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG, 1352 .flags = VHOST_USER_VERSION | VHOST_USER_NEED_REPLY_MASK, 1353 .size = sizeof(vmsg.payload.area), 1354 .payload.area = { 1355 .u64 = qidx & VHOST_USER_VRING_IDX_MASK, 1356 .size = size, 1357 .offset = offset, 1358 }, 1359 }; 1360 1361 if (fd == -1) { 1362 vmsg.payload.area.u64 |= VHOST_USER_VRING_NOFD_MASK; 1363 } else { 1364 vmsg.fds[fd_num++] = fd; 1365 } 1366 1367 vmsg.fd_num = fd_num; 1368 1369 if (!vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD)) { 1370 return false; 1371 } 1372 1373 pthread_mutex_lock(&dev->slave_mutex); 1374 if (!vu_message_write(dev, dev->slave_fd, &vmsg)) { 1375 pthread_mutex_unlock(&dev->slave_mutex); 1376 return false; 1377 } 1378 1379 /* Also unlocks the slave_mutex */ 1380 return vu_process_message_reply(dev, &vmsg); 1381 } 1382 1383 static bool 1384 vu_set_vring_call_exec(VuDev *dev, VhostUserMsg *vmsg) 1385 { 1386 int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; 1387 bool nofd = vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK; 1388 1389 DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64); 1390 1391 if (!vu_check_queue_msg_file(dev, vmsg)) { 1392 return 
false; 1393 } 1394 1395 if (dev->vq[index].call_fd != -1) { 1396 close(dev->vq[index].call_fd); 1397 dev->vq[index].call_fd = -1; 1398 } 1399 1400 dev->vq[index].call_fd = nofd ? -1 : vmsg->fds[0]; 1401 1402 /* in case of I/O hang after reconnecting */ 1403 if (dev->vq[index].call_fd != -1 && eventfd_write(vmsg->fds[0], 1)) { 1404 return -1; 1405 } 1406 1407 DPRINT("Got call_fd: %d for vq: %d\n", dev->vq[index].call_fd, index); 1408 1409 return false; 1410 } 1411 1412 static bool 1413 vu_set_vring_err_exec(VuDev *dev, VhostUserMsg *vmsg) 1414 { 1415 int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; 1416 bool nofd = vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK; 1417 1418 DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64); 1419 1420 if (!vu_check_queue_msg_file(dev, vmsg)) { 1421 return false; 1422 } 1423 1424 if (dev->vq[index].err_fd != -1) { 1425 close(dev->vq[index].err_fd); 1426 dev->vq[index].err_fd = -1; 1427 } 1428 1429 dev->vq[index].err_fd = nofd ? -1 : vmsg->fds[0]; 1430 1431 return false; 1432 } 1433 1434 static bool 1435 vu_get_protocol_features_exec(VuDev *dev, VhostUserMsg *vmsg) 1436 { 1437 /* 1438 * Note that we support, but intentionally do not set, 1439 * VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS. This means that 1440 * a device implementation can return it in its callback 1441 * (get_protocol_features) if it wants to use this for 1442 * simulation, but it is otherwise not desirable (if even 1443 * implemented by the master.) 1444 */ 1445 uint64_t features = 1ULL << VHOST_USER_PROTOCOL_F_MQ | 1446 1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD | 1447 1ULL << VHOST_USER_PROTOCOL_F_SLAVE_REQ | 1448 1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER | 1449 1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD | 1450 1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK | 1451 1ULL << VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS; 1452 1453 if (have_userfault()) { 1454 features |= 1ULL << VHOST_USER_PROTOCOL_F_PAGEFAULT; 1455 } 1456 1457 if (dev->iface->get_config && dev->iface->set_config) { 1458 features |= 1ULL << VHOST_USER_PROTOCOL_F_CONFIG; 1459 } 1460 1461 if (dev->iface->get_protocol_features) { 1462 features |= dev->iface->get_protocol_features(dev); 1463 } 1464 1465 vmsg_set_reply_u64(vmsg, features); 1466 return true; 1467 } 1468 1469 static bool 1470 vu_set_protocol_features_exec(VuDev *dev, VhostUserMsg *vmsg) 1471 { 1472 uint64_t features = vmsg->payload.u64; 1473 1474 DPRINT("u64: 0x%016"PRIx64"\n", features); 1475 1476 dev->protocol_features = vmsg->payload.u64; 1477 1478 if (vu_has_protocol_feature(dev, 1479 VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS) && 1480 (!vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_SLAVE_REQ) || 1481 !vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_REPLY_ACK))) { 1482 /* 1483 * The use case for using messages for kick/call is simulation, to make 1484 * the kick and call synchronous. To actually get that behaviour, both 1485 * of the other features are required. 1486 * Theoretically, one could use only kick messages, or do them without 1487 * having F_REPLY_ACK, but too many (possibly pending) messages on the 1488 * socket will eventually cause the master to hang, to avoid this in 1489 * scenarios where not desired enforce that the settings are in a way 1490 * that actually enables the simulation case. 
1491 */ 1492 vu_panic(dev, 1493 "F_IN_BAND_NOTIFICATIONS requires F_SLAVE_REQ && F_REPLY_ACK"); 1494 return false; 1495 } 1496 1497 if (dev->iface->set_protocol_features) { 1498 dev->iface->set_protocol_features(dev, features); 1499 } 1500 1501 return false; 1502 } 1503 1504 static bool 1505 vu_get_queue_num_exec(VuDev *dev, VhostUserMsg *vmsg) 1506 { 1507 vmsg_set_reply_u64(vmsg, dev->max_queues); 1508 return true; 1509 } 1510 1511 static bool 1512 vu_set_vring_enable_exec(VuDev *dev, VhostUserMsg *vmsg) 1513 { 1514 unsigned int index = vmsg->payload.state.index; 1515 unsigned int enable = vmsg->payload.state.num; 1516 1517 DPRINT("State.index: %u\n", index); 1518 DPRINT("State.enable: %u\n", enable); 1519 1520 if (index >= dev->max_queues) { 1521 vu_panic(dev, "Invalid vring_enable index: %u", index); 1522 return false; 1523 } 1524 1525 dev->vq[index].enable = enable; 1526 return false; 1527 } 1528 1529 static bool 1530 vu_set_slave_req_fd(VuDev *dev, VhostUserMsg *vmsg) 1531 { 1532 if (vmsg->fd_num != 1) { 1533 vu_panic(dev, "Invalid slave_req_fd message (%d fd's)", vmsg->fd_num); 1534 return false; 1535 } 1536 1537 if (dev->slave_fd != -1) { 1538 close(dev->slave_fd); 1539 } 1540 dev->slave_fd = vmsg->fds[0]; 1541 DPRINT("Got slave_fd: %d\n", vmsg->fds[0]); 1542 1543 return false; 1544 } 1545 1546 static bool 1547 vu_get_config(VuDev *dev, VhostUserMsg *vmsg) 1548 { 1549 int ret = -1; 1550 1551 if (dev->iface->get_config) { 1552 ret = dev->iface->get_config(dev, vmsg->payload.config.region, 1553 vmsg->payload.config.size); 1554 } 1555 1556 if (ret) { 1557 /* resize to zero to indicate an error to master */ 1558 vmsg->size = 0; 1559 } 1560 1561 return true; 1562 } 1563 1564 static bool 1565 vu_set_config(VuDev *dev, VhostUserMsg *vmsg) 1566 { 1567 int ret = -1; 1568 1569 if (dev->iface->set_config) { 1570 ret = dev->iface->set_config(dev, vmsg->payload.config.region, 1571 vmsg->payload.config.offset, 1572 vmsg->payload.config.size, 1573 vmsg->payload.config.flags); 1574 if (ret) { 1575 vu_panic(dev, "Set virtio configuration space failed"); 1576 } 1577 } 1578 1579 return false; 1580 } 1581 1582 static bool 1583 vu_set_postcopy_advise(VuDev *dev, VhostUserMsg *vmsg) 1584 { 1585 dev->postcopy_ufd = -1; 1586 #ifdef UFFDIO_API 1587 struct uffdio_api api_struct; 1588 1589 dev->postcopy_ufd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); 1590 vmsg->size = 0; 1591 #endif 1592 1593 if (dev->postcopy_ufd == -1) { 1594 vu_panic(dev, "Userfaultfd not available: %s", strerror(errno)); 1595 goto out; 1596 } 1597 1598 #ifdef UFFDIO_API 1599 api_struct.api = UFFD_API; 1600 api_struct.features = 0; 1601 if (ioctl(dev->postcopy_ufd, UFFDIO_API, &api_struct)) { 1602 vu_panic(dev, "Failed UFFDIO_API: %s", strerror(errno)); 1603 close(dev->postcopy_ufd); 1604 dev->postcopy_ufd = -1; 1605 goto out; 1606 } 1607 /* TODO: Stash feature flags somewhere */ 1608 #endif 1609 1610 out: 1611 /* Return a ufd to the QEMU */ 1612 vmsg->fd_num = 1; 1613 vmsg->fds[0] = dev->postcopy_ufd; 1614 return true; /* = send a reply */ 1615 } 1616 1617 static bool 1618 vu_set_postcopy_listen(VuDev *dev, VhostUserMsg *vmsg) 1619 { 1620 if (dev->nregions) { 1621 vu_panic(dev, "Regions already registered at postcopy-listen"); 1622 vmsg_set_reply_u64(vmsg, -1); 1623 return true; 1624 } 1625 dev->postcopy_listening = true; 1626 1627 vmsg_set_reply_u64(vmsg, 0); 1628 return true; 1629 } 1630 1631 static bool 1632 vu_set_postcopy_end(VuDev *dev, VhostUserMsg *vmsg) 1633 { 1634 DPRINT("%s: Entry\n", __func__); 1635 
dev->postcopy_listening = false; 1636 if (dev->postcopy_ufd > 0) { 1637 close(dev->postcopy_ufd); 1638 dev->postcopy_ufd = -1; 1639 DPRINT("%s: Done close\n", __func__); 1640 } 1641 1642 vmsg_set_reply_u64(vmsg, 0); 1643 DPRINT("%s: exit\n", __func__); 1644 return true; 1645 } 1646 1647 static inline uint64_t 1648 vu_inflight_queue_size(uint16_t queue_size) 1649 { 1650 return ALIGN_UP(sizeof(VuDescStateSplit) * queue_size + 1651 sizeof(uint16_t), INFLIGHT_ALIGNMENT); 1652 } 1653 1654 #ifdef MFD_ALLOW_SEALING 1655 static void * 1656 memfd_alloc(const char *name, size_t size, unsigned int flags, int *fd) 1657 { 1658 void *ptr; 1659 int ret; 1660 1661 *fd = memfd_create(name, MFD_ALLOW_SEALING); 1662 if (*fd < 0) { 1663 return NULL; 1664 } 1665 1666 ret = ftruncate(*fd, size); 1667 if (ret < 0) { 1668 close(*fd); 1669 return NULL; 1670 } 1671 1672 ret = fcntl(*fd, F_ADD_SEALS, flags); 1673 if (ret < 0) { 1674 close(*fd); 1675 return NULL; 1676 } 1677 1678 ptr = mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED, *fd, 0); 1679 if (ptr == MAP_FAILED) { 1680 close(*fd); 1681 return NULL; 1682 } 1683 1684 return ptr; 1685 } 1686 #endif 1687 1688 static bool 1689 vu_get_inflight_fd(VuDev *dev, VhostUserMsg *vmsg) 1690 { 1691 int fd = -1; 1692 void *addr = NULL; 1693 uint64_t mmap_size; 1694 uint16_t num_queues, queue_size; 1695 1696 if (vmsg->size != sizeof(vmsg->payload.inflight)) { 1697 vu_panic(dev, "Invalid get_inflight_fd message:%d", vmsg->size); 1698 vmsg->payload.inflight.mmap_size = 0; 1699 return true; 1700 } 1701 1702 num_queues = vmsg->payload.inflight.num_queues; 1703 queue_size = vmsg->payload.inflight.queue_size; 1704 1705 DPRINT("set_inflight_fd num_queues: %"PRId16"\n", num_queues); 1706 DPRINT("set_inflight_fd queue_size: %"PRId16"\n", queue_size); 1707 1708 mmap_size = vu_inflight_queue_size(queue_size) * num_queues; 1709 1710 #ifdef MFD_ALLOW_SEALING 1711 addr = memfd_alloc("vhost-inflight", mmap_size, 1712 F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL, 1713 &fd); 1714 #else 1715 vu_panic(dev, "Not implemented: memfd support is missing"); 1716 #endif 1717 1718 if (!addr) { 1719 vu_panic(dev, "Failed to alloc vhost inflight area"); 1720 vmsg->payload.inflight.mmap_size = 0; 1721 return true; 1722 } 1723 1724 memset(addr, 0, mmap_size); 1725 1726 dev->inflight_info.addr = addr; 1727 dev->inflight_info.size = vmsg->payload.inflight.mmap_size = mmap_size; 1728 dev->inflight_info.fd = vmsg->fds[0] = fd; 1729 vmsg->fd_num = 1; 1730 vmsg->payload.inflight.mmap_offset = 0; 1731 1732 DPRINT("send inflight mmap_size: %"PRId64"\n", 1733 vmsg->payload.inflight.mmap_size); 1734 DPRINT("send inflight mmap offset: %"PRId64"\n", 1735 vmsg->payload.inflight.mmap_offset); 1736 1737 return true; 1738 } 1739 1740 static bool 1741 vu_set_inflight_fd(VuDev *dev, VhostUserMsg *vmsg) 1742 { 1743 int fd, i; 1744 uint64_t mmap_size, mmap_offset; 1745 uint16_t num_queues, queue_size; 1746 void *rc; 1747 1748 if (vmsg->fd_num != 1 || 1749 vmsg->size != sizeof(vmsg->payload.inflight)) { 1750 vu_panic(dev, "Invalid set_inflight_fd message size:%d fds:%d", 1751 vmsg->size, vmsg->fd_num); 1752 return false; 1753 } 1754 1755 fd = vmsg->fds[0]; 1756 mmap_size = vmsg->payload.inflight.mmap_size; 1757 mmap_offset = vmsg->payload.inflight.mmap_offset; 1758 num_queues = vmsg->payload.inflight.num_queues; 1759 queue_size = vmsg->payload.inflight.queue_size; 1760 1761 DPRINT("set_inflight_fd mmap_size: %"PRId64"\n", mmap_size); 1762 DPRINT("set_inflight_fd mmap_offset: %"PRId64"\n", mmap_offset); 1763 
DPRINT("set_inflight_fd num_queues: %"PRId16"\n", num_queues); 1764 DPRINT("set_inflight_fd queue_size: %"PRId16"\n", queue_size); 1765 1766 rc = mmap(0, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, 1767 fd, mmap_offset); 1768 1769 if (rc == MAP_FAILED) { 1770 vu_panic(dev, "set_inflight_fd mmap error: %s", strerror(errno)); 1771 return false; 1772 } 1773 1774 if (dev->inflight_info.fd) { 1775 close(dev->inflight_info.fd); 1776 } 1777 1778 if (dev->inflight_info.addr) { 1779 munmap(dev->inflight_info.addr, dev->inflight_info.size); 1780 } 1781 1782 dev->inflight_info.fd = fd; 1783 dev->inflight_info.addr = rc; 1784 dev->inflight_info.size = mmap_size; 1785 1786 for (i = 0; i < num_queues; i++) { 1787 dev->vq[i].inflight = (VuVirtqInflight *)rc; 1788 dev->vq[i].inflight->desc_num = queue_size; 1789 rc = (void *)((char *)rc + vu_inflight_queue_size(queue_size)); 1790 } 1791 1792 return false; 1793 } 1794 1795 static bool 1796 vu_handle_vring_kick(VuDev *dev, VhostUserMsg *vmsg) 1797 { 1798 unsigned int index = vmsg->payload.state.index; 1799 1800 if (index >= dev->max_queues) { 1801 vu_panic(dev, "Invalid queue index: %u", index); 1802 return false; 1803 } 1804 1805 DPRINT("Got kick message: handler:%p idx:%u\n", 1806 dev->vq[index].handler, index); 1807 1808 if (!dev->vq[index].started) { 1809 dev->vq[index].started = true; 1810 1811 if (dev->iface->queue_set_started) { 1812 dev->iface->queue_set_started(dev, index, true); 1813 } 1814 } 1815 1816 if (dev->vq[index].handler) { 1817 dev->vq[index].handler(dev, index); 1818 } 1819 1820 return false; 1821 } 1822 1823 static bool vu_handle_get_max_memslots(VuDev *dev, VhostUserMsg *vmsg) 1824 { 1825 vmsg_set_reply_u64(vmsg, VHOST_USER_MAX_RAM_SLOTS); 1826 1827 DPRINT("u64: 0x%016"PRIx64"\n", (uint64_t) VHOST_USER_MAX_RAM_SLOTS); 1828 1829 return true; 1830 } 1831 1832 static bool 1833 vu_process_message(VuDev *dev, VhostUserMsg *vmsg) 1834 { 1835 int do_reply = 0; 1836 1837 /* Print out generic part of the request. 
*/ 1838 DPRINT("================ Vhost user message ================\n"); 1839 DPRINT("Request: %s (%d)\n", vu_request_to_string(vmsg->request), 1840 vmsg->request); 1841 DPRINT("Flags: 0x%x\n", vmsg->flags); 1842 DPRINT("Size: %u\n", vmsg->size); 1843 1844 if (vmsg->fd_num) { 1845 int i; 1846 DPRINT("Fds:"); 1847 for (i = 0; i < vmsg->fd_num; i++) { 1848 DPRINT(" %d", vmsg->fds[i]); 1849 } 1850 DPRINT("\n"); 1851 } 1852 1853 if (dev->iface->process_msg && 1854 dev->iface->process_msg(dev, vmsg, &do_reply)) { 1855 return do_reply; 1856 } 1857 1858 switch (vmsg->request) { 1859 case VHOST_USER_GET_FEATURES: 1860 return vu_get_features_exec(dev, vmsg); 1861 case VHOST_USER_SET_FEATURES: 1862 return vu_set_features_exec(dev, vmsg); 1863 case VHOST_USER_GET_PROTOCOL_FEATURES: 1864 return vu_get_protocol_features_exec(dev, vmsg); 1865 case VHOST_USER_SET_PROTOCOL_FEATURES: 1866 return vu_set_protocol_features_exec(dev, vmsg); 1867 case VHOST_USER_SET_OWNER: 1868 return vu_set_owner_exec(dev, vmsg); 1869 case VHOST_USER_RESET_OWNER: 1870 return vu_reset_device_exec(dev, vmsg); 1871 case VHOST_USER_SET_MEM_TABLE: 1872 return vu_set_mem_table_exec(dev, vmsg); 1873 case VHOST_USER_SET_LOG_BASE: 1874 return vu_set_log_base_exec(dev, vmsg); 1875 case VHOST_USER_SET_LOG_FD: 1876 return vu_set_log_fd_exec(dev, vmsg); 1877 case VHOST_USER_SET_VRING_NUM: 1878 return vu_set_vring_num_exec(dev, vmsg); 1879 case VHOST_USER_SET_VRING_ADDR: 1880 return vu_set_vring_addr_exec(dev, vmsg); 1881 case VHOST_USER_SET_VRING_BASE: 1882 return vu_set_vring_base_exec(dev, vmsg); 1883 case VHOST_USER_GET_VRING_BASE: 1884 return vu_get_vring_base_exec(dev, vmsg); 1885 case VHOST_USER_SET_VRING_KICK: 1886 return vu_set_vring_kick_exec(dev, vmsg); 1887 case VHOST_USER_SET_VRING_CALL: 1888 return vu_set_vring_call_exec(dev, vmsg); 1889 case VHOST_USER_SET_VRING_ERR: 1890 return vu_set_vring_err_exec(dev, vmsg); 1891 case VHOST_USER_GET_QUEUE_NUM: 1892 return vu_get_queue_num_exec(dev, vmsg); 1893 case VHOST_USER_SET_VRING_ENABLE: 1894 return vu_set_vring_enable_exec(dev, vmsg); 1895 case VHOST_USER_SET_SLAVE_REQ_FD: 1896 return vu_set_slave_req_fd(dev, vmsg); 1897 case VHOST_USER_GET_CONFIG: 1898 return vu_get_config(dev, vmsg); 1899 case VHOST_USER_SET_CONFIG: 1900 return vu_set_config(dev, vmsg); 1901 case VHOST_USER_NONE: 1902 /* if you need processing before exit, override iface->process_msg */ 1903 exit(0); 1904 case VHOST_USER_POSTCOPY_ADVISE: 1905 return vu_set_postcopy_advise(dev, vmsg); 1906 case VHOST_USER_POSTCOPY_LISTEN: 1907 return vu_set_postcopy_listen(dev, vmsg); 1908 case VHOST_USER_POSTCOPY_END: 1909 return vu_set_postcopy_end(dev, vmsg); 1910 case VHOST_USER_GET_INFLIGHT_FD: 1911 return vu_get_inflight_fd(dev, vmsg); 1912 case VHOST_USER_SET_INFLIGHT_FD: 1913 return vu_set_inflight_fd(dev, vmsg); 1914 case VHOST_USER_VRING_KICK: 1915 return vu_handle_vring_kick(dev, vmsg); 1916 case VHOST_USER_GET_MAX_MEM_SLOTS: 1917 return vu_handle_get_max_memslots(dev, vmsg); 1918 case VHOST_USER_ADD_MEM_REG: 1919 return vu_add_mem_reg(dev, vmsg); 1920 case VHOST_USER_REM_MEM_REG: 1921 return vu_rem_mem_reg(dev, vmsg); 1922 default: 1923 vmsg_close_fds(vmsg); 1924 vu_panic(dev, "Unhandled request: %d", vmsg->request); 1925 } 1926 1927 return false; 1928 } 1929 1930 bool 1931 vu_dispatch(VuDev *dev) 1932 { 1933 VhostUserMsg vmsg = { 0, }; 1934 int reply_requested; 1935 bool need_reply, success = false; 1936 1937 if (!dev->read_msg(dev, dev->sock, &vmsg)) { 1938 goto end; 1939 } 1940 1941 need_reply = vmsg.flags & 
VHOST_USER_NEED_REPLY_MASK; 1942 1943 reply_requested = vu_process_message(dev, &vmsg); 1944 if (!reply_requested && need_reply) { 1945 vmsg_set_reply_u64(&vmsg, 0); 1946 reply_requested = 1; 1947 } 1948 1949 if (!reply_requested) { 1950 success = true; 1951 goto end; 1952 } 1953 1954 if (!vu_send_reply(dev, dev->sock, &vmsg)) { 1955 goto end; 1956 } 1957 1958 success = true; 1959 1960 end: 1961 free(vmsg.data); 1962 return success; 1963 } 1964 1965 void 1966 vu_deinit(VuDev *dev) 1967 { 1968 int i; 1969 1970 for (i = 0; i < dev->nregions; i++) { 1971 VuDevRegion *r = &dev->regions[i]; 1972 void *m = (void *) (uintptr_t) r->mmap_addr; 1973 if (m != MAP_FAILED) { 1974 munmap(m, r->size + r->mmap_offset); 1975 } 1976 } 1977 dev->nregions = 0; 1978 1979 for (i = 0; i < dev->max_queues; i++) { 1980 VuVirtq *vq = &dev->vq[i]; 1981 1982 if (vq->call_fd != -1) { 1983 close(vq->call_fd); 1984 vq->call_fd = -1; 1985 } 1986 1987 if (vq->kick_fd != -1) { 1988 dev->remove_watch(dev, vq->kick_fd); 1989 close(vq->kick_fd); 1990 vq->kick_fd = -1; 1991 } 1992 1993 if (vq->err_fd != -1) { 1994 close(vq->err_fd); 1995 vq->err_fd = -1; 1996 } 1997 1998 if (vq->resubmit_list) { 1999 free(vq->resubmit_list); 2000 vq->resubmit_list = NULL; 2001 } 2002 2003 vq->inflight = NULL; 2004 } 2005 2006 if (dev->inflight_info.addr) { 2007 munmap(dev->inflight_info.addr, dev->inflight_info.size); 2008 dev->inflight_info.addr = NULL; 2009 } 2010 2011 if (dev->inflight_info.fd > 0) { 2012 close(dev->inflight_info.fd); 2013 dev->inflight_info.fd = -1; 2014 } 2015 2016 vu_close_log(dev); 2017 if (dev->slave_fd != -1) { 2018 close(dev->slave_fd); 2019 dev->slave_fd = -1; 2020 } 2021 pthread_mutex_destroy(&dev->slave_mutex); 2022 2023 if (dev->sock != -1) { 2024 close(dev->sock); 2025 } 2026 2027 free(dev->vq); 2028 dev->vq = NULL; 2029 } 2030 2031 bool 2032 vu_init(VuDev *dev, 2033 uint16_t max_queues, 2034 int socket, 2035 vu_panic_cb panic, 2036 vu_read_msg_cb read_msg, 2037 vu_set_watch_cb set_watch, 2038 vu_remove_watch_cb remove_watch, 2039 const VuDevIface *iface) 2040 { 2041 uint16_t i; 2042 2043 assert(max_queues > 0); 2044 assert(socket >= 0); 2045 assert(set_watch); 2046 assert(remove_watch); 2047 assert(iface); 2048 assert(panic); 2049 2050 memset(dev, 0, sizeof(*dev)); 2051 2052 dev->sock = socket; 2053 dev->panic = panic; 2054 dev->read_msg = read_msg ? 
read_msg : vu_message_read_default; 2055 dev->set_watch = set_watch; 2056 dev->remove_watch = remove_watch; 2057 dev->iface = iface; 2058 dev->log_call_fd = -1; 2059 pthread_mutex_init(&dev->slave_mutex, NULL); 2060 dev->slave_fd = -1; 2061 dev->max_queues = max_queues; 2062 2063 dev->vq = malloc(max_queues * sizeof(dev->vq[0])); 2064 if (!dev->vq) { 2065 DPRINT("%s: failed to malloc virtqueues\n", __func__); 2066 return false; 2067 } 2068 2069 for (i = 0; i < max_queues; i++) { 2070 dev->vq[i] = (VuVirtq) { 2071 .call_fd = -1, .kick_fd = -1, .err_fd = -1, 2072 .notification = true, 2073 }; 2074 } 2075 2076 return true; 2077 } 2078 2079 VuVirtq * 2080 vu_get_queue(VuDev *dev, int qidx) 2081 { 2082 assert(qidx < dev->max_queues); 2083 return &dev->vq[qidx]; 2084 } 2085 2086 bool 2087 vu_queue_enabled(VuDev *dev, VuVirtq *vq) 2088 { 2089 return vq->enable; 2090 } 2091 2092 bool 2093 vu_queue_started(const VuDev *dev, const VuVirtq *vq) 2094 { 2095 return vq->started; 2096 } 2097 2098 static inline uint16_t 2099 vring_avail_flags(VuVirtq *vq) 2100 { 2101 return le16toh(vq->vring.avail->flags); 2102 } 2103 2104 static inline uint16_t 2105 vring_avail_idx(VuVirtq *vq) 2106 { 2107 vq->shadow_avail_idx = le16toh(vq->vring.avail->idx); 2108 2109 return vq->shadow_avail_idx; 2110 } 2111 2112 static inline uint16_t 2113 vring_avail_ring(VuVirtq *vq, int i) 2114 { 2115 return le16toh(vq->vring.avail->ring[i]); 2116 } 2117 2118 static inline uint16_t 2119 vring_get_used_event(VuVirtq *vq) 2120 { 2121 return vring_avail_ring(vq, vq->vring.num); 2122 } 2123 2124 static int 2125 virtqueue_num_heads(VuDev *dev, VuVirtq *vq, unsigned int idx) 2126 { 2127 uint16_t num_heads = vring_avail_idx(vq) - idx; 2128 2129 /* Check it isn't doing very strange things with descriptor numbers. */ 2130 if (num_heads > vq->vring.num) { 2131 vu_panic(dev, "Guest moved used index from %u to %u", 2132 idx, vq->shadow_avail_idx); 2133 return -1; 2134 } 2135 if (num_heads) { 2136 /* On success, callers read a descriptor at vq->last_avail_idx. 2137 * Make sure descriptor read does not bypass avail index read. */ 2138 smp_rmb(); 2139 } 2140 2141 return num_heads; 2142 } 2143 2144 static bool 2145 virtqueue_get_head(VuDev *dev, VuVirtq *vq, 2146 unsigned int idx, unsigned int *head) 2147 { 2148 /* Grab the next descriptor number they're advertising, and increment 2149 * the index we've seen. */ 2150 *head = vring_avail_ring(vq, idx % vq->vring.num); 2151 2152 /* If their number is silly, that's a fatal mistake. 
static int
virtqueue_read_indirect_desc(VuDev *dev, struct vring_desc *desc,
                             uint64_t addr, size_t len)
{
    struct vring_desc *ori_desc;
    uint64_t read_len;

    if (len > (VIRTQUEUE_MAX_SIZE * sizeof(struct vring_desc))) {
        return -1;
    }

    if (len == 0) {
        return -1;
    }

    while (len) {
        read_len = len;
        ori_desc = vu_gpa_to_va(dev, &read_len, addr);
        if (!ori_desc) {
            return -1;
        }

        memcpy(desc, ori_desc, read_len);
        len -= read_len;
        addr += read_len;
        desc += read_len;
    }

    return 0;
}

enum {
    VIRTQUEUE_READ_DESC_ERROR = -1,
    VIRTQUEUE_READ_DESC_DONE = 0,   /* end of chain */
    VIRTQUEUE_READ_DESC_MORE = 1,   /* more buffers in chain */
};

static int
virtqueue_read_next_desc(VuDev *dev, struct vring_desc *desc,
                         int i, unsigned int max, unsigned int *next)
{
    /* If this descriptor says it doesn't chain, we're done. */
    if (!(le16toh(desc[i].flags) & VRING_DESC_F_NEXT)) {
        return VIRTQUEUE_READ_DESC_DONE;
    }

    /* Check they're not leading us off end of descriptors. */
    *next = le16toh(desc[i].next);
    /* Make sure compiler knows to grab that: we don't want it changing! */
    smp_wmb();

    if (*next >= max) {
        vu_panic(dev, "Desc next is %u", *next);
        return VIRTQUEUE_READ_DESC_ERROR;
    }

    return VIRTQUEUE_READ_DESC_MORE;
}
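
/*
 * The helpers above are consumed with the same loop shape throughout this
 * file; a condensed sketch of that pattern, for orientation only:
 *
 *     unsigned int i = head, max = vq->vring.num;
 *     int rc;
 *
 *     do {
 *         // ... inspect desc[i].addr / desc[i].len / desc[i].flags ...
 *         rc = virtqueue_read_next_desc(dev, desc, i, max, &i);
 *     } while (rc == VIRTQUEUE_READ_DESC_MORE);
 *
 *     if (rc == VIRTQUEUE_READ_DESC_ERROR) {
 *         // the chain was malformed; vu_panic() has already marked the
 *         // device broken, so just bail out
 *     }
 */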
void
vu_queue_get_avail_bytes(VuDev *dev, VuVirtq *vq, unsigned int *in_bytes,
                         unsigned int *out_bytes,
                         unsigned max_in_bytes, unsigned max_out_bytes)
{
    unsigned int idx;
    unsigned int total_bufs, in_total, out_total;
    int rc;

    idx = vq->last_avail_idx;

    total_bufs = in_total = out_total = 0;
    if (unlikely(dev->broken) ||
        unlikely(!vq->vring.avail)) {
        goto done;
    }

    while ((rc = virtqueue_num_heads(dev, vq, idx)) > 0) {
        unsigned int max, desc_len, num_bufs, indirect = 0;
        uint64_t desc_addr, read_len;
        struct vring_desc *desc;
        struct vring_desc desc_buf[VIRTQUEUE_MAX_SIZE];
        unsigned int i;

        max = vq->vring.num;
        num_bufs = total_bufs;
        if (!virtqueue_get_head(dev, vq, idx++, &i)) {
            goto err;
        }
        desc = vq->vring.desc;

        if (le16toh(desc[i].flags) & VRING_DESC_F_INDIRECT) {
            if (le32toh(desc[i].len) % sizeof(struct vring_desc)) {
                vu_panic(dev, "Invalid size for indirect buffer table");
                goto err;
            }

            /* If we've got too many, that implies a descriptor loop. */
            if (num_bufs >= max) {
                vu_panic(dev, "Looped descriptor");
                goto err;
            }

            /* loop over the indirect descriptor table */
            indirect = 1;
            desc_addr = le64toh(desc[i].addr);
            desc_len = le32toh(desc[i].len);
            max = desc_len / sizeof(struct vring_desc);
            read_len = desc_len;
            desc = vu_gpa_to_va(dev, &read_len, desc_addr);
            if (unlikely(desc && read_len != desc_len)) {
                /* Failed to use zero copy */
                desc = NULL;
                if (!virtqueue_read_indirect_desc(dev, desc_buf,
                                                  desc_addr,
                                                  desc_len)) {
                    desc = desc_buf;
                }
            }
            if (!desc) {
                vu_panic(dev, "Invalid indirect buffer table");
                goto err;
            }
            num_bufs = i = 0;
        }

        do {
            /* If we've got too many, that implies a descriptor loop. */
            if (++num_bufs > max) {
                vu_panic(dev, "Looped descriptor");
                goto err;
            }

            if (le16toh(desc[i].flags) & VRING_DESC_F_WRITE) {
                in_total += le32toh(desc[i].len);
            } else {
                out_total += le32toh(desc[i].len);
            }
            if (in_total >= max_in_bytes && out_total >= max_out_bytes) {
                goto done;
            }
            rc = virtqueue_read_next_desc(dev, desc, i, max, &i);
        } while (rc == VIRTQUEUE_READ_DESC_MORE);

        if (rc == VIRTQUEUE_READ_DESC_ERROR) {
            goto err;
        }

        if (!indirect) {
            total_bufs = num_bufs;
        } else {
            total_bufs++;
        }
    }
    if (rc < 0) {
        goto err;
    }
done:
    if (in_bytes) {
        *in_bytes = in_total;
    }
    if (out_bytes) {
        *out_bytes = out_total;
    }
    return;

err:
    in_total = out_total = 0;
    goto done;
}

bool
vu_queue_avail_bytes(VuDev *dev, VuVirtq *vq, unsigned int in_bytes,
                     unsigned int out_bytes)
{
    unsigned int in_total, out_total;

    vu_queue_get_avail_bytes(dev, vq, &in_total, &out_total,
                             in_bytes, out_bytes);

    return in_bytes <= in_total && out_bytes <= out_total;
}

/* Fetch avail_idx from VQ memory only when we really need to know if
 * guest has added some buffers. */
bool
vu_queue_empty(VuDev *dev, VuVirtq *vq)
{
    if (unlikely(dev->broken) ||
        unlikely(!vq->vring.avail)) {
        return true;
    }

    if (vq->shadow_avail_idx != vq->last_avail_idx) {
        return false;
    }

    return vring_avail_idx(vq) == vq->last_avail_idx;
}
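
/*
 * Example use of vu_queue_avail_bytes(): a backend that wants to place a
 * `len`-byte payload into device-writable (in) buffers can check for room
 * before committing to a pop.  Sketch only; deliver_reply() is a
 * hypothetical helper.
 *
 *     if (vu_queue_avail_bytes(dev, vq, len, 0)) {
 *         // at least `len` device-writable bytes are queued by the driver
 *         deliver_reply(dev, vq, len);
 *     }
 *
 * Passing the limits the other way round (0, len) checks for `len` bytes of
 * driver-written (out) data instead.
 */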
static bool
vring_notify(VuDev *dev, VuVirtq *vq)
{
    uint16_t old, new;
    bool v;

    /* We need to expose used array entries before checking used event. */
    smp_mb();

    /* Always notify when queue is empty (when feature acknowledge) */
    if (vu_has_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY) &&
        !vq->inuse && vu_queue_empty(dev, vq)) {
        return true;
    }

    if (!vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
        return !(vring_avail_flags(vq) & VRING_AVAIL_F_NO_INTERRUPT);
    }

    v = vq->signalled_used_valid;
    vq->signalled_used_valid = true;
    old = vq->signalled_used;
    new = vq->signalled_used = vq->used_idx;
    return !v || vring_need_event(vring_get_used_event(vq), new, old);
}

static void _vu_queue_notify(VuDev *dev, VuVirtq *vq, bool sync)
{
    if (unlikely(dev->broken) ||
        unlikely(!vq->vring.avail)) {
        return;
    }

    if (!vring_notify(dev, vq)) {
        DPRINT("skipped notify...\n");
        return;
    }

    if (vq->call_fd < 0 &&
        vu_has_protocol_feature(dev,
                                VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS) &&
        vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_SLAVE_REQ)) {
        VhostUserMsg vmsg = {
            .request = VHOST_USER_SLAVE_VRING_CALL,
            .flags = VHOST_USER_VERSION,
            .size = sizeof(vmsg.payload.state),
            .payload.state = {
                .index = vq - dev->vq,
            },
        };
        bool ack = sync &&
                   vu_has_protocol_feature(dev,
                                           VHOST_USER_PROTOCOL_F_REPLY_ACK);

        if (ack) {
            vmsg.flags |= VHOST_USER_NEED_REPLY_MASK;
        }

        vu_message_write(dev, dev->slave_fd, &vmsg);
        if (ack) {
            vu_message_read_default(dev, dev->slave_fd, &vmsg);
        }
        return;
    }

    if (eventfd_write(vq->call_fd, 1) < 0) {
        vu_panic(dev, "Error writing eventfd: %s", strerror(errno));
    }
}

void vu_queue_notify(VuDev *dev, VuVirtq *vq)
{
    _vu_queue_notify(dev, vq, false);
}

void vu_queue_notify_sync(VuDev *dev, VuVirtq *vq)
{
    _vu_queue_notify(dev, vq, true);
}

static inline void
vring_used_flags_set_bit(VuVirtq *vq, int mask)
{
    uint16_t *flags;

    flags = (uint16_t *)((char *)vq->vring.used +
                         offsetof(struct vring_used, flags));
    *flags = htole16(le16toh(*flags) | mask);
}

static inline void
vring_used_flags_unset_bit(VuVirtq *vq, int mask)
{
    uint16_t *flags;

    flags = (uint16_t *)((char *)vq->vring.used +
                         offsetof(struct vring_used, flags));
    *flags = htole16(le16toh(*flags) & ~mask);
}

static inline void
vring_set_avail_event(VuVirtq *vq, uint16_t val)
{
    uint16_t *avail;

    if (!vq->notification) {
        return;
    }

    avail = (uint16_t *)&vq->vring.used->ring[vq->vring.num];
    *avail = htole16(val);
}

void
vu_queue_set_notification(VuDev *dev, VuVirtq *vq, int enable)
{
    vq->notification = enable;
    if (vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
        vring_set_avail_event(vq, vring_avail_idx(vq));
    } else if (enable) {
        vring_used_flags_unset_bit(vq, VRING_USED_F_NO_NOTIFY);
    } else {
        vring_used_flags_set_bit(vq, VRING_USED_F_NO_NOTIFY);
    }
    if (enable) {
        /* Expose avail event/used flags before caller checks the avail idx. */
        smp_mb();
    }
}
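
/*
 * vu_queue_set_notification() is typically used to batch guest kicks: the
 * backend disables notifications while it drains the ring, then re-enables
 * them and re-checks for buffers that raced in before going back to sleep.
 * A sketch of that pattern, assuming a hypothetical process_request()
 * helper:
 *
 *     do {
 *         VuVirtqElement *elem;
 *
 *         vu_queue_set_notification(dev, vq, 0);
 *         while ((elem = vu_queue_pop(dev, vq, sizeof(*elem)))) {
 *             process_request(dev, vq, elem);
 *             free(elem);
 *         }
 *         vu_queue_set_notification(dev, vq, 1);
 *     } while (!vu_queue_empty(dev, vq));
 */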
static bool
virtqueue_map_desc(VuDev *dev,
                   unsigned int *p_num_sg, struct iovec *iov,
                   unsigned int max_num_sg, bool is_write,
                   uint64_t pa, size_t sz)
{
    unsigned num_sg = *p_num_sg;

    assert(num_sg <= max_num_sg);

    if (!sz) {
        vu_panic(dev, "virtio: zero sized buffers are not allowed");
        return false;
    }

    while (sz) {
        uint64_t len = sz;

        if (num_sg == max_num_sg) {
            vu_panic(dev, "virtio: too many descriptors in indirect table");
            return false;
        }

        iov[num_sg].iov_base = vu_gpa_to_va(dev, &len, pa);
        if (iov[num_sg].iov_base == NULL) {
            vu_panic(dev, "virtio: invalid address for buffers");
            return false;
        }
        iov[num_sg].iov_len = len;
        num_sg++;
        sz -= len;
        pa += len;
    }

    *p_num_sg = num_sg;
    return true;
}

static void *
virtqueue_alloc_element(size_t sz,
                        unsigned out_num, unsigned in_num)
{
    VuVirtqElement *elem;
    size_t in_sg_ofs = ALIGN_UP(sz, __alignof__(elem->in_sg[0]));
    size_t out_sg_ofs = in_sg_ofs + in_num * sizeof(elem->in_sg[0]);
    size_t out_sg_end = out_sg_ofs + out_num * sizeof(elem->out_sg[0]);

    assert(sz >= sizeof(VuVirtqElement));
    elem = malloc(out_sg_end);
    if (!elem) {
        /* Defensive: malloc() can fail; let the caller handle NULL. */
        DPRINT("%s: failed to malloc virtqueue element\n", __func__);
        return NULL;
    }
    elem->out_num = out_num;
    elem->in_num = in_num;
    elem->in_sg = (void *)elem + in_sg_ofs;
    elem->out_sg = (void *)elem + out_sg_ofs;
    return elem;
}

static void *
vu_queue_map_desc(VuDev *dev, VuVirtq *vq, unsigned int idx, size_t sz)
{
    struct vring_desc *desc = vq->vring.desc;
    uint64_t desc_addr, read_len;
    unsigned int desc_len;
    unsigned int max = vq->vring.num;
    unsigned int i = idx;
    VuVirtqElement *elem;
    unsigned int out_num = 0, in_num = 0;
    struct iovec iov[VIRTQUEUE_MAX_SIZE];
    struct vring_desc desc_buf[VIRTQUEUE_MAX_SIZE];
    int rc;

    if (le16toh(desc[i].flags) & VRING_DESC_F_INDIRECT) {
        if (le32toh(desc[i].len) % sizeof(struct vring_desc)) {
            vu_panic(dev, "Invalid size for indirect buffer table");
            return NULL;
        }

        /* loop over the indirect descriptor table */
        desc_addr = le64toh(desc[i].addr);
        desc_len = le32toh(desc[i].len);
        max = desc_len / sizeof(struct vring_desc);
        read_len = desc_len;
        desc = vu_gpa_to_va(dev, &read_len, desc_addr);
        if (unlikely(desc && read_len != desc_len)) {
            /* Failed to use zero copy */
            desc = NULL;
            if (!virtqueue_read_indirect_desc(dev, desc_buf,
                                              desc_addr,
                                              desc_len)) {
                desc = desc_buf;
            }
        }
        if (!desc) {
            vu_panic(dev, "Invalid indirect buffer table");
            return NULL;
        }
        i = 0;
    }

    /* Collect all the descriptors */
    do {
        if (le16toh(desc[i].flags) & VRING_DESC_F_WRITE) {
            if (!virtqueue_map_desc(dev, &in_num, iov + out_num,
                                    VIRTQUEUE_MAX_SIZE - out_num, true,
                                    le64toh(desc[i].addr),
                                    le32toh(desc[i].len))) {
                return NULL;
            }
        } else {
            if (in_num) {
                vu_panic(dev, "Incorrect order for descriptors");
                return NULL;
            }
            if (!virtqueue_map_desc(dev, &out_num, iov,
                                    VIRTQUEUE_MAX_SIZE, false,
                                    le64toh(desc[i].addr),
                                    le32toh(desc[i].len))) {
                return NULL;
            }
        }

        /* If we've got too many, that implies a descriptor loop. */
        if ((in_num + out_num) > max) {
            vu_panic(dev, "Looped descriptor");
            return NULL;
        }
        rc = virtqueue_read_next_desc(dev, desc, i, max, &i);
    } while (rc == VIRTQUEUE_READ_DESC_MORE);

    if (rc == VIRTQUEUE_READ_DESC_ERROR) {
        vu_panic(dev, "read descriptor error");
        return NULL;
    }

    /* Now copy what we have collected and mapped */
    elem = virtqueue_alloc_element(sz, out_num, in_num);
    if (!elem) {
        return NULL;
    }
    elem->index = idx;
    for (i = 0; i < out_num; i++) {
        elem->out_sg[i] = iov[i];
    }
    for (i = 0; i < in_num; i++) {
        elem->in_sg[i] = iov[out_num + i];
    }

    return elem;
}
static int
vu_queue_inflight_get(VuDev *dev, VuVirtq *vq, int desc_idx)
{
    if (!vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
        return 0;
    }

    if (unlikely(!vq->inflight)) {
        return -1;
    }

    vq->inflight->desc[desc_idx].counter = vq->counter++;
    vq->inflight->desc[desc_idx].inflight = 1;

    return 0;
}

static int
vu_queue_inflight_pre_put(VuDev *dev, VuVirtq *vq, int desc_idx)
{
    if (!vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
        return 0;
    }

    if (unlikely(!vq->inflight)) {
        return -1;
    }

    vq->inflight->last_batch_head = desc_idx;

    return 0;
}

static int
vu_queue_inflight_post_put(VuDev *dev, VuVirtq *vq, int desc_idx)
{
    if (!vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
        return 0;
    }

    if (unlikely(!vq->inflight)) {
        return -1;
    }

    barrier();

    vq->inflight->desc[desc_idx].inflight = 0;

    barrier();

    vq->inflight->used_idx = vq->used_idx;

    return 0;
}

void *
vu_queue_pop(VuDev *dev, VuVirtq *vq, size_t sz)
{
    int i;
    unsigned int head;
    VuVirtqElement *elem;

    if (unlikely(dev->broken) ||
        unlikely(!vq->vring.avail)) {
        return NULL;
    }

    if (unlikely(vq->resubmit_list && vq->resubmit_num > 0)) {
        i = (--vq->resubmit_num);
        elem = vu_queue_map_desc(dev, vq, vq->resubmit_list[i].index, sz);

        if (!vq->resubmit_num) {
            free(vq->resubmit_list);
            vq->resubmit_list = NULL;
        }

        return elem;
    }

    if (vu_queue_empty(dev, vq)) {
        return NULL;
    }
    /*
     * Needed after virtio_queue_empty(), see comment in
     * virtqueue_num_heads().
     */
    smp_rmb();

    if (vq->inuse >= vq->vring.num) {
        vu_panic(dev, "Virtqueue size exceeded");
        return NULL;
    }

    if (!virtqueue_get_head(dev, vq, vq->last_avail_idx++, &head)) {
        return NULL;
    }

    if (vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
        vring_set_avail_event(vq, vq->last_avail_idx);
    }

    elem = vu_queue_map_desc(dev, vq, head, sz);

    if (!elem) {
        return NULL;
    }

    vq->inuse++;

    vu_queue_inflight_get(dev, vq, head);

    return elem;
}
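
/*
 * The `sz` argument of vu_queue_pop() (and of vu_queue_map_desc() above) lets
 * a backend over-allocate the element so it sits at the head of a private,
 * per-request structure.  Sketch of that idiom; MyRequest and its extra
 * fields are hypothetical:
 *
 *     typedef struct MyRequest {
 *         VuVirtqElement elem;    // must be the first member
 *         uint32_t opcode;
 *         int status;
 *     } MyRequest;
 *
 *     MyRequest *req = vu_queue_pop(dev, vq, sizeof(*req));
 *     if (req) {
 *         // req->elem.out_sg / req->elem.in_sg describe the guest buffers
 *         // ... process, then vu_queue_push(dev, vq, &req->elem, written) ...
 *         free(req);
 *     }
 */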
static void
vu_queue_detach_element(VuDev *dev, VuVirtq *vq, VuVirtqElement *elem,
                        size_t len)
{
    vq->inuse--;
    /* unmap, when DMA support is added */
}

void
vu_queue_unpop(VuDev *dev, VuVirtq *vq, VuVirtqElement *elem,
               size_t len)
{
    vq->last_avail_idx--;
    vu_queue_detach_element(dev, vq, elem, len);
}

bool
vu_queue_rewind(VuDev *dev, VuVirtq *vq, unsigned int num)
{
    if (num > vq->inuse) {
        return false;
    }
    vq->last_avail_idx -= num;
    vq->inuse -= num;
    return true;
}

static inline
void vring_used_write(VuDev *dev, VuVirtq *vq,
                      struct vring_used_elem *uelem, int i)
{
    struct vring_used *used = vq->vring.used;

    used->ring[i] = *uelem;
    vu_log_write(dev, vq->vring.log_guest_addr +
                 offsetof(struct vring_used, ring[i]),
                 sizeof(used->ring[i]));
}

static void
vu_log_queue_fill(VuDev *dev, VuVirtq *vq,
                  const VuVirtqElement *elem,
                  unsigned int len)
{
    struct vring_desc *desc = vq->vring.desc;
    unsigned int i, max, min, desc_len;
    uint64_t desc_addr, read_len;
    struct vring_desc desc_buf[VIRTQUEUE_MAX_SIZE];
    unsigned num_bufs = 0;

    max = vq->vring.num;
    i = elem->index;

    if (le16toh(desc[i].flags) & VRING_DESC_F_INDIRECT) {
        if (le32toh(desc[i].len) % sizeof(struct vring_desc)) {
            vu_panic(dev, "Invalid size for indirect buffer table");
            return;
        }

        /* loop over the indirect descriptor table */
        desc_addr = le64toh(desc[i].addr);
        desc_len = le32toh(desc[i].len);
        max = desc_len / sizeof(struct vring_desc);
        read_len = desc_len;
        desc = vu_gpa_to_va(dev, &read_len, desc_addr);
        if (unlikely(desc && read_len != desc_len)) {
            /* Failed to use zero copy */
            desc = NULL;
            if (!virtqueue_read_indirect_desc(dev, desc_buf,
                                              desc_addr,
                                              desc_len)) {
                desc = desc_buf;
            }
        }
        if (!desc) {
            vu_panic(dev, "Invalid indirect buffer table");
            return;
        }
        i = 0;
    }

    do {
        if (++num_bufs > max) {
            vu_panic(dev, "Looped descriptor");
            return;
        }

        if (le16toh(desc[i].flags) & VRING_DESC_F_WRITE) {
            min = MIN(le32toh(desc[i].len), len);
            vu_log_write(dev, le64toh(desc[i].addr), min);
            len -= min;
        }

    } while (len > 0 &&
             (virtqueue_read_next_desc(dev, desc, i, max, &i)
              == VIRTQUEUE_READ_DESC_MORE));
}

void
vu_queue_fill(VuDev *dev, VuVirtq *vq,
              const VuVirtqElement *elem,
              unsigned int len, unsigned int idx)
{
    struct vring_used_elem uelem;

    if (unlikely(dev->broken) ||
        unlikely(!vq->vring.avail)) {
        return;
    }

    vu_log_queue_fill(dev, vq, elem, len);

    idx = (idx + vq->used_idx) % vq->vring.num;

    uelem.id = htole32(elem->index);
    uelem.len = htole32(len);
    vring_used_write(dev, vq, &uelem, idx);
}
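
/*
 * vu_queue_fill() and vu_queue_flush() (below) can complete a batch of
 * elements with a single used-index update and, usually, a single guest
 * notification.  Sketch only; `elems[]` and `lens[]` are hypothetical arrays
 * holding n previously popped elements and the byte counts written into
 * their in_sg buffers:
 *
 *     for (i = 0; i < n; i++) {
 *         vu_queue_fill(dev, vq, elems[i], lens[i], i);
 *     }
 *     vu_queue_flush(dev, vq, n);
 *     vu_queue_notify(dev, vq);
 *
 * vu_queue_push() below is the single-element variant; it also takes care of
 * the inflight bookkeeping used with VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD.
 */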
static inline
void vring_used_idx_set(VuDev *dev, VuVirtq *vq, uint16_t val)
{
    vq->vring.used->idx = htole16(val);
    vu_log_write(dev,
                 vq->vring.log_guest_addr + offsetof(struct vring_used, idx),
                 sizeof(vq->vring.used->idx));

    vq->used_idx = val;
}

void
vu_queue_flush(VuDev *dev, VuVirtq *vq, unsigned int count)
{
    uint16_t old, new;

    if (unlikely(dev->broken) ||
        unlikely(!vq->vring.avail)) {
        return;
    }

    /* Make sure buffer is written before we update index. */
    smp_wmb();

    old = vq->used_idx;
    new = old + count;
    vring_used_idx_set(dev, vq, new);
    vq->inuse -= count;
    if (unlikely((int16_t)(new - vq->signalled_used) < (uint16_t)(new - old))) {
        vq->signalled_used_valid = false;
    }
}

void
vu_queue_push(VuDev *dev, VuVirtq *vq,
              const VuVirtqElement *elem, unsigned int len)
{
    vu_queue_fill(dev, vq, elem, len, 0);
    vu_queue_inflight_pre_put(dev, vq, elem->index);
    vu_queue_flush(dev, vq, 1);
    vu_queue_inflight_post_put(dev, vq, elem->index);
}
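
/*
 * Putting the pieces together: a typical per-request handler pops an
 * element, reads the driver-filled out_sg buffers, writes its reply into the
 * device-writable in_sg buffers, then pushes and notifies.  This is a hedged
 * sketch of a trivial "echo" handler, not part of the library:
 *
 *     static void echo_one_request(VuDev *dev, VuVirtq *vq)
 *     {
 *         VuVirtqElement *elem;
 *         size_t written = 0;
 *         unsigned int i;
 *
 *         elem = vu_queue_pop(dev, vq, sizeof(*elem));
 *         if (!elem) {
 *             return;
 *         }
 *
 *         // Copy as much of the request as fits into the reply buffers.
 *         for (i = 0; i < elem->in_num && i < elem->out_num; i++) {
 *             size_t n = MIN(elem->in_sg[i].iov_len,
 *                            elem->out_sg[i].iov_len);
 *             memcpy(elem->in_sg[i].iov_base, elem->out_sg[i].iov_base, n);
 *             written += n;
 *         }
 *
 *         vu_queue_push(dev, vq, elem, written);
 *         vu_queue_notify(dev, vq);
 *         free(elem);
 *     }
 */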