/*
 * Vhost User library
 *
 * Copyright IBM, Corp. 2007
 * Copyright (c) 2016 Red Hat, Inc.
 *
 * Authors:
 *  Anthony Liguori <aliguori@us.ibm.com>
 *  Marc-André Lureau <mlureau@redhat.com>
 *  Victor Kaplansky <victork@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or
 * later.  See the COPYING file in the top-level directory.
 */

/* this code avoids GLib dependency */
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <stdarg.h>
#include <errno.h>
#include <string.h>
#include <assert.h>
#include <inttypes.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/eventfd.h>
#include <sys/mman.h>
#include <endian.h>

#if defined(__linux__)
#include <sys/syscall.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/vhost.h>

#ifdef __NR_userfaultfd
#include <linux/userfaultfd.h>
#endif

#endif

#include "include/atomic.h"

#include "libvhost-user.h"

/* usually provided by GLib */
#ifndef MIN
#define MIN(x, y) ({                            \
            typeof(x) _min1 = (x);              \
            typeof(y) _min2 = (y);              \
            (void) (&_min1 == &_min2);          \
            _min1 < _min2 ? _min1 : _min2; })
#endif

/* Round number down to multiple */
#define ALIGN_DOWN(n, m) ((n) / (m) * (m))

/* Round number up to multiple */
#define ALIGN_UP(n, m) ALIGN_DOWN((n) + (m) - 1, (m))

#ifndef unlikely
#define unlikely(x)   __builtin_expect(!!(x), 0)
#endif

/* Align each region to cache line size in inflight buffer */
#define INFLIGHT_ALIGNMENT 64

/* The version of inflight buffer */
#define INFLIGHT_VERSION 1

/* The version of the protocol we support */
#define VHOST_USER_VERSION 1
#define LIBVHOST_USER_DEBUG 0

#define DPRINT(...)                             \
    do {                                        \
        if (LIBVHOST_USER_DEBUG) {              \
            fprintf(stderr, __VA_ARGS__);       \
        }                                       \
    } while (0)
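
/*
 * Feature-bit helpers: 'fbit' is a bit number (0..63), not a mask;
 * e.g. has_feature(features, VIRTIO_F_VERSION_1) tests bit 32.
 */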
static inline
bool has_feature(uint64_t features, unsigned int fbit)
{
    assert(fbit < 64);
    return !!(features & (1ULL << fbit));
}

static inline
bool vu_has_feature(VuDev *dev,
                    unsigned int fbit)
{
    return has_feature(dev->features, fbit);
}

static inline bool vu_has_protocol_feature(VuDev *dev, unsigned int fbit)
{
    return has_feature(dev->protocol_features, fbit);
}

static const char *
vu_request_to_string(unsigned int req)
{
#define REQ(req) [req] = #req
    static const char *vu_request_str[] = {
        REQ(VHOST_USER_NONE),
        REQ(VHOST_USER_GET_FEATURES),
        REQ(VHOST_USER_SET_FEATURES),
        REQ(VHOST_USER_SET_OWNER),
        REQ(VHOST_USER_RESET_OWNER),
        REQ(VHOST_USER_SET_MEM_TABLE),
        REQ(VHOST_USER_SET_LOG_BASE),
        REQ(VHOST_USER_SET_LOG_FD),
        REQ(VHOST_USER_SET_VRING_NUM),
        REQ(VHOST_USER_SET_VRING_ADDR),
        REQ(VHOST_USER_SET_VRING_BASE),
        REQ(VHOST_USER_GET_VRING_BASE),
        REQ(VHOST_USER_SET_VRING_KICK),
        REQ(VHOST_USER_SET_VRING_CALL),
        REQ(VHOST_USER_SET_VRING_ERR),
        REQ(VHOST_USER_GET_PROTOCOL_FEATURES),
        REQ(VHOST_USER_SET_PROTOCOL_FEATURES),
        REQ(VHOST_USER_GET_QUEUE_NUM),
        REQ(VHOST_USER_SET_VRING_ENABLE),
        REQ(VHOST_USER_SEND_RARP),
        REQ(VHOST_USER_NET_SET_MTU),
        REQ(VHOST_USER_SET_SLAVE_REQ_FD),
        REQ(VHOST_USER_IOTLB_MSG),
        REQ(VHOST_USER_SET_VRING_ENDIAN),
        REQ(VHOST_USER_GET_CONFIG),
        REQ(VHOST_USER_SET_CONFIG),
        REQ(VHOST_USER_POSTCOPY_ADVISE),
        REQ(VHOST_USER_POSTCOPY_LISTEN),
        REQ(VHOST_USER_POSTCOPY_END),
        REQ(VHOST_USER_GET_INFLIGHT_FD),
        REQ(VHOST_USER_SET_INFLIGHT_FD),
        REQ(VHOST_USER_GPU_SET_SOCKET),
        REQ(VHOST_USER_VRING_KICK),
        REQ(VHOST_USER_GET_MAX_MEM_SLOTS),
        REQ(VHOST_USER_ADD_MEM_REG),
        REQ(VHOST_USER_REM_MEM_REG),
        REQ(VHOST_USER_MAX),
    };
#undef REQ

    if (req < VHOST_USER_MAX) {
        return vu_request_str[req];
    } else {
        return "unknown";
    }
}

static void
vu_panic(VuDev *dev, const char *msg, ...)
{
    char *buf = NULL;
    va_list ap;

    va_start(ap, msg);
    if (vasprintf(&buf, msg, ap) < 0) {
        buf = NULL;
    }
    va_end(ap);

    dev->broken = true;
    dev->panic(dev, buf);
    free(buf);

    /*
     * FIXME:
     * find a way to call virtio_error, or perhaps close the connection?
     */
}

/* Translate guest physical address to our virtual address.  */
void *
vu_gpa_to_va(VuDev *dev, uint64_t *plen, uint64_t guest_addr)
{
    int i;

    if (*plen == 0) {
        return NULL;
    }

    /* Find matching memory region.  */
    for (i = 0; i < dev->nregions; i++) {
        VuDevRegion *r = &dev->regions[i];

        if ((guest_addr >= r->gpa) && (guest_addr < (r->gpa + r->size))) {
            if ((guest_addr + *plen) > (r->gpa + r->size)) {
                *plen = r->gpa + r->size - guest_addr;
            }
            return (void *)(uintptr_t)
                guest_addr - r->gpa + r->mmap_addr + r->mmap_offset;
        }
    }

    return NULL;
}

/* Translate qemu virtual address to our virtual address.  */
static void *
qva_to_va(VuDev *dev, uint64_t qemu_addr)
{
    int i;

    /* Find matching memory region.  */
    for (i = 0; i < dev->nregions; i++) {
        VuDevRegion *r = &dev->regions[i];

        if ((qemu_addr >= r->qva) && (qemu_addr < (r->qva + r->size))) {
            return (void *)(uintptr_t)
                qemu_addr - r->qva + r->mmap_addr + r->mmap_offset;
        }
    }

    return NULL;
}

static void
vmsg_close_fds(VhostUserMsg *vmsg)
{
    int i;

    for (i = 0; i < vmsg->fd_num; i++) {
        close(vmsg->fds[i]);
    }
}

/* Set reply payload.u64 and clear request flags and fd_num */
static void vmsg_set_reply_u64(VhostUserMsg *vmsg, uint64_t val)
{
    vmsg->flags = 0; /* defaults will be set by vu_send_reply() */
    vmsg->size = sizeof(vmsg->payload.u64);
    vmsg->payload.u64 = val;
    vmsg->fd_num = 0;
}

/* A test to see if we have userfault available */
static bool
have_userfault(void)
{
#if defined(__linux__) && defined(__NR_userfaultfd) &&\
        defined(UFFD_FEATURE_MISSING_SHMEM) &&\
        defined(UFFD_FEATURE_MISSING_HUGETLBFS)
    /* Now test the kernel we're running on really has the features */
    int ufd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
    struct uffdio_api api_struct;
    if (ufd < 0) {
        return false;
    }

    api_struct.api = UFFD_API;
    api_struct.features = UFFD_FEATURE_MISSING_SHMEM |
                          UFFD_FEATURE_MISSING_HUGETLBFS;
    if (ioctl(ufd, UFFDIO_API, &api_struct)) {
        close(ufd);
        return false;
    }
    close(ufd);
    return true;

#else
    return false;
#endif
}

static bool
vu_message_read_default(VuDev *dev, int conn_fd, VhostUserMsg *vmsg)
{
    char control[CMSG_SPACE(VHOST_MEMORY_BASELINE_NREGIONS * sizeof(int))] = {};
    struct iovec iov = {
        .iov_base = (char *)vmsg,
        .iov_len = VHOST_USER_HDR_SIZE,
    };
    struct msghdr msg = {
        .msg_iov = &iov,
        .msg_iovlen = 1,
        .msg_control = control,
        .msg_controllen = sizeof(control),
    };
    size_t fd_size;
    struct cmsghdr *cmsg;
    int rc;

    do {
        rc = recvmsg(conn_fd, &msg, 0);
    } while (rc < 0 && (errno == EINTR || errno == EAGAIN));

    if (rc < 0) {
        vu_panic(dev, "Error while recvmsg: %s", strerror(errno));
        return false;
    }

    vmsg->fd_num = 0;
    for (cmsg = CMSG_FIRSTHDR(&msg);
         cmsg != NULL;
         cmsg = CMSG_NXTHDR(&msg, cmsg))
    {
        if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) {
            fd_size = cmsg->cmsg_len - CMSG_LEN(0);
            vmsg->fd_num = fd_size / sizeof(int);
            memcpy(vmsg->fds, CMSG_DATA(cmsg), fd_size);
            break;
        }
    }

    if (vmsg->size > sizeof(vmsg->payload)) {
        vu_panic(dev,
                 "Error: too big message request: %d, size: vmsg->size: %u, "
                 "while sizeof(vmsg->payload) = %zu\n",
                 vmsg->request, vmsg->size, sizeof(vmsg->payload));
        goto fail;
    }

    if (vmsg->size) {
        do {
            rc = read(conn_fd, &vmsg->payload, vmsg->size);
        } while (rc < 0 && (errno == EINTR || errno == EAGAIN));

        if (rc <= 0) {
            vu_panic(dev, "Error while reading: %s", strerror(errno));
            goto fail;
        }

        assert(rc == vmsg->size);
    }

    return true;

fail:
    vmsg_close_fds(vmsg);

    return false;
}

static bool
vu_message_write(VuDev *dev, int conn_fd, VhostUserMsg *vmsg)
{
    int rc;
    uint8_t *p = (uint8_t *)vmsg;
    char control[CMSG_SPACE(VHOST_MEMORY_BASELINE_NREGIONS * sizeof(int))] = {};
    struct iovec iov = {
        .iov_base = (char *)vmsg,
        .iov_len = VHOST_USER_HDR_SIZE,
    };
    struct msghdr msg = {
        .msg_iov = &iov,
        .msg_iovlen = 1,
        .msg_control = control,
    };
    struct cmsghdr *cmsg;

    memset(control, 0, sizeof(control));
    assert(vmsg->fd_num <= VHOST_MEMORY_BASELINE_NREGIONS);
    if (vmsg->fd_num > 0) {
        size_t fdsize = vmsg->fd_num * sizeof(int);
        msg.msg_controllen = CMSG_SPACE(fdsize);
        cmsg = CMSG_FIRSTHDR(&msg);
        cmsg->cmsg_len = CMSG_LEN(fdsize);
        cmsg->cmsg_level = SOL_SOCKET;
        cmsg->cmsg_type = SCM_RIGHTS;
        memcpy(CMSG_DATA(cmsg), vmsg->fds, fdsize);
    } else {
        msg.msg_controllen = 0;
    }

    do {
        rc = sendmsg(conn_fd, &msg, 0);
    } while (rc < 0 && (errno == EINTR || errno == EAGAIN));

    if (vmsg->size) {
        do {
            if (vmsg->data) {
                rc = write(conn_fd, vmsg->data, vmsg->size);
            } else {
                rc = write(conn_fd, p + VHOST_USER_HDR_SIZE, vmsg->size);
            }
        } while (rc < 0 && (errno == EINTR || errno == EAGAIN));
    }

    if (rc <= 0) {
        vu_panic(dev, "Error while writing: %s", strerror(errno));
        return false;
    }

    return true;
}

static bool
vu_send_reply(VuDev *dev, int conn_fd, VhostUserMsg *vmsg)
{
    /* Set the version in the flags when sending the reply */
    vmsg->flags &= ~VHOST_USER_VERSION_MASK;
    vmsg->flags |= VHOST_USER_VERSION;
    vmsg->flags |= VHOST_USER_REPLY_MASK;

    return vu_message_write(dev, conn_fd, vmsg);
}

/*
 * Processes a reply on the slave channel.
 * Entered with slave_mutex held and releases it before exit.
 * Returns true on success.
 */
static bool
vu_process_message_reply(VuDev *dev, const VhostUserMsg *vmsg)
{
    VhostUserMsg msg_reply;
    bool result = false;

    if ((vmsg->flags & VHOST_USER_NEED_REPLY_MASK) == 0) {
        result = true;
        goto out;
    }

    if (!vu_message_read_default(dev, dev->slave_fd, &msg_reply)) {
        goto out;
    }

    if (msg_reply.request != vmsg->request) {
        DPRINT("Received unexpected msg type. Expected %d received %d",
               vmsg->request, msg_reply.request);
        goto out;
    }

    result = msg_reply.payload.u64 == 0;

out:
    pthread_mutex_unlock(&dev->slave_mutex);
    return result;
}
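
/*
 * Dirty-memory logging: when VHOST_F_LOG_ALL is negotiated, writes into
 * guest memory are recorded as one bit per VHOST_LOG_PAGE-sized page in
 * the shared log table, and QEMU is then notified through log_call_fd.
 */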
/* Kick the log_call_fd if required. */
static void
vu_log_kick(VuDev *dev)
{
    if (dev->log_call_fd != -1) {
        DPRINT("Kicking the QEMU's log...\n");
        if (eventfd_write(dev->log_call_fd, 1) < 0) {
            vu_panic(dev, "Error writing eventfd: %s", strerror(errno));
        }
    }
}

static void
vu_log_page(uint8_t *log_table, uint64_t page)
{
    DPRINT("Logged dirty guest page: %"PRId64"\n", page);
    qatomic_or(&log_table[page / 8], 1 << (page % 8));
}

static void
vu_log_write(VuDev *dev, uint64_t address, uint64_t length)
{
    uint64_t page;

    if (!(dev->features & (1ULL << VHOST_F_LOG_ALL)) ||
        !dev->log_table || !length) {
        return;
    }

    assert(dev->log_size > ((address + length - 1) / VHOST_LOG_PAGE / 8));

    page = address / VHOST_LOG_PAGE;
    while (page * VHOST_LOG_PAGE < address + length) {
        vu_log_page(dev->log_table, page);
        page += 1;
    }

    vu_log_kick(dev);
}

static void
vu_kick_cb(VuDev *dev, int condition, void *data)
{
    int index = (intptr_t)data;
    VuVirtq *vq = &dev->vq[index];
    int sock = vq->kick_fd;
    eventfd_t kick_data;
    ssize_t rc;

    rc = eventfd_read(sock, &kick_data);
    if (rc == -1) {
        vu_panic(dev, "kick eventfd_read(): %s", strerror(errno));
        dev->remove_watch(dev, dev->vq[index].kick_fd);
    } else {
        DPRINT("Got kick_data: %016"PRIx64" handler:%p idx:%d\n",
               kick_data, vq->handler, index);
        if (vq->handler) {
            vq->handler(dev, index);
        }
    }
}

static bool
vu_get_features_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    vmsg->payload.u64 =
        /*
         * The following VIRTIO feature bits are supported by our virtqueue
         * implementation:
         */
        1ULL << VIRTIO_F_NOTIFY_ON_EMPTY |
        1ULL << VIRTIO_RING_F_INDIRECT_DESC |
        1ULL << VIRTIO_RING_F_EVENT_IDX |
        1ULL << VIRTIO_F_VERSION_1 |

        /* vhost-user feature bits */
        1ULL << VHOST_F_LOG_ALL |
        1ULL << VHOST_USER_F_PROTOCOL_FEATURES;

    if (dev->iface->get_features) {
        vmsg->payload.u64 |= dev->iface->get_features(dev);
    }

    vmsg->size = sizeof(vmsg->payload.u64);
    vmsg->fd_num = 0;

    DPRINT("Sending back to guest u64: 0x%016"PRIx64"\n", vmsg->payload.u64);

    return true;
}

static void
vu_set_enable_all_rings(VuDev *dev, bool enabled)
{
    uint16_t i;

    for (i = 0; i < dev->max_queues; i++) {
        dev->vq[i].enable = enabled;
    }
}

static bool
vu_set_features_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64);

    dev->features = vmsg->payload.u64;
    if (!vu_has_feature(dev, VIRTIO_F_VERSION_1)) {
        /*
         * We only support devices conforming to VIRTIO 1.0 or
         * later
         */
        vu_panic(dev, "virtio legacy devices aren't supported by libvhost-user");
        return false;
    }

    if (!(dev->features & VHOST_USER_F_PROTOCOL_FEATURES)) {
        vu_set_enable_all_rings(dev, true);
    }

    if (dev->iface->set_features) {
        dev->iface->set_features(dev, dev->features);
    }

    return false;
}

static bool
vu_set_owner_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    return false;
}

static void
vu_close_log(VuDev *dev)
{
    if (dev->log_table) {
        if (munmap(dev->log_table, dev->log_size) != 0) {
            perror("close log munmap() error");
        }

        dev->log_table = NULL;
    }
    if (dev->log_call_fd != -1) {
        close(dev->log_call_fd);
        dev->log_call_fd = -1;
    }
}
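
/*
 * The message handlers below return true when vmsg has been turned into
 * a reply that vu_dispatch() must send back to the master, and false
 * when no reply is needed.
 */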

static bool
vu_reset_device_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    vu_set_enable_all_rings(dev, false);

    return false;
}

static bool
map_ring(VuDev *dev, VuVirtq *vq)
{
    vq->vring.desc = qva_to_va(dev, vq->vra.desc_user_addr);
    vq->vring.used = qva_to_va(dev, vq->vra.used_user_addr);
    vq->vring.avail = qva_to_va(dev, vq->vra.avail_user_addr);

    DPRINT("Setting virtq addresses:\n");
    DPRINT("    vring_desc  at %p\n", vq->vring.desc);
    DPRINT("    vring_used  at %p\n", vq->vring.used);
    DPRINT("    vring_avail at %p\n", vq->vring.avail);

    return !(vq->vring.desc && vq->vring.used && vq->vring.avail);
}

static bool
generate_faults(VuDev *dev) {
    int i;
    for (i = 0; i < dev->nregions; i++) {
        VuDevRegion *dev_region = &dev->regions[i];
        int ret;
#ifdef UFFDIO_REGISTER
        /*
         * We should already have an open ufd. Mark each memory
         * range as ufd.
         * Discard any mapping we have here; note I can't use MADV_REMOVE
         * or fallocate to make the hole since I don't want to lose
         * data that's already arrived in the shared process.
         * TODO: How to do hugepage
         */
        ret = madvise((void *)(uintptr_t)dev_region->mmap_addr,
                      dev_region->size + dev_region->mmap_offset,
                      MADV_DONTNEED);
        if (ret) {
            fprintf(stderr,
                    "%s: Failed to madvise(DONTNEED) region %d: %s\n",
                    __func__, i, strerror(errno));
        }
        /*
         * Turn off transparent hugepages so we don't lose wakeups
         * in neighbouring pages.
         * TODO: Turn this back on later.
         */
        ret = madvise((void *)(uintptr_t)dev_region->mmap_addr,
                      dev_region->size + dev_region->mmap_offset,
                      MADV_NOHUGEPAGE);
        if (ret) {
            /*
             * Note: This can happen legally on kernels that are configured
             * without madvise'able hugepages
             */
            fprintf(stderr,
                    "%s: Failed to madvise(NOHUGEPAGE) region %d: %s\n",
                    __func__, i, strerror(errno));
        }
        struct uffdio_register reg_struct;
        reg_struct.range.start = (uintptr_t)dev_region->mmap_addr;
        reg_struct.range.len = dev_region->size + dev_region->mmap_offset;
        reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;

        if (ioctl(dev->postcopy_ufd, UFFDIO_REGISTER, &reg_struct)) {
            vu_panic(dev, "%s: Failed to userfault region %d "
                          "@%p + size:%zx offset: %zx: (ufd=%d)%s\n",
                     __func__, i,
                     dev_region->mmap_addr,
                     dev_region->size, dev_region->mmap_offset,
                     dev->postcopy_ufd, strerror(errno));
            return false;
        }
        if (!(reg_struct.ioctls & ((__u64)1 << _UFFDIO_COPY))) {
            vu_panic(dev, "%s Region (%d) doesn't support COPY",
                     __func__, i);
            return false;
        }
        DPRINT("%s: region %d: Registered userfault for %"
               PRIx64 " + %" PRIx64 "\n", __func__, i,
               (uint64_t)reg_struct.range.start,
               (uint64_t)reg_struct.range.len);
        /* Now it's registered we can let the client at it */
        if (mprotect((void *)(uintptr_t)dev_region->mmap_addr,
                     dev_region->size + dev_region->mmap_offset,
                     PROT_READ | PROT_WRITE)) {
            vu_panic(dev, "failed to mprotect region %d for postcopy (%s)",
                     i, strerror(errno));
            return false;
        }
        /* TODO: Stash 'zero' support flags somewhere */
#endif
    }

    return true;
}

static bool
vu_add_mem_reg(VuDev *dev, VhostUserMsg *vmsg) {
    int i;
    bool track_ramblocks = dev->postcopy_listening;
    VhostUserMemoryRegion m = vmsg->payload.memreg.region, *msg_region = &m;
    VuDevRegion *dev_region = &dev->regions[dev->nregions];
    void *mmap_addr;
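
    /* Exactly one region descriptor and one fd to mmap are expected. */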

    if (vmsg->fd_num != 1) {
        vmsg_close_fds(vmsg);
        vu_panic(dev, "VHOST_USER_ADD_MEM_REG received %d fds - only 1 fd "
                      "should be sent for this message type", vmsg->fd_num);
        return false;
    }

    if (vmsg->size < VHOST_USER_MEM_REG_SIZE) {
        close(vmsg->fds[0]);
        vu_panic(dev, "VHOST_USER_ADD_MEM_REG requires a message size of at "
                      "least %d bytes and only %d bytes were received",
                 VHOST_USER_MEM_REG_SIZE, vmsg->size);
        return false;
    }

    if (dev->nregions == VHOST_USER_MAX_RAM_SLOTS) {
        close(vmsg->fds[0]);
        vu_panic(dev, "failing attempt to hot add memory via "
                      "VHOST_USER_ADD_MEM_REG message because the backend has "
                      "no free ram slots available");
        return false;
    }

    /*
     * If we are in postcopy mode and we receive a u64 payload with a 0 value
     * we know all the postcopy client bases have been received, and we
     * should start generating faults.
     */
    if (track_ramblocks &&
        vmsg->size == sizeof(vmsg->payload.u64) &&
        vmsg->payload.u64 == 0) {
        (void)generate_faults(dev);
        return false;
    }

    DPRINT("Adding region: %u\n", dev->nregions);
    DPRINT("    guest_phys_addr: 0x%016"PRIx64"\n",
           msg_region->guest_phys_addr);
    DPRINT("    memory_size:     0x%016"PRIx64"\n",
           msg_region->memory_size);
    DPRINT("    userspace_addr   0x%016"PRIx64"\n",
           msg_region->userspace_addr);
    DPRINT("    mmap_offset      0x%016"PRIx64"\n",
           msg_region->mmap_offset);

    dev_region->gpa = msg_region->guest_phys_addr;
    dev_region->size = msg_region->memory_size;
    dev_region->qva = msg_region->userspace_addr;
    dev_region->mmap_offset = msg_region->mmap_offset;

    /*
     * We don't use offset argument of mmap() since the
     * mapped address has to be page aligned, and we use huge
     * pages.
     */
    if (track_ramblocks) {
        /*
         * In postcopy we're using PROT_NONE here to catch anyone
         * accessing it before we userfault.
         */
        mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset,
                         PROT_NONE, MAP_SHARED | MAP_NORESERVE,
                         vmsg->fds[0], 0);
    } else {
        mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset,
                         PROT_READ | PROT_WRITE, MAP_SHARED | MAP_NORESERVE,
                         vmsg->fds[0], 0);
    }

    if (mmap_addr == MAP_FAILED) {
        vu_panic(dev, "region mmap error: %s", strerror(errno));
    } else {
        dev_region->mmap_addr = (uint64_t)(uintptr_t)mmap_addr;
        DPRINT("    mmap_addr:       0x%016"PRIx64"\n",
               dev_region->mmap_addr);
    }

    close(vmsg->fds[0]);

    if (track_ramblocks) {
        /*
         * Return the address to QEMU so that it can translate the ufd
         * fault addresses back.
         */
        msg_region->userspace_addr = (uintptr_t)(mmap_addr +
                                                 dev_region->mmap_offset);

        /* Send the message back to qemu with the addresses filled in. */
        vmsg->fd_num = 0;
        if (!vu_send_reply(dev, dev->sock, vmsg)) {
            vu_panic(dev, "failed to respond to add-mem-region for postcopy");
            return false;
        }

        DPRINT("Successfully added new region in postcopy\n");
        dev->nregions++;
        return false;

    } else {
        for (i = 0; i < dev->max_queues; i++) {
            if (dev->vq[i].vring.desc) {
                if (map_ring(dev, &dev->vq[i])) {
                    vu_panic(dev, "remapping queue %d for new memory region",
                             i);
                }
            }
        }

        DPRINT("Successfully added new region\n");
        dev->nregions++;
        vmsg_set_reply_u64(vmsg, 0);
        return true;
    }
}

static inline bool reg_equal(VuDevRegion *vudev_reg,
                             VhostUserMemoryRegion *msg_reg)
{
    if (vudev_reg->gpa == msg_reg->guest_phys_addr &&
        vudev_reg->qva == msg_reg->userspace_addr &&
        vudev_reg->size == msg_reg->memory_size) {
        return true;
    }

    return false;
}

static bool
vu_rem_mem_reg(VuDev *dev, VhostUserMsg *vmsg) {
    VhostUserMemoryRegion m = vmsg->payload.memreg.region, *msg_region = &m;
    int i;
    bool found = false;

    if (vmsg->fd_num != 1) {
        vmsg_close_fds(vmsg);
        vu_panic(dev, "VHOST_USER_REM_MEM_REG received %d fds - only 1 fd "
                      "should be sent for this message type", vmsg->fd_num);
        return false;
    }

    if (vmsg->size < VHOST_USER_MEM_REG_SIZE) {
        close(vmsg->fds[0]);
        vu_panic(dev, "VHOST_USER_REM_MEM_REG requires a message size of at "
                      "least %d bytes and only %d bytes were received",
                 VHOST_USER_MEM_REG_SIZE, vmsg->size);
        return false;
    }

    DPRINT("Removing region:\n");
    DPRINT("    guest_phys_addr: 0x%016"PRIx64"\n",
           msg_region->guest_phys_addr);
    DPRINT("    memory_size:     0x%016"PRIx64"\n",
           msg_region->memory_size);
    DPRINT("    userspace_addr   0x%016"PRIx64"\n",
           msg_region->userspace_addr);
    DPRINT("    mmap_offset      0x%016"PRIx64"\n",
           msg_region->mmap_offset);

    for (i = 0; i < dev->nregions; i++) {
        if (reg_equal(&dev->regions[i], msg_region)) {
            VuDevRegion *r = &dev->regions[i];
            void *m = (void *) (uintptr_t) r->mmap_addr;

            if (m) {
                munmap(m, r->size + r->mmap_offset);
            }

            /*
             * Shift all affected entries by 1 to close the hole at index i and
             * zero out the last entry.
             */
            memmove(dev->regions + i, dev->regions + i + 1,
                    sizeof(VuDevRegion) * (dev->nregions - i - 1));
            memset(dev->regions + dev->nregions - 1, 0, sizeof(VuDevRegion));
            DPRINT("Successfully removed a region\n");
            dev->nregions--;
            i--;

            found = true;

            /* Continue the search for possible duplicates. */
        }
    }

    if (found) {
        vmsg_set_reply_u64(vmsg, 0);
    } else {
        vu_panic(dev, "Specified region not found\n");
    }

    close(vmsg->fds[0]);

    return true;
}

static bool
vu_set_mem_table_exec_postcopy(VuDev *dev, VhostUserMsg *vmsg)
{
    int i;
    VhostUserMemory m = vmsg->payload.memory, *memory = &m;
    dev->nregions = memory->nregions;

    DPRINT("Nregions: %u\n", memory->nregions);
    for (i = 0; i < dev->nregions; i++) {
        void *mmap_addr;
        VhostUserMemoryRegion *msg_region = &memory->regions[i];
        VuDevRegion *dev_region = &dev->regions[i];

        DPRINT("Region %d\n", i);
        DPRINT("    guest_phys_addr: 0x%016"PRIx64"\n",
               msg_region->guest_phys_addr);
        DPRINT("    memory_size:     0x%016"PRIx64"\n",
               msg_region->memory_size);
        DPRINT("    userspace_addr   0x%016"PRIx64"\n",
               msg_region->userspace_addr);
        DPRINT("    mmap_offset      0x%016"PRIx64"\n",
               msg_region->mmap_offset);

        dev_region->gpa = msg_region->guest_phys_addr;
        dev_region->size = msg_region->memory_size;
        dev_region->qva = msg_region->userspace_addr;
        dev_region->mmap_offset = msg_region->mmap_offset;

        /* We don't use offset argument of mmap() since the
         * mapped address has to be page aligned, and we use huge
         * pages.
         * In postcopy we're using PROT_NONE here to catch anyone
         * accessing it before we userfault
         */
        mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset,
                         PROT_NONE, MAP_SHARED | MAP_NORESERVE,
                         vmsg->fds[i], 0);

        if (mmap_addr == MAP_FAILED) {
            vu_panic(dev, "region mmap error: %s", strerror(errno));
        } else {
            dev_region->mmap_addr = (uint64_t)(uintptr_t)mmap_addr;
            DPRINT("    mmap_addr:       0x%016"PRIx64"\n",
                   dev_region->mmap_addr);
        }

        /* Return the address to QEMU so that it can translate the ufd
         * fault addresses back.
         */
        msg_region->userspace_addr = (uintptr_t)(mmap_addr +
                                                 dev_region->mmap_offset);
        close(vmsg->fds[i]);
    }

    /* Send the message back to qemu with the addresses filled in */
    vmsg->fd_num = 0;
    if (!vu_send_reply(dev, dev->sock, vmsg)) {
        vu_panic(dev, "failed to respond to set-mem-table for postcopy");
        return false;
    }

    /* Wait for QEMU to confirm that it's registered the handler for the
     * faults.
     */
    if (!dev->read_msg(dev, dev->sock, vmsg) ||
        vmsg->size != sizeof(vmsg->payload.u64) ||
        vmsg->payload.u64 != 0) {
        vu_panic(dev, "failed to receive valid ack for postcopy set-mem-table");
        return false;
    }

    /* OK, now we can go and register the memory and generate faults */
    (void)generate_faults(dev);

    return false;
}

static bool
vu_set_mem_table_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    int i;
    VhostUserMemory m = vmsg->payload.memory, *memory = &m;

    for (i = 0; i < dev->nregions; i++) {
        VuDevRegion *r = &dev->regions[i];
        void *m = (void *) (uintptr_t) r->mmap_addr;

        if (m) {
            munmap(m, r->size + r->mmap_offset);
        }
    }
    dev->nregions = memory->nregions;

    if (dev->postcopy_listening) {
        return vu_set_mem_table_exec_postcopy(dev, vmsg);
    }

    DPRINT("Nregions: %u\n", memory->nregions);
    for (i = 0; i < dev->nregions; i++) {
        void *mmap_addr;
        VhostUserMemoryRegion *msg_region = &memory->regions[i];
        VuDevRegion *dev_region = &dev->regions[i];

        DPRINT("Region %d\n", i);
        DPRINT("    guest_phys_addr: 0x%016"PRIx64"\n",
               msg_region->guest_phys_addr);
        DPRINT("    memory_size:     0x%016"PRIx64"\n",
               msg_region->memory_size);
        DPRINT("    userspace_addr   0x%016"PRIx64"\n",
               msg_region->userspace_addr);
        DPRINT("    mmap_offset      0x%016"PRIx64"\n",
               msg_region->mmap_offset);

        dev_region->gpa = msg_region->guest_phys_addr;
        dev_region->size = msg_region->memory_size;
        dev_region->qva = msg_region->userspace_addr;
        dev_region->mmap_offset = msg_region->mmap_offset;

        /* We don't use offset argument of mmap() since the
         * mapped address has to be page aligned, and we use huge
         * pages.
         */
        mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset,
                         PROT_READ | PROT_WRITE, MAP_SHARED | MAP_NORESERVE,
                         vmsg->fds[i], 0);

        if (mmap_addr == MAP_FAILED) {
            vu_panic(dev, "region mmap error: %s", strerror(errno));
        } else {
            dev_region->mmap_addr = (uint64_t)(uintptr_t)mmap_addr;
            DPRINT("    mmap_addr:       0x%016"PRIx64"\n",
                   dev_region->mmap_addr);
        }

        close(vmsg->fds[i]);
    }

    for (i = 0; i < dev->max_queues; i++) {
        if (dev->vq[i].vring.desc) {
            if (map_ring(dev, &dev->vq[i])) {
                vu_panic(dev, "remapping queue %d during setmemtable", i);
            }
        }
    }

    return false;
}

static bool
vu_set_log_base_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    int fd;
    uint64_t log_mmap_size, log_mmap_offset;
    void *rc;

    if (vmsg->fd_num != 1 ||
        vmsg->size != sizeof(vmsg->payload.log)) {
        vu_panic(dev, "Invalid log_base message");
        return true;
    }

    fd = vmsg->fds[0];
    log_mmap_offset = vmsg->payload.log.mmap_offset;
    log_mmap_size = vmsg->payload.log.mmap_size;
    DPRINT("Log mmap_offset: %"PRId64"\n", log_mmap_offset);
    DPRINT("Log mmap_size:   %"PRId64"\n", log_mmap_size);

    rc = mmap(0, log_mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd,
              log_mmap_offset);
    close(fd);
    if (rc == MAP_FAILED) {
        perror("log mmap error");
    }

    if (dev->log_table) {
        munmap(dev->log_table, dev->log_size);
    }
    dev->log_table = rc;
    dev->log_size = log_mmap_size;

    vmsg->size = sizeof(vmsg->payload.u64);
    vmsg->fd_num = 0;

    return true;
}

static bool
vu_set_log_fd_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    if (vmsg->fd_num != 1) {
        vu_panic(dev, "Invalid log_fd message");
        return false;
    }

    if (dev->log_call_fd != -1) {
        close(dev->log_call_fd);
    }
    dev->log_call_fd = vmsg->fds[0];
    DPRINT("Got log_call_fd: %d\n", vmsg->fds[0]);

    return false;
}

static bool
vu_set_vring_num_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    unsigned int index = vmsg->payload.state.index;
    unsigned int num = vmsg->payload.state.num;

    DPRINT("State.index: %u\n", index);
    DPRINT("State.num:   %u\n", num);
    dev->vq[index].vring.num = num;

    return false;
}

static bool
vu_set_vring_addr_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    struct vhost_vring_addr addr = vmsg->payload.addr, *vra = &addr;
    unsigned int index = vra->index;
    VuVirtq *vq = &dev->vq[index];

    DPRINT("vhost_vring_addr:\n");
    DPRINT("    index:  %d\n", vra->index);
    DPRINT("    flags:  %d\n", vra->flags);
    DPRINT("    desc_user_addr:   0x%016" PRIx64 "\n", (uint64_t)vra->desc_user_addr);
    DPRINT("    used_user_addr:   0x%016" PRIx64 "\n", (uint64_t)vra->used_user_addr);
    DPRINT("    avail_user_addr:  0x%016" PRIx64 "\n", (uint64_t)vra->avail_user_addr);
    DPRINT("    log_guest_addr:   0x%016" PRIx64 "\n", (uint64_t)vra->log_guest_addr);

    vq->vra = *vra;
    vq->vring.flags = vra->flags;
    vq->vring.log_guest_addr = vra->log_guest_addr;


    if (map_ring(dev, vq)) {
        vu_panic(dev, "Invalid vring_addr message");
        return false;
    }

    vq->used_idx = le16toh(vq->vring.used->idx);

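    /*
     * After a reconnect the ring's used index can be ahead of
     * last_avail_idx; a device that completes requests in order can
     * simply resume processing from the used index.
     */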
    if (vq->last_avail_idx != vq->used_idx) {
        bool resume = dev->iface->queue_is_processed_in_order &&
            dev->iface->queue_is_processed_in_order(dev, index);

        DPRINT("Last avail index != used index: %u != %u%s\n",
               vq->last_avail_idx, vq->used_idx,
               resume ? ", resuming" : "");

        if (resume) {
            vq->shadow_avail_idx = vq->last_avail_idx = vq->used_idx;
        }
    }

    return false;
}

static bool
vu_set_vring_base_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    unsigned int index = vmsg->payload.state.index;
    unsigned int num = vmsg->payload.state.num;

    DPRINT("State.index: %u\n", index);
    DPRINT("State.num:   %u\n", num);
    dev->vq[index].shadow_avail_idx = dev->vq[index].last_avail_idx = num;

    return false;
}

static bool
vu_get_vring_base_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    unsigned int index = vmsg->payload.state.index;

    DPRINT("State.index: %u\n", index);
    vmsg->payload.state.num = dev->vq[index].last_avail_idx;
    vmsg->size = sizeof(vmsg->payload.state);

    dev->vq[index].started = false;
    if (dev->iface->queue_set_started) {
        dev->iface->queue_set_started(dev, index, false);
    }

    if (dev->vq[index].call_fd != -1) {
        close(dev->vq[index].call_fd);
        dev->vq[index].call_fd = -1;
    }
    if (dev->vq[index].kick_fd != -1) {
        dev->remove_watch(dev, dev->vq[index].kick_fd);
        close(dev->vq[index].kick_fd);
        dev->vq[index].kick_fd = -1;
    }

    return true;
}

static bool
vu_check_queue_msg_file(VuDev *dev, VhostUserMsg *vmsg)
{
    int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
    bool nofd = vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK;

    if (index >= dev->max_queues) {
        vmsg_close_fds(vmsg);
        vu_panic(dev, "Invalid queue index: %u", index);
        return false;
    }

    if (nofd) {
        vmsg_close_fds(vmsg);
        return true;
    }

    if (vmsg->fd_num != 1) {
        vmsg_close_fds(vmsg);
        vu_panic(dev, "Invalid fds in request: %d", vmsg->request);
        return false;
    }

    return true;
}

static int
inflight_desc_compare(const void *a, const void *b)
{
    VuVirtqInflightDesc *desc0 = (VuVirtqInflightDesc *)a,
                        *desc1 = (VuVirtqInflightDesc *)b;

    if (desc1->counter > desc0->counter &&
        (desc1->counter - desc0->counter) < VIRTQUEUE_MAX_SIZE * 2) {
        return 1;
    }

    return -1;
}

static int
vu_check_queue_inflights(VuDev *dev, VuVirtq *vq)
{
    int i = 0;

    if (!vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
        return 0;
    }

    if (unlikely(!vq->inflight)) {
        return -1;
    }

    if (unlikely(!vq->inflight->version)) {
        /* initialize the buffer */
        vq->inflight->version = INFLIGHT_VERSION;
        return 0;
    }

    vq->used_idx = le16toh(vq->vring.used->idx);
    vq->resubmit_num = 0;
    vq->resubmit_list = NULL;
    vq->counter = 0;

    if (unlikely(vq->inflight->used_idx != vq->used_idx)) {
        vq->inflight->desc[vq->inflight->last_batch_head].inflight = 0;

        barrier();

        vq->inflight->used_idx = vq->used_idx;
    }

    for (i = 0; i < vq->inflight->desc_num; i++) {
        if (vq->inflight->desc[i].inflight == 1) {
            vq->inuse++;
        }
    }

    vq->shadow_avail_idx = vq->last_avail_idx = vq->inuse + vq->used_idx;

    if (vq->inuse) {
        vq->resubmit_list = calloc(vq->inuse, sizeof(VuVirtqInflightDesc));
        if (!vq->resubmit_list) {
            return -1;
        }

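        /*
         * Collect every descriptor still marked inflight so the device can
         * resubmit it after the reconnect.
         */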
        for (i = 0; i < vq->inflight->desc_num; i++) {
            if (vq->inflight->desc[i].inflight) {
                vq->resubmit_list[vq->resubmit_num].index = i;
                vq->resubmit_list[vq->resubmit_num].counter =
                    vq->inflight->desc[i].counter;
                vq->resubmit_num++;
            }
        }

        if (vq->resubmit_num > 1) {
            qsort(vq->resubmit_list, vq->resubmit_num,
                  sizeof(VuVirtqInflightDesc), inflight_desc_compare);
        }
        vq->counter = vq->resubmit_list[0].counter + 1;
    }

    /* in case of I/O hang after reconnecting */
    if (eventfd_write(vq->kick_fd, 1)) {
        return -1;
    }

    return 0;
}

static bool
vu_set_vring_kick_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
    bool nofd = vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK;

    DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64);

    if (!vu_check_queue_msg_file(dev, vmsg)) {
        return false;
    }

    if (dev->vq[index].kick_fd != -1) {
        dev->remove_watch(dev, dev->vq[index].kick_fd);
        close(dev->vq[index].kick_fd);
        dev->vq[index].kick_fd = -1;
    }

    dev->vq[index].kick_fd = nofd ? -1 : vmsg->fds[0];
    DPRINT("Got kick_fd: %d for vq: %d\n", dev->vq[index].kick_fd, index);

    dev->vq[index].started = true;
    if (dev->iface->queue_set_started) {
        dev->iface->queue_set_started(dev, index, true);
    }

    if (dev->vq[index].kick_fd != -1 && dev->vq[index].handler) {
        dev->set_watch(dev, dev->vq[index].kick_fd, VU_WATCH_IN,
                       vu_kick_cb, (void *)(long)index);

        DPRINT("Waiting for kicks on fd: %d for vq: %d\n",
               dev->vq[index].kick_fd, index);
    }

    if (vu_check_queue_inflights(dev, &dev->vq[index])) {
        vu_panic(dev, "Failed to check inflights for vq: %d\n", index);
    }

    return false;
}

void vu_set_queue_handler(VuDev *dev, VuVirtq *vq,
                          vu_queue_handler_cb handler)
{
    int qidx = vq - dev->vq;

    vq->handler = handler;
    if (vq->kick_fd >= 0) {
        if (handler) {
            dev->set_watch(dev, vq->kick_fd, VU_WATCH_IN,
                           vu_kick_cb, (void *)(long)qidx);
        } else {
            dev->remove_watch(dev, vq->kick_fd);
        }
    }
}

bool vu_set_queue_host_notifier(VuDev *dev, VuVirtq *vq, int fd,
                                int size, int offset)
{
    int qidx = vq - dev->vq;
    int fd_num = 0;
    VhostUserMsg vmsg = {
        .request = VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG,
        .flags = VHOST_USER_VERSION | VHOST_USER_NEED_REPLY_MASK,
        .size = sizeof(vmsg.payload.area),
        .payload.area = {
            .u64 = qidx & VHOST_USER_VRING_IDX_MASK,
            .size = size,
            .offset = offset,
        },
    };

    if (fd == -1) {
        vmsg.payload.area.u64 |= VHOST_USER_VRING_NOFD_MASK;
    } else {
        vmsg.fds[fd_num++] = fd;
    }

    vmsg.fd_num = fd_num;

    if (!vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD)) {
        return false;
    }

    pthread_mutex_lock(&dev->slave_mutex);
    if (!vu_message_write(dev, dev->slave_fd, &vmsg)) {
        pthread_mutex_unlock(&dev->slave_mutex);
        return false;
    }

    /* Also unlocks the slave_mutex */
    return vu_process_message_reply(dev, &vmsg);
}

static bool
vu_set_vring_call_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
    bool nofd = vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK;

    DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64);

    if (!vu_check_queue_msg_file(dev, vmsg)) {
        return false;
    }

    if (dev->vq[index].call_fd != -1) {
        close(dev->vq[index].call_fd);
        dev->vq[index].call_fd = -1;
    }

    dev->vq[index].call_fd = nofd ? -1 : vmsg->fds[0];

    /* in case of I/O hang after reconnecting */
    if (dev->vq[index].call_fd != -1 && eventfd_write(vmsg->fds[0], 1)) {
        return -1;
    }

    DPRINT("Got call_fd: %d for vq: %d\n", dev->vq[index].call_fd, index);

    return false;
}

static bool
vu_set_vring_err_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
    bool nofd = vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK;

    DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64);

    if (!vu_check_queue_msg_file(dev, vmsg)) {
        return false;
    }

    if (dev->vq[index].err_fd != -1) {
        close(dev->vq[index].err_fd);
        dev->vq[index].err_fd = -1;
    }

    dev->vq[index].err_fd = nofd ? -1 : vmsg->fds[0];

    return false;
}

static bool
vu_get_protocol_features_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    /*
     * Note that we support, but intentionally do not set,
     * VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS. This means that
     * a device implementation can return it in its callback
     * (get_protocol_features) if it wants to use this for
     * simulation, but it is otherwise not desirable (if even
     * implemented by the master.)
     */
    uint64_t features = 1ULL << VHOST_USER_PROTOCOL_F_MQ |
                        1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD |
                        1ULL << VHOST_USER_PROTOCOL_F_SLAVE_REQ |
                        1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER |
                        1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD |
                        1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK |
                        1ULL << VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS;

    if (have_userfault()) {
        features |= 1ULL << VHOST_USER_PROTOCOL_F_PAGEFAULT;
    }

    if (dev->iface->get_config && dev->iface->set_config) {
        features |= 1ULL << VHOST_USER_PROTOCOL_F_CONFIG;
    }

    if (dev->iface->get_protocol_features) {
        features |= dev->iface->get_protocol_features(dev);
    }

    vmsg_set_reply_u64(vmsg, features);
    return true;
}

static bool
vu_set_protocol_features_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    uint64_t features = vmsg->payload.u64;

    DPRINT("u64: 0x%016"PRIx64"\n", features);

    dev->protocol_features = vmsg->payload.u64;

    if (vu_has_protocol_feature(dev,
                                VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS) &&
        (!vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_SLAVE_REQ) ||
         !vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_REPLY_ACK))) {
        /*
         * The use case for using messages for kick/call is simulation, to make
         * the kick and call synchronous. To actually get that behaviour, both
         * of the other features are required.
         * Theoretically, one could use only kick messages, or do them without
         * having F_REPLY_ACK, but too many (possibly pending) messages on the
         * socket will eventually cause the master to hang. To avoid this in
         * scenarios where it is not desired, enforce that the settings
         * actually enable the simulation case.
         */
        vu_panic(dev,
                 "F_IN_BAND_NOTIFICATIONS requires F_SLAVE_REQ && F_REPLY_ACK");
        return false;
    }

    if (dev->iface->set_protocol_features) {
        dev->iface->set_protocol_features(dev, features);
    }

    return false;
}

static bool
vu_get_queue_num_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    vmsg_set_reply_u64(vmsg, dev->max_queues);
    return true;
}

static bool
vu_set_vring_enable_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    unsigned int index = vmsg->payload.state.index;
    unsigned int enable = vmsg->payload.state.num;

    DPRINT("State.index:  %u\n", index);
    DPRINT("State.enable: %u\n", enable);

    if (index >= dev->max_queues) {
        vu_panic(dev, "Invalid vring_enable index: %u", index);
        return false;
    }

    dev->vq[index].enable = enable;
    return false;
}

static bool
vu_set_slave_req_fd(VuDev *dev, VhostUserMsg *vmsg)
{
    if (vmsg->fd_num != 1) {
        vu_panic(dev, "Invalid slave_req_fd message (%d fd's)", vmsg->fd_num);
        return false;
    }

    if (dev->slave_fd != -1) {
        close(dev->slave_fd);
    }
    dev->slave_fd = vmsg->fds[0];
    DPRINT("Got slave_fd: %d\n", vmsg->fds[0]);

    return false;
}

static bool
vu_get_config(VuDev *dev, VhostUserMsg *vmsg)
{
    int ret = -1;

    if (dev->iface->get_config) {
        ret = dev->iface->get_config(dev, vmsg->payload.config.region,
                                     vmsg->payload.config.size);
    }

    if (ret) {
        /* resize to zero to indicate an error to master */
        vmsg->size = 0;
    }

    return true;
}

static bool
vu_set_config(VuDev *dev, VhostUserMsg *vmsg)
{
    int ret = -1;

    if (dev->iface->set_config) {
        ret = dev->iface->set_config(dev, vmsg->payload.config.region,
                                     vmsg->payload.config.offset,
                                     vmsg->payload.config.size,
                                     vmsg->payload.config.flags);
        if (ret) {
            vu_panic(dev, "Set virtio configuration space failed");
        }
    }

    return false;
}

static bool
vu_set_postcopy_advise(VuDev *dev, VhostUserMsg *vmsg)
{
    dev->postcopy_ufd = -1;
#ifdef UFFDIO_API
    struct uffdio_api api_struct;

    dev->postcopy_ufd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
    vmsg->size = 0;
#endif

    if (dev->postcopy_ufd == -1) {
        vu_panic(dev, "Userfaultfd not available: %s", strerror(errno));
        goto out;
    }

#ifdef UFFDIO_API
    api_struct.api = UFFD_API;
    api_struct.features = 0;
    if (ioctl(dev->postcopy_ufd, UFFDIO_API, &api_struct)) {
        vu_panic(dev, "Failed UFFDIO_API: %s", strerror(errno));
        close(dev->postcopy_ufd);
        dev->postcopy_ufd = -1;
        goto out;
    }
    /* TODO: Stash feature flags somewhere */
#endif

out:
    /* Return a ufd to the QEMU */
    vmsg->fd_num = 1;
    vmsg->fds[0] = dev->postcopy_ufd;
    return true; /* = send a reply */
}

static bool
vu_set_postcopy_listen(VuDev *dev, VhostUserMsg *vmsg)
{
    if (dev->nregions) {
        vu_panic(dev, "Regions already registered at postcopy-listen");
        vmsg_set_reply_u64(vmsg, -1);
        return true;
    }
    dev->postcopy_listening = true;

    vmsg_set_reply_u64(vmsg, 0);
    return true;
}

static bool
vu_set_postcopy_end(VuDev *dev, VhostUserMsg *vmsg)
{
    DPRINT("%s: Entry\n", __func__);
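    /* Postcopy is over: stop listening and release the userfaultfd. */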
    dev->postcopy_listening = false;
    if (dev->postcopy_ufd > 0) {
        close(dev->postcopy_ufd);
        dev->postcopy_ufd = -1;
        DPRINT("%s: Done close\n", __func__);
    }

    vmsg_set_reply_u64(vmsg, 0);
    DPRINT("%s: exit\n", __func__);
    return true;
}

static inline uint64_t
vu_inflight_queue_size(uint16_t queue_size)
{
    return ALIGN_UP(sizeof(VuDescStateSplit) * queue_size +
                    sizeof(uint16_t), INFLIGHT_ALIGNMENT);
}

#ifdef MFD_ALLOW_SEALING
static void *
memfd_alloc(const char *name, size_t size, unsigned int flags, int *fd)
{
    void *ptr;
    int ret;

    *fd = memfd_create(name, MFD_ALLOW_SEALING);
    if (*fd < 0) {
        return NULL;
    }

    ret = ftruncate(*fd, size);
    if (ret < 0) {
        close(*fd);
        return NULL;
    }

    ret = fcntl(*fd, F_ADD_SEALS, flags);
    if (ret < 0) {
        close(*fd);
        return NULL;
    }

    ptr = mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED, *fd, 0);
    if (ptr == MAP_FAILED) {
        close(*fd);
        return NULL;
    }

    return ptr;
}
#endif

static bool
vu_get_inflight_fd(VuDev *dev, VhostUserMsg *vmsg)
{
    int fd = -1;
    void *addr = NULL;
    uint64_t mmap_size;
    uint16_t num_queues, queue_size;

    if (vmsg->size != sizeof(vmsg->payload.inflight)) {
        vu_panic(dev, "Invalid get_inflight_fd message:%d", vmsg->size);
        vmsg->payload.inflight.mmap_size = 0;
        return true;
    }

    num_queues = vmsg->payload.inflight.num_queues;
    queue_size = vmsg->payload.inflight.queue_size;

    DPRINT("set_inflight_fd num_queues: %"PRId16"\n", num_queues);
    DPRINT("set_inflight_fd queue_size: %"PRId16"\n", queue_size);

    mmap_size = vu_inflight_queue_size(queue_size) * num_queues;

#ifdef MFD_ALLOW_SEALING
    addr = memfd_alloc("vhost-inflight", mmap_size,
                       F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL,
                       &fd);
#else
    vu_panic(dev, "Not implemented: memfd support is missing");
#endif

    if (!addr) {
        vu_panic(dev, "Failed to alloc vhost inflight area");
        vmsg->payload.inflight.mmap_size = 0;
        return true;
    }

    memset(addr, 0, mmap_size);

    dev->inflight_info.addr = addr;
    dev->inflight_info.size = vmsg->payload.inflight.mmap_size = mmap_size;
    dev->inflight_info.fd = vmsg->fds[0] = fd;
    vmsg->fd_num = 1;
    vmsg->payload.inflight.mmap_offset = 0;

    DPRINT("send inflight mmap_size: %"PRId64"\n",
           vmsg->payload.inflight.mmap_size);
    DPRINT("send inflight mmap offset: %"PRId64"\n",
           vmsg->payload.inflight.mmap_offset);

    return true;
}

static bool
vu_set_inflight_fd(VuDev *dev, VhostUserMsg *vmsg)
{
    int fd, i;
    uint64_t mmap_size, mmap_offset;
    uint16_t num_queues, queue_size;
    void *rc;

    if (vmsg->fd_num != 1 ||
        vmsg->size != sizeof(vmsg->payload.inflight)) {
        vu_panic(dev, "Invalid set_inflight_fd message size:%d fds:%d",
                 vmsg->size, vmsg->fd_num);
        return false;
    }

    fd = vmsg->fds[0];
    mmap_size = vmsg->payload.inflight.mmap_size;
    mmap_offset = vmsg->payload.inflight.mmap_offset;
    num_queues = vmsg->payload.inflight.num_queues;
    queue_size = vmsg->payload.inflight.queue_size;

    DPRINT("set_inflight_fd mmap_size: %"PRId64"\n", mmap_size);
    DPRINT("set_inflight_fd mmap_offset: %"PRId64"\n", mmap_offset);
DPRINT("set_inflight_fd num_queues: %"PRId16"\n", num_queues); 1772 DPRINT("set_inflight_fd queue_size: %"PRId16"\n", queue_size); 1773 1774 rc = mmap(0, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, 1775 fd, mmap_offset); 1776 1777 if (rc == MAP_FAILED) { 1778 vu_panic(dev, "set_inflight_fd mmap error: %s", strerror(errno)); 1779 return false; 1780 } 1781 1782 if (dev->inflight_info.fd) { 1783 close(dev->inflight_info.fd); 1784 } 1785 1786 if (dev->inflight_info.addr) { 1787 munmap(dev->inflight_info.addr, dev->inflight_info.size); 1788 } 1789 1790 dev->inflight_info.fd = fd; 1791 dev->inflight_info.addr = rc; 1792 dev->inflight_info.size = mmap_size; 1793 1794 for (i = 0; i < num_queues; i++) { 1795 dev->vq[i].inflight = (VuVirtqInflight *)rc; 1796 dev->vq[i].inflight->desc_num = queue_size; 1797 rc = (void *)((char *)rc + vu_inflight_queue_size(queue_size)); 1798 } 1799 1800 return false; 1801 } 1802 1803 static bool 1804 vu_handle_vring_kick(VuDev *dev, VhostUserMsg *vmsg) 1805 { 1806 unsigned int index = vmsg->payload.state.index; 1807 1808 if (index >= dev->max_queues) { 1809 vu_panic(dev, "Invalid queue index: %u", index); 1810 return false; 1811 } 1812 1813 DPRINT("Got kick message: handler:%p idx:%u\n", 1814 dev->vq[index].handler, index); 1815 1816 if (!dev->vq[index].started) { 1817 dev->vq[index].started = true; 1818 1819 if (dev->iface->queue_set_started) { 1820 dev->iface->queue_set_started(dev, index, true); 1821 } 1822 } 1823 1824 if (dev->vq[index].handler) { 1825 dev->vq[index].handler(dev, index); 1826 } 1827 1828 return false; 1829 } 1830 1831 static bool vu_handle_get_max_memslots(VuDev *dev, VhostUserMsg *vmsg) 1832 { 1833 vmsg->flags = VHOST_USER_REPLY_MASK | VHOST_USER_VERSION; 1834 vmsg->size = sizeof(vmsg->payload.u64); 1835 vmsg->payload.u64 = VHOST_USER_MAX_RAM_SLOTS; 1836 vmsg->fd_num = 0; 1837 1838 if (!vu_message_write(dev, dev->sock, vmsg)) { 1839 vu_panic(dev, "Failed to send max ram slots: %s\n", strerror(errno)); 1840 } 1841 1842 DPRINT("u64: 0x%016"PRIx64"\n", (uint64_t) VHOST_USER_MAX_RAM_SLOTS); 1843 1844 return false; 1845 } 1846 1847 static bool 1848 vu_process_message(VuDev *dev, VhostUserMsg *vmsg) 1849 { 1850 int do_reply = 0; 1851 1852 /* Print out generic part of the request. 
    DPRINT("================ Vhost user message ================\n");
    DPRINT("Request: %s (%d)\n", vu_request_to_string(vmsg->request),
           vmsg->request);
    DPRINT("Flags:   0x%x\n", vmsg->flags);
    DPRINT("Size:    %u\n", vmsg->size);

    if (vmsg->fd_num) {
        int i;
        DPRINT("Fds:");
        for (i = 0; i < vmsg->fd_num; i++) {
            DPRINT(" %d", vmsg->fds[i]);
        }
        DPRINT("\n");
    }

    if (dev->iface->process_msg &&
        dev->iface->process_msg(dev, vmsg, &do_reply)) {
        return do_reply;
    }

    switch (vmsg->request) {
    case VHOST_USER_GET_FEATURES:
        return vu_get_features_exec(dev, vmsg);
    case VHOST_USER_SET_FEATURES:
        return vu_set_features_exec(dev, vmsg);
    case VHOST_USER_GET_PROTOCOL_FEATURES:
        return vu_get_protocol_features_exec(dev, vmsg);
    case VHOST_USER_SET_PROTOCOL_FEATURES:
        return vu_set_protocol_features_exec(dev, vmsg);
    case VHOST_USER_SET_OWNER:
        return vu_set_owner_exec(dev, vmsg);
    case VHOST_USER_RESET_OWNER:
        return vu_reset_device_exec(dev, vmsg);
    case VHOST_USER_SET_MEM_TABLE:
        return vu_set_mem_table_exec(dev, vmsg);
    case VHOST_USER_SET_LOG_BASE:
        return vu_set_log_base_exec(dev, vmsg);
    case VHOST_USER_SET_LOG_FD:
        return vu_set_log_fd_exec(dev, vmsg);
    case VHOST_USER_SET_VRING_NUM:
        return vu_set_vring_num_exec(dev, vmsg);
    case VHOST_USER_SET_VRING_ADDR:
        return vu_set_vring_addr_exec(dev, vmsg);
    case VHOST_USER_SET_VRING_BASE:
        return vu_set_vring_base_exec(dev, vmsg);
    case VHOST_USER_GET_VRING_BASE:
        return vu_get_vring_base_exec(dev, vmsg);
    case VHOST_USER_SET_VRING_KICK:
        return vu_set_vring_kick_exec(dev, vmsg);
    case VHOST_USER_SET_VRING_CALL:
        return vu_set_vring_call_exec(dev, vmsg);
    case VHOST_USER_SET_VRING_ERR:
        return vu_set_vring_err_exec(dev, vmsg);
    case VHOST_USER_GET_QUEUE_NUM:
        return vu_get_queue_num_exec(dev, vmsg);
    case VHOST_USER_SET_VRING_ENABLE:
        return vu_set_vring_enable_exec(dev, vmsg);
    case VHOST_USER_SET_SLAVE_REQ_FD:
        return vu_set_slave_req_fd(dev, vmsg);
    case VHOST_USER_GET_CONFIG:
        return vu_get_config(dev, vmsg);
    case VHOST_USER_SET_CONFIG:
        return vu_set_config(dev, vmsg);
    case VHOST_USER_NONE:
        /* if you need processing before exit, override iface->process_msg */
        exit(0);
    case VHOST_USER_POSTCOPY_ADVISE:
        return vu_set_postcopy_advise(dev, vmsg);
    case VHOST_USER_POSTCOPY_LISTEN:
        return vu_set_postcopy_listen(dev, vmsg);
    case VHOST_USER_POSTCOPY_END:
        return vu_set_postcopy_end(dev, vmsg);
    case VHOST_USER_GET_INFLIGHT_FD:
        return vu_get_inflight_fd(dev, vmsg);
    case VHOST_USER_SET_INFLIGHT_FD:
        return vu_set_inflight_fd(dev, vmsg);
    case VHOST_USER_VRING_KICK:
        return vu_handle_vring_kick(dev, vmsg);
    case VHOST_USER_GET_MAX_MEM_SLOTS:
        return vu_handle_get_max_memslots(dev, vmsg);
    case VHOST_USER_ADD_MEM_REG:
        return vu_add_mem_reg(dev, vmsg);
    case VHOST_USER_REM_MEM_REG:
        return vu_rem_mem_reg(dev, vmsg);
    default:
        vmsg_close_fds(vmsg);
        vu_panic(dev, "Unhandled request: %d", vmsg->request);
    }

    return false;
}

bool
vu_dispatch(VuDev *dev)
{
    VhostUserMsg vmsg = { 0, };
    int reply_requested;
    bool need_reply, success = false;

    if (!dev->read_msg(dev, dev->sock, &vmsg)) {
        goto end;
    }

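    /*
     * Note whether the master asked for a reply before the handlers get a
     * chance to overwrite vmsg.flags.
     */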
    need_reply = vmsg.flags & VHOST_USER_NEED_REPLY_MASK;

    reply_requested = vu_process_message(dev, &vmsg);
    if (!reply_requested && need_reply) {
        vmsg_set_reply_u64(&vmsg, 0);
        reply_requested = 1;
    }

    if (!reply_requested) {
        success = true;
        goto end;
    }

    if (!vu_send_reply(dev, dev->sock, &vmsg)) {
        goto end;
    }

    success = true;

end:
    free(vmsg.data);
    return success;
}

void
vu_deinit(VuDev *dev)
{
    int i;

    for (i = 0; i < dev->nregions; i++) {
        VuDevRegion *r = &dev->regions[i];
        void *m = (void *) (uintptr_t) r->mmap_addr;
        if (m != MAP_FAILED) {
            munmap(m, r->size + r->mmap_offset);
        }
    }
    dev->nregions = 0;

    for (i = 0; i < dev->max_queues; i++) {
        VuVirtq *vq = &dev->vq[i];

        if (vq->call_fd != -1) {
            close(vq->call_fd);
            vq->call_fd = -1;
        }

        if (vq->kick_fd != -1) {
            dev->remove_watch(dev, vq->kick_fd);
            close(vq->kick_fd);
            vq->kick_fd = -1;
        }

        if (vq->err_fd != -1) {
            close(vq->err_fd);
            vq->err_fd = -1;
        }

        if (vq->resubmit_list) {
            free(vq->resubmit_list);
            vq->resubmit_list = NULL;
        }

        vq->inflight = NULL;
    }

    if (dev->inflight_info.addr) {
        munmap(dev->inflight_info.addr, dev->inflight_info.size);
        dev->inflight_info.addr = NULL;
    }

    if (dev->inflight_info.fd > 0) {
        close(dev->inflight_info.fd);
        dev->inflight_info.fd = -1;
    }

    vu_close_log(dev);
    if (dev->slave_fd != -1) {
        close(dev->slave_fd);
        dev->slave_fd = -1;
    }
    pthread_mutex_destroy(&dev->slave_mutex);

    if (dev->sock != -1) {
        close(dev->sock);
    }

    free(dev->vq);
    dev->vq = NULL;
}

bool
vu_init(VuDev *dev,
        uint16_t max_queues,
        int socket,
        vu_panic_cb panic,
        vu_read_msg_cb read_msg,
        vu_set_watch_cb set_watch,
        vu_remove_watch_cb remove_watch,
        const VuDevIface *iface)
{
    uint16_t i;

    assert(max_queues > 0);
    assert(socket >= 0);
    assert(set_watch);
    assert(remove_watch);
    assert(iface);
    assert(panic);

    memset(dev, 0, sizeof(*dev));

    dev->sock = socket;
    dev->panic = panic;
    dev->read_msg = read_msg ? read_msg : vu_message_read_default;

bool
vu_init(VuDev *dev,
        uint16_t max_queues,
        int socket,
        vu_panic_cb panic,
        vu_read_msg_cb read_msg,
        vu_set_watch_cb set_watch,
        vu_remove_watch_cb remove_watch,
        const VuDevIface *iface)
{
    uint16_t i;

    assert(max_queues > 0);
    assert(socket >= 0);
    assert(set_watch);
    assert(remove_watch);
    assert(iface);
    assert(panic);

    memset(dev, 0, sizeof(*dev));

    dev->sock = socket;
    dev->panic = panic;
    dev->read_msg = read_msg ? read_msg : vu_message_read_default;
    dev->set_watch = set_watch;
    dev->remove_watch = remove_watch;
    dev->iface = iface;
    dev->log_call_fd = -1;
    pthread_mutex_init(&dev->slave_mutex, NULL);
    dev->slave_fd = -1;
    dev->max_queues = max_queues;

    dev->vq = malloc(max_queues * sizeof(dev->vq[0]));
    if (!dev->vq) {
        DPRINT("%s: failed to malloc virtqueues\n", __func__);
        return false;
    }

    for (i = 0; i < max_queues; i++) {
        dev->vq[i] = (VuVirtq) {
            .call_fd = -1, .kick_fd = -1, .err_fd = -1,
            .notification = true,
        };
    }

    return true;
}

VuVirtq *
vu_get_queue(VuDev *dev, int qidx)
{
    assert(qidx < dev->max_queues);
    return &dev->vq[qidx];
}

bool
vu_queue_enabled(VuDev *dev, VuVirtq *vq)
{
    return vq->enable;
}

bool
vu_queue_started(const VuDev *dev, const VuVirtq *vq)
{
    return vq->started;
}

static inline uint16_t
vring_avail_flags(VuVirtq *vq)
{
    return le16toh(vq->vring.avail->flags);
}

static inline uint16_t
vring_avail_idx(VuVirtq *vq)
{
    vq->shadow_avail_idx = le16toh(vq->vring.avail->idx);

    return vq->shadow_avail_idx;
}

static inline uint16_t
vring_avail_ring(VuVirtq *vq, int i)
{
    return le16toh(vq->vring.avail->ring[i]);
}

static inline uint16_t
vring_get_used_event(VuVirtq *vq)
{
    return vring_avail_ring(vq, vq->vring.num);
}
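
/*
 * With VIRTIO_RING_F_EVENT_IDX the split ring carries two extra 16-bit
 * fields just past the fixed-size rings: the driver's used_event sits at
 * avail->ring[num] (read by vring_get_used_event() above), and the device's
 * avail_event sits at used->ring[num] (written by vring_set_avail_event()
 * further down).  Both are little-endian, like the rest of the ring.
 */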

static int
virtqueue_num_heads(VuDev *dev, VuVirtq *vq, unsigned int idx)
{
    uint16_t num_heads = vring_avail_idx(vq) - idx;

    /* Check it isn't doing very strange things with descriptor numbers. */
    if (num_heads > vq->vring.num) {
        vu_panic(dev, "Guest moved used index from %u to %u",
                 idx, vq->shadow_avail_idx);
        return -1;
    }
    if (num_heads) {
        /* On success, callers read a descriptor at vq->last_avail_idx.
         * Make sure descriptor read does not bypass avail index read. */
        smp_rmb();
    }

    return num_heads;
}

static bool
virtqueue_get_head(VuDev *dev, VuVirtq *vq,
                   unsigned int idx, unsigned int *head)
{
    /* Grab the next descriptor number they're advertising, and increment
     * the index we've seen. */
    *head = vring_avail_ring(vq, idx % vq->vring.num);

    /* If their number is silly, that's a fatal mistake. */
    if (*head >= vq->vring.num) {
        vu_panic(dev, "Guest says index %u is available", *head);
        return false;
    }

    return true;
}

static int
virtqueue_read_indirect_desc(VuDev *dev, struct vring_desc *desc,
                             uint64_t addr, size_t len)
{
    struct vring_desc *ori_desc;
    uint64_t read_len;

    if (len > (VIRTQUEUE_MAX_SIZE * sizeof(struct vring_desc))) {
        return -1;
    }

    if (len == 0) {
        return -1;
    }

    while (len) {
        read_len = len;
        ori_desc = vu_gpa_to_va(dev, &read_len, addr);
        if (!ori_desc) {
            return -1;
        }

        memcpy(desc, ori_desc, read_len);
        len -= read_len;
        addr += read_len;
        /* advance the destination by the bytes just copied, not by elements */
        desc = (struct vring_desc *)((char *)desc + read_len);
    }

    return 0;
}

enum {
    VIRTQUEUE_READ_DESC_ERROR = -1,
    VIRTQUEUE_READ_DESC_DONE = 0,   /* end of chain */
    VIRTQUEUE_READ_DESC_MORE = 1,   /* more buffers in chain */
};

static int
virtqueue_read_next_desc(VuDev *dev, struct vring_desc *desc,
                         int i, unsigned int max, unsigned int *next)
{
    /* If this descriptor says it doesn't chain, we're done. */
    if (!(le16toh(desc[i].flags) & VRING_DESC_F_NEXT)) {
        return VIRTQUEUE_READ_DESC_DONE;
    }

    /* Check they're not leading us off end of descriptors. */
    *next = le16toh(desc[i].next);
    /* Make sure compiler knows to grab that: we don't want it changing! */
    smp_wmb();

    if (*next >= max) {
        vu_panic(dev, "Desc next is %u", *next);
        return VIRTQUEUE_READ_DESC_ERROR;
    }

    return VIRTQUEUE_READ_DESC_MORE;
}
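
/*
 * Descriptor chains are walked with the helpers above: fetch the head with
 * virtqueue_get_head(), then call virtqueue_read_next_desc() until it
 * returns VIRTQUEUE_READ_DESC_DONE (or _ERROR after a panic).  A bare-bones
 * walk, ignoring indirect tables, looks roughly like:
 *
 *     unsigned int i, head;
 *     int rc;
 *
 *     if (!virtqueue_get_head(dev, vq, vq->last_avail_idx, &head)) {
 *         return;
 *     }
 *     i = head;
 *     do {
 *         ... inspect le64toh(desc[i].addr) / le32toh(desc[i].len) ...
 *         rc = virtqueue_read_next_desc(dev, desc, i, vq->vring.num, &i);
 *     } while (rc == VIRTQUEUE_READ_DESC_MORE);
 *
 * The full versions below also bound the number of descriptors visited so a
 * malicious or buggy driver cannot loop the chain forever.
 */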

void
vu_queue_get_avail_bytes(VuDev *dev, VuVirtq *vq, unsigned int *in_bytes,
                         unsigned int *out_bytes,
                         unsigned max_in_bytes, unsigned max_out_bytes)
{
    unsigned int idx;
    unsigned int total_bufs, in_total, out_total;
    int rc;

    idx = vq->last_avail_idx;

    total_bufs = in_total = out_total = 0;
    if (unlikely(dev->broken) ||
        unlikely(!vq->vring.avail)) {
        goto done;
    }

    while ((rc = virtqueue_num_heads(dev, vq, idx)) > 0) {
        unsigned int max, desc_len, num_bufs, indirect = 0;
        uint64_t desc_addr, read_len;
        struct vring_desc *desc;
        struct vring_desc desc_buf[VIRTQUEUE_MAX_SIZE];
        unsigned int i;

        max = vq->vring.num;
        num_bufs = total_bufs;
        if (!virtqueue_get_head(dev, vq, idx++, &i)) {
            goto err;
        }
        desc = vq->vring.desc;

        if (le16toh(desc[i].flags) & VRING_DESC_F_INDIRECT) {
            if (le32toh(desc[i].len) % sizeof(struct vring_desc)) {
                vu_panic(dev, "Invalid size for indirect buffer table");
                goto err;
            }

            /* If we've got too many, that implies a descriptor loop. */
            if (num_bufs >= max) {
                vu_panic(dev, "Looped descriptor");
                goto err;
            }

            /* loop over the indirect descriptor table */
            indirect = 1;
            desc_addr = le64toh(desc[i].addr);
            desc_len = le32toh(desc[i].len);
            max = desc_len / sizeof(struct vring_desc);
            read_len = desc_len;
            desc = vu_gpa_to_va(dev, &read_len, desc_addr);
            if (unlikely(desc && read_len != desc_len)) {
                /* Failed to use zero copy */
                desc = NULL;
                if (!virtqueue_read_indirect_desc(dev, desc_buf,
                                                  desc_addr,
                                                  desc_len)) {
                    desc = desc_buf;
                }
            }
            if (!desc) {
                vu_panic(dev, "Invalid indirect buffer table");
                goto err;
            }
            num_bufs = i = 0;
        }

        do {
            /* If we've got too many, that implies a descriptor loop. */
            if (++num_bufs > max) {
                vu_panic(dev, "Looped descriptor");
                goto err;
            }

            if (le16toh(desc[i].flags) & VRING_DESC_F_WRITE) {
                in_total += le32toh(desc[i].len);
            } else {
                out_total += le32toh(desc[i].len);
            }
            if (in_total >= max_in_bytes && out_total >= max_out_bytes) {
                goto done;
            }
            rc = virtqueue_read_next_desc(dev, desc, i, max, &i);
        } while (rc == VIRTQUEUE_READ_DESC_MORE);

        if (rc == VIRTQUEUE_READ_DESC_ERROR) {
            goto err;
        }

        if (!indirect) {
            total_bufs = num_bufs;
        } else {
            total_bufs++;
        }
    }
    if (rc < 0) {
        goto err;
    }
done:
    if (in_bytes) {
        *in_bytes = in_total;
    }
    if (out_bytes) {
        *out_bytes = out_total;
    }
    return;

err:
    in_total = out_total = 0;
    goto done;
}

bool
vu_queue_avail_bytes(VuDev *dev, VuVirtq *vq, unsigned int in_bytes,
                     unsigned int out_bytes)
{
    unsigned int in_total, out_total;

    vu_queue_get_avail_bytes(dev, vq, &in_total, &out_total,
                             in_bytes, out_bytes);

    return in_bytes <= in_total && out_bytes <= out_total;
}

/* Fetch avail_idx from VQ memory only when we really need to know if
 * guest has added some buffers. */
bool
vu_queue_empty(VuDev *dev, VuVirtq *vq)
{
    if (unlikely(dev->broken) ||
        unlikely(!vq->vring.avail)) {
        return true;
    }

    if (vq->shadow_avail_idx != vq->last_avail_idx) {
        return false;
    }

    return vring_avail_idx(vq) == vq->last_avail_idx;
}
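
/*
 * Interrupt suppression: without VIRTIO_RING_F_EVENT_IDX the driver simply
 * sets VRING_AVAIL_F_NO_INTERRUPT in avail->flags.  With event idx the
 * driver publishes a used_event index instead, and the device only signals
 * when the used index crosses it; the wrap-safe test (mirroring
 * vring_need_event() from the virtio ring headers) is
 *
 *     (uint16_t)(new_idx - used_event - 1) < (uint16_t)(new_idx - old_idx)
 *
 * which is what vring_notify() below relies on.
 */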

static bool
vring_notify(VuDev *dev, VuVirtq *vq)
{
    uint16_t old, new;
    bool v;

    /* We need to expose used array entries before checking used event. */
    smp_mb();

    /* Always notify when queue is empty (when feature acknowledge) */
    if (vu_has_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY) &&
        !vq->inuse && vu_queue_empty(dev, vq)) {
        return true;
    }

    if (!vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
        return !(vring_avail_flags(vq) & VRING_AVAIL_F_NO_INTERRUPT);
    }

    v = vq->signalled_used_valid;
    vq->signalled_used_valid = true;
    old = vq->signalled_used;
    new = vq->signalled_used = vq->used_idx;
    return !v || vring_need_event(vring_get_used_event(vq), new, old);
}

static void _vu_queue_notify(VuDev *dev, VuVirtq *vq, bool sync)
{
    if (unlikely(dev->broken) ||
        unlikely(!vq->vring.avail)) {
        return;
    }

    if (!vring_notify(dev, vq)) {
        DPRINT("skipped notify...\n");
        return;
    }

    if (vq->call_fd < 0 &&
        vu_has_protocol_feature(dev,
                                VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS) &&
        vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_SLAVE_REQ)) {
        VhostUserMsg vmsg = {
            .request = VHOST_USER_SLAVE_VRING_CALL,
            .flags = VHOST_USER_VERSION,
            .size = sizeof(vmsg.payload.state),
            .payload.state = {
                .index = vq - dev->vq,
            },
        };
        bool ack = sync &&
                   vu_has_protocol_feature(dev,
                                           VHOST_USER_PROTOCOL_F_REPLY_ACK);

        if (ack) {
            vmsg.flags |= VHOST_USER_NEED_REPLY_MASK;
        }

        vu_message_write(dev, dev->slave_fd, &vmsg);
        if (ack) {
            vu_message_read_default(dev, dev->slave_fd, &vmsg);
        }
        return;
    }

    if (eventfd_write(vq->call_fd, 1) < 0) {
        vu_panic(dev, "Error writing eventfd: %s", strerror(errno));
    }
}

void vu_queue_notify(VuDev *dev, VuVirtq *vq)
{
    _vu_queue_notify(dev, vq, false);
}

void vu_queue_notify_sync(VuDev *dev, VuVirtq *vq)
{
    _vu_queue_notify(dev, vq, true);
}

static inline void
vring_used_flags_set_bit(VuVirtq *vq, int mask)
{
    uint16_t *flags;

    flags = (uint16_t *)((char *)vq->vring.used +
                         offsetof(struct vring_used, flags));
    *flags = htole16(le16toh(*flags) | mask);
}

static inline void
vring_used_flags_unset_bit(VuVirtq *vq, int mask)
{
    uint16_t *flags;

    flags = (uint16_t *)((char *)vq->vring.used +
                         offsetof(struct vring_used, flags));
    *flags = htole16(le16toh(*flags) & ~mask);
}

static inline void
vring_set_avail_event(VuVirtq *vq, uint16_t val)
{
    uint16_t *avail;

    if (!vq->notification) {
        return;
    }

    avail = (uint16_t *)&vq->vring.used->ring[vq->vring.num];
    *avail = htole16(val);
}
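
/*
 * A backend that wants to batch work typically pairs the notification
 * helpers like this (sketch only; my_process_one() is a placeholder for the
 * backend's own handling, which is assumed to push the element back and
 * free it):
 *
 *     for (;;) {
 *         VuVirtqElement *elem;
 *
 *         vu_queue_set_notification(dev, vq, 0);
 *         while ((elem = vu_queue_pop(dev, vq, sizeof(*elem)))) {
 *             my_process_one(dev, vq, elem);
 *         }
 *         vu_queue_set_notification(dev, vq, 1);
 *         if (vu_queue_empty(dev, vq)) {
 *             break;
 *         }
 *     }
 *
 * Re-enabling notifications and then re-checking emptiness closes the race
 * with a driver that adds a buffer right after the final pop.
 */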

void
vu_queue_set_notification(VuDev *dev, VuVirtq *vq, int enable)
{
    vq->notification = enable;
    if (vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
        vring_set_avail_event(vq, vring_avail_idx(vq));
    } else if (enable) {
        vring_used_flags_unset_bit(vq, VRING_USED_F_NO_NOTIFY);
    } else {
        vring_used_flags_set_bit(vq, VRING_USED_F_NO_NOTIFY);
    }
    if (enable) {
        /* Expose avail event/used flags before caller checks the avail idx. */
        smp_mb();
    }
}

static bool
virtqueue_map_desc(VuDev *dev,
                   unsigned int *p_num_sg, struct iovec *iov,
                   unsigned int max_num_sg, bool is_write,
                   uint64_t pa, size_t sz)
{
    unsigned num_sg = *p_num_sg;

    assert(num_sg <= max_num_sg);

    if (!sz) {
        vu_panic(dev, "virtio: zero sized buffers are not allowed");
        return false;
    }

    while (sz) {
        uint64_t len = sz;

        if (num_sg == max_num_sg) {
            vu_panic(dev, "virtio: too many descriptors in indirect table");
            return false;
        }

        iov[num_sg].iov_base = vu_gpa_to_va(dev, &len, pa);
        if (iov[num_sg].iov_base == NULL) {
            vu_panic(dev, "virtio: invalid address for buffers");
            return false;
        }
        iov[num_sg].iov_len = len;
        num_sg++;
        sz -= len;
        pa += len;
    }

    *p_num_sg = num_sg;
    return true;
}

static void *
virtqueue_alloc_element(size_t sz,
                        unsigned out_num, unsigned in_num)
{
    VuVirtqElement *elem;
    size_t in_sg_ofs = ALIGN_UP(sz, __alignof__(elem->in_sg[0]));
    size_t out_sg_ofs = in_sg_ofs + in_num * sizeof(elem->in_sg[0]);
    size_t out_sg_end = out_sg_ofs + out_num * sizeof(elem->out_sg[0]);

    assert(sz >= sizeof(VuVirtqElement));
    elem = malloc(out_sg_end);
    if (!elem) {
        DPRINT("%s: failed to malloc virtqueue element\n", __func__);
        return NULL;
    }
    elem->out_num = out_num;
    elem->in_num = in_num;
    elem->in_sg = (void *)elem + in_sg_ofs;
    elem->out_sg = (void *)elem + out_sg_ofs;
    return elem;
}
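
/*
 * The element is allocated as one block: the VuVirtqElement header (or a
 * larger caller-supplied structure, see the sz argument) followed by the
 * in_sg[] and out_sg[] iovec arrays.  Because of that, callers typically
 * embed VuVirtqElement as the *first* member of their own request struct
 * and pass sizeof that struct to vu_queue_pop(); something like this
 * hypothetical request type:
 *
 *     typedef struct MyRequest {
 *         VuVirtqElement elem;    // must stay first
 *         int my_backend_state;   // whatever the backend needs per request
 *     } MyRequest;
 *
 *     MyRequest *req = vu_queue_pop(dev, vq, sizeof(MyRequest));
 *
 * The whole block, including the iovec arrays, is released with a single
 * free().
 */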

static void *
vu_queue_map_desc(VuDev *dev, VuVirtq *vq, unsigned int idx, size_t sz)
{
    struct vring_desc *desc = vq->vring.desc;
    uint64_t desc_addr, read_len;
    unsigned int desc_len;
    unsigned int max = vq->vring.num;
    unsigned int i = idx;
    VuVirtqElement *elem;
    unsigned int out_num = 0, in_num = 0;
    struct iovec iov[VIRTQUEUE_MAX_SIZE];
    struct vring_desc desc_buf[VIRTQUEUE_MAX_SIZE];
    int rc;

    if (le16toh(desc[i].flags) & VRING_DESC_F_INDIRECT) {
        if (le32toh(desc[i].len) % sizeof(struct vring_desc)) {
            vu_panic(dev, "Invalid size for indirect buffer table");
            return NULL;
        }

        /* loop over the indirect descriptor table */
        desc_addr = le64toh(desc[i].addr);
        desc_len = le32toh(desc[i].len);
        max = desc_len / sizeof(struct vring_desc);
        read_len = desc_len;
        desc = vu_gpa_to_va(dev, &read_len, desc_addr);
        if (unlikely(desc && read_len != desc_len)) {
            /* Failed to use zero copy */
            desc = NULL;
            if (!virtqueue_read_indirect_desc(dev, desc_buf,
                                              desc_addr,
                                              desc_len)) {
                desc = desc_buf;
            }
        }
        if (!desc) {
            vu_panic(dev, "Invalid indirect buffer table");
            return NULL;
        }
        i = 0;
    }

    /* Collect all the descriptors */
    do {
        if (le16toh(desc[i].flags) & VRING_DESC_F_WRITE) {
            if (!virtqueue_map_desc(dev, &in_num, iov + out_num,
                                    VIRTQUEUE_MAX_SIZE - out_num, true,
                                    le64toh(desc[i].addr),
                                    le32toh(desc[i].len))) {
                return NULL;
            }
        } else {
            if (in_num) {
                vu_panic(dev, "Incorrect order for descriptors");
                return NULL;
            }
            if (!virtqueue_map_desc(dev, &out_num, iov,
                                    VIRTQUEUE_MAX_SIZE, false,
                                    le64toh(desc[i].addr),
                                    le32toh(desc[i].len))) {
                return NULL;
            }
        }

        /* If we've got too many, that implies a descriptor loop. */
        if ((in_num + out_num) > max) {
            vu_panic(dev, "Looped descriptor");
            return NULL;
        }
        rc = virtqueue_read_next_desc(dev, desc, i, max, &i);
    } while (rc == VIRTQUEUE_READ_DESC_MORE);

    if (rc == VIRTQUEUE_READ_DESC_ERROR) {
        vu_panic(dev, "read descriptor error");
        return NULL;
    }

    /* Now copy what we have collected and mapped */
    elem = virtqueue_alloc_element(sz, out_num, in_num);
    if (!elem) {
        return NULL;
    }
    elem->index = idx;
    for (i = 0; i < out_num; i++) {
        elem->out_sg[i] = iov[i];
    }
    for (i = 0; i < in_num; i++) {
        elem->in_sg[i] = iov[out_num + i];
    }

    return elem;
}

static int
vu_queue_inflight_get(VuDev *dev, VuVirtq *vq, int desc_idx)
{
    if (!vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
        return 0;
    }

    if (unlikely(!vq->inflight)) {
        return -1;
    }

    vq->inflight->desc[desc_idx].counter = vq->counter++;
    vq->inflight->desc[desc_idx].inflight = 1;

    return 0;
}

static int
vu_queue_inflight_pre_put(VuDev *dev, VuVirtq *vq, int desc_idx)
{
    if (!vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
        return 0;
    }

    if (unlikely(!vq->inflight)) {
        return -1;
    }

    vq->inflight->last_batch_head = desc_idx;

    return 0;
}

static int
vu_queue_inflight_post_put(VuDev *dev, VuVirtq *vq, int desc_idx)
{
    if (!vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
        return 0;
    }

    if (unlikely(!vq->inflight)) {
        return -1;
    }

    barrier();

    vq->inflight->desc[desc_idx].inflight = 0;

    barrier();

    vq->inflight->used_idx = vq->used_idx;

    return 0;
}
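
/*
 * The three inflight helpers above implement the shared-memory in-flight
 * descriptor tracking negotiated with VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD:
 * an entry is marked in-flight when the element is popped, last_batch_head
 * is recorded just before the used ring is published, and the entry is
 * cleared (followed by a fresh copy of used_idx) once the element really is
 * in the used ring.  After a backend restart, the front-end hands the same
 * buffer back via VHOST_USER_SET_INFLIGHT_FD and the still-set entries are
 * replayed through vq->resubmit_list in vu_queue_pop() below.
 */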

void *
vu_queue_pop(VuDev *dev, VuVirtq *vq, size_t sz)
{
    int i;
    unsigned int head;
    VuVirtqElement *elem;

    if (unlikely(dev->broken) ||
        unlikely(!vq->vring.avail)) {
        return NULL;
    }

    if (unlikely(vq->resubmit_list && vq->resubmit_num > 0)) {
        i = (--vq->resubmit_num);
        elem = vu_queue_map_desc(dev, vq, vq->resubmit_list[i].index, sz);

        if (!vq->resubmit_num) {
            free(vq->resubmit_list);
            vq->resubmit_list = NULL;
        }

        return elem;
    }

    if (vu_queue_empty(dev, vq)) {
        return NULL;
    }
    /*
     * Needed after virtio_queue_empty(), see comment in
     * virtqueue_num_heads().
     */
    smp_rmb();

    if (vq->inuse >= vq->vring.num) {
        vu_panic(dev, "Virtqueue size exceeded");
        return NULL;
    }

    if (!virtqueue_get_head(dev, vq, vq->last_avail_idx++, &head)) {
        return NULL;
    }

    if (vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
        vring_set_avail_event(vq, vq->last_avail_idx);
    }

    elem = vu_queue_map_desc(dev, vq, head, sz);

    if (!elem) {
        return NULL;
    }

    vq->inuse++;

    vu_queue_inflight_get(dev, vq, head);

    return elem;
}

static void
vu_queue_detach_element(VuDev *dev, VuVirtq *vq, VuVirtqElement *elem,
                        size_t len)
{
    vq->inuse--;
    /* unmap, when DMA support is added */
}

void
vu_queue_unpop(VuDev *dev, VuVirtq *vq, VuVirtqElement *elem,
               size_t len)
{
    vq->last_avail_idx--;
    vu_queue_detach_element(dev, vq, elem, len);
}

bool
vu_queue_rewind(VuDev *dev, VuVirtq *vq, unsigned int num)
{
    if (num > vq->inuse) {
        return false;
    }
    vq->last_avail_idx -= num;
    vq->inuse -= num;
    return true;
}

static inline
void vring_used_write(VuDev *dev, VuVirtq *vq,
                      struct vring_used_elem *uelem, int i)
{
    struct vring_used *used = vq->vring.used;

    used->ring[i] = *uelem;
    vu_log_write(dev, vq->vring.log_guest_addr +
                 offsetof(struct vring_used, ring[i]),
                 sizeof(used->ring[i]));
}

static void
vu_log_queue_fill(VuDev *dev, VuVirtq *vq,
                  const VuVirtqElement *elem,
                  unsigned int len)
{
    struct vring_desc *desc = vq->vring.desc;
    unsigned int i, max, min, desc_len;
    uint64_t desc_addr, read_len;
    struct vring_desc desc_buf[VIRTQUEUE_MAX_SIZE];
    unsigned num_bufs = 0;

    max = vq->vring.num;
    i = elem->index;

    if (le16toh(desc[i].flags) & VRING_DESC_F_INDIRECT) {
        if (le32toh(desc[i].len) % sizeof(struct vring_desc)) {
            vu_panic(dev, "Invalid size for indirect buffer table");
            return;
        }

        /* loop over the indirect descriptor table */
        desc_addr = le64toh(desc[i].addr);
        desc_len = le32toh(desc[i].len);
        max = desc_len / sizeof(struct vring_desc);
        read_len = desc_len;
        desc = vu_gpa_to_va(dev, &read_len, desc_addr);
        if (unlikely(desc && read_len != desc_len)) {
            /* Failed to use zero copy */
            desc = NULL;
            if (!virtqueue_read_indirect_desc(dev, desc_buf,
                                              desc_addr,
                                              desc_len)) {
                desc = desc_buf;
            }
        }
        if (!desc) {
            vu_panic(dev, "Invalid indirect buffer table");
            return;
        }
        i = 0;
    }

    do {
        if (++num_bufs > max) {
            vu_panic(dev, "Looped descriptor");
            return;
        }

        if (le16toh(desc[i].flags) & VRING_DESC_F_WRITE) {
            min = MIN(le32toh(desc[i].len), len);
            vu_log_write(dev, le64toh(desc[i].addr), min);
            len -= min;
        }

    } while (len > 0 &&
             (virtqueue_read_next_desc(dev, desc, i, max, &i)
              == VIRTQUEUE_READ_DESC_MORE));
}
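
/*
 * Completion is split in two steps so that used ring entries can be
 * batched: vu_queue_fill() only writes a used element at used_idx + idx
 * (and logs the dirtied pages when the front-end requested logging), while
 * vu_queue_flush() issues the write barrier and publishes the new used
 * index for `count` elements in one go.  vu_queue_push() at the end of the
 * file is the single-element convenience wrapper that also keeps the
 * in-flight region consistent around the update.
 */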

void
vu_queue_fill(VuDev *dev, VuVirtq *vq,
              const VuVirtqElement *elem,
              unsigned int len, unsigned int idx)
{
    struct vring_used_elem uelem;

    if (unlikely(dev->broken) ||
        unlikely(!vq->vring.avail)) {
        return;
    }

    vu_log_queue_fill(dev, vq, elem, len);

    idx = (idx + vq->used_idx) % vq->vring.num;

    uelem.id = htole32(elem->index);
    uelem.len = htole32(len);
    vring_used_write(dev, vq, &uelem, idx);
}

static inline
void vring_used_idx_set(VuDev *dev, VuVirtq *vq, uint16_t val)
{
    vq->vring.used->idx = htole16(val);
    vu_log_write(dev,
                 vq->vring.log_guest_addr + offsetof(struct vring_used, idx),
                 sizeof(vq->vring.used->idx));

    vq->used_idx = val;
}

void
vu_queue_flush(VuDev *dev, VuVirtq *vq, unsigned int count)
{
    uint16_t old, new;

    if (unlikely(dev->broken) ||
        unlikely(!vq->vring.avail)) {
        return;
    }

    /* Make sure buffer is written before we update index. */
    smp_wmb();

    old = vq->used_idx;
    new = old + count;
    vring_used_idx_set(dev, vq, new);
    vq->inuse -= count;
    if (unlikely((int16_t)(new - vq->signalled_used) < (uint16_t)(new - old))) {
        vq->signalled_used_valid = false;
    }
}

void
vu_queue_push(VuDev *dev, VuVirtq *vq,
              const VuVirtqElement *elem, unsigned int len)
{
    vu_queue_fill(dev, vq, elem, len, 0);
    vu_queue_inflight_pre_put(dev, vq, elem->index);
    vu_queue_flush(dev, vq, 1);
    vu_queue_inflight_post_put(dev, vq, elem->index);
}
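
/*
 * Putting the pieces together, a kick handler for one virtqueue usually
 * boils down to the loop below (sketch only; handle_one_request() is a
 * placeholder for the backend's own logic, assumed to fill elem->in_sg with
 * the reply and return the number of bytes written):
 *
 *     static void my_queue_handler(VuDev *dev, int qidx)
 *     {
 *         VuVirtq *vq = vu_get_queue(dev, qidx);
 *         VuVirtqElement *elem;
 *
 *         while ((elem = vu_queue_pop(dev, vq, sizeof(*elem)))) {
 *             size_t written = handle_one_request(elem->out_sg, elem->out_num,
 *                                                 elem->in_sg, elem->in_num);
 *             vu_queue_push(dev, vq, elem, written);
 *             free(elem);
 *         }
 *         vu_queue_notify(dev, vq);
 *     }
 */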