1 /* 2 * Vhost User library 3 * 4 * Copyright IBM, Corp. 2007 5 * Copyright (c) 2016 Red Hat, Inc. 6 * 7 * Authors: 8 * Anthony Liguori <aliguori@us.ibm.com> 9 * Marc-André Lureau <mlureau@redhat.com> 10 * Victor Kaplansky <victork@redhat.com> 11 * 12 * This work is licensed under the terms of the GNU GPL, version 2 or 13 * later. See the COPYING file in the top-level directory. 14 */ 15 16 /* this code avoids GLib dependency */ 17 #include <stdlib.h> 18 #include <stdio.h> 19 #include <unistd.h> 20 #include <stdarg.h> 21 #include <errno.h> 22 #include <string.h> 23 #include <assert.h> 24 #include <inttypes.h> 25 #include <sys/types.h> 26 #include <sys/socket.h> 27 #include <sys/eventfd.h> 28 #include <sys/mman.h> 29 #include <endian.h> 30 31 #if defined(__linux__) 32 #include <sys/syscall.h> 33 #include <fcntl.h> 34 #include <sys/ioctl.h> 35 #include <linux/vhost.h> 36 37 #ifdef __NR_userfaultfd 38 #include <linux/userfaultfd.h> 39 #endif 40 41 #endif 42 43 #include "include/atomic.h" 44 45 #include "libvhost-user.h" 46 47 /* usually provided by GLib */ 48 #ifndef MIN 49 #define MIN(x, y) ({ \ 50 typeof(x) _min1 = (x); \ 51 typeof(y) _min2 = (y); \ 52 (void) (&_min1 == &_min2); \ 53 _min1 < _min2 ? _min1 : _min2; }) 54 #endif 55 56 /* Round number down to multiple */ 57 #define ALIGN_DOWN(n, m) ((n) / (m) * (m)) 58 59 /* Round number up to multiple */ 60 #define ALIGN_UP(n, m) ALIGN_DOWN((n) + (m) - 1, (m)) 61 62 #ifndef unlikely 63 #define unlikely(x) __builtin_expect(!!(x), 0) 64 #endif 65 66 /* Align each region to cache line size in inflight buffer */ 67 #define INFLIGHT_ALIGNMENT 64 68 69 /* The version of inflight buffer */ 70 #define INFLIGHT_VERSION 1 71 72 /* The version of the protocol we support */ 73 #define VHOST_USER_VERSION 1 74 #define LIBVHOST_USER_DEBUG 0 75 76 #define DPRINT(...) 
\ 77 do { \ 78 if (LIBVHOST_USER_DEBUG) { \ 79 fprintf(stderr, __VA_ARGS__); \ 80 } \ 81 } while (0) 82 83 static inline 84 bool has_feature(uint64_t features, unsigned int fbit) 85 { 86 assert(fbit < 64); 87 return !!(features & (1ULL << fbit)); 88 } 89 90 static inline 91 bool vu_has_feature(VuDev *dev, 92 unsigned int fbit) 93 { 94 return has_feature(dev->features, fbit); 95 } 96 97 static inline bool vu_has_protocol_feature(VuDev *dev, unsigned int fbit) 98 { 99 return has_feature(dev->protocol_features, fbit); 100 } 101 102 static const char * 103 vu_request_to_string(unsigned int req) 104 { 105 #define REQ(req) [req] = #req 106 static const char *vu_request_str[] = { 107 REQ(VHOST_USER_NONE), 108 REQ(VHOST_USER_GET_FEATURES), 109 REQ(VHOST_USER_SET_FEATURES), 110 REQ(VHOST_USER_SET_OWNER), 111 REQ(VHOST_USER_RESET_OWNER), 112 REQ(VHOST_USER_SET_MEM_TABLE), 113 REQ(VHOST_USER_SET_LOG_BASE), 114 REQ(VHOST_USER_SET_LOG_FD), 115 REQ(VHOST_USER_SET_VRING_NUM), 116 REQ(VHOST_USER_SET_VRING_ADDR), 117 REQ(VHOST_USER_SET_VRING_BASE), 118 REQ(VHOST_USER_GET_VRING_BASE), 119 REQ(VHOST_USER_SET_VRING_KICK), 120 REQ(VHOST_USER_SET_VRING_CALL), 121 REQ(VHOST_USER_SET_VRING_ERR), 122 REQ(VHOST_USER_GET_PROTOCOL_FEATURES), 123 REQ(VHOST_USER_SET_PROTOCOL_FEATURES), 124 REQ(VHOST_USER_GET_QUEUE_NUM), 125 REQ(VHOST_USER_SET_VRING_ENABLE), 126 REQ(VHOST_USER_SEND_RARP), 127 REQ(VHOST_USER_NET_SET_MTU), 128 REQ(VHOST_USER_SET_SLAVE_REQ_FD), 129 REQ(VHOST_USER_IOTLB_MSG), 130 REQ(VHOST_USER_SET_VRING_ENDIAN), 131 REQ(VHOST_USER_GET_CONFIG), 132 REQ(VHOST_USER_SET_CONFIG), 133 REQ(VHOST_USER_POSTCOPY_ADVISE), 134 REQ(VHOST_USER_POSTCOPY_LISTEN), 135 REQ(VHOST_USER_POSTCOPY_END), 136 REQ(VHOST_USER_GET_INFLIGHT_FD), 137 REQ(VHOST_USER_SET_INFLIGHT_FD), 138 REQ(VHOST_USER_GPU_SET_SOCKET), 139 REQ(VHOST_USER_VRING_KICK), 140 REQ(VHOST_USER_GET_MAX_MEM_SLOTS), 141 REQ(VHOST_USER_ADD_MEM_REG), 142 REQ(VHOST_USER_REM_MEM_REG), 143 REQ(VHOST_USER_MAX), 144 }; 145 #undef REQ 146 147 if (req < VHOST_USER_MAX) { 148 return vu_request_str[req]; 149 } else { 150 return "unknown"; 151 } 152 } 153 154 static void 155 vu_panic(VuDev *dev, const char *msg, ...) 156 { 157 char *buf = NULL; 158 va_list ap; 159 160 va_start(ap, msg); 161 if (vasprintf(&buf, msg, ap) < 0) { 162 buf = NULL; 163 } 164 va_end(ap); 165 166 dev->broken = true; 167 dev->panic(dev, buf); 168 free(buf); 169 170 /* 171 * FIXME: 172 * find a way to call virtio_error, or perhaps close the connection? 173 */ 174 } 175 176 /* Translate guest physical address to our virtual address. */ 177 void * 178 vu_gpa_to_va(VuDev *dev, uint64_t *plen, uint64_t guest_addr) 179 { 180 int i; 181 182 if (*plen == 0) { 183 return NULL; 184 } 185 186 /* Find matching memory region. */ 187 for (i = 0; i < dev->nregions; i++) { 188 VuDevRegion *r = &dev->regions[i]; 189 190 if ((guest_addr >= r->gpa) && (guest_addr < (r->gpa + r->size))) { 191 if ((guest_addr + *plen) > (r->gpa + r->size)) { 192 *plen = r->gpa + r->size - guest_addr; 193 } 194 return (void *)(uintptr_t) 195 guest_addr - r->gpa + r->mmap_addr + r->mmap_offset; 196 } 197 } 198 199 return NULL; 200 } 201 202 /* Translate qemu virtual address to our virtual address. */ 203 static void * 204 qva_to_va(VuDev *dev, uint64_t qemu_addr) 205 { 206 int i; 207 208 /* Find matching memory region. 
*/ 209 for (i = 0; i < dev->nregions; i++) { 210 VuDevRegion *r = &dev->regions[i]; 211 212 if ((qemu_addr >= r->qva) && (qemu_addr < (r->qva + r->size))) { 213 return (void *)(uintptr_t) 214 qemu_addr - r->qva + r->mmap_addr + r->mmap_offset; 215 } 216 } 217 218 return NULL; 219 } 220 221 static void 222 vmsg_close_fds(VhostUserMsg *vmsg) 223 { 224 int i; 225 226 for (i = 0; i < vmsg->fd_num; i++) { 227 close(vmsg->fds[i]); 228 } 229 } 230 231 /* Set reply payload.u64 and clear request flags and fd_num */ 232 static void vmsg_set_reply_u64(VhostUserMsg *vmsg, uint64_t val) 233 { 234 vmsg->flags = 0; /* defaults will be set by vu_send_reply() */ 235 vmsg->size = sizeof(vmsg->payload.u64); 236 vmsg->payload.u64 = val; 237 vmsg->fd_num = 0; 238 } 239 240 /* A test to see if we have userfault available */ 241 static bool 242 have_userfault(void) 243 { 244 #if defined(__linux__) && defined(__NR_userfaultfd) &&\ 245 defined(UFFD_FEATURE_MISSING_SHMEM) &&\ 246 defined(UFFD_FEATURE_MISSING_HUGETLBFS) 247 /* Now test the kernel we're running on really has the features */ 248 int ufd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); 249 struct uffdio_api api_struct; 250 if (ufd < 0) { 251 return false; 252 } 253 254 api_struct.api = UFFD_API; 255 api_struct.features = UFFD_FEATURE_MISSING_SHMEM | 256 UFFD_FEATURE_MISSING_HUGETLBFS; 257 if (ioctl(ufd, UFFDIO_API, &api_struct)) { 258 close(ufd); 259 return false; 260 } 261 close(ufd); 262 return true; 263 264 #else 265 return false; 266 #endif 267 } 268 269 static bool 270 vu_message_read_default(VuDev *dev, int conn_fd, VhostUserMsg *vmsg) 271 { 272 char control[CMSG_SPACE(VHOST_MEMORY_BASELINE_NREGIONS * sizeof(int))] = {}; 273 struct iovec iov = { 274 .iov_base = (char *)vmsg, 275 .iov_len = VHOST_USER_HDR_SIZE, 276 }; 277 struct msghdr msg = { 278 .msg_iov = &iov, 279 .msg_iovlen = 1, 280 .msg_control = control, 281 .msg_controllen = sizeof(control), 282 }; 283 size_t fd_size; 284 struct cmsghdr *cmsg; 285 int rc; 286 287 do { 288 rc = recvmsg(conn_fd, &msg, 0); 289 } while (rc < 0 && (errno == EINTR || errno == EAGAIN)); 290 291 if (rc < 0) { 292 vu_panic(dev, "Error while recvmsg: %s", strerror(errno)); 293 return false; 294 } 295 296 vmsg->fd_num = 0; 297 for (cmsg = CMSG_FIRSTHDR(&msg); 298 cmsg != NULL; 299 cmsg = CMSG_NXTHDR(&msg, cmsg)) 300 { 301 if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) { 302 fd_size = cmsg->cmsg_len - CMSG_LEN(0); 303 vmsg->fd_num = fd_size / sizeof(int); 304 memcpy(vmsg->fds, CMSG_DATA(cmsg), fd_size); 305 break; 306 } 307 } 308 309 if (vmsg->size > sizeof(vmsg->payload)) { 310 vu_panic(dev, 311 "Error: too big message request: %d, size: vmsg->size: %u, " 312 "while sizeof(vmsg->payload) = %zu\n", 313 vmsg->request, vmsg->size, sizeof(vmsg->payload)); 314 goto fail; 315 } 316 317 if (vmsg->size) { 318 do { 319 rc = read(conn_fd, &vmsg->payload, vmsg->size); 320 } while (rc < 0 && (errno == EINTR || errno == EAGAIN)); 321 322 if (rc <= 0) { 323 vu_panic(dev, "Error while reading: %s", strerror(errno)); 324 goto fail; 325 } 326 327 assert(rc == vmsg->size); 328 } 329 330 return true; 331 332 fail: 333 vmsg_close_fds(vmsg); 334 335 return false; 336 } 337 338 static bool 339 vu_message_write(VuDev *dev, int conn_fd, VhostUserMsg *vmsg) 340 { 341 int rc; 342 uint8_t *p = (uint8_t *)vmsg; 343 char control[CMSG_SPACE(VHOST_MEMORY_BASELINE_NREGIONS * sizeof(int))] = {}; 344 struct iovec iov = { 345 .iov_base = (char *)vmsg, 346 .iov_len = VHOST_USER_HDR_SIZE, 347 }; 348 struct msghdr msg = { 349 
.msg_iov = &iov, 350 .msg_iovlen = 1, 351 .msg_control = control, 352 }; 353 struct cmsghdr *cmsg; 354 355 memset(control, 0, sizeof(control)); 356 assert(vmsg->fd_num <= VHOST_MEMORY_BASELINE_NREGIONS); 357 if (vmsg->fd_num > 0) { 358 size_t fdsize = vmsg->fd_num * sizeof(int); 359 msg.msg_controllen = CMSG_SPACE(fdsize); 360 cmsg = CMSG_FIRSTHDR(&msg); 361 cmsg->cmsg_len = CMSG_LEN(fdsize); 362 cmsg->cmsg_level = SOL_SOCKET; 363 cmsg->cmsg_type = SCM_RIGHTS; 364 memcpy(CMSG_DATA(cmsg), vmsg->fds, fdsize); 365 } else { 366 msg.msg_controllen = 0; 367 } 368 369 do { 370 rc = sendmsg(conn_fd, &msg, 0); 371 } while (rc < 0 && (errno == EINTR || errno == EAGAIN)); 372 373 if (vmsg->size) { 374 do { 375 if (vmsg->data) { 376 rc = write(conn_fd, vmsg->data, vmsg->size); 377 } else { 378 rc = write(conn_fd, p + VHOST_USER_HDR_SIZE, vmsg->size); 379 } 380 } while (rc < 0 && (errno == EINTR || errno == EAGAIN)); 381 } 382 383 if (rc <= 0) { 384 vu_panic(dev, "Error while writing: %s", strerror(errno)); 385 return false; 386 } 387 388 return true; 389 } 390 391 static bool 392 vu_send_reply(VuDev *dev, int conn_fd, VhostUserMsg *vmsg) 393 { 394 /* Set the version in the flags when sending the reply */ 395 vmsg->flags &= ~VHOST_USER_VERSION_MASK; 396 vmsg->flags |= VHOST_USER_VERSION; 397 vmsg->flags |= VHOST_USER_REPLY_MASK; 398 399 return vu_message_write(dev, conn_fd, vmsg); 400 } 401 402 /* 403 * Processes a reply on the slave channel. 404 * Entered with slave_mutex held and releases it before exit. 405 * Returns true on success. 406 */ 407 static bool 408 vu_process_message_reply(VuDev *dev, const VhostUserMsg *vmsg) 409 { 410 VhostUserMsg msg_reply; 411 bool result = false; 412 413 if ((vmsg->flags & VHOST_USER_NEED_REPLY_MASK) == 0) { 414 result = true; 415 goto out; 416 } 417 418 if (!vu_message_read_default(dev, dev->slave_fd, &msg_reply)) { 419 goto out; 420 } 421 422 if (msg_reply.request != vmsg->request) { 423 DPRINT("Received unexpected msg type. Expected %d received %d", 424 vmsg->request, msg_reply.request); 425 goto out; 426 } 427 428 result = msg_reply.payload.u64 == 0; 429 430 out: 431 pthread_mutex_unlock(&dev->slave_mutex); 432 return result; 433 } 434 435 /* Kick the log_call_fd if required. 
*/ 436 static void 437 vu_log_kick(VuDev *dev) 438 { 439 if (dev->log_call_fd != -1) { 440 DPRINT("Kicking the QEMU's log...\n"); 441 if (eventfd_write(dev->log_call_fd, 1) < 0) { 442 vu_panic(dev, "Error writing eventfd: %s", strerror(errno)); 443 } 444 } 445 } 446 447 static void 448 vu_log_page(uint8_t *log_table, uint64_t page) 449 { 450 DPRINT("Logged dirty guest page: %"PRId64"\n", page); 451 qatomic_or(&log_table[page / 8], 1 << (page % 8)); 452 } 453 454 static void 455 vu_log_write(VuDev *dev, uint64_t address, uint64_t length) 456 { 457 uint64_t page; 458 459 if (!(dev->features & (1ULL << VHOST_F_LOG_ALL)) || 460 !dev->log_table || !length) { 461 return; 462 } 463 464 assert(dev->log_size > ((address + length - 1) / VHOST_LOG_PAGE / 8)); 465 466 page = address / VHOST_LOG_PAGE; 467 while (page * VHOST_LOG_PAGE < address + length) { 468 vu_log_page(dev->log_table, page); 469 page += 1; 470 } 471 472 vu_log_kick(dev); 473 } 474 475 static void 476 vu_kick_cb(VuDev *dev, int condition, void *data) 477 { 478 int index = (intptr_t)data; 479 VuVirtq *vq = &dev->vq[index]; 480 int sock = vq->kick_fd; 481 eventfd_t kick_data; 482 ssize_t rc; 483 484 rc = eventfd_read(sock, &kick_data); 485 if (rc == -1) { 486 vu_panic(dev, "kick eventfd_read(): %s", strerror(errno)); 487 dev->remove_watch(dev, dev->vq[index].kick_fd); 488 } else { 489 DPRINT("Got kick_data: %016"PRIx64" handler:%p idx:%d\n", 490 kick_data, vq->handler, index); 491 if (vq->handler) { 492 vq->handler(dev, index); 493 } 494 } 495 } 496 497 static bool 498 vu_get_features_exec(VuDev *dev, VhostUserMsg *vmsg) 499 { 500 vmsg->payload.u64 = 501 /* 502 * The following VIRTIO feature bits are supported by our virtqueue 503 * implementation: 504 */ 505 1ULL << VIRTIO_F_NOTIFY_ON_EMPTY | 506 1ULL << VIRTIO_RING_F_INDIRECT_DESC | 507 1ULL << VIRTIO_RING_F_EVENT_IDX | 508 1ULL << VIRTIO_F_VERSION_1 | 509 510 /* vhost-user feature bits */ 511 1ULL << VHOST_F_LOG_ALL | 512 1ULL << VHOST_USER_F_PROTOCOL_FEATURES; 513 514 if (dev->iface->get_features) { 515 vmsg->payload.u64 |= dev->iface->get_features(dev); 516 } 517 518 vmsg->size = sizeof(vmsg->payload.u64); 519 vmsg->fd_num = 0; 520 521 DPRINT("Sending back to guest u64: 0x%016"PRIx64"\n", vmsg->payload.u64); 522 523 return true; 524 } 525 526 static void 527 vu_set_enable_all_rings(VuDev *dev, bool enabled) 528 { 529 uint16_t i; 530 531 for (i = 0; i < dev->max_queues; i++) { 532 dev->vq[i].enable = enabled; 533 } 534 } 535 536 static bool 537 vu_set_features_exec(VuDev *dev, VhostUserMsg *vmsg) 538 { 539 DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64); 540 541 dev->features = vmsg->payload.u64; 542 if (!vu_has_feature(dev, VIRTIO_F_VERSION_1)) { 543 /* 544 * We only support devices conforming to VIRTIO 1.0 or 545 * later 546 */ 547 vu_panic(dev, "virtio legacy devices aren't supported by libvhost-user"); 548 return false; 549 } 550 551 if (!(dev->features & VHOST_USER_F_PROTOCOL_FEATURES)) { 552 vu_set_enable_all_rings(dev, true); 553 } 554 555 if (dev->iface->set_features) { 556 dev->iface->set_features(dev, dev->features); 557 } 558 559 return false; 560 } 561 562 static bool 563 vu_set_owner_exec(VuDev *dev, VhostUserMsg *vmsg) 564 { 565 return false; 566 } 567 568 static void 569 vu_close_log(VuDev *dev) 570 { 571 if (dev->log_table) { 572 if (munmap(dev->log_table, dev->log_size) != 0) { 573 perror("close log munmap() error"); 574 } 575 576 dev->log_table = NULL; 577 } 578 if (dev->log_call_fd != -1) { 579 close(dev->log_call_fd); 580 dev->log_call_fd = -1; 581 } 582 } 583 
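/*
 * Message handler convention (a note added for readability; it restates what
 * vu_dispatch()/vu_process_message() below already do): each vu_*_exec()
 * handler returns true when it has rewritten vmsg into a reply that
 * vu_dispatch() must send back to the master, and false when no reply payload
 * is needed (vu_dispatch() still generates a plain REPLY_ACK if the master set
 * VHOST_USER_NEED_REPLY_MASK).  map_ring() is the odd one out: it returns true
 * on *failure*, i.e. when one of the ring addresses cannot be translated by
 * qva_to_va().
 */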
static bool
vu_reset_device_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    vu_set_enable_all_rings(dev, false);

    return false;
}

static bool
map_ring(VuDev *dev, VuVirtq *vq)
{
    vq->vring.desc = qva_to_va(dev, vq->vra.desc_user_addr);
    vq->vring.used = qva_to_va(dev, vq->vra.used_user_addr);
    vq->vring.avail = qva_to_va(dev, vq->vra.avail_user_addr);

    DPRINT("Setting virtq addresses:\n");
    DPRINT(" vring_desc at %p\n", vq->vring.desc);
    DPRINT(" vring_used at %p\n", vq->vring.used);
    DPRINT(" vring_avail at %p\n", vq->vring.avail);

    return !(vq->vring.desc && vq->vring.used && vq->vring.avail);
}

static bool
generate_faults(VuDev *dev) {
    int i;
    for (i = 0; i < dev->nregions; i++) {
        VuDevRegion *dev_region = &dev->regions[i];
        int ret;
#ifdef UFFDIO_REGISTER
        /*
         * We should already have an open ufd. Mark each memory
         * range as ufd.
         * Discard any mapping we have here; note I can't use MADV_REMOVE
         * or fallocate to make the hole since I don't want to lose
         * data that's already arrived in the shared process.
         * TODO: How to do hugepage
         */
        ret = madvise((void *)(uintptr_t)dev_region->mmap_addr,
                      dev_region->size + dev_region->mmap_offset,
                      MADV_DONTNEED);
        if (ret) {
            fprintf(stderr,
                    "%s: Failed to madvise(DONTNEED) region %d: %s\n",
                    __func__, i, strerror(errno));
        }
        /*
         * Turn off transparent hugepages so we don't lose wakeups
         * in neighbouring pages.
         * TODO: Turn this back on later.
         */
        ret = madvise((void *)(uintptr_t)dev_region->mmap_addr,
                      dev_region->size + dev_region->mmap_offset,
                      MADV_NOHUGEPAGE);
        if (ret) {
            /*
             * Note: This can happen legally on kernels that are configured
             * without madvise'able hugepages
             */
            fprintf(stderr,
                    "%s: Failed to madvise(NOHUGEPAGE) region %d: %s\n",
                    __func__, i, strerror(errno));
        }
        struct uffdio_register reg_struct;
        reg_struct.range.start = (uintptr_t)dev_region->mmap_addr;
        reg_struct.range.len = dev_region->size + dev_region->mmap_offset;
        reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;

        if (ioctl(dev->postcopy_ufd, UFFDIO_REGISTER, &reg_struct)) {
            vu_panic(dev, "%s: Failed to userfault region %d "
                          "@%p + size:%zx offset: %zx: (ufd=%d)%s\n",
                     __func__, i,
                     dev_region->mmap_addr,
                     dev_region->size, dev_region->mmap_offset,
                     dev->postcopy_ufd, strerror(errno));
            return false;
        }
        if (!(reg_struct.ioctls & ((__u64)1 << _UFFDIO_COPY))) {
            vu_panic(dev, "%s Region (%d) doesn't support COPY",
                     __func__, i);
            return false;
        }
        DPRINT("%s: region %d: Registered userfault for %"
               PRIx64 " + %" PRIx64 "\n", __func__, i,
               (uint64_t)reg_struct.range.start,
               (uint64_t)reg_struct.range.len);
        /* Now it's registered we can let the client at it */
        if (mprotect((void *)(uintptr_t)dev_region->mmap_addr,
                     dev_region->size + dev_region->mmap_offset,
                     PROT_READ | PROT_WRITE)) {
            vu_panic(dev, "failed to mprotect region %d for postcopy (%s)",
                     i, strerror(errno));
            return false;
        }
        /* TODO: Stash 'zero' support flags somewhere */
#endif
    }

    return true;
}

static bool
vu_add_mem_reg(VuDev *dev, VhostUserMsg *vmsg) {
    int i;
    bool track_ramblocks = dev->postcopy_listening;
    VhostUserMemoryRegion m = vmsg->payload.memreg.region, *msg_region = &m;
    VuDevRegion *dev_region = &dev->regions[dev->nregions];
    void
*mmap_addr; 692 693 /* 694 * If we are in postcopy mode and we receive a u64 payload with a 0 value 695 * we know all the postcopy client bases have been received, and we 696 * should start generating faults. 697 */ 698 if (track_ramblocks && 699 vmsg->size == sizeof(vmsg->payload.u64) && 700 vmsg->payload.u64 == 0) { 701 (void)generate_faults(dev); 702 return false; 703 } 704 705 DPRINT("Adding region: %u\n", dev->nregions); 706 DPRINT(" guest_phys_addr: 0x%016"PRIx64"\n", 707 msg_region->guest_phys_addr); 708 DPRINT(" memory_size: 0x%016"PRIx64"\n", 709 msg_region->memory_size); 710 DPRINT(" userspace_addr 0x%016"PRIx64"\n", 711 msg_region->userspace_addr); 712 DPRINT(" mmap_offset 0x%016"PRIx64"\n", 713 msg_region->mmap_offset); 714 715 dev_region->gpa = msg_region->guest_phys_addr; 716 dev_region->size = msg_region->memory_size; 717 dev_region->qva = msg_region->userspace_addr; 718 dev_region->mmap_offset = msg_region->mmap_offset; 719 720 /* 721 * We don't use offset argument of mmap() since the 722 * mapped address has to be page aligned, and we use huge 723 * pages. 724 */ 725 if (track_ramblocks) { 726 /* 727 * In postcopy we're using PROT_NONE here to catch anyone 728 * accessing it before we userfault. 729 */ 730 mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset, 731 PROT_NONE, MAP_SHARED, 732 vmsg->fds[0], 0); 733 } else { 734 mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset, 735 PROT_READ | PROT_WRITE, MAP_SHARED, vmsg->fds[0], 736 0); 737 } 738 739 if (mmap_addr == MAP_FAILED) { 740 vu_panic(dev, "region mmap error: %s", strerror(errno)); 741 } else { 742 dev_region->mmap_addr = (uint64_t)(uintptr_t)mmap_addr; 743 DPRINT(" mmap_addr: 0x%016"PRIx64"\n", 744 dev_region->mmap_addr); 745 } 746 747 close(vmsg->fds[0]); 748 749 if (track_ramblocks) { 750 /* 751 * Return the address to QEMU so that it can translate the ufd 752 * fault addresses back. 753 */ 754 msg_region->userspace_addr = (uintptr_t)(mmap_addr + 755 dev_region->mmap_offset); 756 757 /* Send the message back to qemu with the addresses filled in. 
*/ 758 vmsg->fd_num = 0; 759 if (!vu_send_reply(dev, dev->sock, vmsg)) { 760 vu_panic(dev, "failed to respond to add-mem-region for postcopy"); 761 return false; 762 } 763 764 DPRINT("Successfully added new region in postcopy\n"); 765 dev->nregions++; 766 return false; 767 768 } else { 769 for (i = 0; i < dev->max_queues; i++) { 770 if (dev->vq[i].vring.desc) { 771 if (map_ring(dev, &dev->vq[i])) { 772 vu_panic(dev, "remapping queue %d for new memory region", 773 i); 774 } 775 } 776 } 777 778 DPRINT("Successfully added new region\n"); 779 dev->nregions++; 780 vmsg_set_reply_u64(vmsg, 0); 781 return true; 782 } 783 } 784 785 static inline bool reg_equal(VuDevRegion *vudev_reg, 786 VhostUserMemoryRegion *msg_reg) 787 { 788 if (vudev_reg->gpa == msg_reg->guest_phys_addr && 789 vudev_reg->qva == msg_reg->userspace_addr && 790 vudev_reg->size == msg_reg->memory_size) { 791 return true; 792 } 793 794 return false; 795 } 796 797 static bool 798 vu_rem_mem_reg(VuDev *dev, VhostUserMsg *vmsg) { 799 int i, j; 800 bool found = false; 801 VuDevRegion shadow_regions[VHOST_USER_MAX_RAM_SLOTS] = {}; 802 VhostUserMemoryRegion m = vmsg->payload.memreg.region, *msg_region = &m; 803 804 DPRINT("Removing region:\n"); 805 DPRINT(" guest_phys_addr: 0x%016"PRIx64"\n", 806 msg_region->guest_phys_addr); 807 DPRINT(" memory_size: 0x%016"PRIx64"\n", 808 msg_region->memory_size); 809 DPRINT(" userspace_addr 0x%016"PRIx64"\n", 810 msg_region->userspace_addr); 811 DPRINT(" mmap_offset 0x%016"PRIx64"\n", 812 msg_region->mmap_offset); 813 814 for (i = 0, j = 0; i < dev->nregions; i++) { 815 if (!reg_equal(&dev->regions[i], msg_region)) { 816 shadow_regions[j].gpa = dev->regions[i].gpa; 817 shadow_regions[j].size = dev->regions[i].size; 818 shadow_regions[j].qva = dev->regions[i].qva; 819 shadow_regions[j].mmap_offset = dev->regions[i].mmap_offset; 820 j++; 821 } else { 822 found = true; 823 VuDevRegion *r = &dev->regions[i]; 824 void *m = (void *) (uintptr_t) r->mmap_addr; 825 826 if (m) { 827 munmap(m, r->size + r->mmap_offset); 828 } 829 } 830 } 831 832 if (found) { 833 memcpy(dev->regions, shadow_regions, 834 sizeof(VuDevRegion) * VHOST_USER_MAX_RAM_SLOTS); 835 DPRINT("Successfully removed a region\n"); 836 dev->nregions--; 837 vmsg_set_reply_u64(vmsg, 0); 838 } else { 839 vu_panic(dev, "Specified region not found\n"); 840 } 841 842 return true; 843 } 844 845 static bool 846 vu_set_mem_table_exec_postcopy(VuDev *dev, VhostUserMsg *vmsg) 847 { 848 int i; 849 VhostUserMemory m = vmsg->payload.memory, *memory = &m; 850 dev->nregions = memory->nregions; 851 852 DPRINT("Nregions: %u\n", memory->nregions); 853 for (i = 0; i < dev->nregions; i++) { 854 void *mmap_addr; 855 VhostUserMemoryRegion *msg_region = &memory->regions[i]; 856 VuDevRegion *dev_region = &dev->regions[i]; 857 858 DPRINT("Region %d\n", i); 859 DPRINT(" guest_phys_addr: 0x%016"PRIx64"\n", 860 msg_region->guest_phys_addr); 861 DPRINT(" memory_size: 0x%016"PRIx64"\n", 862 msg_region->memory_size); 863 DPRINT(" userspace_addr 0x%016"PRIx64"\n", 864 msg_region->userspace_addr); 865 DPRINT(" mmap_offset 0x%016"PRIx64"\n", 866 msg_region->mmap_offset); 867 868 dev_region->gpa = msg_region->guest_phys_addr; 869 dev_region->size = msg_region->memory_size; 870 dev_region->qva = msg_region->userspace_addr; 871 dev_region->mmap_offset = msg_region->mmap_offset; 872 873 /* We don't use offset argument of mmap() since the 874 * mapped address has to be page aligned, and we use huge 875 * pages. 
876 * In postcopy we're using PROT_NONE here to catch anyone 877 * accessing it before we userfault 878 */ 879 mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset, 880 PROT_NONE, MAP_SHARED, 881 vmsg->fds[i], 0); 882 883 if (mmap_addr == MAP_FAILED) { 884 vu_panic(dev, "region mmap error: %s", strerror(errno)); 885 } else { 886 dev_region->mmap_addr = (uint64_t)(uintptr_t)mmap_addr; 887 DPRINT(" mmap_addr: 0x%016"PRIx64"\n", 888 dev_region->mmap_addr); 889 } 890 891 /* Return the address to QEMU so that it can translate the ufd 892 * fault addresses back. 893 */ 894 msg_region->userspace_addr = (uintptr_t)(mmap_addr + 895 dev_region->mmap_offset); 896 close(vmsg->fds[i]); 897 } 898 899 /* Send the message back to qemu with the addresses filled in */ 900 vmsg->fd_num = 0; 901 if (!vu_send_reply(dev, dev->sock, vmsg)) { 902 vu_panic(dev, "failed to respond to set-mem-table for postcopy"); 903 return false; 904 } 905 906 /* Wait for QEMU to confirm that it's registered the handler for the 907 * faults. 908 */ 909 if (!dev->read_msg(dev, dev->sock, vmsg) || 910 vmsg->size != sizeof(vmsg->payload.u64) || 911 vmsg->payload.u64 != 0) { 912 vu_panic(dev, "failed to receive valid ack for postcopy set-mem-table"); 913 return false; 914 } 915 916 /* OK, now we can go and register the memory and generate faults */ 917 (void)generate_faults(dev); 918 919 return false; 920 } 921 922 static bool 923 vu_set_mem_table_exec(VuDev *dev, VhostUserMsg *vmsg) 924 { 925 int i; 926 VhostUserMemory m = vmsg->payload.memory, *memory = &m; 927 928 for (i = 0; i < dev->nregions; i++) { 929 VuDevRegion *r = &dev->regions[i]; 930 void *m = (void *) (uintptr_t) r->mmap_addr; 931 932 if (m) { 933 munmap(m, r->size + r->mmap_offset); 934 } 935 } 936 dev->nregions = memory->nregions; 937 938 if (dev->postcopy_listening) { 939 return vu_set_mem_table_exec_postcopy(dev, vmsg); 940 } 941 942 DPRINT("Nregions: %u\n", memory->nregions); 943 for (i = 0; i < dev->nregions; i++) { 944 void *mmap_addr; 945 VhostUserMemoryRegion *msg_region = &memory->regions[i]; 946 VuDevRegion *dev_region = &dev->regions[i]; 947 948 DPRINT("Region %d\n", i); 949 DPRINT(" guest_phys_addr: 0x%016"PRIx64"\n", 950 msg_region->guest_phys_addr); 951 DPRINT(" memory_size: 0x%016"PRIx64"\n", 952 msg_region->memory_size); 953 DPRINT(" userspace_addr 0x%016"PRIx64"\n", 954 msg_region->userspace_addr); 955 DPRINT(" mmap_offset 0x%016"PRIx64"\n", 956 msg_region->mmap_offset); 957 958 dev_region->gpa = msg_region->guest_phys_addr; 959 dev_region->size = msg_region->memory_size; 960 dev_region->qva = msg_region->userspace_addr; 961 dev_region->mmap_offset = msg_region->mmap_offset; 962 963 /* We don't use offset argument of mmap() since the 964 * mapped address has to be page aligned, and we use huge 965 * pages. 
*/ 966 mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset, 967 PROT_READ | PROT_WRITE, MAP_SHARED, 968 vmsg->fds[i], 0); 969 970 if (mmap_addr == MAP_FAILED) { 971 vu_panic(dev, "region mmap error: %s", strerror(errno)); 972 } else { 973 dev_region->mmap_addr = (uint64_t)(uintptr_t)mmap_addr; 974 DPRINT(" mmap_addr: 0x%016"PRIx64"\n", 975 dev_region->mmap_addr); 976 } 977 978 close(vmsg->fds[i]); 979 } 980 981 for (i = 0; i < dev->max_queues; i++) { 982 if (dev->vq[i].vring.desc) { 983 if (map_ring(dev, &dev->vq[i])) { 984 vu_panic(dev, "remapping queue %d during setmemtable", i); 985 } 986 } 987 } 988 989 return false; 990 } 991 992 static bool 993 vu_set_log_base_exec(VuDev *dev, VhostUserMsg *vmsg) 994 { 995 int fd; 996 uint64_t log_mmap_size, log_mmap_offset; 997 void *rc; 998 999 if (vmsg->fd_num != 1 || 1000 vmsg->size != sizeof(vmsg->payload.log)) { 1001 vu_panic(dev, "Invalid log_base message"); 1002 return true; 1003 } 1004 1005 fd = vmsg->fds[0]; 1006 log_mmap_offset = vmsg->payload.log.mmap_offset; 1007 log_mmap_size = vmsg->payload.log.mmap_size; 1008 DPRINT("Log mmap_offset: %"PRId64"\n", log_mmap_offset); 1009 DPRINT("Log mmap_size: %"PRId64"\n", log_mmap_size); 1010 1011 rc = mmap(0, log_mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 1012 log_mmap_offset); 1013 close(fd); 1014 if (rc == MAP_FAILED) { 1015 perror("log mmap error"); 1016 } 1017 1018 if (dev->log_table) { 1019 munmap(dev->log_table, dev->log_size); 1020 } 1021 dev->log_table = rc; 1022 dev->log_size = log_mmap_size; 1023 1024 vmsg->size = sizeof(vmsg->payload.u64); 1025 vmsg->fd_num = 0; 1026 1027 return true; 1028 } 1029 1030 static bool 1031 vu_set_log_fd_exec(VuDev *dev, VhostUserMsg *vmsg) 1032 { 1033 if (vmsg->fd_num != 1) { 1034 vu_panic(dev, "Invalid log_fd message"); 1035 return false; 1036 } 1037 1038 if (dev->log_call_fd != -1) { 1039 close(dev->log_call_fd); 1040 } 1041 dev->log_call_fd = vmsg->fds[0]; 1042 DPRINT("Got log_call_fd: %d\n", vmsg->fds[0]); 1043 1044 return false; 1045 } 1046 1047 static bool 1048 vu_set_vring_num_exec(VuDev *dev, VhostUserMsg *vmsg) 1049 { 1050 unsigned int index = vmsg->payload.state.index; 1051 unsigned int num = vmsg->payload.state.num; 1052 1053 DPRINT("State.index: %u\n", index); 1054 DPRINT("State.num: %u\n", num); 1055 dev->vq[index].vring.num = num; 1056 1057 return false; 1058 } 1059 1060 static bool 1061 vu_set_vring_addr_exec(VuDev *dev, VhostUserMsg *vmsg) 1062 { 1063 struct vhost_vring_addr addr = vmsg->payload.addr, *vra = &addr; 1064 unsigned int index = vra->index; 1065 VuVirtq *vq = &dev->vq[index]; 1066 1067 DPRINT("vhost_vring_addr:\n"); 1068 DPRINT(" index: %d\n", vra->index); 1069 DPRINT(" flags: %d\n", vra->flags); 1070 DPRINT(" desc_user_addr: 0x%016" PRIx64 "\n", (uint64_t)vra->desc_user_addr); 1071 DPRINT(" used_user_addr: 0x%016" PRIx64 "\n", (uint64_t)vra->used_user_addr); 1072 DPRINT(" avail_user_addr: 0x%016" PRIx64 "\n", (uint64_t)vra->avail_user_addr); 1073 DPRINT(" log_guest_addr: 0x%016" PRIx64 "\n", (uint64_t)vra->log_guest_addr); 1074 1075 vq->vra = *vra; 1076 vq->vring.flags = vra->flags; 1077 vq->vring.log_guest_addr = vra->log_guest_addr; 1078 1079 1080 if (map_ring(dev, vq)) { 1081 vu_panic(dev, "Invalid vring_addr message"); 1082 return false; 1083 } 1084 1085 vq->used_idx = le16toh(vq->vring.used->idx); 1086 1087 if (vq->last_avail_idx != vq->used_idx) { 1088 bool resume = dev->iface->queue_is_processed_in_order && 1089 dev->iface->queue_is_processed_in_order(dev, index); 1090 1091 DPRINT("Last avail index != used 
index: %u != %u%s\n", 1092 vq->last_avail_idx, vq->used_idx, 1093 resume ? ", resuming" : ""); 1094 1095 if (resume) { 1096 vq->shadow_avail_idx = vq->last_avail_idx = vq->used_idx; 1097 } 1098 } 1099 1100 return false; 1101 } 1102 1103 static bool 1104 vu_set_vring_base_exec(VuDev *dev, VhostUserMsg *vmsg) 1105 { 1106 unsigned int index = vmsg->payload.state.index; 1107 unsigned int num = vmsg->payload.state.num; 1108 1109 DPRINT("State.index: %u\n", index); 1110 DPRINT("State.num: %u\n", num); 1111 dev->vq[index].shadow_avail_idx = dev->vq[index].last_avail_idx = num; 1112 1113 return false; 1114 } 1115 1116 static bool 1117 vu_get_vring_base_exec(VuDev *dev, VhostUserMsg *vmsg) 1118 { 1119 unsigned int index = vmsg->payload.state.index; 1120 1121 DPRINT("State.index: %u\n", index); 1122 vmsg->payload.state.num = dev->vq[index].last_avail_idx; 1123 vmsg->size = sizeof(vmsg->payload.state); 1124 1125 dev->vq[index].started = false; 1126 if (dev->iface->queue_set_started) { 1127 dev->iface->queue_set_started(dev, index, false); 1128 } 1129 1130 if (dev->vq[index].call_fd != -1) { 1131 close(dev->vq[index].call_fd); 1132 dev->vq[index].call_fd = -1; 1133 } 1134 if (dev->vq[index].kick_fd != -1) { 1135 dev->remove_watch(dev, dev->vq[index].kick_fd); 1136 close(dev->vq[index].kick_fd); 1137 dev->vq[index].kick_fd = -1; 1138 } 1139 1140 return true; 1141 } 1142 1143 static bool 1144 vu_check_queue_msg_file(VuDev *dev, VhostUserMsg *vmsg) 1145 { 1146 int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; 1147 bool nofd = vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK; 1148 1149 if (index >= dev->max_queues) { 1150 vmsg_close_fds(vmsg); 1151 vu_panic(dev, "Invalid queue index: %u", index); 1152 return false; 1153 } 1154 1155 if (nofd) { 1156 vmsg_close_fds(vmsg); 1157 return true; 1158 } 1159 1160 if (vmsg->fd_num != 1) { 1161 vmsg_close_fds(vmsg); 1162 vu_panic(dev, "Invalid fds in request: %d", vmsg->request); 1163 return false; 1164 } 1165 1166 return true; 1167 } 1168 1169 static int 1170 inflight_desc_compare(const void *a, const void *b) 1171 { 1172 VuVirtqInflightDesc *desc0 = (VuVirtqInflightDesc *)a, 1173 *desc1 = (VuVirtqInflightDesc *)b; 1174 1175 if (desc1->counter > desc0->counter && 1176 (desc1->counter - desc0->counter) < VIRTQUEUE_MAX_SIZE * 2) { 1177 return 1; 1178 } 1179 1180 return -1; 1181 } 1182 1183 static int 1184 vu_check_queue_inflights(VuDev *dev, VuVirtq *vq) 1185 { 1186 int i = 0; 1187 1188 if (!vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) { 1189 return 0; 1190 } 1191 1192 if (unlikely(!vq->inflight)) { 1193 return -1; 1194 } 1195 1196 if (unlikely(!vq->inflight->version)) { 1197 /* initialize the buffer */ 1198 vq->inflight->version = INFLIGHT_VERSION; 1199 return 0; 1200 } 1201 1202 vq->used_idx = le16toh(vq->vring.used->idx); 1203 vq->resubmit_num = 0; 1204 vq->resubmit_list = NULL; 1205 vq->counter = 0; 1206 1207 if (unlikely(vq->inflight->used_idx != vq->used_idx)) { 1208 vq->inflight->desc[vq->inflight->last_batch_head].inflight = 0; 1209 1210 barrier(); 1211 1212 vq->inflight->used_idx = vq->used_idx; 1213 } 1214 1215 for (i = 0; i < vq->inflight->desc_num; i++) { 1216 if (vq->inflight->desc[i].inflight == 1) { 1217 vq->inuse++; 1218 } 1219 } 1220 1221 vq->shadow_avail_idx = vq->last_avail_idx = vq->inuse + vq->used_idx; 1222 1223 if (vq->inuse) { 1224 vq->resubmit_list = calloc(vq->inuse, sizeof(VuVirtqInflightDesc)); 1225 if (!vq->resubmit_list) { 1226 return -1; 1227 } 1228 1229 for (i = 0; i < vq->inflight->desc_num; i++) { 1230 
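            /*
             * Collect every descriptor that was still marked in flight when
             * the connection went away; the list is sorted by submission
             * counter below so the backend can resubmit the requests in
             * order after reconnecting.
             */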
if (vq->inflight->desc[i].inflight) { 1231 vq->resubmit_list[vq->resubmit_num].index = i; 1232 vq->resubmit_list[vq->resubmit_num].counter = 1233 vq->inflight->desc[i].counter; 1234 vq->resubmit_num++; 1235 } 1236 } 1237 1238 if (vq->resubmit_num > 1) { 1239 qsort(vq->resubmit_list, vq->resubmit_num, 1240 sizeof(VuVirtqInflightDesc), inflight_desc_compare); 1241 } 1242 vq->counter = vq->resubmit_list[0].counter + 1; 1243 } 1244 1245 /* in case of I/O hang after reconnecting */ 1246 if (eventfd_write(vq->kick_fd, 1)) { 1247 return -1; 1248 } 1249 1250 return 0; 1251 } 1252 1253 static bool 1254 vu_set_vring_kick_exec(VuDev *dev, VhostUserMsg *vmsg) 1255 { 1256 int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; 1257 bool nofd = vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK; 1258 1259 DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64); 1260 1261 if (!vu_check_queue_msg_file(dev, vmsg)) { 1262 return false; 1263 } 1264 1265 if (dev->vq[index].kick_fd != -1) { 1266 dev->remove_watch(dev, dev->vq[index].kick_fd); 1267 close(dev->vq[index].kick_fd); 1268 dev->vq[index].kick_fd = -1; 1269 } 1270 1271 dev->vq[index].kick_fd = nofd ? -1 : vmsg->fds[0]; 1272 DPRINT("Got kick_fd: %d for vq: %d\n", dev->vq[index].kick_fd, index); 1273 1274 dev->vq[index].started = true; 1275 if (dev->iface->queue_set_started) { 1276 dev->iface->queue_set_started(dev, index, true); 1277 } 1278 1279 if (dev->vq[index].kick_fd != -1 && dev->vq[index].handler) { 1280 dev->set_watch(dev, dev->vq[index].kick_fd, VU_WATCH_IN, 1281 vu_kick_cb, (void *)(long)index); 1282 1283 DPRINT("Waiting for kicks on fd: %d for vq: %d\n", 1284 dev->vq[index].kick_fd, index); 1285 } 1286 1287 if (vu_check_queue_inflights(dev, &dev->vq[index])) { 1288 vu_panic(dev, "Failed to check inflights for vq: %d\n", index); 1289 } 1290 1291 return false; 1292 } 1293 1294 void vu_set_queue_handler(VuDev *dev, VuVirtq *vq, 1295 vu_queue_handler_cb handler) 1296 { 1297 int qidx = vq - dev->vq; 1298 1299 vq->handler = handler; 1300 if (vq->kick_fd >= 0) { 1301 if (handler) { 1302 dev->set_watch(dev, vq->kick_fd, VU_WATCH_IN, 1303 vu_kick_cb, (void *)(long)qidx); 1304 } else { 1305 dev->remove_watch(dev, vq->kick_fd); 1306 } 1307 } 1308 } 1309 1310 bool vu_set_queue_host_notifier(VuDev *dev, VuVirtq *vq, int fd, 1311 int size, int offset) 1312 { 1313 int qidx = vq - dev->vq; 1314 int fd_num = 0; 1315 VhostUserMsg vmsg = { 1316 .request = VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG, 1317 .flags = VHOST_USER_VERSION | VHOST_USER_NEED_REPLY_MASK, 1318 .size = sizeof(vmsg.payload.area), 1319 .payload.area = { 1320 .u64 = qidx & VHOST_USER_VRING_IDX_MASK, 1321 .size = size, 1322 .offset = offset, 1323 }, 1324 }; 1325 1326 if (fd == -1) { 1327 vmsg.payload.area.u64 |= VHOST_USER_VRING_NOFD_MASK; 1328 } else { 1329 vmsg.fds[fd_num++] = fd; 1330 } 1331 1332 vmsg.fd_num = fd_num; 1333 1334 if (!vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD)) { 1335 return false; 1336 } 1337 1338 pthread_mutex_lock(&dev->slave_mutex); 1339 if (!vu_message_write(dev, dev->slave_fd, &vmsg)) { 1340 pthread_mutex_unlock(&dev->slave_mutex); 1341 return false; 1342 } 1343 1344 /* Also unlocks the slave_mutex */ 1345 return vu_process_message_reply(dev, &vmsg); 1346 } 1347 1348 static bool 1349 vu_set_vring_call_exec(VuDev *dev, VhostUserMsg *vmsg) 1350 { 1351 int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; 1352 bool nofd = vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK; 1353 1354 DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64); 1355 1356 if 
(!vu_check_queue_msg_file(dev, vmsg)) { 1357 return false; 1358 } 1359 1360 if (dev->vq[index].call_fd != -1) { 1361 close(dev->vq[index].call_fd); 1362 dev->vq[index].call_fd = -1; 1363 } 1364 1365 dev->vq[index].call_fd = nofd ? -1 : vmsg->fds[0]; 1366 1367 /* in case of I/O hang after reconnecting */ 1368 if (dev->vq[index].call_fd != -1 && eventfd_write(vmsg->fds[0], 1)) { 1369 return -1; 1370 } 1371 1372 DPRINT("Got call_fd: %d for vq: %d\n", dev->vq[index].call_fd, index); 1373 1374 return false; 1375 } 1376 1377 static bool 1378 vu_set_vring_err_exec(VuDev *dev, VhostUserMsg *vmsg) 1379 { 1380 int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; 1381 bool nofd = vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK; 1382 1383 DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64); 1384 1385 if (!vu_check_queue_msg_file(dev, vmsg)) { 1386 return false; 1387 } 1388 1389 if (dev->vq[index].err_fd != -1) { 1390 close(dev->vq[index].err_fd); 1391 dev->vq[index].err_fd = -1; 1392 } 1393 1394 dev->vq[index].err_fd = nofd ? -1 : vmsg->fds[0]; 1395 1396 return false; 1397 } 1398 1399 static bool 1400 vu_get_protocol_features_exec(VuDev *dev, VhostUserMsg *vmsg) 1401 { 1402 /* 1403 * Note that we support, but intentionally do not set, 1404 * VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS. This means that 1405 * a device implementation can return it in its callback 1406 * (get_protocol_features) if it wants to use this for 1407 * simulation, but it is otherwise not desirable (if even 1408 * implemented by the master.) 1409 */ 1410 uint64_t features = 1ULL << VHOST_USER_PROTOCOL_F_MQ | 1411 1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD | 1412 1ULL << VHOST_USER_PROTOCOL_F_SLAVE_REQ | 1413 1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER | 1414 1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD | 1415 1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK | 1416 1ULL << VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS; 1417 1418 if (have_userfault()) { 1419 features |= 1ULL << VHOST_USER_PROTOCOL_F_PAGEFAULT; 1420 } 1421 1422 if (dev->iface->get_config && dev->iface->set_config) { 1423 features |= 1ULL << VHOST_USER_PROTOCOL_F_CONFIG; 1424 } 1425 1426 if (dev->iface->get_protocol_features) { 1427 features |= dev->iface->get_protocol_features(dev); 1428 } 1429 1430 vmsg_set_reply_u64(vmsg, features); 1431 return true; 1432 } 1433 1434 static bool 1435 vu_set_protocol_features_exec(VuDev *dev, VhostUserMsg *vmsg) 1436 { 1437 uint64_t features = vmsg->payload.u64; 1438 1439 DPRINT("u64: 0x%016"PRIx64"\n", features); 1440 1441 dev->protocol_features = vmsg->payload.u64; 1442 1443 if (vu_has_protocol_feature(dev, 1444 VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS) && 1445 (!vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_SLAVE_REQ) || 1446 !vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_REPLY_ACK))) { 1447 /* 1448 * The use case for using messages for kick/call is simulation, to make 1449 * the kick and call synchronous. To actually get that behaviour, both 1450 * of the other features are required. 1451 * Theoretically, one could use only kick messages, or do them without 1452 * having F_REPLY_ACK, but too many (possibly pending) messages on the 1453 * socket will eventually cause the master to hang, to avoid this in 1454 * scenarios where not desired enforce that the settings are in a way 1455 * that actually enables the simulation case. 
1456 */ 1457 vu_panic(dev, 1458 "F_IN_BAND_NOTIFICATIONS requires F_SLAVE_REQ && F_REPLY_ACK"); 1459 return false; 1460 } 1461 1462 if (dev->iface->set_protocol_features) { 1463 dev->iface->set_protocol_features(dev, features); 1464 } 1465 1466 return false; 1467 } 1468 1469 static bool 1470 vu_get_queue_num_exec(VuDev *dev, VhostUserMsg *vmsg) 1471 { 1472 vmsg_set_reply_u64(vmsg, dev->max_queues); 1473 return true; 1474 } 1475 1476 static bool 1477 vu_set_vring_enable_exec(VuDev *dev, VhostUserMsg *vmsg) 1478 { 1479 unsigned int index = vmsg->payload.state.index; 1480 unsigned int enable = vmsg->payload.state.num; 1481 1482 DPRINT("State.index: %u\n", index); 1483 DPRINT("State.enable: %u\n", enable); 1484 1485 if (index >= dev->max_queues) { 1486 vu_panic(dev, "Invalid vring_enable index: %u", index); 1487 return false; 1488 } 1489 1490 dev->vq[index].enable = enable; 1491 return false; 1492 } 1493 1494 static bool 1495 vu_set_slave_req_fd(VuDev *dev, VhostUserMsg *vmsg) 1496 { 1497 if (vmsg->fd_num != 1) { 1498 vu_panic(dev, "Invalid slave_req_fd message (%d fd's)", vmsg->fd_num); 1499 return false; 1500 } 1501 1502 if (dev->slave_fd != -1) { 1503 close(dev->slave_fd); 1504 } 1505 dev->slave_fd = vmsg->fds[0]; 1506 DPRINT("Got slave_fd: %d\n", vmsg->fds[0]); 1507 1508 return false; 1509 } 1510 1511 static bool 1512 vu_get_config(VuDev *dev, VhostUserMsg *vmsg) 1513 { 1514 int ret = -1; 1515 1516 if (dev->iface->get_config) { 1517 ret = dev->iface->get_config(dev, vmsg->payload.config.region, 1518 vmsg->payload.config.size); 1519 } 1520 1521 if (ret) { 1522 /* resize to zero to indicate an error to master */ 1523 vmsg->size = 0; 1524 } 1525 1526 return true; 1527 } 1528 1529 static bool 1530 vu_set_config(VuDev *dev, VhostUserMsg *vmsg) 1531 { 1532 int ret = -1; 1533 1534 if (dev->iface->set_config) { 1535 ret = dev->iface->set_config(dev, vmsg->payload.config.region, 1536 vmsg->payload.config.offset, 1537 vmsg->payload.config.size, 1538 vmsg->payload.config.flags); 1539 if (ret) { 1540 vu_panic(dev, "Set virtio configuration space failed"); 1541 } 1542 } 1543 1544 return false; 1545 } 1546 1547 static bool 1548 vu_set_postcopy_advise(VuDev *dev, VhostUserMsg *vmsg) 1549 { 1550 dev->postcopy_ufd = -1; 1551 #ifdef UFFDIO_API 1552 struct uffdio_api api_struct; 1553 1554 dev->postcopy_ufd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); 1555 vmsg->size = 0; 1556 #endif 1557 1558 if (dev->postcopy_ufd == -1) { 1559 vu_panic(dev, "Userfaultfd not available: %s", strerror(errno)); 1560 goto out; 1561 } 1562 1563 #ifdef UFFDIO_API 1564 api_struct.api = UFFD_API; 1565 api_struct.features = 0; 1566 if (ioctl(dev->postcopy_ufd, UFFDIO_API, &api_struct)) { 1567 vu_panic(dev, "Failed UFFDIO_API: %s", strerror(errno)); 1568 close(dev->postcopy_ufd); 1569 dev->postcopy_ufd = -1; 1570 goto out; 1571 } 1572 /* TODO: Stash feature flags somewhere */ 1573 #endif 1574 1575 out: 1576 /* Return a ufd to the QEMU */ 1577 vmsg->fd_num = 1; 1578 vmsg->fds[0] = dev->postcopy_ufd; 1579 return true; /* = send a reply */ 1580 } 1581 1582 static bool 1583 vu_set_postcopy_listen(VuDev *dev, VhostUserMsg *vmsg) 1584 { 1585 if (dev->nregions) { 1586 vu_panic(dev, "Regions already registered at postcopy-listen"); 1587 vmsg_set_reply_u64(vmsg, -1); 1588 return true; 1589 } 1590 dev->postcopy_listening = true; 1591 1592 vmsg_set_reply_u64(vmsg, 0); 1593 return true; 1594 } 1595 1596 static bool 1597 vu_set_postcopy_end(VuDev *dev, VhostUserMsg *vmsg) 1598 { 1599 DPRINT("%s: Entry\n", __func__); 1600 
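    /*
     * Postcopy migration is over: stop treating new regions as postcopy
     * ranges and release the userfaultfd that was handed out in
     * VHOST_USER_POSTCOPY_ADVISE.
     */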
dev->postcopy_listening = false; 1601 if (dev->postcopy_ufd > 0) { 1602 close(dev->postcopy_ufd); 1603 dev->postcopy_ufd = -1; 1604 DPRINT("%s: Done close\n", __func__); 1605 } 1606 1607 vmsg_set_reply_u64(vmsg, 0); 1608 DPRINT("%s: exit\n", __func__); 1609 return true; 1610 } 1611 1612 static inline uint64_t 1613 vu_inflight_queue_size(uint16_t queue_size) 1614 { 1615 return ALIGN_UP(sizeof(VuDescStateSplit) * queue_size + 1616 sizeof(uint16_t), INFLIGHT_ALIGNMENT); 1617 } 1618 1619 #ifdef MFD_ALLOW_SEALING 1620 static void * 1621 memfd_alloc(const char *name, size_t size, unsigned int flags, int *fd) 1622 { 1623 void *ptr; 1624 int ret; 1625 1626 *fd = memfd_create(name, MFD_ALLOW_SEALING); 1627 if (*fd < 0) { 1628 return NULL; 1629 } 1630 1631 ret = ftruncate(*fd, size); 1632 if (ret < 0) { 1633 close(*fd); 1634 return NULL; 1635 } 1636 1637 ret = fcntl(*fd, F_ADD_SEALS, flags); 1638 if (ret < 0) { 1639 close(*fd); 1640 return NULL; 1641 } 1642 1643 ptr = mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED, *fd, 0); 1644 if (ptr == MAP_FAILED) { 1645 close(*fd); 1646 return NULL; 1647 } 1648 1649 return ptr; 1650 } 1651 #endif 1652 1653 static bool 1654 vu_get_inflight_fd(VuDev *dev, VhostUserMsg *vmsg) 1655 { 1656 int fd = -1; 1657 void *addr = NULL; 1658 uint64_t mmap_size; 1659 uint16_t num_queues, queue_size; 1660 1661 if (vmsg->size != sizeof(vmsg->payload.inflight)) { 1662 vu_panic(dev, "Invalid get_inflight_fd message:%d", vmsg->size); 1663 vmsg->payload.inflight.mmap_size = 0; 1664 return true; 1665 } 1666 1667 num_queues = vmsg->payload.inflight.num_queues; 1668 queue_size = vmsg->payload.inflight.queue_size; 1669 1670 DPRINT("set_inflight_fd num_queues: %"PRId16"\n", num_queues); 1671 DPRINT("set_inflight_fd queue_size: %"PRId16"\n", queue_size); 1672 1673 mmap_size = vu_inflight_queue_size(queue_size) * num_queues; 1674 1675 #ifdef MFD_ALLOW_SEALING 1676 addr = memfd_alloc("vhost-inflight", mmap_size, 1677 F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL, 1678 &fd); 1679 #else 1680 vu_panic(dev, "Not implemented: memfd support is missing"); 1681 #endif 1682 1683 if (!addr) { 1684 vu_panic(dev, "Failed to alloc vhost inflight area"); 1685 vmsg->payload.inflight.mmap_size = 0; 1686 return true; 1687 } 1688 1689 memset(addr, 0, mmap_size); 1690 1691 dev->inflight_info.addr = addr; 1692 dev->inflight_info.size = vmsg->payload.inflight.mmap_size = mmap_size; 1693 dev->inflight_info.fd = vmsg->fds[0] = fd; 1694 vmsg->fd_num = 1; 1695 vmsg->payload.inflight.mmap_offset = 0; 1696 1697 DPRINT("send inflight mmap_size: %"PRId64"\n", 1698 vmsg->payload.inflight.mmap_size); 1699 DPRINT("send inflight mmap offset: %"PRId64"\n", 1700 vmsg->payload.inflight.mmap_offset); 1701 1702 return true; 1703 } 1704 1705 static bool 1706 vu_set_inflight_fd(VuDev *dev, VhostUserMsg *vmsg) 1707 { 1708 int fd, i; 1709 uint64_t mmap_size, mmap_offset; 1710 uint16_t num_queues, queue_size; 1711 void *rc; 1712 1713 if (vmsg->fd_num != 1 || 1714 vmsg->size != sizeof(vmsg->payload.inflight)) { 1715 vu_panic(dev, "Invalid set_inflight_fd message size:%d fds:%d", 1716 vmsg->size, vmsg->fd_num); 1717 return false; 1718 } 1719 1720 fd = vmsg->fds[0]; 1721 mmap_size = vmsg->payload.inflight.mmap_size; 1722 mmap_offset = vmsg->payload.inflight.mmap_offset; 1723 num_queues = vmsg->payload.inflight.num_queues; 1724 queue_size = vmsg->payload.inflight.queue_size; 1725 1726 DPRINT("set_inflight_fd mmap_size: %"PRId64"\n", mmap_size); 1727 DPRINT("set_inflight_fd mmap_offset: %"PRId64"\n", mmap_offset); 1728 
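    /*
     * Map the inflight buffer shared by the master and hand each queue its
     * slice: vu_inflight_queue_size(queue_size) bytes per queue, assigned
     * consecutively in the loop below.
     */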
DPRINT("set_inflight_fd num_queues: %"PRId16"\n", num_queues); 1729 DPRINT("set_inflight_fd queue_size: %"PRId16"\n", queue_size); 1730 1731 rc = mmap(0, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, 1732 fd, mmap_offset); 1733 1734 if (rc == MAP_FAILED) { 1735 vu_panic(dev, "set_inflight_fd mmap error: %s", strerror(errno)); 1736 return false; 1737 } 1738 1739 if (dev->inflight_info.fd) { 1740 close(dev->inflight_info.fd); 1741 } 1742 1743 if (dev->inflight_info.addr) { 1744 munmap(dev->inflight_info.addr, dev->inflight_info.size); 1745 } 1746 1747 dev->inflight_info.fd = fd; 1748 dev->inflight_info.addr = rc; 1749 dev->inflight_info.size = mmap_size; 1750 1751 for (i = 0; i < num_queues; i++) { 1752 dev->vq[i].inflight = (VuVirtqInflight *)rc; 1753 dev->vq[i].inflight->desc_num = queue_size; 1754 rc = (void *)((char *)rc + vu_inflight_queue_size(queue_size)); 1755 } 1756 1757 return false; 1758 } 1759 1760 static bool 1761 vu_handle_vring_kick(VuDev *dev, VhostUserMsg *vmsg) 1762 { 1763 unsigned int index = vmsg->payload.state.index; 1764 1765 if (index >= dev->max_queues) { 1766 vu_panic(dev, "Invalid queue index: %u", index); 1767 return false; 1768 } 1769 1770 DPRINT("Got kick message: handler:%p idx:%u\n", 1771 dev->vq[index].handler, index); 1772 1773 if (!dev->vq[index].started) { 1774 dev->vq[index].started = true; 1775 1776 if (dev->iface->queue_set_started) { 1777 dev->iface->queue_set_started(dev, index, true); 1778 } 1779 } 1780 1781 if (dev->vq[index].handler) { 1782 dev->vq[index].handler(dev, index); 1783 } 1784 1785 return false; 1786 } 1787 1788 static bool vu_handle_get_max_memslots(VuDev *dev, VhostUserMsg *vmsg) 1789 { 1790 vmsg->flags = VHOST_USER_REPLY_MASK | VHOST_USER_VERSION; 1791 vmsg->size = sizeof(vmsg->payload.u64); 1792 vmsg->payload.u64 = VHOST_USER_MAX_RAM_SLOTS; 1793 vmsg->fd_num = 0; 1794 1795 if (!vu_message_write(dev, dev->sock, vmsg)) { 1796 vu_panic(dev, "Failed to send max ram slots: %s\n", strerror(errno)); 1797 } 1798 1799 DPRINT("u64: 0x%016"PRIx64"\n", (uint64_t) VHOST_USER_MAX_RAM_SLOTS); 1800 1801 return false; 1802 } 1803 1804 static bool 1805 vu_process_message(VuDev *dev, VhostUserMsg *vmsg) 1806 { 1807 int do_reply = 0; 1808 1809 /* Print out generic part of the request. 
*/ 1810 DPRINT("================ Vhost user message ================\n"); 1811 DPRINT("Request: %s (%d)\n", vu_request_to_string(vmsg->request), 1812 vmsg->request); 1813 DPRINT("Flags: 0x%x\n", vmsg->flags); 1814 DPRINT("Size: %u\n", vmsg->size); 1815 1816 if (vmsg->fd_num) { 1817 int i; 1818 DPRINT("Fds:"); 1819 for (i = 0; i < vmsg->fd_num; i++) { 1820 DPRINT(" %d", vmsg->fds[i]); 1821 } 1822 DPRINT("\n"); 1823 } 1824 1825 if (dev->iface->process_msg && 1826 dev->iface->process_msg(dev, vmsg, &do_reply)) { 1827 return do_reply; 1828 } 1829 1830 switch (vmsg->request) { 1831 case VHOST_USER_GET_FEATURES: 1832 return vu_get_features_exec(dev, vmsg); 1833 case VHOST_USER_SET_FEATURES: 1834 return vu_set_features_exec(dev, vmsg); 1835 case VHOST_USER_GET_PROTOCOL_FEATURES: 1836 return vu_get_protocol_features_exec(dev, vmsg); 1837 case VHOST_USER_SET_PROTOCOL_FEATURES: 1838 return vu_set_protocol_features_exec(dev, vmsg); 1839 case VHOST_USER_SET_OWNER: 1840 return vu_set_owner_exec(dev, vmsg); 1841 case VHOST_USER_RESET_OWNER: 1842 return vu_reset_device_exec(dev, vmsg); 1843 case VHOST_USER_SET_MEM_TABLE: 1844 return vu_set_mem_table_exec(dev, vmsg); 1845 case VHOST_USER_SET_LOG_BASE: 1846 return vu_set_log_base_exec(dev, vmsg); 1847 case VHOST_USER_SET_LOG_FD: 1848 return vu_set_log_fd_exec(dev, vmsg); 1849 case VHOST_USER_SET_VRING_NUM: 1850 return vu_set_vring_num_exec(dev, vmsg); 1851 case VHOST_USER_SET_VRING_ADDR: 1852 return vu_set_vring_addr_exec(dev, vmsg); 1853 case VHOST_USER_SET_VRING_BASE: 1854 return vu_set_vring_base_exec(dev, vmsg); 1855 case VHOST_USER_GET_VRING_BASE: 1856 return vu_get_vring_base_exec(dev, vmsg); 1857 case VHOST_USER_SET_VRING_KICK: 1858 return vu_set_vring_kick_exec(dev, vmsg); 1859 case VHOST_USER_SET_VRING_CALL: 1860 return vu_set_vring_call_exec(dev, vmsg); 1861 case VHOST_USER_SET_VRING_ERR: 1862 return vu_set_vring_err_exec(dev, vmsg); 1863 case VHOST_USER_GET_QUEUE_NUM: 1864 return vu_get_queue_num_exec(dev, vmsg); 1865 case VHOST_USER_SET_VRING_ENABLE: 1866 return vu_set_vring_enable_exec(dev, vmsg); 1867 case VHOST_USER_SET_SLAVE_REQ_FD: 1868 return vu_set_slave_req_fd(dev, vmsg); 1869 case VHOST_USER_GET_CONFIG: 1870 return vu_get_config(dev, vmsg); 1871 case VHOST_USER_SET_CONFIG: 1872 return vu_set_config(dev, vmsg); 1873 case VHOST_USER_NONE: 1874 /* if you need processing before exit, override iface->process_msg */ 1875 exit(0); 1876 case VHOST_USER_POSTCOPY_ADVISE: 1877 return vu_set_postcopy_advise(dev, vmsg); 1878 case VHOST_USER_POSTCOPY_LISTEN: 1879 return vu_set_postcopy_listen(dev, vmsg); 1880 case VHOST_USER_POSTCOPY_END: 1881 return vu_set_postcopy_end(dev, vmsg); 1882 case VHOST_USER_GET_INFLIGHT_FD: 1883 return vu_get_inflight_fd(dev, vmsg); 1884 case VHOST_USER_SET_INFLIGHT_FD: 1885 return vu_set_inflight_fd(dev, vmsg); 1886 case VHOST_USER_VRING_KICK: 1887 return vu_handle_vring_kick(dev, vmsg); 1888 case VHOST_USER_GET_MAX_MEM_SLOTS: 1889 return vu_handle_get_max_memslots(dev, vmsg); 1890 case VHOST_USER_ADD_MEM_REG: 1891 return vu_add_mem_reg(dev, vmsg); 1892 case VHOST_USER_REM_MEM_REG: 1893 return vu_rem_mem_reg(dev, vmsg); 1894 default: 1895 vmsg_close_fds(vmsg); 1896 vu_panic(dev, "Unhandled request: %d", vmsg->request); 1897 } 1898 1899 return false; 1900 } 1901 1902 bool 1903 vu_dispatch(VuDev *dev) 1904 { 1905 VhostUserMsg vmsg = { 0, }; 1906 int reply_requested; 1907 bool need_reply, success = false; 1908 1909 if (!dev->read_msg(dev, dev->sock, &vmsg)) { 1910 goto end; 1911 } 1912 1913 need_reply = vmsg.flags & 
VHOST_USER_NEED_REPLY_MASK; 1914 1915 reply_requested = vu_process_message(dev, &vmsg); 1916 if (!reply_requested && need_reply) { 1917 vmsg_set_reply_u64(&vmsg, 0); 1918 reply_requested = 1; 1919 } 1920 1921 if (!reply_requested) { 1922 success = true; 1923 goto end; 1924 } 1925 1926 if (!vu_send_reply(dev, dev->sock, &vmsg)) { 1927 goto end; 1928 } 1929 1930 success = true; 1931 1932 end: 1933 free(vmsg.data); 1934 return success; 1935 } 1936 1937 void 1938 vu_deinit(VuDev *dev) 1939 { 1940 int i; 1941 1942 for (i = 0; i < dev->nregions; i++) { 1943 VuDevRegion *r = &dev->regions[i]; 1944 void *m = (void *) (uintptr_t) r->mmap_addr; 1945 if (m != MAP_FAILED) { 1946 munmap(m, r->size + r->mmap_offset); 1947 } 1948 } 1949 dev->nregions = 0; 1950 1951 for (i = 0; i < dev->max_queues; i++) { 1952 VuVirtq *vq = &dev->vq[i]; 1953 1954 if (vq->call_fd != -1) { 1955 close(vq->call_fd); 1956 vq->call_fd = -1; 1957 } 1958 1959 if (vq->kick_fd != -1) { 1960 dev->remove_watch(dev, vq->kick_fd); 1961 close(vq->kick_fd); 1962 vq->kick_fd = -1; 1963 } 1964 1965 if (vq->err_fd != -1) { 1966 close(vq->err_fd); 1967 vq->err_fd = -1; 1968 } 1969 1970 if (vq->resubmit_list) { 1971 free(vq->resubmit_list); 1972 vq->resubmit_list = NULL; 1973 } 1974 1975 vq->inflight = NULL; 1976 } 1977 1978 if (dev->inflight_info.addr) { 1979 munmap(dev->inflight_info.addr, dev->inflight_info.size); 1980 dev->inflight_info.addr = NULL; 1981 } 1982 1983 if (dev->inflight_info.fd > 0) { 1984 close(dev->inflight_info.fd); 1985 dev->inflight_info.fd = -1; 1986 } 1987 1988 vu_close_log(dev); 1989 if (dev->slave_fd != -1) { 1990 close(dev->slave_fd); 1991 dev->slave_fd = -1; 1992 } 1993 pthread_mutex_destroy(&dev->slave_mutex); 1994 1995 if (dev->sock != -1) { 1996 close(dev->sock); 1997 } 1998 1999 free(dev->vq); 2000 dev->vq = NULL; 2001 } 2002 2003 bool 2004 vu_init(VuDev *dev, 2005 uint16_t max_queues, 2006 int socket, 2007 vu_panic_cb panic, 2008 vu_read_msg_cb read_msg, 2009 vu_set_watch_cb set_watch, 2010 vu_remove_watch_cb remove_watch, 2011 const VuDevIface *iface) 2012 { 2013 uint16_t i; 2014 2015 assert(max_queues > 0); 2016 assert(socket >= 0); 2017 assert(set_watch); 2018 assert(remove_watch); 2019 assert(iface); 2020 assert(panic); 2021 2022 memset(dev, 0, sizeof(*dev)); 2023 2024 dev->sock = socket; 2025 dev->panic = panic; 2026 dev->read_msg = read_msg ? 
read_msg : vu_message_read_default; 2027 dev->set_watch = set_watch; 2028 dev->remove_watch = remove_watch; 2029 dev->iface = iface; 2030 dev->log_call_fd = -1; 2031 pthread_mutex_init(&dev->slave_mutex, NULL); 2032 dev->slave_fd = -1; 2033 dev->max_queues = max_queues; 2034 2035 dev->vq = malloc(max_queues * sizeof(dev->vq[0])); 2036 if (!dev->vq) { 2037 DPRINT("%s: failed to malloc virtqueues\n", __func__); 2038 return false; 2039 } 2040 2041 for (i = 0; i < max_queues; i++) { 2042 dev->vq[i] = (VuVirtq) { 2043 .call_fd = -1, .kick_fd = -1, .err_fd = -1, 2044 .notification = true, 2045 }; 2046 } 2047 2048 return true; 2049 } 2050 2051 VuVirtq * 2052 vu_get_queue(VuDev *dev, int qidx) 2053 { 2054 assert(qidx < dev->max_queues); 2055 return &dev->vq[qidx]; 2056 } 2057 2058 bool 2059 vu_queue_enabled(VuDev *dev, VuVirtq *vq) 2060 { 2061 return vq->enable; 2062 } 2063 2064 bool 2065 vu_queue_started(const VuDev *dev, const VuVirtq *vq) 2066 { 2067 return vq->started; 2068 } 2069 2070 static inline uint16_t 2071 vring_avail_flags(VuVirtq *vq) 2072 { 2073 return le16toh(vq->vring.avail->flags); 2074 } 2075 2076 static inline uint16_t 2077 vring_avail_idx(VuVirtq *vq) 2078 { 2079 vq->shadow_avail_idx = le16toh(vq->vring.avail->idx); 2080 2081 return vq->shadow_avail_idx; 2082 } 2083 2084 static inline uint16_t 2085 vring_avail_ring(VuVirtq *vq, int i) 2086 { 2087 return le16toh(vq->vring.avail->ring[i]); 2088 } 2089 2090 static inline uint16_t 2091 vring_get_used_event(VuVirtq *vq) 2092 { 2093 return vring_avail_ring(vq, vq->vring.num); 2094 } 2095 2096 static int 2097 virtqueue_num_heads(VuDev *dev, VuVirtq *vq, unsigned int idx) 2098 { 2099 uint16_t num_heads = vring_avail_idx(vq) - idx; 2100 2101 /* Check it isn't doing very strange things with descriptor numbers. */ 2102 if (num_heads > vq->vring.num) { 2103 vu_panic(dev, "Guest moved used index from %u to %u", 2104 idx, vq->shadow_avail_idx); 2105 return -1; 2106 } 2107 if (num_heads) { 2108 /* On success, callers read a descriptor at vq->last_avail_idx. 2109 * Make sure descriptor read does not bypass avail index read. */ 2110 smp_rmb(); 2111 } 2112 2113 return num_heads; 2114 } 2115 2116 static bool 2117 virtqueue_get_head(VuDev *dev, VuVirtq *vq, 2118 unsigned int idx, unsigned int *head) 2119 { 2120 /* Grab the next descriptor number they're advertising, and increment 2121 * the index we've seen. */ 2122 *head = vring_avail_ring(vq, idx % vq->vring.num); 2123 2124 /* If their number is silly, that's a fatal mistake. 
static inline uint16_t
vring_avail_flags(VuVirtq *vq)
{
    return le16toh(vq->vring.avail->flags);
}

static inline uint16_t
vring_avail_idx(VuVirtq *vq)
{
    vq->shadow_avail_idx = le16toh(vq->vring.avail->idx);

    return vq->shadow_avail_idx;
}

static inline uint16_t
vring_avail_ring(VuVirtq *vq, int i)
{
    return le16toh(vq->vring.avail->ring[i]);
}

static inline uint16_t
vring_get_used_event(VuVirtq *vq)
{
    return vring_avail_ring(vq, vq->vring.num);
}

static int
virtqueue_num_heads(VuDev *dev, VuVirtq *vq, unsigned int idx)
{
    uint16_t num_heads = vring_avail_idx(vq) - idx;

    /* Check it isn't doing very strange things with descriptor numbers. */
    if (num_heads > vq->vring.num) {
        vu_panic(dev, "Guest moved used index from %u to %u",
                 idx, vq->shadow_avail_idx);
        return -1;
    }
    if (num_heads) {
        /* On success, callers read a descriptor at vq->last_avail_idx.
         * Make sure descriptor read does not bypass avail index read. */
        smp_rmb();
    }

    return num_heads;
}

static bool
virtqueue_get_head(VuDev *dev, VuVirtq *vq,
                   unsigned int idx, unsigned int *head)
{
    /* Grab the next descriptor number they're advertising, and increment
     * the index we've seen. */
    *head = vring_avail_ring(vq, idx % vq->vring.num);

    /* If their number is silly, that's a fatal mistake. */
    if (*head >= vq->vring.num) {
        vu_panic(dev, "Guest says index %u is available", *head);
        return false;
    }

    return true;
}

static int
virtqueue_read_indirect_desc(VuDev *dev, struct vring_desc *desc,
                             uint64_t addr, size_t len)
{
    struct vring_desc *ori_desc;
    uint64_t read_len;

    if (len > (VIRTQUEUE_MAX_SIZE * sizeof(struct vring_desc))) {
        return -1;
    }

    if (len == 0) {
        return -1;
    }

    while (len) {
        read_len = len;
        ori_desc = vu_gpa_to_va(dev, &read_len, addr);
        if (!ori_desc) {
            return -1;
        }

        memcpy(desc, ori_desc, read_len);
        len -= read_len;
        addr += read_len;
        desc += read_len;
    }

    return 0;
}

enum {
    VIRTQUEUE_READ_DESC_ERROR = -1,
    VIRTQUEUE_READ_DESC_DONE = 0,   /* end of chain */
    VIRTQUEUE_READ_DESC_MORE = 1,   /* more buffers in chain */
};

static int
virtqueue_read_next_desc(VuDev *dev, struct vring_desc *desc,
                         int i, unsigned int max, unsigned int *next)
{
    /* If this descriptor says it doesn't chain, we're done. */
    if (!(le16toh(desc[i].flags) & VRING_DESC_F_NEXT)) {
        return VIRTQUEUE_READ_DESC_DONE;
    }

    /* Check they're not leading us off end of descriptors. */
    *next = le16toh(desc[i].next);
    /* Make sure compiler knows to grab that: we don't want it changing! */
    smp_wmb();

    if (*next >= max) {
        vu_panic(dev, "Desc next is %u", *next);
        return VIRTQUEUE_READ_DESC_ERROR;
    }

    return VIRTQUEUE_READ_DESC_MORE;
}
void
vu_queue_get_avail_bytes(VuDev *dev, VuVirtq *vq, unsigned int *in_bytes,
                         unsigned int *out_bytes,
                         unsigned max_in_bytes, unsigned max_out_bytes)
{
    unsigned int idx;
    unsigned int total_bufs, in_total, out_total;
    int rc;

    idx = vq->last_avail_idx;

    total_bufs = in_total = out_total = 0;
    if (unlikely(dev->broken) ||
        unlikely(!vq->vring.avail)) {
        goto done;
    }

    while ((rc = virtqueue_num_heads(dev, vq, idx)) > 0) {
        unsigned int max, desc_len, num_bufs, indirect = 0;
        uint64_t desc_addr, read_len;
        struct vring_desc *desc;
        struct vring_desc desc_buf[VIRTQUEUE_MAX_SIZE];
        unsigned int i;

        max = vq->vring.num;
        num_bufs = total_bufs;
        if (!virtqueue_get_head(dev, vq, idx++, &i)) {
            goto err;
        }
        desc = vq->vring.desc;

        if (le16toh(desc[i].flags) & VRING_DESC_F_INDIRECT) {
            if (le32toh(desc[i].len) % sizeof(struct vring_desc)) {
                vu_panic(dev, "Invalid size for indirect buffer table");
                goto err;
            }

            /* If we've got too many, that implies a descriptor loop. */
            if (num_bufs >= max) {
                vu_panic(dev, "Looped descriptor");
                goto err;
            }

            /* loop over the indirect descriptor table */
            indirect = 1;
            desc_addr = le64toh(desc[i].addr);
            desc_len = le32toh(desc[i].len);
            max = desc_len / sizeof(struct vring_desc);
            read_len = desc_len;
            desc = vu_gpa_to_va(dev, &read_len, desc_addr);
            if (unlikely(desc && read_len != desc_len)) {
                /* Failed to use zero copy */
                desc = NULL;
                if (!virtqueue_read_indirect_desc(dev, desc_buf,
                                                  desc_addr,
                                                  desc_len)) {
                    desc = desc_buf;
                }
            }
            if (!desc) {
                vu_panic(dev, "Invalid indirect buffer table");
                goto err;
            }
            num_bufs = i = 0;
        }

        do {
            /* If we've got too many, that implies a descriptor loop. */
            if (++num_bufs > max) {
                vu_panic(dev, "Looped descriptor");
                goto err;
            }

            if (le16toh(desc[i].flags) & VRING_DESC_F_WRITE) {
                in_total += le32toh(desc[i].len);
            } else {
                out_total += le32toh(desc[i].len);
            }
            if (in_total >= max_in_bytes && out_total >= max_out_bytes) {
                goto done;
            }
            rc = virtqueue_read_next_desc(dev, desc, i, max, &i);
        } while (rc == VIRTQUEUE_READ_DESC_MORE);

        if (rc == VIRTQUEUE_READ_DESC_ERROR) {
            goto err;
        }

        if (!indirect) {
            total_bufs = num_bufs;
        } else {
            total_bufs++;
        }
    }
    if (rc < 0) {
        goto err;
    }
done:
    if (in_bytes) {
        *in_bytes = in_total;
    }
    if (out_bytes) {
        *out_bytes = out_total;
    }
    return;

err:
    in_total = out_total = 0;
    goto done;
}

bool
vu_queue_avail_bytes(VuDev *dev, VuVirtq *vq, unsigned int in_bytes,
                     unsigned int out_bytes)
{
    unsigned int in_total, out_total;

    vu_queue_get_avail_bytes(dev, vq, &in_total, &out_total,
                             in_bytes, out_bytes);

    return in_bytes <= in_total && out_bytes <= out_total;
}
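/*
 * Editor's illustrative sketch (not part of the library): checking how much
 * guest buffer space the avail ring currently offers before committing to a
 * reply. The byte threshold is hypothetical; vu_queue_avail_bytes() stops
 * counting once both limits are satisfied, so it is cheaper than a full
 * count via vu_queue_get_avail_bytes() with large maxima.
 */
#if 0
static bool example_can_send_reply(VuDev *dev, VuVirtq *vq,
                                   unsigned int reply_len)
{
    /* Need at least reply_len bytes of device-writable ("in") space; this
     * hypothetical reply consumes no device-readable ("out") bytes. */
    return vu_queue_avail_bytes(dev, vq, reply_len, 0);
}
#endif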
/* Fetch avail_idx from VQ memory only when we really need to know if
 * guest has added some buffers. */
bool
vu_queue_empty(VuDev *dev, VuVirtq *vq)
{
    if (unlikely(dev->broken) ||
        unlikely(!vq->vring.avail)) {
        return true;
    }

    if (vq->shadow_avail_idx != vq->last_avail_idx) {
        return false;
    }

    return vring_avail_idx(vq) == vq->last_avail_idx;
}

static bool
vring_notify(VuDev *dev, VuVirtq *vq)
{
    uint16_t old, new;
    bool v;

    /* We need to expose used array entries before checking used event. */
    smp_mb();

    /* Always notify when queue is empty (when feature acknowledge) */
    if (vu_has_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY) &&
        !vq->inuse && vu_queue_empty(dev, vq)) {
        return true;
    }

    if (!vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
        return !(vring_avail_flags(vq) & VRING_AVAIL_F_NO_INTERRUPT);
    }

    v = vq->signalled_used_valid;
    vq->signalled_used_valid = true;
    old = vq->signalled_used;
    new = vq->signalled_used = vq->used_idx;
    return !v || vring_need_event(vring_get_used_event(vq), new, old);
}

static void _vu_queue_notify(VuDev *dev, VuVirtq *vq, bool sync)
{
    if (unlikely(dev->broken) ||
        unlikely(!vq->vring.avail)) {
        return;
    }

    if (!vring_notify(dev, vq)) {
        DPRINT("skipped notify...\n");
        return;
    }

    if (vq->call_fd < 0 &&
        vu_has_protocol_feature(dev,
                                VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS) &&
        vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_SLAVE_REQ)) {
        VhostUserMsg vmsg = {
            .request = VHOST_USER_SLAVE_VRING_CALL,
            .flags = VHOST_USER_VERSION,
            .size = sizeof(vmsg.payload.state),
            .payload.state = {
                .index = vq - dev->vq,
            },
        };
        bool ack = sync &&
                   vu_has_protocol_feature(dev,
                                           VHOST_USER_PROTOCOL_F_REPLY_ACK);

        if (ack) {
            vmsg.flags |= VHOST_USER_NEED_REPLY_MASK;
        }

        vu_message_write(dev, dev->slave_fd, &vmsg);
        if (ack) {
            vu_message_read_default(dev, dev->slave_fd, &vmsg);
        }
        return;
    }

    if (eventfd_write(vq->call_fd, 1) < 0) {
        vu_panic(dev, "Error writing eventfd: %s", strerror(errno));
    }
}

void vu_queue_notify(VuDev *dev, VuVirtq *vq)
{
    _vu_queue_notify(dev, vq, false);
}

void vu_queue_notify_sync(VuDev *dev, VuVirtq *vq)
{
    _vu_queue_notify(dev, vq, true);
}

static inline void
vring_used_flags_set_bit(VuVirtq *vq, int mask)
{
    uint16_t *flags;

    flags = (uint16_t *)((char*)vq->vring.used +
                         offsetof(struct vring_used, flags));
    *flags = htole16(le16toh(*flags) | mask);
}

static inline void
vring_used_flags_unset_bit(VuVirtq *vq, int mask)
{
    uint16_t *flags;

    flags = (uint16_t *)((char*)vq->vring.used +
                         offsetof(struct vring_used, flags));
    *flags = htole16(le16toh(*flags) & ~mask);
}

static inline void
vring_set_avail_event(VuVirtq *vq, uint16_t val)
{
    uint16_t *avail;

    if (!vq->notification) {
        return;
    }

    avail = (uint16_t *)&vq->vring.used->ring[vq->vring.num];
    *avail = htole16(val);
}

void
vu_queue_set_notification(VuDev *dev, VuVirtq *vq, int enable)
{
    vq->notification = enable;
    if (vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
        vring_set_avail_event(vq, vring_avail_idx(vq));
    } else if (enable) {
        vring_used_flags_unset_bit(vq, VRING_USED_F_NO_NOTIFY);
    } else {
        vring_used_flags_set_bit(vq, VRING_USED_F_NO_NOTIFY);
    }
    if (enable) {
        /* Expose avail event/used flags before caller checks the avail idx. */
        smp_mb();
    }
}
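/*
 * Editor's illustrative sketch (not part of the library): the usual
 * kick-suppression pattern built on vu_queue_set_notification() and
 * vu_queue_empty(). process_one() is a hypothetical per-element handler;
 * vu_queue_pop() is declared in libvhost-user.h. Notifications are disabled
 * while draining, then re-enabled and the queue re-checked so that a buffer
 * added in that window is not missed.
 */
#if 0
static void example_drain_queue(VuDev *dev, VuVirtq *vq,
                                void (*process_one)(VuDev *, VuVirtq *,
                                                    VuVirtqElement *))
{
    do {
        VuVirtqElement *elem;

        vu_queue_set_notification(dev, vq, 0);
        while ((elem = vu_queue_pop(dev, vq, sizeof(*elem)))) {
            process_one(dev, vq, elem);
            free(elem);
        }
        vu_queue_set_notification(dev, vq, 1);
    } while (!vu_queue_empty(dev, vq));
}
#endif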
static bool
virtqueue_map_desc(VuDev *dev,
                   unsigned int *p_num_sg, struct iovec *iov,
                   unsigned int max_num_sg, bool is_write,
                   uint64_t pa, size_t sz)
{
    unsigned num_sg = *p_num_sg;

    assert(num_sg <= max_num_sg);

    if (!sz) {
        vu_panic(dev, "virtio: zero sized buffers are not allowed");
        return false;
    }

    while (sz) {
        uint64_t len = sz;

        if (num_sg == max_num_sg) {
            vu_panic(dev, "virtio: too many descriptors in indirect table");
            return false;
        }

        iov[num_sg].iov_base = vu_gpa_to_va(dev, &len, pa);
        if (iov[num_sg].iov_base == NULL) {
            vu_panic(dev, "virtio: invalid address for buffers");
            return false;
        }
        iov[num_sg].iov_len = len;
        num_sg++;
        sz -= len;
        pa += len;
    }

    *p_num_sg = num_sg;
    return true;
}

static void *
virtqueue_alloc_element(size_t sz,
                        unsigned out_num, unsigned in_num)
{
    VuVirtqElement *elem;
    size_t in_sg_ofs = ALIGN_UP(sz, __alignof__(elem->in_sg[0]));
    size_t out_sg_ofs = in_sg_ofs + in_num * sizeof(elem->in_sg[0]);
    size_t out_sg_end = out_sg_ofs + out_num * sizeof(elem->out_sg[0]);

    assert(sz >= sizeof(VuVirtqElement));
    elem = malloc(out_sg_end);
    elem->out_num = out_num;
    elem->in_num = in_num;
    elem->in_sg = (void *)elem + in_sg_ofs;
    elem->out_sg = (void *)elem + out_sg_ofs;
    return elem;
}
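/*
 * Editor's illustrative sketch (not part of the library): because
 * virtqueue_alloc_element() places the scatter lists after the first `sz`
 * bytes of the allocation, a caller may embed VuVirtqElement at the start
 * of a larger, hypothetical per-request struct and pass that struct's size
 * to vu_queue_pop(); the extra space then travels with the element until it
 * is freed.
 */
#if 0
typedef struct ExampleRequest {
    VuVirtqElement elem;    /* must be the first member */
    uint32_t reply_len;     /* hypothetical per-request state */
} ExampleRequest;

static ExampleRequest *example_pop_request(VuDev *dev, VuVirtq *vq)
{
    /* elem.in_sg/out_sg land just past sizeof(ExampleRequest) bytes */
    return vu_queue_pop(dev, vq, sizeof(ExampleRequest));
}
#endif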
static void *
vu_queue_map_desc(VuDev *dev, VuVirtq *vq, unsigned int idx, size_t sz)
{
    struct vring_desc *desc = vq->vring.desc;
    uint64_t desc_addr, read_len;
    unsigned int desc_len;
    unsigned int max = vq->vring.num;
    unsigned int i = idx;
    VuVirtqElement *elem;
    unsigned int out_num = 0, in_num = 0;
    struct iovec iov[VIRTQUEUE_MAX_SIZE];
    struct vring_desc desc_buf[VIRTQUEUE_MAX_SIZE];
    int rc;

    if (le16toh(desc[i].flags) & VRING_DESC_F_INDIRECT) {
        if (le32toh(desc[i].len) % sizeof(struct vring_desc)) {
            vu_panic(dev, "Invalid size for indirect buffer table");
            return NULL;
        }

        /* loop over the indirect descriptor table */
        desc_addr = le64toh(desc[i].addr);
        desc_len = le32toh(desc[i].len);
        max = desc_len / sizeof(struct vring_desc);
        read_len = desc_len;
        desc = vu_gpa_to_va(dev, &read_len, desc_addr);
        if (unlikely(desc && read_len != desc_len)) {
            /* Failed to use zero copy */
            desc = NULL;
            if (!virtqueue_read_indirect_desc(dev, desc_buf,
                                              desc_addr,
                                              desc_len)) {
                desc = desc_buf;
            }
        }
        if (!desc) {
            vu_panic(dev, "Invalid indirect buffer table");
            return NULL;
        }
        i = 0;
    }

    /* Collect all the descriptors */
    do {
        if (le16toh(desc[i].flags) & VRING_DESC_F_WRITE) {
            if (!virtqueue_map_desc(dev, &in_num, iov + out_num,
                                    VIRTQUEUE_MAX_SIZE - out_num, true,
                                    le64toh(desc[i].addr),
                                    le32toh(desc[i].len))) {
                return NULL;
            }
        } else {
            if (in_num) {
                vu_panic(dev, "Incorrect order for descriptors");
                return NULL;
            }
            if (!virtqueue_map_desc(dev, &out_num, iov,
                                    VIRTQUEUE_MAX_SIZE, false,
                                    le64toh(desc[i].addr),
                                    le32toh(desc[i].len))) {
                return NULL;
            }
        }

        /* If we've got too many, that implies a descriptor loop. */
        if ((in_num + out_num) > max) {
            vu_panic(dev, "Looped descriptor");
            return NULL;
        }
        rc = virtqueue_read_next_desc(dev, desc, i, max, &i);
    } while (rc == VIRTQUEUE_READ_DESC_MORE);

    if (rc == VIRTQUEUE_READ_DESC_ERROR) {
        vu_panic(dev, "read descriptor error");
        return NULL;
    }

    /* Now copy what we have collected and mapped */
    elem = virtqueue_alloc_element(sz, out_num, in_num);
    elem->index = idx;
    for (i = 0; i < out_num; i++) {
        elem->out_sg[i] = iov[i];
    }
    for (i = 0; i < in_num; i++) {
        elem->in_sg[i] = iov[out_num + i];
    }

    return elem;
}

static int
vu_queue_inflight_get(VuDev *dev, VuVirtq *vq, int desc_idx)
{
    if (!vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
        return 0;
    }

    if (unlikely(!vq->inflight)) {
        return -1;
    }

    vq->inflight->desc[desc_idx].counter = vq->counter++;
    vq->inflight->desc[desc_idx].inflight = 1;

    return 0;
}

static int
vu_queue_inflight_pre_put(VuDev *dev, VuVirtq *vq, int desc_idx)
{
    if (!vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
        return 0;
    }

    if (unlikely(!vq->inflight)) {
        return -1;
    }

    vq->inflight->last_batch_head = desc_idx;

    return 0;
}

static int
vu_queue_inflight_post_put(VuDev *dev, VuVirtq *vq, int desc_idx)
{
    if (!vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
        return 0;
    }

    if (unlikely(!vq->inflight)) {
        return -1;
    }

    barrier();

    vq->inflight->desc[desc_idx].inflight = 0;

    barrier();

    vq->inflight->used_idx = vq->used_idx;

    return 0;
}
void *
vu_queue_pop(VuDev *dev, VuVirtq *vq, size_t sz)
{
    int i;
    unsigned int head;
    VuVirtqElement *elem;

    if (unlikely(dev->broken) ||
        unlikely(!vq->vring.avail)) {
        return NULL;
    }

    if (unlikely(vq->resubmit_list && vq->resubmit_num > 0)) {
        i = (--vq->resubmit_num);
        elem = vu_queue_map_desc(dev, vq, vq->resubmit_list[i].index, sz);

        if (!vq->resubmit_num) {
            free(vq->resubmit_list);
            vq->resubmit_list = NULL;
        }

        return elem;
    }

    if (vu_queue_empty(dev, vq)) {
        return NULL;
    }
    /*
     * Needed after virtio_queue_empty(), see comment in
     * virtqueue_num_heads().
     */
    smp_rmb();

    if (vq->inuse >= vq->vring.num) {
        vu_panic(dev, "Virtqueue size exceeded");
        return NULL;
    }

    if (!virtqueue_get_head(dev, vq, vq->last_avail_idx++, &head)) {
        return NULL;
    }

    if (vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
        vring_set_avail_event(vq, vq->last_avail_idx);
    }

    elem = vu_queue_map_desc(dev, vq, head, sz);

    if (!elem) {
        return NULL;
    }

    vq->inuse++;

    vu_queue_inflight_get(dev, vq, head);

    return elem;
}

static void
vu_queue_detach_element(VuDev *dev, VuVirtq *vq, VuVirtqElement *elem,
                        size_t len)
{
    vq->inuse--;
    /* unmap, when DMA support is added */
}

void
vu_queue_unpop(VuDev *dev, VuVirtq *vq, VuVirtqElement *elem,
               size_t len)
{
    vq->last_avail_idx--;
    vu_queue_detach_element(dev, vq, elem, len);
}

bool
vu_queue_rewind(VuDev *dev, VuVirtq *vq, unsigned int num)
{
    if (num > vq->inuse) {
        return false;
    }
    vq->last_avail_idx -= num;
    vq->inuse -= num;
    return true;
}
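/*
 * Editor's illustrative sketch (not part of the library): returning an
 * element to the ring with vu_queue_unpop() when it cannot be processed
 * right now (for example a hypothetical backend hitting EAGAIN), so that
 * the same descriptor chain is offered again on the next pop. handler() is
 * a hypothetical per-element callback; vu_queue_push()/vu_queue_notify()
 * are declared in libvhost-user.h.
 */
#if 0
static bool example_try_process(VuDev *dev, VuVirtq *vq,
                                bool (*handler)(VuVirtqElement *))
{
    VuVirtqElement *elem = vu_queue_pop(dev, vq, sizeof(*elem));

    if (!elem) {
        return false;                       /* nothing available */
    }

    if (!handler(elem)) {
        /* Temporary failure: give the chain back instead of completing it;
         * the malloc'd element itself is still owned by the caller. */
        vu_queue_unpop(dev, vq, elem, 0);
        free(elem);
        return false;
    }

    vu_queue_push(dev, vq, elem, 0 /* no bytes written back */);
    vu_queue_notify(dev, vq);
    free(elem);
    return true;
}
#endif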
static inline
void vring_used_write(VuDev *dev, VuVirtq *vq,
                      struct vring_used_elem *uelem, int i)
{
    struct vring_used *used = vq->vring.used;

    used->ring[i] = *uelem;
    vu_log_write(dev, vq->vring.log_guest_addr +
                 offsetof(struct vring_used, ring[i]),
                 sizeof(used->ring[i]));
}


static void
vu_log_queue_fill(VuDev *dev, VuVirtq *vq,
                  const VuVirtqElement *elem,
                  unsigned int len)
{
    struct vring_desc *desc = vq->vring.desc;
    unsigned int i, max, min, desc_len;
    uint64_t desc_addr, read_len;
    struct vring_desc desc_buf[VIRTQUEUE_MAX_SIZE];
    unsigned num_bufs = 0;

    max = vq->vring.num;
    i = elem->index;

    if (le16toh(desc[i].flags) & VRING_DESC_F_INDIRECT) {
        if (le32toh(desc[i].len) % sizeof(struct vring_desc)) {
            vu_panic(dev, "Invalid size for indirect buffer table");
            return;
        }

        /* loop over the indirect descriptor table */
        desc_addr = le64toh(desc[i].addr);
        desc_len = le32toh(desc[i].len);
        max = desc_len / sizeof(struct vring_desc);
        read_len = desc_len;
        desc = vu_gpa_to_va(dev, &read_len, desc_addr);
        if (unlikely(desc && read_len != desc_len)) {
            /* Failed to use zero copy */
            desc = NULL;
            if (!virtqueue_read_indirect_desc(dev, desc_buf,
                                              desc_addr,
                                              desc_len)) {
                desc = desc_buf;
            }
        }
        if (!desc) {
            vu_panic(dev, "Invalid indirect buffer table");
            return;
        }
        i = 0;
    }

    do {
        if (++num_bufs > max) {
            vu_panic(dev, "Looped descriptor");
            return;
        }

        if (le16toh(desc[i].flags) & VRING_DESC_F_WRITE) {
            min = MIN(le32toh(desc[i].len), len);
            vu_log_write(dev, le64toh(desc[i].addr), min);
            len -= min;
        }

    } while (len > 0 &&
             (virtqueue_read_next_desc(dev, desc, i, max, &i)
              == VIRTQUEUE_READ_DESC_MORE));
}

void
vu_queue_fill(VuDev *dev, VuVirtq *vq,
              const VuVirtqElement *elem,
              unsigned int len, unsigned int idx)
{
    struct vring_used_elem uelem;

    if (unlikely(dev->broken) ||
        unlikely(!vq->vring.avail)) {
        return;
    }

    vu_log_queue_fill(dev, vq, elem, len);

    idx = (idx + vq->used_idx) % vq->vring.num;

    uelem.id = htole32(elem->index);
    uelem.len = htole32(len);
    vring_used_write(dev, vq, &uelem, idx);
}

static inline
void vring_used_idx_set(VuDev *dev, VuVirtq *vq, uint16_t val)
{
    vq->vring.used->idx = htole16(val);
    vu_log_write(dev,
                 vq->vring.log_guest_addr + offsetof(struct vring_used, idx),
                 sizeof(vq->vring.used->idx));

    vq->used_idx = val;
}

void
vu_queue_flush(VuDev *dev, VuVirtq *vq, unsigned int count)
{
    uint16_t old, new;

    if (unlikely(dev->broken) ||
        unlikely(!vq->vring.avail)) {
        return;
    }

    /* Make sure buffer is written before we update index. */
    smp_wmb();

    old = vq->used_idx;
    new = old + count;
    vring_used_idx_set(dev, vq, new);
    vq->inuse -= count;
    if (unlikely((int16_t)(new - vq->signalled_used) < (uint16_t)(new - old))) {
        vq->signalled_used_valid = false;
    }
}

void
vu_queue_push(VuDev *dev, VuVirtq *vq,
              const VuVirtqElement *elem, unsigned int len)
{
    vu_queue_fill(dev, vq, elem, len, 0);
    vu_queue_inflight_pre_put(dev, vq, elem->index);
    vu_queue_flush(dev, vq, 1);
    vu_queue_inflight_post_put(dev, vq, elem->index);
}
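/*
 * Editor's illustrative sketch (not part of the library): completing a batch
 * of elements with vu_queue_fill() at consecutive offsets followed by a
 * single vu_queue_flush()/vu_queue_notify(), instead of one vu_queue_push()
 * per element. The element array and per-element byte counts are
 * hypothetical; note that this path skips the per-element inflight
 * bookkeeping that vu_queue_push() performs.
 */
#if 0
static void example_complete_batch(VuDev *dev, VuVirtq *vq,
                                   VuVirtqElement **elems,
                                   const unsigned int *written,
                                   unsigned int count)
{
    unsigned int i;

    for (i = 0; i < count; i++) {
        vu_queue_fill(dev, vq, elems[i], written[i], i);
    }
    vu_queue_flush(dev, vq, count);
    vu_queue_notify(dev, vq);

    for (i = 0; i < count; i++) {
        free(elems[i]);
    }
}
#endif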