/*
 * Vhost User library
 *
 * Copyright IBM, Corp. 2007
 * Copyright (c) 2016 Red Hat, Inc.
 *
 * Authors:
 *  Anthony Liguori <aliguori@us.ibm.com>
 *  Marc-André Lureau <mlureau@redhat.com>
 *  Victor Kaplansky <victork@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or
 * later.  See the COPYING file in the top-level directory.
 */

/* this code avoids GLib dependency */
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <stdarg.h>
#include <errno.h>
#include <string.h>
#include <assert.h>
#include <inttypes.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/eventfd.h>
#include <sys/mman.h>
#include <endian.h>

#if defined(__linux__)
#include <sys/syscall.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/vhost.h>

#ifdef __NR_userfaultfd
#include <linux/userfaultfd.h>
#endif

#endif

#include "include/atomic.h"

#include "libvhost-user.h"

/* usually provided by GLib */
#ifndef MIN
#define MIN(x, y) ({                            \
            typeof(x) _min1 = (x);              \
            typeof(y) _min2 = (y);              \
            (void) (&_min1 == &_min2);          \
            _min1 < _min2 ? _min1 : _min2; })
#endif

/* Round number down to multiple */
#define ALIGN_DOWN(n, m) ((n) / (m) * (m))

/* Round number up to multiple */
#define ALIGN_UP(n, m) ALIGN_DOWN((n) + (m) - 1, (m))

#ifndef unlikely
#define unlikely(x)   __builtin_expect(!!(x), 0)
#endif

/* Align each region to cache line size in inflight buffer */
#define INFLIGHT_ALIGNMENT 64

/* The version of inflight buffer */
#define INFLIGHT_VERSION 1

/* The version of the protocol we support */
#define VHOST_USER_VERSION 1
#define LIBVHOST_USER_DEBUG 0

#define DPRINT(...)                             \
    do {                                        \
        if (LIBVHOST_USER_DEBUG) {              \
            fprintf(stderr, __VA_ARGS__);       \
        }                                       \
    } while (0)
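
/*
 * Feature words are 64-bit masks, so testing a bit is a plain
 * shift-and-mask; e.g. vu_has_feature(dev, VIRTIO_F_VERSION_1) becomes
 * true once the master has negotiated VIRTIO 1.0 (see
 * vu_set_features_exec() below).
 */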

static inline
bool has_feature(uint64_t features, unsigned int fbit)
{
    assert(fbit < 64);
    return !!(features & (1ULL << fbit));
}

static inline
bool vu_has_feature(VuDev *dev,
                    unsigned int fbit)
{
    return has_feature(dev->features, fbit);
}

static inline bool vu_has_protocol_feature(VuDev *dev, unsigned int fbit)
{
    return has_feature(dev->protocol_features, fbit);
}

static const char *
vu_request_to_string(unsigned int req)
{
#define REQ(req) [req] = #req
    static const char *vu_request_str[] = {
        REQ(VHOST_USER_NONE),
        REQ(VHOST_USER_GET_FEATURES),
        REQ(VHOST_USER_SET_FEATURES),
        REQ(VHOST_USER_SET_OWNER),
        REQ(VHOST_USER_RESET_OWNER),
        REQ(VHOST_USER_SET_MEM_TABLE),
        REQ(VHOST_USER_SET_LOG_BASE),
        REQ(VHOST_USER_SET_LOG_FD),
        REQ(VHOST_USER_SET_VRING_NUM),
        REQ(VHOST_USER_SET_VRING_ADDR),
        REQ(VHOST_USER_SET_VRING_BASE),
        REQ(VHOST_USER_GET_VRING_BASE),
        REQ(VHOST_USER_SET_VRING_KICK),
        REQ(VHOST_USER_SET_VRING_CALL),
        REQ(VHOST_USER_SET_VRING_ERR),
        REQ(VHOST_USER_GET_PROTOCOL_FEATURES),
        REQ(VHOST_USER_SET_PROTOCOL_FEATURES),
        REQ(VHOST_USER_GET_QUEUE_NUM),
        REQ(VHOST_USER_SET_VRING_ENABLE),
        REQ(VHOST_USER_SEND_RARP),
        REQ(VHOST_USER_NET_SET_MTU),
        REQ(VHOST_USER_SET_SLAVE_REQ_FD),
        REQ(VHOST_USER_IOTLB_MSG),
        REQ(VHOST_USER_SET_VRING_ENDIAN),
        REQ(VHOST_USER_GET_CONFIG),
        REQ(VHOST_USER_SET_CONFIG),
        REQ(VHOST_USER_POSTCOPY_ADVISE),
        REQ(VHOST_USER_POSTCOPY_LISTEN),
        REQ(VHOST_USER_POSTCOPY_END),
        REQ(VHOST_USER_GET_INFLIGHT_FD),
        REQ(VHOST_USER_SET_INFLIGHT_FD),
        REQ(VHOST_USER_GPU_SET_SOCKET),
        REQ(VHOST_USER_VRING_KICK),
        REQ(VHOST_USER_GET_MAX_MEM_SLOTS),
        REQ(VHOST_USER_ADD_MEM_REG),
        REQ(VHOST_USER_REM_MEM_REG),
        REQ(VHOST_USER_MAX),
    };
#undef REQ

    if (req < VHOST_USER_MAX) {
        return vu_request_str[req];
    } else {
        return "unknown";
    }
}

static void
vu_panic(VuDev *dev, const char *msg, ...)
{
    char *buf = NULL;
    va_list ap;

    va_start(ap, msg);
    if (vasprintf(&buf, msg, ap) < 0) {
        buf = NULL;
    }
    va_end(ap);

    dev->broken = true;
    dev->panic(dev, buf);
    free(buf);

    /*
     * FIXME:
     * find a way to call virtio_error, or perhaps close the connection?
     */
}

/* Translate guest physical address to our virtual address.  */
void *
vu_gpa_to_va(VuDev *dev, uint64_t *plen, uint64_t guest_addr)
{
    int i;

    if (*plen == 0) {
        return NULL;
    }

    /* Find matching memory region.  */
    for (i = 0; i < dev->nregions; i++) {
        VuDevRegion *r = &dev->regions[i];

        if ((guest_addr >= r->gpa) && (guest_addr < (r->gpa + r->size))) {
            if ((guest_addr + *plen) > (r->gpa + r->size)) {
                *plen = r->gpa + r->size - guest_addr;
            }
            return (void *)(uintptr_t)
                guest_addr - r->gpa + r->mmap_addr + r->mmap_offset;
        }
    }

    return NULL;
}
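
/*
 * Note that vu_gpa_to_va() may shrink *plen when the requested range
 * crosses the end of the matching region, so callers typically loop until
 * the whole length has been mapped.  An illustrative sketch (sz, gpa and p
 * are placeholder names, not part of this file):
 *
 *     uint64_t len = sz;
 *     void *p = vu_gpa_to_va(dev, &len, gpa);
 *     if (!p || len < sz) {
 *         // buffer is unmapped or spans a region boundary
 *     }
 *
 * See virtqueue_read_indirect_desc() further down for an in-tree user of
 * this partial-length behaviour.
 */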

/* Translate qemu virtual address to our virtual address.  */
static void *
qva_to_va(VuDev *dev, uint64_t qemu_addr)
{
    int i;

    /* Find matching memory region.  */
    for (i = 0; i < dev->nregions; i++) {
        VuDevRegion *r = &dev->regions[i];

        if ((qemu_addr >= r->qva) && (qemu_addr < (r->qva + r->size))) {
            return (void *)(uintptr_t)
                qemu_addr - r->qva + r->mmap_addr + r->mmap_offset;
        }
    }

    return NULL;
}

static void
vmsg_close_fds(VhostUserMsg *vmsg)
{
    int i;

    for (i = 0; i < vmsg->fd_num; i++) {
        close(vmsg->fds[i]);
    }
}

/* Set reply payload.u64 and clear request flags and fd_num */
static void vmsg_set_reply_u64(VhostUserMsg *vmsg, uint64_t val)
{
    vmsg->flags = 0; /* defaults will be set by vu_send_reply() */
    vmsg->size = sizeof(vmsg->payload.u64);
    vmsg->payload.u64 = val;
    vmsg->fd_num = 0;
}

/* A test to see if we have userfault available */
static bool
have_userfault(void)
{
#if defined(__linux__) && defined(__NR_userfaultfd) &&\
        defined(UFFD_FEATURE_MISSING_SHMEM) &&\
        defined(UFFD_FEATURE_MISSING_HUGETLBFS)
    /* Now test the kernel we're running on really has the features */
    int ufd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
    struct uffdio_api api_struct;
    if (ufd < 0) {
        return false;
    }

    api_struct.api = UFFD_API;
    api_struct.features = UFFD_FEATURE_MISSING_SHMEM |
                          UFFD_FEATURE_MISSING_HUGETLBFS;
    if (ioctl(ufd, UFFDIO_API, &api_struct)) {
        close(ufd);
        return false;
    }
    close(ufd);
    return true;

#else
    return false;
#endif
}

static bool
vu_message_read_default(VuDev *dev, int conn_fd, VhostUserMsg *vmsg)
{
    char control[CMSG_SPACE(VHOST_MEMORY_BASELINE_NREGIONS * sizeof(int))] = {};
    struct iovec iov = {
        .iov_base = (char *)vmsg,
        .iov_len = VHOST_USER_HDR_SIZE,
    };
    struct msghdr msg = {
        .msg_iov = &iov,
        .msg_iovlen = 1,
        .msg_control = control,
        .msg_controllen = sizeof(control),
    };
    size_t fd_size;
    struct cmsghdr *cmsg;
    int rc;

    do {
        rc = recvmsg(conn_fd, &msg, 0);
    } while (rc < 0 && (errno == EINTR || errno == EAGAIN));

    if (rc < 0) {
        vu_panic(dev, "Error while recvmsg: %s", strerror(errno));
        return false;
    }

    vmsg->fd_num = 0;
    for (cmsg = CMSG_FIRSTHDR(&msg);
         cmsg != NULL;
         cmsg = CMSG_NXTHDR(&msg, cmsg))
    {
        if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) {
            fd_size = cmsg->cmsg_len - CMSG_LEN(0);
            vmsg->fd_num = fd_size / sizeof(int);
            memcpy(vmsg->fds, CMSG_DATA(cmsg), fd_size);
            break;
        }
    }

    if (vmsg->size > sizeof(vmsg->payload)) {
        vu_panic(dev,
                 "Error: too big message request: %d, size: vmsg->size: %u, "
                 "while sizeof(vmsg->payload) = %zu\n",
                 vmsg->request, vmsg->size, sizeof(vmsg->payload));
        goto fail;
    }

    if (vmsg->size) {
        do {
            rc = read(conn_fd, &vmsg->payload, vmsg->size);
        } while (rc < 0 && (errno == EINTR || errno == EAGAIN));

        if (rc <= 0) {
            vu_panic(dev, "Error while reading: %s", strerror(errno));
            goto fail;
        }

        assert(rc == vmsg->size);
    }

    return true;

fail:
    vmsg_close_fds(vmsg);

    return false;
}
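
/*
 * Writer side of the message transport: the fixed-size header (and any
 * file descriptors, passed as SCM_RIGHTS ancillary data) go out with
 * sendmsg(), then the payload is written separately.  This mirrors the
 * header-then-payload layout that vu_message_read_default() above expects.
 */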

static bool
vu_message_write(VuDev *dev, int conn_fd, VhostUserMsg *vmsg)
{
    int rc;
    uint8_t *p = (uint8_t *)vmsg;
    char control[CMSG_SPACE(VHOST_MEMORY_BASELINE_NREGIONS * sizeof(int))] = {};
    struct iovec iov = {
        .iov_base = (char *)vmsg,
        .iov_len = VHOST_USER_HDR_SIZE,
    };
    struct msghdr msg = {
        .msg_iov = &iov,
        .msg_iovlen = 1,
        .msg_control = control,
    };
    struct cmsghdr *cmsg;

    memset(control, 0, sizeof(control));
    assert(vmsg->fd_num <= VHOST_MEMORY_BASELINE_NREGIONS);
    if (vmsg->fd_num > 0) {
        size_t fdsize = vmsg->fd_num * sizeof(int);
        msg.msg_controllen = CMSG_SPACE(fdsize);
        cmsg = CMSG_FIRSTHDR(&msg);
        cmsg->cmsg_len = CMSG_LEN(fdsize);
        cmsg->cmsg_level = SOL_SOCKET;
        cmsg->cmsg_type = SCM_RIGHTS;
        memcpy(CMSG_DATA(cmsg), vmsg->fds, fdsize);
    } else {
        msg.msg_controllen = 0;
    }

    do {
        rc = sendmsg(conn_fd, &msg, 0);
    } while (rc < 0 && (errno == EINTR || errno == EAGAIN));

    if (vmsg->size) {
        do {
            if (vmsg->data) {
                rc = write(conn_fd, vmsg->data, vmsg->size);
            } else {
                rc = write(conn_fd, p + VHOST_USER_HDR_SIZE, vmsg->size);
            }
        } while (rc < 0 && (errno == EINTR || errno == EAGAIN));
    }

    if (rc <= 0) {
        vu_panic(dev, "Error while writing: %s", strerror(errno));
        return false;
    }

    return true;
}

static bool
vu_send_reply(VuDev *dev, int conn_fd, VhostUserMsg *vmsg)
{
    /* Set the version in the flags when sending the reply */
    vmsg->flags &= ~VHOST_USER_VERSION_MASK;
    vmsg->flags |= VHOST_USER_VERSION;
    vmsg->flags |= VHOST_USER_REPLY_MASK;

    return vu_message_write(dev, conn_fd, vmsg);
}

/*
 * Processes a reply on the slave channel.
 * Entered with slave_mutex held and releases it before exit.
 * Returns true on success.
 */
static bool
vu_process_message_reply(VuDev *dev, const VhostUserMsg *vmsg)
{
    VhostUserMsg msg_reply;
    bool result = false;

    if ((vmsg->flags & VHOST_USER_NEED_REPLY_MASK) == 0) {
        result = true;
        goto out;
    }

    if (!vu_message_read_default(dev, dev->slave_fd, &msg_reply)) {
        goto out;
    }

    if (msg_reply.request != vmsg->request) {
        DPRINT("Received unexpected msg type. Expected %d received %d",
               vmsg->request, msg_reply.request);
        goto out;
    }

    result = msg_reply.payload.u64 == 0;

out:
    pthread_mutex_unlock(&dev->slave_mutex);
    return result;
}
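
/*
 * Dirty-log helpers.  The log is a plain bitmap shared with QEMU: one bit
 * per VHOST_LOG_PAGE-sized page of guest memory, so page N lives in bit
 * (N % 8) of byte log_table[N / 8].  For example, dirtying the guest range
 * [3 * VHOST_LOG_PAGE, 3 * VHOST_LOG_PAGE + 1) sets bit 3 of byte 0 and
 * then kicks log_call_fd so QEMU rescans the bitmap.
 */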

/* Kick the log_call_fd if required. */
static void
vu_log_kick(VuDev *dev)
{
    if (dev->log_call_fd != -1) {
        DPRINT("Kicking the QEMU's log...\n");
        if (eventfd_write(dev->log_call_fd, 1) < 0) {
            vu_panic(dev, "Error writing eventfd: %s", strerror(errno));
        }
    }
}

static void
vu_log_page(uint8_t *log_table, uint64_t page)
{
    DPRINT("Logged dirty guest page: %"PRId64"\n", page);
    qatomic_or(&log_table[page / 8], 1 << (page % 8));
}

static void
vu_log_write(VuDev *dev, uint64_t address, uint64_t length)
{
    uint64_t page;

    if (!(dev->features & (1ULL << VHOST_F_LOG_ALL)) ||
        !dev->log_table || !length) {
        return;
    }

    assert(dev->log_size > ((address + length - 1) / VHOST_LOG_PAGE / 8));

    page = address / VHOST_LOG_PAGE;
    while (page * VHOST_LOG_PAGE < address + length) {
        vu_log_page(dev->log_table, page);
        page += 1;
    }

    vu_log_kick(dev);
}

static void
vu_kick_cb(VuDev *dev, int condition, void *data)
{
    int index = (intptr_t)data;
    VuVirtq *vq = &dev->vq[index];
    int sock = vq->kick_fd;
    eventfd_t kick_data;
    ssize_t rc;

    rc = eventfd_read(sock, &kick_data);
    if (rc == -1) {
        vu_panic(dev, "kick eventfd_read(): %s", strerror(errno));
        dev->remove_watch(dev, dev->vq[index].kick_fd);
    } else {
        DPRINT("Got kick_data: %016"PRIx64" handler:%p idx:%d\n",
               kick_data, vq->handler, index);
        if (vq->handler) {
            vq->handler(dev, index);
        }
    }
}

static bool
vu_get_features_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    vmsg->payload.u64 =
        /*
         * The following VIRTIO feature bits are supported by our virtqueue
         * implementation:
         */
        1ULL << VIRTIO_F_NOTIFY_ON_EMPTY |
        1ULL << VIRTIO_RING_F_INDIRECT_DESC |
        1ULL << VIRTIO_RING_F_EVENT_IDX |
        1ULL << VIRTIO_F_VERSION_1 |

        /* vhost-user feature bits */
        1ULL << VHOST_F_LOG_ALL |
        1ULL << VHOST_USER_F_PROTOCOL_FEATURES;

    if (dev->iface->get_features) {
        vmsg->payload.u64 |= dev->iface->get_features(dev);
    }

    vmsg->size = sizeof(vmsg->payload.u64);
    vmsg->fd_num = 0;

    DPRINT("Sending back to guest u64: 0x%016"PRIx64"\n", vmsg->payload.u64);

    return true;
}

static void
vu_set_enable_all_rings(VuDev *dev, bool enabled)
{
    uint16_t i;

    for (i = 0; i < dev->max_queues; i++) {
        dev->vq[i].enable = enabled;
    }
}

static bool
vu_set_features_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64);

    dev->features = vmsg->payload.u64;
    if (!vu_has_feature(dev, VIRTIO_F_VERSION_1)) {
        /*
         * We only support devices conforming to VIRTIO 1.0 or
         * later
         */
        vu_panic(dev, "virtio legacy devices aren't supported by libvhost-user");
        return false;
    }

    if (!(dev->features & VHOST_USER_F_PROTOCOL_FEATURES)) {
        vu_set_enable_all_rings(dev, true);
    }

    if (dev->iface->set_features) {
        dev->iface->set_features(dev, dev->features);
    }

    return false;
}

static bool
vu_set_owner_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    return false;
}

static void
vu_close_log(VuDev *dev)
{
    if (dev->log_table) {
        if (munmap(dev->log_table, dev->log_size) != 0) {
            perror("close log munmap() error");
        }

        dev->log_table = NULL;
    }
    if (dev->log_call_fd != -1) {
        close(dev->log_call_fd);
        dev->log_call_fd = -1;
    }
}
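
/*
 * VHOST_USER_RESET_OWNER simply disables every ring.  Ring addresses arrive
 * as QEMU virtual addresses (VHOST_USER_SET_VRING_ADDR), so map_ring()
 * below resolves them through qva_to_va() against the currently installed
 * region table; a NULL result for any of desc/used/avail is reported as a
 * failure (the function returns true on error).
 */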

static bool
vu_reset_device_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    vu_set_enable_all_rings(dev, false);

    return false;
}

static bool
map_ring(VuDev *dev, VuVirtq *vq)
{
    vq->vring.desc = qva_to_va(dev, vq->vra.desc_user_addr);
    vq->vring.used = qva_to_va(dev, vq->vra.used_user_addr);
    vq->vring.avail = qva_to_va(dev, vq->vra.avail_user_addr);

    DPRINT("Setting virtq addresses:\n");
    DPRINT("    vring_desc  at %p\n", vq->vring.desc);
    DPRINT("    vring_used  at %p\n", vq->vring.used);
    DPRINT("    vring_avail at %p\n", vq->vring.avail);

    return !(vq->vring.desc && vq->vring.used && vq->vring.avail);
}

static bool
generate_faults(VuDev *dev) {
    int i;
    for (i = 0; i < dev->nregions; i++) {
        VuDevRegion *dev_region = &dev->regions[i];
        int ret;
#ifdef UFFDIO_REGISTER
        /*
         * We should already have an open ufd. Mark each memory
         * range as ufd.
         * Discard any mapping we have here; note I can't use MADV_REMOVE
         * or fallocate to make the hole since I don't want to lose
         * data that's already arrived in the shared process.
         * TODO: How to do hugepage
         */
        ret = madvise((void *)(uintptr_t)dev_region->mmap_addr,
                      dev_region->size + dev_region->mmap_offset,
                      MADV_DONTNEED);
        if (ret) {
            fprintf(stderr,
                    "%s: Failed to madvise(DONTNEED) region %d: %s\n",
                    __func__, i, strerror(errno));
        }
        /*
         * Turn off transparent hugepages so we don't get lost wakeups
         * in neighbouring pages.
         * TODO: Turn this back on later.
         */
        ret = madvise((void *)(uintptr_t)dev_region->mmap_addr,
                      dev_region->size + dev_region->mmap_offset,
                      MADV_NOHUGEPAGE);
        if (ret) {
            /*
             * Note: This can happen legally on kernels that are configured
             * without madvise'able hugepages
             */
            fprintf(stderr,
                    "%s: Failed to madvise(NOHUGEPAGE) region %d: %s\n",
                    __func__, i, strerror(errno));
        }
        struct uffdio_register reg_struct;
        reg_struct.range.start = (uintptr_t)dev_region->mmap_addr;
        reg_struct.range.len = dev_region->size + dev_region->mmap_offset;
        reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;

        if (ioctl(dev->postcopy_ufd, UFFDIO_REGISTER, &reg_struct)) {
            vu_panic(dev, "%s: Failed to userfault region %d "
                          "@%p + size:%zx offset: %zx: (ufd=%d)%s\n",
                     __func__, i,
                     dev_region->mmap_addr,
                     dev_region->size, dev_region->mmap_offset,
                     dev->postcopy_ufd, strerror(errno));
            return false;
        }
        if (!(reg_struct.ioctls & ((__u64)1 << _UFFDIO_COPY))) {
            vu_panic(dev, "%s Region (%d) doesn't support COPY",
                     __func__, i);
            return false;
        }
        DPRINT("%s: region %d: Registered userfault for %"
               PRIx64 " + %" PRIx64 "\n", __func__, i,
               (uint64_t)reg_struct.range.start,
               (uint64_t)reg_struct.range.len);
        /* Now it's registered we can let the client at it */
        if (mprotect((void *)(uintptr_t)dev_region->mmap_addr,
                     dev_region->size + dev_region->mmap_offset,
                     PROT_READ | PROT_WRITE)) {
            vu_panic(dev, "failed to mprotect region %d for postcopy (%s)",
                     i, strerror(errno));
            return false;
        }
        /* TODO: Stash 'zero' support flags somewhere */
#endif
    }

    return true;
}
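
/*
 * VHOST_USER_ADD_MEM_REG handling.  In postcopy mode the region is first
 * mapped PROT_NONE and the reply carries our mmap address back to QEMU so
 * that userfault addresses can be translated; once QEMU has sent the bases
 * for every region, it sends an empty (u64 == 0) payload, which is the cue
 * to call generate_faults() above and arm userfaultfd on all regions.
 */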

static bool
vu_add_mem_reg(VuDev *dev, VhostUserMsg *vmsg) {
    int i;
    bool track_ramblocks = dev->postcopy_listening;
    VhostUserMemoryRegion m = vmsg->payload.memreg.region, *msg_region = &m;
    VuDevRegion *dev_region = &dev->regions[dev->nregions];
    void *mmap_addr;

    /*
     * If we are in postcopy mode and we receive a u64 payload with a 0 value
     * we know all the postcopy client bases have been received, and we
     * should start generating faults.
     */
    if (track_ramblocks &&
        vmsg->size == sizeof(vmsg->payload.u64) &&
        vmsg->payload.u64 == 0) {
        (void)generate_faults(dev);
        return false;
    }

    DPRINT("Adding region: %u\n", dev->nregions);
    DPRINT("    guest_phys_addr: 0x%016"PRIx64"\n",
           msg_region->guest_phys_addr);
    DPRINT("    memory_size:     0x%016"PRIx64"\n",
           msg_region->memory_size);
    DPRINT("    userspace_addr   0x%016"PRIx64"\n",
           msg_region->userspace_addr);
    DPRINT("    mmap_offset      0x%016"PRIx64"\n",
           msg_region->mmap_offset);

    dev_region->gpa = msg_region->guest_phys_addr;
    dev_region->size = msg_region->memory_size;
    dev_region->qva = msg_region->userspace_addr;
    dev_region->mmap_offset = msg_region->mmap_offset;

    /*
     * We don't use offset argument of mmap() since the
     * mapped address has to be page aligned, and we use huge
     * pages.
     */
    if (track_ramblocks) {
        /*
         * In postcopy we're using PROT_NONE here to catch anyone
         * accessing it before we userfault.
         */
        mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset,
                         PROT_NONE, MAP_SHARED,
                         vmsg->fds[0], 0);
    } else {
        mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset,
                         PROT_READ | PROT_WRITE, MAP_SHARED, vmsg->fds[0],
                         0);
    }

    if (mmap_addr == MAP_FAILED) {
        vu_panic(dev, "region mmap error: %s", strerror(errno));
    } else {
        dev_region->mmap_addr = (uint64_t)(uintptr_t)mmap_addr;
        DPRINT("    mmap_addr:       0x%016"PRIx64"\n",
               dev_region->mmap_addr);
    }

    close(vmsg->fds[0]);

    if (track_ramblocks) {
        /*
         * Return the address to QEMU so that it can translate the ufd
         * fault addresses back.
         */
        msg_region->userspace_addr = (uintptr_t)(mmap_addr +
                                                 dev_region->mmap_offset);

        /* Send the message back to qemu with the addresses filled in.
*/ 758 vmsg->fd_num = 0; 759 if (!vu_send_reply(dev, dev->sock, vmsg)) { 760 vu_panic(dev, "failed to respond to add-mem-region for postcopy"); 761 return false; 762 } 763 764 DPRINT("Successfully added new region in postcopy\n"); 765 dev->nregions++; 766 return false; 767 768 } else { 769 for (i = 0; i < dev->max_queues; i++) { 770 if (dev->vq[i].vring.desc) { 771 if (map_ring(dev, &dev->vq[i])) { 772 vu_panic(dev, "remapping queue %d for new memory region", 773 i); 774 } 775 } 776 } 777 778 DPRINT("Successfully added new region\n"); 779 dev->nregions++; 780 vmsg_set_reply_u64(vmsg, 0); 781 return true; 782 } 783 } 784 785 static inline bool reg_equal(VuDevRegion *vudev_reg, 786 VhostUserMemoryRegion *msg_reg) 787 { 788 if (vudev_reg->gpa == msg_reg->guest_phys_addr && 789 vudev_reg->qva == msg_reg->userspace_addr && 790 vudev_reg->size == msg_reg->memory_size) { 791 return true; 792 } 793 794 return false; 795 } 796 797 static bool 798 vu_rem_mem_reg(VuDev *dev, VhostUserMsg *vmsg) { 799 int i, j; 800 bool found = false; 801 VuDevRegion shadow_regions[VHOST_USER_MAX_RAM_SLOTS] = {}; 802 VhostUserMemoryRegion m = vmsg->payload.memreg.region, *msg_region = &m; 803 804 DPRINT("Removing region:\n"); 805 DPRINT(" guest_phys_addr: 0x%016"PRIx64"\n", 806 msg_region->guest_phys_addr); 807 DPRINT(" memory_size: 0x%016"PRIx64"\n", 808 msg_region->memory_size); 809 DPRINT(" userspace_addr 0x%016"PRIx64"\n", 810 msg_region->userspace_addr); 811 DPRINT(" mmap_offset 0x%016"PRIx64"\n", 812 msg_region->mmap_offset); 813 814 for (i = 0, j = 0; i < dev->nregions; i++) { 815 if (!reg_equal(&dev->regions[i], msg_region)) { 816 shadow_regions[j].gpa = dev->regions[i].gpa; 817 shadow_regions[j].size = dev->regions[i].size; 818 shadow_regions[j].qva = dev->regions[i].qva; 819 shadow_regions[j].mmap_addr = dev->regions[i].mmap_addr; 820 shadow_regions[j].mmap_offset = dev->regions[i].mmap_offset; 821 j++; 822 } else { 823 found = true; 824 VuDevRegion *r = &dev->regions[i]; 825 void *m = (void *) (uintptr_t) r->mmap_addr; 826 827 if (m) { 828 munmap(m, r->size + r->mmap_offset); 829 } 830 } 831 } 832 833 if (found) { 834 memcpy(dev->regions, shadow_regions, 835 sizeof(VuDevRegion) * VHOST_USER_MAX_RAM_SLOTS); 836 DPRINT("Successfully removed a region\n"); 837 dev->nregions--; 838 vmsg_set_reply_u64(vmsg, 0); 839 } else { 840 vu_panic(dev, "Specified region not found\n"); 841 } 842 843 return true; 844 } 845 846 static bool 847 vu_set_mem_table_exec_postcopy(VuDev *dev, VhostUserMsg *vmsg) 848 { 849 int i; 850 VhostUserMemory m = vmsg->payload.memory, *memory = &m; 851 dev->nregions = memory->nregions; 852 853 DPRINT("Nregions: %u\n", memory->nregions); 854 for (i = 0; i < dev->nregions; i++) { 855 void *mmap_addr; 856 VhostUserMemoryRegion *msg_region = &memory->regions[i]; 857 VuDevRegion *dev_region = &dev->regions[i]; 858 859 DPRINT("Region %d\n", i); 860 DPRINT(" guest_phys_addr: 0x%016"PRIx64"\n", 861 msg_region->guest_phys_addr); 862 DPRINT(" memory_size: 0x%016"PRIx64"\n", 863 msg_region->memory_size); 864 DPRINT(" userspace_addr 0x%016"PRIx64"\n", 865 msg_region->userspace_addr); 866 DPRINT(" mmap_offset 0x%016"PRIx64"\n", 867 msg_region->mmap_offset); 868 869 dev_region->gpa = msg_region->guest_phys_addr; 870 dev_region->size = msg_region->memory_size; 871 dev_region->qva = msg_region->userspace_addr; 872 dev_region->mmap_offset = msg_region->mmap_offset; 873 874 /* We don't use offset argument of mmap() since the 875 * mapped address has to be page aligned, and we use huge 876 * pages. 
877 * In postcopy we're using PROT_NONE here to catch anyone 878 * accessing it before we userfault 879 */ 880 mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset, 881 PROT_NONE, MAP_SHARED, 882 vmsg->fds[i], 0); 883 884 if (mmap_addr == MAP_FAILED) { 885 vu_panic(dev, "region mmap error: %s", strerror(errno)); 886 } else { 887 dev_region->mmap_addr = (uint64_t)(uintptr_t)mmap_addr; 888 DPRINT(" mmap_addr: 0x%016"PRIx64"\n", 889 dev_region->mmap_addr); 890 } 891 892 /* Return the address to QEMU so that it can translate the ufd 893 * fault addresses back. 894 */ 895 msg_region->userspace_addr = (uintptr_t)(mmap_addr + 896 dev_region->mmap_offset); 897 close(vmsg->fds[i]); 898 } 899 900 /* Send the message back to qemu with the addresses filled in */ 901 vmsg->fd_num = 0; 902 if (!vu_send_reply(dev, dev->sock, vmsg)) { 903 vu_panic(dev, "failed to respond to set-mem-table for postcopy"); 904 return false; 905 } 906 907 /* Wait for QEMU to confirm that it's registered the handler for the 908 * faults. 909 */ 910 if (!dev->read_msg(dev, dev->sock, vmsg) || 911 vmsg->size != sizeof(vmsg->payload.u64) || 912 vmsg->payload.u64 != 0) { 913 vu_panic(dev, "failed to receive valid ack for postcopy set-mem-table"); 914 return false; 915 } 916 917 /* OK, now we can go and register the memory and generate faults */ 918 (void)generate_faults(dev); 919 920 return false; 921 } 922 923 static bool 924 vu_set_mem_table_exec(VuDev *dev, VhostUserMsg *vmsg) 925 { 926 int i; 927 VhostUserMemory m = vmsg->payload.memory, *memory = &m; 928 929 for (i = 0; i < dev->nregions; i++) { 930 VuDevRegion *r = &dev->regions[i]; 931 void *m = (void *) (uintptr_t) r->mmap_addr; 932 933 if (m) { 934 munmap(m, r->size + r->mmap_offset); 935 } 936 } 937 dev->nregions = memory->nregions; 938 939 if (dev->postcopy_listening) { 940 return vu_set_mem_table_exec_postcopy(dev, vmsg); 941 } 942 943 DPRINT("Nregions: %u\n", memory->nregions); 944 for (i = 0; i < dev->nregions; i++) { 945 void *mmap_addr; 946 VhostUserMemoryRegion *msg_region = &memory->regions[i]; 947 VuDevRegion *dev_region = &dev->regions[i]; 948 949 DPRINT("Region %d\n", i); 950 DPRINT(" guest_phys_addr: 0x%016"PRIx64"\n", 951 msg_region->guest_phys_addr); 952 DPRINT(" memory_size: 0x%016"PRIx64"\n", 953 msg_region->memory_size); 954 DPRINT(" userspace_addr 0x%016"PRIx64"\n", 955 msg_region->userspace_addr); 956 DPRINT(" mmap_offset 0x%016"PRIx64"\n", 957 msg_region->mmap_offset); 958 959 dev_region->gpa = msg_region->guest_phys_addr; 960 dev_region->size = msg_region->memory_size; 961 dev_region->qva = msg_region->userspace_addr; 962 dev_region->mmap_offset = msg_region->mmap_offset; 963 964 /* We don't use offset argument of mmap() since the 965 * mapped address has to be page aligned, and we use huge 966 * pages. 
*/ 967 mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset, 968 PROT_READ | PROT_WRITE, MAP_SHARED, 969 vmsg->fds[i], 0); 970 971 if (mmap_addr == MAP_FAILED) { 972 vu_panic(dev, "region mmap error: %s", strerror(errno)); 973 } else { 974 dev_region->mmap_addr = (uint64_t)(uintptr_t)mmap_addr; 975 DPRINT(" mmap_addr: 0x%016"PRIx64"\n", 976 dev_region->mmap_addr); 977 } 978 979 close(vmsg->fds[i]); 980 } 981 982 for (i = 0; i < dev->max_queues; i++) { 983 if (dev->vq[i].vring.desc) { 984 if (map_ring(dev, &dev->vq[i])) { 985 vu_panic(dev, "remapping queue %d during setmemtable", i); 986 } 987 } 988 } 989 990 return false; 991 } 992 993 static bool 994 vu_set_log_base_exec(VuDev *dev, VhostUserMsg *vmsg) 995 { 996 int fd; 997 uint64_t log_mmap_size, log_mmap_offset; 998 void *rc; 999 1000 if (vmsg->fd_num != 1 || 1001 vmsg->size != sizeof(vmsg->payload.log)) { 1002 vu_panic(dev, "Invalid log_base message"); 1003 return true; 1004 } 1005 1006 fd = vmsg->fds[0]; 1007 log_mmap_offset = vmsg->payload.log.mmap_offset; 1008 log_mmap_size = vmsg->payload.log.mmap_size; 1009 DPRINT("Log mmap_offset: %"PRId64"\n", log_mmap_offset); 1010 DPRINT("Log mmap_size: %"PRId64"\n", log_mmap_size); 1011 1012 rc = mmap(0, log_mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 1013 log_mmap_offset); 1014 close(fd); 1015 if (rc == MAP_FAILED) { 1016 perror("log mmap error"); 1017 } 1018 1019 if (dev->log_table) { 1020 munmap(dev->log_table, dev->log_size); 1021 } 1022 dev->log_table = rc; 1023 dev->log_size = log_mmap_size; 1024 1025 vmsg->size = sizeof(vmsg->payload.u64); 1026 vmsg->fd_num = 0; 1027 1028 return true; 1029 } 1030 1031 static bool 1032 vu_set_log_fd_exec(VuDev *dev, VhostUserMsg *vmsg) 1033 { 1034 if (vmsg->fd_num != 1) { 1035 vu_panic(dev, "Invalid log_fd message"); 1036 return false; 1037 } 1038 1039 if (dev->log_call_fd != -1) { 1040 close(dev->log_call_fd); 1041 } 1042 dev->log_call_fd = vmsg->fds[0]; 1043 DPRINT("Got log_call_fd: %d\n", vmsg->fds[0]); 1044 1045 return false; 1046 } 1047 1048 static bool 1049 vu_set_vring_num_exec(VuDev *dev, VhostUserMsg *vmsg) 1050 { 1051 unsigned int index = vmsg->payload.state.index; 1052 unsigned int num = vmsg->payload.state.num; 1053 1054 DPRINT("State.index: %u\n", index); 1055 DPRINT("State.num: %u\n", num); 1056 dev->vq[index].vring.num = num; 1057 1058 return false; 1059 } 1060 1061 static bool 1062 vu_set_vring_addr_exec(VuDev *dev, VhostUserMsg *vmsg) 1063 { 1064 struct vhost_vring_addr addr = vmsg->payload.addr, *vra = &addr; 1065 unsigned int index = vra->index; 1066 VuVirtq *vq = &dev->vq[index]; 1067 1068 DPRINT("vhost_vring_addr:\n"); 1069 DPRINT(" index: %d\n", vra->index); 1070 DPRINT(" flags: %d\n", vra->flags); 1071 DPRINT(" desc_user_addr: 0x%016" PRIx64 "\n", (uint64_t)vra->desc_user_addr); 1072 DPRINT(" used_user_addr: 0x%016" PRIx64 "\n", (uint64_t)vra->used_user_addr); 1073 DPRINT(" avail_user_addr: 0x%016" PRIx64 "\n", (uint64_t)vra->avail_user_addr); 1074 DPRINT(" log_guest_addr: 0x%016" PRIx64 "\n", (uint64_t)vra->log_guest_addr); 1075 1076 vq->vra = *vra; 1077 vq->vring.flags = vra->flags; 1078 vq->vring.log_guest_addr = vra->log_guest_addr; 1079 1080 1081 if (map_ring(dev, vq)) { 1082 vu_panic(dev, "Invalid vring_addr message"); 1083 return false; 1084 } 1085 1086 vq->used_idx = le16toh(vq->vring.used->idx); 1087 1088 if (vq->last_avail_idx != vq->used_idx) { 1089 bool resume = dev->iface->queue_is_processed_in_order && 1090 dev->iface->queue_is_processed_in_order(dev, index); 1091 1092 DPRINT("Last avail index != 
used index: %u != %u%s\n", 1093 vq->last_avail_idx, vq->used_idx, 1094 resume ? ", resuming" : ""); 1095 1096 if (resume) { 1097 vq->shadow_avail_idx = vq->last_avail_idx = vq->used_idx; 1098 } 1099 } 1100 1101 return false; 1102 } 1103 1104 static bool 1105 vu_set_vring_base_exec(VuDev *dev, VhostUserMsg *vmsg) 1106 { 1107 unsigned int index = vmsg->payload.state.index; 1108 unsigned int num = vmsg->payload.state.num; 1109 1110 DPRINT("State.index: %u\n", index); 1111 DPRINT("State.num: %u\n", num); 1112 dev->vq[index].shadow_avail_idx = dev->vq[index].last_avail_idx = num; 1113 1114 return false; 1115 } 1116 1117 static bool 1118 vu_get_vring_base_exec(VuDev *dev, VhostUserMsg *vmsg) 1119 { 1120 unsigned int index = vmsg->payload.state.index; 1121 1122 DPRINT("State.index: %u\n", index); 1123 vmsg->payload.state.num = dev->vq[index].last_avail_idx; 1124 vmsg->size = sizeof(vmsg->payload.state); 1125 1126 dev->vq[index].started = false; 1127 if (dev->iface->queue_set_started) { 1128 dev->iface->queue_set_started(dev, index, false); 1129 } 1130 1131 if (dev->vq[index].call_fd != -1) { 1132 close(dev->vq[index].call_fd); 1133 dev->vq[index].call_fd = -1; 1134 } 1135 if (dev->vq[index].kick_fd != -1) { 1136 dev->remove_watch(dev, dev->vq[index].kick_fd); 1137 close(dev->vq[index].kick_fd); 1138 dev->vq[index].kick_fd = -1; 1139 } 1140 1141 return true; 1142 } 1143 1144 static bool 1145 vu_check_queue_msg_file(VuDev *dev, VhostUserMsg *vmsg) 1146 { 1147 int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; 1148 bool nofd = vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK; 1149 1150 if (index >= dev->max_queues) { 1151 vmsg_close_fds(vmsg); 1152 vu_panic(dev, "Invalid queue index: %u", index); 1153 return false; 1154 } 1155 1156 if (nofd) { 1157 vmsg_close_fds(vmsg); 1158 return true; 1159 } 1160 1161 if (vmsg->fd_num != 1) { 1162 vmsg_close_fds(vmsg); 1163 vu_panic(dev, "Invalid fds in request: %d", vmsg->request); 1164 return false; 1165 } 1166 1167 return true; 1168 } 1169 1170 static int 1171 inflight_desc_compare(const void *a, const void *b) 1172 { 1173 VuVirtqInflightDesc *desc0 = (VuVirtqInflightDesc *)a, 1174 *desc1 = (VuVirtqInflightDesc *)b; 1175 1176 if (desc1->counter > desc0->counter && 1177 (desc1->counter - desc0->counter) < VIRTQUEUE_MAX_SIZE * 2) { 1178 return 1; 1179 } 1180 1181 return -1; 1182 } 1183 1184 static int 1185 vu_check_queue_inflights(VuDev *dev, VuVirtq *vq) 1186 { 1187 int i = 0; 1188 1189 if (!vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) { 1190 return 0; 1191 } 1192 1193 if (unlikely(!vq->inflight)) { 1194 return -1; 1195 } 1196 1197 if (unlikely(!vq->inflight->version)) { 1198 /* initialize the buffer */ 1199 vq->inflight->version = INFLIGHT_VERSION; 1200 return 0; 1201 } 1202 1203 vq->used_idx = le16toh(vq->vring.used->idx); 1204 vq->resubmit_num = 0; 1205 vq->resubmit_list = NULL; 1206 vq->counter = 0; 1207 1208 if (unlikely(vq->inflight->used_idx != vq->used_idx)) { 1209 vq->inflight->desc[vq->inflight->last_batch_head].inflight = 0; 1210 1211 barrier(); 1212 1213 vq->inflight->used_idx = vq->used_idx; 1214 } 1215 1216 for (i = 0; i < vq->inflight->desc_num; i++) { 1217 if (vq->inflight->desc[i].inflight == 1) { 1218 vq->inuse++; 1219 } 1220 } 1221 1222 vq->shadow_avail_idx = vq->last_avail_idx = vq->inuse + vq->used_idx; 1223 1224 if (vq->inuse) { 1225 vq->resubmit_list = calloc(vq->inuse, sizeof(VuVirtqInflightDesc)); 1226 if (!vq->resubmit_list) { 1227 return -1; 1228 } 1229 1230 for (i = 0; i < vq->inflight->desc_num; i++) { 
1231 if (vq->inflight->desc[i].inflight) { 1232 vq->resubmit_list[vq->resubmit_num].index = i; 1233 vq->resubmit_list[vq->resubmit_num].counter = 1234 vq->inflight->desc[i].counter; 1235 vq->resubmit_num++; 1236 } 1237 } 1238 1239 if (vq->resubmit_num > 1) { 1240 qsort(vq->resubmit_list, vq->resubmit_num, 1241 sizeof(VuVirtqInflightDesc), inflight_desc_compare); 1242 } 1243 vq->counter = vq->resubmit_list[0].counter + 1; 1244 } 1245 1246 /* in case of I/O hang after reconnecting */ 1247 if (eventfd_write(vq->kick_fd, 1)) { 1248 return -1; 1249 } 1250 1251 return 0; 1252 } 1253 1254 static bool 1255 vu_set_vring_kick_exec(VuDev *dev, VhostUserMsg *vmsg) 1256 { 1257 int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; 1258 bool nofd = vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK; 1259 1260 DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64); 1261 1262 if (!vu_check_queue_msg_file(dev, vmsg)) { 1263 return false; 1264 } 1265 1266 if (dev->vq[index].kick_fd != -1) { 1267 dev->remove_watch(dev, dev->vq[index].kick_fd); 1268 close(dev->vq[index].kick_fd); 1269 dev->vq[index].kick_fd = -1; 1270 } 1271 1272 dev->vq[index].kick_fd = nofd ? -1 : vmsg->fds[0]; 1273 DPRINT("Got kick_fd: %d for vq: %d\n", dev->vq[index].kick_fd, index); 1274 1275 dev->vq[index].started = true; 1276 if (dev->iface->queue_set_started) { 1277 dev->iface->queue_set_started(dev, index, true); 1278 } 1279 1280 if (dev->vq[index].kick_fd != -1 && dev->vq[index].handler) { 1281 dev->set_watch(dev, dev->vq[index].kick_fd, VU_WATCH_IN, 1282 vu_kick_cb, (void *)(long)index); 1283 1284 DPRINT("Waiting for kicks on fd: %d for vq: %d\n", 1285 dev->vq[index].kick_fd, index); 1286 } 1287 1288 if (vu_check_queue_inflights(dev, &dev->vq[index])) { 1289 vu_panic(dev, "Failed to check inflights for vq: %d\n", index); 1290 } 1291 1292 return false; 1293 } 1294 1295 void vu_set_queue_handler(VuDev *dev, VuVirtq *vq, 1296 vu_queue_handler_cb handler) 1297 { 1298 int qidx = vq - dev->vq; 1299 1300 vq->handler = handler; 1301 if (vq->kick_fd >= 0) { 1302 if (handler) { 1303 dev->set_watch(dev, vq->kick_fd, VU_WATCH_IN, 1304 vu_kick_cb, (void *)(long)qidx); 1305 } else { 1306 dev->remove_watch(dev, vq->kick_fd); 1307 } 1308 } 1309 } 1310 1311 bool vu_set_queue_host_notifier(VuDev *dev, VuVirtq *vq, int fd, 1312 int size, int offset) 1313 { 1314 int qidx = vq - dev->vq; 1315 int fd_num = 0; 1316 VhostUserMsg vmsg = { 1317 .request = VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG, 1318 .flags = VHOST_USER_VERSION | VHOST_USER_NEED_REPLY_MASK, 1319 .size = sizeof(vmsg.payload.area), 1320 .payload.area = { 1321 .u64 = qidx & VHOST_USER_VRING_IDX_MASK, 1322 .size = size, 1323 .offset = offset, 1324 }, 1325 }; 1326 1327 if (fd == -1) { 1328 vmsg.payload.area.u64 |= VHOST_USER_VRING_NOFD_MASK; 1329 } else { 1330 vmsg.fds[fd_num++] = fd; 1331 } 1332 1333 vmsg.fd_num = fd_num; 1334 1335 if (!vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD)) { 1336 return false; 1337 } 1338 1339 pthread_mutex_lock(&dev->slave_mutex); 1340 if (!vu_message_write(dev, dev->slave_fd, &vmsg)) { 1341 pthread_mutex_unlock(&dev->slave_mutex); 1342 return false; 1343 } 1344 1345 /* Also unlocks the slave_mutex */ 1346 return vu_process_message_reply(dev, &vmsg); 1347 } 1348 1349 static bool 1350 vu_set_vring_call_exec(VuDev *dev, VhostUserMsg *vmsg) 1351 { 1352 int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; 1353 bool nofd = vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK; 1354 1355 DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64); 1356 1357 if 
(!vu_check_queue_msg_file(dev, vmsg)) { 1358 return false; 1359 } 1360 1361 if (dev->vq[index].call_fd != -1) { 1362 close(dev->vq[index].call_fd); 1363 dev->vq[index].call_fd = -1; 1364 } 1365 1366 dev->vq[index].call_fd = nofd ? -1 : vmsg->fds[0]; 1367 1368 /* in case of I/O hang after reconnecting */ 1369 if (dev->vq[index].call_fd != -1 && eventfd_write(vmsg->fds[0], 1)) { 1370 return -1; 1371 } 1372 1373 DPRINT("Got call_fd: %d for vq: %d\n", dev->vq[index].call_fd, index); 1374 1375 return false; 1376 } 1377 1378 static bool 1379 vu_set_vring_err_exec(VuDev *dev, VhostUserMsg *vmsg) 1380 { 1381 int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; 1382 bool nofd = vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK; 1383 1384 DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64); 1385 1386 if (!vu_check_queue_msg_file(dev, vmsg)) { 1387 return false; 1388 } 1389 1390 if (dev->vq[index].err_fd != -1) { 1391 close(dev->vq[index].err_fd); 1392 dev->vq[index].err_fd = -1; 1393 } 1394 1395 dev->vq[index].err_fd = nofd ? -1 : vmsg->fds[0]; 1396 1397 return false; 1398 } 1399 1400 static bool 1401 vu_get_protocol_features_exec(VuDev *dev, VhostUserMsg *vmsg) 1402 { 1403 /* 1404 * Note that we support, but intentionally do not set, 1405 * VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS. This means that 1406 * a device implementation can return it in its callback 1407 * (get_protocol_features) if it wants to use this for 1408 * simulation, but it is otherwise not desirable (if even 1409 * implemented by the master.) 1410 */ 1411 uint64_t features = 1ULL << VHOST_USER_PROTOCOL_F_MQ | 1412 1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD | 1413 1ULL << VHOST_USER_PROTOCOL_F_SLAVE_REQ | 1414 1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER | 1415 1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD | 1416 1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK | 1417 1ULL << VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS; 1418 1419 if (have_userfault()) { 1420 features |= 1ULL << VHOST_USER_PROTOCOL_F_PAGEFAULT; 1421 } 1422 1423 if (dev->iface->get_config && dev->iface->set_config) { 1424 features |= 1ULL << VHOST_USER_PROTOCOL_F_CONFIG; 1425 } 1426 1427 if (dev->iface->get_protocol_features) { 1428 features |= dev->iface->get_protocol_features(dev); 1429 } 1430 1431 vmsg_set_reply_u64(vmsg, features); 1432 return true; 1433 } 1434 1435 static bool 1436 vu_set_protocol_features_exec(VuDev *dev, VhostUserMsg *vmsg) 1437 { 1438 uint64_t features = vmsg->payload.u64; 1439 1440 DPRINT("u64: 0x%016"PRIx64"\n", features); 1441 1442 dev->protocol_features = vmsg->payload.u64; 1443 1444 if (vu_has_protocol_feature(dev, 1445 VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS) && 1446 (!vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_SLAVE_REQ) || 1447 !vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_REPLY_ACK))) { 1448 /* 1449 * The use case for using messages for kick/call is simulation, to make 1450 * the kick and call synchronous. To actually get that behaviour, both 1451 * of the other features are required. 1452 * Theoretically, one could use only kick messages, or do them without 1453 * having F_REPLY_ACK, but too many (possibly pending) messages on the 1454 * socket will eventually cause the master to hang, to avoid this in 1455 * scenarios where not desired enforce that the settings are in a way 1456 * that actually enables the simulation case. 
1457 */ 1458 vu_panic(dev, 1459 "F_IN_BAND_NOTIFICATIONS requires F_SLAVE_REQ && F_REPLY_ACK"); 1460 return false; 1461 } 1462 1463 if (dev->iface->set_protocol_features) { 1464 dev->iface->set_protocol_features(dev, features); 1465 } 1466 1467 return false; 1468 } 1469 1470 static bool 1471 vu_get_queue_num_exec(VuDev *dev, VhostUserMsg *vmsg) 1472 { 1473 vmsg_set_reply_u64(vmsg, dev->max_queues); 1474 return true; 1475 } 1476 1477 static bool 1478 vu_set_vring_enable_exec(VuDev *dev, VhostUserMsg *vmsg) 1479 { 1480 unsigned int index = vmsg->payload.state.index; 1481 unsigned int enable = vmsg->payload.state.num; 1482 1483 DPRINT("State.index: %u\n", index); 1484 DPRINT("State.enable: %u\n", enable); 1485 1486 if (index >= dev->max_queues) { 1487 vu_panic(dev, "Invalid vring_enable index: %u", index); 1488 return false; 1489 } 1490 1491 dev->vq[index].enable = enable; 1492 return false; 1493 } 1494 1495 static bool 1496 vu_set_slave_req_fd(VuDev *dev, VhostUserMsg *vmsg) 1497 { 1498 if (vmsg->fd_num != 1) { 1499 vu_panic(dev, "Invalid slave_req_fd message (%d fd's)", vmsg->fd_num); 1500 return false; 1501 } 1502 1503 if (dev->slave_fd != -1) { 1504 close(dev->slave_fd); 1505 } 1506 dev->slave_fd = vmsg->fds[0]; 1507 DPRINT("Got slave_fd: %d\n", vmsg->fds[0]); 1508 1509 return false; 1510 } 1511 1512 static bool 1513 vu_get_config(VuDev *dev, VhostUserMsg *vmsg) 1514 { 1515 int ret = -1; 1516 1517 if (dev->iface->get_config) { 1518 ret = dev->iface->get_config(dev, vmsg->payload.config.region, 1519 vmsg->payload.config.size); 1520 } 1521 1522 if (ret) { 1523 /* resize to zero to indicate an error to master */ 1524 vmsg->size = 0; 1525 } 1526 1527 return true; 1528 } 1529 1530 static bool 1531 vu_set_config(VuDev *dev, VhostUserMsg *vmsg) 1532 { 1533 int ret = -1; 1534 1535 if (dev->iface->set_config) { 1536 ret = dev->iface->set_config(dev, vmsg->payload.config.region, 1537 vmsg->payload.config.offset, 1538 vmsg->payload.config.size, 1539 vmsg->payload.config.flags); 1540 if (ret) { 1541 vu_panic(dev, "Set virtio configuration space failed"); 1542 } 1543 } 1544 1545 return false; 1546 } 1547 1548 static bool 1549 vu_set_postcopy_advise(VuDev *dev, VhostUserMsg *vmsg) 1550 { 1551 dev->postcopy_ufd = -1; 1552 #ifdef UFFDIO_API 1553 struct uffdio_api api_struct; 1554 1555 dev->postcopy_ufd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); 1556 vmsg->size = 0; 1557 #endif 1558 1559 if (dev->postcopy_ufd == -1) { 1560 vu_panic(dev, "Userfaultfd not available: %s", strerror(errno)); 1561 goto out; 1562 } 1563 1564 #ifdef UFFDIO_API 1565 api_struct.api = UFFD_API; 1566 api_struct.features = 0; 1567 if (ioctl(dev->postcopy_ufd, UFFDIO_API, &api_struct)) { 1568 vu_panic(dev, "Failed UFFDIO_API: %s", strerror(errno)); 1569 close(dev->postcopy_ufd); 1570 dev->postcopy_ufd = -1; 1571 goto out; 1572 } 1573 /* TODO: Stash feature flags somewhere */ 1574 #endif 1575 1576 out: 1577 /* Return a ufd to the QEMU */ 1578 vmsg->fd_num = 1; 1579 vmsg->fds[0] = dev->postcopy_ufd; 1580 return true; /* = send a reply */ 1581 } 1582 1583 static bool 1584 vu_set_postcopy_listen(VuDev *dev, VhostUserMsg *vmsg) 1585 { 1586 if (dev->nregions) { 1587 vu_panic(dev, "Regions already registered at postcopy-listen"); 1588 vmsg_set_reply_u64(vmsg, -1); 1589 return true; 1590 } 1591 dev->postcopy_listening = true; 1592 1593 vmsg_set_reply_u64(vmsg, 0); 1594 return true; 1595 } 1596 1597 static bool 1598 vu_set_postcopy_end(VuDev *dev, VhostUserMsg *vmsg) 1599 { 1600 DPRINT("%s: Entry\n", __func__); 1601 
dev->postcopy_listening = false; 1602 if (dev->postcopy_ufd > 0) { 1603 close(dev->postcopy_ufd); 1604 dev->postcopy_ufd = -1; 1605 DPRINT("%s: Done close\n", __func__); 1606 } 1607 1608 vmsg_set_reply_u64(vmsg, 0); 1609 DPRINT("%s: exit\n", __func__); 1610 return true; 1611 } 1612 1613 static inline uint64_t 1614 vu_inflight_queue_size(uint16_t queue_size) 1615 { 1616 return ALIGN_UP(sizeof(VuDescStateSplit) * queue_size + 1617 sizeof(uint16_t), INFLIGHT_ALIGNMENT); 1618 } 1619 1620 #ifdef MFD_ALLOW_SEALING 1621 static void * 1622 memfd_alloc(const char *name, size_t size, unsigned int flags, int *fd) 1623 { 1624 void *ptr; 1625 int ret; 1626 1627 *fd = memfd_create(name, MFD_ALLOW_SEALING); 1628 if (*fd < 0) { 1629 return NULL; 1630 } 1631 1632 ret = ftruncate(*fd, size); 1633 if (ret < 0) { 1634 close(*fd); 1635 return NULL; 1636 } 1637 1638 ret = fcntl(*fd, F_ADD_SEALS, flags); 1639 if (ret < 0) { 1640 close(*fd); 1641 return NULL; 1642 } 1643 1644 ptr = mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED, *fd, 0); 1645 if (ptr == MAP_FAILED) { 1646 close(*fd); 1647 return NULL; 1648 } 1649 1650 return ptr; 1651 } 1652 #endif 1653 1654 static bool 1655 vu_get_inflight_fd(VuDev *dev, VhostUserMsg *vmsg) 1656 { 1657 int fd = -1; 1658 void *addr = NULL; 1659 uint64_t mmap_size; 1660 uint16_t num_queues, queue_size; 1661 1662 if (vmsg->size != sizeof(vmsg->payload.inflight)) { 1663 vu_panic(dev, "Invalid get_inflight_fd message:%d", vmsg->size); 1664 vmsg->payload.inflight.mmap_size = 0; 1665 return true; 1666 } 1667 1668 num_queues = vmsg->payload.inflight.num_queues; 1669 queue_size = vmsg->payload.inflight.queue_size; 1670 1671 DPRINT("set_inflight_fd num_queues: %"PRId16"\n", num_queues); 1672 DPRINT("set_inflight_fd queue_size: %"PRId16"\n", queue_size); 1673 1674 mmap_size = vu_inflight_queue_size(queue_size) * num_queues; 1675 1676 #ifdef MFD_ALLOW_SEALING 1677 addr = memfd_alloc("vhost-inflight", mmap_size, 1678 F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL, 1679 &fd); 1680 #else 1681 vu_panic(dev, "Not implemented: memfd support is missing"); 1682 #endif 1683 1684 if (!addr) { 1685 vu_panic(dev, "Failed to alloc vhost inflight area"); 1686 vmsg->payload.inflight.mmap_size = 0; 1687 return true; 1688 } 1689 1690 memset(addr, 0, mmap_size); 1691 1692 dev->inflight_info.addr = addr; 1693 dev->inflight_info.size = vmsg->payload.inflight.mmap_size = mmap_size; 1694 dev->inflight_info.fd = vmsg->fds[0] = fd; 1695 vmsg->fd_num = 1; 1696 vmsg->payload.inflight.mmap_offset = 0; 1697 1698 DPRINT("send inflight mmap_size: %"PRId64"\n", 1699 vmsg->payload.inflight.mmap_size); 1700 DPRINT("send inflight mmap offset: %"PRId64"\n", 1701 vmsg->payload.inflight.mmap_offset); 1702 1703 return true; 1704 } 1705 1706 static bool 1707 vu_set_inflight_fd(VuDev *dev, VhostUserMsg *vmsg) 1708 { 1709 int fd, i; 1710 uint64_t mmap_size, mmap_offset; 1711 uint16_t num_queues, queue_size; 1712 void *rc; 1713 1714 if (vmsg->fd_num != 1 || 1715 vmsg->size != sizeof(vmsg->payload.inflight)) { 1716 vu_panic(dev, "Invalid set_inflight_fd message size:%d fds:%d", 1717 vmsg->size, vmsg->fd_num); 1718 return false; 1719 } 1720 1721 fd = vmsg->fds[0]; 1722 mmap_size = vmsg->payload.inflight.mmap_size; 1723 mmap_offset = vmsg->payload.inflight.mmap_offset; 1724 num_queues = vmsg->payload.inflight.num_queues; 1725 queue_size = vmsg->payload.inflight.queue_size; 1726 1727 DPRINT("set_inflight_fd mmap_size: %"PRId64"\n", mmap_size); 1728 DPRINT("set_inflight_fd mmap_offset: %"PRId64"\n", mmap_offset); 1729 
DPRINT("set_inflight_fd num_queues: %"PRId16"\n", num_queues); 1730 DPRINT("set_inflight_fd queue_size: %"PRId16"\n", queue_size); 1731 1732 rc = mmap(0, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, 1733 fd, mmap_offset); 1734 1735 if (rc == MAP_FAILED) { 1736 vu_panic(dev, "set_inflight_fd mmap error: %s", strerror(errno)); 1737 return false; 1738 } 1739 1740 if (dev->inflight_info.fd) { 1741 close(dev->inflight_info.fd); 1742 } 1743 1744 if (dev->inflight_info.addr) { 1745 munmap(dev->inflight_info.addr, dev->inflight_info.size); 1746 } 1747 1748 dev->inflight_info.fd = fd; 1749 dev->inflight_info.addr = rc; 1750 dev->inflight_info.size = mmap_size; 1751 1752 for (i = 0; i < num_queues; i++) { 1753 dev->vq[i].inflight = (VuVirtqInflight *)rc; 1754 dev->vq[i].inflight->desc_num = queue_size; 1755 rc = (void *)((char *)rc + vu_inflight_queue_size(queue_size)); 1756 } 1757 1758 return false; 1759 } 1760 1761 static bool 1762 vu_handle_vring_kick(VuDev *dev, VhostUserMsg *vmsg) 1763 { 1764 unsigned int index = vmsg->payload.state.index; 1765 1766 if (index >= dev->max_queues) { 1767 vu_panic(dev, "Invalid queue index: %u", index); 1768 return false; 1769 } 1770 1771 DPRINT("Got kick message: handler:%p idx:%u\n", 1772 dev->vq[index].handler, index); 1773 1774 if (!dev->vq[index].started) { 1775 dev->vq[index].started = true; 1776 1777 if (dev->iface->queue_set_started) { 1778 dev->iface->queue_set_started(dev, index, true); 1779 } 1780 } 1781 1782 if (dev->vq[index].handler) { 1783 dev->vq[index].handler(dev, index); 1784 } 1785 1786 return false; 1787 } 1788 1789 static bool vu_handle_get_max_memslots(VuDev *dev, VhostUserMsg *vmsg) 1790 { 1791 vmsg->flags = VHOST_USER_REPLY_MASK | VHOST_USER_VERSION; 1792 vmsg->size = sizeof(vmsg->payload.u64); 1793 vmsg->payload.u64 = VHOST_USER_MAX_RAM_SLOTS; 1794 vmsg->fd_num = 0; 1795 1796 if (!vu_message_write(dev, dev->sock, vmsg)) { 1797 vu_panic(dev, "Failed to send max ram slots: %s\n", strerror(errno)); 1798 } 1799 1800 DPRINT("u64: 0x%016"PRIx64"\n", (uint64_t) VHOST_USER_MAX_RAM_SLOTS); 1801 1802 return false; 1803 } 1804 1805 static bool 1806 vu_process_message(VuDev *dev, VhostUserMsg *vmsg) 1807 { 1808 int do_reply = 0; 1809 1810 /* Print out generic part of the request. 
*/ 1811 DPRINT("================ Vhost user message ================\n"); 1812 DPRINT("Request: %s (%d)\n", vu_request_to_string(vmsg->request), 1813 vmsg->request); 1814 DPRINT("Flags: 0x%x\n", vmsg->flags); 1815 DPRINT("Size: %u\n", vmsg->size); 1816 1817 if (vmsg->fd_num) { 1818 int i; 1819 DPRINT("Fds:"); 1820 for (i = 0; i < vmsg->fd_num; i++) { 1821 DPRINT(" %d", vmsg->fds[i]); 1822 } 1823 DPRINT("\n"); 1824 } 1825 1826 if (dev->iface->process_msg && 1827 dev->iface->process_msg(dev, vmsg, &do_reply)) { 1828 return do_reply; 1829 } 1830 1831 switch (vmsg->request) { 1832 case VHOST_USER_GET_FEATURES: 1833 return vu_get_features_exec(dev, vmsg); 1834 case VHOST_USER_SET_FEATURES: 1835 return vu_set_features_exec(dev, vmsg); 1836 case VHOST_USER_GET_PROTOCOL_FEATURES: 1837 return vu_get_protocol_features_exec(dev, vmsg); 1838 case VHOST_USER_SET_PROTOCOL_FEATURES: 1839 return vu_set_protocol_features_exec(dev, vmsg); 1840 case VHOST_USER_SET_OWNER: 1841 return vu_set_owner_exec(dev, vmsg); 1842 case VHOST_USER_RESET_OWNER: 1843 return vu_reset_device_exec(dev, vmsg); 1844 case VHOST_USER_SET_MEM_TABLE: 1845 return vu_set_mem_table_exec(dev, vmsg); 1846 case VHOST_USER_SET_LOG_BASE: 1847 return vu_set_log_base_exec(dev, vmsg); 1848 case VHOST_USER_SET_LOG_FD: 1849 return vu_set_log_fd_exec(dev, vmsg); 1850 case VHOST_USER_SET_VRING_NUM: 1851 return vu_set_vring_num_exec(dev, vmsg); 1852 case VHOST_USER_SET_VRING_ADDR: 1853 return vu_set_vring_addr_exec(dev, vmsg); 1854 case VHOST_USER_SET_VRING_BASE: 1855 return vu_set_vring_base_exec(dev, vmsg); 1856 case VHOST_USER_GET_VRING_BASE: 1857 return vu_get_vring_base_exec(dev, vmsg); 1858 case VHOST_USER_SET_VRING_KICK: 1859 return vu_set_vring_kick_exec(dev, vmsg); 1860 case VHOST_USER_SET_VRING_CALL: 1861 return vu_set_vring_call_exec(dev, vmsg); 1862 case VHOST_USER_SET_VRING_ERR: 1863 return vu_set_vring_err_exec(dev, vmsg); 1864 case VHOST_USER_GET_QUEUE_NUM: 1865 return vu_get_queue_num_exec(dev, vmsg); 1866 case VHOST_USER_SET_VRING_ENABLE: 1867 return vu_set_vring_enable_exec(dev, vmsg); 1868 case VHOST_USER_SET_SLAVE_REQ_FD: 1869 return vu_set_slave_req_fd(dev, vmsg); 1870 case VHOST_USER_GET_CONFIG: 1871 return vu_get_config(dev, vmsg); 1872 case VHOST_USER_SET_CONFIG: 1873 return vu_set_config(dev, vmsg); 1874 case VHOST_USER_NONE: 1875 /* if you need processing before exit, override iface->process_msg */ 1876 exit(0); 1877 case VHOST_USER_POSTCOPY_ADVISE: 1878 return vu_set_postcopy_advise(dev, vmsg); 1879 case VHOST_USER_POSTCOPY_LISTEN: 1880 return vu_set_postcopy_listen(dev, vmsg); 1881 case VHOST_USER_POSTCOPY_END: 1882 return vu_set_postcopy_end(dev, vmsg); 1883 case VHOST_USER_GET_INFLIGHT_FD: 1884 return vu_get_inflight_fd(dev, vmsg); 1885 case VHOST_USER_SET_INFLIGHT_FD: 1886 return vu_set_inflight_fd(dev, vmsg); 1887 case VHOST_USER_VRING_KICK: 1888 return vu_handle_vring_kick(dev, vmsg); 1889 case VHOST_USER_GET_MAX_MEM_SLOTS: 1890 return vu_handle_get_max_memslots(dev, vmsg); 1891 case VHOST_USER_ADD_MEM_REG: 1892 return vu_add_mem_reg(dev, vmsg); 1893 case VHOST_USER_REM_MEM_REG: 1894 return vu_rem_mem_reg(dev, vmsg); 1895 default: 1896 vmsg_close_fds(vmsg); 1897 vu_panic(dev, "Unhandled request: %d", vmsg->request); 1898 } 1899 1900 return false; 1901 } 1902 1903 bool 1904 vu_dispatch(VuDev *dev) 1905 { 1906 VhostUserMsg vmsg = { 0, }; 1907 int reply_requested; 1908 bool need_reply, success = false; 1909 1910 if (!dev->read_msg(dev, dev->sock, &vmsg)) { 1911 goto end; 1912 } 1913 1914 need_reply = vmsg.flags & 
VHOST_USER_NEED_REPLY_MASK; 1915 1916 reply_requested = vu_process_message(dev, &vmsg); 1917 if (!reply_requested && need_reply) { 1918 vmsg_set_reply_u64(&vmsg, 0); 1919 reply_requested = 1; 1920 } 1921 1922 if (!reply_requested) { 1923 success = true; 1924 goto end; 1925 } 1926 1927 if (!vu_send_reply(dev, dev->sock, &vmsg)) { 1928 goto end; 1929 } 1930 1931 success = true; 1932 1933 end: 1934 free(vmsg.data); 1935 return success; 1936 } 1937 1938 void 1939 vu_deinit(VuDev *dev) 1940 { 1941 int i; 1942 1943 for (i = 0; i < dev->nregions; i++) { 1944 VuDevRegion *r = &dev->regions[i]; 1945 void *m = (void *) (uintptr_t) r->mmap_addr; 1946 if (m != MAP_FAILED) { 1947 munmap(m, r->size + r->mmap_offset); 1948 } 1949 } 1950 dev->nregions = 0; 1951 1952 for (i = 0; i < dev->max_queues; i++) { 1953 VuVirtq *vq = &dev->vq[i]; 1954 1955 if (vq->call_fd != -1) { 1956 close(vq->call_fd); 1957 vq->call_fd = -1; 1958 } 1959 1960 if (vq->kick_fd != -1) { 1961 dev->remove_watch(dev, vq->kick_fd); 1962 close(vq->kick_fd); 1963 vq->kick_fd = -1; 1964 } 1965 1966 if (vq->err_fd != -1) { 1967 close(vq->err_fd); 1968 vq->err_fd = -1; 1969 } 1970 1971 if (vq->resubmit_list) { 1972 free(vq->resubmit_list); 1973 vq->resubmit_list = NULL; 1974 } 1975 1976 vq->inflight = NULL; 1977 } 1978 1979 if (dev->inflight_info.addr) { 1980 munmap(dev->inflight_info.addr, dev->inflight_info.size); 1981 dev->inflight_info.addr = NULL; 1982 } 1983 1984 if (dev->inflight_info.fd > 0) { 1985 close(dev->inflight_info.fd); 1986 dev->inflight_info.fd = -1; 1987 } 1988 1989 vu_close_log(dev); 1990 if (dev->slave_fd != -1) { 1991 close(dev->slave_fd); 1992 dev->slave_fd = -1; 1993 } 1994 pthread_mutex_destroy(&dev->slave_mutex); 1995 1996 if (dev->sock != -1) { 1997 close(dev->sock); 1998 } 1999 2000 free(dev->vq); 2001 dev->vq = NULL; 2002 } 2003 2004 bool 2005 vu_init(VuDev *dev, 2006 uint16_t max_queues, 2007 int socket, 2008 vu_panic_cb panic, 2009 vu_read_msg_cb read_msg, 2010 vu_set_watch_cb set_watch, 2011 vu_remove_watch_cb remove_watch, 2012 const VuDevIface *iface) 2013 { 2014 uint16_t i; 2015 2016 assert(max_queues > 0); 2017 assert(socket >= 0); 2018 assert(set_watch); 2019 assert(remove_watch); 2020 assert(iface); 2021 assert(panic); 2022 2023 memset(dev, 0, sizeof(*dev)); 2024 2025 dev->sock = socket; 2026 dev->panic = panic; 2027 dev->read_msg = read_msg ? 
read_msg : vu_message_read_default; 2028 dev->set_watch = set_watch; 2029 dev->remove_watch = remove_watch; 2030 dev->iface = iface; 2031 dev->log_call_fd = -1; 2032 pthread_mutex_init(&dev->slave_mutex, NULL); 2033 dev->slave_fd = -1; 2034 dev->max_queues = max_queues; 2035 2036 dev->vq = malloc(max_queues * sizeof(dev->vq[0])); 2037 if (!dev->vq) { 2038 DPRINT("%s: failed to malloc virtqueues\n", __func__); 2039 return false; 2040 } 2041 2042 for (i = 0; i < max_queues; i++) { 2043 dev->vq[i] = (VuVirtq) { 2044 .call_fd = -1, .kick_fd = -1, .err_fd = -1, 2045 .notification = true, 2046 }; 2047 } 2048 2049 return true; 2050 } 2051 2052 VuVirtq * 2053 vu_get_queue(VuDev *dev, int qidx) 2054 { 2055 assert(qidx < dev->max_queues); 2056 return &dev->vq[qidx]; 2057 } 2058 2059 bool 2060 vu_queue_enabled(VuDev *dev, VuVirtq *vq) 2061 { 2062 return vq->enable; 2063 } 2064 2065 bool 2066 vu_queue_started(const VuDev *dev, const VuVirtq *vq) 2067 { 2068 return vq->started; 2069 } 2070 2071 static inline uint16_t 2072 vring_avail_flags(VuVirtq *vq) 2073 { 2074 return le16toh(vq->vring.avail->flags); 2075 } 2076 2077 static inline uint16_t 2078 vring_avail_idx(VuVirtq *vq) 2079 { 2080 vq->shadow_avail_idx = le16toh(vq->vring.avail->idx); 2081 2082 return vq->shadow_avail_idx; 2083 } 2084 2085 static inline uint16_t 2086 vring_avail_ring(VuVirtq *vq, int i) 2087 { 2088 return le16toh(vq->vring.avail->ring[i]); 2089 } 2090 2091 static inline uint16_t 2092 vring_get_used_event(VuVirtq *vq) 2093 { 2094 return vring_avail_ring(vq, vq->vring.num); 2095 } 2096 2097 static int 2098 virtqueue_num_heads(VuDev *dev, VuVirtq *vq, unsigned int idx) 2099 { 2100 uint16_t num_heads = vring_avail_idx(vq) - idx; 2101 2102 /* Check it isn't doing very strange things with descriptor numbers. */ 2103 if (num_heads > vq->vring.num) { 2104 vu_panic(dev, "Guest moved used index from %u to %u", 2105 idx, vq->shadow_avail_idx); 2106 return -1; 2107 } 2108 if (num_heads) { 2109 /* On success, callers read a descriptor at vq->last_avail_idx. 2110 * Make sure descriptor read does not bypass avail index read. */ 2111 smp_rmb(); 2112 } 2113 2114 return num_heads; 2115 } 2116 2117 static bool 2118 virtqueue_get_head(VuDev *dev, VuVirtq *vq, 2119 unsigned int idx, unsigned int *head) 2120 { 2121 /* Grab the next descriptor number they're advertising, and increment 2122 * the index we've seen. */ 2123 *head = vring_avail_ring(vq, idx % vq->vring.num); 2124 2125 /* If their number is silly, that's a fatal mistake. 

static int
virtqueue_read_indirect_desc(VuDev *dev, struct vring_desc *desc,
                             uint64_t addr, size_t len)
{
    struct vring_desc *ori_desc;
    uint64_t read_len;

    if (len > (VIRTQUEUE_MAX_SIZE * sizeof(struct vring_desc))) {
        return -1;
    }

    if (len == 0) {
        return -1;
    }

    while (len) {
        read_len = len;
        ori_desc = vu_gpa_to_va(dev, &read_len, addr);
        if (!ori_desc) {
            return -1;
        }

        memcpy(desc, ori_desc, read_len);
        len -= read_len;
        addr += read_len;
        /* Advance by the number of bytes actually copied: read_len is a
         * byte count, so plain pointer arithmetic on desc would over-step
         * whenever the table straddles a memory region boundary. */
        desc = (struct vring_desc *)((uint8_t *)desc + read_len);
    }

    return 0;
}

enum {
    VIRTQUEUE_READ_DESC_ERROR = -1,
    VIRTQUEUE_READ_DESC_DONE = 0,   /* end of chain */
    VIRTQUEUE_READ_DESC_MORE = 1,   /* more buffers in chain */
};

static int
virtqueue_read_next_desc(VuDev *dev, struct vring_desc *desc,
                         int i, unsigned int max, unsigned int *next)
{
    /* If this descriptor says it doesn't chain, we're done. */
    if (!(le16toh(desc[i].flags) & VRING_DESC_F_NEXT)) {
        return VIRTQUEUE_READ_DESC_DONE;
    }

    /* Check they're not leading us off end of descriptors. */
    *next = le16toh(desc[i].next);
    /* Make sure compiler knows to grab that: we don't want it changing! */
    smp_wmb();

    if (*next >= max) {
        vu_panic(dev, "Desc next is %u", *next);
        return VIRTQUEUE_READ_DESC_ERROR;
    }

    return VIRTQUEUE_READ_DESC_MORE;
}

void
vu_queue_get_avail_bytes(VuDev *dev, VuVirtq *vq, unsigned int *in_bytes,
                         unsigned int *out_bytes,
                         unsigned max_in_bytes, unsigned max_out_bytes)
{
    unsigned int idx;
    unsigned int total_bufs, in_total, out_total;
    int rc;

    idx = vq->last_avail_idx;

    total_bufs = in_total = out_total = 0;
    if (unlikely(dev->broken) ||
        unlikely(!vq->vring.avail)) {
        goto done;
    }

    while ((rc = virtqueue_num_heads(dev, vq, idx)) > 0) {
        unsigned int max, desc_len, num_bufs, indirect = 0;
        uint64_t desc_addr, read_len;
        struct vring_desc *desc;
        struct vring_desc desc_buf[VIRTQUEUE_MAX_SIZE];
        unsigned int i;

        max = vq->vring.num;
        num_bufs = total_bufs;
        if (!virtqueue_get_head(dev, vq, idx++, &i)) {
            goto err;
        }
        desc = vq->vring.desc;

        if (le16toh(desc[i].flags) & VRING_DESC_F_INDIRECT) {
            if (le32toh(desc[i].len) % sizeof(struct vring_desc)) {
                vu_panic(dev, "Invalid size for indirect buffer table");
                goto err;
            }

            /* If we've got too many, that implies a descriptor loop. */
            if (num_bufs >= max) {
                vu_panic(dev, "Looped descriptor");
                goto err;
            }

            /* loop over the indirect descriptor table */
            indirect = 1;
            desc_addr = le64toh(desc[i].addr);
            desc_len = le32toh(desc[i].len);
            max = desc_len / sizeof(struct vring_desc);
            read_len = desc_len;
            desc = vu_gpa_to_va(dev, &read_len, desc_addr);
            if (unlikely(desc && read_len != desc_len)) {
                /* Failed to use zero copy */
                desc = NULL;
                if (!virtqueue_read_indirect_desc(dev, desc_buf,
                                                  desc_addr,
                                                  desc_len)) {
                    desc = desc_buf;
                }
            }
            if (!desc) {
                vu_panic(dev, "Invalid indirect buffer table");
                goto err;
            }
            num_bufs = i = 0;
        }

        do {
            /* If we've got too many, that implies a descriptor loop. */
            if (++num_bufs > max) {
                vu_panic(dev, "Looped descriptor");
                goto err;
            }

            if (le16toh(desc[i].flags) & VRING_DESC_F_WRITE) {
                in_total += le32toh(desc[i].len);
            } else {
                out_total += le32toh(desc[i].len);
            }
            if (in_total >= max_in_bytes && out_total >= max_out_bytes) {
                goto done;
            }
            rc = virtqueue_read_next_desc(dev, desc, i, max, &i);
        } while (rc == VIRTQUEUE_READ_DESC_MORE);

        if (rc == VIRTQUEUE_READ_DESC_ERROR) {
            goto err;
        }

        if (!indirect) {
            total_bufs = num_bufs;
        } else {
            total_bufs++;
        }
    }
    if (rc < 0) {
        goto err;
    }
done:
    if (in_bytes) {
        *in_bytes = in_total;
    }
    if (out_bytes) {
        *out_bytes = out_total;
    }
    return;

err:
    in_total = out_total = 0;
    goto done;
}

bool
vu_queue_avail_bytes(VuDev *dev, VuVirtq *vq, unsigned int in_bytes,
                     unsigned int out_bytes)
{
    unsigned int in_total, out_total;

    vu_queue_get_avail_bytes(dev, vq, &in_total, &out_total,
                             in_bytes, out_bytes);

    return in_bytes <= in_total && out_bytes <= out_total;
}

/* Fetch avail_idx from VQ memory only when we really need to know if
 * guest has added some buffers. */
bool
vu_queue_empty(VuDev *dev, VuVirtq *vq)
{
    if (unlikely(dev->broken) ||
        unlikely(!vq->vring.avail)) {
        return true;
    }

    if (vq->shadow_avail_idx != vq->last_avail_idx) {
        return false;
    }

    return vring_avail_idx(vq) == vq->last_avail_idx;
}
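
/*
 * Illustrative sketch (not part of the library): how a backend might
 * check queue state before producing data into a virtqueue. The helper
 * name example_can_send and the qidx/msg_len parameters are assumptions
 * made for this example.
 */
#if 0   /* example only, kept out of the build */
static bool
example_can_send(VuDev *dev, int qidx, unsigned int msg_len)
{
    VuVirtq *vq = vu_get_queue(dev, qidx);

    /* The queue must have been started and enabled by the front-end. */
    if (!vu_queue_started(dev, vq) || !vu_queue_enabled(dev, vq)) {
        return false;
    }

    /* Nothing can be filled in if the guest has posted no buffers. */
    if (vu_queue_empty(dev, vq)) {
        return false;
    }

    /* Require at least msg_len bytes of device-writable buffer space
     * (the in_bytes argument); no device-readable space is needed. */
    return vu_queue_avail_bytes(dev, vq, msg_len, 0);
}
#endif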

static bool
vring_notify(VuDev *dev, VuVirtq *vq)
{
    uint16_t old, new;
    bool v;

    /* We need to expose used array entries before checking used event. */
    smp_mb();

    /* Always notify when queue is empty (when feature acknowledge) */
    if (vu_has_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY) &&
        !vq->inuse && vu_queue_empty(dev, vq)) {
        return true;
    }

    if (!vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
        return !(vring_avail_flags(vq) & VRING_AVAIL_F_NO_INTERRUPT);
    }

    v = vq->signalled_used_valid;
    vq->signalled_used_valid = true;
    old = vq->signalled_used;
    new = vq->signalled_used = vq->used_idx;
    return !v || vring_need_event(vring_get_used_event(vq), new, old);
}

static void _vu_queue_notify(VuDev *dev, VuVirtq *vq, bool sync)
{
    if (unlikely(dev->broken) ||
        unlikely(!vq->vring.avail)) {
        return;
    }

    if (!vring_notify(dev, vq)) {
        DPRINT("skipped notify...\n");
        return;
    }

    if (vq->call_fd < 0 &&
        vu_has_protocol_feature(dev,
                                VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS) &&
        vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_SLAVE_REQ)) {
        VhostUserMsg vmsg = {
            .request = VHOST_USER_SLAVE_VRING_CALL,
            .flags = VHOST_USER_VERSION,
            .size = sizeof(vmsg.payload.state),
            .payload.state = {
                .index = vq - dev->vq,
            },
        };
        bool ack = sync &&
                   vu_has_protocol_feature(dev,
                                           VHOST_USER_PROTOCOL_F_REPLY_ACK);

        if (ack) {
            vmsg.flags |= VHOST_USER_NEED_REPLY_MASK;
        }

        vu_message_write(dev, dev->slave_fd, &vmsg);
        if (ack) {
            vu_message_read_default(dev, dev->slave_fd, &vmsg);
        }
        return;
    }

    if (eventfd_write(vq->call_fd, 1) < 0) {
        vu_panic(dev, "Error writing eventfd: %s", strerror(errno));
    }
}

void vu_queue_notify(VuDev *dev, VuVirtq *vq)
{
    _vu_queue_notify(dev, vq, false);
}

void vu_queue_notify_sync(VuDev *dev, VuVirtq *vq)
{
    _vu_queue_notify(dev, vq, true);
}

static inline void
vring_used_flags_set_bit(VuVirtq *vq, int mask)
{
    uint16_t *flags;

    flags = (uint16_t *)((char *)vq->vring.used +
                         offsetof(struct vring_used, flags));
    *flags = htole16(le16toh(*flags) | mask);
}

static inline void
vring_used_flags_unset_bit(VuVirtq *vq, int mask)
{
    uint16_t *flags;

    flags = (uint16_t *)((char *)vq->vring.used +
                         offsetof(struct vring_used, flags));
    *flags = htole16(le16toh(*flags) & ~mask);
}

static inline void
vring_set_avail_event(VuVirtq *vq, uint16_t val)
{
    uint16_t *avail;

    if (!vq->notification) {
        return;
    }

    avail = (uint16_t *)&vq->vring.used->ring[vq->vring.num];
    *avail = htole16(val);
}

void
vu_queue_set_notification(VuDev *dev, VuVirtq *vq, int enable)
{
    vq->notification = enable;
    if (vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
        vring_set_avail_event(vq, vring_avail_idx(vq));
    } else if (enable) {
        vring_used_flags_unset_bit(vq, VRING_USED_F_NO_NOTIFY);
    } else {
        vring_used_flags_set_bit(vq, VRING_USED_F_NO_NOTIFY);
    }
    if (enable) {
        /* Expose avail event/used flags before caller checks the avail idx. */
        smp_mb();
    }
}
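
/*
 * Illustrative sketch (not part of the library): the usual way to pair
 * vu_queue_set_notification() with vu_queue_empty(), so kicks are
 * suppressed while the backend is already draining the queue and
 * re-enabled (with a final re-check to close the race) once it runs dry.
 * example_drain_queue and example_process_one are assumptions of this
 * sketch; example_process_one is expected to complete the element (e.g.
 * via vu_queue_push()), while the caller still owns the allocation.
 */
#if 0   /* example only, kept out of the build */
static void example_process_one(VuDev *dev, VuVirtq *vq, VuVirtqElement *elem);

static void
example_drain_queue(VuDev *dev, VuVirtq *vq)
{
    VuVirtqElement *elem;

    do {
        /* No need for further kicks while we are actively polling. */
        vu_queue_set_notification(dev, vq, 0);

        while ((elem = vu_queue_pop(dev, vq, sizeof(VuVirtqElement)))) {
            example_process_one(dev, vq, elem);
            free(elem);
        }

        /* Re-enable notifications, then re-check for buffers that may
         * have been added while notifications were still disabled. */
        vu_queue_set_notification(dev, vq, 1);
    } while (!vu_queue_empty(dev, vq));
}
#endif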

static bool
virtqueue_map_desc(VuDev *dev,
                   unsigned int *p_num_sg, struct iovec *iov,
                   unsigned int max_num_sg, bool is_write,
                   uint64_t pa, size_t sz)
{
    unsigned num_sg = *p_num_sg;

    assert(num_sg <= max_num_sg);

    if (!sz) {
        vu_panic(dev, "virtio: zero sized buffers are not allowed");
        return false;
    }

    while (sz) {
        uint64_t len = sz;

        if (num_sg == max_num_sg) {
            vu_panic(dev, "virtio: too many descriptors in indirect table");
            return false;
        }

        iov[num_sg].iov_base = vu_gpa_to_va(dev, &len, pa);
        if (iov[num_sg].iov_base == NULL) {
            vu_panic(dev, "virtio: invalid address for buffers");
            return false;
        }
        iov[num_sg].iov_len = len;
        num_sg++;
        sz -= len;
        pa += len;
    }

    *p_num_sg = num_sg;
    return true;
}

static void *
virtqueue_alloc_element(size_t sz,
                        unsigned out_num, unsigned in_num)
{
    VuVirtqElement *elem;
    size_t in_sg_ofs = ALIGN_UP(sz, __alignof__(elem->in_sg[0]));
    size_t out_sg_ofs = in_sg_ofs + in_num * sizeof(elem->in_sg[0]);
    size_t out_sg_end = out_sg_ofs + out_num * sizeof(elem->out_sg[0]);

    assert(sz >= sizeof(VuVirtqElement));
    elem = malloc(out_sg_end);
    if (!elem) {
        DPRINT("%s: failed to malloc virtqueue element\n", __func__);
        return NULL;
    }
    elem->out_num = out_num;
    elem->in_num = in_num;
    elem->in_sg = (void *)elem + in_sg_ofs;
    elem->out_sg = (void *)elem + out_sg_ofs;
    return elem;
}

static void *
vu_queue_map_desc(VuDev *dev, VuVirtq *vq, unsigned int idx, size_t sz)
{
    struct vring_desc *desc = vq->vring.desc;
    uint64_t desc_addr, read_len;
    unsigned int desc_len;
    unsigned int max = vq->vring.num;
    unsigned int i = idx;
    VuVirtqElement *elem;
    unsigned int out_num = 0, in_num = 0;
    struct iovec iov[VIRTQUEUE_MAX_SIZE];
    struct vring_desc desc_buf[VIRTQUEUE_MAX_SIZE];
    int rc;

    if (le16toh(desc[i].flags) & VRING_DESC_F_INDIRECT) {
        if (le32toh(desc[i].len) % sizeof(struct vring_desc)) {
            vu_panic(dev, "Invalid size for indirect buffer table");
            return NULL;
        }

        /* loop over the indirect descriptor table */
        desc_addr = le64toh(desc[i].addr);
        desc_len = le32toh(desc[i].len);
        max = desc_len / sizeof(struct vring_desc);
        read_len = desc_len;
        desc = vu_gpa_to_va(dev, &read_len, desc_addr);
        if (unlikely(desc && read_len != desc_len)) {
            /* Failed to use zero copy */
            desc = NULL;
            if (!virtqueue_read_indirect_desc(dev, desc_buf,
                                              desc_addr,
                                              desc_len)) {
                desc = desc_buf;
            }
        }
        if (!desc) {
            vu_panic(dev, "Invalid indirect buffer table");
            return NULL;
        }
        i = 0;
    }

    /* Collect all the descriptors */
    do {
        if (le16toh(desc[i].flags) & VRING_DESC_F_WRITE) {
            if (!virtqueue_map_desc(dev, &in_num, iov + out_num,
                                    VIRTQUEUE_MAX_SIZE - out_num, true,
                                    le64toh(desc[i].addr),
                                    le32toh(desc[i].len))) {
                return NULL;
            }
        } else {
            if (in_num) {
                vu_panic(dev, "Incorrect order for descriptors");
                return NULL;
            }
            if (!virtqueue_map_desc(dev, &out_num, iov,
                                    VIRTQUEUE_MAX_SIZE, false,
                                    le64toh(desc[i].addr),
                                    le32toh(desc[i].len))) {
                return NULL;
            }
        }

        /* If we've got too many, that implies a descriptor loop. */
        if ((in_num + out_num) > max) {
            vu_panic(dev, "Looped descriptor");
            return NULL;
        }
        rc = virtqueue_read_next_desc(dev, desc, i, max, &i);
    } while (rc == VIRTQUEUE_READ_DESC_MORE);

    if (rc == VIRTQUEUE_READ_DESC_ERROR) {
        vu_panic(dev, "read descriptor error");
        return NULL;
    }

    /* Now copy what we have collected and mapped */
    elem = virtqueue_alloc_element(sz, out_num, in_num);
    if (!elem) {
        return NULL;
    }
    elem->index = idx;
    for (i = 0; i < out_num; i++) {
        elem->out_sg[i] = iov[i];
    }
    for (i = 0; i < in_num; i++) {
        elem->in_sg[i] = iov[out_num + i];
    }

    return elem;
}
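
/*
 * Illustrative sketch (not part of the library): the sz argument that
 * flows from vu_queue_pop() (defined below) into virtqueue_alloc_element()
 * lets a backend embed VuVirtqElement at the start of its own request
 * structure, so per-request state and the scatter/gather arrays share a
 * single allocation. ExampleReq and example_pop_request are assumptions
 * made for this example.
 */
#if 0   /* example only, kept out of the build */
typedef struct ExampleReq {
    VuVirtqElement elem;    /* must remain the first member */
    uint32_t status;        /* backend-specific state follows */
} ExampleReq;

static ExampleReq *
example_pop_request(VuDev *dev, VuVirtq *vq)
{
    /* The returned allocation is sizeof(ExampleReq) plus the iovec
     * arrays; elem.in_sg/elem.out_sg point just past the struct. */
    return vu_queue_pop(dev, vq, sizeof(ExampleReq));
}
#endif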

/* Mark the chain headed by desc_idx as in flight in the shared inflight
 * buffer, tagged with an increasing submission counter. */
static int
vu_queue_inflight_get(VuDev *dev, VuVirtq *vq, int desc_idx)
{
    if (!vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
        return 0;
    }

    if (unlikely(!vq->inflight)) {
        return -1;
    }

    vq->inflight->desc[desc_idx].counter = vq->counter++;
    vq->inflight->desc[desc_idx].inflight = 1;

    return 0;
}

/* Record the head of the batch that is about to be placed in the used ring. */
static int
vu_queue_inflight_pre_put(VuDev *dev, VuVirtq *vq, int desc_idx)
{
    if (!vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
        return 0;
    }

    if (unlikely(!vq->inflight)) {
        return -1;
    }

    vq->inflight->last_batch_head = desc_idx;

    return 0;
}

/* Clear the in-flight mark and publish the new used index after the chain
 * has been returned to the used ring. */
static int
vu_queue_inflight_post_put(VuDev *dev, VuVirtq *vq, int desc_idx)
{
    if (!vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
        return 0;
    }

    if (unlikely(!vq->inflight)) {
        return -1;
    }

    barrier();

    vq->inflight->desc[desc_idx].inflight = 0;

    barrier();

    vq->inflight->used_idx = vq->used_idx;

    return 0;
}

void *
vu_queue_pop(VuDev *dev, VuVirtq *vq, size_t sz)
{
    int i;
    unsigned int head;
    VuVirtqElement *elem;

    if (unlikely(dev->broken) ||
        unlikely(!vq->vring.avail)) {
        return NULL;
    }

    if (unlikely(vq->resubmit_list && vq->resubmit_num > 0)) {
        i = (--vq->resubmit_num);
        elem = vu_queue_map_desc(dev, vq, vq->resubmit_list[i].index, sz);

        if (!vq->resubmit_num) {
            free(vq->resubmit_list);
            vq->resubmit_list = NULL;
        }

        return elem;
    }

    if (vu_queue_empty(dev, vq)) {
        return NULL;
    }
    /*
     * Needed after virtio_queue_empty(), see comment in
     * virtqueue_num_heads().
     */
    smp_rmb();

    if (vq->inuse >= vq->vring.num) {
        vu_panic(dev, "Virtqueue size exceeded");
        return NULL;
    }

    if (!virtqueue_get_head(dev, vq, vq->last_avail_idx++, &head)) {
        return NULL;
    }

    if (vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
        vring_set_avail_event(vq, vq->last_avail_idx);
    }

    elem = vu_queue_map_desc(dev, vq, head, sz);

    if (!elem) {
        return NULL;
    }

    vq->inuse++;

    vu_queue_inflight_get(dev, vq, head);

    return elem;
}

static void
vu_queue_detach_element(VuDev *dev, VuVirtq *vq, VuVirtqElement *elem,
                        size_t len)
{
    vq->inuse--;
    /* unmap, when DMA support is added */
}

void
vu_queue_unpop(VuDev *dev, VuVirtq *vq, VuVirtqElement *elem,
               size_t len)
{
    vq->last_avail_idx--;
    vu_queue_detach_element(dev, vq, elem, len);
}

bool
vu_queue_rewind(VuDev *dev, VuVirtq *vq, unsigned int num)
{
    if (num > vq->inuse) {
        return false;
    }
    vq->last_avail_idx -= num;
    vq->inuse -= num;
    return true;
}
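
/*
 * Illustrative sketch (not part of the library): undoing a pop when the
 * backend cannot complete a request. A single element can be handed back
 * with vu_queue_unpop(); a whole batch popped speculatively can be handed
 * back with vu_queue_rewind(num). example_pop_and_start and
 * example_try_start_io are assumptions made for this example.
 */
#if 0   /* example only, kept out of the build */
static bool example_try_start_io(VuVirtqElement *elem);

static bool
example_pop_and_start(VuDev *dev, VuVirtq *vq)
{
    VuVirtqElement *elem = vu_queue_pop(dev, vq, sizeof(VuVirtqElement));

    if (!elem) {
        return false;
    }

    if (!example_try_start_io(elem)) {
        /* Roll back last_avail_idx so the same chain is popped again
         * later; the element allocation itself is still ours to free. */
        vu_queue_unpop(dev, vq, elem, 0);
        free(elem);
        return false;
    }

    return true;
}
#endif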

static inline
void vring_used_write(VuDev *dev, VuVirtq *vq,
                      struct vring_used_elem *uelem, int i)
{
    struct vring_used *used = vq->vring.used;

    used->ring[i] = *uelem;
    vu_log_write(dev, vq->vring.log_guest_addr +
                 offsetof(struct vring_used, ring[i]),
                 sizeof(used->ring[i]));
}

static void
vu_log_queue_fill(VuDev *dev, VuVirtq *vq,
                  const VuVirtqElement *elem,
                  unsigned int len)
{
    struct vring_desc *desc = vq->vring.desc;
    unsigned int i, max, min, desc_len;
    uint64_t desc_addr, read_len;
    struct vring_desc desc_buf[VIRTQUEUE_MAX_SIZE];
    unsigned num_bufs = 0;

    max = vq->vring.num;
    i = elem->index;

    if (le16toh(desc[i].flags) & VRING_DESC_F_INDIRECT) {
        if (le32toh(desc[i].len) % sizeof(struct vring_desc)) {
            vu_panic(dev, "Invalid size for indirect buffer table");
            return;
        }

        /* loop over the indirect descriptor table */
        desc_addr = le64toh(desc[i].addr);
        desc_len = le32toh(desc[i].len);
        max = desc_len / sizeof(struct vring_desc);
        read_len = desc_len;
        desc = vu_gpa_to_va(dev, &read_len, desc_addr);
        if (unlikely(desc && read_len != desc_len)) {
            /* Failed to use zero copy */
            desc = NULL;
            if (!virtqueue_read_indirect_desc(dev, desc_buf,
                                              desc_addr,
                                              desc_len)) {
                desc = desc_buf;
            }
        }
        if (!desc) {
            vu_panic(dev, "Invalid indirect buffer table");
            return;
        }
        i = 0;
    }

    do {
        if (++num_bufs > max) {
            vu_panic(dev, "Looped descriptor");
            return;
        }

        if (le16toh(desc[i].flags) & VRING_DESC_F_WRITE) {
            min = MIN(le32toh(desc[i].len), len);
            vu_log_write(dev, le64toh(desc[i].addr), min);
            len -= min;
        }

    } while (len > 0 &&
             (virtqueue_read_next_desc(dev, desc, i, max, &i)
              == VIRTQUEUE_READ_DESC_MORE));
}

void
vu_queue_fill(VuDev *dev, VuVirtq *vq,
              const VuVirtqElement *elem,
              unsigned int len, unsigned int idx)
{
    struct vring_used_elem uelem;

    if (unlikely(dev->broken) ||
        unlikely(!vq->vring.avail)) {
        return;
    }

    vu_log_queue_fill(dev, vq, elem, len);

    idx = (idx + vq->used_idx) % vq->vring.num;

    uelem.id = htole32(elem->index);
    uelem.len = htole32(len);
    vring_used_write(dev, vq, &uelem, idx);
}

static inline
void vring_used_idx_set(VuDev *dev, VuVirtq *vq, uint16_t val)
{
    vq->vring.used->idx = htole16(val);
    vu_log_write(dev,
                 vq->vring.log_guest_addr + offsetof(struct vring_used, idx),
                 sizeof(vq->vring.used->idx));

    vq->used_idx = val;
}

void
vu_queue_flush(VuDev *dev, VuVirtq *vq, unsigned int count)
{
    uint16_t old, new;

    if (unlikely(dev->broken) ||
        unlikely(!vq->vring.avail)) {
        return;
    }

    /* Make sure buffer is written before we update index. */
    smp_wmb();

    old = vq->used_idx;
    new = old + count;
    vring_used_idx_set(dev, vq, new);
    vq->inuse -= count;
    if (unlikely((int16_t)(new - vq->signalled_used) < (uint16_t)(new - old))) {
        vq->signalled_used_valid = false;
    }
}

void
vu_queue_push(VuDev *dev, VuVirtq *vq,
              const VuVirtqElement *elem, unsigned int len)
{
    vu_queue_fill(dev, vq, elem, len, 0);
    vu_queue_inflight_pre_put(dev, vq, elem->index);
    vu_queue_flush(dev, vq, 1);
    vu_queue_inflight_post_put(dev, vq, elem->index);
}
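
/*
 * Illustrative sketch (not part of the library): the typical
 * pop/process/push cycle run from a kick handler. For a batch of
 * completions, calling vu_queue_fill() with increasing idx values and
 * then a single vu_queue_flush(count) plus one vu_queue_notify() is
 * equivalent to repeated vu_queue_push() calls but cheaper.
 * example_kick_handler and the zero-fill of the first writable segment
 * (standing in for real request handling) are assumptions of this sketch.
 */
#if 0   /* example only, kept out of the build */
static void
example_kick_handler(VuDev *dev, int qidx)
{
    VuVirtq *vq = vu_get_queue(dev, qidx);
    VuVirtqElement *elem;

    while ((elem = vu_queue_pop(dev, vq, sizeof(VuVirtqElement)))) {
        unsigned int written = 0;

        /* Driver-readable data is in elem->out_sg[0..out_num);
         * device-writable space is in elem->in_sg[0..in_num). */
        if (elem->in_num > 0) {
            written = MIN(elem->in_sg[0].iov_len, (size_t)64);
            memset(elem->in_sg[0].iov_base, 0, written);
        }

        /* Return the chain to the used ring and signal the guest. */
        vu_queue_push(dev, vq, elem, written);
        vu_queue_notify(dev, vq);
        free(elem);
    }
}
#endif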