/*
 * VDUSE (vDPA Device in Userspace) library
 *
 * Copyright (C) 2022 Bytedance Inc. and/or its affiliates. All rights reserved.
 * Portions of the code and concepts are borrowed from libvhost-user.c, so:
 * Copyright IBM, Corp. 2007
 * Copyright (c) 2016 Red Hat, Inc.
 *
 * Authors:
 *  Xie Yongji <xieyongji@bytedance.com>
 *  Anthony Liguori <aliguori@us.ibm.com>
 *  Marc-André Lureau <mlureau@redhat.com>
 *  Victor Kaplansky <victork@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or
 * later. See the COPYING file in the top-level directory.
 */

#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif

#include <stdlib.h>
#include <stdio.h>
#include <stdbool.h>
#include <stddef.h>
#include <errno.h>
#include <string.h>
#include <assert.h>
#include <endian.h>
#include <unistd.h>
#include <limits.h>
#include <fcntl.h>
#include <inttypes.h>

#include <sys/ioctl.h>
#include <sys/eventfd.h>
#include <sys/mman.h>

#include "include/atomic.h"
#include "linux-headers/linux/virtio_ring.h"
#include "linux-headers/linux/virtio_config.h"
#include "linux-headers/linux/vduse.h"
#include "libvduse.h"

#define VDUSE_VQ_ALIGN 4096
#define MAX_IOVA_REGIONS 256

#define LOG_ALIGNMENT 64

/* Round number down to multiple */
#define ALIGN_DOWN(n, m) ((n) / (m) * (m))

/* Round number up to multiple */
#define ALIGN_UP(n, m) ALIGN_DOWN((n) + (m) - 1, (m))

#ifndef unlikely
#define unlikely(x) __builtin_expect(!!(x), 0)
#endif

typedef struct VduseDescStateSplit {
    uint8_t inflight;
    uint8_t padding[5];
    uint16_t next;
    uint64_t counter;
} VduseDescStateSplit;

typedef struct VduseVirtqLogInflight {
    uint64_t features;
    uint16_t version;
    uint16_t desc_num;
    uint16_t last_batch_head;
    uint16_t used_idx;
    VduseDescStateSplit desc[];
} VduseVirtqLogInflight;

typedef struct VduseVirtqLog {
    VduseVirtqLogInflight inflight;
} VduseVirtqLog;

typedef struct VduseVirtqInflightDesc {
    uint16_t index;
    uint64_t counter;
} VduseVirtqInflightDesc;

typedef struct VduseRing {
    unsigned int num;
    uint64_t desc_addr;
    uint64_t avail_addr;
    uint64_t used_addr;
    struct vring_desc *desc;
    struct vring_avail *avail;
    struct vring_used *used;
} VduseRing;

struct VduseVirtq {
    VduseRing vring;
    uint16_t last_avail_idx;
    uint16_t shadow_avail_idx;
    uint16_t used_idx;
    uint16_t signalled_used;
    bool signalled_used_valid;
    int index;
    unsigned int inuse;
    bool ready;
    int fd;
    VduseDev *dev;
    VduseVirtqInflightDesc *resubmit_list;
    uint16_t resubmit_num;
    uint64_t counter;
    VduseVirtqLog *log;
};

typedef struct VduseIovaRegion {
    uint64_t iova;
    uint64_t size;
    uint64_t mmap_offset;
    uint64_t mmap_addr;
} VduseIovaRegion;

struct VduseDev {
    VduseVirtq *vqs;
    VduseIovaRegion regions[MAX_IOVA_REGIONS];
    int num_regions;
    char *name;
    uint32_t device_id;
    uint32_t vendor_id;
    uint16_t num_queues;
    uint16_t queue_size;
    uint64_t features;
    const VduseOps *ops;
    int fd;
    int ctrl_fd;
    void *priv;
    void *log;
};

static inline size_t vduse_vq_log_size(uint16_t queue_size)
{
    return ALIGN_UP(sizeof(VduseDescStateSplit) * queue_size +
                    sizeof(VduseVirtqLogInflight), LOG_ALIGNMENT);
}

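/*
 * Per-queue log layout: a VduseVirtqLogInflight header followed by one
 * VduseDescStateSplit per descriptor, rounded up to LOG_ALIGNMENT. A
 * worked instance of the arithmetic above (assuming the usual 8-byte
 * alignment of uint64_t, so both structs are 16 bytes): with
 * queue_size = 256, that is 16 * 256 + 16 = 4112 bytes, aligned up
 * to 4160.
 */
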
static void *vduse_log_get(const char *filename, size_t size)
{
    void *ptr = MAP_FAILED;
    int fd;

    fd = open(filename, O_RDWR | O_CREAT, 0600);
    if (fd == -1) {
        return MAP_FAILED;
    }

    if (ftruncate(fd, size) == -1) {
        goto out;
    }

    ptr = mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

out:
    close(fd);
    return ptr;
}

static inline bool has_feature(uint64_t features, unsigned int fbit)
{
    assert(fbit < 64);
    return !!(features & (1ULL << fbit));
}

static inline bool vduse_dev_has_feature(VduseDev *dev, unsigned int fbit)
{
    return has_feature(dev->features, fbit);
}

uint64_t vduse_get_virtio_features(void)
{
    return (1ULL << VIRTIO_F_IOMMU_PLATFORM) |
           (1ULL << VIRTIO_F_VERSION_1) |
           (1ULL << VIRTIO_F_NOTIFY_ON_EMPTY) |
           (1ULL << VIRTIO_RING_F_EVENT_IDX) |
           (1ULL << VIRTIO_RING_F_INDIRECT_DESC);
}

VduseDev *vduse_queue_get_dev(VduseVirtq *vq)
{
    return vq->dev;
}

int vduse_queue_get_fd(VduseVirtq *vq)
{
    return vq->fd;
}

void *vduse_dev_get_priv(VduseDev *dev)
{
    return dev->priv;
}

VduseVirtq *vduse_dev_get_queue(VduseDev *dev, int index)
{
    return &dev->vqs[index];
}

int vduse_dev_get_fd(VduseDev *dev)
{
    return dev->fd;
}

static int vduse_inject_irq(VduseDev *dev, int index)
{
    return ioctl(dev->fd, VDUSE_VQ_INJECT_IRQ, &index);
}

static int inflight_desc_compare(const void *a, const void *b)
{
    VduseVirtqInflightDesc *desc0 = (VduseVirtqInflightDesc *)a,
                           *desc1 = (VduseVirtqInflightDesc *)b;

    if (desc1->counter > desc0->counter &&
        (desc1->counter - desc0->counter) < VIRTQUEUE_MAX_SIZE * 2) {
        return 1;
    }

    return -1;
}

static int vduse_queue_check_inflights(VduseVirtq *vq)
{
    int i = 0;
    VduseDev *dev = vq->dev;

    vq->used_idx = le16toh(vq->vring.used->idx);
    vq->resubmit_num = 0;
    vq->resubmit_list = NULL;
    vq->counter = 0;

    if (unlikely(vq->log->inflight.used_idx != vq->used_idx)) {
        if (vq->log->inflight.last_batch_head > VIRTQUEUE_MAX_SIZE) {
            return -1;
        }

        vq->log->inflight.desc[vq->log->inflight.last_batch_head].inflight = 0;

        barrier();

        vq->log->inflight.used_idx = vq->used_idx;
    }

    for (i = 0; i < vq->log->inflight.desc_num; i++) {
        if (vq->log->inflight.desc[i].inflight == 1) {
            vq->inuse++;
        }
    }

    vq->shadow_avail_idx = vq->last_avail_idx = vq->inuse + vq->used_idx;

    if (vq->inuse) {
        vq->resubmit_list = calloc(vq->inuse, sizeof(VduseVirtqInflightDesc));
        if (!vq->resubmit_list) {
            return -1;
        }

        for (i = 0; i < vq->log->inflight.desc_num; i++) {
            if (vq->log->inflight.desc[i].inflight) {
                vq->resubmit_list[vq->resubmit_num].index = i;
                vq->resubmit_list[vq->resubmit_num].counter =
                    vq->log->inflight.desc[i].counter;
                vq->resubmit_num++;
            }
        }

        if (vq->resubmit_num > 1) {
            qsort(vq->resubmit_list, vq->resubmit_num,
                  sizeof(VduseVirtqInflightDesc), inflight_desc_compare);
        }
        vq->counter = vq->resubmit_list[0].counter + 1;
    }

    vduse_inject_irq(dev, vq->index);

    return 0;
}

static int vduse_queue_inflight_get(VduseVirtq *vq, int desc_idx)
{
    vq->log->inflight.desc[desc_idx].counter = vq->counter++;

    barrier();

    vq->log->inflight.desc[desc_idx].inflight = 1;

    return 0;
}

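/*
 * Crash-safe ordering of the inflight log (derived from the functions
 * above and below): vduse_queue_inflight_get() records a descriptor's
 * submission counter before marking it in-flight;
 * vduse_queue_inflight_pre_put() records the head of the batch being
 * completed before the used index is published, and
 * vduse_queue_inflight_post_put() clears the in-flight mark and syncs
 * the logged used_idx afterwards. On reconnect,
 * vduse_queue_check_inflights() sorts the survivors by counter (the
 * comparator tolerates wraparound within a 2 * VIRTQUEUE_MAX_SIZE
 * window, and the list is consumed from the tail) so the oldest
 * submissions are resubmitted first.
 */
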
static int vduse_queue_inflight_pre_put(VduseVirtq *vq, int desc_idx)
{
    vq->log->inflight.last_batch_head = desc_idx;

    return 0;
}

static int vduse_queue_inflight_post_put(VduseVirtq *vq, int desc_idx)
{
    vq->log->inflight.desc[desc_idx].inflight = 0;

    barrier();

    vq->log->inflight.used_idx = vq->used_idx;

    return 0;
}

static void vduse_iova_remove_region(VduseDev *dev, uint64_t start,
                                     uint64_t last)
{
    int i;

    if (last == start) {
        return;
    }

    for (i = 0; i < MAX_IOVA_REGIONS; i++) {
        if (!dev->regions[i].mmap_addr) {
            continue;
        }

        if (start <= dev->regions[i].iova &&
            last >= (dev->regions[i].iova + dev->regions[i].size - 1)) {
            munmap((void *)(uintptr_t)dev->regions[i].mmap_addr,
                   dev->regions[i].mmap_offset + dev->regions[i].size);
            dev->regions[i].mmap_addr = 0;
            dev->num_regions--;
        }
    }
}

static int vduse_iova_add_region(VduseDev *dev, int fd,
                                 uint64_t offset, uint64_t start,
                                 uint64_t last, int prot)
{
    int i;
    uint64_t size = last - start + 1;
    void *mmap_addr = mmap(0, size + offset, prot, MAP_SHARED, fd, 0);

    if (mmap_addr == MAP_FAILED) {
        close(fd);
        return -EINVAL;
    }

    for (i = 0; i < MAX_IOVA_REGIONS; i++) {
        if (!dev->regions[i].mmap_addr) {
            dev->regions[i].mmap_addr = (uint64_t)(uintptr_t)mmap_addr;
            dev->regions[i].mmap_offset = offset;
            dev->regions[i].iova = start;
            dev->regions[i].size = size;
            dev->num_regions++;
            break;
        }
    }
    assert(i < MAX_IOVA_REGIONS);
    close(fd);

    return 0;
}

static int perm_to_prot(uint8_t perm)
{
    int prot = 0;

    switch (perm) {
    case VDUSE_ACCESS_WO:
        prot |= PROT_WRITE;
        break;
    case VDUSE_ACCESS_RO:
        prot |= PROT_READ;
        break;
    case VDUSE_ACCESS_RW:
        prot |= PROT_READ | PROT_WRITE;
        break;
    default:
        break;
    }

    return prot;
}

static inline void *iova_to_va(VduseDev *dev, uint64_t *plen, uint64_t iova)
{
    int i, ret;
    struct vduse_iotlb_entry entry;

    for (i = 0; i < MAX_IOVA_REGIONS; i++) {
        VduseIovaRegion *r = &dev->regions[i];

        if (!r->mmap_addr) {
            continue;
        }

        if ((iova >= r->iova) && (iova < (r->iova + r->size))) {
            if ((iova + *plen) > (r->iova + r->size)) {
                *plen = r->iova + r->size - iova;
            }
            return (void *)(uintptr_t)(iova - r->iova +
                   r->mmap_addr + r->mmap_offset);
        }
    }

    entry.start = iova;
    entry.last = iova + 1;
    ret = ioctl(dev->fd, VDUSE_IOTLB_GET_FD, &entry);
    if (ret < 0) {
        return NULL;
    }

    if (!vduse_iova_add_region(dev, ret, entry.offset, entry.start,
                               entry.last, perm_to_prot(entry.perm))) {
        return iova_to_va(dev, plen, iova);
    }

    return NULL;
}

static inline uint16_t vring_avail_flags(VduseVirtq *vq)
{
    return le16toh(vq->vring.avail->flags);
}

static inline uint16_t vring_avail_idx(VduseVirtq *vq)
{
    vq->shadow_avail_idx = le16toh(vq->vring.avail->idx);

    return vq->shadow_avail_idx;
}

static inline uint16_t vring_avail_ring(VduseVirtq *vq, int i)
{
    return le16toh(vq->vring.avail->ring[i]);
}

static inline uint16_t vring_get_used_event(VduseVirtq *vq)
{
    return vring_avail_ring(vq, vq->vring.num);
}

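/*
 * With VIRTIO_RING_F_EVENT_IDX, the used_event value the driver writes
 * lives in the slot right past the available ring (avail->ring[num],
 * read by vring_get_used_event() above), and symmetrically the device
 * publishes avail_event past the used ring (written by
 * vring_set_avail_event() below).
 */
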
static bool vduse_queue_get_head(VduseVirtq *vq, unsigned int idx,
                                 unsigned int *head)
{
    /*
     * Grab the next descriptor number they're advertising, and increment
     * the index we've seen.
     */
    *head = vring_avail_ring(vq, idx % vq->vring.num);

    /* If their number is silly, that's a fatal mistake. */
    if (*head >= vq->vring.num) {
        fprintf(stderr, "Guest says index %u is available\n", *head);
        return false;
    }

    return true;
}

static int
vduse_queue_read_indirect_desc(VduseDev *dev, struct vring_desc *desc,
                               uint64_t addr, size_t len)
{
    struct vring_desc *ori_desc;
    uint64_t read_len;

    if (len > (VIRTQUEUE_MAX_SIZE * sizeof(struct vring_desc))) {
        return -1;
    }

    if (len == 0) {
        return -1;
    }

    while (len) {
        read_len = len;
        ori_desc = iova_to_va(dev, &read_len, addr);
        if (!ori_desc) {
            return -1;
        }

        memcpy(desc, ori_desc, read_len);
        len -= read_len;
        addr += read_len;
        /* read_len is a byte count, so advance the pointer in bytes */
        desc = (struct vring_desc *)((char *)desc + read_len);
    }

    return 0;
}

enum {
    VIRTQUEUE_READ_DESC_ERROR = -1,
    VIRTQUEUE_READ_DESC_DONE = 0,   /* end of chain */
    VIRTQUEUE_READ_DESC_MORE = 1,   /* more buffers in chain */
};

static int vduse_queue_read_next_desc(struct vring_desc *desc, int i,
                                      unsigned int max, unsigned int *next)
{
    /* If this descriptor says it doesn't chain, we're done. */
    if (!(le16toh(desc[i].flags) & VRING_DESC_F_NEXT)) {
        return VIRTQUEUE_READ_DESC_DONE;
    }

    /* Check they're not leading us off end of descriptors. */
    *next = le16toh(desc[i].next);
    /* Make sure compiler knows to grab that: we don't want it changing! */
    smp_wmb();

    if (*next >= max) {
        fprintf(stderr, "Desc next is %u\n", *next);
        return VIRTQUEUE_READ_DESC_ERROR;
    }

    return VIRTQUEUE_READ_DESC_MORE;
}

/*
 * Fetch avail_idx from VQ memory only when we really need to know if
 * guest has added some buffers.
 */
static bool vduse_queue_empty(VduseVirtq *vq)
{
    if (unlikely(!vq->vring.avail)) {
        return true;
    }

    if (vq->shadow_avail_idx != vq->last_avail_idx) {
        return false;
    }

    return vring_avail_idx(vq) == vq->last_avail_idx;
}

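/*
 * Notification suppression with VIRTIO_RING_F_EVENT_IDX relies on
 * vring_need_event() from the virtio_ring.h header included above:
 *
 *     (uint16_t)(new_idx - event_idx - 1) < (uint16_t)(new_idx - old)
 *
 * i.e. notify only if the driver's used_event falls within the window
 * of entries published since the last signal. A worked instance: with
 * old = 10, new = 12 and used_event = 11, (12 - 11 - 1) = 0 < (12 - 10)
 * = 2, so the device must notify; with used_event = 9, (12 - 9 - 1) = 2
 * is not < 2, so the notification is suppressed.
 */
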
static bool vduse_queue_should_notify(VduseVirtq *vq)
{
    VduseDev *dev = vq->dev;
    uint16_t old, new;
    bool v;

    /* We need to expose used array entries before checking used event. */
    smp_mb();

    /* Always notify when queue is empty (when the feature is acknowledged) */
    if (vduse_dev_has_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY) &&
        !vq->inuse && vduse_queue_empty(vq)) {
        return true;
    }

    if (!vduse_dev_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
        return !(vring_avail_flags(vq) & VRING_AVAIL_F_NO_INTERRUPT);
    }

    v = vq->signalled_used_valid;
    vq->signalled_used_valid = true;
    old = vq->signalled_used;
    new = vq->signalled_used = vq->used_idx;
    return !v || vring_need_event(vring_get_used_event(vq), new, old);
}

void vduse_queue_notify(VduseVirtq *vq)
{
    VduseDev *dev = vq->dev;

    if (unlikely(!vq->vring.avail)) {
        return;
    }

    if (!vduse_queue_should_notify(vq)) {
        return;
    }

    if (vduse_inject_irq(dev, vq->index) < 0) {
        fprintf(stderr, "Error injecting irq for vq %d: %s\n",
                vq->index, strerror(errno));
    }
}

static inline void vring_set_avail_event(VduseVirtq *vq, uint16_t val)
{
    uint16_t val_le = htole16(val);
    memcpy(&vq->vring.used->ring[vq->vring.num], &val_le, sizeof(uint16_t));
}

static bool vduse_queue_map_single_desc(VduseVirtq *vq, unsigned int *p_num_sg,
                                        struct iovec *iov,
                                        unsigned int max_num_sg,
                                        bool is_write, uint64_t pa, size_t sz)
{
    unsigned num_sg = *p_num_sg;
    VduseDev *dev = vq->dev;

    assert(num_sg <= max_num_sg);

    if (!sz) {
        fprintf(stderr, "virtio: zero sized buffers are not allowed\n");
        return false;
    }

    while (sz) {
        uint64_t len = sz;

        if (num_sg == max_num_sg) {
            fprintf(stderr,
                    "virtio: too many descriptors in indirect table\n");
            return false;
        }

        iov[num_sg].iov_base = iova_to_va(dev, &len, pa);
        if (iov[num_sg].iov_base == NULL) {
            fprintf(stderr, "virtio: invalid address for buffers\n");
            return false;
        }
        iov[num_sg++].iov_len = len;
        sz -= len;
        pa += len;
    }

    *p_num_sg = num_sg;
    return true;
}

static void *vduse_queue_alloc_element(size_t sz, unsigned out_num,
                                       unsigned in_num)
{
    VduseVirtqElement *elem;
    size_t in_sg_ofs = ALIGN_UP(sz, __alignof__(elem->in_sg[0]));
    size_t out_sg_ofs = in_sg_ofs + in_num * sizeof(elem->in_sg[0]);
    size_t out_sg_end = out_sg_ofs + out_num * sizeof(elem->out_sg[0]);

    assert(sz >= sizeof(VduseVirtqElement));
    elem = malloc(out_sg_end);
    if (!elem) {
        return NULL;
    }
    elem->out_num = out_num;
    elem->in_num = in_num;
    elem->in_sg = (void *)elem + in_sg_ofs;
    elem->out_sg = (void *)elem + out_sg_ofs;
    return elem;
}

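/*
 * vduse_queue_alloc_element() above packs the element header and both
 * iovec arrays into one allocation:
 *
 *     [sz bytes (VduseVirtqElement at the front)][in_sg...][out_sg...]
 *
 * so a single free() in the caller releases everything. The sz
 * parameter lets callers embed VduseVirtqElement at the start of a
 * larger, device-specific struct.
 */
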
static void *vduse_queue_map_desc(VduseVirtq *vq, unsigned int idx, size_t sz)
{
    struct vring_desc *desc = vq->vring.desc;
    VduseDev *dev = vq->dev;
    uint64_t desc_addr, read_len;
    unsigned int desc_len;
    unsigned int max = vq->vring.num;
    unsigned int i = idx;
    VduseVirtqElement *elem;
    struct iovec iov[VIRTQUEUE_MAX_SIZE];
    struct vring_desc desc_buf[VIRTQUEUE_MAX_SIZE];
    unsigned int out_num = 0, in_num = 0;
    int rc;

    if (le16toh(desc[i].flags) & VRING_DESC_F_INDIRECT) {
        if (le32toh(desc[i].len) % sizeof(struct vring_desc)) {
            fprintf(stderr, "Invalid size for indirect buffer table\n");
            return NULL;
        }

        /* loop over the indirect descriptor table */
        desc_addr = le64toh(desc[i].addr);
        desc_len = le32toh(desc[i].len);
        max = desc_len / sizeof(struct vring_desc);
        read_len = desc_len;
        desc = iova_to_va(dev, &read_len, desc_addr);
        if (unlikely(desc && read_len != desc_len)) {
            /* Failed to use zero copy */
            desc = NULL;
            if (!vduse_queue_read_indirect_desc(dev, desc_buf,
                                                desc_addr,
                                                desc_len)) {
                desc = desc_buf;
            }
        }
        if (!desc) {
            fprintf(stderr, "Invalid indirect buffer table\n");
            return NULL;
        }
        i = 0;
    }

    /* Collect all the descriptors */
    do {
        if (le16toh(desc[i].flags) & VRING_DESC_F_WRITE) {
            if (!vduse_queue_map_single_desc(vq, &in_num, iov + out_num,
                                             VIRTQUEUE_MAX_SIZE - out_num,
                                             true, le64toh(desc[i].addr),
                                             le32toh(desc[i].len))) {
                return NULL;
            }
        } else {
            if (in_num) {
                fprintf(stderr, "Incorrect order for descriptors\n");
                return NULL;
            }
            if (!vduse_queue_map_single_desc(vq, &out_num, iov,
                                             VIRTQUEUE_MAX_SIZE, false,
                                             le64toh(desc[i].addr),
                                             le32toh(desc[i].len))) {
                return NULL;
            }
        }

        /* If we've got too many, that implies a descriptor loop. */
        if ((in_num + out_num) > max) {
            fprintf(stderr, "Looped descriptor\n");
            return NULL;
        }
        rc = vduse_queue_read_next_desc(desc, i, max, &i);
    } while (rc == VIRTQUEUE_READ_DESC_MORE);

    if (rc == VIRTQUEUE_READ_DESC_ERROR) {
        fprintf(stderr, "read descriptor error\n");
        return NULL;
    }

    /* Now copy what we have collected and mapped */
    elem = vduse_queue_alloc_element(sz, out_num, in_num);
    if (!elem) {
        fprintf(stderr, "read descriptor error\n");
        return NULL;
    }
    elem->index = idx;
    for (i = 0; i < out_num; i++) {
        elem->out_sg[i] = iov[i];
    }
    for (i = 0; i < in_num; i++) {
        elem->in_sg[i] = iov[out_num + i];
    }

    return elem;
}

void *vduse_queue_pop(VduseVirtq *vq, size_t sz)
{
    unsigned int head;
    VduseVirtqElement *elem;
    VduseDev *dev = vq->dev;
    int i;

    if (unlikely(!vq->vring.avail)) {
        return NULL;
    }

    if (unlikely(vq->resubmit_list && vq->resubmit_num > 0)) {
        i = (--vq->resubmit_num);
        elem = vduse_queue_map_desc(vq, vq->resubmit_list[i].index, sz);

        if (!vq->resubmit_num) {
            free(vq->resubmit_list);
            vq->resubmit_list = NULL;
        }

        return elem;
    }

    if (vduse_queue_empty(vq)) {
        return NULL;
    }
    /* Needed after vduse_queue_empty() */
    smp_rmb();

    if (vq->inuse >= vq->vring.num) {
        fprintf(stderr, "Virtqueue size exceeded: %u\n", vq->inuse);
        return NULL;
    }

    if (!vduse_queue_get_head(vq, vq->last_avail_idx++, &head)) {
        return NULL;
    }

    if (vduse_dev_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
        vring_set_avail_event(vq, vq->last_avail_idx);
    }

    elem = vduse_queue_map_desc(vq, head, sz);

    if (!elem) {
        return NULL;
    }

    vq->inuse++;

    vduse_queue_inflight_get(vq, head);

    return elem;
}

static inline void vring_used_write(VduseVirtq *vq,
                                    struct vring_used_elem *uelem, int i)
{
    struct vring_used *used = vq->vring.used;

    used->ring[i] = *uelem;
}

static void vduse_queue_fill(VduseVirtq *vq, const VduseVirtqElement *elem,
                             unsigned int len, unsigned int idx)
{
    struct vring_used_elem uelem;

    if (unlikely(!vq->vring.used)) {
        return;
    }

    idx = (idx + vq->used_idx) % vq->vring.num;

    uelem.id = htole32(elem->index);
    uelem.len = htole32(len);
    vring_used_write(vq, &uelem, idx);
}

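/*
 * Completion is a two-step publish: vduse_queue_fill() stages used-ring
 * entries relative to the current used_idx, and vduse_queue_flush()
 * makes them visible by bumping the index after a write barrier. A
 * hypothetical batched completion (the public vduse_queue_push() below
 * is the single-element case) would look like:
 *
 *     for (i = 0; i < n; i++) {
 *         vduse_queue_fill(vq, elems[i], lens[i], i);
 *     }
 *     vduse_queue_flush(vq, n);
 */
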
static inline void vring_used_idx_set(VduseVirtq *vq, uint16_t val)
{
    vq->vring.used->idx = htole16(val);
    vq->used_idx = val;
}

static void vduse_queue_flush(VduseVirtq *vq, unsigned int count)
{
    uint16_t old, new;

    if (unlikely(!vq->vring.used)) {
        return;
    }

    /* Make sure buffer is written before we update index. */
    smp_wmb();

    old = vq->used_idx;
    new = old + count;
    vring_used_idx_set(vq, new);
    vq->inuse -= count;
    if (unlikely((int16_t)(new - vq->signalled_used) < (uint16_t)(new - old))) {
        vq->signalled_used_valid = false;
    }
}

void vduse_queue_push(VduseVirtq *vq, const VduseVirtqElement *elem,
                      unsigned int len)
{
    vduse_queue_fill(vq, elem, len, 0);
    vduse_queue_inflight_pre_put(vq, elem->index);
    vduse_queue_flush(vq, 1);
    vduse_queue_inflight_post_put(vq, elem->index);
}

static int vduse_queue_update_vring(VduseVirtq *vq, uint64_t desc_addr,
                                    uint64_t avail_addr, uint64_t used_addr)
{
    struct VduseDev *dev = vq->dev;
    uint64_t len;

    len = sizeof(struct vring_desc);
    vq->vring.desc = iova_to_va(dev, &len, desc_addr);
    if (len != sizeof(struct vring_desc)) {
        return -EINVAL;
    }

    len = sizeof(struct vring_avail);
    vq->vring.avail = iova_to_va(dev, &len, avail_addr);
    if (len != sizeof(struct vring_avail)) {
        return -EINVAL;
    }

    len = sizeof(struct vring_used);
    vq->vring.used = iova_to_va(dev, &len, used_addr);
    if (len != sizeof(struct vring_used)) {
        return -EINVAL;
    }

    if (!vq->vring.desc || !vq->vring.avail || !vq->vring.used) {
        fprintf(stderr, "Failed to get vq[%d] iova mapping\n", vq->index);
        return -EINVAL;
    }

    return 0;
}

static void vduse_queue_enable(VduseVirtq *vq)
{
    struct VduseDev *dev = vq->dev;
    struct vduse_vq_info vq_info;
    struct vduse_vq_eventfd vq_eventfd;
    int fd;

    vq_info.index = vq->index;
    if (ioctl(dev->fd, VDUSE_VQ_GET_INFO, &vq_info)) {
        fprintf(stderr, "Failed to get vq[%d] info: %s\n",
                vq->index, strerror(errno));
        return;
    }

    if (!vq_info.ready) {
        return;
    }

    vq->vring.num = vq_info.num;
    vq->vring.desc_addr = vq_info.desc_addr;
    vq->vring.avail_addr = vq_info.driver_addr;
    vq->vring.used_addr = vq_info.device_addr;

    if (vduse_queue_update_vring(vq, vq_info.desc_addr,
                                 vq_info.driver_addr, vq_info.device_addr)) {
        fprintf(stderr, "Failed to update vring for vq[%d]\n", vq->index);
        return;
    }

    fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
    if (fd < 0) {
        fprintf(stderr, "Failed to init eventfd for vq[%d]\n", vq->index);
        return;
    }

    vq_eventfd.index = vq->index;
    vq_eventfd.fd = fd;
    if (ioctl(dev->fd, VDUSE_VQ_SETUP_KICKFD, &vq_eventfd)) {
        fprintf(stderr, "Failed to setup kick fd for vq[%d]\n", vq->index);
        close(fd);
        return;
    }

    vq->fd = fd;
    vq->signalled_used_valid = false;
    vq->ready = true;

    if (vduse_queue_check_inflights(vq)) {
        fprintf(stderr, "Failed to check inflights for vq[%d]\n", vq->index);
        close(fd);
        return;
    }

    dev->ops->enable_queue(dev, vq);
}

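/*
 * Once a queue is enabled, the kick eventfd created above is what the
 * application should poll: vduse_queue_get_fd() exposes it, and a
 * readable event means the driver has kicked the queue. A minimal
 * handler sketch (hypothetical caller code, names are illustrative):
 *
 *     uint64_t cnt;
 *     read(vduse_queue_get_fd(vq), &cnt, sizeof(cnt));  // drain eventfd
 *     while ((elem = vduse_queue_pop(vq, sizeof(VduseVirtqElement)))) {
 *         ...process elem->out_sg / elem->in_sg...
 *         vduse_queue_push(vq, elem, written_len);
 *         free(elem);
 *     }
 *     vduse_queue_notify(vq);
 */
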
static void vduse_queue_disable(VduseVirtq *vq)
{
    struct VduseDev *dev = vq->dev;
    struct vduse_vq_eventfd eventfd;

    if (!vq->ready) {
        return;
    }

    dev->ops->disable_queue(dev, vq);

    eventfd.index = vq->index;
    eventfd.fd = VDUSE_EVENTFD_DEASSIGN;
    ioctl(dev->fd, VDUSE_VQ_SETUP_KICKFD, &eventfd);
    close(vq->fd);

    assert(vq->inuse == 0);

    vq->vring.num = 0;
    vq->vring.desc_addr = 0;
    vq->vring.avail_addr = 0;
    vq->vring.used_addr = 0;
    vq->vring.desc = NULL;
    vq->vring.avail = NULL;
    vq->vring.used = NULL;
    vq->ready = false;
    vq->fd = -1;
}

static void vduse_dev_start_dataplane(VduseDev *dev)
{
    int i;

    if (ioctl(dev->fd, VDUSE_DEV_GET_FEATURES, &dev->features)) {
        fprintf(stderr, "Failed to get features: %s\n", strerror(errno));
        return;
    }
    assert(vduse_dev_has_feature(dev, VIRTIO_F_VERSION_1));

    for (i = 0; i < dev->num_queues; i++) {
        vduse_queue_enable(&dev->vqs[i]);
    }
}

static void vduse_dev_stop_dataplane(VduseDev *dev)
{
    size_t log_size = dev->num_queues * vduse_vq_log_size(VIRTQUEUE_MAX_SIZE);
    int i;

    for (i = 0; i < dev->num_queues; i++) {
        vduse_queue_disable(&dev->vqs[i]);
    }
    if (dev->log) {
        memset(dev->log, 0, log_size);
    }
    dev->features = 0;
    vduse_iova_remove_region(dev, 0, ULONG_MAX);
}

int vduse_dev_handler(VduseDev *dev)
{
    struct vduse_dev_request req;
    struct vduse_dev_response resp = { 0 };
    VduseVirtq *vq;
    int i, ret;

    ret = read(dev->fd, &req, sizeof(req));
    if (ret != sizeof(req)) {
        fprintf(stderr, "Read request error [%d]: %s\n",
                ret, strerror(errno));
        return -errno;
    }
    resp.request_id = req.request_id;

    switch (req.type) {
    case VDUSE_GET_VQ_STATE:
        vq = &dev->vqs[req.vq_state.index];
        resp.vq_state.split.avail_index = vq->last_avail_idx;
        resp.result = VDUSE_REQ_RESULT_OK;
        break;
    case VDUSE_SET_STATUS:
        if (req.s.status & VIRTIO_CONFIG_S_DRIVER_OK) {
            vduse_dev_start_dataplane(dev);
        } else if (req.s.status == 0) {
            vduse_dev_stop_dataplane(dev);
        }
        resp.result = VDUSE_REQ_RESULT_OK;
        break;
    case VDUSE_UPDATE_IOTLB:
        /* The iova will be updated by iova_to_va() later, so just remove it */
        vduse_iova_remove_region(dev, req.iova.start, req.iova.last);
        for (i = 0; i < dev->num_queues; i++) {
            VduseVirtq *vq = &dev->vqs[i];
            if (vq->ready) {
                if (vduse_queue_update_vring(vq, vq->vring.desc_addr,
                                             vq->vring.avail_addr,
                                             vq->vring.used_addr)) {
                    fprintf(stderr, "Failed to update vring for vq[%d]\n",
                            vq->index);
                }
            }
        }
        resp.result = VDUSE_REQ_RESULT_OK;
        break;
    default:
        resp.result = VDUSE_REQ_RESULT_FAILED;
        break;
    }

    ret = write(dev->fd, &resp, sizeof(resp));
    if (ret != sizeof(resp)) {
        fprintf(stderr, "Write request %d error [%d]: %s\n",
                req.type, ret, strerror(errno));
        return -errno;
    }
    return 0;
}

int vduse_dev_update_config(VduseDev *dev, uint32_t size,
                            uint32_t offset, char *buffer)
{
    int ret;
    struct vduse_config_data *data;

    data = malloc(offsetof(struct vduse_config_data, buffer) + size);
    if (!data) {
        return -ENOMEM;
    }

    data->offset = offset;
    data->length = size;
    memcpy(data->buffer, buffer, size);

    ret = ioctl(dev->fd, VDUSE_DEV_SET_CONFIG, data);
    free(data);

    if (ret) {
        return -errno;
    }

    if (ioctl(dev->fd, VDUSE_DEV_INJECT_CONFIG_IRQ)) {
        return -errno;
    }

    return 0;
}

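/*
 * vduse_dev_handler() above is the control-plane counterpart of the
 * per-queue kick handling: the application is expected to watch the
 * device fd from vduse_dev_get_fd() and call the handler whenever that
 * fd is readable, e.g. (hypothetical caller code):
 *
 *     struct pollfd pfd = { .fd = vduse_dev_get_fd(dev), .events = POLLIN };
 *     while (poll(&pfd, 1, -1) > 0) {
 *         vduse_dev_handler(dev);
 *     }
 */
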
int vduse_dev_setup_queue(VduseDev *dev, int index, int max_size)
{
    VduseVirtq *vq = &dev->vqs[index];
    struct vduse_vq_config vq_config = { 0 };

    if (max_size > VIRTQUEUE_MAX_SIZE) {
        return -EINVAL;
    }

    vq_config.index = vq->index;
    vq_config.max_size = max_size;

    if (ioctl(dev->fd, VDUSE_VQ_SETUP, &vq_config)) {
        return -errno;
    }

    vduse_queue_enable(vq);

    return 0;
}

int vduse_set_reconnect_log_file(VduseDev *dev, const char *filename)
{
    size_t log_size = dev->num_queues * vduse_vq_log_size(VIRTQUEUE_MAX_SIZE);
    void *log;
    int i;

    dev->log = log = vduse_log_get(filename, log_size);
    if (log == MAP_FAILED) {
        fprintf(stderr, "Failed to get vduse log\n");
        return -EINVAL;
    }

    for (i = 0; i < dev->num_queues; i++) {
        dev->vqs[i].log = log;
        dev->vqs[i].log->inflight.desc_num = VIRTQUEUE_MAX_SIZE;
        log = (void *)((char *)log + vduse_vq_log_size(VIRTQUEUE_MAX_SIZE));
    }

    return 0;
}

static int vduse_dev_init_vqs(VduseDev *dev, uint16_t num_queues)
{
    VduseVirtq *vqs;
    int i;

    vqs = calloc(num_queues, sizeof(VduseVirtq));
    if (!vqs) {
        return -ENOMEM;
    }

    for (i = 0; i < num_queues; i++) {
        vqs[i].index = i;
        vqs[i].dev = dev;
        vqs[i].fd = -1;
    }
    dev->vqs = vqs;

    return 0;
}

static int vduse_dev_init(VduseDev *dev, const char *name,
                          uint16_t num_queues, const VduseOps *ops,
                          void *priv)
{
    char *dev_path, *dev_name;
    int ret, fd;

    dev_path = malloc(strlen(name) + strlen("/dev/vduse/") + 1);
    if (!dev_path) {
        return -ENOMEM;
    }
    sprintf(dev_path, "/dev/vduse/%s", name);

    fd = open(dev_path, O_RDWR);
    free(dev_path);
    if (fd < 0) {
        fprintf(stderr, "Failed to open vduse dev %s: %s\n",
                name, strerror(errno));
        return -errno;
    }

    if (ioctl(fd, VDUSE_DEV_GET_FEATURES, &dev->features)) {
        fprintf(stderr, "Failed to get features: %s\n", strerror(errno));
        close(fd);
        return -errno;
    }

    dev_name = strdup(name);
    if (!dev_name) {
        close(fd);
        return -ENOMEM;
    }

    ret = vduse_dev_init_vqs(dev, num_queues);
    if (ret) {
        free(dev_name);
        close(fd);
        return ret;
    }

    dev->name = dev_name;
    dev->num_queues = num_queues;
    dev->fd = fd;
    dev->ops = ops;
    dev->priv = priv;

    return 0;
}

static inline bool vduse_name_is_invalid(const char *name)
{
    return strlen(name) >= VDUSE_NAME_MAX || strstr(name, "..");
}

VduseDev *vduse_dev_create_by_fd(int fd, uint16_t num_queues,
                                 const VduseOps *ops, void *priv)
{
    VduseDev *dev;
    int ret;

    if (!ops || !ops->enable_queue || !ops->disable_queue) {
        fprintf(stderr, "Invalid parameter for vduse\n");
        return NULL;
    }

    dev = calloc(1, sizeof(VduseDev));
    if (!dev) {
        fprintf(stderr, "Failed to allocate vduse device\n");
        return NULL;
    }
    /* No control fd in this path; don't let destroy treat fd 0 as valid */
    dev->ctrl_fd = -1;

    if (ioctl(fd, VDUSE_DEV_GET_FEATURES, &dev->features)) {
        fprintf(stderr, "Failed to get features: %s\n", strerror(errno));
        free(dev);
        return NULL;
    }

    ret = vduse_dev_init_vqs(dev, num_queues);
    if (ret) {
        fprintf(stderr, "Failed to init vqs\n");
        free(dev);
        return NULL;
    }

    dev->num_queues = num_queues;
    dev->fd = fd;
    dev->ops = ops;
    dev->priv = priv;

    return dev;
}

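/*
 * Three creation paths are offered: vduse_dev_create_by_fd() above wraps
 * an already-open VDUSE char-dev fd (e.g. one inherited from a
 * management process), while the two variants below attach to an
 * existing /dev/vduse/<name> (vduse_dev_create_by_name()) or register a
 * brand-new device through /dev/vduse/control first (vduse_dev_create()).
 */
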
VduseDev *vduse_dev_create_by_name(const char *name, uint16_t num_queues,
                                   const VduseOps *ops, void *priv)
{
    VduseDev *dev;
    int ret;

    if (!name || vduse_name_is_invalid(name) || !ops ||
        !ops->enable_queue || !ops->disable_queue) {
        fprintf(stderr, "Invalid parameter for vduse\n");
        return NULL;
    }

    dev = calloc(1, sizeof(VduseDev));
    if (!dev) {
        fprintf(stderr, "Failed to allocate vduse device\n");
        return NULL;
    }
    /* No control fd in this path either */
    dev->ctrl_fd = -1;

    ret = vduse_dev_init(dev, name, num_queues, ops, priv);
    if (ret < 0) {
        fprintf(stderr, "Failed to init vduse device %s: %s\n",
                name, strerror(-ret));
        free(dev);
        return NULL;
    }

    return dev;
}

VduseDev *vduse_dev_create(const char *name, uint32_t device_id,
                           uint32_t vendor_id, uint64_t features,
                           uint16_t num_queues, uint32_t config_size,
                           char *config, const VduseOps *ops, void *priv)
{
    VduseDev *dev;
    int ret, ctrl_fd;
    uint64_t version;
    struct vduse_dev_config *dev_config;
    size_t size = offsetof(struct vduse_dev_config, config);

    if (!name || vduse_name_is_invalid(name) ||
        !has_feature(features, VIRTIO_F_VERSION_1) || !config ||
        !config_size || !ops || !ops->enable_queue || !ops->disable_queue) {
        fprintf(stderr, "Invalid parameter for vduse\n");
        return NULL;
    }

    dev = calloc(1, sizeof(VduseDev));
    if (!dev) {
        fprintf(stderr, "Failed to allocate vduse device\n");
        return NULL;
    }

    ctrl_fd = open("/dev/vduse/control", O_RDWR);
    if (ctrl_fd < 0) {
        fprintf(stderr, "Failed to open /dev/vduse/control: %s\n",
                strerror(errno));
        goto err_ctrl;
    }

    version = VDUSE_API_VERSION;
    if (ioctl(ctrl_fd, VDUSE_SET_API_VERSION, &version)) {
        fprintf(stderr, "Failed to set api version %" PRIu64 ": %s\n",
                version, strerror(errno));
        goto err_dev;
    }

    dev_config = calloc(size + config_size, 1);
    if (!dev_config) {
        fprintf(stderr, "Failed to allocate config space\n");
        goto err_dev;
    }

    assert(!vduse_name_is_invalid(name));
    strcpy(dev_config->name, name);
    dev_config->device_id = device_id;
    dev_config->vendor_id = vendor_id;
    dev_config->features = features;
    dev_config->vq_num = num_queues;
    dev_config->vq_align = VDUSE_VQ_ALIGN;
    dev_config->config_size = config_size;
    memcpy(dev_config->config, config, config_size);

    ret = ioctl(ctrl_fd, VDUSE_CREATE_DEV, dev_config);
    free(dev_config);
    if (ret && errno != EEXIST) {
        fprintf(stderr, "Failed to create vduse device %s: %s\n",
                name, strerror(errno));
        goto err_dev;
    }
    dev->ctrl_fd = ctrl_fd;

    ret = vduse_dev_init(dev, name, num_queues, ops, priv);
    if (ret < 0) {
        fprintf(stderr, "Failed to init vduse device %s: %s\n",
                name, strerror(-ret));
        goto err;
    }

    return dev;
err:
    ioctl(ctrl_fd, VDUSE_DESTROY_DEV, name);
err_dev:
    close(ctrl_fd);
err_ctrl:
    free(dev);

    return NULL;
}

int vduse_dev_destroy(VduseDev *dev)
{
    size_t log_size = dev->num_queues * vduse_vq_log_size(VIRTQUEUE_MAX_SIZE);
    int i, ret = 0;

    if (dev->log) {
        munmap(dev->log, log_size);
    }
    for (i = 0; i < dev->num_queues; i++) {
        free(dev->vqs[i].resubmit_list);
    }
    free(dev->vqs);
    if (dev->fd >= 0) {
        close(dev->fd);
        dev->fd = -1;
    }
    if (dev->ctrl_fd >= 0) {
        if (ioctl(dev->ctrl_fd, VDUSE_DESTROY_DEV, dev->name)) {
            ret = -errno;
        }
        close(dev->ctrl_fd);
        dev->ctrl_fd = -1;
    }
    free(dev->name);
    free(dev);

    return ret;
}
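
/*
 * End-to-end usage sketch (hypothetical caller code, not part of the
 * library; device type, config layout and queue count are illustrative):
 *
 *     static const VduseOps ops = {
 *         .enable_queue  = my_enable_queue,   // start watching vq's kick fd
 *         .disable_queue = my_disable_queue,  // stop watching it
 *     };
 *
 *     dev = vduse_dev_create("my-dev", VIRTIO_ID_BLOCK, 0,
 *                            vduse_get_virtio_features(), 1,
 *                            sizeof(config), (char *)&config, &ops, priv);
 *     vduse_set_reconnect_log_file(dev, "/tmp/my-dev.log");  // optional
 *     vduse_dev_setup_queue(dev, 0, 256);
 *
 *     // event loop: vduse_dev_get_fd(dev) readable -> vduse_dev_handler();
 *     // per-queue kick fd readable -> vduse_queue_pop()/push()/notify().
 *
 *     vduse_dev_destroy(dev);
 */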