1 /* 2 * VDUSE (vDPA Device in Userspace) library 3 * 4 * Copyright (C) 2022 Bytedance Inc. and/or its affiliates. All rights reserved. 5 * Portions of codes and concepts borrowed from libvhost-user.c, so: 6 * Copyright IBM, Corp. 2007 7 * Copyright (c) 2016 Red Hat, Inc. 8 * 9 * Author: 10 * Xie Yongji <xieyongji@bytedance.com> 11 * Anthony Liguori <aliguori@us.ibm.com> 12 * Marc-André Lureau <mlureau@redhat.com> 13 * Victor Kaplansky <victork@redhat.com> 14 * 15 * This work is licensed under the terms of the GNU GPL, version 2 or 16 * later. See the COPYING file in the top-level directory. 17 */ 18 19 #include <stdlib.h> 20 #include <stdio.h> 21 #include <stdbool.h> 22 #include <stddef.h> 23 #include <errno.h> 24 #include <string.h> 25 #include <assert.h> 26 #include <endian.h> 27 #include <unistd.h> 28 #include <limits.h> 29 #include <fcntl.h> 30 #include <inttypes.h> 31 32 #include <sys/ioctl.h> 33 #include <sys/eventfd.h> 34 #include <sys/mman.h> 35 36 #include "include/atomic.h" 37 #include "linux-headers/linux/virtio_ring.h" 38 #include "linux-headers/linux/virtio_config.h" 39 #include "linux-headers/linux/vduse.h" 40 #include "libvduse.h" 41 42 #define VDUSE_VQ_ALIGN 4096 43 #define MAX_IOVA_REGIONS 256 44 45 /* Round number down to multiple */ 46 #define ALIGN_DOWN(n, m) ((n) / (m) * (m)) 47 48 /* Round number up to multiple */ 49 #define ALIGN_UP(n, m) ALIGN_DOWN((n) + (m) - 1, (m)) 50 51 #ifndef unlikely 52 #define unlikely(x) __builtin_expect(!!(x), 0) 53 #endif 54 55 typedef struct VduseRing { 56 unsigned int num; 57 uint64_t desc_addr; 58 uint64_t avail_addr; 59 uint64_t used_addr; 60 struct vring_desc *desc; 61 struct vring_avail *avail; 62 struct vring_used *used; 63 } VduseRing; 64 65 struct VduseVirtq { 66 VduseRing vring; 67 uint16_t last_avail_idx; 68 uint16_t shadow_avail_idx; 69 uint16_t used_idx; 70 uint16_t signalled_used; 71 bool signalled_used_valid; 72 int index; 73 int inuse; 74 bool ready; 75 int fd; 76 VduseDev *dev; 77 }; 78 79 typedef struct VduseIovaRegion { 80 uint64_t iova; 81 uint64_t size; 82 uint64_t mmap_offset; 83 uint64_t mmap_addr; 84 } VduseIovaRegion; 85 86 struct VduseDev { 87 VduseVirtq *vqs; 88 VduseIovaRegion regions[MAX_IOVA_REGIONS]; 89 int num_regions; 90 char *name; 91 uint32_t device_id; 92 uint32_t vendor_id; 93 uint16_t num_queues; 94 uint16_t queue_size; 95 uint64_t features; 96 const VduseOps *ops; 97 int fd; 98 int ctrl_fd; 99 void *priv; 100 }; 101 102 static inline bool has_feature(uint64_t features, unsigned int fbit) 103 { 104 assert(fbit < 64); 105 return !!(features & (1ULL << fbit)); 106 } 107 108 static inline bool vduse_dev_has_feature(VduseDev *dev, unsigned int fbit) 109 { 110 return has_feature(dev->features, fbit); 111 } 112 113 uint64_t vduse_get_virtio_features(void) 114 { 115 return (1ULL << VIRTIO_F_IOMMU_PLATFORM) | 116 (1ULL << VIRTIO_F_VERSION_1) | 117 (1ULL << VIRTIO_F_NOTIFY_ON_EMPTY) | 118 (1ULL << VIRTIO_RING_F_EVENT_IDX) | 119 (1ULL << VIRTIO_RING_F_INDIRECT_DESC); 120 } 121 122 VduseDev *vduse_queue_get_dev(VduseVirtq *vq) 123 { 124 return vq->dev; 125 } 126 127 int vduse_queue_get_fd(VduseVirtq *vq) 128 { 129 return vq->fd; 130 } 131 132 void *vduse_dev_get_priv(VduseDev *dev) 133 { 134 return dev->priv; 135 } 136 137 VduseVirtq *vduse_dev_get_queue(VduseDev *dev, int index) 138 { 139 return &dev->vqs[index]; 140 } 141 142 int vduse_dev_get_fd(VduseDev *dev) 143 { 144 return dev->fd; 145 } 146 147 static int vduse_inject_irq(VduseDev *dev, int index) 148 { 149 return ioctl(dev->fd, VDUSE_VQ_INJECT_IRQ, &index); 150 } 151 152 static void vduse_iova_remove_region(VduseDev *dev, uint64_t start, 153 uint64_t last) 154 { 155 int i; 156 157 if (last == start) { 158 return; 159 } 160 161 for (i = 0; i < MAX_IOVA_REGIONS; i++) { 162 if (!dev->regions[i].mmap_addr) { 163 continue; 164 } 165 166 if (start <= dev->regions[i].iova && 167 last >= (dev->regions[i].iova + dev->regions[i].size - 1)) { 168 munmap((void *)(uintptr_t)dev->regions[i].mmap_addr, 169 dev->regions[i].mmap_offset + dev->regions[i].size); 170 dev->regions[i].mmap_addr = 0; 171 dev->num_regions--; 172 } 173 } 174 } 175 176 static int vduse_iova_add_region(VduseDev *dev, int fd, 177 uint64_t offset, uint64_t start, 178 uint64_t last, int prot) 179 { 180 int i; 181 uint64_t size = last - start + 1; 182 void *mmap_addr = mmap(0, size + offset, prot, MAP_SHARED, fd, 0); 183 184 if (mmap_addr == MAP_FAILED) { 185 close(fd); 186 return -EINVAL; 187 } 188 189 for (i = 0; i < MAX_IOVA_REGIONS; i++) { 190 if (!dev->regions[i].mmap_addr) { 191 dev->regions[i].mmap_addr = (uint64_t)(uintptr_t)mmap_addr; 192 dev->regions[i].mmap_offset = offset; 193 dev->regions[i].iova = start; 194 dev->regions[i].size = size; 195 dev->num_regions++; 196 break; 197 } 198 } 199 assert(i < MAX_IOVA_REGIONS); 200 close(fd); 201 202 return 0; 203 } 204 205 static int perm_to_prot(uint8_t perm) 206 { 207 int prot = 0; 208 209 switch (perm) { 210 case VDUSE_ACCESS_WO: 211 prot |= PROT_WRITE; 212 break; 213 case VDUSE_ACCESS_RO: 214 prot |= PROT_READ; 215 break; 216 case VDUSE_ACCESS_RW: 217 prot |= PROT_READ | PROT_WRITE; 218 break; 219 default: 220 break; 221 } 222 223 return prot; 224 } 225 226 static inline void *iova_to_va(VduseDev *dev, uint64_t *plen, uint64_t iova) 227 { 228 int i, ret; 229 struct vduse_iotlb_entry entry; 230 231 for (i = 0; i < MAX_IOVA_REGIONS; i++) { 232 VduseIovaRegion *r = &dev->regions[i]; 233 234 if (!r->mmap_addr) { 235 continue; 236 } 237 238 if ((iova >= r->iova) && (iova < (r->iova + r->size))) { 239 if ((iova + *plen) > (r->iova + r->size)) { 240 *plen = r->iova + r->size - iova; 241 } 242 return (void *)(uintptr_t)(iova - r->iova + 243 r->mmap_addr + r->mmap_offset); 244 } 245 } 246 247 entry.start = iova; 248 entry.last = iova + 1; 249 ret = ioctl(dev->fd, VDUSE_IOTLB_GET_FD, &entry); 250 if (ret < 0) { 251 return NULL; 252 } 253 254 if (!vduse_iova_add_region(dev, ret, entry.offset, entry.start, 255 entry.last, perm_to_prot(entry.perm))) { 256 return iova_to_va(dev, plen, iova); 257 } 258 259 return NULL; 260 } 261 262 static inline uint16_t vring_avail_flags(VduseVirtq *vq) 263 { 264 return le16toh(vq->vring.avail->flags); 265 } 266 267 static inline uint16_t vring_avail_idx(VduseVirtq *vq) 268 { 269 vq->shadow_avail_idx = le16toh(vq->vring.avail->idx); 270 271 return vq->shadow_avail_idx; 272 } 273 274 static inline uint16_t vring_avail_ring(VduseVirtq *vq, int i) 275 { 276 return le16toh(vq->vring.avail->ring[i]); 277 } 278 279 static inline uint16_t vring_get_used_event(VduseVirtq *vq) 280 { 281 return vring_avail_ring(vq, vq->vring.num); 282 } 283 284 static bool vduse_queue_get_head(VduseVirtq *vq, unsigned int idx, 285 unsigned int *head) 286 { 287 /* 288 * Grab the next descriptor number they're advertising, and increment 289 * the index we've seen. 290 */ 291 *head = vring_avail_ring(vq, idx % vq->vring.num); 292 293 /* If their number is silly, that's a fatal mistake. */ 294 if (*head >= vq->vring.num) { 295 fprintf(stderr, "Guest says index %u is available\n", *head); 296 return false; 297 } 298 299 return true; 300 } 301 302 static int 303 vduse_queue_read_indirect_desc(VduseDev *dev, struct vring_desc *desc, 304 uint64_t addr, size_t len) 305 { 306 struct vring_desc *ori_desc; 307 uint64_t read_len; 308 309 if (len > (VIRTQUEUE_MAX_SIZE * sizeof(struct vring_desc))) { 310 return -1; 311 } 312 313 if (len == 0) { 314 return -1; 315 } 316 317 while (len) { 318 read_len = len; 319 ori_desc = iova_to_va(dev, &read_len, addr); 320 if (!ori_desc) { 321 return -1; 322 } 323 324 memcpy(desc, ori_desc, read_len); 325 len -= read_len; 326 addr += read_len; 327 desc += read_len; 328 } 329 330 return 0; 331 } 332 333 enum { 334 VIRTQUEUE_READ_DESC_ERROR = -1, 335 VIRTQUEUE_READ_DESC_DONE = 0, /* end of chain */ 336 VIRTQUEUE_READ_DESC_MORE = 1, /* more buffers in chain */ 337 }; 338 339 static int vduse_queue_read_next_desc(struct vring_desc *desc, int i, 340 unsigned int max, unsigned int *next) 341 { 342 /* If this descriptor says it doesn't chain, we're done. */ 343 if (!(le16toh(desc[i].flags) & VRING_DESC_F_NEXT)) { 344 return VIRTQUEUE_READ_DESC_DONE; 345 } 346 347 /* Check they're not leading us off end of descriptors. */ 348 *next = desc[i].next; 349 /* Make sure compiler knows to grab that: we don't want it changing! */ 350 smp_wmb(); 351 352 if (*next >= max) { 353 fprintf(stderr, "Desc next is %u\n", *next); 354 return VIRTQUEUE_READ_DESC_ERROR; 355 } 356 357 return VIRTQUEUE_READ_DESC_MORE; 358 } 359 360 /* 361 * Fetch avail_idx from VQ memory only when we really need to know if 362 * guest has added some buffers. 363 */ 364 static bool vduse_queue_empty(VduseVirtq *vq) 365 { 366 if (unlikely(!vq->vring.avail)) { 367 return true; 368 } 369 370 if (vq->shadow_avail_idx != vq->last_avail_idx) { 371 return false; 372 } 373 374 return vring_avail_idx(vq) == vq->last_avail_idx; 375 } 376 377 static bool vduse_queue_should_notify(VduseVirtq *vq) 378 { 379 VduseDev *dev = vq->dev; 380 uint16_t old, new; 381 bool v; 382 383 /* We need to expose used array entries before checking used event. */ 384 smp_mb(); 385 386 /* Always notify when queue is empty (when feature acknowledge) */ 387 if (vduse_dev_has_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY) && 388 !vq->inuse && vduse_queue_empty(vq)) { 389 return true; 390 } 391 392 if (!vduse_dev_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) { 393 return !(vring_avail_flags(vq) & VRING_AVAIL_F_NO_INTERRUPT); 394 } 395 396 v = vq->signalled_used_valid; 397 vq->signalled_used_valid = true; 398 old = vq->signalled_used; 399 new = vq->signalled_used = vq->used_idx; 400 return !v || vring_need_event(vring_get_used_event(vq), new, old); 401 } 402 403 void vduse_queue_notify(VduseVirtq *vq) 404 { 405 VduseDev *dev = vq->dev; 406 407 if (unlikely(!vq->vring.avail)) { 408 return; 409 } 410 411 if (!vduse_queue_should_notify(vq)) { 412 return; 413 } 414 415 if (vduse_inject_irq(dev, vq->index) < 0) { 416 fprintf(stderr, "Error inject irq for vq %d: %s\n", 417 vq->index, strerror(errno)); 418 } 419 } 420 421 static inline void vring_set_avail_event(VduseVirtq *vq, uint16_t val) 422 { 423 *((uint16_t *)&vq->vring.used->ring[vq->vring.num]) = htole16(val); 424 } 425 426 static bool vduse_queue_map_single_desc(VduseVirtq *vq, unsigned int *p_num_sg, 427 struct iovec *iov, unsigned int max_num_sg, 428 bool is_write, uint64_t pa, size_t sz) 429 { 430 unsigned num_sg = *p_num_sg; 431 VduseDev *dev = vq->dev; 432 433 assert(num_sg <= max_num_sg); 434 435 if (!sz) { 436 fprintf(stderr, "virtio: zero sized buffers are not allowed\n"); 437 return false; 438 } 439 440 while (sz) { 441 uint64_t len = sz; 442 443 if (num_sg == max_num_sg) { 444 fprintf(stderr, 445 "virtio: too many descriptors in indirect table\n"); 446 return false; 447 } 448 449 iov[num_sg].iov_base = iova_to_va(dev, &len, pa); 450 if (iov[num_sg].iov_base == NULL) { 451 fprintf(stderr, "virtio: invalid address for buffers\n"); 452 return false; 453 } 454 iov[num_sg++].iov_len = len; 455 sz -= len; 456 pa += len; 457 } 458 459 *p_num_sg = num_sg; 460 return true; 461 } 462 463 static void *vduse_queue_alloc_element(size_t sz, unsigned out_num, 464 unsigned in_num) 465 { 466 VduseVirtqElement *elem; 467 size_t in_sg_ofs = ALIGN_UP(sz, __alignof__(elem->in_sg[0])); 468 size_t out_sg_ofs = in_sg_ofs + in_num * sizeof(elem->in_sg[0]); 469 size_t out_sg_end = out_sg_ofs + out_num * sizeof(elem->out_sg[0]); 470 471 assert(sz >= sizeof(VduseVirtqElement)); 472 elem = malloc(out_sg_end); 473 if (!elem) { 474 return NULL; 475 } 476 elem->out_num = out_num; 477 elem->in_num = in_num; 478 elem->in_sg = (void *)elem + in_sg_ofs; 479 elem->out_sg = (void *)elem + out_sg_ofs; 480 return elem; 481 } 482 483 static void *vduse_queue_map_desc(VduseVirtq *vq, unsigned int idx, size_t sz) 484 { 485 struct vring_desc *desc = vq->vring.desc; 486 VduseDev *dev = vq->dev; 487 uint64_t desc_addr, read_len; 488 unsigned int desc_len; 489 unsigned int max = vq->vring.num; 490 unsigned int i = idx; 491 VduseVirtqElement *elem; 492 struct iovec iov[VIRTQUEUE_MAX_SIZE]; 493 struct vring_desc desc_buf[VIRTQUEUE_MAX_SIZE]; 494 unsigned int out_num = 0, in_num = 0; 495 int rc; 496 497 if (le16toh(desc[i].flags) & VRING_DESC_F_INDIRECT) { 498 if (le32toh(desc[i].len) % sizeof(struct vring_desc)) { 499 fprintf(stderr, "Invalid size for indirect buffer table\n"); 500 return NULL; 501 } 502 503 /* loop over the indirect descriptor table */ 504 desc_addr = le64toh(desc[i].addr); 505 desc_len = le32toh(desc[i].len); 506 max = desc_len / sizeof(struct vring_desc); 507 read_len = desc_len; 508 desc = iova_to_va(dev, &read_len, desc_addr); 509 if (unlikely(desc && read_len != desc_len)) { 510 /* Failed to use zero copy */ 511 desc = NULL; 512 if (!vduse_queue_read_indirect_desc(dev, desc_buf, 513 desc_addr, 514 desc_len)) { 515 desc = desc_buf; 516 } 517 } 518 if (!desc) { 519 fprintf(stderr, "Invalid indirect buffer table\n"); 520 return NULL; 521 } 522 i = 0; 523 } 524 525 /* Collect all the descriptors */ 526 do { 527 if (le16toh(desc[i].flags) & VRING_DESC_F_WRITE) { 528 if (!vduse_queue_map_single_desc(vq, &in_num, iov + out_num, 529 VIRTQUEUE_MAX_SIZE - out_num, 530 true, le64toh(desc[i].addr), 531 le32toh(desc[i].len))) { 532 return NULL; 533 } 534 } else { 535 if (in_num) { 536 fprintf(stderr, "Incorrect order for descriptors\n"); 537 return NULL; 538 } 539 if (!vduse_queue_map_single_desc(vq, &out_num, iov, 540 VIRTQUEUE_MAX_SIZE, false, 541 le64toh(desc[i].addr), 542 le32toh(desc[i].len))) { 543 return NULL; 544 } 545 } 546 547 /* If we've got too many, that implies a descriptor loop. */ 548 if ((in_num + out_num) > max) { 549 fprintf(stderr, "Looped descriptor\n"); 550 return NULL; 551 } 552 rc = vduse_queue_read_next_desc(desc, i, max, &i); 553 } while (rc == VIRTQUEUE_READ_DESC_MORE); 554 555 if (rc == VIRTQUEUE_READ_DESC_ERROR) { 556 fprintf(stderr, "read descriptor error\n"); 557 return NULL; 558 } 559 560 /* Now copy what we have collected and mapped */ 561 elem = vduse_queue_alloc_element(sz, out_num, in_num); 562 if (!elem) { 563 fprintf(stderr, "read descriptor error\n"); 564 return NULL; 565 } 566 elem->index = idx; 567 for (i = 0; i < out_num; i++) { 568 elem->out_sg[i] = iov[i]; 569 } 570 for (i = 0; i < in_num; i++) { 571 elem->in_sg[i] = iov[out_num + i]; 572 } 573 574 return elem; 575 } 576 577 void *vduse_queue_pop(VduseVirtq *vq, size_t sz) 578 { 579 unsigned int head; 580 VduseVirtqElement *elem; 581 VduseDev *dev = vq->dev; 582 583 if (unlikely(!vq->vring.avail)) { 584 return NULL; 585 } 586 587 if (vduse_queue_empty(vq)) { 588 return NULL; 589 } 590 /* Needed after virtio_queue_empty() */ 591 smp_rmb(); 592 593 if (vq->inuse >= vq->vring.num) { 594 fprintf(stderr, "Virtqueue size exceeded: %d\n", vq->inuse); 595 return NULL; 596 } 597 598 if (!vduse_queue_get_head(vq, vq->last_avail_idx++, &head)) { 599 return NULL; 600 } 601 602 if (vduse_dev_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) { 603 vring_set_avail_event(vq, vq->last_avail_idx); 604 } 605 606 elem = vduse_queue_map_desc(vq, head, sz); 607 608 if (!elem) { 609 return NULL; 610 } 611 612 vq->inuse++; 613 614 return elem; 615 } 616 617 static inline void vring_used_write(VduseVirtq *vq, 618 struct vring_used_elem *uelem, int i) 619 { 620 struct vring_used *used = vq->vring.used; 621 622 used->ring[i] = *uelem; 623 } 624 625 static void vduse_queue_fill(VduseVirtq *vq, const VduseVirtqElement *elem, 626 unsigned int len, unsigned int idx) 627 { 628 struct vring_used_elem uelem; 629 630 if (unlikely(!vq->vring.used)) { 631 return; 632 } 633 634 idx = (idx + vq->used_idx) % vq->vring.num; 635 636 uelem.id = htole32(elem->index); 637 uelem.len = htole32(len); 638 vring_used_write(vq, &uelem, idx); 639 } 640 641 static inline void vring_used_idx_set(VduseVirtq *vq, uint16_t val) 642 { 643 vq->vring.used->idx = htole16(val); 644 vq->used_idx = val; 645 } 646 647 static void vduse_queue_flush(VduseVirtq *vq, unsigned int count) 648 { 649 uint16_t old, new; 650 651 if (unlikely(!vq->vring.used)) { 652 return; 653 } 654 655 /* Make sure buffer is written before we update index. */ 656 smp_wmb(); 657 658 old = vq->used_idx; 659 new = old + count; 660 vring_used_idx_set(vq, new); 661 vq->inuse -= count; 662 if (unlikely((int16_t)(new - vq->signalled_used) < (uint16_t)(new - old))) { 663 vq->signalled_used_valid = false; 664 } 665 } 666 667 void vduse_queue_push(VduseVirtq *vq, const VduseVirtqElement *elem, 668 unsigned int len) 669 { 670 vduse_queue_fill(vq, elem, len, 0); 671 vduse_queue_flush(vq, 1); 672 } 673 674 static int vduse_queue_update_vring(VduseVirtq *vq, uint64_t desc_addr, 675 uint64_t avail_addr, uint64_t used_addr) 676 { 677 struct VduseDev *dev = vq->dev; 678 uint64_t len; 679 680 len = sizeof(struct vring_desc); 681 vq->vring.desc = iova_to_va(dev, &len, desc_addr); 682 if (len != sizeof(struct vring_desc)) { 683 return -EINVAL; 684 } 685 686 len = sizeof(struct vring_avail); 687 vq->vring.avail = iova_to_va(dev, &len, avail_addr); 688 if (len != sizeof(struct vring_avail)) { 689 return -EINVAL; 690 } 691 692 len = sizeof(struct vring_used); 693 vq->vring.used = iova_to_va(dev, &len, used_addr); 694 if (len != sizeof(struct vring_used)) { 695 return -EINVAL; 696 } 697 698 if (!vq->vring.desc || !vq->vring.avail || !vq->vring.used) { 699 fprintf(stderr, "Failed to get vq[%d] iova mapping\n", vq->index); 700 return -EINVAL; 701 } 702 703 return 0; 704 } 705 706 static void vduse_queue_enable(VduseVirtq *vq) 707 { 708 struct VduseDev *dev = vq->dev; 709 struct vduse_vq_info vq_info; 710 struct vduse_vq_eventfd vq_eventfd; 711 int fd; 712 713 vq_info.index = vq->index; 714 if (ioctl(dev->fd, VDUSE_VQ_GET_INFO, &vq_info)) { 715 fprintf(stderr, "Failed to get vq[%d] info: %s\n", 716 vq->index, strerror(errno)); 717 return; 718 } 719 720 if (!vq_info.ready) { 721 return; 722 } 723 724 vq->vring.num = vq_info.num; 725 vq->vring.desc_addr = vq_info.desc_addr; 726 vq->vring.avail_addr = vq_info.driver_addr; 727 vq->vring.used_addr = vq_info.device_addr; 728 729 if (vduse_queue_update_vring(vq, vq_info.desc_addr, 730 vq_info.driver_addr, vq_info.device_addr)) { 731 fprintf(stderr, "Failed to update vring for vq[%d]\n", vq->index); 732 return; 733 } 734 735 fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC); 736 if (fd < 0) { 737 fprintf(stderr, "Failed to init eventfd for vq[%d]\n", vq->index); 738 return; 739 } 740 741 vq_eventfd.index = vq->index; 742 vq_eventfd.fd = fd; 743 if (ioctl(dev->fd, VDUSE_VQ_SETUP_KICKFD, &vq_eventfd)) { 744 fprintf(stderr, "Failed to setup kick fd for vq[%d]\n", vq->index); 745 close(fd); 746 return; 747 } 748 749 vq->fd = fd; 750 vq->shadow_avail_idx = vq->last_avail_idx = vq_info.split.avail_index; 751 vq->inuse = 0; 752 vq->used_idx = 0; 753 vq->signalled_used_valid = false; 754 vq->ready = true; 755 756 dev->ops->enable_queue(dev, vq); 757 } 758 759 static void vduse_queue_disable(VduseVirtq *vq) 760 { 761 struct VduseDev *dev = vq->dev; 762 struct vduse_vq_eventfd eventfd; 763 764 if (!vq->ready) { 765 return; 766 } 767 768 dev->ops->disable_queue(dev, vq); 769 770 eventfd.index = vq->index; 771 eventfd.fd = VDUSE_EVENTFD_DEASSIGN; 772 ioctl(dev->fd, VDUSE_VQ_SETUP_KICKFD, &eventfd); 773 close(vq->fd); 774 775 assert(vq->inuse == 0); 776 777 vq->vring.num = 0; 778 vq->vring.desc_addr = 0; 779 vq->vring.avail_addr = 0; 780 vq->vring.used_addr = 0; 781 vq->vring.desc = 0; 782 vq->vring.avail = 0; 783 vq->vring.used = 0; 784 vq->ready = false; 785 vq->fd = -1; 786 } 787 788 static void vduse_dev_start_dataplane(VduseDev *dev) 789 { 790 int i; 791 792 if (ioctl(dev->fd, VDUSE_DEV_GET_FEATURES, &dev->features)) { 793 fprintf(stderr, "Failed to get features: %s\n", strerror(errno)); 794 return; 795 } 796 assert(vduse_dev_has_feature(dev, VIRTIO_F_VERSION_1)); 797 798 for (i = 0; i < dev->num_queues; i++) { 799 vduse_queue_enable(&dev->vqs[i]); 800 } 801 } 802 803 static void vduse_dev_stop_dataplane(VduseDev *dev) 804 { 805 int i; 806 807 for (i = 0; i < dev->num_queues; i++) { 808 vduse_queue_disable(&dev->vqs[i]); 809 } 810 dev->features = 0; 811 vduse_iova_remove_region(dev, 0, ULONG_MAX); 812 } 813 814 int vduse_dev_handler(VduseDev *dev) 815 { 816 struct vduse_dev_request req; 817 struct vduse_dev_response resp = { 0 }; 818 VduseVirtq *vq; 819 int i, ret; 820 821 ret = read(dev->fd, &req, sizeof(req)); 822 if (ret != sizeof(req)) { 823 fprintf(stderr, "Read request error [%d]: %s\n", 824 ret, strerror(errno)); 825 return -errno; 826 } 827 resp.request_id = req.request_id; 828 829 switch (req.type) { 830 case VDUSE_GET_VQ_STATE: 831 vq = &dev->vqs[req.vq_state.index]; 832 resp.vq_state.split.avail_index = vq->last_avail_idx; 833 resp.result = VDUSE_REQ_RESULT_OK; 834 break; 835 case VDUSE_SET_STATUS: 836 if (req.s.status & VIRTIO_CONFIG_S_DRIVER_OK) { 837 vduse_dev_start_dataplane(dev); 838 } else if (req.s.status == 0) { 839 vduse_dev_stop_dataplane(dev); 840 } 841 resp.result = VDUSE_REQ_RESULT_OK; 842 break; 843 case VDUSE_UPDATE_IOTLB: 844 /* The iova will be updated by iova_to_va() later, so just remove it */ 845 vduse_iova_remove_region(dev, req.iova.start, req.iova.last); 846 for (i = 0; i < dev->num_queues; i++) { 847 VduseVirtq *vq = &dev->vqs[i]; 848 if (vq->ready) { 849 if (vduse_queue_update_vring(vq, vq->vring.desc_addr, 850 vq->vring.avail_addr, 851 vq->vring.used_addr)) { 852 fprintf(stderr, "Failed to update vring for vq[%d]\n", 853 vq->index); 854 } 855 } 856 } 857 resp.result = VDUSE_REQ_RESULT_OK; 858 break; 859 default: 860 resp.result = VDUSE_REQ_RESULT_FAILED; 861 break; 862 } 863 864 ret = write(dev->fd, &resp, sizeof(resp)); 865 if (ret != sizeof(resp)) { 866 fprintf(stderr, "Write request %d error [%d]: %s\n", 867 req.type, ret, strerror(errno)); 868 return -errno; 869 } 870 return 0; 871 } 872 873 int vduse_dev_update_config(VduseDev *dev, uint32_t size, 874 uint32_t offset, char *buffer) 875 { 876 int ret; 877 struct vduse_config_data *data; 878 879 data = malloc(offsetof(struct vduse_config_data, buffer) + size); 880 if (!data) { 881 return -ENOMEM; 882 } 883 884 data->offset = offset; 885 data->length = size; 886 memcpy(data->buffer, buffer, size); 887 888 ret = ioctl(dev->fd, VDUSE_DEV_SET_CONFIG, data); 889 free(data); 890 891 if (ret) { 892 return -errno; 893 } 894 895 if (ioctl(dev->fd, VDUSE_DEV_INJECT_CONFIG_IRQ)) { 896 return -errno; 897 } 898 899 return 0; 900 } 901 902 int vduse_dev_setup_queue(VduseDev *dev, int index, int max_size) 903 { 904 VduseVirtq *vq = &dev->vqs[index]; 905 struct vduse_vq_config vq_config = { 0 }; 906 907 if (max_size > VIRTQUEUE_MAX_SIZE) { 908 return -EINVAL; 909 } 910 911 vq_config.index = vq->index; 912 vq_config.max_size = max_size; 913 914 if (ioctl(dev->fd, VDUSE_VQ_SETUP, &vq_config)) { 915 return -errno; 916 } 917 918 return 0; 919 } 920 921 static int vduse_dev_init_vqs(VduseDev *dev, uint16_t num_queues) 922 { 923 VduseVirtq *vqs; 924 int i; 925 926 vqs = calloc(sizeof(VduseVirtq), num_queues); 927 if (!vqs) { 928 return -ENOMEM; 929 } 930 931 for (i = 0; i < num_queues; i++) { 932 vqs[i].index = i; 933 vqs[i].dev = dev; 934 vqs[i].fd = -1; 935 } 936 dev->vqs = vqs; 937 938 return 0; 939 } 940 941 static int vduse_dev_init(VduseDev *dev, const char *name, 942 uint16_t num_queues, const VduseOps *ops, 943 void *priv) 944 { 945 char *dev_path, *dev_name; 946 int ret, fd; 947 948 dev_path = malloc(strlen(name) + strlen("/dev/vduse/") + 1); 949 if (!dev_path) { 950 return -ENOMEM; 951 } 952 sprintf(dev_path, "/dev/vduse/%s", name); 953 954 fd = open(dev_path, O_RDWR); 955 free(dev_path); 956 if (fd < 0) { 957 fprintf(stderr, "Failed to open vduse dev %s: %s\n", 958 name, strerror(errno)); 959 return -errno; 960 } 961 962 dev_name = strdup(name); 963 if (!dev_name) { 964 close(fd); 965 return -ENOMEM; 966 } 967 968 ret = vduse_dev_init_vqs(dev, num_queues); 969 if (ret) { 970 free(dev_name); 971 close(fd); 972 return ret; 973 } 974 975 dev->name = dev_name; 976 dev->num_queues = num_queues; 977 dev->fd = fd; 978 dev->ops = ops; 979 dev->priv = priv; 980 981 return 0; 982 } 983 984 static inline bool vduse_name_is_valid(const char *name) 985 { 986 return strlen(name) >= VDUSE_NAME_MAX || strstr(name, ".."); 987 } 988 989 VduseDev *vduse_dev_create_by_fd(int fd, uint16_t num_queues, 990 const VduseOps *ops, void *priv) 991 { 992 VduseDev *dev; 993 int ret; 994 995 if (!ops || !ops->enable_queue || !ops->disable_queue) { 996 fprintf(stderr, "Invalid parameter for vduse\n"); 997 return NULL; 998 } 999 1000 dev = calloc(sizeof(VduseDev), 1); 1001 if (!dev) { 1002 fprintf(stderr, "Failed to allocate vduse device\n"); 1003 return NULL; 1004 } 1005 1006 ret = vduse_dev_init_vqs(dev, num_queues); 1007 if (ret) { 1008 fprintf(stderr, "Failed to init vqs\n"); 1009 free(dev); 1010 return NULL; 1011 } 1012 1013 dev->num_queues = num_queues; 1014 dev->fd = fd; 1015 dev->ops = ops; 1016 dev->priv = priv; 1017 1018 return dev; 1019 } 1020 1021 VduseDev *vduse_dev_create_by_name(const char *name, uint16_t num_queues, 1022 const VduseOps *ops, void *priv) 1023 { 1024 VduseDev *dev; 1025 int ret; 1026 1027 if (!name || vduse_name_is_valid(name) || !ops || 1028 !ops->enable_queue || !ops->disable_queue) { 1029 fprintf(stderr, "Invalid parameter for vduse\n"); 1030 return NULL; 1031 } 1032 1033 dev = calloc(sizeof(VduseDev), 1); 1034 if (!dev) { 1035 fprintf(stderr, "Failed to allocate vduse device\n"); 1036 return NULL; 1037 } 1038 1039 ret = vduse_dev_init(dev, name, num_queues, ops, priv); 1040 if (ret < 0) { 1041 fprintf(stderr, "Failed to init vduse device %s: %s\n", 1042 name, strerror(ret)); 1043 free(dev); 1044 return NULL; 1045 } 1046 1047 return dev; 1048 } 1049 1050 VduseDev *vduse_dev_create(const char *name, uint32_t device_id, 1051 uint32_t vendor_id, uint64_t features, 1052 uint16_t num_queues, uint32_t config_size, 1053 char *config, const VduseOps *ops, void *priv) 1054 { 1055 VduseDev *dev; 1056 int ret, ctrl_fd; 1057 uint64_t version; 1058 struct vduse_dev_config *dev_config; 1059 size_t size = offsetof(struct vduse_dev_config, config); 1060 1061 if (!name || vduse_name_is_valid(name) || 1062 !has_feature(features, VIRTIO_F_VERSION_1) || !config || 1063 !config_size || !ops || !ops->enable_queue || !ops->disable_queue) { 1064 fprintf(stderr, "Invalid parameter for vduse\n"); 1065 return NULL; 1066 } 1067 1068 dev = calloc(sizeof(VduseDev), 1); 1069 if (!dev) { 1070 fprintf(stderr, "Failed to allocate vduse device\n"); 1071 return NULL; 1072 } 1073 1074 ctrl_fd = open("/dev/vduse/control", O_RDWR); 1075 if (ctrl_fd < 0) { 1076 fprintf(stderr, "Failed to open /dev/vduse/control: %s\n", 1077 strerror(errno)); 1078 goto err_ctrl; 1079 } 1080 1081 version = VDUSE_API_VERSION; 1082 if (ioctl(ctrl_fd, VDUSE_SET_API_VERSION, &version)) { 1083 fprintf(stderr, "Failed to set api version %" PRIu64 ": %s\n", 1084 version, strerror(errno)); 1085 goto err_dev; 1086 } 1087 1088 dev_config = calloc(size + config_size, 1); 1089 if (!dev_config) { 1090 fprintf(stderr, "Failed to allocate config space\n"); 1091 goto err_dev; 1092 } 1093 1094 strcpy(dev_config->name, name); 1095 dev_config->device_id = device_id; 1096 dev_config->vendor_id = vendor_id; 1097 dev_config->features = features; 1098 dev_config->vq_num = num_queues; 1099 dev_config->vq_align = VDUSE_VQ_ALIGN; 1100 dev_config->config_size = config_size; 1101 memcpy(dev_config->config, config, config_size); 1102 1103 ret = ioctl(ctrl_fd, VDUSE_CREATE_DEV, dev_config); 1104 free(dev_config); 1105 if (ret < 0) { 1106 fprintf(stderr, "Failed to create vduse device %s: %s\n", 1107 name, strerror(errno)); 1108 goto err_dev; 1109 } 1110 dev->ctrl_fd = ctrl_fd; 1111 1112 ret = vduse_dev_init(dev, name, num_queues, ops, priv); 1113 if (ret < 0) { 1114 fprintf(stderr, "Failed to init vduse device %s: %s\n", 1115 name, strerror(ret)); 1116 goto err; 1117 } 1118 1119 return dev; 1120 err: 1121 ioctl(ctrl_fd, VDUSE_DESTROY_DEV, name); 1122 err_dev: 1123 close(ctrl_fd); 1124 err_ctrl: 1125 free(dev); 1126 1127 return NULL; 1128 } 1129 1130 int vduse_dev_destroy(VduseDev *dev) 1131 { 1132 int ret = 0; 1133 1134 free(dev->vqs); 1135 if (dev->fd >= 0) { 1136 close(dev->fd); 1137 dev->fd = -1; 1138 } 1139 if (dev->ctrl_fd >= 0) { 1140 if (ioctl(dev->ctrl_fd, VDUSE_DESTROY_DEV, dev->name)) { 1141 ret = -errno; 1142 } 1143 close(dev->ctrl_fd); 1144 dev->ctrl_fd = -1; 1145 } 1146 free(dev->name); 1147 free(dev); 1148 1149 return ret; 1150 } 1151