1 /* Copyright (C) 2009 Red Hat, Inc. 2 * Copyright (C) 2006 Rusty Russell IBM Corporation 3 * 4 * Author: Michael S. Tsirkin <mst@redhat.com> 5 * 6 * Inspiration, some code, and most witty comments come from 7 * Documentation/virtual/lguest/lguest.c, by Rusty Russell 8 * 9 * This work is licensed under the terms of the GNU GPL, version 2. 10 * 11 * Generic code for virtio server in host kernel. 12 */ 13 14 #include <linux/eventfd.h> 15 #include <linux/vhost.h> 16 #include <linux/uio.h> 17 #include <linux/mm.h> 18 #include <linux/mmu_context.h> 19 #include <linux/miscdevice.h> 20 #include <linux/mutex.h> 21 #include <linux/poll.h> 22 #include <linux/file.h> 23 #include <linux/highmem.h> 24 #include <linux/slab.h> 25 #include <linux/vmalloc.h> 26 #include <linux/kthread.h> 27 #include <linux/cgroup.h> 28 #include <linux/module.h> 29 #include <linux/sort.h> 30 31 #include "vhost.h" 32 33 static ushort max_mem_regions = 64; 34 module_param(max_mem_regions, ushort, 0444); 35 MODULE_PARM_DESC(max_mem_regions, 36 "Maximum number of memory regions in memory map. (default: 64)"); 37 38 enum { 39 VHOST_MEMORY_F_LOG = 0x1, 40 }; 41 42 #define vhost_used_event(vq) ((__virtio16 __user *)&vq->avail->ring[vq->num]) 43 #define vhost_avail_event(vq) ((__virtio16 __user *)&vq->used->ring[vq->num]) 44 45 #ifdef CONFIG_VHOST_CROSS_ENDIAN_LEGACY 46 static void vhost_vq_reset_user_be(struct vhost_virtqueue *vq) 47 { 48 vq->user_be = !virtio_legacy_is_little_endian(); 49 } 50 51 static long vhost_set_vring_endian(struct vhost_virtqueue *vq, int __user *argp) 52 { 53 struct vhost_vring_state s; 54 55 if (vq->private_data) 56 return -EBUSY; 57 58 if (copy_from_user(&s, argp, sizeof(s))) 59 return -EFAULT; 60 61 if (s.num != VHOST_VRING_LITTLE_ENDIAN && 62 s.num != VHOST_VRING_BIG_ENDIAN) 63 return -EINVAL; 64 65 vq->user_be = s.num; 66 67 return 0; 68 } 69 70 static long vhost_get_vring_endian(struct vhost_virtqueue *vq, u32 idx, 71 int __user *argp) 72 { 73 struct vhost_vring_state s = { 74 .index = idx, 75 .num = vq->user_be 76 }; 77 78 if (copy_to_user(argp, &s, sizeof(s))) 79 return -EFAULT; 80 81 return 0; 82 } 83 84 static void vhost_init_is_le(struct vhost_virtqueue *vq) 85 { 86 /* Note for legacy virtio: user_be is initialized at reset time 87 * according to the host endianness. If userspace does not set an 88 * explicit endianness, the default behavior is native endian, as 89 * expected by legacy virtio. 90 */ 91 vq->is_le = vhost_has_feature(vq, VIRTIO_F_VERSION_1) || !vq->user_be; 92 } 93 #else 94 static void vhost_vq_reset_user_be(struct vhost_virtqueue *vq) 95 { 96 } 97 98 static long vhost_set_vring_endian(struct vhost_virtqueue *vq, int __user *argp) 99 { 100 return -ENOIOCTLCMD; 101 } 102 103 static long vhost_get_vring_endian(struct vhost_virtqueue *vq, u32 idx, 104 int __user *argp) 105 { 106 return -ENOIOCTLCMD; 107 } 108 109 static void vhost_init_is_le(struct vhost_virtqueue *vq) 110 { 111 if (vhost_has_feature(vq, VIRTIO_F_VERSION_1)) 112 vq->is_le = true; 113 } 114 #endif /* CONFIG_VHOST_CROSS_ENDIAN_LEGACY */ 115 116 static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh, 117 poll_table *pt) 118 { 119 struct vhost_poll *poll; 120 121 poll = container_of(pt, struct vhost_poll, table); 122 poll->wqh = wqh; 123 add_wait_queue(wqh, &poll->wait); 124 } 125 126 static int vhost_poll_wakeup(wait_queue_t *wait, unsigned mode, int sync, 127 void *key) 128 { 129 struct vhost_poll *poll = container_of(wait, struct vhost_poll, wait); 130 131 if (!((unsigned long)key & poll->mask)) 132 return 0; 133 134 vhost_poll_queue(poll); 135 return 0; 136 } 137 138 void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn) 139 { 140 INIT_LIST_HEAD(&work->node); 141 work->fn = fn; 142 init_waitqueue_head(&work->done); 143 work->flushing = 0; 144 work->queue_seq = work->done_seq = 0; 145 } 146 EXPORT_SYMBOL_GPL(vhost_work_init); 147 148 /* Init poll structure */ 149 void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn, 150 unsigned long mask, struct vhost_dev *dev) 151 { 152 init_waitqueue_func_entry(&poll->wait, vhost_poll_wakeup); 153 init_poll_funcptr(&poll->table, vhost_poll_func); 154 poll->mask = mask; 155 poll->dev = dev; 156 poll->wqh = NULL; 157 158 vhost_work_init(&poll->work, fn); 159 } 160 EXPORT_SYMBOL_GPL(vhost_poll_init); 161 162 /* Start polling a file. We add ourselves to file's wait queue. The caller must 163 * keep a reference to a file until after vhost_poll_stop is called. */ 164 int vhost_poll_start(struct vhost_poll *poll, struct file *file) 165 { 166 unsigned long mask; 167 int ret = 0; 168 169 if (poll->wqh) 170 return 0; 171 172 mask = file->f_op->poll(file, &poll->table); 173 if (mask) 174 vhost_poll_wakeup(&poll->wait, 0, 0, (void *)mask); 175 if (mask & POLLERR) { 176 if (poll->wqh) 177 remove_wait_queue(poll->wqh, &poll->wait); 178 ret = -EINVAL; 179 } 180 181 return ret; 182 } 183 EXPORT_SYMBOL_GPL(vhost_poll_start); 184 185 /* Stop polling a file. After this function returns, it becomes safe to drop the 186 * file reference. You must also flush afterwards. */ 187 void vhost_poll_stop(struct vhost_poll *poll) 188 { 189 if (poll->wqh) { 190 remove_wait_queue(poll->wqh, &poll->wait); 191 poll->wqh = NULL; 192 } 193 } 194 EXPORT_SYMBOL_GPL(vhost_poll_stop); 195 196 static bool vhost_work_seq_done(struct vhost_dev *dev, struct vhost_work *work, 197 unsigned seq) 198 { 199 int left; 200 201 spin_lock_irq(&dev->work_lock); 202 left = seq - work->done_seq; 203 spin_unlock_irq(&dev->work_lock); 204 return left <= 0; 205 } 206 207 void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work) 208 { 209 unsigned seq; 210 int flushing; 211 212 spin_lock_irq(&dev->work_lock); 213 seq = work->queue_seq; 214 work->flushing++; 215 spin_unlock_irq(&dev->work_lock); 216 wait_event(work->done, vhost_work_seq_done(dev, work, seq)); 217 spin_lock_irq(&dev->work_lock); 218 flushing = --work->flushing; 219 spin_unlock_irq(&dev->work_lock); 220 BUG_ON(flushing < 0); 221 } 222 EXPORT_SYMBOL_GPL(vhost_work_flush); 223 224 /* Flush any work that has been scheduled. When calling this, don't hold any 225 * locks that are also used by the callback. */ 226 void vhost_poll_flush(struct vhost_poll *poll) 227 { 228 vhost_work_flush(poll->dev, &poll->work); 229 } 230 EXPORT_SYMBOL_GPL(vhost_poll_flush); 231 232 void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work) 233 { 234 unsigned long flags; 235 236 spin_lock_irqsave(&dev->work_lock, flags); 237 if (list_empty(&work->node)) { 238 list_add_tail(&work->node, &dev->work_list); 239 work->queue_seq++; 240 spin_unlock_irqrestore(&dev->work_lock, flags); 241 wake_up_process(dev->worker); 242 } else { 243 spin_unlock_irqrestore(&dev->work_lock, flags); 244 } 245 } 246 EXPORT_SYMBOL_GPL(vhost_work_queue); 247 248 void vhost_poll_queue(struct vhost_poll *poll) 249 { 250 vhost_work_queue(poll->dev, &poll->work); 251 } 252 EXPORT_SYMBOL_GPL(vhost_poll_queue); 253 254 static void vhost_vq_reset(struct vhost_dev *dev, 255 struct vhost_virtqueue *vq) 256 { 257 vq->num = 1; 258 vq->desc = NULL; 259 vq->avail = NULL; 260 vq->used = NULL; 261 vq->last_avail_idx = 0; 262 vq->avail_idx = 0; 263 vq->last_used_idx = 0; 264 vq->signalled_used = 0; 265 vq->signalled_used_valid = false; 266 vq->used_flags = 0; 267 vq->log_used = false; 268 vq->log_addr = -1ull; 269 vq->private_data = NULL; 270 vq->acked_features = 0; 271 vq->log_base = NULL; 272 vq->error_ctx = NULL; 273 vq->error = NULL; 274 vq->kick = NULL; 275 vq->call_ctx = NULL; 276 vq->call = NULL; 277 vq->log_ctx = NULL; 278 vq->memory = NULL; 279 vq->is_le = virtio_legacy_is_little_endian(); 280 vhost_vq_reset_user_be(vq); 281 } 282 283 static int vhost_worker(void *data) 284 { 285 struct vhost_dev *dev = data; 286 struct vhost_work *work = NULL; 287 unsigned uninitialized_var(seq); 288 mm_segment_t oldfs = get_fs(); 289 290 set_fs(USER_DS); 291 use_mm(dev->mm); 292 293 for (;;) { 294 /* mb paired w/ kthread_stop */ 295 set_current_state(TASK_INTERRUPTIBLE); 296 297 spin_lock_irq(&dev->work_lock); 298 if (work) { 299 work->done_seq = seq; 300 if (work->flushing) 301 wake_up_all(&work->done); 302 } 303 304 if (kthread_should_stop()) { 305 spin_unlock_irq(&dev->work_lock); 306 __set_current_state(TASK_RUNNING); 307 break; 308 } 309 if (!list_empty(&dev->work_list)) { 310 work = list_first_entry(&dev->work_list, 311 struct vhost_work, node); 312 list_del_init(&work->node); 313 seq = work->queue_seq; 314 } else 315 work = NULL; 316 spin_unlock_irq(&dev->work_lock); 317 318 if (work) { 319 __set_current_state(TASK_RUNNING); 320 work->fn(work); 321 if (need_resched()) 322 schedule(); 323 } else 324 schedule(); 325 326 } 327 unuse_mm(dev->mm); 328 set_fs(oldfs); 329 return 0; 330 } 331 332 static void vhost_vq_free_iovecs(struct vhost_virtqueue *vq) 333 { 334 kfree(vq->indirect); 335 vq->indirect = NULL; 336 kfree(vq->log); 337 vq->log = NULL; 338 kfree(vq->heads); 339 vq->heads = NULL; 340 } 341 342 /* Helper to allocate iovec buffers for all vqs. */ 343 static long vhost_dev_alloc_iovecs(struct vhost_dev *dev) 344 { 345 struct vhost_virtqueue *vq; 346 int i; 347 348 for (i = 0; i < dev->nvqs; ++i) { 349 vq = dev->vqs[i]; 350 vq->indirect = kmalloc(sizeof *vq->indirect * UIO_MAXIOV, 351 GFP_KERNEL); 352 vq->log = kmalloc(sizeof *vq->log * UIO_MAXIOV, GFP_KERNEL); 353 vq->heads = kmalloc(sizeof *vq->heads * UIO_MAXIOV, GFP_KERNEL); 354 if (!vq->indirect || !vq->log || !vq->heads) 355 goto err_nomem; 356 } 357 return 0; 358 359 err_nomem: 360 for (; i >= 0; --i) 361 vhost_vq_free_iovecs(dev->vqs[i]); 362 return -ENOMEM; 363 } 364 365 static void vhost_dev_free_iovecs(struct vhost_dev *dev) 366 { 367 int i; 368 369 for (i = 0; i < dev->nvqs; ++i) 370 vhost_vq_free_iovecs(dev->vqs[i]); 371 } 372 373 void vhost_dev_init(struct vhost_dev *dev, 374 struct vhost_virtqueue **vqs, int nvqs) 375 { 376 struct vhost_virtqueue *vq; 377 int i; 378 379 dev->vqs = vqs; 380 dev->nvqs = nvqs; 381 mutex_init(&dev->mutex); 382 dev->log_ctx = NULL; 383 dev->log_file = NULL; 384 dev->memory = NULL; 385 dev->mm = NULL; 386 spin_lock_init(&dev->work_lock); 387 INIT_LIST_HEAD(&dev->work_list); 388 dev->worker = NULL; 389 390 for (i = 0; i < dev->nvqs; ++i) { 391 vq = dev->vqs[i]; 392 vq->log = NULL; 393 vq->indirect = NULL; 394 vq->heads = NULL; 395 vq->dev = dev; 396 mutex_init(&vq->mutex); 397 vhost_vq_reset(dev, vq); 398 if (vq->handle_kick) 399 vhost_poll_init(&vq->poll, vq->handle_kick, 400 POLLIN, dev); 401 } 402 } 403 EXPORT_SYMBOL_GPL(vhost_dev_init); 404 405 /* Caller should have device mutex */ 406 long vhost_dev_check_owner(struct vhost_dev *dev) 407 { 408 /* Are you the owner? If not, I don't think you mean to do that */ 409 return dev->mm == current->mm ? 0 : -EPERM; 410 } 411 EXPORT_SYMBOL_GPL(vhost_dev_check_owner); 412 413 struct vhost_attach_cgroups_struct { 414 struct vhost_work work; 415 struct task_struct *owner; 416 int ret; 417 }; 418 419 static void vhost_attach_cgroups_work(struct vhost_work *work) 420 { 421 struct vhost_attach_cgroups_struct *s; 422 423 s = container_of(work, struct vhost_attach_cgroups_struct, work); 424 s->ret = cgroup_attach_task_all(s->owner, current); 425 } 426 427 static int vhost_attach_cgroups(struct vhost_dev *dev) 428 { 429 struct vhost_attach_cgroups_struct attach; 430 431 attach.owner = current; 432 vhost_work_init(&attach.work, vhost_attach_cgroups_work); 433 vhost_work_queue(dev, &attach.work); 434 vhost_work_flush(dev, &attach.work); 435 return attach.ret; 436 } 437 438 /* Caller should have device mutex */ 439 bool vhost_dev_has_owner(struct vhost_dev *dev) 440 { 441 return dev->mm; 442 } 443 EXPORT_SYMBOL_GPL(vhost_dev_has_owner); 444 445 /* Caller should have device mutex */ 446 long vhost_dev_set_owner(struct vhost_dev *dev) 447 { 448 struct task_struct *worker; 449 int err; 450 451 /* Is there an owner already? */ 452 if (vhost_dev_has_owner(dev)) { 453 err = -EBUSY; 454 goto err_mm; 455 } 456 457 /* No owner, become one */ 458 dev->mm = get_task_mm(current); 459 worker = kthread_create(vhost_worker, dev, "vhost-%d", current->pid); 460 if (IS_ERR(worker)) { 461 err = PTR_ERR(worker); 462 goto err_worker; 463 } 464 465 dev->worker = worker; 466 wake_up_process(worker); /* avoid contributing to loadavg */ 467 468 err = vhost_attach_cgroups(dev); 469 if (err) 470 goto err_cgroup; 471 472 err = vhost_dev_alloc_iovecs(dev); 473 if (err) 474 goto err_cgroup; 475 476 return 0; 477 err_cgroup: 478 kthread_stop(worker); 479 dev->worker = NULL; 480 err_worker: 481 if (dev->mm) 482 mmput(dev->mm); 483 dev->mm = NULL; 484 err_mm: 485 return err; 486 } 487 EXPORT_SYMBOL_GPL(vhost_dev_set_owner); 488 489 struct vhost_memory *vhost_dev_reset_owner_prepare(void) 490 { 491 return kmalloc(offsetof(struct vhost_memory, regions), GFP_KERNEL); 492 } 493 EXPORT_SYMBOL_GPL(vhost_dev_reset_owner_prepare); 494 495 /* Caller should have device mutex */ 496 void vhost_dev_reset_owner(struct vhost_dev *dev, struct vhost_memory *memory) 497 { 498 int i; 499 500 vhost_dev_cleanup(dev, true); 501 502 /* Restore memory to default empty mapping. */ 503 memory->nregions = 0; 504 dev->memory = memory; 505 /* We don't need VQ locks below since vhost_dev_cleanup makes sure 506 * VQs aren't running. 507 */ 508 for (i = 0; i < dev->nvqs; ++i) 509 dev->vqs[i]->memory = memory; 510 } 511 EXPORT_SYMBOL_GPL(vhost_dev_reset_owner); 512 513 void vhost_dev_stop(struct vhost_dev *dev) 514 { 515 int i; 516 517 for (i = 0; i < dev->nvqs; ++i) { 518 if (dev->vqs[i]->kick && dev->vqs[i]->handle_kick) { 519 vhost_poll_stop(&dev->vqs[i]->poll); 520 vhost_poll_flush(&dev->vqs[i]->poll); 521 } 522 } 523 } 524 EXPORT_SYMBOL_GPL(vhost_dev_stop); 525 526 /* Caller should have device mutex if and only if locked is set */ 527 void vhost_dev_cleanup(struct vhost_dev *dev, bool locked) 528 { 529 int i; 530 531 for (i = 0; i < dev->nvqs; ++i) { 532 if (dev->vqs[i]->error_ctx) 533 eventfd_ctx_put(dev->vqs[i]->error_ctx); 534 if (dev->vqs[i]->error) 535 fput(dev->vqs[i]->error); 536 if (dev->vqs[i]->kick) 537 fput(dev->vqs[i]->kick); 538 if (dev->vqs[i]->call_ctx) 539 eventfd_ctx_put(dev->vqs[i]->call_ctx); 540 if (dev->vqs[i]->call) 541 fput(dev->vqs[i]->call); 542 vhost_vq_reset(dev, dev->vqs[i]); 543 } 544 vhost_dev_free_iovecs(dev); 545 if (dev->log_ctx) 546 eventfd_ctx_put(dev->log_ctx); 547 dev->log_ctx = NULL; 548 if (dev->log_file) 549 fput(dev->log_file); 550 dev->log_file = NULL; 551 /* No one will access memory at this point */ 552 kvfree(dev->memory); 553 dev->memory = NULL; 554 WARN_ON(!list_empty(&dev->work_list)); 555 if (dev->worker) { 556 kthread_stop(dev->worker); 557 dev->worker = NULL; 558 } 559 if (dev->mm) 560 mmput(dev->mm); 561 dev->mm = NULL; 562 } 563 EXPORT_SYMBOL_GPL(vhost_dev_cleanup); 564 565 static int log_access_ok(void __user *log_base, u64 addr, unsigned long sz) 566 { 567 u64 a = addr / VHOST_PAGE_SIZE / 8; 568 569 /* Make sure 64 bit math will not overflow. */ 570 if (a > ULONG_MAX - (unsigned long)log_base || 571 a + (unsigned long)log_base > ULONG_MAX) 572 return 0; 573 574 return access_ok(VERIFY_WRITE, log_base + a, 575 (sz + VHOST_PAGE_SIZE * 8 - 1) / VHOST_PAGE_SIZE / 8); 576 } 577 578 /* Caller should have vq mutex and device mutex. */ 579 static int vq_memory_access_ok(void __user *log_base, struct vhost_memory *mem, 580 int log_all) 581 { 582 int i; 583 584 if (!mem) 585 return 0; 586 587 for (i = 0; i < mem->nregions; ++i) { 588 struct vhost_memory_region *m = mem->regions + i; 589 unsigned long a = m->userspace_addr; 590 if (m->memory_size > ULONG_MAX) 591 return 0; 592 else if (!access_ok(VERIFY_WRITE, (void __user *)a, 593 m->memory_size)) 594 return 0; 595 else if (log_all && !log_access_ok(log_base, 596 m->guest_phys_addr, 597 m->memory_size)) 598 return 0; 599 } 600 return 1; 601 } 602 603 /* Can we switch to this memory table? */ 604 /* Caller should have device mutex but not vq mutex */ 605 static int memory_access_ok(struct vhost_dev *d, struct vhost_memory *mem, 606 int log_all) 607 { 608 int i; 609 610 for (i = 0; i < d->nvqs; ++i) { 611 int ok; 612 bool log; 613 614 mutex_lock(&d->vqs[i]->mutex); 615 log = log_all || vhost_has_feature(d->vqs[i], VHOST_F_LOG_ALL); 616 /* If ring is inactive, will check when it's enabled. */ 617 if (d->vqs[i]->private_data) 618 ok = vq_memory_access_ok(d->vqs[i]->log_base, mem, log); 619 else 620 ok = 1; 621 mutex_unlock(&d->vqs[i]->mutex); 622 if (!ok) 623 return 0; 624 } 625 return 1; 626 } 627 628 static int vq_access_ok(struct vhost_virtqueue *vq, unsigned int num, 629 struct vring_desc __user *desc, 630 struct vring_avail __user *avail, 631 struct vring_used __user *used) 632 { 633 size_t s = vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0; 634 return access_ok(VERIFY_READ, desc, num * sizeof *desc) && 635 access_ok(VERIFY_READ, avail, 636 sizeof *avail + num * sizeof *avail->ring + s) && 637 access_ok(VERIFY_WRITE, used, 638 sizeof *used + num * sizeof *used->ring + s); 639 } 640 641 /* Can we log writes? */ 642 /* Caller should have device mutex but not vq mutex */ 643 int vhost_log_access_ok(struct vhost_dev *dev) 644 { 645 return memory_access_ok(dev, dev->memory, 1); 646 } 647 EXPORT_SYMBOL_GPL(vhost_log_access_ok); 648 649 /* Verify access for write logging. */ 650 /* Caller should have vq mutex and device mutex */ 651 static int vq_log_access_ok(struct vhost_virtqueue *vq, 652 void __user *log_base) 653 { 654 size_t s = vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0; 655 656 return vq_memory_access_ok(log_base, vq->memory, 657 vhost_has_feature(vq, VHOST_F_LOG_ALL)) && 658 (!vq->log_used || log_access_ok(log_base, vq->log_addr, 659 sizeof *vq->used + 660 vq->num * sizeof *vq->used->ring + s)); 661 } 662 663 /* Can we start vq? */ 664 /* Caller should have vq mutex and device mutex */ 665 int vhost_vq_access_ok(struct vhost_virtqueue *vq) 666 { 667 return vq_access_ok(vq, vq->num, vq->desc, vq->avail, vq->used) && 668 vq_log_access_ok(vq, vq->log_base); 669 } 670 EXPORT_SYMBOL_GPL(vhost_vq_access_ok); 671 672 static int vhost_memory_reg_sort_cmp(const void *p1, const void *p2) 673 { 674 const struct vhost_memory_region *r1 = p1, *r2 = p2; 675 if (r1->guest_phys_addr < r2->guest_phys_addr) 676 return 1; 677 if (r1->guest_phys_addr > r2->guest_phys_addr) 678 return -1; 679 return 0; 680 } 681 682 static void *vhost_kvzalloc(unsigned long size) 683 { 684 void *n = kzalloc(size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); 685 686 if (!n) 687 n = vzalloc(size); 688 return n; 689 } 690 691 static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m) 692 { 693 struct vhost_memory mem, *newmem, *oldmem; 694 unsigned long size = offsetof(struct vhost_memory, regions); 695 int i; 696 697 if (copy_from_user(&mem, m, size)) 698 return -EFAULT; 699 if (mem.padding) 700 return -EOPNOTSUPP; 701 if (mem.nregions > max_mem_regions) 702 return -E2BIG; 703 newmem = vhost_kvzalloc(size + mem.nregions * sizeof(*m->regions)); 704 if (!newmem) 705 return -ENOMEM; 706 707 memcpy(newmem, &mem, size); 708 if (copy_from_user(newmem->regions, m->regions, 709 mem.nregions * sizeof *m->regions)) { 710 kvfree(newmem); 711 return -EFAULT; 712 } 713 sort(newmem->regions, newmem->nregions, sizeof(*newmem->regions), 714 vhost_memory_reg_sort_cmp, NULL); 715 716 if (!memory_access_ok(d, newmem, 0)) { 717 kvfree(newmem); 718 return -EFAULT; 719 } 720 oldmem = d->memory; 721 d->memory = newmem; 722 723 /* All memory accesses are done under some VQ mutex. */ 724 for (i = 0; i < d->nvqs; ++i) { 725 mutex_lock(&d->vqs[i]->mutex); 726 d->vqs[i]->memory = newmem; 727 mutex_unlock(&d->vqs[i]->mutex); 728 } 729 kvfree(oldmem); 730 return 0; 731 } 732 733 long vhost_vring_ioctl(struct vhost_dev *d, int ioctl, void __user *argp) 734 { 735 struct file *eventfp, *filep = NULL; 736 bool pollstart = false, pollstop = false; 737 struct eventfd_ctx *ctx = NULL; 738 u32 __user *idxp = argp; 739 struct vhost_virtqueue *vq; 740 struct vhost_vring_state s; 741 struct vhost_vring_file f; 742 struct vhost_vring_addr a; 743 u32 idx; 744 long r; 745 746 r = get_user(idx, idxp); 747 if (r < 0) 748 return r; 749 if (idx >= d->nvqs) 750 return -ENOBUFS; 751 752 vq = d->vqs[idx]; 753 754 mutex_lock(&vq->mutex); 755 756 switch (ioctl) { 757 case VHOST_SET_VRING_NUM: 758 /* Resizing ring with an active backend? 759 * You don't want to do that. */ 760 if (vq->private_data) { 761 r = -EBUSY; 762 break; 763 } 764 if (copy_from_user(&s, argp, sizeof s)) { 765 r = -EFAULT; 766 break; 767 } 768 if (!s.num || s.num > 0xffff || (s.num & (s.num - 1))) { 769 r = -EINVAL; 770 break; 771 } 772 vq->num = s.num; 773 break; 774 case VHOST_SET_VRING_BASE: 775 /* Moving base with an active backend? 776 * You don't want to do that. */ 777 if (vq->private_data) { 778 r = -EBUSY; 779 break; 780 } 781 if (copy_from_user(&s, argp, sizeof s)) { 782 r = -EFAULT; 783 break; 784 } 785 if (s.num > 0xffff) { 786 r = -EINVAL; 787 break; 788 } 789 vq->last_avail_idx = s.num; 790 /* Forget the cached index value. */ 791 vq->avail_idx = vq->last_avail_idx; 792 break; 793 case VHOST_GET_VRING_BASE: 794 s.index = idx; 795 s.num = vq->last_avail_idx; 796 if (copy_to_user(argp, &s, sizeof s)) 797 r = -EFAULT; 798 break; 799 case VHOST_SET_VRING_ADDR: 800 if (copy_from_user(&a, argp, sizeof a)) { 801 r = -EFAULT; 802 break; 803 } 804 if (a.flags & ~(0x1 << VHOST_VRING_F_LOG)) { 805 r = -EOPNOTSUPP; 806 break; 807 } 808 /* For 32bit, verify that the top 32bits of the user 809 data are set to zero. */ 810 if ((u64)(unsigned long)a.desc_user_addr != a.desc_user_addr || 811 (u64)(unsigned long)a.used_user_addr != a.used_user_addr || 812 (u64)(unsigned long)a.avail_user_addr != a.avail_user_addr) { 813 r = -EFAULT; 814 break; 815 } 816 817 /* Make sure it's safe to cast pointers to vring types. */ 818 BUILD_BUG_ON(__alignof__ *vq->avail > VRING_AVAIL_ALIGN_SIZE); 819 BUILD_BUG_ON(__alignof__ *vq->used > VRING_USED_ALIGN_SIZE); 820 if ((a.avail_user_addr & (VRING_AVAIL_ALIGN_SIZE - 1)) || 821 (a.used_user_addr & (VRING_USED_ALIGN_SIZE - 1)) || 822 (a.log_guest_addr & (sizeof(u64) - 1))) { 823 r = -EINVAL; 824 break; 825 } 826 827 /* We only verify access here if backend is configured. 828 * If it is not, we don't as size might not have been setup. 829 * We will verify when backend is configured. */ 830 if (vq->private_data) { 831 if (!vq_access_ok(vq, vq->num, 832 (void __user *)(unsigned long)a.desc_user_addr, 833 (void __user *)(unsigned long)a.avail_user_addr, 834 (void __user *)(unsigned long)a.used_user_addr)) { 835 r = -EINVAL; 836 break; 837 } 838 839 /* Also validate log access for used ring if enabled. */ 840 if ((a.flags & (0x1 << VHOST_VRING_F_LOG)) && 841 !log_access_ok(vq->log_base, a.log_guest_addr, 842 sizeof *vq->used + 843 vq->num * sizeof *vq->used->ring)) { 844 r = -EINVAL; 845 break; 846 } 847 } 848 849 vq->log_used = !!(a.flags & (0x1 << VHOST_VRING_F_LOG)); 850 vq->desc = (void __user *)(unsigned long)a.desc_user_addr; 851 vq->avail = (void __user *)(unsigned long)a.avail_user_addr; 852 vq->log_addr = a.log_guest_addr; 853 vq->used = (void __user *)(unsigned long)a.used_user_addr; 854 break; 855 case VHOST_SET_VRING_KICK: 856 if (copy_from_user(&f, argp, sizeof f)) { 857 r = -EFAULT; 858 break; 859 } 860 eventfp = f.fd == -1 ? NULL : eventfd_fget(f.fd); 861 if (IS_ERR(eventfp)) { 862 r = PTR_ERR(eventfp); 863 break; 864 } 865 if (eventfp != vq->kick) { 866 pollstop = (filep = vq->kick) != NULL; 867 pollstart = (vq->kick = eventfp) != NULL; 868 } else 869 filep = eventfp; 870 break; 871 case VHOST_SET_VRING_CALL: 872 if (copy_from_user(&f, argp, sizeof f)) { 873 r = -EFAULT; 874 break; 875 } 876 eventfp = f.fd == -1 ? NULL : eventfd_fget(f.fd); 877 if (IS_ERR(eventfp)) { 878 r = PTR_ERR(eventfp); 879 break; 880 } 881 if (eventfp != vq->call) { 882 filep = vq->call; 883 ctx = vq->call_ctx; 884 vq->call = eventfp; 885 vq->call_ctx = eventfp ? 886 eventfd_ctx_fileget(eventfp) : NULL; 887 } else 888 filep = eventfp; 889 break; 890 case VHOST_SET_VRING_ERR: 891 if (copy_from_user(&f, argp, sizeof f)) { 892 r = -EFAULT; 893 break; 894 } 895 eventfp = f.fd == -1 ? NULL : eventfd_fget(f.fd); 896 if (IS_ERR(eventfp)) { 897 r = PTR_ERR(eventfp); 898 break; 899 } 900 if (eventfp != vq->error) { 901 filep = vq->error; 902 vq->error = eventfp; 903 ctx = vq->error_ctx; 904 vq->error_ctx = eventfp ? 905 eventfd_ctx_fileget(eventfp) : NULL; 906 } else 907 filep = eventfp; 908 break; 909 case VHOST_SET_VRING_ENDIAN: 910 r = vhost_set_vring_endian(vq, argp); 911 break; 912 case VHOST_GET_VRING_ENDIAN: 913 r = vhost_get_vring_endian(vq, idx, argp); 914 break; 915 default: 916 r = -ENOIOCTLCMD; 917 } 918 919 if (pollstop && vq->handle_kick) 920 vhost_poll_stop(&vq->poll); 921 922 if (ctx) 923 eventfd_ctx_put(ctx); 924 if (filep) 925 fput(filep); 926 927 if (pollstart && vq->handle_kick) 928 r = vhost_poll_start(&vq->poll, vq->kick); 929 930 mutex_unlock(&vq->mutex); 931 932 if (pollstop && vq->handle_kick) 933 vhost_poll_flush(&vq->poll); 934 return r; 935 } 936 EXPORT_SYMBOL_GPL(vhost_vring_ioctl); 937 938 /* Caller must have device mutex */ 939 long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp) 940 { 941 struct file *eventfp, *filep = NULL; 942 struct eventfd_ctx *ctx = NULL; 943 u64 p; 944 long r; 945 int i, fd; 946 947 /* If you are not the owner, you can become one */ 948 if (ioctl == VHOST_SET_OWNER) { 949 r = vhost_dev_set_owner(d); 950 goto done; 951 } 952 953 /* You must be the owner to do anything else */ 954 r = vhost_dev_check_owner(d); 955 if (r) 956 goto done; 957 958 switch (ioctl) { 959 case VHOST_SET_MEM_TABLE: 960 r = vhost_set_memory(d, argp); 961 break; 962 case VHOST_SET_LOG_BASE: 963 if (copy_from_user(&p, argp, sizeof p)) { 964 r = -EFAULT; 965 break; 966 } 967 if ((u64)(unsigned long)p != p) { 968 r = -EFAULT; 969 break; 970 } 971 for (i = 0; i < d->nvqs; ++i) { 972 struct vhost_virtqueue *vq; 973 void __user *base = (void __user *)(unsigned long)p; 974 vq = d->vqs[i]; 975 mutex_lock(&vq->mutex); 976 /* If ring is inactive, will check when it's enabled. */ 977 if (vq->private_data && !vq_log_access_ok(vq, base)) 978 r = -EFAULT; 979 else 980 vq->log_base = base; 981 mutex_unlock(&vq->mutex); 982 } 983 break; 984 case VHOST_SET_LOG_FD: 985 r = get_user(fd, (int __user *)argp); 986 if (r < 0) 987 break; 988 eventfp = fd == -1 ? NULL : eventfd_fget(fd); 989 if (IS_ERR(eventfp)) { 990 r = PTR_ERR(eventfp); 991 break; 992 } 993 if (eventfp != d->log_file) { 994 filep = d->log_file; 995 d->log_file = eventfp; 996 ctx = d->log_ctx; 997 d->log_ctx = eventfp ? 998 eventfd_ctx_fileget(eventfp) : NULL; 999 } else 1000 filep = eventfp; 1001 for (i = 0; i < d->nvqs; ++i) { 1002 mutex_lock(&d->vqs[i]->mutex); 1003 d->vqs[i]->log_ctx = d->log_ctx; 1004 mutex_unlock(&d->vqs[i]->mutex); 1005 } 1006 if (ctx) 1007 eventfd_ctx_put(ctx); 1008 if (filep) 1009 fput(filep); 1010 break; 1011 default: 1012 r = -ENOIOCTLCMD; 1013 break; 1014 } 1015 done: 1016 return r; 1017 } 1018 EXPORT_SYMBOL_GPL(vhost_dev_ioctl); 1019 1020 static const struct vhost_memory_region *find_region(struct vhost_memory *mem, 1021 __u64 addr, __u32 len) 1022 { 1023 const struct vhost_memory_region *reg; 1024 int start = 0, end = mem->nregions; 1025 1026 while (start < end) { 1027 int slot = start + (end - start) / 2; 1028 reg = mem->regions + slot; 1029 if (addr >= reg->guest_phys_addr) 1030 end = slot; 1031 else 1032 start = slot + 1; 1033 } 1034 1035 reg = mem->regions + start; 1036 if (addr >= reg->guest_phys_addr && 1037 reg->guest_phys_addr + reg->memory_size > addr) 1038 return reg; 1039 return NULL; 1040 } 1041 1042 /* TODO: This is really inefficient. We need something like get_user() 1043 * (instruction directly accesses the data, with an exception table entry 1044 * returning -EFAULT). See Documentation/x86/exception-tables.txt. 1045 */ 1046 static int set_bit_to_user(int nr, void __user *addr) 1047 { 1048 unsigned long log = (unsigned long)addr; 1049 struct page *page; 1050 void *base; 1051 int bit = nr + (log % PAGE_SIZE) * 8; 1052 int r; 1053 1054 r = get_user_pages_fast(log, 1, 1, &page); 1055 if (r < 0) 1056 return r; 1057 BUG_ON(r != 1); 1058 base = kmap_atomic(page); 1059 set_bit(bit, base); 1060 kunmap_atomic(base); 1061 set_page_dirty_lock(page); 1062 put_page(page); 1063 return 0; 1064 } 1065 1066 static int log_write(void __user *log_base, 1067 u64 write_address, u64 write_length) 1068 { 1069 u64 write_page = write_address / VHOST_PAGE_SIZE; 1070 int r; 1071 1072 if (!write_length) 1073 return 0; 1074 write_length += write_address % VHOST_PAGE_SIZE; 1075 for (;;) { 1076 u64 base = (u64)(unsigned long)log_base; 1077 u64 log = base + write_page / 8; 1078 int bit = write_page % 8; 1079 if ((u64)(unsigned long)log != log) 1080 return -EFAULT; 1081 r = set_bit_to_user(bit, (void __user *)(unsigned long)log); 1082 if (r < 0) 1083 return r; 1084 if (write_length <= VHOST_PAGE_SIZE) 1085 break; 1086 write_length -= VHOST_PAGE_SIZE; 1087 write_page += 1; 1088 } 1089 return r; 1090 } 1091 1092 int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log, 1093 unsigned int log_num, u64 len) 1094 { 1095 int i, r; 1096 1097 /* Make sure data written is seen before log. */ 1098 smp_wmb(); 1099 for (i = 0; i < log_num; ++i) { 1100 u64 l = min(log[i].len, len); 1101 r = log_write(vq->log_base, log[i].addr, l); 1102 if (r < 0) 1103 return r; 1104 len -= l; 1105 if (!len) { 1106 if (vq->log_ctx) 1107 eventfd_signal(vq->log_ctx, 1); 1108 return 0; 1109 } 1110 } 1111 /* Length written exceeds what we have stored. This is a bug. */ 1112 BUG(); 1113 return 0; 1114 } 1115 EXPORT_SYMBOL_GPL(vhost_log_write); 1116 1117 static int vhost_update_used_flags(struct vhost_virtqueue *vq) 1118 { 1119 void __user *used; 1120 if (__put_user(cpu_to_vhost16(vq, vq->used_flags), &vq->used->flags) < 0) 1121 return -EFAULT; 1122 if (unlikely(vq->log_used)) { 1123 /* Make sure the flag is seen before log. */ 1124 smp_wmb(); 1125 /* Log used flag write. */ 1126 used = &vq->used->flags; 1127 log_write(vq->log_base, vq->log_addr + 1128 (used - (void __user *)vq->used), 1129 sizeof vq->used->flags); 1130 if (vq->log_ctx) 1131 eventfd_signal(vq->log_ctx, 1); 1132 } 1133 return 0; 1134 } 1135 1136 static int vhost_update_avail_event(struct vhost_virtqueue *vq, u16 avail_event) 1137 { 1138 if (__put_user(cpu_to_vhost16(vq, vq->avail_idx), vhost_avail_event(vq))) 1139 return -EFAULT; 1140 if (unlikely(vq->log_used)) { 1141 void __user *used; 1142 /* Make sure the event is seen before log. */ 1143 smp_wmb(); 1144 /* Log avail event write */ 1145 used = vhost_avail_event(vq); 1146 log_write(vq->log_base, vq->log_addr + 1147 (used - (void __user *)vq->used), 1148 sizeof *vhost_avail_event(vq)); 1149 if (vq->log_ctx) 1150 eventfd_signal(vq->log_ctx, 1); 1151 } 1152 return 0; 1153 } 1154 1155 int vhost_init_used(struct vhost_virtqueue *vq) 1156 { 1157 __virtio16 last_used_idx; 1158 int r; 1159 if (!vq->private_data) { 1160 vq->is_le = virtio_legacy_is_little_endian(); 1161 return 0; 1162 } 1163 1164 vhost_init_is_le(vq); 1165 1166 r = vhost_update_used_flags(vq); 1167 if (r) 1168 return r; 1169 vq->signalled_used_valid = false; 1170 if (!access_ok(VERIFY_READ, &vq->used->idx, sizeof vq->used->idx)) 1171 return -EFAULT; 1172 r = __get_user(last_used_idx, &vq->used->idx); 1173 if (r) 1174 return r; 1175 vq->last_used_idx = vhost16_to_cpu(vq, last_used_idx); 1176 return 0; 1177 } 1178 EXPORT_SYMBOL_GPL(vhost_init_used); 1179 1180 static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len, 1181 struct iovec iov[], int iov_size) 1182 { 1183 const struct vhost_memory_region *reg; 1184 struct vhost_memory *mem; 1185 struct iovec *_iov; 1186 u64 s = 0; 1187 int ret = 0; 1188 1189 mem = vq->memory; 1190 while ((u64)len > s) { 1191 u64 size; 1192 if (unlikely(ret >= iov_size)) { 1193 ret = -ENOBUFS; 1194 break; 1195 } 1196 reg = find_region(mem, addr, len); 1197 if (unlikely(!reg)) { 1198 ret = -EFAULT; 1199 break; 1200 } 1201 _iov = iov + ret; 1202 size = reg->memory_size - addr + reg->guest_phys_addr; 1203 _iov->iov_len = min((u64)len - s, size); 1204 _iov->iov_base = (void __user *)(unsigned long) 1205 (reg->userspace_addr + addr - reg->guest_phys_addr); 1206 s += size; 1207 addr += size; 1208 ++ret; 1209 } 1210 1211 return ret; 1212 } 1213 1214 /* Each buffer in the virtqueues is actually a chain of descriptors. This 1215 * function returns the next descriptor in the chain, 1216 * or -1U if we're at the end. */ 1217 static unsigned next_desc(struct vhost_virtqueue *vq, struct vring_desc *desc) 1218 { 1219 unsigned int next; 1220 1221 /* If this descriptor says it doesn't chain, we're done. */ 1222 if (!(desc->flags & cpu_to_vhost16(vq, VRING_DESC_F_NEXT))) 1223 return -1U; 1224 1225 /* Check they're not leading us off end of descriptors. */ 1226 next = vhost16_to_cpu(vq, desc->next); 1227 /* Make sure compiler knows to grab that: we don't want it changing! */ 1228 /* We will use the result as an index in an array, so most 1229 * architectures only need a compiler barrier here. */ 1230 read_barrier_depends(); 1231 1232 return next; 1233 } 1234 1235 static int get_indirect(struct vhost_virtqueue *vq, 1236 struct iovec iov[], unsigned int iov_size, 1237 unsigned int *out_num, unsigned int *in_num, 1238 struct vhost_log *log, unsigned int *log_num, 1239 struct vring_desc *indirect) 1240 { 1241 struct vring_desc desc; 1242 unsigned int i = 0, count, found = 0; 1243 u32 len = vhost32_to_cpu(vq, indirect->len); 1244 struct iov_iter from; 1245 int ret; 1246 1247 /* Sanity check */ 1248 if (unlikely(len % sizeof desc)) { 1249 vq_err(vq, "Invalid length in indirect descriptor: " 1250 "len 0x%llx not multiple of 0x%zx\n", 1251 (unsigned long long)len, 1252 sizeof desc); 1253 return -EINVAL; 1254 } 1255 1256 ret = translate_desc(vq, vhost64_to_cpu(vq, indirect->addr), len, vq->indirect, 1257 UIO_MAXIOV); 1258 if (unlikely(ret < 0)) { 1259 vq_err(vq, "Translation failure %d in indirect.\n", ret); 1260 return ret; 1261 } 1262 iov_iter_init(&from, READ, vq->indirect, ret, len); 1263 1264 /* We will use the result as an address to read from, so most 1265 * architectures only need a compiler barrier here. */ 1266 read_barrier_depends(); 1267 1268 count = len / sizeof desc; 1269 /* Buffers are chained via a 16 bit next field, so 1270 * we can have at most 2^16 of these. */ 1271 if (unlikely(count > USHRT_MAX + 1)) { 1272 vq_err(vq, "Indirect buffer length too big: %d\n", 1273 indirect->len); 1274 return -E2BIG; 1275 } 1276 1277 do { 1278 unsigned iov_count = *in_num + *out_num; 1279 if (unlikely(++found > count)) { 1280 vq_err(vq, "Loop detected: last one at %u " 1281 "indirect size %u\n", 1282 i, count); 1283 return -EINVAL; 1284 } 1285 if (unlikely(copy_from_iter(&desc, sizeof(desc), &from) != 1286 sizeof(desc))) { 1287 vq_err(vq, "Failed indirect descriptor: idx %d, %zx\n", 1288 i, (size_t)vhost64_to_cpu(vq, indirect->addr) + i * sizeof desc); 1289 return -EINVAL; 1290 } 1291 if (unlikely(desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_INDIRECT))) { 1292 vq_err(vq, "Nested indirect descriptor: idx %d, %zx\n", 1293 i, (size_t)vhost64_to_cpu(vq, indirect->addr) + i * sizeof desc); 1294 return -EINVAL; 1295 } 1296 1297 ret = translate_desc(vq, vhost64_to_cpu(vq, desc.addr), 1298 vhost32_to_cpu(vq, desc.len), iov + iov_count, 1299 iov_size - iov_count); 1300 if (unlikely(ret < 0)) { 1301 vq_err(vq, "Translation failure %d indirect idx %d\n", 1302 ret, i); 1303 return ret; 1304 } 1305 /* If this is an input descriptor, increment that count. */ 1306 if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_WRITE)) { 1307 *in_num += ret; 1308 if (unlikely(log)) { 1309 log[*log_num].addr = vhost64_to_cpu(vq, desc.addr); 1310 log[*log_num].len = vhost32_to_cpu(vq, desc.len); 1311 ++*log_num; 1312 } 1313 } else { 1314 /* If it's an output descriptor, they're all supposed 1315 * to come before any input descriptors. */ 1316 if (unlikely(*in_num)) { 1317 vq_err(vq, "Indirect descriptor " 1318 "has out after in: idx %d\n", i); 1319 return -EINVAL; 1320 } 1321 *out_num += ret; 1322 } 1323 } while ((i = next_desc(vq, &desc)) != -1); 1324 return 0; 1325 } 1326 1327 /* This looks in the virtqueue and for the first available buffer, and converts 1328 * it to an iovec for convenient access. Since descriptors consist of some 1329 * number of output then some number of input descriptors, it's actually two 1330 * iovecs, but we pack them into one and note how many of each there were. 1331 * 1332 * This function returns the descriptor number found, or vq->num (which is 1333 * never a valid descriptor number) if none was found. A negative code is 1334 * returned on error. */ 1335 int vhost_get_vq_desc(struct vhost_virtqueue *vq, 1336 struct iovec iov[], unsigned int iov_size, 1337 unsigned int *out_num, unsigned int *in_num, 1338 struct vhost_log *log, unsigned int *log_num) 1339 { 1340 struct vring_desc desc; 1341 unsigned int i, head, found = 0; 1342 u16 last_avail_idx; 1343 __virtio16 avail_idx; 1344 __virtio16 ring_head; 1345 int ret; 1346 1347 /* Check it isn't doing very strange things with descriptor numbers. */ 1348 last_avail_idx = vq->last_avail_idx; 1349 if (unlikely(__get_user(avail_idx, &vq->avail->idx))) { 1350 vq_err(vq, "Failed to access avail idx at %p\n", 1351 &vq->avail->idx); 1352 return -EFAULT; 1353 } 1354 vq->avail_idx = vhost16_to_cpu(vq, avail_idx); 1355 1356 if (unlikely((u16)(vq->avail_idx - last_avail_idx) > vq->num)) { 1357 vq_err(vq, "Guest moved used index from %u to %u", 1358 last_avail_idx, vq->avail_idx); 1359 return -EFAULT; 1360 } 1361 1362 /* If there's nothing new since last we looked, return invalid. */ 1363 if (vq->avail_idx == last_avail_idx) 1364 return vq->num; 1365 1366 /* Only get avail ring entries after they have been exposed by guest. */ 1367 smp_rmb(); 1368 1369 /* Grab the next descriptor number they're advertising, and increment 1370 * the index we've seen. */ 1371 if (unlikely(__get_user(ring_head, 1372 &vq->avail->ring[last_avail_idx % vq->num]))) { 1373 vq_err(vq, "Failed to read head: idx %d address %p\n", 1374 last_avail_idx, 1375 &vq->avail->ring[last_avail_idx % vq->num]); 1376 return -EFAULT; 1377 } 1378 1379 head = vhost16_to_cpu(vq, ring_head); 1380 1381 /* If their number is silly, that's an error. */ 1382 if (unlikely(head >= vq->num)) { 1383 vq_err(vq, "Guest says index %u > %u is available", 1384 head, vq->num); 1385 return -EINVAL; 1386 } 1387 1388 /* When we start there are none of either input nor output. */ 1389 *out_num = *in_num = 0; 1390 if (unlikely(log)) 1391 *log_num = 0; 1392 1393 i = head; 1394 do { 1395 unsigned iov_count = *in_num + *out_num; 1396 if (unlikely(i >= vq->num)) { 1397 vq_err(vq, "Desc index is %u > %u, head = %u", 1398 i, vq->num, head); 1399 return -EINVAL; 1400 } 1401 if (unlikely(++found > vq->num)) { 1402 vq_err(vq, "Loop detected: last one at %u " 1403 "vq size %u head %u\n", 1404 i, vq->num, head); 1405 return -EINVAL; 1406 } 1407 ret = __copy_from_user(&desc, vq->desc + i, sizeof desc); 1408 if (unlikely(ret)) { 1409 vq_err(vq, "Failed to get descriptor: idx %d addr %p\n", 1410 i, vq->desc + i); 1411 return -EFAULT; 1412 } 1413 if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_INDIRECT)) { 1414 ret = get_indirect(vq, iov, iov_size, 1415 out_num, in_num, 1416 log, log_num, &desc); 1417 if (unlikely(ret < 0)) { 1418 vq_err(vq, "Failure detected " 1419 "in indirect descriptor at idx %d\n", i); 1420 return ret; 1421 } 1422 continue; 1423 } 1424 1425 ret = translate_desc(vq, vhost64_to_cpu(vq, desc.addr), 1426 vhost32_to_cpu(vq, desc.len), iov + iov_count, 1427 iov_size - iov_count); 1428 if (unlikely(ret < 0)) { 1429 vq_err(vq, "Translation failure %d descriptor idx %d\n", 1430 ret, i); 1431 return ret; 1432 } 1433 if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_WRITE)) { 1434 /* If this is an input descriptor, 1435 * increment that count. */ 1436 *in_num += ret; 1437 if (unlikely(log)) { 1438 log[*log_num].addr = vhost64_to_cpu(vq, desc.addr); 1439 log[*log_num].len = vhost32_to_cpu(vq, desc.len); 1440 ++*log_num; 1441 } 1442 } else { 1443 /* If it's an output descriptor, they're all supposed 1444 * to come before any input descriptors. */ 1445 if (unlikely(*in_num)) { 1446 vq_err(vq, "Descriptor has out after in: " 1447 "idx %d\n", i); 1448 return -EINVAL; 1449 } 1450 *out_num += ret; 1451 } 1452 } while ((i = next_desc(vq, &desc)) != -1); 1453 1454 /* On success, increment avail index. */ 1455 vq->last_avail_idx++; 1456 1457 /* Assume notifications from guest are disabled at this point, 1458 * if they aren't we would need to update avail_event index. */ 1459 BUG_ON(!(vq->used_flags & VRING_USED_F_NO_NOTIFY)); 1460 return head; 1461 } 1462 EXPORT_SYMBOL_GPL(vhost_get_vq_desc); 1463 1464 /* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */ 1465 void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n) 1466 { 1467 vq->last_avail_idx -= n; 1468 } 1469 EXPORT_SYMBOL_GPL(vhost_discard_vq_desc); 1470 1471 /* After we've used one of their buffers, we tell them about it. We'll then 1472 * want to notify the guest, using eventfd. */ 1473 int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len) 1474 { 1475 struct vring_used_elem heads = { 1476 cpu_to_vhost32(vq, head), 1477 cpu_to_vhost32(vq, len) 1478 }; 1479 1480 return vhost_add_used_n(vq, &heads, 1); 1481 } 1482 EXPORT_SYMBOL_GPL(vhost_add_used); 1483 1484 static int __vhost_add_used_n(struct vhost_virtqueue *vq, 1485 struct vring_used_elem *heads, 1486 unsigned count) 1487 { 1488 struct vring_used_elem __user *used; 1489 u16 old, new; 1490 int start; 1491 1492 start = vq->last_used_idx % vq->num; 1493 used = vq->used->ring + start; 1494 if (count == 1) { 1495 if (__put_user(heads[0].id, &used->id)) { 1496 vq_err(vq, "Failed to write used id"); 1497 return -EFAULT; 1498 } 1499 if (__put_user(heads[0].len, &used->len)) { 1500 vq_err(vq, "Failed to write used len"); 1501 return -EFAULT; 1502 } 1503 } else if (__copy_to_user(used, heads, count * sizeof *used)) { 1504 vq_err(vq, "Failed to write used"); 1505 return -EFAULT; 1506 } 1507 if (unlikely(vq->log_used)) { 1508 /* Make sure data is seen before log. */ 1509 smp_wmb(); 1510 /* Log used ring entry write. */ 1511 log_write(vq->log_base, 1512 vq->log_addr + 1513 ((void __user *)used - (void __user *)vq->used), 1514 count * sizeof *used); 1515 } 1516 old = vq->last_used_idx; 1517 new = (vq->last_used_idx += count); 1518 /* If the driver never bothers to signal in a very long while, 1519 * used index might wrap around. If that happens, invalidate 1520 * signalled_used index we stored. TODO: make sure driver 1521 * signals at least once in 2^16 and remove this. */ 1522 if (unlikely((u16)(new - vq->signalled_used) < (u16)(new - old))) 1523 vq->signalled_used_valid = false; 1524 return 0; 1525 } 1526 1527 /* After we've used one of their buffers, we tell them about it. We'll then 1528 * want to notify the guest, using eventfd. */ 1529 int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads, 1530 unsigned count) 1531 { 1532 int start, n, r; 1533 1534 start = vq->last_used_idx % vq->num; 1535 n = vq->num - start; 1536 if (n < count) { 1537 r = __vhost_add_used_n(vq, heads, n); 1538 if (r < 0) 1539 return r; 1540 heads += n; 1541 count -= n; 1542 } 1543 r = __vhost_add_used_n(vq, heads, count); 1544 1545 /* Make sure buffer is written before we update index. */ 1546 smp_wmb(); 1547 if (__put_user(cpu_to_vhost16(vq, vq->last_used_idx), &vq->used->idx)) { 1548 vq_err(vq, "Failed to increment used idx"); 1549 return -EFAULT; 1550 } 1551 if (unlikely(vq->log_used)) { 1552 /* Log used index update. */ 1553 log_write(vq->log_base, 1554 vq->log_addr + offsetof(struct vring_used, idx), 1555 sizeof vq->used->idx); 1556 if (vq->log_ctx) 1557 eventfd_signal(vq->log_ctx, 1); 1558 } 1559 return r; 1560 } 1561 EXPORT_SYMBOL_GPL(vhost_add_used_n); 1562 1563 static bool vhost_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq) 1564 { 1565 __u16 old, new; 1566 __virtio16 event; 1567 bool v; 1568 /* Flush out used index updates. This is paired 1569 * with the barrier that the Guest executes when enabling 1570 * interrupts. */ 1571 smp_mb(); 1572 1573 if (vhost_has_feature(vq, VIRTIO_F_NOTIFY_ON_EMPTY) && 1574 unlikely(vq->avail_idx == vq->last_avail_idx)) 1575 return true; 1576 1577 if (!vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX)) { 1578 __virtio16 flags; 1579 if (__get_user(flags, &vq->avail->flags)) { 1580 vq_err(vq, "Failed to get flags"); 1581 return true; 1582 } 1583 return !(flags & cpu_to_vhost16(vq, VRING_AVAIL_F_NO_INTERRUPT)); 1584 } 1585 old = vq->signalled_used; 1586 v = vq->signalled_used_valid; 1587 new = vq->signalled_used = vq->last_used_idx; 1588 vq->signalled_used_valid = true; 1589 1590 if (unlikely(!v)) 1591 return true; 1592 1593 if (__get_user(event, vhost_used_event(vq))) { 1594 vq_err(vq, "Failed to get used event idx"); 1595 return true; 1596 } 1597 return vring_need_event(vhost16_to_cpu(vq, event), new, old); 1598 } 1599 1600 /* This actually signals the guest, using eventfd. */ 1601 void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq) 1602 { 1603 /* Signal the Guest tell them we used something up. */ 1604 if (vq->call_ctx && vhost_notify(dev, vq)) 1605 eventfd_signal(vq->call_ctx, 1); 1606 } 1607 EXPORT_SYMBOL_GPL(vhost_signal); 1608 1609 /* And here's the combo meal deal. Supersize me! */ 1610 void vhost_add_used_and_signal(struct vhost_dev *dev, 1611 struct vhost_virtqueue *vq, 1612 unsigned int head, int len) 1613 { 1614 vhost_add_used(vq, head, len); 1615 vhost_signal(dev, vq); 1616 } 1617 EXPORT_SYMBOL_GPL(vhost_add_used_and_signal); 1618 1619 /* multi-buffer version of vhost_add_used_and_signal */ 1620 void vhost_add_used_and_signal_n(struct vhost_dev *dev, 1621 struct vhost_virtqueue *vq, 1622 struct vring_used_elem *heads, unsigned count) 1623 { 1624 vhost_add_used_n(vq, heads, count); 1625 vhost_signal(dev, vq); 1626 } 1627 EXPORT_SYMBOL_GPL(vhost_add_used_and_signal_n); 1628 1629 /* OK, now we need to know about added descriptors. */ 1630 bool vhost_enable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq) 1631 { 1632 __virtio16 avail_idx; 1633 int r; 1634 1635 if (!(vq->used_flags & VRING_USED_F_NO_NOTIFY)) 1636 return false; 1637 vq->used_flags &= ~VRING_USED_F_NO_NOTIFY; 1638 if (!vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX)) { 1639 r = vhost_update_used_flags(vq); 1640 if (r) { 1641 vq_err(vq, "Failed to enable notification at %p: %d\n", 1642 &vq->used->flags, r); 1643 return false; 1644 } 1645 } else { 1646 r = vhost_update_avail_event(vq, vq->avail_idx); 1647 if (r) { 1648 vq_err(vq, "Failed to update avail event index at %p: %d\n", 1649 vhost_avail_event(vq), r); 1650 return false; 1651 } 1652 } 1653 /* They could have slipped one in as we were doing that: make 1654 * sure it's written, then check again. */ 1655 smp_mb(); 1656 r = __get_user(avail_idx, &vq->avail->idx); 1657 if (r) { 1658 vq_err(vq, "Failed to check avail idx at %p: %d\n", 1659 &vq->avail->idx, r); 1660 return false; 1661 } 1662 1663 return vhost16_to_cpu(vq, avail_idx) != vq->avail_idx; 1664 } 1665 EXPORT_SYMBOL_GPL(vhost_enable_notify); 1666 1667 /* We don't need to be notified again. */ 1668 void vhost_disable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq) 1669 { 1670 int r; 1671 1672 if (vq->used_flags & VRING_USED_F_NO_NOTIFY) 1673 return; 1674 vq->used_flags |= VRING_USED_F_NO_NOTIFY; 1675 if (!vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX)) { 1676 r = vhost_update_used_flags(vq); 1677 if (r) 1678 vq_err(vq, "Failed to enable notification at %p: %d\n", 1679 &vq->used->flags, r); 1680 } 1681 } 1682 EXPORT_SYMBOL_GPL(vhost_disable_notify); 1683 1684 static int __init vhost_init(void) 1685 { 1686 return 0; 1687 } 1688 1689 static void __exit vhost_exit(void) 1690 { 1691 } 1692 1693 module_init(vhost_init); 1694 module_exit(vhost_exit); 1695 1696 MODULE_VERSION("0.0.1"); 1697 MODULE_LICENSE("GPL v2"); 1698 MODULE_AUTHOR("Michael S. Tsirkin"); 1699 MODULE_DESCRIPTION("Host kernel accelerator for virtio"); 1700