/*
 * vhost shadow virtqueue
 *
 * SPDX-FileCopyrightText: Red Hat, Inc. 2021
 * SPDX-FileContributor: Author: Eugenio Pérez <eperezma@redhat.com>
 *
 * SPDX-License-Identifier: GPL-2.0-or-later
 */

#include "qemu/osdep.h"
#include "hw/virtio/vhost-shadow-virtqueue.h"

#include "qemu/error-report.h"
#include "qapi/error.h"
#include "qemu/main-loop.h"
#include "qemu/log.h"
#include "qemu/memalign.h"
#include "linux-headers/linux/vhost.h"

/**
 * Validate the transport device features that both the guest can use with the
 * SVQ and the SVQ can use with the device.
 *
 * @features: The features to validate
 * @errp: Error pointer
 */
bool vhost_svq_valid_features(uint64_t features, Error **errp)
{
    bool ok = true;
    uint64_t svq_features = features;

    for (uint64_t b = VIRTIO_TRANSPORT_F_START; b <= VIRTIO_TRANSPORT_F_END;
         ++b) {
        switch (b) {
        case VIRTIO_F_ANY_LAYOUT:
            continue;

        case VIRTIO_F_ACCESS_PLATFORM:
            /* SVQ trusts the host's IOMMU to translate addresses */
        case VIRTIO_F_VERSION_1:
            /* SVQ trusts that the guest vring is little endian */
            if (!(svq_features & BIT_ULL(b))) {
                svq_features |= BIT_ULL(b);
                ok = false;
            }
            continue;

        default:
            if (svq_features & BIT_ULL(b)) {
                svq_features &= ~BIT_ULL(b);
                ok = false;
            }
        }
    }

    if (!ok) {
        error_setg(errp, "SVQ Invalid device feature flags, offer: 0x%"PRIx64
                   ", ok: 0x%"PRIx64, features, svq_features);
    }
    return ok;
}

/**
 * Number of descriptors that the SVQ can make available from the guest.
 *
 * @svq: The svq
 */
static uint16_t vhost_svq_available_slots(const VhostShadowVirtqueue *svq)
{
    return svq->vring.num - (svq->shadow_avail_idx - svq->shadow_used_idx);
}

/**
 * Translate addresses from qemu's virtual address space to the SVQ IOVA space
 *
 * @svq: Shadow VirtQueue
 * @addrs: Destination array for the translated IOVA addresses
 * @iovec: Source iovec with qemu's VA addresses
 * @num: Length of @iovec and minimum length of @addrs
 */
static bool vhost_svq_translate_addr(const VhostShadowVirtqueue *svq,
                                     hwaddr *addrs, const struct iovec *iovec,
                                     size_t num)
{
    if (num == 0) {
        return true;
    }

    for (size_t i = 0; i < num; ++i) {
        DMAMap needle = {
            .translated_addr = (hwaddr)(uintptr_t)iovec[i].iov_base,
            .size = iovec[i].iov_len,
        };
        Int128 needle_last, map_last;
        size_t off;

        const DMAMap *map = vhost_iova_tree_find_iova(svq->iova_tree, &needle);
        /*
         * Map cannot be NULL since the iova map contains all guest space and
         * qemu already has a physical address mapped
         */
        if (unlikely(!map)) {
            qemu_log_mask(LOG_GUEST_ERROR,
                          "Invalid address 0x%"HWADDR_PRIx" given by guest",
                          needle.translated_addr);
            return false;
        }

        off = needle.translated_addr - map->translated_addr;
        addrs[i] = map->iova + off;

        /* DMAMap.size is inclusive, so the last valid byte is addr + size */
        needle_last = int128_add(int128_make64(needle.translated_addr),
                                 int128_makes64(iovec[i].iov_len - 1));
        map_last = int128_make64(map->translated_addr + map->size);
        if (unlikely(int128_gt(needle_last, map_last))) {
            qemu_log_mask(LOG_GUEST_ERROR,
                          "Guest buffer expands over iova range");
            return false;
        }
    }

    return true;
}
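
/*
 * Worked example of the translation above (illustrative values only): with a
 * map of { .translated_addr = 0x7f4200000000, .size = 0xffff,
 * .iova = 0x10000 }, an iovec entry { .iov_base = (void *)0x7f4200000200,
 * .iov_len = 0x100 } gives off = 0x200 and addrs[i] = 0x10200.  Its last
 * byte, VA 0x7f42000002ff, does not go past the map's last byte
 * 0x7f420000ffff, so the translation succeeds.
 */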
/**
 * Write descriptors to the SVQ vring
 *
 * @svq: The shadow virtqueue
 * @sg: Scratch storage for the translated addresses (hwaddr)
 * @iovec: The iovec from the guest
 * @num: iovec length
 * @more_descs: True if more descriptors come in the chain
 * @write: True if they are writeable descriptors
 *
 * Returns true on success, false otherwise after logging the error.
 */
static bool vhost_svq_vring_write_descs(VhostShadowVirtqueue *svq, hwaddr *sg,
                                        const struct iovec *iovec, size_t num,
                                        bool more_descs, bool write)
{
    uint16_t i = svq->free_head, last = svq->free_head;
    unsigned n;
    uint16_t flags = write ? cpu_to_le16(VRING_DESC_F_WRITE) : 0;
    vring_desc_t *descs = svq->vring.desc;
    bool ok;

    if (num == 0) {
        return true;
    }

    ok = vhost_svq_translate_addr(svq, sg, iovec, num);
    if (unlikely(!ok)) {
        return false;
    }

    for (n = 0; n < num; n++) {
        if (more_descs || (n + 1 < num)) {
            descs[i].flags = flags | cpu_to_le16(VRING_DESC_F_NEXT);
            descs[i].next = cpu_to_le16(svq->desc_next[i]);
        } else {
            descs[i].flags = flags;
        }
        descs[i].addr = cpu_to_le64(sg[n]);
        descs[i].len = cpu_to_le32(iovec[n].iov_len);

        last = i;
        i = svq->desc_next[i];
    }

    svq->free_head = svq->desc_next[last];
    return true;
}

static bool vhost_svq_add_split(VhostShadowVirtqueue *svq,
                                VirtQueueElement *elem, unsigned *head)
{
    unsigned avail_idx;
    vring_avail_t *avail = svq->vring.avail;
    bool ok;
    g_autofree hwaddr *sgs = g_new(hwaddr, MAX(elem->out_num, elem->in_num));

    *head = svq->free_head;

    /* We need some descriptors here */
    if (unlikely(!elem->out_num && !elem->in_num)) {
        qemu_log_mask(LOG_GUEST_ERROR,
                      "Guest provided element with no descriptors");
        return false;
    }

    ok = vhost_svq_vring_write_descs(svq, sgs, elem->out_sg, elem->out_num,
                                     elem->in_num > 0, false);
    if (unlikely(!ok)) {
        return false;
    }

    ok = vhost_svq_vring_write_descs(svq, sgs, elem->in_sg, elem->in_num, false,
                                     true);
    if (unlikely(!ok)) {
        return false;
    }

    /*
     * Put the entry in the available array, but don't update avail->idx until
     * after the memory barrier below.
     */
    avail_idx = svq->shadow_avail_idx & (svq->vring.num - 1);
    avail->ring[avail_idx] = cpu_to_le16(*head);
    svq->shadow_avail_idx++;

    /* Update the avail index only after writing the descriptors */
    smp_wmb();
    avail->idx = cpu_to_le16(svq->shadow_avail_idx);

    return true;
}

/**
 * Add an element to an SVQ.
 *
 * The caller must check that there are enough slots for the new element. This
 * function takes ownership of the element: in case of failure it is freed and
 * the SVQ is considered broken.
 */
static bool vhost_svq_add(VhostShadowVirtqueue *svq, VirtQueueElement *elem)
{
    unsigned qemu_head;
    bool ok = vhost_svq_add_split(svq, elem, &qemu_head);
    if (unlikely(!ok)) {
        g_free(elem);
        return false;
    }

    svq->ring_id_maps[qemu_head] = elem;
    return true;
}

static void vhost_svq_kick(VhostShadowVirtqueue *svq)
{
    /*
     * We need to expose the available array entries before checking the used
     * flags
     */
    smp_mb();
    if (le16_to_cpu(svq->vring.used->flags) & VRING_USED_F_NO_NOTIFY) {
        return;
    }

    event_notifier_set(&svq->hdev_kick);
}
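
/*
 * Ordering sketch of the writer side above (illustrative): the descriptors
 * are written first, then the avail ring entry; smp_wmb() orders those
 * stores before the avail->idx update; smp_mb() in vhost_svq_kick() orders
 * the avail->idx store before the used->flags load.  This is the barrier
 * pairing the split virtqueue memory model requires to avoid losing
 * notifications.
 */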
/**
 * Forward available buffers.
 *
 * @svq: Shadow VirtQueue
 *
 * Note that this function does not guarantee that all the guest's available
 * buffers are available to the device in the SVQ avail ring. The guest may
 * have exposed a GPA / GIOVA contiguous buffer, but it may not be contiguous
 * in qemu's VA.
 *
 * If that happens, the guest's kick notifications will be disabled until the
 * device uses some buffers.
 */
static void vhost_handle_guest_kick(VhostShadowVirtqueue *svq)
{
    /* Clear event notifier */
    event_notifier_test_and_clear(&svq->svq_kick);

    /* Forward to the device as many available buffers as possible */
    do {
        virtio_queue_set_notification(svq->vq, false);

        while (true) {
            VirtQueueElement *elem;
            bool ok;

            if (svq->next_guest_avail_elem) {
                elem = g_steal_pointer(&svq->next_guest_avail_elem);
            } else {
                elem = virtqueue_pop(svq->vq, sizeof(*elem));
            }

            if (!elem) {
                break;
            }

            if (elem->out_num + elem->in_num > vhost_svq_available_slots(svq)) {
                /*
                 * This condition is possible since a contiguous buffer in GPA
                 * does not imply a contiguous buffer in qemu's VA
                 * scatter-gather segments. If that happens, the buffer exposed
                 * to the device needs to be a chain of descriptors at this
                 * moment.
                 *
                 * SVQ cannot hold more available buffers if we are here:
                 * queue the current guest descriptor and ignore further kicks
                 * until some elements are used.
                 */
                svq->next_guest_avail_elem = elem;
                return;
            }

            ok = vhost_svq_add(svq, elem);
            if (unlikely(!ok)) {
                /* VQ is broken, just return and ignore any other kicks */
                return;
            }
            vhost_svq_kick(svq);
        }

        virtio_queue_set_notification(svq->vq, true);
    } while (!virtio_queue_empty(svq->vq));
}

/**
 * Handle the guest's kick.
 *
 * @n: guest kick event notifier, the one the guest set to notify the SVQ.
 */
static void vhost_handle_guest_kick_notifier(EventNotifier *n)
{
    VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue, svq_kick);
    event_notifier_test_and_clear(n);
    vhost_handle_guest_kick(svq);
}

static bool vhost_svq_more_used(VhostShadowVirtqueue *svq)
{
    if (svq->last_used_idx != svq->shadow_used_idx) {
        return true;
    }

    svq->shadow_used_idx = le16_to_cpu(svq->vring.used->idx);

    return svq->last_used_idx != svq->shadow_used_idx;
}
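
/*
 * Usage sketch for the notification helpers below (illustrative; it mirrors
 * what vhost_svq_flush does):
 *
 *   do {
 *       vhost_svq_disable_notification(svq);
 *       while ((elem = vhost_svq_get_buf(svq, &len))) {
 *           ... return elem to the guest ...
 *       }
 *   } while (!vhost_svq_enable_notification(svq));
 *
 * Re-checking for used buffers after enabling notifications closes the
 * window where the device marks a buffer as used between the last check and
 * the flag write.
 */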
/**
 * Enable vhost device calls after disabling them.
 *
 * @svq: The svq
 *
 * Returns false if there are pending used buffers from the vhost device,
 * avoiding the possible race between SVQ checking for more work and enabling
 * callbacks; true if the SVQ used vring has no more pending buffers.
 */
static bool vhost_svq_enable_notification(VhostShadowVirtqueue *svq)
{
    svq->vring.avail->flags &= ~cpu_to_le16(VRING_AVAIL_F_NO_INTERRUPT);
    /* Make sure the flag is written before the read of used_idx */
    smp_mb();
    return !vhost_svq_more_used(svq);
}

static void vhost_svq_disable_notification(VhostShadowVirtqueue *svq)
{
    svq->vring.avail->flags |= cpu_to_le16(VRING_AVAIL_F_NO_INTERRUPT);
}

static uint16_t vhost_svq_last_desc_of_chain(const VhostShadowVirtqueue *svq,
                                             uint16_t num, uint16_t i)
{
    for (uint16_t j = 0; j < (num - 1); ++j) {
        i = svq->desc_next[i];
    }

    return i;
}

static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq,
                                           uint32_t *len)
{
    const vring_used_t *used = svq->vring.used;
    vring_used_elem_t used_elem;
    uint16_t last_used, last_used_chain, num;

    if (!vhost_svq_more_used(svq)) {
        return NULL;
    }

    /* Only get used array entries after they have been exposed by dev */
    smp_rmb();
    last_used = svq->last_used_idx & (svq->vring.num - 1);
    used_elem.id = le32_to_cpu(used->ring[last_used].id);
    used_elem.len = le32_to_cpu(used->ring[last_used].len);

    svq->last_used_idx++;
    if (unlikely(used_elem.id >= svq->vring.num)) {
        qemu_log_mask(LOG_GUEST_ERROR,
                      "Device %s says index %u is used, which is out of range",
                      svq->vdev->name, used_elem.id);
        return NULL;
    }

    if (unlikely(!svq->ring_id_maps[used_elem.id])) {
        qemu_log_mask(LOG_GUEST_ERROR,
                      "Device %s says index %u is used, but it was not available",
                      svq->vdev->name, used_elem.id);
        return NULL;
    }

    num = svq->ring_id_maps[used_elem.id]->in_num +
          svq->ring_id_maps[used_elem.id]->out_num;
    /* Return the whole descriptor chain to the free list */
    last_used_chain = vhost_svq_last_desc_of_chain(svq, num, used_elem.id);
    svq->desc_next[last_used_chain] = svq->free_head;
    svq->free_head = used_elem.id;

    *len = used_elem.len;
    return g_steal_pointer(&svq->ring_id_maps[used_elem.id]);
}

static void vhost_svq_flush(VhostShadowVirtqueue *svq,
                            bool check_for_avail_queue)
{
    VirtQueue *vq = svq->vq;

    /* Forward as many used buffers as possible. */
    do {
        unsigned i = 0;

        vhost_svq_disable_notification(svq);
        while (true) {
            uint32_t len;
            g_autofree VirtQueueElement *elem = vhost_svq_get_buf(svq, &len);
            if (!elem) {
                break;
            }

            if (unlikely(i >= svq->vring.num)) {
                qemu_log_mask(LOG_GUEST_ERROR,
                              "More than %u used buffers obtained in a %u size SVQ",
                              i, svq->vring.num);
                virtqueue_fill(vq, elem, len, i);
                virtqueue_flush(vq, i);
                return;
            }
            virtqueue_fill(vq, elem, len, i++);
        }

        virtqueue_flush(vq, i);
        event_notifier_set(&svq->svq_call);

        if (check_for_avail_queue && svq->next_guest_avail_elem) {
            /*
             * The avail ring was full when vhost_svq_flush was called, so
             * it's a good moment to make more descriptors available if
             * possible.
             */
            vhost_handle_guest_kick(svq);
        }
    } while (!vhost_svq_enable_notification(svq));
}
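
/*
 * Free list example for the recycling above (illustrative, num = 4): after
 * making the two-descriptor chain 0 -> 1 available, free_head is 2.  When
 * the device reports id 0 as used, vhost_svq_last_desc_of_chain() walks the
 * chain to descriptor 1, desc_next[1] is pointed at the old free_head (2)
 * and free_head becomes 0 again, so the full list 0 -> 1 -> 2 -> 3 is
 * reusable.
 */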
/**
 * Forward used buffers.
 *
 * @n: hdev call event notifier, the one the device set to notify the SVQ.
 *
 * Note that we are not making any buffers available in the loop, so there is
 * no way it runs more than virtqueue-size times.
 */
static void vhost_svq_handle_call(EventNotifier *n)
{
    VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue,
                                             hdev_call);
    event_notifier_test_and_clear(n);
    vhost_svq_flush(svq, true);
}

/**
 * Set the call notifier for the SVQ to call the guest
 *
 * @svq: Shadow virtqueue
 * @call_fd: call notifier
 *
 * Called on BQL context.
 */
void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd)
{
    if (call_fd == VHOST_FILE_UNBIND) {
        /*
         * Make event_notifier_set fail if it is called while handling a
         * device call.
         *
         * The SVQ still needs device notifications, since it needs to keep
         * forwarding used buffers even with the unbind.
         */
        memset(&svq->svq_call, 0, sizeof(svq->svq_call));
    } else {
        event_notifier_init_fd(&svq->svq_call, call_fd);
    }
}

/**
 * Get the shadow vq vring address.
 * @svq: Shadow virtqueue
 * @addr: Destination to store the address
 */
void vhost_svq_get_vring_addr(const VhostShadowVirtqueue *svq,
                              struct vhost_vring_addr *addr)
{
    addr->desc_user_addr = (uint64_t)(uintptr_t)svq->vring.desc;
    addr->avail_user_addr = (uint64_t)(uintptr_t)svq->vring.avail;
    addr->used_user_addr = (uint64_t)(uintptr_t)svq->vring.used;
}

size_t vhost_svq_driver_area_size(const VhostShadowVirtqueue *svq)
{
    size_t desc_size = sizeof(vring_desc_t) * svq->vring.num;
    size_t avail_size = offsetof(vring_avail_t, ring) +
                        sizeof(uint16_t) * svq->vring.num;

    return ROUND_UP(desc_size + avail_size, qemu_real_host_page_size());
}

size_t vhost_svq_device_area_size(const VhostShadowVirtqueue *svq)
{
    size_t used_size = offsetof(vring_used_t, ring) +
                       sizeof(vring_used_elem_t) * svq->vring.num;
    return ROUND_UP(used_size, qemu_real_host_page_size());
}
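
/*
 * Sizing example for the helpers above (illustrative, assuming 4 KiB host
 * pages and a 256-entry ring): the driver area needs 256 * 16 bytes of
 * descriptors plus 4 + 256 * 2 bytes of avail ring, 4612 bytes in total,
 * rounded up to 8192; the device area needs 4 + 256 * 8 = 2052 bytes,
 * rounded up to 4096.
 */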
/**
 * Set a new file descriptor for the guest to kick the SVQ and notify for avail
 *
 * @svq: The svq
 * @svq_kick_fd: The svq kick fd
 *
 * Note that the SVQ will never close the old file descriptor.
 */
void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd)
{
    EventNotifier *svq_kick = &svq->svq_kick;
    bool poll_stop = VHOST_FILE_UNBIND != event_notifier_get_fd(svq_kick);
    bool poll_start = svq_kick_fd != VHOST_FILE_UNBIND;

    if (poll_stop) {
        event_notifier_set_handler(svq_kick, NULL);
    }

    /*
     * event_notifier_set_handler already checks for guest notifications that
     * arrive at the new file descriptor during the switch, so there is no
     * need to check for them explicitly.
     */
    if (poll_start) {
        event_notifier_init_fd(svq_kick, svq_kick_fd);
        event_notifier_set(svq_kick);
        event_notifier_set_handler(svq_kick, vhost_handle_guest_kick_notifier);
    }
}

/**
 * Start the shadow virtqueue operation.
 *
 * @svq: Shadow Virtqueue
 * @vdev: VirtIO device
 * @vq: Virtqueue to shadow
 */
void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev,
                     VirtQueue *vq)
{
    size_t desc_size, driver_size, device_size;

    svq->next_guest_avail_elem = NULL;
    svq->shadow_avail_idx = 0;
    svq->shadow_used_idx = 0;
    svq->last_used_idx = 0;
    svq->vdev = vdev;
    svq->vq = vq;

    svq->vring.num = virtio_queue_get_num(vdev, virtio_get_queue_index(vq));
    driver_size = vhost_svq_driver_area_size(svq);
    device_size = vhost_svq_device_area_size(svq);
    svq->vring.desc = qemu_memalign(qemu_real_host_page_size(), driver_size);
    desc_size = sizeof(vring_desc_t) * svq->vring.num;
    svq->vring.avail = (void *)((char *)svq->vring.desc + desc_size);
    memset(svq->vring.desc, 0, driver_size);
    svq->vring.used = qemu_memalign(qemu_real_host_page_size(), device_size);
    memset(svq->vring.used, 0, device_size);
    svq->ring_id_maps = g_new0(VirtQueueElement *, svq->vring.num);
    svq->desc_next = g_new0(uint16_t, svq->vring.num);
    /*
     * desc_next is kept in host endianness; it is converted to little endian
     * only when written to the device-visible descriptor ring.
     */
    for (unsigned i = 0; i < svq->vring.num - 1; i++) {
        svq->desc_next[i] = i + 1;
    }
}

/**
 * Stop the shadow virtqueue operation.
 * @svq: Shadow Virtqueue
 */
void vhost_svq_stop(VhostShadowVirtqueue *svq)
{
    g_autofree VirtQueueElement *next_avail_elem = NULL;

    event_notifier_set_handler(&svq->svq_kick, NULL);

    if (!svq->vq) {
        return;
    }

    /* Send all pending used descriptors to the guest */
    vhost_svq_flush(svq, false);

    for (unsigned i = 0; i < svq->vring.num; ++i) {
        g_autofree VirtQueueElement *elem = NULL;
        elem = g_steal_pointer(&svq->ring_id_maps[i]);
        if (elem) {
            virtqueue_detach_element(svq->vq, elem, 0);
        }
    }

    next_avail_elem = g_steal_pointer(&svq->next_guest_avail_elem);
    if (next_avail_elem) {
        virtqueue_detach_element(svq->vq, next_avail_elem, 0);
    }
    svq->vq = NULL;
    g_free(svq->desc_next);
    g_free(svq->ring_id_maps);
    qemu_vfree(svq->vring.desc);
    qemu_vfree(svq->vring.used);
}

/**
 * Creates a vhost shadow virtqueue, and instructs the vhost device to use the
 * shadow methods and file descriptors.
 *
 * @iova_tree: Tree to perform descriptor translations
 *
 * Returns the new virtqueue or NULL.
 *
 * In case of error, the reason is reported through error_report.
 */
VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree)
{
    g_autofree VhostShadowVirtqueue *svq = g_new0(VhostShadowVirtqueue, 1);
    int r;

    r = event_notifier_init(&svq->hdev_kick, 0);
    if (r != 0) {
        error_report("Couldn't create kick event notifier: %s (%d)",
                     g_strerror(errno), errno);
        goto err_init_hdev_kick;
    }

    r = event_notifier_init(&svq->hdev_call, 0);
    if (r != 0) {
        error_report("Couldn't create call event notifier: %s (%d)",
                     g_strerror(errno), errno);
        goto err_init_hdev_call;
    }

    event_notifier_init_fd(&svq->svq_kick, VHOST_FILE_UNBIND);
    event_notifier_set_handler(&svq->hdev_call, vhost_svq_handle_call);
    svq->iova_tree = iova_tree;
    return g_steal_pointer(&svq);

err_init_hdev_call:
    event_notifier_cleanup(&svq->hdev_kick);

err_init_hdev_kick:
    return NULL;
}
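
/*
 * Typical lifecycle, as a sketch (illustrative; the real wiring lives in the
 * vhost backend that owns the SVQ and may differ):
 *
 *   VhostShadowVirtqueue *svq = vhost_svq_new(iova_tree);
 *   vhost_svq_start(svq, vdev, vq);
 *   vhost_svq_set_svq_kick_fd(svq, guest_kick_fd);
 *   vhost_svq_set_svq_call_fd(svq, guest_call_fd);
 *   ...
 *   vhost_svq_stop(svq);
 *   vhost_svq_free(svq);
 */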
/**
 * Free the resources of the shadow virtqueue.
 *
 * @pvq: gpointer to SVQ so it can be used by autofree functions.
 */
void vhost_svq_free(gpointer pvq)
{
    VhostShadowVirtqueue *vq = pvq;
    vhost_svq_stop(vq);
    event_notifier_cleanup(&vq->hdev_kick);
    event_notifier_set_handler(&vq->hdev_call, NULL);
    event_notifier_cleanup(&vq->hdev_call);
    g_free(vq);
}