/*
 * KVMGT - the implementation of Intel mediated pass-through framework for KVM
 *
 * Copyright(c) 2014-2016 Intel Corporation. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Kevin Tian <kevin.tian@intel.com>
 *    Jike Song <jike.song@intel.com>
 *    Xiaoguang Chen <xiaoguang.chen@intel.com>
 */

#include <linux/init.h>
#include <linux/device.h>
#include <linux/mm.h>
#include <linux/mmu_context.h>
#include <linux/sched/mm.h>
#include <linux/types.h>
#include <linux/list.h>
#include <linux/rbtree.h>
#include <linux/spinlock.h>
#include <linux/eventfd.h>
#include <linux/uuid.h>
#include <linux/kvm_host.h>
#include <linux/vfio.h>
#include <linux/mdev.h>
#include <linux/debugfs.h>

#include <linux/nospec.h>

#include "i915_drv.h"
#include "gvt.h"

static const struct intel_gvt_ops *intel_gvt_ops;

/* helper macros copied from vfio-pci */
#define VFIO_PCI_OFFSET_SHIFT   40
#define VFIO_PCI_OFFSET_TO_INDEX(off)   (off >> VFIO_PCI_OFFSET_SHIFT)
#define VFIO_PCI_INDEX_TO_OFFSET(index) ((u64)(index) << VFIO_PCI_OFFSET_SHIFT)
#define VFIO_PCI_OFFSET_MASK    (((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1)

#define OPREGION_SIGNATURE "IntelGraphicsMem"

struct vfio_region;
struct intel_vgpu_regops {
	size_t (*rw)(struct intel_vgpu *vgpu, char *buf,
			size_t count, loff_t *ppos, bool iswrite);
	void (*release)(struct intel_vgpu *vgpu,
			struct vfio_region *region);
};

struct vfio_region {
	u32				type;
	u32				subtype;
	size_t				size;
	u32				flags;
	const struct intel_vgpu_regops	*ops;
	void				*data;
};

struct kvmgt_pgfn {
	gfn_t gfn;
	struct hlist_node hnode;
};

struct kvmgt_guest_info {
	struct kvm *kvm;
	struct intel_vgpu *vgpu;
	struct kvm_page_track_notifier_node track_node;
#define NR_BKT (1 << 18)
	struct hlist_head ptable[NR_BKT];
#undef NR_BKT
	struct dentry *debugfs_cache_entries;
};

struct gvt_dma {
	struct intel_vgpu *vgpu;
	struct rb_node gfn_node;
	struct rb_node dma_addr_node;
	gfn_t gfn;
	dma_addr_t dma_addr;
	unsigned long size;
	struct kref ref;
};

static inline bool handle_valid(unsigned long handle)
{
	return !!(handle & ~0xff);
}

static int kvmgt_guest_init(struct mdev_device *mdev);
static void intel_vgpu_release_work(struct work_struct *work);
static bool kvmgt_guest_exit(struct kvmgt_guest_info *info);

static void gvt_unpin_guest_page(struct intel_vgpu *vgpu, unsigned long gfn,
		unsigned long size)
{
	int total_pages;
	int npage;
	int ret;

	total_pages = roundup(size, PAGE_SIZE) / PAGE_SIZE;

	for (npage = 0; npage < total_pages; npage++) {
		unsigned long cur_gfn = gfn + npage;

		ret = vfio_unpin_pages(mdev_dev(vgpu->vdev.mdev), &cur_gfn, 1);
		WARN_ON(ret != 1);
	}
}

/* Pin a normal or compound guest page for dma. */
static int gvt_pin_guest_page(struct intel_vgpu *vgpu, unsigned long gfn,
		unsigned long size, struct page **page)
{
	unsigned long base_pfn = 0;
	int total_pages;
	int npage;
	int ret;

	total_pages = roundup(size, PAGE_SIZE) / PAGE_SIZE;
	/*
	 * We pin the pages one-by-one to avoid allocating a big array
	 * on stack to hold pfns.
	 */
	for (npage = 0; npage < total_pages; npage++) {
		unsigned long cur_gfn = gfn + npage;
		unsigned long pfn;

		ret = vfio_pin_pages(mdev_dev(vgpu->vdev.mdev), &cur_gfn, 1,
				     IOMMU_READ | IOMMU_WRITE, &pfn);
		if (ret != 1) {
			gvt_vgpu_err("vfio_pin_pages failed for gfn 0x%lx, ret %d\n",
				     cur_gfn, ret);
			goto err;
		}

		if (!pfn_valid(pfn)) {
			gvt_vgpu_err("pfn 0x%lx is not mem backed\n", pfn);
			npage++;
			ret = -EFAULT;
			goto err;
		}

		if (npage == 0)
			base_pfn = pfn;
		else if (base_pfn + npage != pfn) {
			gvt_vgpu_err("The pages are not contiguous\n");
			ret = -EINVAL;
			npage++;
			goto err;
		}
	}

	*page = pfn_to_page(base_pfn);
	return 0;
err:
	gvt_unpin_guest_page(vgpu, gfn, npage * PAGE_SIZE);
	return ret;
}

static int gvt_dma_map_page(struct intel_vgpu *vgpu, unsigned long gfn,
		dma_addr_t *dma_addr, unsigned long size)
{
	struct device *dev = &vgpu->gvt->dev_priv->drm.pdev->dev;
	struct page *page = NULL;
	int ret;

	ret = gvt_pin_guest_page(vgpu, gfn, size, &page);
	if (ret)
		return ret;

	/* Setup DMA mapping. */
	*dma_addr = dma_map_page(dev, page, 0, size, PCI_DMA_BIDIRECTIONAL);
	if (dma_mapping_error(dev, *dma_addr)) {
		gvt_vgpu_err("DMA mapping failed for pfn 0x%lx, ret %d\n",
			     page_to_pfn(page), ret);
		gvt_unpin_guest_page(vgpu, gfn, size);
		return -ENOMEM;
	}

	return 0;
}

static void gvt_dma_unmap_page(struct intel_vgpu *vgpu, unsigned long gfn,
		dma_addr_t dma_addr, unsigned long size)
{
	struct device *dev = &vgpu->gvt->dev_priv->drm.pdev->dev;

	dma_unmap_page(dev, dma_addr, size, PCI_DMA_BIDIRECTIONAL);
	gvt_unpin_guest_page(vgpu, gfn, size);
}

static struct gvt_dma *__gvt_cache_find_dma_addr(struct intel_vgpu *vgpu,
		dma_addr_t dma_addr)
{
	struct rb_node *node = vgpu->vdev.dma_addr_cache.rb_node;
	struct gvt_dma *itr;

	while (node) {
		itr = rb_entry(node, struct gvt_dma, dma_addr_node);

		if (dma_addr < itr->dma_addr)
			node = node->rb_left;
		else if (dma_addr > itr->dma_addr)
			node = node->rb_right;
		else
			return itr;
	}
	return NULL;
}

static struct gvt_dma *__gvt_cache_find_gfn(struct intel_vgpu *vgpu, gfn_t gfn)
{
	struct rb_node *node = vgpu->vdev.gfn_cache.rb_node;
	struct gvt_dma *itr;

	while (node) {
		itr = rb_entry(node, struct gvt_dma, gfn_node);

		if (gfn < itr->gfn)
			node = node->rb_left;
		else if (gfn > itr->gfn)
			node = node->rb_right;
		else
			return itr;
	}
	return NULL;
}

static int __gvt_cache_add(struct intel_vgpu *vgpu, gfn_t gfn,
		dma_addr_t dma_addr, unsigned long size)
{
	struct gvt_dma *new, *itr;
	struct rb_node **link, *parent = NULL;

	new = kzalloc(sizeof(struct gvt_dma), GFP_KERNEL);
	if (!new)
		return -ENOMEM;

	new->vgpu = vgpu;
	new->gfn = gfn;
	new->dma_addr = dma_addr;
	new->size = size;
	kref_init(&new->ref);

	/* gfn_cache maps gfn to struct gvt_dma. */
	link = &vgpu->vdev.gfn_cache.rb_node;
	while (*link) {
		parent = *link;
		itr = rb_entry(parent, struct gvt_dma, gfn_node);

		if (gfn < itr->gfn)
			link = &parent->rb_left;
		else
			link = &parent->rb_right;
	}
	rb_link_node(&new->gfn_node, parent, link);
	rb_insert_color(&new->gfn_node, &vgpu->vdev.gfn_cache);

	/* dma_addr_cache maps dma addr to struct gvt_dma. */
	parent = NULL;
	link = &vgpu->vdev.dma_addr_cache.rb_node;
	while (*link) {
		parent = *link;
		itr = rb_entry(parent, struct gvt_dma, dma_addr_node);

		if (dma_addr < itr->dma_addr)
			link = &parent->rb_left;
		else
			link = &parent->rb_right;
	}
	rb_link_node(&new->dma_addr_node, parent, link);
	rb_insert_color(&new->dma_addr_node, &vgpu->vdev.dma_addr_cache);

	vgpu->vdev.nr_cache_entries++;
	return 0;
}

static void __gvt_cache_remove_entry(struct intel_vgpu *vgpu,
		struct gvt_dma *entry)
{
	rb_erase(&entry->gfn_node, &vgpu->vdev.gfn_cache);
	rb_erase(&entry->dma_addr_node, &vgpu->vdev.dma_addr_cache);
	kfree(entry);
	vgpu->vdev.nr_cache_entries--;
}

static void gvt_cache_destroy(struct intel_vgpu *vgpu)
{
	struct gvt_dma *dma;
	struct rb_node *node = NULL;

	for (;;) {
		mutex_lock(&vgpu->vdev.cache_lock);
		node = rb_first(&vgpu->vdev.gfn_cache);
		if (!node) {
			mutex_unlock(&vgpu->vdev.cache_lock);
			break;
		}
		dma = rb_entry(node, struct gvt_dma, gfn_node);
		gvt_dma_unmap_page(vgpu, dma->gfn, dma->dma_addr, dma->size);
		__gvt_cache_remove_entry(vgpu, dma);
		mutex_unlock(&vgpu->vdev.cache_lock);
	}
}

static void gvt_cache_init(struct intel_vgpu *vgpu)
{
	vgpu->vdev.gfn_cache = RB_ROOT;
	vgpu->vdev.dma_addr_cache = RB_ROOT;
	vgpu->vdev.nr_cache_entries = 0;
	mutex_init(&vgpu->vdev.cache_lock);
}

static void kvmgt_protect_table_init(struct kvmgt_guest_info *info)
{
	hash_init(info->ptable);
}

static void kvmgt_protect_table_destroy(struct kvmgt_guest_info *info)
{
	struct kvmgt_pgfn *p;
	struct hlist_node *tmp;
	int i;

	hash_for_each_safe(info->ptable, i, tmp, p, hnode) {
		hash_del(&p->hnode);
		kfree(p);
	}
}

static struct kvmgt_pgfn *
__kvmgt_protect_table_find(struct kvmgt_guest_info *info, gfn_t gfn)
{
	struct kvmgt_pgfn *p, *res = NULL;

	hash_for_each_possible(info->ptable, p, hnode, gfn) {
		if (gfn == p->gfn) {
			res = p;
			break;
		}
	}

	return res;
}

static bool kvmgt_gfn_is_write_protected(struct kvmgt_guest_info *info,
				gfn_t gfn)
{
	struct kvmgt_pgfn *p;

	p = __kvmgt_protect_table_find(info, gfn);
	return !!p;
}

static void kvmgt_protect_table_add(struct kvmgt_guest_info *info, gfn_t gfn)
{
	struct kvmgt_pgfn *p;

	if (kvmgt_gfn_is_write_protected(info, gfn))
		return;

	p = kzalloc(sizeof(struct kvmgt_pgfn), GFP_ATOMIC);
	if (WARN(!p, "gfn: 0x%llx\n", gfn))
		return;

	p->gfn = gfn;
	hash_add(info->ptable, &p->hnode, gfn);
}

static void kvmgt_protect_table_del(struct kvmgt_guest_info *info,
				gfn_t gfn)
{
	struct kvmgt_pgfn *p;

	p = __kvmgt_protect_table_find(info, gfn);
	if (p) {
		hash_del(&p->hnode);
		kfree(p);
	}
}

static size_t intel_vgpu_reg_rw_opregion(struct intel_vgpu *vgpu, char *buf,
		size_t count, loff_t *ppos, bool iswrite)
{
	unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) -
			VFIO_PCI_NUM_REGIONS;
	void *base = vgpu->vdev.region[i].data;
	loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;

	if (pos >= vgpu->vdev.region[i].size || iswrite) {
		gvt_vgpu_err("invalid op or offset for Intel vgpu OpRegion\n");
		return -EINVAL;
	}
	count = min(count, (size_t)(vgpu->vdev.region[i].size - pos));
	memcpy(buf, base + pos, count);

	return count;
}

static void intel_vgpu_reg_release_opregion(struct intel_vgpu *vgpu,
		struct vfio_region *region)
{
}

static const struct intel_vgpu_regops intel_vgpu_regops_opregion = {
	.rw = intel_vgpu_reg_rw_opregion,
	.release = intel_vgpu_reg_release_opregion,
};

static int intel_vgpu_register_reg(struct intel_vgpu *vgpu,
		unsigned int type, unsigned int subtype,
		const struct intel_vgpu_regops *ops,
		size_t size, u32 flags, void *data)
{
	struct vfio_region *region;

	region = krealloc(vgpu->vdev.region,
			(vgpu->vdev.num_regions + 1) * sizeof(*region),
			GFP_KERNEL);
	if (!region)
		return -ENOMEM;

	vgpu->vdev.region = region;
	vgpu->vdev.region[vgpu->vdev.num_regions].type = type;
	vgpu->vdev.region[vgpu->vdev.num_regions].subtype = subtype;
	vgpu->vdev.region[vgpu->vdev.num_regions].ops = ops;
	vgpu->vdev.region[vgpu->vdev.num_regions].size = size;
	vgpu->vdev.region[vgpu->vdev.num_regions].flags = flags;
	vgpu->vdev.region[vgpu->vdev.num_regions].data = data;
	vgpu->vdev.num_regions++;
	return 0;
}

static int kvmgt_get_vfio_device(void *p_vgpu)
{
	struct intel_vgpu *vgpu = (struct intel_vgpu *)p_vgpu;

	vgpu->vdev.vfio_device = vfio_device_get_from_dev(
		mdev_dev(vgpu->vdev.mdev));
	if (!vgpu->vdev.vfio_device) {
		gvt_vgpu_err("failed to get vfio device\n");
		return -ENODEV;
	}
	return 0;
}

static int kvmgt_set_opregion(void *p_vgpu)
{
	struct intel_vgpu *vgpu = (struct intel_vgpu *)p_vgpu;
	void *base;
	int ret;

	/* Each vgpu has its own opregion, although VFIO would create another
	 * one later. This one is used to expose the opregion to VFIO, while
	 * the one created later by VFIO is the one the guest actually uses.
	 */
	base = vgpu_opregion(vgpu)->va;
	if (!base)
		return -ENOMEM;

	if (memcmp(base, OPREGION_SIGNATURE, 16)) {
		memunmap(base);
		return -EINVAL;
	}

	ret = intel_vgpu_register_reg(vgpu,
			PCI_VENDOR_ID_INTEL | VFIO_REGION_TYPE_PCI_VENDOR_TYPE,
			VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION,
			&intel_vgpu_regops_opregion, OPREGION_SIZE,
			VFIO_REGION_INFO_FLAG_READ, base);

	return ret;
}

static void kvmgt_put_vfio_device(void *vgpu)
{
	if (WARN_ON(!((struct intel_vgpu *)vgpu)->vdev.vfio_device))
		return;

	vfio_device_put(((struct intel_vgpu *)vgpu)->vdev.vfio_device);
}

static int intel_vgpu_create(struct kobject *kobj, struct mdev_device *mdev)
{
	struct intel_vgpu *vgpu = NULL;
	struct intel_vgpu_type *type;
	struct device *pdev;
	void *gvt;
	int ret;

	pdev = mdev_parent_dev(mdev);
	gvt = kdev_to_i915(pdev)->gvt;

	type = intel_gvt_ops->gvt_find_vgpu_type(gvt, kobject_name(kobj));
	if (!type) {
		gvt_vgpu_err("failed to find type %s to create\n",
						kobject_name(kobj));
		ret = -EINVAL;
		goto out;
	}

	vgpu = intel_gvt_ops->vgpu_create(gvt, type);
	if (IS_ERR_OR_NULL(vgpu)) {
		ret = vgpu == NULL ? -EFAULT : PTR_ERR(vgpu);
		gvt_err("failed to create intel vgpu: %d\n", ret);
		goto out;
	}

	INIT_WORK(&vgpu->vdev.release_work, intel_vgpu_release_work);

	vgpu->vdev.mdev = mdev;
	mdev_set_drvdata(mdev, vgpu);

	gvt_dbg_core("intel_vgpu_create succeeded for mdev: %s\n",
		     dev_name(mdev_dev(mdev)));
	ret = 0;

out:
	return ret;
}

static int intel_vgpu_remove(struct mdev_device *mdev)
{
	struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);

	if (handle_valid(vgpu->handle))
		return -EBUSY;

	intel_gvt_ops->vgpu_destroy(vgpu);
	return 0;
}

static int intel_vgpu_iommu_notifier(struct notifier_block *nb,
				     unsigned long action, void *data)
{
	struct intel_vgpu *vgpu = container_of(nb,
					struct intel_vgpu,
					vdev.iommu_notifier);

	if (action == VFIO_IOMMU_NOTIFY_DMA_UNMAP) {
		struct vfio_iommu_type1_dma_unmap *unmap = data;
		struct gvt_dma *entry;
		unsigned long iov_pfn, end_iov_pfn;

		iov_pfn = unmap->iova >> PAGE_SHIFT;
		end_iov_pfn = iov_pfn + unmap->size / PAGE_SIZE;

		mutex_lock(&vgpu->vdev.cache_lock);
		for (; iov_pfn < end_iov_pfn; iov_pfn++) {
			entry = __gvt_cache_find_gfn(vgpu, iov_pfn);
			if (!entry)
				continue;

			gvt_dma_unmap_page(vgpu, entry->gfn, entry->dma_addr,
					   entry->size);
			__gvt_cache_remove_entry(vgpu, entry);
		}
		mutex_unlock(&vgpu->vdev.cache_lock);
	}

	return NOTIFY_OK;
}

static int intel_vgpu_group_notifier(struct notifier_block *nb,
				     unsigned long action, void *data)
{
	struct intel_vgpu *vgpu = container_of(nb,
					struct intel_vgpu,
					vdev.group_notifier);

	/* the only action we care about */
	if (action == VFIO_GROUP_NOTIFY_SET_KVM) {
		vgpu->vdev.kvm = data;

		if (!data)
			schedule_work(&vgpu->vdev.release_work);
	}

	return NOTIFY_OK;
}

static int intel_vgpu_open(struct mdev_device *mdev)
{
	struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
	unsigned long events;
	int ret;

	vgpu->vdev.iommu_notifier.notifier_call = intel_vgpu_iommu_notifier;
	vgpu->vdev.group_notifier.notifier_call = intel_vgpu_group_notifier;

	events = VFIO_IOMMU_NOTIFY_DMA_UNMAP;
	ret = vfio_register_notifier(mdev_dev(mdev), VFIO_IOMMU_NOTIFY, &events,
				&vgpu->vdev.iommu_notifier);
	if (ret != 0) {
		gvt_vgpu_err("vfio_register_notifier for iommu failed: %d\n",
			ret);
		goto out;
	}

	events = VFIO_GROUP_NOTIFY_SET_KVM;
	ret = vfio_register_notifier(mdev_dev(mdev), VFIO_GROUP_NOTIFY, &events,
				&vgpu->vdev.group_notifier);
	if (ret != 0) {
		gvt_vgpu_err("vfio_register_notifier for group failed: %d\n",
			ret);
		goto undo_iommu;
	}

	ret = kvmgt_guest_init(mdev);
	if (ret)
		goto undo_group;

	intel_gvt_ops->vgpu_activate(vgpu);

	atomic_set(&vgpu->vdev.released, 0);
	return ret;

undo_group:
	vfio_unregister_notifier(mdev_dev(mdev), VFIO_GROUP_NOTIFY,
					&vgpu->vdev.group_notifier);

undo_iommu:
	vfio_unregister_notifier(mdev_dev(mdev), VFIO_IOMMU_NOTIFY,
					&vgpu->vdev.iommu_notifier);
out:
	return ret;
}

static void intel_vgpu_release_msi_eventfd_ctx(struct intel_vgpu *vgpu)
{
	struct eventfd_ctx *trigger;

	trigger = vgpu->vdev.msi_trigger;
	if (trigger) {
		eventfd_ctx_put(trigger);
		vgpu->vdev.msi_trigger = NULL;
	}
}

static void __intel_vgpu_release(struct intel_vgpu *vgpu)
{
	struct kvmgt_guest_info *info;
	int ret;

	if (!handle_valid(vgpu->handle))
		return;

	if (atomic_cmpxchg(&vgpu->vdev.released, 0, 1))
		return;

	intel_gvt_ops->vgpu_release(vgpu);

	ret = vfio_unregister_notifier(mdev_dev(vgpu->vdev.mdev), VFIO_IOMMU_NOTIFY,
					&vgpu->vdev.iommu_notifier);
	WARN(ret, "vfio_unregister_notifier for iommu failed: %d\n", ret);

	ret = vfio_unregister_notifier(mdev_dev(vgpu->vdev.mdev), VFIO_GROUP_NOTIFY,
					&vgpu->vdev.group_notifier);
	WARN(ret, "vfio_unregister_notifier for group failed: %d\n", ret);

	info = (struct kvmgt_guest_info *)vgpu->handle;
	kvmgt_guest_exit(info);

	intel_vgpu_release_msi_eventfd_ctx(vgpu);

	vgpu->vdev.kvm = NULL;
	vgpu->handle = 0;
}

static void intel_vgpu_release(struct mdev_device *mdev)
{
	struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);

	__intel_vgpu_release(vgpu);
}

static void intel_vgpu_release_work(struct work_struct *work)
{
	struct intel_vgpu *vgpu = container_of(work, struct intel_vgpu,
					       vdev.release_work);

	__intel_vgpu_release(vgpu);
}

static uint64_t intel_vgpu_get_bar_addr(struct intel_vgpu *vgpu, int bar)
{
	u32 start_lo, start_hi;
	u32 mem_type;

	start_lo = (*(u32 *)(vgpu->cfg_space.virtual_cfg_space + bar)) &
			PCI_BASE_ADDRESS_MEM_MASK;
	mem_type = (*(u32 *)(vgpu->cfg_space.virtual_cfg_space + bar)) &
			PCI_BASE_ADDRESS_MEM_TYPE_MASK;

	switch (mem_type) {
	case PCI_BASE_ADDRESS_MEM_TYPE_64:
		start_hi = (*(u32 *)(vgpu->cfg_space.virtual_cfg_space
						+ bar + 4));
		break;
	case PCI_BASE_ADDRESS_MEM_TYPE_32:
	case PCI_BASE_ADDRESS_MEM_TYPE_1M:
		/* 1M mem BAR treated as 32-bit BAR */
	default:
		/* mem unknown type treated as 32-bit BAR */
		start_hi = 0;
		break;
	}

	return ((u64)start_hi << 32) | start_lo;
}

static int intel_vgpu_bar_rw(struct intel_vgpu *vgpu, int bar, uint64_t off,
			     void *buf, unsigned int count, bool is_write)
{
	uint64_t bar_start = intel_vgpu_get_bar_addr(vgpu, bar);
	int ret;

	if (is_write)
		ret = intel_gvt_ops->emulate_mmio_write(vgpu,
					bar_start + off, buf, count);
	else
		ret = intel_gvt_ops->emulate_mmio_read(vgpu,
					bar_start + off, buf, count);
	return ret;
}

static inline bool intel_vgpu_in_aperture(struct intel_vgpu *vgpu, uint64_t off)
{
	return off >= vgpu_aperture_offset(vgpu) &&
	       off < vgpu_aperture_offset(vgpu) + vgpu_aperture_sz(vgpu);
}

static int intel_vgpu_aperture_rw(struct intel_vgpu *vgpu, uint64_t off,
		void *buf, unsigned long count, bool is_write)
{
	void *aperture_va;

	if (!intel_vgpu_in_aperture(vgpu, off) ||
	    !intel_vgpu_in_aperture(vgpu, off + count)) {
		gvt_vgpu_err("Invalid aperture offset %llu\n", off);
		return -EINVAL;
	}

	aperture_va = io_mapping_map_wc(&vgpu->gvt->dev_priv->ggtt.iomap,
					ALIGN_DOWN(off, PAGE_SIZE),
					count + offset_in_page(off));
	if (!aperture_va)
		return -EIO;

	if (is_write)
		memcpy(aperture_va + offset_in_page(off), buf, count);
	else
		memcpy(buf, aperture_va + offset_in_page(off), count);

	io_mapping_unmap(aperture_va);

	return 0;
}

static ssize_t intel_vgpu_rw(struct mdev_device *mdev, char *buf,
			size_t count, loff_t *ppos, bool is_write)
{
	struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
	uint64_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
	int ret = -EINVAL;

	if (index >= VFIO_PCI_NUM_REGIONS + vgpu->vdev.num_regions) {
		gvt_vgpu_err("invalid index: %u\n", index);
		return -EINVAL;
	}

	switch (index) {
	case VFIO_PCI_CONFIG_REGION_INDEX:
		if (is_write)
			ret = intel_gvt_ops->emulate_cfg_write(vgpu, pos,
						buf, count);
		else
			ret = intel_gvt_ops->emulate_cfg_read(vgpu, pos,
						buf, count);
		break;
	case VFIO_PCI_BAR0_REGION_INDEX:
		ret = intel_vgpu_bar_rw(vgpu, PCI_BASE_ADDRESS_0, pos,
					buf, count, is_write);
		break;
	case VFIO_PCI_BAR2_REGION_INDEX:
		ret = intel_vgpu_aperture_rw(vgpu, pos, buf, count, is_write);
		break;
	case VFIO_PCI_BAR1_REGION_INDEX:
	case VFIO_PCI_BAR3_REGION_INDEX:
	case VFIO_PCI_BAR4_REGION_INDEX:
	case VFIO_PCI_BAR5_REGION_INDEX:
	case VFIO_PCI_VGA_REGION_INDEX:
	case VFIO_PCI_ROM_REGION_INDEX:
		break;
	default:
		if (index >= VFIO_PCI_NUM_REGIONS + vgpu->vdev.num_regions)
			return -EINVAL;

		index -= VFIO_PCI_NUM_REGIONS;
		return vgpu->vdev.region[index].ops->rw(vgpu, buf, count,
				ppos, is_write);
	}

	return ret == 0 ? count : ret;
}

static bool gtt_entry(struct mdev_device *mdev, loff_t *ppos)
{
	struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
	struct intel_gvt *gvt = vgpu->gvt;
	int offset;

	/* Only allow MMIO GGTT entry access */
	if (index != PCI_BASE_ADDRESS_0)
		return false;

	offset = (u64)(*ppos & VFIO_PCI_OFFSET_MASK) -
		intel_vgpu_get_bar_gpa(vgpu, PCI_BASE_ADDRESS_0);

	return (offset >= gvt->device_info.gtt_start_offset &&
		offset < gvt->device_info.gtt_start_offset + gvt_ggtt_sz(gvt));
}

static ssize_t intel_vgpu_read(struct mdev_device *mdev, char __user *buf,
			size_t count, loff_t *ppos)
{
	unsigned int done = 0;
	int ret;

	while (count) {
		size_t filled;

		/* Only support 8-byte GGTT entry reads */
		if (count >= 8 && !(*ppos % 8) &&
			gtt_entry(mdev, ppos)) {
			u64 val;

			ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val),
					ppos, false);
			if (ret <= 0)
				goto read_err;

			if (copy_to_user(buf, &val, sizeof(val)))
				goto read_err;

			filled = 8;
		} else if (count >= 4 && !(*ppos % 4)) {
			u32 val;

			ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val),
					ppos, false);
			if (ret <= 0)
				goto read_err;

			if (copy_to_user(buf, &val, sizeof(val)))
				goto read_err;

			filled = 4;
		} else if (count >= 2 && !(*ppos % 2)) {
			u16 val;

			ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val),
					ppos, false);
			if (ret <= 0)
				goto read_err;

			if (copy_to_user(buf, &val, sizeof(val)))
				goto read_err;

			filled = 2;
		} else {
			u8 val;

			ret = intel_vgpu_rw(mdev, &val, sizeof(val), ppos,
					false);
			if (ret <= 0)
				goto read_err;

			if (copy_to_user(buf, &val, sizeof(val)))
				goto read_err;

			filled = 1;
		}

		count -= filled;
		done += filled;
		*ppos += filled;
		buf += filled;
	}

	return done;

read_err:
	return -EFAULT;
}

static ssize_t intel_vgpu_write(struct mdev_device *mdev,
				const char __user *buf,
				size_t count, loff_t *ppos)
{
	unsigned int done = 0;
	int ret;

	while (count) {
		size_t filled;

		/* Only support 8-byte GGTT entry writes */
		if (count >= 8 && !(*ppos % 8) &&
			gtt_entry(mdev, ppos)) {
			u64 val;

			if (copy_from_user(&val, buf, sizeof(val)))
				goto write_err;

			ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val),
					ppos, true);
			if (ret <= 0)
				goto write_err;

			filled = 8;
		} else if (count >= 4 && !(*ppos % 4)) {
			u32 val;

			if (copy_from_user(&val, buf, sizeof(val)))
				goto write_err;

			ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val),
					ppos, true);
			if (ret <= 0)
				goto write_err;

			filled = 4;
		} else if (count >= 2 && !(*ppos % 2)) {
			u16 val;

			if (copy_from_user(&val, buf, sizeof(val)))
				goto write_err;

			ret = intel_vgpu_rw(mdev, (char *)&val,
					sizeof(val), ppos, true);
			if (ret <= 0)
				goto write_err;

			filled = 2;
		} else {
			u8 val;

			if (copy_from_user(&val, buf, sizeof(val)))
				goto write_err;

			ret = intel_vgpu_rw(mdev, &val, sizeof(val),
					ppos, true);
			if (ret <= 0)
				goto write_err;

			filled = 1;
		}

		count -= filled;
		done += filled;
		*ppos += filled;
		buf += filled;
	}

	return done;
write_err:
	return -EFAULT;
}

static int intel_vgpu_mmap(struct mdev_device *mdev, struct vm_area_struct *vma)
{
	unsigned int index;
	u64 virtaddr;
	unsigned long req_size, pgoff, req_start;
	pgprot_t pg_prot;
	struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);

	index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
	if (index >= VFIO_PCI_ROM_REGION_INDEX)
		return -EINVAL;

	if (vma->vm_end < vma->vm_start)
		return -EINVAL;
	if ((vma->vm_flags & VM_SHARED) == 0)
		return -EINVAL;
	if (index != VFIO_PCI_BAR2_REGION_INDEX)
		return -EINVAL;

	pg_prot = vma->vm_page_prot;
	virtaddr = vma->vm_start;
	req_size = vma->vm_end - vma->vm_start;
	pgoff = vma->vm_pgoff &
		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
	req_start = pgoff << PAGE_SHIFT;

	if (!intel_vgpu_in_aperture(vgpu, req_start))
		return -EINVAL;
	if (req_start + req_size >
	    vgpu_aperture_offset(vgpu) + vgpu_aperture_sz(vgpu))
		return -EINVAL;

	pgoff = (gvt_aperture_pa_base(vgpu->gvt) >> PAGE_SHIFT) + pgoff;

	return remap_pfn_range(vma, virtaddr, pgoff, req_size, pg_prot);
}

static int intel_vgpu_get_irq_count(struct intel_vgpu *vgpu, int type)
{
	if (type == VFIO_PCI_INTX_IRQ_INDEX || type == VFIO_PCI_MSI_IRQ_INDEX)
		return 1;

	return 0;
}

static int intel_vgpu_set_intx_mask(struct intel_vgpu *vgpu,
			unsigned int index, unsigned int start,
			unsigned int count, uint32_t flags,
			void *data)
{
	return 0;
}

static int intel_vgpu_set_intx_unmask(struct intel_vgpu *vgpu,
			unsigned int index, unsigned int start,
			unsigned int count, uint32_t flags, void *data)
{
	return 0;
}

static int intel_vgpu_set_intx_trigger(struct intel_vgpu *vgpu,
		unsigned int index, unsigned int start, unsigned int count,
		uint32_t flags, void *data)
{
	return 0;
}

static int intel_vgpu_set_msi_trigger(struct intel_vgpu *vgpu,
		unsigned int index, unsigned int start, unsigned int count,
		uint32_t flags, void *data)
{
	struct eventfd_ctx *trigger;

	if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
		int fd = *(int *)data;

		trigger = eventfd_ctx_fdget(fd);
		if (IS_ERR(trigger)) {
			gvt_vgpu_err("eventfd_ctx_fdget failed\n");
			return PTR_ERR(trigger);
		}
		vgpu->vdev.msi_trigger = trigger;
	} else if ((flags & VFIO_IRQ_SET_DATA_NONE) && !count)
		intel_vgpu_release_msi_eventfd_ctx(vgpu);

	return 0;
}

static int intel_vgpu_set_irqs(struct intel_vgpu *vgpu, uint32_t flags,
		unsigned int index, unsigned int start, unsigned int count,
		void *data)
{
	int (*func)(struct intel_vgpu *vgpu, unsigned int index,
			unsigned int start, unsigned int count, uint32_t flags,
			void *data) = NULL;

	switch (index) {
	case VFIO_PCI_INTX_IRQ_INDEX:
		switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
		case VFIO_IRQ_SET_ACTION_MASK:
			func = intel_vgpu_set_intx_mask;
			break;
		case VFIO_IRQ_SET_ACTION_UNMASK:
			func = intel_vgpu_set_intx_unmask;
			break;
		case VFIO_IRQ_SET_ACTION_TRIGGER:
			func = intel_vgpu_set_intx_trigger;
			break;
		}
		break;
	case VFIO_PCI_MSI_IRQ_INDEX:
		switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
		case VFIO_IRQ_SET_ACTION_MASK:
		case VFIO_IRQ_SET_ACTION_UNMASK:
			/* XXX Need masking support exported */
			break;
		case VFIO_IRQ_SET_ACTION_TRIGGER:
			func = intel_vgpu_set_msi_trigger;
			break;
		}
		break;
	}

	if (!func)
		return -ENOTTY;

	return func(vgpu, index, start, count, flags, data);
}

static long intel_vgpu_ioctl(struct mdev_device *mdev, unsigned int cmd,
			     unsigned long arg)
{
	struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
	unsigned long minsz;

	gvt_dbg_core("vgpu%d ioctl, cmd: %d\n", vgpu->id, cmd);

	if (cmd == VFIO_DEVICE_GET_INFO) {
		struct vfio_device_info info;

		minsz = offsetofend(struct vfio_device_info, num_irqs);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;

		if (info.argsz < minsz)
			return -EINVAL;

		info.flags = VFIO_DEVICE_FLAGS_PCI;
		info.flags |= VFIO_DEVICE_FLAGS_RESET;
		info.num_regions = VFIO_PCI_NUM_REGIONS +
				vgpu->vdev.num_regions;
		info.num_irqs = VFIO_PCI_NUM_IRQS;

		return copy_to_user((void __user *)arg, &info, minsz) ?
			-EFAULT : 0;

	} else if (cmd == VFIO_DEVICE_GET_REGION_INFO) {
		struct vfio_region_info info;
		struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
		unsigned int i;
		int ret;
		struct vfio_region_info_cap_sparse_mmap *sparse = NULL;
		size_t size;
		int nr_areas = 1;
		int cap_type_id;

		minsz = offsetofend(struct vfio_region_info, offset);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;

		if (info.argsz < minsz)
			return -EINVAL;

		switch (info.index) {
		case VFIO_PCI_CONFIG_REGION_INDEX:
			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.size = vgpu->gvt->device_info.cfg_space_size;
			info.flags = VFIO_REGION_INFO_FLAG_READ |
				     VFIO_REGION_INFO_FLAG_WRITE;
			break;
		case VFIO_PCI_BAR0_REGION_INDEX:
			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.size = vgpu->cfg_space.bar[info.index].size;
			if (!info.size) {
				info.flags = 0;
				break;
			}

			info.flags = VFIO_REGION_INFO_FLAG_READ |
				     VFIO_REGION_INFO_FLAG_WRITE;
			break;
		case VFIO_PCI_BAR1_REGION_INDEX:
			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.size = 0;
			info.flags = 0;
			break;
		case VFIO_PCI_BAR2_REGION_INDEX:
			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.flags = VFIO_REGION_INFO_FLAG_CAPS |
					VFIO_REGION_INFO_FLAG_MMAP |
					VFIO_REGION_INFO_FLAG_READ |
					VFIO_REGION_INFO_FLAG_WRITE;
			info.size = gvt_aperture_sz(vgpu->gvt);

			size = sizeof(*sparse) +
					(nr_areas * sizeof(*sparse->areas));
			sparse = kzalloc(size, GFP_KERNEL);
			if (!sparse)
				return -ENOMEM;

			sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP;
			sparse->header.version = 1;
			sparse->nr_areas = nr_areas;
			cap_type_id = VFIO_REGION_INFO_CAP_SPARSE_MMAP;
			sparse->areas[0].offset =
					PAGE_ALIGN(vgpu_aperture_offset(vgpu));
			sparse->areas[0].size = vgpu_aperture_sz(vgpu);
			break;

		case VFIO_PCI_BAR3_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.size = 0;
			info.flags = 0;

			gvt_dbg_core("get region info bar:%d\n", info.index);
			break;

		case VFIO_PCI_ROM_REGION_INDEX:
		case VFIO_PCI_VGA_REGION_INDEX:
			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.size = 0;
			info.flags = 0;

			gvt_dbg_core("get region info index:%d\n", info.index);
			break;
		default:
			{
				struct vfio_region_info_cap_type cap_type = {
					.header.id = VFIO_REGION_INFO_CAP_TYPE,
					.header.version = 1 };

				if (info.index >= VFIO_PCI_NUM_REGIONS +
						vgpu->vdev.num_regions)
					return -EINVAL;
				info.index =
					array_index_nospec(info.index,
							VFIO_PCI_NUM_REGIONS +
							vgpu->vdev.num_regions);

				i = info.index - VFIO_PCI_NUM_REGIONS;

				info.offset =
					VFIO_PCI_INDEX_TO_OFFSET(info.index);
				info.size = vgpu->vdev.region[i].size;
				info.flags = vgpu->vdev.region[i].flags;

				cap_type.type = vgpu->vdev.region[i].type;
				cap_type.subtype = vgpu->vdev.region[i].subtype;

				ret = vfio_info_add_capability(&caps,
							&cap_type.header,
							sizeof(cap_type));
				if (ret)
					return ret;
			}
		}

		if ((info.flags & VFIO_REGION_INFO_FLAG_CAPS) && sparse) {
			switch (cap_type_id) {
			case VFIO_REGION_INFO_CAP_SPARSE_MMAP:
				ret = vfio_info_add_capability(&caps,
					&sparse->header, sizeof(*sparse) +
					(sparse->nr_areas *
						sizeof(*sparse->areas)));
				if (ret) {
					kfree(sparse);
					return ret;
				}
				break;
			default:
				kfree(sparse);
				return -EINVAL;
			}
		}

		if (caps.size) {
			info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
			if (info.argsz < sizeof(info) + caps.size) {
				info.argsz = sizeof(info) + caps.size;
				info.cap_offset = 0;
			} else {
				vfio_info_cap_shift(&caps, sizeof(info));
				if (copy_to_user((void __user *)arg +
						  sizeof(info), caps.buf,
						  caps.size)) {
					kfree(caps.buf);
					kfree(sparse);
					return -EFAULT;
				}
				info.cap_offset = sizeof(info);
			}

			kfree(caps.buf);
		}

		kfree(sparse);
		return copy_to_user((void __user *)arg, &info, minsz) ?
			-EFAULT : 0;
	} else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) {
		struct vfio_irq_info info;

		minsz = offsetofend(struct vfio_irq_info, count);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;

		if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS)
			return -EINVAL;

		switch (info.index) {
		case VFIO_PCI_INTX_IRQ_INDEX:
		case VFIO_PCI_MSI_IRQ_INDEX:
			break;
		default:
			return -EINVAL;
		}

		info.flags = VFIO_IRQ_INFO_EVENTFD;

		info.count = intel_vgpu_get_irq_count(vgpu, info.index);

		if (info.index == VFIO_PCI_INTX_IRQ_INDEX)
			info.flags |= (VFIO_IRQ_INFO_MASKABLE |
				       VFIO_IRQ_INFO_AUTOMASKED);
		else
			info.flags |= VFIO_IRQ_INFO_NORESIZE;

		return copy_to_user((void __user *)arg, &info, minsz) ?
			-EFAULT : 0;
	} else if (cmd == VFIO_DEVICE_SET_IRQS) {
		struct vfio_irq_set hdr;
		u8 *data = NULL;
		int ret = 0;
		size_t data_size = 0;

		minsz = offsetofend(struct vfio_irq_set, count);

		if (copy_from_user(&hdr, (void __user *)arg, minsz))
			return -EFAULT;

		if (!(hdr.flags & VFIO_IRQ_SET_DATA_NONE)) {
			int max = intel_vgpu_get_irq_count(vgpu, hdr.index);

			ret = vfio_set_irqs_validate_and_prepare(&hdr, max,
						VFIO_PCI_NUM_IRQS, &data_size);
			if (ret) {
				gvt_vgpu_err("intel:vfio_set_irqs_validate_and_prepare failed\n");
				return -EINVAL;
			}
			if (data_size) {
				data = memdup_user((void __user *)(arg + minsz),
						   data_size);
				if (IS_ERR(data))
					return PTR_ERR(data);
			}
		}

		ret = intel_vgpu_set_irqs(vgpu, hdr.flags, hdr.index,
					hdr.start, hdr.count, data);
		kfree(data);

		return ret;
	} else if (cmd == VFIO_DEVICE_RESET) {
		intel_gvt_ops->vgpu_reset(vgpu);
		return 0;
	} else if (cmd == VFIO_DEVICE_QUERY_GFX_PLANE) {
		struct vfio_device_gfx_plane_info dmabuf;
		int ret = 0;

		minsz = offsetofend(struct vfio_device_gfx_plane_info,
				    dmabuf_id);
		if (copy_from_user(&dmabuf, (void __user *)arg, minsz))
			return -EFAULT;
		if (dmabuf.argsz < minsz)
			return -EINVAL;

		ret = intel_gvt_ops->vgpu_query_plane(vgpu, &dmabuf);
		if (ret != 0)
			return ret;

		return copy_to_user((void __user *)arg, &dmabuf, minsz) ?
								-EFAULT : 0;
	} else if (cmd == VFIO_DEVICE_GET_GFX_DMABUF) {
		__u32 dmabuf_id;
		__s32 dmabuf_fd;

		if (get_user(dmabuf_id, (__u32 __user *)arg))
			return -EFAULT;

		dmabuf_fd = intel_gvt_ops->vgpu_get_dmabuf(vgpu, dmabuf_id);
		return dmabuf_fd;

	}

	return -ENOTTY;
}

static ssize_t
vgpu_id_show(struct device *dev, struct device_attribute *attr,
	     char *buf)
{
	struct mdev_device *mdev = mdev_from_dev(dev);

	if (mdev) {
		struct intel_vgpu *vgpu = (struct intel_vgpu *)
			mdev_get_drvdata(mdev);
		return sprintf(buf, "%d\n", vgpu->id);
	}
	return sprintf(buf, "\n");
}

static ssize_t
hw_id_show(struct device *dev, struct device_attribute *attr,
	   char *buf)
{
	struct mdev_device *mdev = mdev_from_dev(dev);

	if (mdev) {
		struct intel_vgpu *vgpu = (struct intel_vgpu *)
			mdev_get_drvdata(mdev);
		return sprintf(buf, "%u\n",
			       vgpu->submission.shadow_ctx->hw_id);
	}
	return sprintf(buf, "\n");
}

static DEVICE_ATTR_RO(vgpu_id);
static DEVICE_ATTR_RO(hw_id);

static struct attribute *intel_vgpu_attrs[] = {
	&dev_attr_vgpu_id.attr,
	&dev_attr_hw_id.attr,
	NULL
};

static const struct attribute_group intel_vgpu_group = {
	.name = "intel_vgpu",
	.attrs = intel_vgpu_attrs,
};

static const struct attribute_group *intel_vgpu_groups[] = {
	&intel_vgpu_group,
	NULL,
};

static struct mdev_parent_ops intel_vgpu_ops = {
	.mdev_attr_groups = intel_vgpu_groups,
	.create = intel_vgpu_create,
	.remove = intel_vgpu_remove,

	.open = intel_vgpu_open,
	.release = intel_vgpu_release,

	.read = intel_vgpu_read,
	.write = intel_vgpu_write,
	.mmap = intel_vgpu_mmap,
	.ioctl = intel_vgpu_ioctl,
};

static int kvmgt_host_init(struct device *dev, void *gvt, const void *ops)
{
	struct attribute **kvm_type_attrs;
	struct attribute_group **kvm_vgpu_type_groups;

	intel_gvt_ops = ops;
	if (!intel_gvt_ops->get_gvt_attrs(&kvm_type_attrs,
			&kvm_vgpu_type_groups))
		return -EFAULT;
	intel_vgpu_ops.supported_type_groups = kvm_vgpu_type_groups;

	return mdev_register_device(dev, &intel_vgpu_ops);
}

static void kvmgt_host_exit(struct device *dev, void *gvt)
{
	mdev_unregister_device(dev);
}

static int kvmgt_page_track_add(unsigned long handle, u64 gfn)
{
	struct kvmgt_guest_info *info;
	struct kvm *kvm;
	struct kvm_memory_slot *slot;
	int idx;

	if (!handle_valid(handle))
		return -ESRCH;

	info = (struct kvmgt_guest_info *)handle;
	kvm = info->kvm;

	idx = srcu_read_lock(&kvm->srcu);
	slot = gfn_to_memslot(kvm, gfn);
	if (!slot) {
		srcu_read_unlock(&kvm->srcu, idx);
		return -EINVAL;
	}

	spin_lock(&kvm->mmu_lock);

	if (kvmgt_gfn_is_write_protected(info, gfn))
		goto out;

	kvm_slot_page_track_add_page(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE);
	kvmgt_protect_table_add(info, gfn);

out:
	spin_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);
	return 0;
}

static int kvmgt_page_track_remove(unsigned long handle, u64 gfn)
{
	struct kvmgt_guest_info *info;
	struct kvm *kvm;
	struct kvm_memory_slot *slot;
	int idx;

	if (!handle_valid(handle))
		return 0;

	info = (struct kvmgt_guest_info *)handle;
	kvm = info->kvm;

	idx = srcu_read_lock(&kvm->srcu);
	slot = gfn_to_memslot(kvm, gfn);
	if (!slot) {
		srcu_read_unlock(&kvm->srcu, idx);
		return -EINVAL;
	}

	spin_lock(&kvm->mmu_lock);

	if (!kvmgt_gfn_is_write_protected(info, gfn))
		goto out;

	kvm_slot_page_track_remove_page(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE);
	kvmgt_protect_table_del(info, gfn);

out:
	spin_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);
	return 0;
}

static void kvmgt_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa,
		const u8 *val, int len,
		struct kvm_page_track_notifier_node *node)
{
	struct kvmgt_guest_info *info = container_of(node,
					struct kvmgt_guest_info, track_node);

	if (kvmgt_gfn_is_write_protected(info, gpa_to_gfn(gpa)))
		intel_gvt_ops->write_protect_handler(info->vgpu, gpa,
						     (void *)val, len);
}

static void kvmgt_page_track_flush_slot(struct kvm *kvm,
		struct kvm_memory_slot *slot,
		struct kvm_page_track_notifier_node *node)
{
	int i;
	gfn_t gfn;
	struct kvmgt_guest_info *info = container_of(node,
					struct kvmgt_guest_info, track_node);

	spin_lock(&kvm->mmu_lock);
	for (i = 0; i < slot->npages; i++) {
		gfn = slot->base_gfn + i;
		if (kvmgt_gfn_is_write_protected(info, gfn)) {
			kvm_slot_page_track_remove_page(kvm, slot, gfn,
						KVM_PAGE_TRACK_WRITE);
			kvmgt_protect_table_del(info, gfn);
		}
	}
	spin_unlock(&kvm->mmu_lock);
}

static bool __kvmgt_vgpu_exist(struct intel_vgpu *vgpu, struct kvm *kvm)
{
	struct intel_vgpu *itr;
	struct kvmgt_guest_info *info;
	int id;
	bool ret = false;

	mutex_lock(&vgpu->gvt->lock);
	for_each_active_vgpu(vgpu->gvt, itr, id) {
		if (!handle_valid(itr->handle))
			continue;

		info = (struct kvmgt_guest_info *)itr->handle;
		if (kvm && kvm == info->kvm) {
			ret = true;
			goto out;
		}
	}
out:
	mutex_unlock(&vgpu->gvt->lock);
	return ret;
}

static int kvmgt_guest_init(struct mdev_device *mdev)
{
	struct kvmgt_guest_info *info;
	struct intel_vgpu *vgpu;
	struct kvm *kvm;

	vgpu = mdev_get_drvdata(mdev);
	if (handle_valid(vgpu->handle))
		return -EEXIST;

	kvm = vgpu->vdev.kvm;
	if (!kvm || kvm->mm != current->mm) {
		gvt_vgpu_err("KVM is required to use Intel vGPU\n");
		return -ESRCH;
	}

	if (__kvmgt_vgpu_exist(vgpu, kvm))
		return -EEXIST;

	info = vzalloc(sizeof(struct kvmgt_guest_info));
	if (!info)
		return -ENOMEM;

	vgpu->handle = (unsigned long)info;
	info->vgpu = vgpu;
	info->kvm = kvm;
	kvm_get_kvm(info->kvm);

	kvmgt_protect_table_init(info);
	gvt_cache_init(vgpu);

	init_completion(&vgpu->vblank_done);

	info->track_node.track_write = kvmgt_page_track_write;
	info->track_node.track_flush_slot = kvmgt_page_track_flush_slot;
	kvm_page_track_register_notifier(kvm, &info->track_node);

	info->debugfs_cache_entries = debugfs_create_ulong(
						"kvmgt_nr_cache_entries",
						0444, vgpu->debugfs,
						&vgpu->vdev.nr_cache_entries);
	if (!info->debugfs_cache_entries)
		gvt_vgpu_err("Cannot create kvmgt debugfs entry\n");

	return 0;
}

static bool kvmgt_guest_exit(struct kvmgt_guest_info *info)
{
	debugfs_remove(info->debugfs_cache_entries);

	kvm_page_track_unregister_notifier(info->kvm, &info->track_node);
	kvm_put_kvm(info->kvm);
	kvmgt_protect_table_destroy(info);
	gvt_cache_destroy(info->vgpu);
	vfree(info);

	return true;
}

static int kvmgt_attach_vgpu(void *vgpu, unsigned long *handle)
{
	/* nothing to do here */
	return 0;
}

static void kvmgt_detach_vgpu(void *p_vgpu)
{
	int i;
	struct intel_vgpu *vgpu = (struct intel_vgpu *)p_vgpu;

	if (!vgpu->vdev.region)
		return;

	for (i = 0; i < vgpu->vdev.num_regions; i++)
		if (vgpu->vdev.region[i].ops->release)
			vgpu->vdev.region[i].ops->release(vgpu,
					&vgpu->vdev.region[i]);
	vgpu->vdev.num_regions = 0;
	kfree(vgpu->vdev.region);
	vgpu->vdev.region = NULL;
}

static int kvmgt_inject_msi(unsigned long handle, u32 addr, u16 data)
{
	struct kvmgt_guest_info *info;
	struct intel_vgpu *vgpu;

	if (!handle_valid(handle))
		return -ESRCH;

	info = (struct kvmgt_guest_info *)handle;
	vgpu = info->vgpu;

	/*
	 * When the guest is powered off, msi_trigger is set to NULL, but the
	 * vgpu's config space and MMIO registers are not restored to their
	 * defaults during guest poweroff. If this vgpu is reused by the next
	 * VM, one of its pipes may still be enabled, so the vgpu will receive
	 * vblank interrupt requests as soon as it becomes active. However,
	 * msi_trigger stays NULL until the guest enables MSI, so when
	 * msi_trigger is NULL we return success without injecting an
	 * interrupt into the guest.
	 */
	if (vgpu->vdev.msi_trigger == NULL)
		return 0;

	if (eventfd_signal(vgpu->vdev.msi_trigger, 1) == 1)
		return 0;

	return -EFAULT;
}

static unsigned long kvmgt_gfn_to_pfn(unsigned long handle, unsigned long gfn)
{
	struct kvmgt_guest_info *info;
	kvm_pfn_t pfn;

	if (!handle_valid(handle))
		return INTEL_GVT_INVALID_ADDR;

	info = (struct kvmgt_guest_info *)handle;

	pfn = gfn_to_pfn(info->kvm, gfn);
	if (is_error_noslot_pfn(pfn))
		return INTEL_GVT_INVALID_ADDR;

	return pfn;
}

static int kvmgt_dma_map_guest_page(unsigned long handle, unsigned long gfn,
		unsigned long size, dma_addr_t *dma_addr)
{
	struct kvmgt_guest_info *info;
	struct intel_vgpu *vgpu;
	struct gvt_dma *entry;
	int ret;

	if (!handle_valid(handle))
		return -EINVAL;

	info = (struct kvmgt_guest_info *)handle;
	vgpu = info->vgpu;

	mutex_lock(&info->vgpu->vdev.cache_lock);

	entry = __gvt_cache_find_gfn(info->vgpu, gfn);
	if (!entry) {
		ret = gvt_dma_map_page(vgpu, gfn, dma_addr, size);
		if (ret)
			goto err_unlock;

		ret = __gvt_cache_add(info->vgpu, gfn, *dma_addr, size);
		if (ret)
			goto err_unmap;
	} else {
		kref_get(&entry->ref);
		*dma_addr = entry->dma_addr;
	}

	mutex_unlock(&info->vgpu->vdev.cache_lock);
	return 0;

err_unmap:
	gvt_dma_unmap_page(vgpu, gfn, *dma_addr, size);
err_unlock:
	mutex_unlock(&info->vgpu->vdev.cache_lock);
	return ret;
}

static void __gvt_dma_release(struct kref *ref)
{
	struct gvt_dma *entry = container_of(ref, typeof(*entry), ref);

	gvt_dma_unmap_page(entry->vgpu, entry->gfn, entry->dma_addr,
			   entry->size);
	__gvt_cache_remove_entry(entry->vgpu, entry);
}

static void kvmgt_dma_unmap_guest_page(unsigned long handle, dma_addr_t dma_addr)
{
	struct kvmgt_guest_info *info;
	struct gvt_dma *entry;

	if (!handle_valid(handle))
		return;

	info = (struct kvmgt_guest_info *)handle;

	mutex_lock(&info->vgpu->vdev.cache_lock);
	entry = __gvt_cache_find_dma_addr(info->vgpu, dma_addr);
	if (entry)
		kref_put(&entry->ref, __gvt_dma_release);
	mutex_unlock(&info->vgpu->vdev.cache_lock);
}

static int kvmgt_rw_gpa(unsigned long handle, unsigned long gpa,
			void *buf, unsigned long len, bool write)
{
	struct kvmgt_guest_info *info;
	struct kvm *kvm;
	int idx, ret;
	bool kthread = current->mm == NULL;

	if (!handle_valid(handle))
		return -ESRCH;

	info = (struct kvmgt_guest_info *)handle;
	kvm = info->kvm;

	if (kthread) {
		if (!mmget_not_zero(kvm->mm))
			return -EFAULT;
		use_mm(kvm->mm);
	}

	idx = srcu_read_lock(&kvm->srcu);
	ret = write ? kvm_write_guest(kvm, gpa, buf, len) :
		      kvm_read_guest(kvm, gpa, buf, len);
	srcu_read_unlock(&kvm->srcu, idx);

	if (kthread) {
		unuse_mm(kvm->mm);
		mmput(kvm->mm);
	}

	return ret;
}

static int kvmgt_read_gpa(unsigned long handle, unsigned long gpa,
			void *buf, unsigned long len)
{
	return kvmgt_rw_gpa(handle, gpa, buf, len, false);
}

static int kvmgt_write_gpa(unsigned long handle, unsigned long gpa,
			void *buf, unsigned long len)
{
	return kvmgt_rw_gpa(handle, gpa, buf, len, true);
}

static unsigned long kvmgt_virt_to_pfn(void *addr)
{
	return PFN_DOWN(__pa(addr));
}

static bool kvmgt_is_valid_gfn(unsigned long handle, unsigned long gfn)
{
	struct kvmgt_guest_info *info;
	struct kvm *kvm;
	int idx;
	bool ret;

	if (!handle_valid(handle))
		return false;

	info = (struct kvmgt_guest_info *)handle;
	kvm = info->kvm;

	idx = srcu_read_lock(&kvm->srcu);
	ret = kvm_is_visible_gfn(kvm, gfn);
	srcu_read_unlock(&kvm->srcu, idx);

	return ret;
}

struct intel_gvt_mpt kvmgt_mpt = {
	.host_init = kvmgt_host_init,
	.host_exit = kvmgt_host_exit,
	.attach_vgpu = kvmgt_attach_vgpu,
	.detach_vgpu = kvmgt_detach_vgpu,
	.inject_msi = kvmgt_inject_msi,
	.from_virt_to_mfn = kvmgt_virt_to_pfn,
	.enable_page_track = kvmgt_page_track_add,
	.disable_page_track = kvmgt_page_track_remove,
	.read_gpa = kvmgt_read_gpa,
	.write_gpa = kvmgt_write_gpa,
	.gfn_to_mfn = kvmgt_gfn_to_pfn,
	.dma_map_guest_page = kvmgt_dma_map_guest_page,
	.dma_unmap_guest_page = kvmgt_dma_unmap_guest_page,
	.set_opregion = kvmgt_set_opregion,
	.get_vfio_device = kvmgt_get_vfio_device,
	.put_vfio_device = kvmgt_put_vfio_device,
	.is_valid_gfn = kvmgt_is_valid_gfn,
};
EXPORT_SYMBOL_GPL(kvmgt_mpt);

static int __init kvmgt_init(void)
{
	return 0;
}

static void __exit kvmgt_exit(void)
{
}

module_init(kvmgt_init);
module_exit(kvmgt_exit);

MODULE_LICENSE("GPL and additional rights");
MODULE_AUTHOR("Intel Corporation");