/*
 * KVMGT - the implementation of Intel mediated pass-through framework for KVM
 *
 * Copyright(c) 2014-2016 Intel Corporation. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Kevin Tian <kevin.tian@intel.com>
 *    Jike Song <jike.song@intel.com>
 *    Xiaoguang Chen <xiaoguang.chen@intel.com>
 */

#include <linux/init.h>
#include <linux/device.h>
#include <linux/mm.h>
#include <linux/mmu_context.h>
#include <linux/sched/mm.h>
#include <linux/types.h>
#include <linux/list.h>
#include <linux/rbtree.h>
#include <linux/spinlock.h>
#include <linux/eventfd.h>
#include <linux/uuid.h>
#include <linux/kvm_host.h>
#include <linux/vfio.h>
#include <linux/mdev.h>
#include <linux/debugfs.h>

#include <linux/nospec.h>

#include "i915_drv.h"
#include "gvt.h"

static const struct intel_gvt_ops *intel_gvt_ops;

/* helper macros copied from vfio-pci */
#define VFIO_PCI_OFFSET_SHIFT   40
#define VFIO_PCI_OFFSET_TO_INDEX(off)   (off >> VFIO_PCI_OFFSET_SHIFT)
#define VFIO_PCI_INDEX_TO_OFFSET(index) ((u64)(index) << VFIO_PCI_OFFSET_SHIFT)
#define VFIO_PCI_OFFSET_MASK    (((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1)

#define OPREGION_SIGNATURE "IntelGraphicsMem"

struct vfio_region;
struct intel_vgpu_regops {
	size_t (*rw)(struct intel_vgpu *vgpu, char *buf,
			size_t count, loff_t *ppos, bool iswrite);
	void (*release)(struct intel_vgpu *vgpu,
			struct vfio_region *region);
};

struct vfio_region {
	u32				type;
	u32				subtype;
	size_t				size;
	u32				flags;
	const struct intel_vgpu_regops	*ops;
	void				*data;
};

struct kvmgt_pgfn {
	gfn_t gfn;
	struct hlist_node hnode;
};

struct kvmgt_guest_info {
	struct kvm *kvm;
	struct intel_vgpu *vgpu;
	struct kvm_page_track_notifier_node track_node;
#define NR_BKT (1 << 18)
	struct hlist_head ptable[NR_BKT];
#undef NR_BKT
	struct dentry *debugfs_cache_entries;
};

struct gvt_dma {
	struct intel_vgpu *vgpu;
	struct rb_node gfn_node;
	struct rb_node dma_addr_node;
	gfn_t gfn;
	dma_addr_t dma_addr;
	unsigned long size;
	struct kref ref;
};

static inline bool handle_valid(unsigned long handle)
{
	return !!(handle & ~0xff);
}

static int kvmgt_guest_init(struct mdev_device *mdev);
static void intel_vgpu_release_work(struct work_struct *work);
static bool kvmgt_guest_exit(struct kvmgt_guest_info *info);
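
/*
 * Guest page pinning helpers: gvt_pin_guest_page() pins the guest pages
 * backing a gfn range through VFIO one page at a time and checks that the
 * resulting pfns are contiguous, so the range can later be handed to
 * dma_map_page() as a single (possibly compound) page.
 * gvt_unpin_guest_page() drops those pins page by page.
 */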
static void gvt_unpin_guest_page(struct intel_vgpu *vgpu, unsigned long gfn,
		unsigned long size)
{
	int total_pages;
	int npage;
	int ret;

	total_pages = roundup(size, PAGE_SIZE) / PAGE_SIZE;

	for (npage = 0; npage < total_pages; npage++) {
		unsigned long cur_gfn = gfn + npage;

		ret = vfio_unpin_pages(mdev_dev(vgpu->vdev.mdev), &cur_gfn, 1);
		WARN_ON(ret != 1);
	}
}

/* Pin a normal or compound guest page for dma. */
static int gvt_pin_guest_page(struct intel_vgpu *vgpu, unsigned long gfn,
		unsigned long size, struct page **page)
{
	unsigned long base_pfn = 0;
	int total_pages;
	int npage;
	int ret;

	total_pages = roundup(size, PAGE_SIZE) / PAGE_SIZE;
	/*
	 * We pin the pages one-by-one to avoid allocating a big array
	 * on stack to hold pfns.
	 */
	for (npage = 0; npage < total_pages; npage++) {
		unsigned long cur_gfn = gfn + npage;
		unsigned long pfn;

		ret = vfio_pin_pages(mdev_dev(vgpu->vdev.mdev), &cur_gfn, 1,
				     IOMMU_READ | IOMMU_WRITE, &pfn);
		if (ret != 1) {
			gvt_vgpu_err("vfio_pin_pages failed for gfn 0x%lx, ret %d\n",
				     cur_gfn, ret);
			goto err;
		}

		if (!pfn_valid(pfn)) {
			gvt_vgpu_err("pfn 0x%lx is not mem backed\n", pfn);
			npage++;
			ret = -EFAULT;
			goto err;
		}

		if (npage == 0)
			base_pfn = pfn;
		else if (base_pfn + npage != pfn) {
			gvt_vgpu_err("The pages are not contiguous\n");
			ret = -EINVAL;
			npage++;
			goto err;
		}
	}

	*page = pfn_to_page(base_pfn);
	return 0;
err:
	gvt_unpin_guest_page(vgpu, gfn, npage * PAGE_SIZE);
	return ret;
}

static int gvt_dma_map_page(struct intel_vgpu *vgpu, unsigned long gfn,
		dma_addr_t *dma_addr, unsigned long size)
{
	struct device *dev = &vgpu->gvt->dev_priv->drm.pdev->dev;
	struct page *page = NULL;
	int ret;

	ret = gvt_pin_guest_page(vgpu, gfn, size, &page);
	if (ret)
		return ret;

	/* Setup DMA mapping. */
	*dma_addr = dma_map_page(dev, page, 0, size, PCI_DMA_BIDIRECTIONAL);
	if (dma_mapping_error(dev, *dma_addr)) {
		gvt_vgpu_err("DMA mapping failed for pfn 0x%lx\n",
			     page_to_pfn(page));
		gvt_unpin_guest_page(vgpu, gfn, size);
		return -ENOMEM;
	}

	return 0;
}

static void gvt_dma_unmap_page(struct intel_vgpu *vgpu, unsigned long gfn,
		dma_addr_t dma_addr, unsigned long size)
{
	struct device *dev = &vgpu->gvt->dev_priv->drm.pdev->dev;

	dma_unmap_page(dev, dma_addr, size, PCI_DMA_BIDIRECTIONAL);
	gvt_unpin_guest_page(vgpu, gfn, size);
}

static struct gvt_dma *__gvt_cache_find_dma_addr(struct intel_vgpu *vgpu,
		dma_addr_t dma_addr)
{
	struct rb_node *node = vgpu->vdev.dma_addr_cache.rb_node;
	struct gvt_dma *itr;

	while (node) {
		itr = rb_entry(node, struct gvt_dma, dma_addr_node);

		if (dma_addr < itr->dma_addr)
			node = node->rb_left;
		else if (dma_addr > itr->dma_addr)
			node = node->rb_right;
		else
			return itr;
	}
	return NULL;
}

static struct gvt_dma *__gvt_cache_find_gfn(struct intel_vgpu *vgpu, gfn_t gfn)
{
	struct rb_node *node = vgpu->vdev.gfn_cache.rb_node;
	struct gvt_dma *itr;

	while (node) {
		itr = rb_entry(node, struct gvt_dma, gfn_node);

		if (gfn < itr->gfn)
			node = node->rb_left;
		else if (gfn > itr->gfn)
			node = node->rb_right;
		else
			return itr;
	}
	return NULL;
}

static int __gvt_cache_add(struct intel_vgpu *vgpu, gfn_t gfn,
		dma_addr_t dma_addr, unsigned long size)
{
	struct gvt_dma *new, *itr;
	struct rb_node **link, *parent = NULL;

	new = kzalloc(sizeof(struct gvt_dma), GFP_KERNEL);
	if (!new)
		return -ENOMEM;

	new->vgpu = vgpu;
	new->gfn = gfn;
	new->dma_addr = dma_addr;
	new->size = size;
	kref_init(&new->ref);

	/* gfn_cache maps gfn to struct gvt_dma. */
	link = &vgpu->vdev.gfn_cache.rb_node;
	while (*link) {
		parent = *link;
		itr = rb_entry(parent, struct gvt_dma, gfn_node);

		if (gfn < itr->gfn)
			link = &parent->rb_left;
		else
			link = &parent->rb_right;
	}
	rb_link_node(&new->gfn_node, parent, link);
	rb_insert_color(&new->gfn_node, &vgpu->vdev.gfn_cache);

	/* dma_addr_cache maps dma addr to struct gvt_dma. */
	parent = NULL;
	link = &vgpu->vdev.dma_addr_cache.rb_node;
	while (*link) {
		parent = *link;
		itr = rb_entry(parent, struct gvt_dma, dma_addr_node);

		if (dma_addr < itr->dma_addr)
			link = &parent->rb_left;
		else
			link = &parent->rb_right;
	}
	rb_link_node(&new->dma_addr_node, parent, link);
	rb_insert_color(&new->dma_addr_node, &vgpu->vdev.dma_addr_cache);

	vgpu->vdev.nr_cache_entries++;
	return 0;
}

static void __gvt_cache_remove_entry(struct intel_vgpu *vgpu,
		struct gvt_dma *entry)
{
	rb_erase(&entry->gfn_node, &vgpu->vdev.gfn_cache);
	rb_erase(&entry->dma_addr_node, &vgpu->vdev.dma_addr_cache);
	kfree(entry);
	vgpu->vdev.nr_cache_entries--;
}

static void gvt_cache_destroy(struct intel_vgpu *vgpu)
{
	struct gvt_dma *dma;
	struct rb_node *node = NULL;

	for (;;) {
		mutex_lock(&vgpu->vdev.cache_lock);
		node = rb_first(&vgpu->vdev.gfn_cache);
		if (!node) {
			mutex_unlock(&vgpu->vdev.cache_lock);
			break;
		}
		dma = rb_entry(node, struct gvt_dma, gfn_node);
		gvt_dma_unmap_page(vgpu, dma->gfn, dma->dma_addr, dma->size);
		__gvt_cache_remove_entry(vgpu, dma);
		mutex_unlock(&vgpu->vdev.cache_lock);
	}
}

static void gvt_cache_init(struct intel_vgpu *vgpu)
{
	vgpu->vdev.gfn_cache = RB_ROOT;
	vgpu->vdev.dma_addr_cache = RB_ROOT;
	vgpu->vdev.nr_cache_entries = 0;
	mutex_init(&vgpu->vdev.cache_lock);
}

static void kvmgt_protect_table_init(struct kvmgt_guest_info *info)
{
	hash_init(info->ptable);
}

static void kvmgt_protect_table_destroy(struct kvmgt_guest_info *info)
{
	struct kvmgt_pgfn *p;
	struct hlist_node *tmp;
	int i;

	hash_for_each_safe(info->ptable, i, tmp, p, hnode) {
		hash_del(&p->hnode);
		kfree(p);
	}
}

static struct kvmgt_pgfn *
__kvmgt_protect_table_find(struct kvmgt_guest_info *info, gfn_t gfn)
{
	struct kvmgt_pgfn *p, *res = NULL;

	hash_for_each_possible(info->ptable, p, hnode, gfn) {
		if (gfn == p->gfn) {
			res = p;
			break;
		}
	}

	return res;
}

static bool kvmgt_gfn_is_write_protected(struct kvmgt_guest_info *info,
				gfn_t gfn)
{
	struct kvmgt_pgfn *p;

	p = __kvmgt_protect_table_find(info, gfn);
	return !!p;
}

static void kvmgt_protect_table_add(struct kvmgt_guest_info *info, gfn_t gfn)
{
	struct kvmgt_pgfn *p;

	if (kvmgt_gfn_is_write_protected(info, gfn))
		return;

	p = kzalloc(sizeof(struct kvmgt_pgfn), GFP_ATOMIC);
	if (WARN(!p, "gfn: 0x%llx\n", gfn))
		return;

	p->gfn = gfn;
	hash_add(info->ptable, &p->hnode, gfn);
}

static void kvmgt_protect_table_del(struct kvmgt_guest_info *info,
				gfn_t gfn)
{
	struct kvmgt_pgfn *p;

	p = __kvmgt_protect_table_find(info, gfn);
	if (p) {
		hash_del(&p->hnode);
		kfree(p);
	}
}

static size_t intel_vgpu_reg_rw_opregion(struct intel_vgpu *vgpu, char *buf,
		size_t count, loff_t *ppos, bool iswrite)
{
	unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) -
			VFIO_PCI_NUM_REGIONS;
	void *base = vgpu->vdev.region[i].data;
	loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;

	if (pos >= vgpu->vdev.region[i].size || iswrite) {
		gvt_vgpu_err("invalid op or offset for Intel vgpu OpRegion\n");
		return -EINVAL;
	}
	count = min(count, (size_t)(vgpu->vdev.region[i].size - pos));
	memcpy(buf, base + pos, count);

	return count;
}
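
/*
 * The OpRegion region only borrows the vGPU's OpRegion mapping as its
 * backing data, so there is nothing to undo when the region is released.
 */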
static void intel_vgpu_reg_release_opregion(struct intel_vgpu *vgpu,
		struct vfio_region *region)
{
}

static const struct intel_vgpu_regops intel_vgpu_regops_opregion = {
	.rw = intel_vgpu_reg_rw_opregion,
	.release = intel_vgpu_reg_release_opregion,
};

static int intel_vgpu_register_reg(struct intel_vgpu *vgpu,
		unsigned int type, unsigned int subtype,
		const struct intel_vgpu_regops *ops,
		size_t size, u32 flags, void *data)
{
	struct vfio_region *region;

	region = krealloc(vgpu->vdev.region,
			(vgpu->vdev.num_regions + 1) * sizeof(*region),
			GFP_KERNEL);
	if (!region)
		return -ENOMEM;

	vgpu->vdev.region = region;
	vgpu->vdev.region[vgpu->vdev.num_regions].type = type;
	vgpu->vdev.region[vgpu->vdev.num_regions].subtype = subtype;
	vgpu->vdev.region[vgpu->vdev.num_regions].ops = ops;
	vgpu->vdev.region[vgpu->vdev.num_regions].size = size;
	vgpu->vdev.region[vgpu->vdev.num_regions].flags = flags;
	vgpu->vdev.region[vgpu->vdev.num_regions].data = data;
	vgpu->vdev.num_regions++;
	return 0;
}

static int kvmgt_get_vfio_device(void *p_vgpu)
{
	struct intel_vgpu *vgpu = (struct intel_vgpu *)p_vgpu;

	vgpu->vdev.vfio_device = vfio_device_get_from_dev(
		mdev_dev(vgpu->vdev.mdev));
	if (!vgpu->vdev.vfio_device) {
		gvt_vgpu_err("failed to get vfio device\n");
		return -ENODEV;
	}
	return 0;
}

static int kvmgt_set_opregion(void *p_vgpu)
{
	struct intel_vgpu *vgpu = (struct intel_vgpu *)p_vgpu;
	void *base;
	int ret;

	/* Each vgpu has its own opregion, although VFIO would create another
	 * one later. This one is used to expose the opregion to VFIO, while
	 * the one created by VFIO later is the one the guest actually uses.
	 */
	base = vgpu_opregion(vgpu)->va;
	if (!base)
		return -ENOMEM;

	if (memcmp(base, OPREGION_SIGNATURE, 16)) {
		memunmap(base);
		return -EINVAL;
	}

	ret = intel_vgpu_register_reg(vgpu,
			PCI_VENDOR_ID_INTEL | VFIO_REGION_TYPE_PCI_VENDOR_TYPE,
			VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION,
			&intel_vgpu_regops_opregion, OPREGION_SIZE,
			VFIO_REGION_INFO_FLAG_READ, base);

	return ret;
}

static void kvmgt_put_vfio_device(void *vgpu)
{
	if (WARN_ON(!((struct intel_vgpu *)vgpu)->vdev.vfio_device))
		return;

	vfio_device_put(((struct intel_vgpu *)vgpu)->vdev.vfio_device);
}

static int intel_vgpu_create(struct kobject *kobj, struct mdev_device *mdev)
{
	struct intel_vgpu *vgpu = NULL;
	struct intel_vgpu_type *type;
	struct device *pdev;
	void *gvt;
	int ret;

	pdev = mdev_parent_dev(mdev);
	gvt = kdev_to_i915(pdev)->gvt;

	type = intel_gvt_ops->gvt_find_vgpu_type(gvt, kobject_name(kobj));
	if (!type) {
		gvt_vgpu_err("failed to find type %s to create\n",
						kobject_name(kobj));
		ret = -EINVAL;
		goto out;
	}

	vgpu = intel_gvt_ops->vgpu_create(gvt, type);
	if (IS_ERR_OR_NULL(vgpu)) {
		ret = vgpu == NULL ? -EFAULT : PTR_ERR(vgpu);
		gvt_err("failed to create intel vgpu: %d\n", ret);
		goto out;
	}

	INIT_WORK(&vgpu->vdev.release_work, intel_vgpu_release_work);

	vgpu->vdev.mdev = mdev;
	mdev_set_drvdata(mdev, vgpu);

	gvt_dbg_core("intel_vgpu_create succeeded for mdev: %s\n",
		     dev_name(mdev_dev(mdev)));
	ret = 0;

out:
	return ret;
}

static int intel_vgpu_remove(struct mdev_device *mdev)
{
	struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);

	if (handle_valid(vgpu->handle))
		return -EBUSY;

	intel_gvt_ops->vgpu_destroy(vgpu);
	return 0;
}

static int intel_vgpu_iommu_notifier(struct notifier_block *nb,
				     unsigned long action, void *data)
{
	struct intel_vgpu *vgpu = container_of(nb,
					struct intel_vgpu,
					vdev.iommu_notifier);

	if (action == VFIO_IOMMU_NOTIFY_DMA_UNMAP) {
		struct vfio_iommu_type1_dma_unmap *unmap = data;
		struct gvt_dma *entry;
		unsigned long iov_pfn, end_iov_pfn;

		iov_pfn = unmap->iova >> PAGE_SHIFT;
		end_iov_pfn = iov_pfn + unmap->size / PAGE_SIZE;

		mutex_lock(&vgpu->vdev.cache_lock);
		for (; iov_pfn < end_iov_pfn; iov_pfn++) {
			entry = __gvt_cache_find_gfn(vgpu, iov_pfn);
			if (!entry)
				continue;

			gvt_dma_unmap_page(vgpu, entry->gfn, entry->dma_addr,
					   entry->size);
			__gvt_cache_remove_entry(vgpu, entry);
		}
		mutex_unlock(&vgpu->vdev.cache_lock);
	}

	return NOTIFY_OK;
}

static int intel_vgpu_group_notifier(struct notifier_block *nb,
				     unsigned long action, void *data)
{
	struct intel_vgpu *vgpu = container_of(nb,
					struct intel_vgpu,
					vdev.group_notifier);

	/* the only action we care about */
	if (action == VFIO_GROUP_NOTIFY_SET_KVM) {
		vgpu->vdev.kvm = data;

		if (!data)
			schedule_work(&vgpu->vdev.release_work);
	}

	return NOTIFY_OK;
}

static int intel_vgpu_open(struct mdev_device *mdev)
{
	struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
	unsigned long events;
	int ret;

	vgpu->vdev.iommu_notifier.notifier_call = intel_vgpu_iommu_notifier;
	vgpu->vdev.group_notifier.notifier_call = intel_vgpu_group_notifier;

	events = VFIO_IOMMU_NOTIFY_DMA_UNMAP;
	ret = vfio_register_notifier(mdev_dev(mdev), VFIO_IOMMU_NOTIFY, &events,
				&vgpu->vdev.iommu_notifier);
	if (ret != 0) {
		gvt_vgpu_err("vfio_register_notifier for iommu failed: %d\n",
			ret);
		goto out;
	}

	events = VFIO_GROUP_NOTIFY_SET_KVM;
	ret = vfio_register_notifier(mdev_dev(mdev), VFIO_GROUP_NOTIFY, &events,
				&vgpu->vdev.group_notifier);
	if (ret != 0) {
		gvt_vgpu_err("vfio_register_notifier for group failed: %d\n",
			ret);
		goto undo_iommu;
	}

	ret = kvmgt_guest_init(mdev);
	if (ret)
		goto undo_group;

	intel_gvt_ops->vgpu_activate(vgpu);

	atomic_set(&vgpu->vdev.released, 0);
	return ret;

undo_group:
	vfio_unregister_notifier(mdev_dev(mdev), VFIO_GROUP_NOTIFY,
					&vgpu->vdev.group_notifier);

undo_iommu:
	vfio_unregister_notifier(mdev_dev(mdev), VFIO_IOMMU_NOTIFY,
					&vgpu->vdev.iommu_notifier);
out:
	return ret;
}

static void intel_vgpu_release_msi_eventfd_ctx(struct intel_vgpu *vgpu)
{
	struct eventfd_ctx *trigger;

	trigger = vgpu->vdev.msi_trigger;
	if (trigger) {
		eventfd_ctx_put(trigger);
		vgpu->vdev.msi_trigger = NULL;
	}
}
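
/*
 * Common teardown path, shared by the mdev release callback and the
 * deferred release work scheduled from the group notifier. The "released"
 * flag ensures it runs only once; it unregisters the VFIO notifiers,
 * detaches the vGPU from the KVM guest and drops the MSI eventfd.
 */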
static void __intel_vgpu_release(struct intel_vgpu *vgpu)
{
	struct kvmgt_guest_info *info;
	int ret;

	if (!handle_valid(vgpu->handle))
		return;

	if (atomic_cmpxchg(&vgpu->vdev.released, 0, 1))
		return;

	intel_gvt_ops->vgpu_release(vgpu);

	ret = vfio_unregister_notifier(mdev_dev(vgpu->vdev.mdev), VFIO_IOMMU_NOTIFY,
					&vgpu->vdev.iommu_notifier);
	WARN(ret, "vfio_unregister_notifier for iommu failed: %d\n", ret);

	ret = vfio_unregister_notifier(mdev_dev(vgpu->vdev.mdev), VFIO_GROUP_NOTIFY,
					&vgpu->vdev.group_notifier);
	WARN(ret, "vfio_unregister_notifier for group failed: %d\n", ret);

	info = (struct kvmgt_guest_info *)vgpu->handle;
	kvmgt_guest_exit(info);

	intel_vgpu_release_msi_eventfd_ctx(vgpu);

	vgpu->vdev.kvm = NULL;
	vgpu->handle = 0;
}

static void intel_vgpu_release(struct mdev_device *mdev)
{
	struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);

	__intel_vgpu_release(vgpu);
}

static void intel_vgpu_release_work(struct work_struct *work)
{
	struct intel_vgpu *vgpu = container_of(work, struct intel_vgpu,
					vdev.release_work);

	__intel_vgpu_release(vgpu);
}

static uint64_t intel_vgpu_get_bar_addr(struct intel_vgpu *vgpu, int bar)
{
	u32 start_lo, start_hi;
	u32 mem_type;

	start_lo = (*(u32 *)(vgpu->cfg_space.virtual_cfg_space + bar)) &
			PCI_BASE_ADDRESS_MEM_MASK;
	mem_type = (*(u32 *)(vgpu->cfg_space.virtual_cfg_space + bar)) &
			PCI_BASE_ADDRESS_MEM_TYPE_MASK;

	switch (mem_type) {
	case PCI_BASE_ADDRESS_MEM_TYPE_64:
		start_hi = (*(u32 *)(vgpu->cfg_space.virtual_cfg_space
						+ bar + 4));
		break;
	case PCI_BASE_ADDRESS_MEM_TYPE_32:
	case PCI_BASE_ADDRESS_MEM_TYPE_1M:
		/* 1M mem BAR treated as 32-bit BAR */
	default:
		/* mem unknown type treated as 32-bit BAR */
		start_hi = 0;
		break;
	}

	return ((u64)start_hi << 32) | start_lo;
}

static int intel_vgpu_bar_rw(struct intel_vgpu *vgpu, int bar, uint64_t off,
			     void *buf, unsigned int count, bool is_write)
{
	uint64_t bar_start = intel_vgpu_get_bar_addr(vgpu, bar);
	int ret;

	if (is_write)
		ret = intel_gvt_ops->emulate_mmio_write(vgpu,
					bar_start + off, buf, count);
	else
		ret = intel_gvt_ops->emulate_mmio_read(vgpu,
					bar_start + off, buf, count);
	return ret;
}

static inline bool intel_vgpu_in_aperture(struct intel_vgpu *vgpu, uint64_t off)
{
	return off >= vgpu_aperture_offset(vgpu) &&
	       off < vgpu_aperture_offset(vgpu) + vgpu_aperture_sz(vgpu);
}

static int intel_vgpu_aperture_rw(struct intel_vgpu *vgpu, uint64_t off,
		void *buf, unsigned long count, bool is_write)
{
	void *aperture_va;

	if (!intel_vgpu_in_aperture(vgpu, off) ||
	    !intel_vgpu_in_aperture(vgpu, off + count)) {
		gvt_vgpu_err("Invalid aperture offset %llu\n", off);
		return -EINVAL;
	}

	aperture_va = io_mapping_map_wc(&vgpu->gvt->dev_priv->ggtt.iomap,
					ALIGN_DOWN(off, PAGE_SIZE),
					count + offset_in_page(off));
	if (!aperture_va)
		return -EIO;

	if (is_write)
		memcpy(aperture_va + offset_in_page(off), buf, count);
	else
		memcpy(buf, aperture_va + offset_in_page(off), count);

	io_mapping_unmap(aperture_va);

	return 0;
}
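
/*
 * Central read/write dispatcher: the VFIO offset encodes the region index
 * in its high bits, which selects between emulated PCI config space,
 * emulated BAR0 MMIO, direct BAR2 aperture access and any extra
 * device-specific regions registered via intel_vgpu_register_reg().
 */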
static ssize_t intel_vgpu_rw(struct mdev_device *mdev, char *buf,
			size_t count, loff_t *ppos, bool is_write)
{
	struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
	uint64_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
	int ret = -EINVAL;

	if (index >= VFIO_PCI_NUM_REGIONS + vgpu->vdev.num_regions) {
		gvt_vgpu_err("invalid index: %u\n", index);
		return -EINVAL;
	}

	switch (index) {
	case VFIO_PCI_CONFIG_REGION_INDEX:
		if (is_write)
			ret = intel_gvt_ops->emulate_cfg_write(vgpu, pos,
						buf, count);
		else
			ret = intel_gvt_ops->emulate_cfg_read(vgpu, pos,
						buf, count);
		break;
	case VFIO_PCI_BAR0_REGION_INDEX:
		ret = intel_vgpu_bar_rw(vgpu, PCI_BASE_ADDRESS_0, pos,
					buf, count, is_write);
		break;
	case VFIO_PCI_BAR2_REGION_INDEX:
		ret = intel_vgpu_aperture_rw(vgpu, pos, buf, count, is_write);
		break;
	case VFIO_PCI_BAR1_REGION_INDEX:
	case VFIO_PCI_BAR3_REGION_INDEX:
	case VFIO_PCI_BAR4_REGION_INDEX:
	case VFIO_PCI_BAR5_REGION_INDEX:
	case VFIO_PCI_VGA_REGION_INDEX:
	case VFIO_PCI_ROM_REGION_INDEX:
		break;
	default:
		if (index >= VFIO_PCI_NUM_REGIONS + vgpu->vdev.num_regions)
			return -EINVAL;

		index -= VFIO_PCI_NUM_REGIONS;
		return vgpu->vdev.region[index].ops->rw(vgpu, buf, count,
				ppos, is_write);
	}

	return ret == 0 ? count : ret;
}

static bool gtt_entry(struct mdev_device *mdev, loff_t *ppos)
{
	struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
	struct intel_gvt *gvt = vgpu->gvt;
	int offset;

	/* Only allow MMIO GGTT entry access */
	if (index != PCI_BASE_ADDRESS_0)
		return false;

	offset = (u64)(*ppos & VFIO_PCI_OFFSET_MASK) -
		intel_vgpu_get_bar_gpa(vgpu, PCI_BASE_ADDRESS_0);

	return (offset >= gvt->device_info.gtt_start_offset &&
		offset < gvt->device_info.gtt_start_offset + gvt_ggtt_sz(gvt)) ?
			true : false;
}
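
/*
 * Userspace reads and writes are split into naturally aligned 1/2/4 byte
 * accesses; the only 8-byte access supported is an aligned read or write
 * of a GGTT entry inside BAR0, detected by gtt_entry() above.
 */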
static ssize_t intel_vgpu_read(struct mdev_device *mdev, char __user *buf,
			size_t count, loff_t *ppos)
{
	unsigned int done = 0;
	int ret;

	while (count) {
		size_t filled;

		/* Only support GGTT entry 8 bytes read */
		if (count >= 8 && !(*ppos % 8) &&
			gtt_entry(mdev, ppos)) {
			u64 val;

			ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val),
					ppos, false);
			if (ret <= 0)
				goto read_err;

			if (copy_to_user(buf, &val, sizeof(val)))
				goto read_err;

			filled = 8;
		} else if (count >= 4 && !(*ppos % 4)) {
			u32 val;

			ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val),
					ppos, false);
			if (ret <= 0)
				goto read_err;

			if (copy_to_user(buf, &val, sizeof(val)))
				goto read_err;

			filled = 4;
		} else if (count >= 2 && !(*ppos % 2)) {
			u16 val;

			ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val),
					ppos, false);
			if (ret <= 0)
				goto read_err;

			if (copy_to_user(buf, &val, sizeof(val)))
				goto read_err;

			filled = 2;
		} else {
			u8 val;

			ret = intel_vgpu_rw(mdev, &val, sizeof(val), ppos,
					false);
			if (ret <= 0)
				goto read_err;

			if (copy_to_user(buf, &val, sizeof(val)))
				goto read_err;

			filled = 1;
		}

		count -= filled;
		done += filled;
		*ppos += filled;
		buf += filled;
	}

	return done;

read_err:
	return -EFAULT;
}

static ssize_t intel_vgpu_write(struct mdev_device *mdev,
				const char __user *buf,
				size_t count, loff_t *ppos)
{
	unsigned int done = 0;
	int ret;

	while (count) {
		size_t filled;

		/* Only support GGTT entry 8 bytes write */
		if (count >= 8 && !(*ppos % 8) &&
			gtt_entry(mdev, ppos)) {
			u64 val;

			if (copy_from_user(&val, buf, sizeof(val)))
				goto write_err;

			ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val),
					ppos, true);
			if (ret <= 0)
				goto write_err;

			filled = 8;
		} else if (count >= 4 && !(*ppos % 4)) {
			u32 val;

			if (copy_from_user(&val, buf, sizeof(val)))
				goto write_err;

			ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val),
					ppos, true);
			if (ret <= 0)
				goto write_err;

			filled = 4;
		} else if (count >= 2 && !(*ppos % 2)) {
			u16 val;

			if (copy_from_user(&val, buf, sizeof(val)))
				goto write_err;

			ret = intel_vgpu_rw(mdev, (char *)&val,
					sizeof(val), ppos, true);
			if (ret <= 0)
				goto write_err;

			filled = 2;
		} else {
			u8 val;

			if (copy_from_user(&val, buf, sizeof(val)))
				goto write_err;

			ret = intel_vgpu_rw(mdev, &val, sizeof(val),
					ppos, true);
			if (ret <= 0)
				goto write_err;

			filled = 1;
		}

		count -= filled;
		done += filled;
		*ppos += filled;
		buf += filled;
	}

	return done;
write_err:
	return -EFAULT;
}

static int intel_vgpu_mmap(struct mdev_device *mdev, struct vm_area_struct *vma)
{
	unsigned int index;
	u64 virtaddr;
	unsigned long req_size, pgoff = 0;
	pgprot_t pg_prot;
	struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);

	index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
	if (index >= VFIO_PCI_ROM_REGION_INDEX)
		return -EINVAL;

	if (vma->vm_end < vma->vm_start)
		return -EINVAL;
	if ((vma->vm_flags & VM_SHARED) == 0)
		return -EINVAL;
	if (index != VFIO_PCI_BAR2_REGION_INDEX)
		return -EINVAL;

	pg_prot = vma->vm_page_prot;
	virtaddr = vma->vm_start;
	req_size = vma->vm_end - vma->vm_start;
	pgoff = vgpu_aperture_pa_base(vgpu) >> PAGE_SHIFT;

	return remap_pfn_range(vma, virtaddr, pgoff, req_size, pg_prot);
}
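
/*
 * Interrupt support is limited to a single INTx line and a single MSI
 * vector. INTx mask/unmask/trigger requests are accepted but ignored;
 * the MSI trigger eventfd is stored here and later signalled by
 * kvmgt_inject_msi().
 */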
static int intel_vgpu_get_irq_count(struct intel_vgpu *vgpu, int type)
{
	if (type == VFIO_PCI_INTX_IRQ_INDEX || type == VFIO_PCI_MSI_IRQ_INDEX)
		return 1;

	return 0;
}

static int intel_vgpu_set_intx_mask(struct intel_vgpu *vgpu,
			unsigned int index, unsigned int start,
			unsigned int count, uint32_t flags,
			void *data)
{
	return 0;
}

static int intel_vgpu_set_intx_unmask(struct intel_vgpu *vgpu,
			unsigned int index, unsigned int start,
			unsigned int count, uint32_t flags, void *data)
{
	return 0;
}

static int intel_vgpu_set_intx_trigger(struct intel_vgpu *vgpu,
		unsigned int index, unsigned int start, unsigned int count,
		uint32_t flags, void *data)
{
	return 0;
}

static int intel_vgpu_set_msi_trigger(struct intel_vgpu *vgpu,
		unsigned int index, unsigned int start, unsigned int count,
		uint32_t flags, void *data)
{
	struct eventfd_ctx *trigger;

	if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
		int fd = *(int *)data;

		trigger = eventfd_ctx_fdget(fd);
		if (IS_ERR(trigger)) {
			gvt_vgpu_err("eventfd_ctx_fdget failed\n");
			return PTR_ERR(trigger);
		}
		vgpu->vdev.msi_trigger = trigger;
	} else if ((flags & VFIO_IRQ_SET_DATA_NONE) && !count)
		intel_vgpu_release_msi_eventfd_ctx(vgpu);

	return 0;
}

static int intel_vgpu_set_irqs(struct intel_vgpu *vgpu, uint32_t flags,
		unsigned int index, unsigned int start, unsigned int count,
		void *data)
{
	int (*func)(struct intel_vgpu *vgpu, unsigned int index,
			unsigned int start, unsigned int count, uint32_t flags,
			void *data) = NULL;

	switch (index) {
	case VFIO_PCI_INTX_IRQ_INDEX:
		switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
		case VFIO_IRQ_SET_ACTION_MASK:
			func = intel_vgpu_set_intx_mask;
			break;
		case VFIO_IRQ_SET_ACTION_UNMASK:
			func = intel_vgpu_set_intx_unmask;
			break;
		case VFIO_IRQ_SET_ACTION_TRIGGER:
			func = intel_vgpu_set_intx_trigger;
			break;
		}
		break;
	case VFIO_PCI_MSI_IRQ_INDEX:
		switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
		case VFIO_IRQ_SET_ACTION_MASK:
		case VFIO_IRQ_SET_ACTION_UNMASK:
			/* XXX Need masking support exported */
			break;
		case VFIO_IRQ_SET_ACTION_TRIGGER:
			func = intel_vgpu_set_msi_trigger;
			break;
		}
		break;
	}

	if (!func)
		return -ENOTTY;

	return func(vgpu, index, start, count, flags, data);
}

static long intel_vgpu_ioctl(struct mdev_device *mdev, unsigned int cmd,
			     unsigned long arg)
{
	struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
	unsigned long minsz;

	gvt_dbg_core("vgpu%d ioctl, cmd: %d\n", vgpu->id, cmd);

	if (cmd == VFIO_DEVICE_GET_INFO) {
		struct vfio_device_info info;

		minsz = offsetofend(struct vfio_device_info, num_irqs);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;

		if (info.argsz < minsz)
			return -EINVAL;

		info.flags = VFIO_DEVICE_FLAGS_PCI;
		info.flags |= VFIO_DEVICE_FLAGS_RESET;
		info.num_regions = VFIO_PCI_NUM_REGIONS +
				vgpu->vdev.num_regions;
		info.num_irqs = VFIO_PCI_NUM_IRQS;

		return copy_to_user((void __user *)arg, &info, minsz) ?
			-EFAULT : 0;

	} else if (cmd == VFIO_DEVICE_GET_REGION_INFO) {
		struct vfio_region_info info;
		struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
		unsigned int i;
		int ret;
		struct vfio_region_info_cap_sparse_mmap *sparse = NULL;
		size_t size;
		int nr_areas = 1;
		int cap_type_id;

		minsz = offsetofend(struct vfio_region_info, offset);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;

		if (info.argsz < minsz)
			return -EINVAL;

		switch (info.index) {
		case VFIO_PCI_CONFIG_REGION_INDEX:
			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.size = vgpu->gvt->device_info.cfg_space_size;
			info.flags = VFIO_REGION_INFO_FLAG_READ |
				     VFIO_REGION_INFO_FLAG_WRITE;
			break;
		case VFIO_PCI_BAR0_REGION_INDEX:
			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.size = vgpu->cfg_space.bar[info.index].size;
			if (!info.size) {
				info.flags = 0;
				break;
			}

			info.flags = VFIO_REGION_INFO_FLAG_READ |
				     VFIO_REGION_INFO_FLAG_WRITE;
			break;
		case VFIO_PCI_BAR1_REGION_INDEX:
			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.size = 0;
			info.flags = 0;
			break;
		case VFIO_PCI_BAR2_REGION_INDEX:
			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.flags = VFIO_REGION_INFO_FLAG_CAPS |
					VFIO_REGION_INFO_FLAG_MMAP |
					VFIO_REGION_INFO_FLAG_READ |
					VFIO_REGION_INFO_FLAG_WRITE;
			info.size = gvt_aperture_sz(vgpu->gvt);

			size = sizeof(*sparse) +
					(nr_areas * sizeof(*sparse->areas));
			sparse = kzalloc(size, GFP_KERNEL);
			if (!sparse)
				return -ENOMEM;

			sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP;
			sparse->header.version = 1;
			sparse->nr_areas = nr_areas;
			cap_type_id = VFIO_REGION_INFO_CAP_SPARSE_MMAP;
			sparse->areas[0].offset =
					PAGE_ALIGN(vgpu_aperture_offset(vgpu));
			sparse->areas[0].size = vgpu_aperture_sz(vgpu);
			break;

		case VFIO_PCI_BAR3_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.size = 0;
			info.flags = 0;

			gvt_dbg_core("get region info bar:%d\n", info.index);
			break;

		case VFIO_PCI_ROM_REGION_INDEX:
		case VFIO_PCI_VGA_REGION_INDEX:
			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.size = 0;
			info.flags = 0;

			gvt_dbg_core("get region info index:%d\n", info.index);
			break;
		default:
			{
				struct vfio_region_info_cap_type cap_type = {
					.header.id = VFIO_REGION_INFO_CAP_TYPE,
					.header.version = 1 };

				if (info.index >= VFIO_PCI_NUM_REGIONS +
						vgpu->vdev.num_regions)
					return -EINVAL;
				info.index =
					array_index_nospec(info.index,
							VFIO_PCI_NUM_REGIONS +
							vgpu->vdev.num_regions);

				i = info.index - VFIO_PCI_NUM_REGIONS;

				info.offset =
					VFIO_PCI_INDEX_TO_OFFSET(info.index);
				info.size = vgpu->vdev.region[i].size;
				info.flags = vgpu->vdev.region[i].flags;

				cap_type.type = vgpu->vdev.region[i].type;
				cap_type.subtype = vgpu->vdev.region[i].subtype;

				ret = vfio_info_add_capability(&caps,
							&cap_type.header,
							sizeof(cap_type));
				if (ret)
					return ret;
			}
		}

		if ((info.flags & VFIO_REGION_INFO_FLAG_CAPS) && sparse) {
			switch (cap_type_id) {
			case VFIO_REGION_INFO_CAP_SPARSE_MMAP:
				ret = vfio_info_add_capability(&caps,
					&sparse->header, sizeof(*sparse) +
					(sparse->nr_areas *
						sizeof(*sparse->areas)));
				if (ret) {
					kfree(sparse);
					return ret;
				}
				break;
			default:
				kfree(sparse);
				return -EINVAL;
			}
		}

		if (caps.size) {
			info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
			if (info.argsz < sizeof(info) + caps.size) {
				info.argsz = sizeof(info) + caps.size;
				info.cap_offset = 0;
			} else {
				vfio_info_cap_shift(&caps, sizeof(info));
				if (copy_to_user((void __user *)arg +
						  sizeof(info), caps.buf,
						  caps.size)) {
					kfree(caps.buf);
					kfree(sparse);
					return -EFAULT;
				}
				info.cap_offset = sizeof(info);
			}

			kfree(caps.buf);
		}

		kfree(sparse);
		return copy_to_user((void __user *)arg, &info, minsz) ?
			-EFAULT : 0;
	} else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) {
		struct vfio_irq_info info;

		minsz = offsetofend(struct vfio_irq_info, count);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;

		if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS)
			return -EINVAL;

		switch (info.index) {
		case VFIO_PCI_INTX_IRQ_INDEX:
		case VFIO_PCI_MSI_IRQ_INDEX:
			break;
		default:
			return -EINVAL;
		}

		info.flags = VFIO_IRQ_INFO_EVENTFD;

		info.count = intel_vgpu_get_irq_count(vgpu, info.index);

		if (info.index == VFIO_PCI_INTX_IRQ_INDEX)
			info.flags |= (VFIO_IRQ_INFO_MASKABLE |
				       VFIO_IRQ_INFO_AUTOMASKED);
		else
			info.flags |= VFIO_IRQ_INFO_NORESIZE;

		return copy_to_user((void __user *)arg, &info, minsz) ?
			-EFAULT : 0;
	} else if (cmd == VFIO_DEVICE_SET_IRQS) {
		struct vfio_irq_set hdr;
		u8 *data = NULL;
		int ret = 0;
		size_t data_size = 0;

		minsz = offsetofend(struct vfio_irq_set, count);

		if (copy_from_user(&hdr, (void __user *)arg, minsz))
			return -EFAULT;

		if (!(hdr.flags & VFIO_IRQ_SET_DATA_NONE)) {
			int max = intel_vgpu_get_irq_count(vgpu, hdr.index);

			ret = vfio_set_irqs_validate_and_prepare(&hdr, max,
						VFIO_PCI_NUM_IRQS, &data_size);
			if (ret) {
				gvt_vgpu_err("intel:vfio_set_irqs_validate_and_prepare failed\n");
				return -EINVAL;
			}
			if (data_size) {
				data = memdup_user((void __user *)(arg + minsz),
						   data_size);
				if (IS_ERR(data))
					return PTR_ERR(data);
			}
		}

		ret = intel_vgpu_set_irqs(vgpu, hdr.flags, hdr.index,
					hdr.start, hdr.count, data);
		kfree(data);

		return ret;
	} else if (cmd == VFIO_DEVICE_RESET) {
		intel_gvt_ops->vgpu_reset(vgpu);
		return 0;
	} else if (cmd == VFIO_DEVICE_QUERY_GFX_PLANE) {
		struct vfio_device_gfx_plane_info dmabuf;
		int ret = 0;

		minsz = offsetofend(struct vfio_device_gfx_plane_info,
				    dmabuf_id);
		if (copy_from_user(&dmabuf, (void __user *)arg, minsz))
			return -EFAULT;
		if (dmabuf.argsz < minsz)
			return -EINVAL;

		ret = intel_gvt_ops->vgpu_query_plane(vgpu, &dmabuf);
		if (ret != 0)
			return ret;

		return copy_to_user((void __user *)arg, &dmabuf, minsz) ?
								-EFAULT : 0;
	} else if (cmd == VFIO_DEVICE_GET_GFX_DMABUF) {
		__u32 dmabuf_id;
		__s32 dmabuf_fd;

		if (get_user(dmabuf_id, (__u32 __user *)arg))
			return -EFAULT;

		dmabuf_fd = intel_gvt_ops->vgpu_get_dmabuf(vgpu, dmabuf_id);
		return dmabuf_fd;

	}

	return -ENOTTY;
}

static ssize_t
vgpu_id_show(struct device *dev, struct device_attribute *attr,
	     char *buf)
{
	struct mdev_device *mdev = mdev_from_dev(dev);

	if (mdev) {
		struct intel_vgpu *vgpu = (struct intel_vgpu *)
			mdev_get_drvdata(mdev);
		return sprintf(buf, "%d\n", vgpu->id);
	}
	return sprintf(buf, "\n");
}

static ssize_t
hw_id_show(struct device *dev, struct device_attribute *attr,
	   char *buf)
{
	struct mdev_device *mdev = mdev_from_dev(dev);

	if (mdev) {
		struct intel_vgpu *vgpu = (struct intel_vgpu *)
			mdev_get_drvdata(mdev);
		return sprintf(buf, "%u\n",
			       vgpu->submission.shadow_ctx->hw_id);
	}
	return sprintf(buf, "\n");
}

static DEVICE_ATTR_RO(vgpu_id);
static DEVICE_ATTR_RO(hw_id);

static struct attribute *intel_vgpu_attrs[] = {
	&dev_attr_vgpu_id.attr,
	&dev_attr_hw_id.attr,
	NULL
};

static const struct attribute_group intel_vgpu_group = {
	.name  = "intel_vgpu",
	.attrs = intel_vgpu_attrs,
};

static const struct attribute_group *intel_vgpu_groups[] = {
	&intel_vgpu_group,
	NULL,
};

static struct mdev_parent_ops intel_vgpu_ops = {
	.mdev_attr_groups	= intel_vgpu_groups,
	.create			= intel_vgpu_create,
	.remove			= intel_vgpu_remove,

	.open			= intel_vgpu_open,
	.release		= intel_vgpu_release,

	.read			= intel_vgpu_read,
	.write			= intel_vgpu_write,
	.mmap			= intel_vgpu_mmap,
	.ioctl			= intel_vgpu_ioctl,
};
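
/*
 * Register the GVT device as an mdev parent: the supported vGPU type
 * groups are supplied by the GVT core through
 * intel_gvt_ops->get_gvt_attrs() and plugged into intel_vgpu_ops before
 * the device is registered with the mdev framework.
 */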
static int kvmgt_host_init(struct device *dev, void *gvt, const void *ops)
{
	struct attribute **kvm_type_attrs;
	struct attribute_group **kvm_vgpu_type_groups;

	intel_gvt_ops = ops;
	if (!intel_gvt_ops->get_gvt_attrs(&kvm_type_attrs,
			&kvm_vgpu_type_groups))
		return -EFAULT;
	intel_vgpu_ops.supported_type_groups = kvm_vgpu_type_groups;

	return mdev_register_device(dev, &intel_vgpu_ops);
}

static void kvmgt_host_exit(struct device *dev, void *gvt)
{
	mdev_unregister_device(dev);
}

static int kvmgt_page_track_add(unsigned long handle, u64 gfn)
{
	struct kvmgt_guest_info *info;
	struct kvm *kvm;
	struct kvm_memory_slot *slot;
	int idx;

	if (!handle_valid(handle))
		return -ESRCH;

	info = (struct kvmgt_guest_info *)handle;
	kvm = info->kvm;

	idx = srcu_read_lock(&kvm->srcu);
	slot = gfn_to_memslot(kvm, gfn);
	if (!slot) {
		srcu_read_unlock(&kvm->srcu, idx);
		return -EINVAL;
	}

	spin_lock(&kvm->mmu_lock);

	if (kvmgt_gfn_is_write_protected(info, gfn))
		goto out;

	kvm_slot_page_track_add_page(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE);
	kvmgt_protect_table_add(info, gfn);

out:
	spin_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);
	return 0;
}

static int kvmgt_page_track_remove(unsigned long handle, u64 gfn)
{
	struct kvmgt_guest_info *info;
	struct kvm *kvm;
	struct kvm_memory_slot *slot;
	int idx;

	if (!handle_valid(handle))
		return 0;

	info = (struct kvmgt_guest_info *)handle;
	kvm = info->kvm;

	idx = srcu_read_lock(&kvm->srcu);
	slot = gfn_to_memslot(kvm, gfn);
	if (!slot) {
		srcu_read_unlock(&kvm->srcu, idx);
		return -EINVAL;
	}

	spin_lock(&kvm->mmu_lock);

	if (!kvmgt_gfn_is_write_protected(info, gfn))
		goto out;

	kvm_slot_page_track_remove_page(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE);
	kvmgt_protect_table_del(info, gfn);

out:
	spin_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);
	return 0;
}

static void kvmgt_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa,
		const u8 *val, int len,
		struct kvm_page_track_notifier_node *node)
{
	struct kvmgt_guest_info *info = container_of(node,
					struct kvmgt_guest_info, track_node);

	if (kvmgt_gfn_is_write_protected(info, gpa_to_gfn(gpa)))
		intel_gvt_ops->write_protect_handler(info->vgpu, gpa,
						     (void *)val, len);
}

static void kvmgt_page_track_flush_slot(struct kvm *kvm,
		struct kvm_memory_slot *slot,
		struct kvm_page_track_notifier_node *node)
{
	int i;
	gfn_t gfn;
	struct kvmgt_guest_info *info = container_of(node,
					struct kvmgt_guest_info, track_node);

	spin_lock(&kvm->mmu_lock);
	for (i = 0; i < slot->npages; i++) {
		gfn = slot->base_gfn + i;
		if (kvmgt_gfn_is_write_protected(info, gfn)) {
			kvm_slot_page_track_remove_page(kvm, slot, gfn,
						KVM_PAGE_TRACK_WRITE);
			kvmgt_protect_table_del(info, gfn);
		}
	}
	spin_unlock(&kvm->mmu_lock);
}

static bool __kvmgt_vgpu_exist(struct intel_vgpu *vgpu, struct kvm *kvm)
{
	struct intel_vgpu *itr;
	struct kvmgt_guest_info *info;
	int id;
	bool ret = false;

	mutex_lock(&vgpu->gvt->lock);
	for_each_active_vgpu(vgpu->gvt, itr, id) {
		if (!handle_valid(itr->handle))
			continue;

		info = (struct kvmgt_guest_info *)itr->handle;
		if (kvm && kvm == info->kvm) {
			ret = true;
			goto out;
		}
	}
out:
	mutex_unlock(&vgpu->gvt->lock);
	return ret;
}

static int kvmgt_guest_init(struct mdev_device *mdev)
{
	struct kvmgt_guest_info *info;
	struct intel_vgpu *vgpu;
	struct kvm *kvm;

	vgpu = mdev_get_drvdata(mdev);
	if (handle_valid(vgpu->handle))
		return -EEXIST;

	kvm = vgpu->vdev.kvm;
	if (!kvm || kvm->mm != current->mm) {
		gvt_vgpu_err("KVM is required to use Intel vGPU\n");
		return -ESRCH;
	}

	if (__kvmgt_vgpu_exist(vgpu, kvm))
		return -EEXIST;

	info = vzalloc(sizeof(struct kvmgt_guest_info));
	if (!info)
		return -ENOMEM;

	vgpu->handle = (unsigned long)info;
	info->vgpu = vgpu;
	info->kvm = kvm;
	kvm_get_kvm(info->kvm);

	kvmgt_protect_table_init(info);
	gvt_cache_init(vgpu);

	init_completion(&vgpu->vblank_done);

	info->track_node.track_write = kvmgt_page_track_write;
	info->track_node.track_flush_slot = kvmgt_page_track_flush_slot;
	kvm_page_track_register_notifier(kvm, &info->track_node);

	info->debugfs_cache_entries = debugfs_create_ulong(
						"kvmgt_nr_cache_entries",
						0444, vgpu->debugfs,
						&vgpu->vdev.nr_cache_entries);
	if (!info->debugfs_cache_entries)
		gvt_vgpu_err("Cannot create kvmgt debugfs entry\n");

	return 0;
}

static bool kvmgt_guest_exit(struct kvmgt_guest_info *info)
{
	debugfs_remove(info->debugfs_cache_entries);

	kvm_page_track_unregister_notifier(info->kvm, &info->track_node);
	kvm_put_kvm(info->kvm);
	kvmgt_protect_table_destroy(info);
	gvt_cache_destroy(info->vgpu);
	vfree(info);

	return true;
}

static int kvmgt_attach_vgpu(void *vgpu, unsigned long *handle)
{
	/* nothing to do here */
	return 0;
}

static void kvmgt_detach_vgpu(unsigned long handle)
{
	/* nothing to do here */
}

static int kvmgt_inject_msi(unsigned long handle, u32 addr, u16 data)
{
	struct kvmgt_guest_info *info;
	struct intel_vgpu *vgpu;

	if (!handle_valid(handle))
		return -ESRCH;

	info = (struct kvmgt_guest_info *)handle;
	vgpu = info->vgpu;

	/*
	 * When the guest powers off, msi_trigger is set to NULL, but the
	 * vgpu's config space and MMIO registers are not restored to their
	 * defaults. If this vgpu is reused by the next VM, its pipes may
	 * still be enabled, so once the vgpu becomes active again it will
	 * receive vblank interrupt injection requests. msi_trigger stays
	 * NULL until the guest enables MSI, so if msi_trigger is NULL,
	 * return success without injecting an interrupt into the guest.
	 */
	if (vgpu->vdev.msi_trigger == NULL)
		return 0;

	if (eventfd_signal(vgpu->vdev.msi_trigger, 1) == 1)
		return 0;

	return -EFAULT;
}

static unsigned long kvmgt_gfn_to_pfn(unsigned long handle, unsigned long gfn)
{
	struct kvmgt_guest_info *info;
	kvm_pfn_t pfn;

	if (!handle_valid(handle))
		return INTEL_GVT_INVALID_ADDR;

	info = (struct kvmgt_guest_info *)handle;

	pfn = gfn_to_pfn(info->kvm, gfn);
	if (is_error_noslot_pfn(pfn))
		return INTEL_GVT_INVALID_ADDR;

	return pfn;
}

int kvmgt_dma_map_guest_page(unsigned long handle, unsigned long gfn,
		unsigned long size, dma_addr_t *dma_addr)
{
	struct kvmgt_guest_info *info;
	struct intel_vgpu *vgpu;
	struct gvt_dma *entry;
	int ret;

	if (!handle_valid(handle))
		return -EINVAL;

	info = (struct kvmgt_guest_info *)handle;
	vgpu = info->vgpu;

	mutex_lock(&info->vgpu->vdev.cache_lock);

	entry = __gvt_cache_find_gfn(info->vgpu, gfn);
	if (!entry) {
		ret = gvt_dma_map_page(vgpu, gfn, dma_addr, size);
		if (ret)
			goto err_unlock;

		ret = __gvt_cache_add(info->vgpu, gfn, *dma_addr, size);
		if (ret)
			goto err_unmap;
	} else {
		kref_get(&entry->ref);
		*dma_addr = entry->dma_addr;
	}

	mutex_unlock(&info->vgpu->vdev.cache_lock);
	return 0;

err_unmap:
	gvt_dma_unmap_page(vgpu, gfn, *dma_addr, size);
err_unlock:
	mutex_unlock(&info->vgpu->vdev.cache_lock);
	return ret;
}

static void __gvt_dma_release(struct kref *ref)
{
	struct gvt_dma *entry = container_of(ref, typeof(*entry), ref);

	gvt_dma_unmap_page(entry->vgpu, entry->gfn, entry->dma_addr,
			   entry->size);
	__gvt_cache_remove_entry(entry->vgpu, entry);
}

void kvmgt_dma_unmap_guest_page(unsigned long handle, dma_addr_t dma_addr)
{
	struct kvmgt_guest_info *info;
	struct gvt_dma *entry;

	if (!handle_valid(handle))
		return;

	info = (struct kvmgt_guest_info *)handle;

	mutex_lock(&info->vgpu->vdev.cache_lock);
	entry = __gvt_cache_find_dma_addr(info->vgpu, dma_addr);
	if (entry)
		kref_put(&entry->ref, __gvt_dma_release);
	mutex_unlock(&info->vgpu->vdev.cache_lock);
}

static int kvmgt_rw_gpa(unsigned long handle, unsigned long gpa,
			void *buf, unsigned long len, bool write)
{
	struct kvmgt_guest_info *info;
	struct kvm *kvm;
	int idx, ret;
	bool kthread = current->mm == NULL;

	if (!handle_valid(handle))
		return -ESRCH;

	info = (struct kvmgt_guest_info *)handle;
	kvm = info->kvm;

	if (kthread) {
		if (!mmget_not_zero(kvm->mm))
			return -EFAULT;
		use_mm(kvm->mm);
	}

	idx = srcu_read_lock(&kvm->srcu);
	ret = write ? kvm_write_guest(kvm, gpa, buf, len) :
		      kvm_read_guest(kvm, gpa, buf, len);
	srcu_read_unlock(&kvm->srcu, idx);

	if (kthread) {
		unuse_mm(kvm->mm);
		mmput(kvm->mm);
	}

	return ret;
}
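
/*
 * Thin wrappers around kvmgt_rw_gpa(), exported to the GVT core through
 * the kvmgt_mpt hooks below for reading and writing guest physical memory.
 */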
static int kvmgt_read_gpa(unsigned long handle, unsigned long gpa,
			void *buf, unsigned long len)
{
	return kvmgt_rw_gpa(handle, gpa, buf, len, false);
}

static int kvmgt_write_gpa(unsigned long handle, unsigned long gpa,
			void *buf, unsigned long len)
{
	return kvmgt_rw_gpa(handle, gpa, buf, len, true);
}

static unsigned long kvmgt_virt_to_pfn(void *addr)
{
	return PFN_DOWN(__pa(addr));
}

static bool kvmgt_is_valid_gfn(unsigned long handle, unsigned long gfn)
{
	struct kvmgt_guest_info *info;
	struct kvm *kvm;
	int idx;
	bool ret;

	if (!handle_valid(handle))
		return false;

	info = (struct kvmgt_guest_info *)handle;
	kvm = info->kvm;

	idx = srcu_read_lock(&kvm->srcu);
	ret = kvm_is_visible_gfn(kvm, gfn);
	srcu_read_unlock(&kvm->srcu, idx);

	return ret;
}

struct intel_gvt_mpt kvmgt_mpt = {
	.host_init = kvmgt_host_init,
	.host_exit = kvmgt_host_exit,
	.attach_vgpu = kvmgt_attach_vgpu,
	.detach_vgpu = kvmgt_detach_vgpu,
	.inject_msi = kvmgt_inject_msi,
	.from_virt_to_mfn = kvmgt_virt_to_pfn,
	.enable_page_track = kvmgt_page_track_add,
	.disable_page_track = kvmgt_page_track_remove,
	.read_gpa = kvmgt_read_gpa,
	.write_gpa = kvmgt_write_gpa,
	.gfn_to_mfn = kvmgt_gfn_to_pfn,
	.dma_map_guest_page = kvmgt_dma_map_guest_page,
	.dma_unmap_guest_page = kvmgt_dma_unmap_guest_page,
	.set_opregion = kvmgt_set_opregion,
	.get_vfio_device = kvmgt_get_vfio_device,
	.put_vfio_device = kvmgt_put_vfio_device,
	.is_valid_gfn = kvmgt_is_valid_gfn,
};
EXPORT_SYMBOL_GPL(kvmgt_mpt);

static int __init kvmgt_init(void)
{
	return 0;
}

static void __exit kvmgt_exit(void)
{
}

module_init(kvmgt_init);
module_exit(kvmgt_exit);

MODULE_LICENSE("GPL and additional rights");
MODULE_AUTHOR("Intel Corporation");