// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright © 2015 Intel Corporation.
 *
 * Authors: David Woodhouse <dwmw2@infradead.org>
 */

#include <linux/intel-iommu.h>
#include <linux/mmu_notifier.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>
#include <linux/intel-svm.h>
#include <linux/rculist.h>
#include <linux/pci.h>
#include <linux/pci-ats.h>
#include <linux/dmar.h>
#include <linux/interrupt.h>
#include <linux/mm_types.h>
#include <linux/ioasid.h>
#include <asm/page.h>
#include <asm/fpu/api.h>

#include "pasid.h"

static irqreturn_t prq_event_thread(int irq, void *d);
static void intel_svm_drain_prq(struct device *dev, u32 pasid);

#define PRQ_ORDER 0

int intel_svm_enable_prq(struct intel_iommu *iommu)
{
	struct page *pages;
	int irq, ret;

	pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, PRQ_ORDER);
	if (!pages) {
		pr_warn("IOMMU: %s: Failed to allocate page request queue\n",
			iommu->name);
		return -ENOMEM;
	}
	iommu->prq = page_address(pages);

	irq = dmar_alloc_hwirq(DMAR_UNITS_SUPPORTED + iommu->seq_id, iommu->node, iommu);
	if (irq <= 0) {
		pr_err("IOMMU: %s: Failed to create IRQ vector for page request queue\n",
		       iommu->name);
		ret = -EINVAL;
err:
		free_pages((unsigned long)iommu->prq, PRQ_ORDER);
		iommu->prq = NULL;
		return ret;
	}
	iommu->pr_irq = irq;

	snprintf(iommu->prq_name, sizeof(iommu->prq_name), "dmar%d-prq", iommu->seq_id);

	ret = request_threaded_irq(irq, NULL, prq_event_thread, IRQF_ONESHOT,
				   iommu->prq_name, iommu);
	if (ret) {
		pr_err("IOMMU: %s: Failed to request IRQ for page request queue\n",
		       iommu->name);
		dmar_free_hwirq(irq);
		iommu->pr_irq = 0;
		goto err;
	}
	dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL);
	dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL);
	dmar_writeq(iommu->reg + DMAR_PQA_REG, virt_to_phys(iommu->prq) | PRQ_ORDER);

	init_completion(&iommu->prq_complete);

	return 0;
}

int intel_svm_finish_prq(struct intel_iommu *iommu)
{
	dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL);
	dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL);
	dmar_writeq(iommu->reg + DMAR_PQA_REG, 0ULL);

	if (iommu->pr_irq) {
		free_irq(iommu->pr_irq, iommu);
		dmar_free_hwirq(iommu->pr_irq);
		iommu->pr_irq = 0;
	}

	free_pages((unsigned long)iommu->prq, PRQ_ORDER);
	iommu->prq = NULL;

	return 0;
}

static inline bool intel_svm_capable(struct intel_iommu *iommu)
{
	return iommu->flags & VTD_FLAG_SVM_CAPABLE;
}

void intel_svm_check(struct intel_iommu *iommu)
{
	if (!pasid_supported(iommu))
		return;

	if (cpu_feature_enabled(X86_FEATURE_GBPAGES) &&
	    !cap_fl1gp_support(iommu->cap)) {
		pr_err("%s SVM disabled, incompatible 1GB page capability\n",
		       iommu->name);
		return;
	}

	if (cpu_feature_enabled(X86_FEATURE_LA57) &&
	    !cap_5lp_support(iommu->cap)) {
		pr_err("%s SVM disabled, incompatible paging mode\n",
		       iommu->name);
		return;
	}

	iommu->flags |= VTD_FLAG_SVM_CAPABLE;
}

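/*
 * Per-device invalidation helpers: flush the PASID-based IOTLB on the
 * IOMMU and, if the device has ATS enabled, the matching entries in its
 * device TLB as well.
 */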
static void __flush_svm_range_dev(struct intel_svm *svm,
				  struct intel_svm_dev *sdev,
				  unsigned long address,
				  unsigned long pages, int ih)
{
	struct device_domain_info *info = get_domain_info(sdev->dev);

	if (WARN_ON(!pages))
		return;

	qi_flush_piotlb(sdev->iommu, sdev->did, svm->pasid, address, pages, ih);
	if (info->ats_enabled)
		qi_flush_dev_iotlb_pasid(sdev->iommu, sdev->sid, info->pfsid,
					 svm->pasid, sdev->qdep, address,
					 order_base_2(pages));
}

static void intel_flush_svm_range_dev(struct intel_svm *svm,
				      struct intel_svm_dev *sdev,
				      unsigned long address,
				      unsigned long pages, int ih)
{
	unsigned long shift = ilog2(__roundup_pow_of_two(pages));
	unsigned long align = (1ULL << (VTD_PAGE_SHIFT + shift));
	unsigned long start = ALIGN_DOWN(address, align);
	unsigned long end = ALIGN(address + (pages << VTD_PAGE_SHIFT), align);

	while (start < end) {
		__flush_svm_range_dev(svm, sdev, start, align >> VTD_PAGE_SHIFT, ih);
		start += align;
	}
}

static void intel_flush_svm_range(struct intel_svm *svm, unsigned long address,
				  unsigned long pages, int ih)
{
	struct intel_svm_dev *sdev;

	rcu_read_lock();
	list_for_each_entry_rcu(sdev, &svm->devs, list)
		intel_flush_svm_range_dev(svm, sdev, address, pages, ih);
	rcu_read_unlock();
}

/* Pages have been freed at this point */
static void intel_invalidate_range(struct mmu_notifier *mn,
				   struct mm_struct *mm,
				   unsigned long start, unsigned long end)
{
	struct intel_svm *svm = container_of(mn, struct intel_svm, notifier);

	intel_flush_svm_range(svm, start,
			      (end - start + PAGE_SIZE - 1) >> VTD_PAGE_SHIFT, 0);
}

static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
{
	struct intel_svm *svm = container_of(mn, struct intel_svm, notifier);
	struct intel_svm_dev *sdev;

	/* This might end up being called from exit_mmap(), *before* the page
	 * tables are cleared. And __mmu_notifier_release() will delete us from
	 * the list of notifiers so that our invalidate_range() callback doesn't
	 * get called when the page tables are cleared. So we need to protect
	 * against hardware accessing those page tables.
	 *
	 * We do it by clearing the entry in the PASID table and then flushing
	 * the IOTLB and the PASID table caches. This might upset hardware;
	 * perhaps we'll want to point the PASID to a dummy PGD (like the zero
	 * page) so that we end up taking a fault that the hardware really
	 * *has* to handle gracefully without affecting other processes.
	 */
	rcu_read_lock();
	list_for_each_entry_rcu(sdev, &svm->devs, list)
		intel_pasid_tear_down_entry(sdev->iommu, sdev->dev,
					    svm->pasid, true);
	rcu_read_unlock();
}

static const struct mmu_notifier_ops intel_mmuops = {
	.release = intel_mm_release,
	.invalidate_range = intel_invalidate_range,
};

static DEFINE_MUTEX(pasid_mutex);
static LIST_HEAD(global_svm_list);

#define for_each_svm_dev(sdev, svm, d)			\
	list_for_each_entry((sdev), &(svm)->devs, list)	\
		if ((d) != (sdev)->dev) {} else

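/*
 * Look up the intel_svm for @pasid and, within it, the intel_svm_dev for
 * @dev. Returns 0 with *rsvm and/or *rsdev set to NULL when no such
 * binding exists; a non-zero return indicates an invalid PASID or a
 * lookup error.
 */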
static int pasid_to_svm_sdev(struct device *dev, unsigned int pasid,
			     struct intel_svm **rsvm,
			     struct intel_svm_dev **rsdev)
{
	struct intel_svm_dev *d, *sdev = NULL;
	struct intel_svm *svm;

	/* The caller should hold the pasid_mutex lock */
	if (WARN_ON(!mutex_is_locked(&pasid_mutex)))
		return -EINVAL;

	if (pasid == INVALID_IOASID || pasid >= PASID_MAX)
		return -EINVAL;

	svm = ioasid_find(NULL, pasid, NULL);
	if (IS_ERR(svm))
		return PTR_ERR(svm);

	if (!svm)
		goto out;

	/*
	 * If we found an svm for the PASID, there must be at least one
	 * device bound to it.
	 */
	if (WARN_ON(list_empty(&svm->devs)))
		return -EINVAL;

	rcu_read_lock();
	list_for_each_entry_rcu(d, &svm->devs, list) {
		if (d->dev == dev) {
			sdev = d;
			break;
		}
	}
	rcu_read_unlock();

out:
	*rsvm = svm;
	*rsdev = sdev;

	return 0;
}

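/*
 * Bind a guest PASID to @dev: the guest owns the first-level page table
 * (gpgd) and the host sets up a nested PASID entry on top of the parent
 * domain's second-level tables. The PASID itself was allocated by the
 * caller (e.g. VFIO) before this call.
 */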
int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev,
			  struct iommu_gpasid_bind_data *data)
{
	struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
	struct intel_svm_dev *sdev = NULL;
	struct dmar_domain *dmar_domain;
	struct device_domain_info *info;
	struct intel_svm *svm = NULL;
	unsigned long iflags;
	int ret = 0;

	if (WARN_ON(!iommu) || !data)
		return -EINVAL;

	if (data->format != IOMMU_PASID_FORMAT_INTEL_VTD)
		return -EINVAL;

	/* IOMMU core ensures argsz is more than the start of the union */
	if (data->argsz < offsetofend(struct iommu_gpasid_bind_data, vendor.vtd))
		return -EINVAL;

	/* Make sure no undefined flags are used in vendor data */
	if (data->vendor.vtd.flags & ~(IOMMU_SVA_VTD_GPASID_LAST - 1))
		return -EINVAL;

	if (!dev_is_pci(dev))
		return -ENOTSUPP;

	/* VT-d supports devices with full 20 bit PASIDs only */
	if (pci_max_pasids(to_pci_dev(dev)) != PASID_MAX)
		return -EINVAL;

	/*
	 * We only check the host PASID range; we have no way to check the
	 * guest PASID range.
	 */
	if (data->hpasid <= 0 || data->hpasid >= PASID_MAX)
		return -EINVAL;

	info = get_domain_info(dev);
	if (!info)
		return -EINVAL;

	dmar_domain = to_dmar_domain(domain);

	mutex_lock(&pasid_mutex);
	ret = pasid_to_svm_sdev(dev, data->hpasid, &svm, &sdev);
	if (ret)
		goto out;

	if (sdev) {
		/*
		 * Do not allow multiple bindings of the same device-PASID since
		 * there is only one set of SL page tables per PASID. We may
		 * revisit this once sharing a PGD across domains is supported.
		 */
		dev_warn_ratelimited(dev, "Already bound with PASID %u\n",
				     svm->pasid);
		ret = -EBUSY;
		goto out;
	}

	if (!svm) {
		/* We come here when the PASID has never been bound to a device. */
		svm = kzalloc(sizeof(*svm), GFP_KERNEL);
		if (!svm) {
			ret = -ENOMEM;
			goto out;
		}
		/* REVISIT: upper layer/VFIO can track the host process that
		 * binds the PASID. ioasid_set = mm might be sufficient for vfio
		 * to check pasid VMM ownership. We can drop the following line
		 * once VFIO and IOASID set check is in place.
		 */
		svm->mm = get_task_mm(current);
		svm->pasid = data->hpasid;
		if (data->flags & IOMMU_SVA_GPASID_VAL) {
			svm->gpasid = data->gpasid;
			svm->flags |= SVM_FLAG_GUEST_PASID;
		}
		ioasid_set_data(data->hpasid, svm);
		INIT_LIST_HEAD_RCU(&svm->devs);
		mmput(svm->mm);
	}
	sdev = kzalloc(sizeof(*sdev), GFP_KERNEL);
	if (!sdev) {
		ret = -ENOMEM;
		goto out;
	}
	sdev->dev = dev;
	sdev->sid = PCI_DEVID(info->bus, info->devfn);
	sdev->iommu = iommu;

	/* Only count users if device has aux domains */
	if (iommu_dev_feature_enabled(dev, IOMMU_DEV_FEAT_AUX))
		sdev->users = 1;

	/* Set up device context entry for PASID if not enabled already */
	ret = intel_iommu_enable_pasid(iommu, sdev->dev);
	if (ret) {
		dev_err_ratelimited(dev, "Failed to enable PASID capability\n");
		kfree(sdev);
		goto out;
	}

	/*
	 * The PASID table is per device for better security. Therefore, for
	 * each bind of a new device, even with an existing PASID, we need to
	 * call the nested mode setup function here.
	 */
	spin_lock_irqsave(&iommu->lock, iflags);
	ret = intel_pasid_setup_nested(iommu, dev,
				       (pgd_t *)(uintptr_t)data->gpgd,
				       data->hpasid, &data->vendor.vtd, dmar_domain,
				       data->addr_width);
	spin_unlock_irqrestore(&iommu->lock, iflags);
	if (ret) {
		dev_err_ratelimited(dev, "Failed to set up PASID %llu in nested mode, Err %d\n",
				    data->hpasid, ret);
		/*
		 * The PASID entry should be in a cleared state if nested mode
		 * setup failed. So we only need to clear the IOASID tracking
		 * data such that the free call will succeed.
		 */
		kfree(sdev);
		goto out;
	}

	svm->flags |= SVM_FLAG_GUEST_MODE;

	init_rcu_head(&sdev->rcu);
	list_add_rcu(&sdev->list, &svm->devs);
out:
	if (!IS_ERR_OR_NULL(svm) && list_empty(&svm->devs)) {
		ioasid_set_data(data->hpasid, NULL);
		kfree(svm);
	}

	mutex_unlock(&pasid_mutex);
	return ret;
}

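/*
 * Undo a guest PASID bind: tear down the nested PASID entry, drain any
 * page requests still in flight for the PASID, and drop the per-device
 * tracking structure. The IOASID itself belongs to the caller and is not
 * freed here.
 */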
int intel_svm_unbind_gpasid(struct device *dev, u32 pasid)
{
	struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
	struct intel_svm_dev *sdev;
	struct intel_svm *svm;
	int ret;

	if (WARN_ON(!iommu))
		return -EINVAL;

	mutex_lock(&pasid_mutex);
	ret = pasid_to_svm_sdev(dev, pasid, &svm, &sdev);
	if (ret)
		goto out;

	if (sdev) {
		if (iommu_dev_feature_enabled(dev, IOMMU_DEV_FEAT_AUX))
			sdev->users--;
		if (!sdev->users) {
			list_del_rcu(&sdev->list);
			intel_pasid_tear_down_entry(iommu, dev,
						    svm->pasid, false);
			intel_svm_drain_prq(dev, svm->pasid);
			kfree_rcu(sdev, rcu);

			if (list_empty(&svm->devs)) {
				/*
				 * We do not free the IOASID here because the
				 * IOMMU driver did not allocate it. Unlike
				 * native SVM, the IOASID for guest use was
				 * allocated prior to the bind call. In any
				 * case, if the free call comes before the
				 * unbind, the IOMMU driver will get notified
				 * and perform cleanup.
				 */
				ioasid_set_data(pasid, NULL);
				kfree(svm);
			}
		}
	}
out:
	mutex_unlock(&pasid_mutex);
	return ret;
}

static void _load_pasid(void *unused)
{
	update_pasid();
}

static void load_pasid(struct mm_struct *mm, u32 pasid)
{
	mutex_lock(&mm->context.lock);

	/* Synchronize with READ_ONCE in update_pasid(). */
	smp_store_release(&mm->pasid, pasid);

	/* Update PASID MSR on all CPUs running the mm's tasks. */
	on_each_cpu_mask(mm_cpumask(mm), _load_pasid, NULL, true);

	mutex_unlock(&mm->context.lock);
}

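/*
 * Native SVA bind: allocate (or reuse) a PASID for @mm, register an MMU
 * notifier so that CPU TLB invalidations are mirrored to the IOMMU, and
 * program a first-level PASID entry that points at the mm's page tables
 * (or init_mm's for SVM_FLAG_SUPERVISOR_MODE).
 */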
/* Caller must hold pasid_mutex, mm reference */
static int
intel_svm_bind_mm(struct device *dev, unsigned int flags,
		  struct svm_dev_ops *ops,
		  struct mm_struct *mm, struct intel_svm_dev **sd)
{
	struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
	struct device_domain_info *info;
	struct intel_svm_dev *sdev;
	struct intel_svm *svm = NULL;
	unsigned long iflags;
	int pasid_max;
	int ret;

	if (!iommu || dmar_disabled)
		return -EINVAL;

	if (!intel_svm_capable(iommu))
		return -ENOTSUPP;

	if (dev_is_pci(dev)) {
		pasid_max = pci_max_pasids(to_pci_dev(dev));
		if (pasid_max < 0)
			return -EINVAL;
	} else
		pasid_max = 1 << 20;

	/* A supervisor PASID bind should have mm == NULL */
	if (flags & SVM_FLAG_SUPERVISOR_MODE) {
		if (!ecap_srs(iommu->ecap) || mm) {
			pr_err("Supervisor PASID with user provided mm.\n");
			return -EINVAL;
		}
	}

	if (!(flags & SVM_FLAG_PRIVATE_PASID)) {
		struct intel_svm *t;

		list_for_each_entry(t, &global_svm_list, list) {
			if (t->mm != mm || (t->flags & SVM_FLAG_PRIVATE_PASID))
				continue;

			svm = t;
			if (svm->pasid >= pasid_max) {
				dev_warn(dev,
					 "Limited PASID width. Cannot use existing PASID %d\n",
					 svm->pasid);
				ret = -ENOSPC;
				goto out;
			}

			/* Find the matching device in svm list */
			for_each_svm_dev(sdev, svm, dev) {
				if (sdev->ops != ops) {
					ret = -EBUSY;
					goto out;
				}
				sdev->users++;
				goto success;
			}

			break;
		}
	}

	sdev = kzalloc(sizeof(*sdev), GFP_KERNEL);
	if (!sdev) {
		ret = -ENOMEM;
		goto out;
	}
	sdev->dev = dev;
	sdev->iommu = iommu;

	ret = intel_iommu_enable_pasid(iommu, dev);
	if (ret) {
		kfree(sdev);
		goto out;
	}

	info = get_domain_info(dev);
	sdev->did = FLPT_DEFAULT_DID;
	sdev->sid = PCI_DEVID(info->bus, info->devfn);
	if (info->ats_enabled) {
		sdev->dev_iotlb = 1;
		sdev->qdep = info->ats_qdep;
		if (sdev->qdep >= QI_DEV_EIOTLB_MAX_INVS)
			sdev->qdep = 0;
	}

	/* Finish the setup now we know we're keeping it */
	sdev->users = 1;
	sdev->ops = ops;
	init_rcu_head(&sdev->rcu);

	if (!svm) {
		svm = kzalloc(sizeof(*svm), GFP_KERNEL);
		if (!svm) {
			ret = -ENOMEM;
			kfree(sdev);
			goto out;
		}

		if (pasid_max > intel_pasid_max_id)
			pasid_max = intel_pasid_max_id;

		/* Do not use PASID 0, reserved for RID to PASID */
		svm->pasid = ioasid_alloc(NULL, PASID_MIN,
					  pasid_max - 1, svm);
		if (svm->pasid == INVALID_IOASID) {
			kfree(svm);
			kfree(sdev);
			ret = -ENOSPC;
			goto out;
		}
		svm->notifier.ops = &intel_mmuops;
		svm->mm = mm;
		svm->flags = flags;
		INIT_LIST_HEAD_RCU(&svm->devs);
		INIT_LIST_HEAD(&svm->list);
		ret = -ENOMEM;
		if (mm) {
			ret = mmu_notifier_register(&svm->notifier, mm);
			if (ret) {
				ioasid_put(svm->pasid);
				kfree(svm);
				kfree(sdev);
				goto out;
			}
		}

		spin_lock_irqsave(&iommu->lock, iflags);
		ret = intel_pasid_setup_first_level(iommu, dev,
				mm ? mm->pgd : init_mm.pgd,
				svm->pasid, FLPT_DEFAULT_DID,
				(mm ? 0 : PASID_FLAG_SUPERVISOR_MODE) |
				(cpu_feature_enabled(X86_FEATURE_LA57) ?
				 PASID_FLAG_FL5LP : 0));
		spin_unlock_irqrestore(&iommu->lock, iflags);
		if (ret) {
			if (mm)
				mmu_notifier_unregister(&svm->notifier, mm);
			ioasid_put(svm->pasid);
			kfree(svm);
			kfree(sdev);
			goto out;
		}

		list_add_tail(&svm->list, &global_svm_list);
		if (mm) {
			/* The newly allocated pasid is loaded to the mm. */
			load_pasid(mm, svm->pasid);
		}
	} else {
		/*
		 * Binding a new device with an existing PASID, we need to
		 * set up the PASID entry.
		 */
		spin_lock_irqsave(&iommu->lock, iflags);
		ret = intel_pasid_setup_first_level(iommu, dev,
				mm ? mm->pgd : init_mm.pgd,
				svm->pasid, FLPT_DEFAULT_DID,
				(mm ? 0 : PASID_FLAG_SUPERVISOR_MODE) |
				(cpu_feature_enabled(X86_FEATURE_LA57) ?
				 PASID_FLAG_FL5LP : 0));
		spin_unlock_irqrestore(&iommu->lock, iflags);
		if (ret) {
			kfree(sdev);
			goto out;
		}
	}
	list_add_rcu(&sdev->list, &svm->devs);
success:
	sdev->pasid = svm->pasid;
	sdev->sva.dev = dev;
	if (sd)
		*sd = sdev;
	ret = 0;
out:
	return ret;
}

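/*
 * Native SVA unbind: drop one reference on the device binding; on the
 * last reference, tear down the PASID entry, drain pending page
 * requests, and release the PASID once no device uses it any more.
 */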
/* Caller must hold pasid_mutex */
static int intel_svm_unbind_mm(struct device *dev, u32 pasid)
{
	struct intel_svm_dev *sdev;
	struct intel_iommu *iommu;
	struct intel_svm *svm;
	int ret = -EINVAL;

	iommu = device_to_iommu(dev, NULL, NULL);
	if (!iommu)
		goto out;

	ret = pasid_to_svm_sdev(dev, pasid, &svm, &sdev);
	if (ret)
		goto out;

	if (sdev) {
		sdev->users--;
		if (!sdev->users) {
			list_del_rcu(&sdev->list);
			/* Flush the PASID cache and IOTLB for this device.
			 * Note that we do depend on the hardware *not* using
			 * the PASID any more. Just as we depend on other
			 * devices never using PASIDs that they have no right
			 * to use. We have a *shared* PASID table, because it's
			 * large and has to be physically contiguous. So it's
			 * hard to be as defensive as we might like. */
			intel_pasid_tear_down_entry(iommu, dev,
						    svm->pasid, false);
			intel_svm_drain_prq(dev, svm->pasid);
			kfree_rcu(sdev, rcu);

			if (list_empty(&svm->devs)) {
				ioasid_put(svm->pasid);
				if (svm->mm) {
					mmu_notifier_unregister(&svm->notifier, svm->mm);
					/* Clear mm's pasid. */
					load_pasid(svm->mm, PASID_DISABLED);
				}
				list_del(&svm->list);
				/* We mandate that no page faults may be outstanding
				 * for the PASID when intel_svm_unbind_mm() is called.
				 * If that is not obeyed, subtle errors will happen.
				 * Let's make them less subtle... */
				memset(svm, 0x6b, sizeof(*svm));
				kfree(svm);
			}
		}
	}
out:
	return ret;
}

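/*
 * Page request descriptors are 32 bytes: two quadwords of request fields
 * plus two quadwords of private data. With PRQ_ORDER 0 the queue is a
 * single 4KiB page, i.e. 128 descriptors; PRQ_RING_MASK wraps a byte
 * offset at the last descriptor boundary of the ring.
 */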
/* Page request queue descriptor */
struct page_req_dsc {
	union {
		struct {
			u64 type:8;
			u64 pasid_present:1;
			u64 priv_data_present:1;
			u64 rsvd:6;
			u64 rid:16;
			u64 pasid:20;
			u64 exe_req:1;
			u64 pm_req:1;
			u64 rsvd2:10;
		};
		u64 qw_0;
	};
	union {
		struct {
			u64 rd_req:1;
			u64 wr_req:1;
			u64 lpig:1;
			u64 prg_index:9;
			u64 addr:52;
		};
		u64 qw_1;
	};
	u64 priv_data[2];
};

#define PRQ_RING_MASK	((0x1000 << PRQ_ORDER) - 0x20)

static bool access_error(struct vm_area_struct *vma, struct page_req_dsc *req)
{
	unsigned long requested = 0;

	if (req->exe_req)
		requested |= VM_EXEC;

	if (req->rd_req)
		requested |= VM_READ;

	if (req->wr_req)
		requested |= VM_WRITE;

	return (requested & ~vma->vm_flags) != 0;
}

static bool is_canonical_address(u64 addr)
{
	int shift = 64 - (__VIRTUAL_MASK_SHIFT + 1);
	long saddr = (long) addr;

	return (((saddr << shift) >> shift) == saddr);
}

/**
 * intel_svm_drain_prq - Drain page requests and responses for a pasid
 * @dev: target device
 * @pasid: pasid for draining
 *
 * Drain all pending page requests and responses related to @pasid in both
 * software and hardware. This is supposed to be called after the device
 * driver has stopped DMA, the pasid entry has been cleared, and both IOTLB
 * and DevTLB have been invalidated.
 *
 * It waits until all pending page requests for @pasid in the page fault
 * queue are completed by the prq handling thread. Then follow the steps
 * described in VT-d spec CH7.10 to drain all page requests and page
 * responses pending in the hardware.
 */
static void intel_svm_drain_prq(struct device *dev, u32 pasid)
{
	struct device_domain_info *info;
	struct dmar_domain *domain;
	struct intel_iommu *iommu;
	struct qi_desc desc[3];
	struct pci_dev *pdev;
	int head, tail;
	u16 sid, did;
	int qdep;

	info = get_domain_info(dev);
	if (WARN_ON(!info || !dev_is_pci(dev)))
		return;

	if (!info->pri_enabled)
		return;

	iommu = info->iommu;
	domain = info->domain;
	pdev = to_pci_dev(dev);
	sid = PCI_DEVID(info->bus, info->devfn);
	did = domain->iommu_did[iommu->seq_id];
	qdep = pci_ats_queue_depth(pdev);

	/*
	 * Check and wait until all pending page requests in the queue are
	 * handled by the prq handling thread.
	 */
prq_retry:
	reinit_completion(&iommu->prq_complete);
	tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK;
	head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK;
	while (head != tail) {
		struct page_req_dsc *req;

		req = &iommu->prq[head / sizeof(*req)];
		if (!req->pasid_present || req->pasid != pasid) {
			head = (head + sizeof(*req)) & PRQ_RING_MASK;
			continue;
		}

		wait_for_completion(&iommu->prq_complete);
		goto prq_retry;
	}

	/*
	 * Perform steps described in VT-d spec CH7.10 to drain page
	 * requests and responses in hardware.
	 */
	memset(desc, 0, sizeof(desc));
	desc[0].qw0 = QI_IWD_STATUS_DATA(QI_DONE) |
			QI_IWD_FENCE |
			QI_IWD_TYPE;
	desc[1].qw0 = QI_EIOTLB_PASID(pasid) |
			QI_EIOTLB_DID(did) |
			QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) |
			QI_EIOTLB_TYPE;
	desc[2].qw0 = QI_DEV_EIOTLB_PASID(pasid) |
			QI_DEV_EIOTLB_SID(sid) |
			QI_DEV_EIOTLB_QDEP(qdep) |
			QI_DEIOTLB_TYPE |
			QI_DEV_IOTLB_PFSID(info->pfsid);
qi_retry:
	reinit_completion(&iommu->prq_complete);
	qi_submit_sync(iommu, desc, 3, QI_OPT_WAIT_DRAIN);
	if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO) {
		wait_for_completion(&iommu->prq_complete);
		goto qi_retry;
	}
}

static int prq_to_iommu_prot(struct page_req_dsc *req)
{
	int prot = 0;

	if (req->rd_req)
		prot |= IOMMU_FAULT_PERM_READ;
	if (req->wr_req)
		prot |= IOMMU_FAULT_PERM_WRITE;
	if (req->exe_req)
		prot |= IOMMU_FAULT_PERM_EXEC;
	if (req->pm_req)
		prot |= IOMMU_FAULT_PERM_PRIV;

	return prot;
}

static int
intel_svm_prq_report(struct device *dev, struct page_req_dsc *desc)
{
	struct iommu_fault_event event;

	if (!dev || !dev_is_pci(dev))
		return -ENODEV;

	/* Fill in event data for device specific processing */
	memset(&event, 0, sizeof(struct iommu_fault_event));
	event.fault.type = IOMMU_FAULT_PAGE_REQ;
	/* The descriptor carries a page-aligned address shifted right by
	 * VTD_PAGE_SHIFT; report the full virtual address. */
	event.fault.prm.addr = (u64)desc->addr << VTD_PAGE_SHIFT;
	event.fault.prm.pasid = desc->pasid;
	event.fault.prm.grpid = desc->prg_index;
	event.fault.prm.perm = prq_to_iommu_prot(desc);

	if (desc->lpig)
		event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;
	if (desc->pasid_present) {
		event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PASID_VALID;
		event.fault.prm.flags |= IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID;
	}
	if (desc->priv_data_present) {
		/*
		 * Set the last-page-in-group bit if private data is present:
		 * a page response is then required just as it is for LPIG.
		 * iommu_report_device_fault() doesn't understand this vendor
		 * specific requirement, so we set last_page as a workaround.
		 */
		event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;
		event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA;
		memcpy(event.fault.prm.private_data, desc->priv_data,
		       sizeof(desc->priv_data));
	}

	return iommu_report_device_fault(dev, &event);
}

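/*
 * IRQ thread for the page request queue: walk the descriptors between
 * head and tail, resolve each fault against the bound mm (or report it
 * via iommu_report_device_fault() for PASIDs bound in guest mode), and
 * post a page group response where the request requires one.
 */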
static irqreturn_t prq_event_thread(int irq, void *d)
{
	struct intel_svm_dev *sdev = NULL;
	struct intel_iommu *iommu = d;
	struct intel_svm *svm = NULL;
	int head, tail, handled = 0;

	/* Clear PPR bit before reading head/tail registers, to
	 * ensure that we get a new interrupt if needed. */
	writel(DMA_PRS_PPR, iommu->reg + DMAR_PRS_REG);

	tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK;
	head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK;
	while (head != tail) {
		struct vm_area_struct *vma;
		struct page_req_dsc *req;
		struct qi_desc resp;
		int result;
		vm_fault_t ret;
		u64 address;

		handled = 1;
		req = &iommu->prq[head / sizeof(*req)];
		result = QI_RESP_INVALID;
		address = (u64)req->addr << VTD_PAGE_SHIFT;
		if (!req->pasid_present) {
			pr_err("%s: Page request without PASID: %08llx %08llx\n",
			       iommu->name, ((unsigned long long *)req)[0],
			       ((unsigned long long *)req)[1]);
			goto no_pasid;
		}

		if (!svm || svm->pasid != req->pasid) {
			rcu_read_lock();
			svm = ioasid_find(NULL, req->pasid, NULL);
			/* It *can't* go away, because the driver is not permitted
			 * to unbind the mm while any page faults are outstanding.
			 * So we only need RCU to protect the internal idr code. */
			rcu_read_unlock();
			if (IS_ERR_OR_NULL(svm)) {
				pr_err("%s: Page request for invalid PASID %d: %08llx %08llx\n",
				       iommu->name, req->pasid, ((unsigned long long *)req)[0],
				       ((unsigned long long *)req)[1]);
				goto no_pasid;
			}
		}

		if (!sdev || sdev->sid != req->rid) {
			struct intel_svm_dev *t;

			sdev = NULL;
			rcu_read_lock();
			list_for_each_entry_rcu(t, &svm->devs, list) {
				if (t->sid == req->rid) {
					sdev = t;
					break;
				}
			}
			rcu_read_unlock();
		}

		/* Since we're using init_mm.pgd directly, we should never take
		 * any faults on kernel addresses. */
		if (!svm->mm)
			goto bad_req;

		/* If address is not canonical, return invalid response */
		if (!is_canonical_address(address))
			goto bad_req;

		/*
		 * If the prq is to be handled outside the iommu driver via a
		 * receiver of the fault notifiers, we skip the page response here.
		 */
		if (svm->flags & SVM_FLAG_GUEST_MODE) {
			if (sdev && !intel_svm_prq_report(sdev->dev, req))
				goto prq_advance;
			else
				goto bad_req;
		}

		/* If the mm is already defunct, don't handle faults. */
		if (!mmget_not_zero(svm->mm))
			goto bad_req;

		mmap_read_lock(svm->mm);
		vma = find_extend_vma(svm->mm, address);
		if (!vma || address < vma->vm_start)
			goto invalid;

		if (access_error(vma, req))
			goto invalid;

		ret = handle_mm_fault(vma, address,
				      req->wr_req ? FAULT_FLAG_WRITE : 0,
				      NULL);
		if (ret & VM_FAULT_ERROR)
			goto invalid;

		result = QI_RESP_SUCCESS;
invalid:
		mmap_read_unlock(svm->mm);
		mmput(svm->mm);
bad_req:
		WARN_ON(!sdev);
		if (sdev && sdev->ops && sdev->ops->fault_cb) {
			int rwxp = (req->rd_req << 3) | (req->wr_req << 2) |
				(req->exe_req << 1) | (req->pm_req);
			sdev->ops->fault_cb(sdev->dev, req->pasid, req->addr,
					    req->priv_data, rwxp, result);
		}
		/* We get here in the error case where the PASID lookup failed,
		   and these can be NULL. Do not use them below this point! */
		sdev = NULL;
		svm = NULL;
no_pasid:
		if (req->lpig || req->priv_data_present) {
			/*
			 * Per VT-d spec. v3.0 ch7.7, system software must
			 * respond with page group response if private data
			 * is present (PDP) or last page in group (LPIG) bit
			 * is set. This is an additional VT-d feature beyond
			 * PCI ATS spec.
			 */
			resp.qw0 = QI_PGRP_PASID(req->pasid) |
				QI_PGRP_DID(req->rid) |
				QI_PGRP_PASID_P(req->pasid_present) |
				QI_PGRP_PDP(req->priv_data_present) |
				QI_PGRP_RESP_CODE(result) |
				QI_PGRP_RESP_TYPE;
			resp.qw1 = QI_PGRP_IDX(req->prg_index) |
				QI_PGRP_LPIG(req->lpig);
			/* Zero qw2/qw3 first so that private data, when
			 * present, is actually carried in the response. */
			resp.qw2 = 0;
			resp.qw3 = 0;
			if (req->priv_data_present)
				memcpy(&resp.qw2, req->priv_data,
				       sizeof(req->priv_data));
			qi_submit_sync(iommu, &resp, 1, 0);
		}
prq_advance:
		head = (head + sizeof(*req)) & PRQ_RING_MASK;
	}

	dmar_writeq(iommu->reg + DMAR_PQH_REG, tail);

	/*
	 * Clear the page request overflow bit and wake up all threads that
	 * are waiting for the completion of this handling.
	 */
	if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO) {
		pr_info_ratelimited("IOMMU: %s: PRQ overflow detected\n",
				    iommu->name);
		head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK;
		tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK;
		if (head == tail) {
			writel(DMA_PRS_PRO, iommu->reg + DMAR_PRS_REG);
			pr_info_ratelimited("IOMMU: %s: PRQ overflow cleared\n",
					    iommu->name);
		}
	}

	if (!completion_done(&iommu->prq_complete))
		complete(&iommu->prq_complete);

	return IRQ_RETVAL(handled);
}

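/*
 * iommu_sva API entry points: thin wrappers around intel_svm_bind_mm()
 * and intel_svm_unbind_mm() that serialize on pasid_mutex and translate
 * between struct iommu_sva handles and struct intel_svm_dev.
 */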
#define to_intel_svm_dev(handle) container_of(handle, struct intel_svm_dev, sva)

struct iommu_sva *
intel_svm_bind(struct device *dev, struct mm_struct *mm, void *drvdata)
{
	struct iommu_sva *sva = ERR_PTR(-EINVAL);
	struct intel_svm_dev *sdev = NULL;
	unsigned int flags = 0;
	int ret;

	/*
	 * TODO: Consolidate with generic iommu-sva bind after it is merged.
	 * It will require shared SVM data structures, i.e. combine io_mm
	 * and intel_svm etc.
	 */
	if (drvdata)
		flags = *(unsigned int *)drvdata;
	mutex_lock(&pasid_mutex);
	ret = intel_svm_bind_mm(dev, flags, NULL, mm, &sdev);
	if (ret)
		sva = ERR_PTR(ret);
	else if (sdev)
		sva = &sdev->sva;
	else
		WARN(!sdev, "SVM bind succeeded with no sdev!\n");

	mutex_unlock(&pasid_mutex);

	return sva;
}

void intel_svm_unbind(struct iommu_sva *sva)
{
	struct intel_svm_dev *sdev;

	mutex_lock(&pasid_mutex);
	sdev = to_intel_svm_dev(sva);
	intel_svm_unbind_mm(sdev->dev, sdev->pasid);
	mutex_unlock(&pasid_mutex);
}

u32 intel_svm_get_pasid(struct iommu_sva *sva)
{
	struct intel_svm_dev *sdev;
	u32 pasid;

	mutex_lock(&pasid_mutex);
	sdev = to_intel_svm_dev(sva);
	pasid = sdev->pasid;
	mutex_unlock(&pasid_mutex);

	return pasid;
}

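/*
 * Deliver a page response from the fault consumer (e.g. a guest via
 * VFIO) back to hardware. The response is only injected as a page group
 * response descriptor when the original request had LPIG or private
 * data set, matching the rule used when the fault was reported.
 */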
1073 */ 1074 if (drvdata) 1075 flags = *(unsigned int *)drvdata; 1076 mutex_lock(&pasid_mutex); 1077 ret = intel_svm_bind_mm(dev, flags, NULL, mm, &sdev); 1078 if (ret) 1079 sva = ERR_PTR(ret); 1080 else if (sdev) 1081 sva = &sdev->sva; 1082 else 1083 WARN(!sdev, "SVM bind succeeded with no sdev!\n"); 1084 1085 mutex_unlock(&pasid_mutex); 1086 1087 return sva; 1088 } 1089 1090 void intel_svm_unbind(struct iommu_sva *sva) 1091 { 1092 struct intel_svm_dev *sdev; 1093 1094 mutex_lock(&pasid_mutex); 1095 sdev = to_intel_svm_dev(sva); 1096 intel_svm_unbind_mm(sdev->dev, sdev->pasid); 1097 mutex_unlock(&pasid_mutex); 1098 } 1099 1100 u32 intel_svm_get_pasid(struct iommu_sva *sva) 1101 { 1102 struct intel_svm_dev *sdev; 1103 u32 pasid; 1104 1105 mutex_lock(&pasid_mutex); 1106 sdev = to_intel_svm_dev(sva); 1107 pasid = sdev->pasid; 1108 mutex_unlock(&pasid_mutex); 1109 1110 return pasid; 1111 } 1112 1113 int intel_svm_page_response(struct device *dev, 1114 struct iommu_fault_event *evt, 1115 struct iommu_page_response *msg) 1116 { 1117 struct iommu_fault_page_request *prm; 1118 struct intel_svm_dev *sdev = NULL; 1119 struct intel_svm *svm = NULL; 1120 struct intel_iommu *iommu; 1121 bool private_present; 1122 bool pasid_present; 1123 bool last_page; 1124 u8 bus, devfn; 1125 int ret = 0; 1126 u16 sid; 1127 1128 if (!dev || !dev_is_pci(dev)) 1129 return -ENODEV; 1130 1131 iommu = device_to_iommu(dev, &bus, &devfn); 1132 if (!iommu) 1133 return -ENODEV; 1134 1135 if (!msg || !evt) 1136 return -EINVAL; 1137 1138 mutex_lock(&pasid_mutex); 1139 1140 prm = &evt->fault.prm; 1141 sid = PCI_DEVID(bus, devfn); 1142 pasid_present = prm->flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID; 1143 private_present = prm->flags & IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA; 1144 last_page = prm->flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE; 1145 1146 if (!pasid_present) { 1147 ret = -EINVAL; 1148 goto out; 1149 } 1150 1151 if (prm->pasid == 0 || prm->pasid >= PASID_MAX) { 1152 ret = -EINVAL; 1153 goto out; 1154 } 1155 1156 ret = pasid_to_svm_sdev(dev, prm->pasid, &svm, &sdev); 1157 if (ret || !sdev) { 1158 ret = -ENODEV; 1159 goto out; 1160 } 1161 1162 /* 1163 * For responses from userspace, need to make sure that the 1164 * pasid has been bound to its mm. 1165 */ 1166 if (svm->flags & SVM_FLAG_GUEST_MODE) { 1167 struct mm_struct *mm; 1168 1169 mm = get_task_mm(current); 1170 if (!mm) { 1171 ret = -EINVAL; 1172 goto out; 1173 } 1174 1175 if (mm != svm->mm) { 1176 ret = -ENODEV; 1177 mmput(mm); 1178 goto out; 1179 } 1180 1181 mmput(mm); 1182 } 1183 1184 /* 1185 * Per VT-d spec. v3.0 ch7.7, system software must respond 1186 * with page group response if private data is present (PDP) 1187 * or last page in group (LPIG) bit is set. This is an 1188 * additional VT-d requirement beyond PCI ATS spec. 1189 */ 1190 if (last_page || private_present) { 1191 struct qi_desc desc; 1192 1193 desc.qw0 = QI_PGRP_PASID(prm->pasid) | QI_PGRP_DID(sid) | 1194 QI_PGRP_PASID_P(pasid_present) | 1195 QI_PGRP_PDP(private_present) | 1196 QI_PGRP_RESP_CODE(msg->code) | 1197 QI_PGRP_RESP_TYPE; 1198 desc.qw1 = QI_PGRP_IDX(prm->grpid) | QI_PGRP_LPIG(last_page); 1199 desc.qw2 = 0; 1200 desc.qw3 = 0; 1201 if (private_present) 1202 memcpy(&desc.qw2, prm->private_data, 1203 sizeof(prm->private_data)); 1204 1205 qi_submit_sync(iommu, &desc, 1, 0); 1206 } 1207 out: 1208 mutex_unlock(&pasid_mutex); 1209 return ret; 1210 } 1211